morphik-core/sdks/python/morphik/sync.py

import json
import logging
from io import BytesIO, IOBase
from pathlib import Path
from typing import Any, BinaryIO, Dict, List, Optional, Type, Union

import httpx
from pydantic import BaseModel

from ._internal import FinalChunkResult, RuleOrDict, _MorphikClientLogic
from .models import (
    ChunkSource,
    CompletionResponse,  # Prompt override models
    Document,
    DocumentResult,
    FolderInfo,
    Graph,
    GraphPromptOverrides,
    IngestTextRequest,
    QueryPromptOverrides,
)

logger = logging.getLogger(__name__)


class Cache:
    def __init__(self, db: "Morphik", name: str):
        self._db = db
        self._name = name

    def update(self) -> bool:
        response = self._db._request("POST", f"cache/{self._name}/update")
        return response.get("success", False)

    def add_docs(self, docs: List[str]) -> bool:
        response = self._db._request("POST", f"cache/{self._name}/add_docs", {"docs": docs})
        return response.get("success", False)

    def query(
        self, query: str, max_tokens: Optional[int] = None, temperature: Optional[float] = None
    ) -> CompletionResponse:
        response = self._db._request(
            "POST",
            f"cache/{self._name}/query",
            params={"query": query, "max_tokens": max_tokens, "temperature": temperature},
            data="",
        )
        return CompletionResponse(**response)


class Folder:
    """
    A folder that allows operations to be scoped to a specific folder.

    Args:
        client: The Morphik client instance
        name: The name of the folder
        folder_id: Optional folder ID (if already known)
    """

    def __init__(self, client: "Morphik", name: str, folder_id: Optional[str] = None):
        self._client = client
        self._name = name
        self._id = folder_id

    @property
    def name(self) -> str:
        """Returns the folder name."""
        return self._name

    @property
    def id(self) -> Optional[str]:
        """Returns the folder ID if available."""
        return self._id

    def get_info(self) -> Dict[str, Any]:
        """
        Get detailed information about this folder.

        Returns:
            Dict[str, Any]: Detailed folder information
        """
        if not self._id:
            # If we don't have the ID, find the folder by name first
            folders = self._client.list_folders()
            for folder in folders:
                if folder.name == self._name:
                    self._id = folder.id
                    break
            if not self._id:
                raise ValueError(f"Folder '{self._name}' not found")

        return self._client._request("GET", f"folders/{self._id}")

    def signin(self, end_user_id: str) -> "UserScope":
        """
        Returns a UserScope object scoped to this folder and the end user.

        Args:
            end_user_id: The ID of the end user

        Returns:
            UserScope: A user scope scoped to this folder and the end user
        """
        return UserScope(client=self._client, end_user_id=end_user_id, folder_name=self._name)

    def ingest_text(
        self,
        content: str,
        filename: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        rules: Optional[List[RuleOrDict]] = None,
        use_colpali: bool = True,
    ) -> Document:
        """
        Ingest a text document into Morphik within this folder.

        Args:
            content: Text content to ingest
            filename: Optional file name
            metadata: Optional metadata dictionary
            rules: Optional list of rules to apply during ingestion
            use_colpali: Whether to use ColPali-style embedding model

        Returns:
            Document: Metadata of the ingested document
        """
        rules_list = [self._client._convert_rule(r) for r in (rules or [])]
        payload = self._client._logic._prepare_ingest_text_request(
            content, filename, metadata, rules_list, use_colpali, self._name, None
        )
        response = self._client._request("POST", "ingest/text", data=payload)
        doc = self._client._logic._parse_document_response(response)
        doc._client = self._client
        return doc

    def ingest_file(
        self,
        file: Union[str, bytes, BinaryIO, Path],
        filename: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        rules: Optional[List[RuleOrDict]] = None,
        use_colpali: bool = True,
    ) -> Document:
        """
        Ingest a file document into Morphik within this folder.

        Args:
            file: File to ingest (path string, bytes, file object, or Path)
            filename: Name of the file
            metadata: Optional metadata dictionary
            rules: Optional list of rules to apply during ingestion
            use_colpali: Whether to use ColPali-style embedding model

        Returns:
            Document: Metadata of the ingested document
        """
        # Process file input
        file_obj, filename = self._client._logic._prepare_file_for_upload(file, filename)

        try:
            # Prepare multipart form data
            files = {"file": (filename, file_obj)}

            # Create form data
            form_data = self._client._logic._prepare_ingest_file_form_data(metadata, rules, self._name, None)

            # use_colpali should be a query parameter as defined in the API
            response = self._client._request(
                "POST",
                "ingest/file",
                data=form_data,
                files=files,
                params={"use_colpali": str(use_colpali).lower()},
            )
            doc = self._client._logic._parse_document_response(response)
            doc._client = self._client
            return doc
        finally:
            # Close file if we opened it
            if isinstance(file, (str, Path)):
                file_obj.close()

    def ingest_files(
        self,
        files: List[Union[str, bytes, BinaryIO, Path]],
        metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
        rules: Optional[List[RuleOrDict]] = None,
        use_colpali: bool = True,
        parallel: bool = True,
    ) -> List[Document]:
        """
        Ingest multiple files into Morphik within this folder.

        Args:
            files: List of files to ingest
            metadata: Optional metadata
            rules: Optional list of rules to apply
            use_colpali: Whether to use ColPali-style embedding
            parallel: Whether to process files in parallel

        Returns:
            List[Document]: List of ingested documents
        """
        # Convert files to format expected by API
        file_objects = self._client._logic._prepare_files_for_upload(files)

        try:
            # Prepare form data
            data = self._client._logic._prepare_ingest_files_form_data(
                metadata, rules, use_colpali, parallel, self._name, None
            )

            response = self._client._request(
                "POST",
                "ingest/files",
                data=data,
                files=file_objects,
                params={"use_colpali": str(use_colpali).lower()},
            )

            if response.get("errors"):
                # Log errors but don't raise exception
                for error in response["errors"]:
                    logger.error(f"Failed to ingest {error['filename']}: {error['error']}")

            docs = [self._client._logic._parse_document_response(doc) for doc in response["documents"]]
            for doc in docs:
                doc._client = self._client
            return docs
        finally:
            # Clean up file objects
            for _, (_, file_obj) in file_objects:
                if isinstance(file_obj, (IOBase, BytesIO)) and not file_obj.closed:
                    file_obj.close()

    def ingest_directory(
        self,
        directory: Union[str, Path],
        recursive: bool = False,
        pattern: str = "*",
        metadata: Optional[Dict[str, Any]] = None,
        rules: Optional[List[RuleOrDict]] = None,
        use_colpali: bool = True,
        parallel: bool = True,
    ) -> List[Document]:
        """
        Ingest all files in a directory into Morphik within this folder.

        Args:
            directory: Path to directory containing files to ingest
            recursive: Whether to recursively process subdirectories
            pattern: Optional glob pattern to filter files
            metadata: Optional metadata dictionary to apply to all files
            rules: Optional list of rules to apply
            use_colpali: Whether to use ColPali-style embedding
            parallel: Whether to process files in parallel

        Returns:
            List[Document]: List of ingested documents
        """
        directory = Path(directory)
        if not directory.is_dir():
            raise ValueError(f"Directory not found: {directory}")

        # Collect all files matching pattern
        if recursive:
            files = list(directory.rglob(pattern))
        else:
            files = list(directory.glob(pattern))

        # Filter out directories
        files = [f for f in files if f.is_file()]

        if not files:
            return []

        # Use ingest_files with collected paths
        return self.ingest_files(
            files=files, metadata=metadata, rules=rules, use_colpali=use_colpali, parallel=parallel
        )

    def retrieve_chunks(
        self,
        query: str,
        filters: Optional[Dict[str, Any]] = None,
        k: int = 4,
        min_score: float = 0.0,
        use_colpali: bool = True,
    ) -> List[FinalChunkResult]:
        """
        Retrieve relevant chunks within this folder.

        Args:
            query: Search query text
            filters: Optional metadata filters
            k: Number of results (default: 4)
            min_score: Minimum similarity threshold (default: 0.0)
            use_colpali: Whether to use ColPali-style embedding model

        Returns:
            List[FinalChunkResult]: List of relevant chunks
        """
        request = {
            "query": query,
            "filters": filters,
            "k": k,
            "min_score": min_score,
            "use_colpali": use_colpali,
            "folder_name": self._name,  # Add folder name here
        }

        response = self._client._request("POST", "retrieve/chunks", request)
        return self._client._logic._parse_chunk_result_list_response(response)

    def retrieve_docs(
        self,
        query: str,
        filters: Optional[Dict[str, Any]] = None,
        k: int = 4,
        min_score: float = 0.0,
        use_colpali: bool = True,
    ) -> List[DocumentResult]:
        """
        Retrieve relevant documents within this folder.

        Args:
            query: Search query text
            filters: Optional metadata filters
            k: Number of results (default: 4)
            min_score: Minimum similarity threshold (default: 0.0)
            use_colpali: Whether to use ColPali-style embedding model

        Returns:
            List[DocumentResult]: List of relevant documents
        """
        request = {
            "query": query,
            "filters": filters,
            "k": k,
            "min_score": min_score,
            "use_colpali": use_colpali,
            "folder_name": self._name,  # Add folder name here
        }

        response = self._client._request("POST", "retrieve/docs", request)
        return self._client._logic._parse_document_result_list_response(response)

    def query(
        self,
        query: str,
        filters: Optional[Dict[str, Any]] = None,
        k: int = 4,
        min_score: float = 0.0,
        max_tokens: Optional[int] = None,
        temperature: Optional[float] = None,
        use_colpali: bool = True,
        graph_name: Optional[str] = None,
        hop_depth: int = 1,
        include_paths: bool = False,
        prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
        schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
    ) -> CompletionResponse:
        """
        Generate completion using relevant chunks as context within this folder.

        Args:
            query: Query text
            filters: Optional metadata filters
            k: Number of chunks to use as context (default: 4)
            min_score: Minimum similarity threshold (default: 0.0)
            max_tokens: Maximum tokens in completion
            temperature: Model temperature
            use_colpali: Whether to use ColPali-style embedding model
            graph_name: Optional name of the graph to use for knowledge graph-enhanced retrieval
            hop_depth: Number of relationship hops to traverse in the graph (1-3)
            include_paths: Whether to include relationship paths in the response
            prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
            schema: Optional schema for structured output

        Returns:
            CompletionResponse: Generated completion
        """
        payload = self._client._logic._prepare_query_request(
            query,
            filters,
            k,
            min_score,
            max_tokens,
            temperature,
            use_colpali,
            graph_name,
            hop_depth,
            include_paths,
            prompt_overrides,
            self._name,
            None,
            schema,
        )

        # Add schema to payload if provided
        if schema:
            # If schema is a Pydantic model class, we need to serialize it to a schema dict
            if isinstance(schema, type) and issubclass(schema, BaseModel):
                payload["schema"] = schema.model_json_schema()
            else:
                payload["schema"] = schema

            # Add a hint to the query to return in JSON format
            payload["query"] = f"{payload['query']}\nReturn the answer in JSON format according to the required schema."

        response = self._client._request("POST", "query", data=payload)
        return self._client._logic._parse_completion_response(response)

    def list_documents(
        self, skip: int = 0, limit: int = 100, filters: Optional[Dict[str, Any]] = None
    ) -> List[Document]:
        """
        List accessible documents within this folder.

        Args:
            skip: Number of documents to skip
            limit: Maximum number of documents to return
            filters: Optional filters

        Returns:
            List[Document]: List of documents
        """
        params, data = self._client._logic._prepare_list_documents_request(skip, limit, filters, self._name, None)
        response = self._client._request("POST", "documents", data=data, params=params)
        docs = self._client._logic._parse_document_list_response(response)
        for doc in docs:
            doc._client = self._client
        return docs

    def batch_get_documents(self, document_ids: List[str]) -> List[Document]:
        """
        Retrieve multiple documents by their IDs in a single batch operation within this folder.

        Args:
            document_ids: List of document IDs to retrieve

        Returns:
            List[Document]: List of document metadata for found documents
        """
        request = {"document_ids": document_ids, "folder_name": self._name}

        response = self._client._request("POST", "batch/documents", data=request)
        docs = [self._client._logic._parse_document_response(doc) for doc in response]
        for doc in docs:
            doc._client = self._client
        return docs

    def batch_get_chunks(self, sources: List[Union[ChunkSource, Dict[str, Any]]]) -> List[FinalChunkResult]:
        """
        Retrieve specific chunks by their document ID and chunk number in a single batch operation within this folder.

        Args:
            sources: List of ChunkSource objects or dictionaries with document_id and chunk_number

        Returns:
            List[FinalChunkResult]: List of chunk results
        """
        # Convert to list of dictionaries if needed
        source_dicts = []
        for source in sources:
            if isinstance(source, dict):
                source_dicts.append(source)
            else:
                source_dicts.append(source.model_dump())

        # Add folder_name to request
        request = {"sources": source_dicts, "folder_name": self._name}

        response = self._client._request("POST", "batch/chunks", data=request)
        return self._client._logic._parse_chunk_result_list_response(response)

    def create_graph(
        self,
        name: str,
        filters: Optional[Dict[str, Any]] = None,
        documents: Optional[List[str]] = None,
        prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None,
    ) -> Graph:
        """
        Create a graph from documents within this folder.

        Args:
            name: Name of the graph to create
            filters: Optional metadata filters to determine which documents to include
            documents: Optional list of specific document IDs to include
            prompt_overrides: Optional customizations for entity extraction and resolution prompts

        Returns:
            Graph: The created graph object
        """
        # Convert prompt_overrides to dict if it's a model
        if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides):
            prompt_overrides = prompt_overrides.model_dump(exclude_none=True)

        request = {
            "name": name,
            "filters": filters,
            "documents": documents,
            "prompt_overrides": prompt_overrides,
            "folder_name": self._name,  # Add folder name here
        }

        response = self._client._request("POST", "graph/create", request)
        return self._client._logic._parse_graph_response(response)

    def update_graph(
        self,
        name: str,
        additional_filters: Optional[Dict[str, Any]] = None,
        additional_documents: Optional[List[str]] = None,
        prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None,
    ) -> Graph:
        """
        Update an existing graph with new documents from this folder.

        Args:
            name: Name of the graph to update
            additional_filters: Optional additional metadata filters to determine which new documents to include
            additional_documents: Optional list of additional document IDs to include
            prompt_overrides: Optional customizations for entity extraction and resolution prompts

        Returns:
            Graph: The updated graph
        """
        # Convert prompt_overrides to dict if it's a model
        if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides):
            prompt_overrides = prompt_overrides.model_dump(exclude_none=True)

        request = {
            "additional_filters": additional_filters,
            "additional_documents": additional_documents,
            "prompt_overrides": prompt_overrides,
            "folder_name": self._name,  # Add folder name here
        }

        response = self._client._request("POST", f"graph/{name}/update", request)
        return self._client._logic._parse_graph_response(response)

    def delete_document_by_filename(self, filename: str) -> Dict[str, str]:
        """
        Delete a document by its filename within this folder.

        Args:
            filename: Filename of the document to delete

        Returns:
            Dict[str, str]: Deletion status
        """
        # First get the document ID
        response = self._client._request("GET", f"documents/filename/{filename}", params={"folder_name": self._name})
        doc = self._client._logic._parse_document_response(response)

        # Then delete by ID
        return self._client.delete_document(doc.external_id)


class UserScope:
    """
    A user scope that allows operations to be scoped to a specific end user and optionally a folder.

    Args:
        client: The Morphik client instance
        end_user_id: The ID of the end user
        folder_name: Optional folder name to further scope operations
    """

    def __init__(self, client: "Morphik", end_user_id: str, folder_name: Optional[str] = None):
        self._client = client
        self._end_user_id = end_user_id
        self._folder_name = folder_name

    @property
    def end_user_id(self) -> str:
        """Returns the end user ID."""
        return self._end_user_id

    @property
    def folder_name(self) -> Optional[str]:
        """Returns the folder name if any."""
        return self._folder_name

    def ingest_text(
        self,
        content: str,
        filename: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        rules: Optional[List[RuleOrDict]] = None,
        use_colpali: bool = True,
    ) -> Document:
        """
        Ingest a text document into Morphik as this end user.

        Args:
            content: Text content to ingest
            filename: Optional file name
            metadata: Optional metadata dictionary
            rules: Optional list of rules to apply during ingestion
            use_colpali: Whether to use ColPali-style embedding model

        Returns:
            Document: Metadata of the ingested document
        """
        rules_list = [self._client._convert_rule(r) for r in (rules or [])]
        payload = self._client._logic._prepare_ingest_text_request(
            content,
            filename,
            metadata,
            rules_list,
            use_colpali,
            self._folder_name,
            self._end_user_id,
        )
        response = self._client._request("POST", "ingest/text", data=payload)
        doc = self._client._logic._parse_document_response(response)
        doc._client = self._client
        return doc

    def ingest_file(
        self,
        file: Union[str, bytes, BinaryIO, Path],
        filename: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        rules: Optional[List[RuleOrDict]] = None,
        use_colpali: bool = True,
    ) -> Document:
        """
        Ingest a file document into Morphik as this end user.

        Args:
            file: File to ingest (path string, bytes, file object, or Path)
            filename: Name of the file
            metadata: Optional metadata dictionary
            rules: Optional list of rules to apply during ingestion
            use_colpali: Whether to use ColPali-style embedding model

        Returns:
            Document: Metadata of the ingested document
        """
        # Handle different file input types
        if isinstance(file, (str, Path)):
            file_path = Path(file)
            if not file_path.exists():
                raise ValueError(f"File not found: {file}")
            filename = file_path.name if filename is None else filename
            with open(file_path, "rb") as f:
                content = f.read()
                file_obj = BytesIO(content)
        elif isinstance(file, bytes):
            if filename is None:
                raise ValueError("filename is required when ingesting bytes")
            file_obj = BytesIO(file)
        else:
            if filename is None:
                raise ValueError("filename is required when ingesting file object")
            file_obj = file

        try:
            # Prepare multipart form data
            files = {"file": (filename, file_obj)}

            # Add metadata and rules
            form_data = {
                "metadata": json.dumps(metadata or {}),
                "rules": json.dumps([self._client._convert_rule(r) for r in (rules or [])]),
                "end_user_id": self._end_user_id,  # Add end user ID here
            }

            # Add folder name if scoped to a folder
            if self._folder_name:
                form_data["folder_name"] = self._folder_name

            # use_colpali should be a query parameter as defined in the API
            response = self._client._request(
                "POST",
                "ingest/file",
                data=form_data,
                files=files,
                params={"use_colpali": str(use_colpali).lower()},
            )
            doc = self._client._logic._parse_document_response(response)
            doc._client = self._client
            return doc
        finally:
            # Close file if we opened it
            if isinstance(file, (str, Path)):
                file_obj.close()

    def ingest_files(
        self,
        files: List[Union[str, bytes, BinaryIO, Path]],
        metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
        rules: Optional[List[RuleOrDict]] = None,
        use_colpali: bool = True,
        parallel: bool = True,
    ) -> List[Document]:
        """
        Ingest multiple files into Morphik as this end user.

        Args:
            files: List of files to ingest
            metadata: Optional metadata
            rules: Optional list of rules to apply
            use_colpali: Whether to use ColPali-style embedding
            parallel: Whether to process files in parallel

        Returns:
            List[Document]: List of ingested documents
        """
        # Convert files to format expected by API
        file_objects = []
        for file in files:
            if isinstance(file, (str, Path)):
                path = Path(file)
                file_objects.append(("files", (path.name, open(path, "rb"))))
            elif isinstance(file, bytes):
                file_objects.append(("files", ("file.bin", file)))
            else:
                file_objects.append(("files", (getattr(file, "name", "file.bin"), file)))

        try:
            # Prepare request data
            # Convert rules appropriately
            if rules:
                if all(isinstance(r, list) for r in rules):
                    # List of lists - per-file rules
                    converted_rules = [[self._client._convert_rule(r) for r in rule_list] for rule_list in rules]
                else:
                    # Flat list - shared rules for all files
                    converted_rules = [self._client._convert_rule(r) for r in rules]
            else:
                converted_rules = []

            data = {
                "metadata": json.dumps(metadata or {}),
                "rules": json.dumps(converted_rules),
                # Remove use_colpali from form data - it should be a query param
                "parallel": str(parallel).lower(),
                "end_user_id": self._end_user_id,  # Add end user ID here
            }

            # Add folder name if scoped to a folder
            if self._folder_name:
                data["folder_name"] = self._folder_name

            response = self._client._request(
                "POST",
                "ingest/files",
                data=data,
                files=file_objects,
                params={"use_colpali": str(use_colpali).lower()},
            )

            if response.get("errors"):
                # Log errors but don't raise exception
                for error in response["errors"]:
                    logger.error(f"Failed to ingest {error['filename']}: {error['error']}")

            docs = [self._client._logic._parse_document_response(doc) for doc in response["documents"]]
            for doc in docs:
                doc._client = self._client
            return docs
        finally:
            # Clean up file objects
            for _, (_, file_obj) in file_objects:
                if isinstance(file_obj, (IOBase, BytesIO)) and not file_obj.closed:
                    file_obj.close()

    def ingest_directory(
        self,
        directory: Union[str, Path],
        recursive: bool = False,
        pattern: str = "*",
        metadata: Optional[Dict[str, Any]] = None,
        rules: Optional[List[RuleOrDict]] = None,
        use_colpali: bool = True,
        parallel: bool = True,
    ) -> List[Document]:
        """
        Ingest all files in a directory into Morphik as this end user.

        Args:
            directory: Path to directory containing files to ingest
            recursive: Whether to recursively process subdirectories
            pattern: Optional glob pattern to filter files
            metadata: Optional metadata dictionary to apply to all files
            rules: Optional list of rules to apply
            use_colpali: Whether to use ColPali-style embedding
            parallel: Whether to process files in parallel

        Returns:
            List[Document]: List of ingested documents
        """
        directory = Path(directory)
        if not directory.is_dir():
            raise ValueError(f"Directory not found: {directory}")

        # Collect all files matching pattern
        if recursive:
            files = list(directory.rglob(pattern))
        else:
            files = list(directory.glob(pattern))

        # Filter out directories
        files = [f for f in files if f.is_file()]

        if not files:
            return []

        # Use ingest_files with collected paths
        return self.ingest_files(
            files=files, metadata=metadata, rules=rules, use_colpali=use_colpali, parallel=parallel
        )

    def retrieve_chunks(
        self,
        query: str,
        filters: Optional[Dict[str, Any]] = None,
        k: int = 4,
        min_score: float = 0.0,
        use_colpali: bool = True,
    ) -> List[FinalChunkResult]:
        """
        Retrieve relevant chunks as this end user.

        Args:
            query: Search query text
            filters: Optional metadata filters
            k: Number of results (default: 4)
            min_score: Minimum similarity threshold (default: 0.0)
            use_colpali: Whether to use ColPali-style embedding model

        Returns:
            List[FinalChunkResult]: List of relevant chunks
        """
        request = {
            "query": query,
            "filters": filters,
            "k": k,
            "min_score": min_score,
            "use_colpali": use_colpali,
            "end_user_id": self._end_user_id,  # Add end user ID here
        }

        # Add folder name if scoped to a folder
        if self._folder_name:
            request["folder_name"] = self._folder_name

        response = self._client._request("POST", "retrieve/chunks", request)
        return self._client._logic._parse_chunk_result_list_response(response)

    def retrieve_docs(
        self,
        query: str,
        filters: Optional[Dict[str, Any]] = None,
        k: int = 4,
        min_score: float = 0.0,
        use_colpali: bool = True,
    ) -> List[DocumentResult]:
        """
        Retrieve relevant documents as this end user.

        Args:
            query: Search query text
            filters: Optional metadata filters
            k: Number of results (default: 4)
            min_score: Minimum similarity threshold (default: 0.0)
            use_colpali: Whether to use ColPali-style embedding model

        Returns:
            List[DocumentResult]: List of relevant documents
        """
        request = {
            "query": query,
            "filters": filters,
            "k": k,
            "min_score": min_score,
            "use_colpali": use_colpali,
            "end_user_id": self._end_user_id,  # Add end user ID here
        }

        # Add folder name if scoped to a folder
        if self._folder_name:
            request["folder_name"] = self._folder_name

        response = self._client._request("POST", "retrieve/docs", request)
        return self._client._logic._parse_document_result_list_response(response)

    def query(
        self,
        query: str,
        filters: Optional[Dict[str, Any]] = None,
        k: int = 4,
        min_score: float = 0.0,
        max_tokens: Optional[int] = None,
        temperature: Optional[float] = None,
        use_colpali: bool = True,
        graph_name: Optional[str] = None,
        hop_depth: int = 1,
        include_paths: bool = False,
        prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
        schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
    ) -> CompletionResponse:
        """
        Generate completion using relevant chunks as context as this end user.

        Args:
            query: Query text
            filters: Optional metadata filters
            k: Number of chunks to use as context (default: 4)
            min_score: Minimum similarity threshold (default: 0.0)
            max_tokens: Maximum tokens in completion
            temperature: Model temperature
            use_colpali: Whether to use ColPali-style embedding model
            graph_name: Optional name of the graph to use for knowledge graph-enhanced retrieval
            hop_depth: Number of relationship hops to traverse in the graph (1-3)
            include_paths: Whether to include relationship paths in the response
            prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
            schema: Optional schema for structured output

        Returns:
            CompletionResponse: Generated completion
        """
        payload = self._client._logic._prepare_query_request(
            query,
            filters,
            k,
            min_score,
            max_tokens,
            temperature,
            use_colpali,
            graph_name,
            hop_depth,
            include_paths,
            prompt_overrides,
            self._folder_name,
            self._end_user_id,
            schema,
        )

        # Add schema to payload if provided
        if schema:
            # If schema is a Pydantic model class, we need to serialize it to a schema dict
            if isinstance(schema, type) and issubclass(schema, BaseModel):
                payload["schema"] = schema.model_json_schema()
            else:
                payload["schema"] = schema

            # Add a hint to the query to return in JSON format
            payload["query"] = f"{payload['query']}\nReturn the answer in JSON format according to the required schema."

        response = self._client._request("POST", "query", data=payload)
        return self._client._logic._parse_completion_response(response)

    def list_documents(
        self, skip: int = 0, limit: int = 100, filters: Optional[Dict[str, Any]] = None
    ) -> List[Document]:
        """
        List accessible documents for this end user.

        Args:
            skip: Number of documents to skip
            limit: Maximum number of documents to return
            filters: Optional filters

        Returns:
            List[Document]: List of documents
        """
        # Add end_user_id and folder_name to params
        params = {"skip": skip, "limit": limit, "end_user_id": self._end_user_id}

        # Add folder name if scoped to a folder
        if self._folder_name:
            params["folder_name"] = self._folder_name

        response = self._client._request("POST", "documents", data=filters or {}, params=params)

        docs = [self._client._logic._parse_document_response(doc) for doc in response]
        for doc in docs:
            doc._client = self._client
        return docs

    def batch_get_documents(self, document_ids: List[str]) -> List[Document]:
        """
        Retrieve multiple documents by their IDs in a single batch operation for this end user.

        Args:
            document_ids: List of document IDs to retrieve

        Returns:
            List[Document]: List of document metadata for found documents
        """
        request = {"document_ids": document_ids, "end_user_id": self._end_user_id}

        # Add folder name if scoped to a folder
        if self._folder_name:
            request["folder_name"] = self._folder_name

        response = self._client._request("POST", "batch/documents", data=request)
        docs = [self._client._logic._parse_document_response(doc) for doc in response]
        for doc in docs:
            doc._client = self._client
        return docs

    def batch_get_chunks(self, sources: List[Union[ChunkSource, Dict[str, Any]]]) -> List[FinalChunkResult]:
        """
        Retrieve specific chunks by their document ID and chunk number in a single batch operation for this end user.

        Args:
            sources: List of ChunkSource objects or dictionaries with document_id and chunk_number

        Returns:
            List[FinalChunkResult]: List of chunk results
        """
        # Convert to list of dictionaries if needed
        source_dicts = []
        for source in sources:
            if isinstance(source, dict):
                source_dicts.append(source)
            else:
                source_dicts.append(source.model_dump())

        # Add end_user_id and folder_name to request
        request = {"sources": source_dicts, "end_user_id": self._end_user_id}

        # Add folder name if scoped to a folder
        if self._folder_name:
            request["folder_name"] = self._folder_name

        response = self._client._request("POST", "batch/chunks", data=request)
        return self._client._logic._parse_chunk_result_list_response(response)

    def create_graph(
        self,
        name: str,
        filters: Optional[Dict[str, Any]] = None,
        documents: Optional[List[str]] = None,
        prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None,
    ) -> Graph:
        """
        Create a graph from documents for this end user.

        Args:
            name: Name of the graph to create
            filters: Optional metadata filters to determine which documents to include
            documents: Optional list of specific document IDs to include
            prompt_overrides: Optional customizations for entity extraction and resolution prompts

        Returns:
            Graph: The created graph object
        """
        # Convert prompt_overrides to dict if it's a model
        if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides):
            prompt_overrides = prompt_overrides.model_dump(exclude_none=True)

        request = {
            "name": name,
            "filters": filters,
            "documents": documents,
            "prompt_overrides": prompt_overrides,
            "end_user_id": self._end_user_id,  # Add end user ID here
        }

        # Add folder name if scoped to a folder
        if self._folder_name:
            request["folder_name"] = self._folder_name

        response = self._client._request("POST", "graph/create", request)
        return self._client._logic._parse_graph_response(response)

    def update_graph(
        self,
        name: str,
        additional_filters: Optional[Dict[str, Any]] = None,
        additional_documents: Optional[List[str]] = None,
        prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None,
    ) -> Graph:
        """
        Update an existing graph with new documents for this end user.

        Args:
            name: Name of the graph to update
            additional_filters: Optional additional metadata filters to determine which new documents to include
            additional_documents: Optional list of additional document IDs to include
            prompt_overrides: Optional customizations for entity extraction and resolution prompts

        Returns:
            Graph: The updated graph
        """
        # Convert prompt_overrides to dict if it's a model
        if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides):
            prompt_overrides = prompt_overrides.model_dump(exclude_none=True)

        request = {
            "additional_filters": additional_filters,
            "additional_documents": additional_documents,
            "prompt_overrides": prompt_overrides,
            "end_user_id": self._end_user_id,  # Add end user ID here
        }

        # Add folder name if scoped to a folder
        if self._folder_name:
            request["folder_name"] = self._folder_name

        response = self._client._request("POST", f"graph/{name}/update", request)
        return self._client._logic._parse_graph_response(response)

    def delete_document_by_filename(self, filename: str) -> Dict[str, str]:
        """
        Delete a document by its filename for this end user.

        Args:
            filename: Filename of the document to delete

        Returns:
            Dict[str, str]: Deletion status
        """
        # Build parameters for the filename lookup
        params = {"end_user_id": self._end_user_id}

        # Add folder name if scoped to a folder
        if self._folder_name:
            params["folder_name"] = self._folder_name

        # First get the document ID
        response = self._client._request("GET", f"documents/filename/{filename}", params=params)
        doc = self._client._logic._parse_document_response(response)

        # Then delete by ID
        return self._client.delete_document(doc.external_id)


class Morphik:
    """
    Morphik client for document operations.

    Args:
        uri (str, optional): Morphik URI in format "morphik://<owner_id>:<token>@<host>".
            If not provided, connects to http://localhost:8000 without authentication.
        timeout (int, optional): Request timeout in seconds. Defaults to 30.
        is_local (bool, optional): Whether connecting to local development server. Defaults to False.

    Examples:
        ```python
        # Without authentication
        db = Morphik()

        # With authentication
        db = Morphik("morphik://owner_id:token@api.morphik.ai")
        ```
    """

    def __init__(self, uri: Optional[str] = None, timeout: int = 30, is_local: bool = False):
        self._logic = _MorphikClientLogic(uri, timeout, is_local)
        self._client = httpx.Client(timeout=self._logic._timeout, verify=not self._logic._is_local)

    def _request(
        self,
        method: str,
        endpoint: str,
        data: Optional[Dict[str, Any]] = None,
        files: Optional[Dict[str, Any]] = None,
        params: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """Make HTTP request"""
        url = self._logic._get_url(endpoint)
        headers = self._logic._get_headers()
        if self._logic._auth_token:  # Only add auth header if we have a token
            headers["Authorization"] = f"Bearer {self._logic._auth_token}"

        # Configure request data based on type
        if files:
            # When uploading files, we need to make sure not to set Content-Type
            # Remove Content-Type if it exists - httpx will set the correct multipart boundary
            if "Content-Type" in headers:
                del headers["Content-Type"]

            # For file uploads with form data, use form data (not json)
            request_data = {"files": files}
            if data:
                request_data["data"] = data

            # Files are now properly handled
        else:
            # JSON for everything else
            headers["Content-Type"] = "application/json"
            request_data = {"json": data}

        response = self._client.request(
            method,
            url,
            headers=headers,
            params=params,
            **request_data,
        )
        try:
            response.raise_for_status()
            return response.json()
        except httpx.HTTPStatusError as e:
            # Print error response for debugging
            print(f"Error response: {e.response.status_code} - {e.response.text}")
            raise

    def _convert_rule(self, rule: RuleOrDict) -> Dict[str, Any]:
        """Convert a rule to a dictionary format"""
        return self._logic._convert_rule(rule)

    def create_folder(self, name: str, description: Optional[str] = None) -> Folder:
        """
        Create a folder to scope operations.

        Args:
            name: The name of the folder
            description: Optional description for the folder

        Returns:
            Folder: A folder object ready for scoped operations
        """
        payload = {"name": name}
        if description:
            payload["description"] = description

        response = self._request("POST", "folders", data=payload)
        folder_info = FolderInfo(**response)

        # Return a usable Folder object with the ID from the response
        return Folder(self, name, folder_id=folder_info.id)

    def get_folder_by_name(self, name: str) -> Folder:
        """
        Get a folder by name to scope operations.

        Args:
            name: The name of the folder

        Returns:
            Folder: A folder object for scoped operations
        """
        return Folder(self, name)

    def get_folder(self, folder_id: str) -> Folder:
        """
        Get a folder by ID.

        Args:
            folder_id: ID of the folder

        Returns:
            Folder: A folder object for scoped operations
        """
        response = self._request("GET", f"folders/{folder_id}")
        return Folder(self, response["name"], folder_id)

    def list_folders(self) -> List[Folder]:
        """
        List all folders the user has access to as Folder objects.

        Returns:
            List[Folder]: List of Folder objects ready for operations
        """
        folder_infos = self._request("GET", "folders")
        return [Folder(self, info["name"], info["id"]) for info in folder_infos]

    def add_document_to_folder(self, folder_id: str, document_id: str) -> Dict[str, str]:
        """
        Add a document to a folder.

        Args:
            folder_id: ID of the folder
            document_id: ID of the document

        Returns:
            Dict[str, str]: Success status
        """
        response = self._request("POST", f"folders/{folder_id}/documents/{document_id}")
        return response

    def remove_document_from_folder(self, folder_id: str, document_id: str) -> Dict[str, str]:
        """
        Remove a document from a folder.

        Args:
            folder_id: ID of the folder
            document_id: ID of the document

        Returns:
            Dict[str, str]: Success status
        """
        response = self._request("DELETE", f"folders/{folder_id}/documents/{document_id}")
        return response

    def signin(self, end_user_id: str) -> UserScope:
        """
        Sign in as an end user to scope operations.

        Args:
            end_user_id: The ID of the end user

        Returns:
            UserScope: A user scope object for scoped operations
        """
        return UserScope(self, end_user_id)

    def ingest_text(
        self,
        content: str,
        filename: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        rules: Optional[List[RuleOrDict]] = None,
        use_colpali: bool = True,
    ) -> Document:
        """
        Ingest a text document into Morphik.

        Args:
            content: Text content to ingest
            metadata: Optional metadata dictionary
            rules: Optional list of rules to apply during ingestion. Can be:
                  - MetadataExtractionRule: Extract metadata using a schema
                  - NaturalLanguageRule: Transform content using natural language
            use_colpali: Whether to use ColPali-style embedding model to ingest the text
                (slower, but significantly better retrieval accuracy for text and images)
        Returns:
            Document: Metadata of the ingested document

        Example:
            ```python
            from morphik.rules import MetadataExtractionRule, NaturalLanguageRule
            from pydantic import BaseModel

            class DocumentInfo(BaseModel):
                title: str
                author: str
                date: str

            doc = db.ingest_text(
                "Machine learning is fascinating...",
                metadata={"category": "tech"},
                rules=[
                    # Extract metadata using schema
                    MetadataExtractionRule(schema=DocumentInfo),
                    # Transform content
                    NaturalLanguageRule(prompt="Shorten the content, use keywords")
                ]
            )
            ```
        """
        rules_list = [self._convert_rule(r) for r in (rules or [])]
        payload = self._logic._prepare_ingest_text_request(
            content, filename, metadata, rules_list, use_colpali, None, None
        )
        response = self._request("POST", "ingest/text", data=payload)
        doc = self._logic._parse_document_response(response)
        doc._client = self
        return doc

    def ingest_file(
        self,
        file: Union[str, bytes, BinaryIO, Path],
        filename: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        rules: Optional[List[RuleOrDict]] = None,
        use_colpali: bool = True,
    ) -> Document:
        """
        Ingest a file document into Morphik.

        Args:
            file: File to ingest (path string, bytes, file object, or Path)
            filename: Name of the file
            metadata: Optional metadata dictionary
            rules: Optional list of rules to apply during ingestion. Can be:
                  - MetadataExtractionRule: Extract metadata using a schema
                  - NaturalLanguageRule: Transform content using natural language
            use_colpali: Whether to use ColPali-style embedding model to ingest the file
                (slower, but significantly better retrieval accuracy for images)

        Returns:
            Document: Metadata of the ingested document

        Example:
            ```python
            from morphik.rules import MetadataExtractionRule, NaturalLanguageRule
            from pydantic import BaseModel

            class DocumentInfo(BaseModel):
                title: str
                author: str
                department: str

            doc = db.ingest_file(
                "document.pdf",
                filename="document.pdf",
                metadata={"category": "research"},
                rules=[
                    MetadataExtractionRule(schema=DocumentInfo),
                    NaturalLanguageRule(prompt="Extract key points only")
                ], # Optional
                use_colpali=True, # Optional
            )
            ```
        """
        # Process file input
        file_obj, filename = self._logic._prepare_file_for_upload(file, filename)

        try:
            # Prepare multipart form data
            files = {"file": (filename, file_obj)}

            # Create form data
            form_data = self._logic._prepare_ingest_file_form_data(metadata, rules, None, None)

            # use_colpali should be a query parameter as defined in the API
            response = self._request(
                "POST",
                "ingest/file",
                data=form_data,
                files=files,
                params={"use_colpali": str(use_colpali).lower()},
            )
            doc = self._logic._parse_document_response(response)
            doc._client = self
            return doc
        finally:
            # Close file if we opened it
            if isinstance(file, (str, Path)):
                file_obj.close()

    def ingest_files(
        self,
        files: List[Union[str, bytes, BinaryIO, Path]],
        metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
        rules: Optional[List[RuleOrDict]] = None,
        use_colpali: bool = True,
        parallel: bool = True,
    ) -> List[Document]:
        """
        Ingest multiple files into Morphik.

        Args:
            files: List of files to ingest (path strings, bytes, file objects, or Paths)
            metadata: Optional metadata (single dict for all files or list of dicts)
            rules: Optional list of rules to apply
            use_colpali: Whether to use ColPali-style embedding
            parallel: Whether to process files in parallel

        Returns:
            List[Document]: List of successfully ingested documents

        Raises:
            ValueError: If metadata list length doesn't match files length
        """
        # Convert files to format expected by API
        file_objects = self._logic._prepare_files_for_upload(files)

        try:
            # Prepare form data
            # Prepare form data - use_colpali should be a query parameter, not form data
            data = self._logic._prepare_ingest_files_form_data(metadata, rules, use_colpali, parallel, None, None)

            response = self._request(
                "POST",
                "ingest/files",
                data=data,
                files=file_objects,
                params={"use_colpali": str(use_colpali).lower()},
            )

            if response.get("errors"):
                # Log errors but don't raise exception
                for error in response["errors"]:
                    logger.error(f"Failed to ingest {error['filename']}: {error['error']}")

            docs = [self._logic._parse_document_response(doc) for doc in response["documents"]]
            for doc in docs:
                doc._client = self
            return docs
        finally:
            # Clean up file objects
            for _, (_, file_obj) in file_objects:
                if isinstance(file_obj, (IOBase, BytesIO)) and not file_obj.closed:
                    file_obj.close()

    def ingest_directory(
        self,
        directory: Union[str, Path],
        recursive: bool = False,
        pattern: str = "*",
        metadata: Optional[Dict[str, Any]] = None,
        rules: Optional[List[RuleOrDict]] = None,
        use_colpali: bool = True,
        parallel: bool = True,
    ) -> List[Document]:
        """
        Ingest all files in a directory into Morphik.

        Args:
            directory: Path to directory containing files to ingest
            recursive: Whether to recursively process subdirectories
            pattern: Optional glob pattern to filter files (e.g. "*.pdf")
            metadata: Optional metadata dictionary to apply to all files
            rules: Optional list of rules to apply
            use_colpali: Whether to use ColPali-style embedding
            parallel: Whether to process files in parallel

        Returns:
            List[Document]: List of ingested documents

        Raises:
            ValueError: If directory not found
        """
        directory = Path(directory)
        if not directory.is_dir():
            raise ValueError(f"Directory not found: {directory}")

        # Collect all files matching pattern
        if recursive:
            files = list(directory.rglob(pattern))
        else:
            files = list(directory.glob(pattern))

        # Filter out directories
        files = [f for f in files if f.is_file()]

        if not files:
            return []

        # Use ingest_files with collected paths
        return self.ingest_files(
            files=files, metadata=metadata, rules=rules, use_colpali=use_colpali, parallel=parallel
        )

    def retrieve_chunks(
        self,
        query: str,
        filters: Optional[Dict[str, Any]] = None,
        k: int = 4,
        min_score: float = 0.0,
        use_colpali: bool = True,
    ) -> List[FinalChunkResult]:
        """
        Retrieve relevant chunks.

        Args:
            query: Search query text
            filters: Optional metadata filters
            k: Number of results (default: 4)
            min_score: Minimum similarity threshold (default: 0.0)
            use_colpali: Whether to use ColPali-style embedding model to retrieve the chunks
                (only works for documents ingested with `use_colpali=True`)
        Returns:
            List[ChunkResult]

        Example:
            ```python
            chunks = db.retrieve_chunks(
                "What are the key findings?",
                filters={"department": "research"}
            )
            ```
        """
        payload = self._logic._prepare_retrieve_chunks_request(query, filters, k, min_score, use_colpali, None, None)
        response = self._request("POST", "retrieve/chunks", data=payload)
        return self._logic._parse_chunk_result_list_response(response)

    def retrieve_docs(
        self,
        query: str,
        filters: Optional[Dict[str, Any]] = None,
        k: int = 4,
        min_score: float = 0.0,
        use_colpali: bool = True,
    ) -> List[DocumentResult]:
        """
        Retrieve relevant documents.

        Args:
            query: Search query text
            filters: Optional metadata filters
            k: Number of results (default: 4)
            min_score: Minimum similarity threshold (default: 0.0)
            use_colpali: Whether to use ColPali-style embedding model to retrieve the documents
                (only works for documents ingested with `use_colpali=True`)
        Returns:
            List[DocumentResult]

        Example:
            ```python
            docs = db.retrieve_docs(
                "machine learning",
                k=5
            )
            ```
        """
        payload = self._logic._prepare_retrieve_docs_request(query, filters, k, min_score, use_colpali, None, None)
        response = self._request("POST", "retrieve/docs", data=payload)
        return self._logic._parse_document_result_list_response(response)

    def query(
        self,
        query: str,
        filters: Optional[Dict[str, Any]] = None,
        k: int = 4,
        min_score: float = 0.0,
        max_tokens: Optional[int] = None,
        temperature: Optional[float] = None,
        use_colpali: bool = True,
        graph_name: Optional[str] = None,
        hop_depth: int = 1,
        include_paths: bool = False,
        prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None,
        schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
    ) -> CompletionResponse:
        """
        Generate completion using relevant chunks as context.

        Args:
            query: Query text
            filters: Optional metadata filters
            k: Number of chunks to use as context (default: 4)
            min_score: Minimum similarity threshold (default: 0.0)
            max_tokens: Maximum tokens in completion
            temperature: Model temperature
            use_colpali: Whether to use ColPali-style embedding model to generate the completion
                (only works for documents ingested with `use_colpali=True`)
            graph_name: Optional name of the graph to use for knowledge graph-enhanced retrieval
            hop_depth: Number of relationship hops to traverse in the graph (1-3)
            include_paths: Whether to include relationship paths in the response
            prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
                Either a QueryPromptOverrides object or a dictionary with the same structure
            schema: Optional schema for structured output, can be a Pydantic model or a JSON schema dict
        Returns:
            CompletionResponse

        Example:
            ```python
            # Standard query
            response = db.query(
                "What are the key findings about customer satisfaction?",
                filters={"department": "research"},
                temperature=0.7
            )

            # Knowledge graph enhanced query
            response = db.query(
                "How does product X relate to customer segment Y?",
                graph_name="market_graph",
                hop_depth=2,
                include_paths=True
            )

            # With prompt customization
            from morphik.models import QueryPromptOverride, QueryPromptOverrides
            response = db.query(
                "What are the key findings?",
                prompt_overrides=QueryPromptOverrides(
                    query=QueryPromptOverride(
                        prompt_template="Answer the question in a formal, academic tone: {question}"
                    )
                )
            )

            # Or using a dictionary
            response = db.query(
                "What are the key findings?",
                prompt_overrides={
                    "query": {
                        "prompt_template": "Answer the question in a formal, academic tone: {question}"
                    }
                }
            )

            print(response.completion)

            # If include_paths=True, you can inspect the graph paths
            if response.metadata and "graph" in response.metadata:
                for path in response.metadata["graph"]["paths"]:
                    print(" -> ".join(path))

            # Using structured output with a Pydantic model
            from pydantic import BaseModel

            class ResearchFindings(BaseModel):
                main_finding: str
                supporting_evidence: List[str]
                limitations: List[str]

            response = db.query(
                "Summarize the key research findings from these documents",
                schema=ResearchFindings
            )

            # Access structured output
            if response.structured_output:
                findings = response.structured_output
                print(f"Main finding: {findings.main_finding}")
                print("Supporting evidence:")
                for evidence in findings.supporting_evidence:
                    print(f"- {evidence}")
            ```
        """
        payload = self._logic._prepare_query_request(
            query,
            filters,
            k,
            min_score,
            max_tokens,
            temperature,
            use_colpali,
            graph_name,
            hop_depth,
            include_paths,
            prompt_overrides,
            None,
            None,
            schema,
        )

        # Add schema to payload if provided
        if schema:
            # If schema is a Pydantic model class, we need to serialize it to a schema dict
            if isinstance(schema, type) and issubclass(schema, BaseModel):
                payload["schema"] = schema.model_json_schema()
            else:
                payload["schema"] = schema

            # Add a hint to the query to return in JSON format
            payload["query"] = f"{payload['query']}\nReturn the answer in JSON format according to the required schema."

        response = self._request("POST", "query", data=payload)
        return self._logic._parse_completion_response(response)

    def list_documents(
        self, skip: int = 0, limit: int = 100, filters: Optional[Dict[str, Any]] = None
    ) -> List[Document]:
        """
        List accessible documents.

        Args:
            skip: Number of documents to skip
            limit: Maximum number of documents to return
            filters: Optional filters

        Returns:
            List[Document]: List of accessible documents

        Example:
            ```python
            # Get first page
            docs = db.list_documents(limit=10)

            # Get next page
            next_page = db.list_documents(skip=10, limit=10, filters={"department": "research"})
            ```
        """
        params, data = self._logic._prepare_list_documents_request(skip, limit, filters, None, None)
        response = self._request("POST", "documents", data=data, params=params)
        docs = self._logic._parse_document_list_response(response)
        for doc in docs:
            doc._client = self
        return docs

    def get_document(self, document_id: str) -> Document:
        """
        Get document metadata by ID.

        Args:
            document_id: ID of the document

        Returns:
            Document: Document metadata

        Example:
            ```python
            doc = db.get_document("doc_123")
            print(f"Title: {doc.metadata.get('title')}")
            ```
        """
        response = self._request("GET", f"documents/{document_id}")
        doc = self._logic._parse_document_response(response)
        doc._client = self
        return doc

    def get_document_status(self, document_id: str) -> Dict[str, Any]:
        """
        Get the current processing status of a document.

        Args:
            document_id: ID of the document to check

        Returns:
            Dict[str, Any]: Status information including current status, potential errors, and other metadata

        Example:
            ```python
            status = db.get_document_status("doc_123")
            if status["status"] == "completed":
                print("Document processing complete")
            elif status["status"] == "failed":
                print(f"Processing failed: {status['error']}")
            else:
                print("Document still processing...")
            ```
        """
        response = self._request("GET", f"documents/{document_id}/status")
        return response

    def wait_for_document_completion(self, document_id: str, timeout_seconds=300, check_interval_seconds=2) -> Document:
        """
        Wait for a document's processing to complete.

        Args:
            document_id: ID of the document to wait for
            timeout_seconds: Maximum time to wait for completion (default: 300 seconds)
            check_interval_seconds: Time between status checks (default: 2 seconds)

        Returns:
            Document: Updated document with the latest status

        Raises:
            TimeoutError: If processing doesn't complete within the timeout period
            ValueError: If processing fails with an error

        Example:
            ```python
            # Upload a file and wait for processing to complete
            doc = db.ingest_file("large_document.pdf")
            try:
                completed_doc = db.wait_for_document_completion(doc.external_id)
                print(f"Processing complete! Document has {len(completed_doc.chunk_ids)} chunks")
            except TimeoutError:
                print("Processing is taking too long")
            except ValueError as e:
                print(f"Processing failed: {e}")
            ```
        """
        import time

        start_time = time.time()

        while (time.time() - start_time) < timeout_seconds:
            status = self.get_document_status(document_id)

            if status["status"] == "completed":
                # Get the full document now that it's complete
                return self.get_document(document_id)
            elif status["status"] == "failed":
                raise ValueError(f"Document processing failed: {status.get('error', 'Unknown error')}")

            # Wait before checking again
            time.sleep(check_interval_seconds)

        raise TimeoutError(f"Document processing did not complete within {timeout_seconds} seconds")

    def get_document_by_filename(self, filename: str) -> Document:
        """
        Get document metadata by filename.
        If multiple documents have the same filename, returns the most recently updated one.

        Args:
            filename: Filename of the document to retrieve

        Returns:
            Document: Document metadata

        Example:
            ```python
            doc = db.get_document_by_filename("report.pdf")
            print(f"Document ID: {doc.external_id}")
            ```
        """
        response = self._request("GET", f"documents/filename/{filename}")
        doc = self._logic._parse_document_response(response)
        doc._client = self
        return doc

    def update_document_with_text(
        self,
        document_id: str,
        content: str,
        filename: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        rules: Optional[List] = None,
        update_strategy: str = "add",
        use_colpali: Optional[bool] = None,
    ) -> Document:
        """
        Update a document with new text content using the specified strategy.

        Args:
            document_id: ID of the document to update
            content: The new content to add
            filename: Optional new filename for the document
            metadata: Additional metadata to update (optional)
            rules: Optional list of rules to apply to the content
            update_strategy: Strategy for updating the document (currently only 'add' is supported)
            use_colpali: Whether to use multi-vector embedding

        Returns:
            Document: Updated document metadata

        Example:
            ```python
            # Add new content to an existing document
            updated_doc = db.update_document_with_text(
                document_id="doc_123",
                content="This is additional content that will be appended to the document.",
                filename="updated_document.txt",
                metadata={"category": "updated"},
                update_strategy="add"
            )
            print(f"Document version: {updated_doc.system_metadata.get('version')}")
            ```
        """
        # Use the dedicated text update endpoint
        request = IngestTextRequest(
            content=content,
            filename=filename,
            metadata=metadata or {},
            rules=[self._convert_rule(r) for r in (rules or [])],
            use_colpali=use_colpali if use_colpali is not None else True,
        )

        params = {}
        if update_strategy != "add":
            params["update_strategy"] = update_strategy

        response = self._request(
            "POST", f"documents/{document_id}/update_text", data=request.model_dump(), params=params
        )

        doc = self._logic._parse_document_response(response)
        doc._client = self
        return doc

    def update_document_with_file(
        self,
        document_id: str,
        file: Union[str, bytes, BinaryIO, Path],
        filename: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        rules: Optional[List] = None,
        update_strategy: str = "add",
        use_colpali: Optional[bool] = None,
    ) -> Document:
        """
        Update a document with content from a file using the specified strategy.

        Args:
            document_id: ID of the document to update
            file: File to add (path string, bytes, file object, or Path)
            filename: Name of the file
            metadata: Additional metadata to update (optional)
            rules: Optional list of rules to apply to the content
            update_strategy: Strategy for updating the document (currently only 'add' is supported)
            use_colpali: Whether to use multi-vector embedding

        Returns:
            Document: Updated document metadata

        Example:
            ```python
            # Add content from a file to an existing document
            updated_doc = db.update_document_with_file(
                document_id="doc_123",
                file="path/to/update.pdf",
                metadata={"status": "updated"},
                update_strategy="add"
            )
            print(f"Document version: {updated_doc.system_metadata.get('version')}")
            ```
        """
        # Handle different file input types
        if isinstance(file, (str, Path)):
            file_path = Path(file)
            if not file_path.exists():
                raise ValueError(f"File not found: {file}")
            filename = file_path.name if filename is None else filename
            with open(file_path, "rb") as f:
                content = f.read()
                file_obj = BytesIO(content)
        elif isinstance(file, bytes):
            if filename is None:
                raise ValueError("filename is required when updating with bytes")
            file_obj = BytesIO(file)
        else:
            if filename is None:
                raise ValueError("filename is required when updating with file object")
            file_obj = file

        try:
            # Prepare multipart form data
            files = {"file": (filename, file_obj)}

            # Convert metadata and rules to JSON strings
            form_data = {
                "metadata": json.dumps(metadata or {}),
                "rules": json.dumps([self._convert_rule(r) for r in (rules or [])]),
                "update_strategy": update_strategy,
            }

            if use_colpali is not None:
                form_data["use_colpali"] = str(use_colpali).lower()

            # Use the dedicated file update endpoint
            response = self._request("POST", f"documents/{document_id}/update_file", data=form_data, files=files)

            doc = self._logic._parse_document_response(response)
            doc._client = self
            return doc
        finally:
            # Close file if we opened it
            if isinstance(file, (str, Path)):
                file_obj.close()

    def update_document_metadata(
        self,
        document_id: str,
        metadata: Dict[str, Any],
    ) -> Document:
        """
        Update a document's metadata only.

        Args:
            document_id: ID of the document to update
            metadata: Metadata to update

        Returns:
            Document: Updated document metadata

        Example:
            ```python
            # Update just the metadata of a document
            updated_doc = db.update_document_metadata(
                document_id="doc_123",
                metadata={"status": "reviewed", "reviewer": "Jane Smith"}
            )
            print(f"Updated metadata: {updated_doc.metadata}")
            ```
        """
        # Use the dedicated metadata update endpoint
        response = self._request("POST", f"documents/{document_id}/update_metadata", data=metadata)
        doc = self._logic._parse_document_response(response)
        doc._client = self
        return doc

    def update_document_by_filename_with_text(
        self,
        filename: str,
        content: str,
        new_filename: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        rules: Optional[List] = None,
        update_strategy: str = "add",
        use_colpali: Optional[bool] = None,
    ) -> Document:
        """
        Update a document identified by filename with new text content using the specified strategy.

        Args:
            filename: Filename of the document to update
            content: The new content to add
            new_filename: Optional new filename for the document
            metadata: Additional metadata to update (optional)
            rules: Optional list of rules to apply to the content
            update_strategy: Strategy for updating the document (currently only 'add' is supported)
            use_colpali: Whether to use multi-vector embedding

        Returns:
            Document: Updated document metadata

        Example:
            ```python
            # Add new content to an existing document identified by filename
            updated_doc = db.update_document_by_filename_with_text(
                filename="report.pdf",
                content="This is additional content that will be appended to the document.",
                new_filename="updated_report.pdf",
                metadata={"category": "updated"},
                update_strategy="add"
            )
            print(f"Document version: {updated_doc.system_metadata.get('version')}")
            ```
        """
        # First get the document by filename to obtain its ID
        doc = self.get_document_by_filename(filename)

        # Then use the regular update_document_with_text endpoint with the document ID
        return self.update_document_with_text(
            document_id=doc.external_id,
            content=content,
            filename=new_filename,
            metadata=metadata,
            rules=rules,
            update_strategy=update_strategy,
            use_colpali=use_colpali,
        )

    def update_document_by_filename_with_file(
        self,
        filename: str,
        file: Union[str, bytes, BinaryIO, Path],
        new_filename: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        rules: Optional[List] = None,
        update_strategy: str = "add",
        use_colpali: Optional[bool] = None,
    ) -> Document:
        """
        Update a document identified by filename with content from a file using the specified strategy.

        Args:
            filename: Filename of the document to update
            file: File to add (path string, bytes, file object, or Path)
            new_filename: Optional new filename for the document (defaults to the filename of the file)
            metadata: Additional metadata to update (optional)
            rules: Optional list of rules to apply to the content
            update_strategy: Strategy for updating the document (currently only 'add' is supported)
            use_colpali: Whether to use multi-vector embedding

        Returns:
            Document: Updated document metadata

        Example:
            ```python
            # Add content from a file to an existing document identified by filename
            updated_doc = db.update_document_by_filename_with_file(
                filename="report.pdf",
                file="path/to/update.pdf",
                metadata={"status": "updated"},
                update_strategy="add"
            )
            print(f"Document version: {updated_doc.system_metadata.get('version')}")
            ```
        """
        # First get the document by filename to obtain its ID
        doc = self.get_document_by_filename(filename)

        # Then use the regular update_document_with_file endpoint with the document ID
        return self.update_document_with_file(
            document_id=doc.external_id,
            file=file,
            filename=new_filename,
            metadata=metadata,
            rules=rules,
            update_strategy=update_strategy,
            use_colpali=use_colpali,
        )

    def update_document_by_filename_metadata(
        self,
        filename: str,
        metadata: Dict[str, Any],
        new_filename: Optional[str] = None,
    ) -> Document:
        """
        Update a document's metadata using filename to identify the document.

        Args:
            filename: Filename of the document to update
            metadata: Metadata to update
            new_filename: Optional new filename to assign to the document

        Returns:
            Document: Updated document metadata

        Example:
            ```python
            # Update just the metadata of a document identified by filename
            updated_doc = db.update_document_by_filename_metadata(
                filename="report.pdf",
                metadata={"status": "reviewed", "reviewer": "Jane Smith"},
                new_filename="reviewed_report.pdf"  # Optional: rename the file
            )
            print(f"Updated metadata: {updated_doc.metadata}")
            ```
        """
        # First get the document by filename to obtain its ID
        doc = self.get_document_by_filename(filename)

        # Update the metadata
        result = self.update_document_metadata(
            document_id=doc.external_id,
            metadata=metadata,
        )

        # If new_filename is provided, update the filename as well
        if new_filename:
            # Create a request that retains the just-updated metadata but also changes filename
            combined_metadata = result.metadata.copy()

            # Update the document again with filename change and the same metadata
            response = self._request(
                "POST",
                f"documents/{doc.external_id}/update_text",
                data={
                    "content": "",
                    "filename": new_filename,
                    "metadata": combined_metadata,
                    "rules": [],
                },
            )
            result = self._logic._parse_document_response(response)
            result._client = self

        return result

    def batch_get_documents(self, document_ids: List[str]) -> List[Document]:
        """
        Retrieve multiple documents by their IDs in a single batch operation.

        Args:
            document_ids: List of document IDs to retrieve

        Returns:
            List[Document]: List of document metadata for found documents

        Example:
            ```python
            docs = db.batch_get_documents(["doc_123", "doc_456", "doc_789"])
            for doc in docs:
                print(f"Document {doc.external_id}: {doc.metadata.get('title')}")
            ```
        """
        # API expects a dict with document_ids key, not a direct list
        response = self._request("POST", "batch/documents", data={"document_ids": document_ids})
        docs = self._logic._parse_document_list_response(response)
        for doc in docs:
            doc._client = self
        return docs

    def batch_get_chunks(self, sources: List[Union[ChunkSource, Dict[str, Any]]]) -> List[FinalChunkResult]:
        """
        Retrieve specific chunks by their document ID and chunk number in a single batch operation.

        Args:
            sources: List of ChunkSource objects or dictionaries with document_id and chunk_number

        Returns:
            List[FinalChunkResult]: List of chunk results

        Example:
            ```python
            # Using dictionaries
            sources = [
                {"document_id": "doc_123", "chunk_number": 0},
                {"document_id": "doc_456", "chunk_number": 2}
            ]

            # Or using ChunkSource objects
            from morphik.models import ChunkSource
            sources = [
                ChunkSource(document_id="doc_123", chunk_number=0),
                ChunkSource(document_id="doc_456", chunk_number=2)
            ]

            chunks = db.batch_get_chunks(sources)
            for chunk in chunks:
                print(f"Chunk from {chunk.document_id}, number {chunk.chunk_number}: {chunk.content[:50]}...")
            ```
        """
        # Convert to list of dictionaries if needed
        source_dicts = []
        for source in sources:
            if isinstance(source, dict):
                source_dicts.append(source)
            else:
                source_dicts.append(source.model_dump())

        response = self._request("POST", "batch/chunks", data=source_dicts)
        return self._logic._parse_chunk_result_list_response(response)

    def create_cache(
        self,
        name: str,
        model: str,
        gguf_file: str,
        filters: Optional[Dict[str, Any]] = None,
        docs: Optional[List[str]] = None,
    ) -> Dict[str, Any]:
        """
        Create a new cache with specified configuration.

        Args:
            name: Name of the cache to create
            model: Name of the model to use (e.g. "llama2")
            gguf_file: Name of the GGUF file to use for the model
            filters: Optional metadata filters to determine which documents to include.
                These filters will be applied in addition to any specific docs provided.
            docs: Optional list of specific document IDs to include.
                These docs will be included in addition to any documents matching the filters.

        Returns:
            Dict[str, Any]: Created cache configuration

        Example:
            ```python
            # This will include both:
            # 1. Any documents with category="programming"
            # 2. The specific documents "doc1" and "doc2" (regardless of their category)
            cache = db.create_cache(
                name="programming_cache",
                model="llama2",
                gguf_file="llama-2-7b-chat.Q4_K_M.gguf",
                filters={"category": "programming"},
                docs=["doc1", "doc2"]
            )
            ```
        """
        # Build query parameters for name, model and gguf_file
        params = {"name": name, "model": model, "gguf_file": gguf_file}

        # Build request body for filters and docs
        request = {"filters": filters, "docs": docs}

        response = self._request("POST", "cache/create", request, params=params)
        return response

    def get_cache(self, name: str) -> Cache:
        """
        Get a cache by name.

        Args:
            name: Name of the cache to retrieve

        Returns:
            cache: A cache object that is used to interact with the cache.

        Example:
            ```python
            cache = db.get_cache("programming_cache")
            ```
        """
        response = self._request("GET", f"cache/{name}")
        if response.get("exists", False):
            return Cache(self, name)
        raise ValueError(f"Cache '{name}' not found")

    def create_graph(
        self,
        name: str,
        filters: Optional[Dict[str, Any]] = None,
        documents: Optional[List[str]] = None,
        prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None,
    ) -> Graph:
        """
        Create a graph from documents.

        This method extracts entities and relationships from documents
        matching the specified filters or document IDs and creates a graph.

        Args:
            name: Name of the graph to create
            filters: Optional metadata filters to determine which documents to include
            documents: Optional list of specific document IDs to include
            prompt_overrides: Optional customizations for entity extraction and resolution prompts
                Either a GraphPromptOverrides object or a dictionary with the same structure

        Returns:
            Graph: The created graph object

        Example:
            ```python
            # Create a graph from documents with category="research"
            graph = db.create_graph(
                name="research_graph",
                filters={"category": "research"}
            )

            # Create a graph from specific documents
            graph = db.create_graph(
                name="custom_graph",
                documents=["doc1", "doc2", "doc3"]
            )

            # With custom entity extraction examples
            from morphik.models import EntityExtractionPromptOverride, EntityExtractionExample, GraphPromptOverrides
            graph = db.create_graph(
                name="medical_graph",
                filters={"category": "medical"},
                prompt_overrides=GraphPromptOverrides(
                    entity_extraction=EntityExtractionPromptOverride(
                        examples=[
                            EntityExtractionExample(label="Insulin", type="MEDICATION"),
                            EntityExtractionExample(label="Diabetes", type="CONDITION")
                        ]
                    )
                )
            )
            ```
        """
        # Convert prompt_overrides to dict if it's a model
        if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides):
            prompt_overrides = prompt_overrides.model_dump(exclude_none=True)

        # Initialize request with required fields
        request = {"name": name}

        # Add optional fields only if they are not None
        if filters is not None:
            request["filters"] = filters
        if documents is not None:
            request["documents"] = documents
        if prompt_overrides is not None:
            request["prompt_overrides"] = prompt_overrides

        response = self._request("POST", "graph/create", request)
        return self._logic._parse_graph_response(response)

    def get_graph(self, name: str) -> Graph:
        """
        Get a graph by name.

        Args:
            name: Name of the graph to retrieve

        Returns:
            Graph: The requested graph object

        Example:
            ```python
            # Get a graph by name
            graph = db.get_graph("finance_graph")
            print(f"Graph has {len(graph.entities)} entities and {len(graph.relationships)} relationships")
            ```
        """
        response = self._request("GET", f"graph/{name}")
        return self._logic._parse_graph_response(response)

    def list_graphs(self) -> List[Graph]:
        """
        List all graphs the user has access to.

        Returns:
            List[Graph]: List of graph objects

        Example:
            ```python
            # List all accessible graphs
            graphs = db.list_graphs()
            for graph in graphs:
                print(f"Graph: {graph.name}, Entities: {len(graph.entities)}")
            ```
        """
        response = self._request("GET", "graphs")
        return self._logic._parse_graph_list_response(response)

    def update_graph(
        self,
        name: str,
        additional_filters: Optional[Dict[str, Any]] = None,
        additional_documents: Optional[List[str]] = None,
        prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None,
    ) -> Graph:
        """
        Update an existing graph with new documents.

        This method processes additional documents matching the original or new filters,
        extracts entities and relationships, and updates the graph with new information.

        Args:
            name: Name of the graph to update
            additional_filters: Optional additional metadata filters to determine which new documents to include
            additional_documents: Optional list of additional document IDs to include
            prompt_overrides: Optional customizations for entity extraction and resolution prompts
                Either a GraphPromptOverrides object or a dictionary with the same structure

        Returns:
            Graph: The updated graph

        Example:
            ```python
            # Update a graph with new documents
            updated_graph = db.update_graph(
                name="research_graph",
                additional_filters={"category": "new_research"},
                additional_documents=["doc4", "doc5"]
            )
            print(f"Graph now has {len(updated_graph.entities)} entities")

            # With entity resolution examples
            from morphik.models import EntityResolutionPromptOverride, EntityResolutionExample, GraphPromptOverrides
            updated_graph = db.update_graph(
                name="research_graph",
                additional_documents=["doc4"],
                prompt_overrides=GraphPromptOverrides(
                    entity_resolution=EntityResolutionPromptOverride(
                        examples=[
                            EntityResolutionExample(
                                canonical="Machine Learning",
                                variants=["ML", "machine learning", "AI/ML"]
                            )
                        ]
                    )
                )
            )
            ```
        """
        # Convert prompt_overrides to dict if it's a model
        if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides):
            prompt_overrides = prompt_overrides.model_dump(exclude_none=True)

        request = {
            "additional_filters": additional_filters,
            "additional_documents": additional_documents,
            "prompt_overrides": prompt_overrides,
        }

        response = self._request("POST", f"graph/{name}/update", request)
        return self._logic._parse_graph_response(response)

    def delete_document(self, document_id: str) -> Dict[str, str]:
        """
        Delete a document and all its associated data.

        This method deletes a document and all its associated data, including:
        - Document metadata
        - Document content in storage
        - Document chunks and embeddings in vector store

        Args:
            document_id: ID of the document to delete

        Returns:
            Dict[str, str]: Deletion status

        Example:
            ```python
            # Delete a document
            result = db.delete_document("doc_123")
            print(result["message"])  # Document doc_123 deleted successfully
            ```
        """
        response = self._request("DELETE", f"documents/{document_id}")
        return response

    def delete_document_by_filename(self, filename: str) -> Dict[str, str]:
        """
        Delete a document by its filename.

        This is a convenience method that first retrieves the document ID by filename
        and then deletes the document by ID.

        Args:
            filename: Filename of the document to delete

        Returns:
            Dict[str, str]: Deletion status

        Example:
            ```python
            # Delete a document by filename
            result = db.delete_document_by_filename("report.pdf")
            print(result["message"])
            ```
        """
        # First get the document by filename to obtain its ID
        doc = self.get_document_by_filename(filename)

        # Then delete the document by ID
        return self.delete_document(doc.external_id)

    def close(self):
        """Close the HTTP client"""
        self._client.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def create_app(self, app_id: str, name: str, expiry_days: int = 30) -> Dict[str, str]:
        """Create a new application in Morphik Cloud and obtain its auth URI.

        This wraps the enterprise endpoint ``/ee/create_app`` which
        returns a dictionary ``{\"uri\": ..., \"app_id\": ...}``.

        Parameters
        ----------
        app_id:
            Identifier for the new application.
        name:
            Human-readable application name (will be slugified by the server).
        expiry_days:
            Token validity period.  Defaults to 30 days.
        """

        payload = {"app_id": app_id, "name": name, "expiry_days": expiry_days}
        return self._request("POST", "ee/create_app", data=payload)