import json import logging from io import BytesIO, IOBase from pathlib import Path from typing import Dict, Any, List, Optional, Union, BinaryIO from PIL import Image from PIL.Image import Image as PILImage import httpx from .models import ( Document, DocumentResult, CompletionResponse, IngestTextRequest, ChunkSource, Graph, # Prompt override models GraphPromptOverrides, QueryPromptOverrides, ) from .rules import Rule from ._internal import _MorphikClientLogic, FinalChunkResult, RuleOrDict logger = logging.getLogger(__name__) class Cache: def __init__(self, db: "Morphik", name: str): self._db = db self._name = name def update(self) -> bool: response = self._db._request("POST", f"cache/{self._name}/update") return response.get("success", False) def add_docs(self, docs: List[str]) -> bool: response = self._db._request("POST", f"cache/{self._name}/add_docs", {"docs": docs}) return response.get("success", False) def query( self, query: str, max_tokens: Optional[int] = None, temperature: Optional[float] = None ) -> CompletionResponse: response = self._db._request( "POST", f"cache/{self._name}/query", params={"query": query, "max_tokens": max_tokens, "temperature": temperature}, data="", ) return CompletionResponse(**response) class Folder: """ A folder that allows operations to be scoped to a specific folder. Args: client: The Morphik client instance name: The name of the folder """ def __init__(self, client: "Morphik", name: str): self._client = client self._name = name @property def name(self) -> str: """Returns the folder name.""" return self._name def signin(self, end_user_id: str) -> "UserScope": """ Returns a UserScope object scoped to this folder and the end user. Args: end_user_id: The ID of the end user Returns: UserScope: A user scope scoped to this folder and the end user """ return UserScope(client=self._client, end_user_id=end_user_id, folder_name=self._name) def ingest_text( self, content: str, filename: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, rules: Optional[List[RuleOrDict]] = None, use_colpali: bool = True, ) -> Document: """ Ingest a text document into Morphik within this folder. Args: content: Text content to ingest filename: Optional file name metadata: Optional metadata dictionary rules: Optional list of rules to apply during ingestion use_colpali: Whether to use ColPali-style embedding model Returns: Document: Metadata of the ingested document """ rules_list = [self._client._convert_rule(r) for r in (rules or [])] payload = self._client._logic._prepare_ingest_text_request( content, filename, metadata, rules_list, use_colpali, self._name, None ) response = self._client._request("POST", "ingest/text", data=payload) doc = self._client._logic._parse_document_response(response) doc._client = self._client return doc def ingest_file( self, file: Union[str, bytes, BinaryIO, Path], filename: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, rules: Optional[List[RuleOrDict]] = None, use_colpali: bool = True, ) -> Document: """ Ingest a file document into Morphik within this folder. Args: file: File to ingest (path string, bytes, file object, or Path) filename: Name of the file metadata: Optional metadata dictionary rules: Optional list of rules to apply during ingestion use_colpali: Whether to use ColPali-style embedding model Returns: Document: Metadata of the ingested document """ # Process file input file_obj, filename = self._client._logic._prepare_file_for_upload(file, filename) try: # Prepare multipart form data files = {"file": (filename, file_obj)} # Create form data form_data = self._client._logic._prepare_ingest_file_form_data( metadata, rules, self._name, None ) response = self._client._request( "POST", f"ingest/file?use_colpali={str(use_colpali).lower()}", data=form_data, files=files, ) doc = self._client._logic._parse_document_response(response) doc._client = self._client return doc finally: # Close file if we opened it if isinstance(file, (str, Path)): file_obj.close() def ingest_files( self, files: List[Union[str, bytes, BinaryIO, Path]], metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, rules: Optional[List[RuleOrDict]] = None, use_colpali: bool = True, parallel: bool = True, ) -> List[Document]: """ Ingest multiple files into Morphik within this folder. Args: files: List of files to ingest metadata: Optional metadata rules: Optional list of rules to apply use_colpali: Whether to use ColPali-style embedding parallel: Whether to process files in parallel Returns: List[Document]: List of ingested documents """ # Convert files to format expected by API file_objects = self._client._logic._prepare_files_for_upload(files) try: # Prepare form data data = self._client._logic._prepare_ingest_files_form_data( metadata, rules, use_colpali, parallel, self._name, None ) response = self._client._request("POST", "ingest/files", data=data, files=file_objects) if response.get("errors"): # Log errors but don't raise exception for error in response["errors"]: logger.error(f"Failed to ingest {error['filename']}: {error['error']}") docs = [ self._client._logic._parse_document_response(doc) for doc in response["documents"] ] for doc in docs: doc._client = self._client return docs finally: # Clean up file objects for _, (_, file_obj) in file_objects: if isinstance(file_obj, (IOBase, BytesIO)) and not file_obj.closed: file_obj.close() def ingest_directory( self, directory: Union[str, Path], recursive: bool = False, pattern: str = "*", metadata: Optional[Dict[str, Any]] = None, rules: Optional[List[RuleOrDict]] = None, use_colpali: bool = True, parallel: bool = True, ) -> List[Document]: """ Ingest all files in a directory into Morphik within this folder. Args: directory: Path to directory containing files to ingest recursive: Whether to recursively process subdirectories pattern: Optional glob pattern to filter files metadata: Optional metadata dictionary to apply to all files rules: Optional list of rules to apply use_colpali: Whether to use ColPali-style embedding parallel: Whether to process files in parallel Returns: List[Document]: List of ingested documents """ directory = Path(directory) if not directory.is_dir(): raise ValueError(f"Directory not found: {directory}") # Collect all files matching pattern if recursive: files = list(directory.rglob(pattern)) else: files = list(directory.glob(pattern)) # Filter out directories files = [f for f in files if f.is_file()] if not files: return [] # Use ingest_files with collected paths return self.ingest_files( files=files, metadata=metadata, rules=rules, use_colpali=use_colpali, parallel=parallel ) def retrieve_chunks( self, query: str, filters: Optional[Dict[str, Any]] = None, k: int = 4, min_score: float = 0.0, use_colpali: bool = True, ) -> List[FinalChunkResult]: """ Retrieve relevant chunks within this folder. Args: query: Search query text filters: Optional metadata filters k: Number of results (default: 4) min_score: Minimum similarity threshold (default: 0.0) use_colpali: Whether to use ColPali-style embedding model Returns: List[FinalChunkResult]: List of relevant chunks """ request = { "query": query, "filters": filters, "k": k, "min_score": min_score, "use_colpali": use_colpali, "folder_name": self._name, # Add folder name here } response = self._client._request("POST", "retrieve/chunks", request) return self._client._logic._parse_chunk_result_list_response(response) def retrieve_docs( self, query: str, filters: Optional[Dict[str, Any]] = None, k: int = 4, min_score: float = 0.0, use_colpali: bool = True, ) -> List[DocumentResult]: """ Retrieve relevant documents within this folder. Args: query: Search query text filters: Optional metadata filters k: Number of results (default: 4) min_score: Minimum similarity threshold (default: 0.0) use_colpali: Whether to use ColPali-style embedding model Returns: List[DocumentResult]: List of relevant documents """ request = { "query": query, "filters": filters, "k": k, "min_score": min_score, "use_colpali": use_colpali, "folder_name": self._name, # Add folder name here } response = self._client._request("POST", "retrieve/docs", request) return self._client._logic._parse_document_result_list_response(response) def query( self, query: str, filters: Optional[Dict[str, Any]] = None, k: int = 4, min_score: float = 0.0, max_tokens: Optional[int] = None, temperature: Optional[float] = None, use_colpali: bool = True, graph_name: Optional[str] = None, hop_depth: int = 1, include_paths: bool = False, prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None, ) -> CompletionResponse: """ Generate completion using relevant chunks as context within this folder. Args: query: Query text filters: Optional metadata filters k: Number of chunks to use as context (default: 4) min_score: Minimum similarity threshold (default: 0.0) max_tokens: Maximum tokens in completion temperature: Model temperature use_colpali: Whether to use ColPali-style embedding model graph_name: Optional name of the graph to use for knowledge graph-enhanced retrieval hop_depth: Number of relationship hops to traverse in the graph (1-3) include_paths: Whether to include relationship paths in the response prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts Returns: CompletionResponse: Generated completion """ payload = self._client._logic._prepare_query_request( query, filters, k, min_score, max_tokens, temperature, use_colpali, graph_name, hop_depth, include_paths, prompt_overrides, self._name, None, ) response = self._client._request("POST", "query", data=payload) return self._client._logic._parse_completion_response(response) def list_documents( self, skip: int = 0, limit: int = 100, filters: Optional[Dict[str, Any]] = None ) -> List[Document]: """ List accessible documents within this folder. Args: skip: Number of documents to skip limit: Maximum number of documents to return filters: Optional filters Returns: List[Document]: List of documents """ params, data = self._client._logic._prepare_list_documents_request( skip, limit, filters, self._name, None ) response = self._client._request("POST", "documents", data=data, params=params) docs = self._client._logic._parse_document_list_response(response) for doc in docs: doc._client = self._client return docs def batch_get_documents(self, document_ids: List[str]) -> List[Document]: """ Retrieve multiple documents by their IDs in a single batch operation within this folder. Args: document_ids: List of document IDs to retrieve Returns: List[Document]: List of document metadata for found documents """ request = {"document_ids": document_ids, "folder_name": self._name} response = self._client._request("POST", "batch/documents", data=request) docs = [self._client._logic._parse_document_response(doc) for doc in response] for doc in docs: doc._client = self._client return docs def batch_get_chunks( self, sources: List[Union[ChunkSource, Dict[str, Any]]] ) -> List[FinalChunkResult]: """ Retrieve specific chunks by their document ID and chunk number in a single batch operation within this folder. Args: sources: List of ChunkSource objects or dictionaries with document_id and chunk_number Returns: List[FinalChunkResult]: List of chunk results """ # Convert to list of dictionaries if needed source_dicts = [] for source in sources: if isinstance(source, dict): source_dicts.append(source) else: source_dicts.append(source.model_dump()) # Add folder_name to request request = {"sources": source_dicts, "folder_name": self._name} response = self._client._request("POST", "batch/chunks", data=request) return self._client._logic._parse_chunk_result_list_response(response) def create_graph( self, name: str, filters: Optional[Dict[str, Any]] = None, documents: Optional[List[str]] = None, prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None, ) -> Graph: """ Create a graph from documents within this folder. Args: name: Name of the graph to create filters: Optional metadata filters to determine which documents to include documents: Optional list of specific document IDs to include prompt_overrides: Optional customizations for entity extraction and resolution prompts Returns: Graph: The created graph object """ # Convert prompt_overrides to dict if it's a model if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides): prompt_overrides = prompt_overrides.model_dump(exclude_none=True) request = { "name": name, "filters": filters, "documents": documents, "prompt_overrides": prompt_overrides, "folder_name": self._name, # Add folder name here } response = self._client._request("POST", "graph/create", request) return self._client._logic._parse_graph_response(response) def update_graph( self, name: str, additional_filters: Optional[Dict[str, Any]] = None, additional_documents: Optional[List[str]] = None, prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None, ) -> Graph: """ Update an existing graph with new documents from this folder. Args: name: Name of the graph to update additional_filters: Optional additional metadata filters to determine which new documents to include additional_documents: Optional list of additional document IDs to include prompt_overrides: Optional customizations for entity extraction and resolution prompts Returns: Graph: The updated graph """ # Convert prompt_overrides to dict if it's a model if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides): prompt_overrides = prompt_overrides.model_dump(exclude_none=True) request = { "additional_filters": additional_filters, "additional_documents": additional_documents, "prompt_overrides": prompt_overrides, "folder_name": self._name, # Add folder name here } response = self._client._request("POST", f"graph/{name}/update", request) return self._client._logic._parse_graph_response(response) def delete_document_by_filename(self, filename: str) -> Dict[str, str]: """ Delete a document by its filename within this folder. Args: filename: Filename of the document to delete Returns: Dict[str, str]: Deletion status """ # Get the document by filename with folder scope request = {"filename": filename, "folder_name": self._name} # First get the document ID response = self._client._request( "GET", f"documents/filename/{filename}", params={"folder_name": self._name} ) doc = self._client._logic._parse_document_response(response) # Then delete by ID return self._client.delete_document(doc.external_id) class UserScope: """ A user scope that allows operations to be scoped to a specific end user and optionally a folder. Args: client: The Morphik client instance end_user_id: The ID of the end user folder_name: Optional folder name to further scope operations """ def __init__(self, client: "Morphik", end_user_id: str, folder_name: Optional[str] = None): self._client = client self._end_user_id = end_user_id self._folder_name = folder_name @property def end_user_id(self) -> str: """Returns the end user ID.""" return self._end_user_id @property def folder_name(self) -> Optional[str]: """Returns the folder name if any.""" return self._folder_name def ingest_text( self, content: str, filename: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, rules: Optional[List[RuleOrDict]] = None, use_colpali: bool = True, ) -> Document: """ Ingest a text document into Morphik as this end user. Args: content: Text content to ingest filename: Optional file name metadata: Optional metadata dictionary rules: Optional list of rules to apply during ingestion use_colpali: Whether to use ColPali-style embedding model Returns: Document: Metadata of the ingested document """ rules_list = [self._client._convert_rule(r) for r in (rules or [])] payload = self._client._logic._prepare_ingest_text_request( content, filename, metadata, rules_list, use_colpali, self._folder_name, self._end_user_id, ) response = self._client._request("POST", "ingest/text", data=payload) doc = self._client._logic._parse_document_response(response) doc._client = self._client return doc def ingest_file( self, file: Union[str, bytes, BinaryIO, Path], filename: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, rules: Optional[List[RuleOrDict]] = None, use_colpali: bool = True, ) -> Document: """ Ingest a file document into Morphik as this end user. Args: file: File to ingest (path string, bytes, file object, or Path) filename: Name of the file metadata: Optional metadata dictionary rules: Optional list of rules to apply during ingestion use_colpali: Whether to use ColPali-style embedding model Returns: Document: Metadata of the ingested document """ # Handle different file input types if isinstance(file, (str, Path)): file_path = Path(file) if not file_path.exists(): raise ValueError(f"File not found: {file}") filename = file_path.name if filename is None else filename with open(file_path, "rb") as f: content = f.read() file_obj = BytesIO(content) elif isinstance(file, bytes): if filename is None: raise ValueError("filename is required when ingesting bytes") file_obj = BytesIO(file) else: if filename is None: raise ValueError("filename is required when ingesting file object") file_obj = file try: # Prepare multipart form data files = {"file": (filename, file_obj)} # Add metadata and rules form_data = { "metadata": json.dumps(metadata or {}), "rules": json.dumps([self._client._convert_rule(r) for r in (rules or [])]), "end_user_id": self._end_user_id, # Add end user ID here } # Add folder name if scoped to a folder if self._folder_name: form_data["folder_name"] = self._folder_name response = self._client._request( "POST", f"ingest/file?use_colpali={str(use_colpali).lower()}", data=form_data, files=files, ) doc = self._client._logic._parse_document_response(response) doc._client = self._client return doc finally: # Close file if we opened it if isinstance(file, (str, Path)): file_obj.close() def ingest_files( self, files: List[Union[str, bytes, BinaryIO, Path]], metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, rules: Optional[List[RuleOrDict]] = None, use_colpali: bool = True, parallel: bool = True, ) -> List[Document]: """ Ingest multiple files into Morphik as this end user. Args: files: List of files to ingest metadata: Optional metadata rules: Optional list of rules to apply use_colpali: Whether to use ColPali-style embedding parallel: Whether to process files in parallel Returns: List[Document]: List of ingested documents """ # Convert files to format expected by API file_objects = [] for file in files: if isinstance(file, (str, Path)): path = Path(file) file_objects.append(("files", (path.name, open(path, "rb")))) elif isinstance(file, bytes): file_objects.append(("files", ("file.bin", file))) else: file_objects.append(("files", (getattr(file, "name", "file.bin"), file))) try: # Prepare request data # Convert rules appropriately if rules: if all(isinstance(r, list) for r in rules): # List of lists - per-file rules converted_rules = [ [self._client._convert_rule(r) for r in rule_list] for rule_list in rules ] else: # Flat list - shared rules for all files converted_rules = [self._client._convert_rule(r) for r in rules] else: converted_rules = [] data = { "metadata": json.dumps(metadata or {}), "rules": json.dumps(converted_rules), "use_colpali": str(use_colpali).lower() if use_colpali is not None else None, "parallel": str(parallel).lower(), "end_user_id": self._end_user_id, # Add end user ID here } # Add folder name if scoped to a folder if self._folder_name: data["folder_name"] = self._folder_name response = self._client._request("POST", "ingest/files", data=data, files=file_objects) if response.get("errors"): # Log errors but don't raise exception for error in response["errors"]: logger.error(f"Failed to ingest {error['filename']}: {error['error']}") docs = [ self._client._logic._parse_document_response(doc) for doc in response["documents"] ] for doc in docs: doc._client = self._client return docs finally: # Clean up file objects for _, (_, file_obj) in file_objects: if isinstance(file_obj, (IOBase, BytesIO)) and not file_obj.closed: file_obj.close() def ingest_directory( self, directory: Union[str, Path], recursive: bool = False, pattern: str = "*", metadata: Optional[Dict[str, Any]] = None, rules: Optional[List[RuleOrDict]] = None, use_colpali: bool = True, parallel: bool = True, ) -> List[Document]: """ Ingest all files in a directory into Morphik as this end user. Args: directory: Path to directory containing files to ingest recursive: Whether to recursively process subdirectories pattern: Optional glob pattern to filter files metadata: Optional metadata dictionary to apply to all files rules: Optional list of rules to apply use_colpali: Whether to use ColPali-style embedding parallel: Whether to process files in parallel Returns: List[Document]: List of ingested documents """ directory = Path(directory) if not directory.is_dir(): raise ValueError(f"Directory not found: {directory}") # Collect all files matching pattern if recursive: files = list(directory.rglob(pattern)) else: files = list(directory.glob(pattern)) # Filter out directories files = [f for f in files if f.is_file()] if not files: return [] # Use ingest_files with collected paths return self.ingest_files( files=files, metadata=metadata, rules=rules, use_colpali=use_colpali, parallel=parallel ) def retrieve_chunks( self, query: str, filters: Optional[Dict[str, Any]] = None, k: int = 4, min_score: float = 0.0, use_colpali: bool = True, ) -> List[FinalChunkResult]: """ Retrieve relevant chunks as this end user. Args: query: Search query text filters: Optional metadata filters k: Number of results (default: 4) min_score: Minimum similarity threshold (default: 0.0) use_colpali: Whether to use ColPali-style embedding model Returns: List[FinalChunkResult]: List of relevant chunks """ request = { "query": query, "filters": filters, "k": k, "min_score": min_score, "use_colpali": use_colpali, "end_user_id": self._end_user_id, # Add end user ID here } # Add folder name if scoped to a folder if self._folder_name: request["folder_name"] = self._folder_name response = self._client._request("POST", "retrieve/chunks", request) return self._client._logic._parse_chunk_result_list_response(response) def retrieve_docs( self, query: str, filters: Optional[Dict[str, Any]] = None, k: int = 4, min_score: float = 0.0, use_colpali: bool = True, ) -> List[DocumentResult]: """ Retrieve relevant documents as this end user. Args: query: Search query text filters: Optional metadata filters k: Number of results (default: 4) min_score: Minimum similarity threshold (default: 0.0) use_colpali: Whether to use ColPali-style embedding model Returns: List[DocumentResult]: List of relevant documents """ request = { "query": query, "filters": filters, "k": k, "min_score": min_score, "use_colpali": use_colpali, "end_user_id": self._end_user_id, # Add end user ID here } # Add folder name if scoped to a folder if self._folder_name: request["folder_name"] = self._folder_name response = self._client._request("POST", "retrieve/docs", request) return self._client._logic._parse_document_result_list_response(response) def query( self, query: str, filters: Optional[Dict[str, Any]] = None, k: int = 4, min_score: float = 0.0, max_tokens: Optional[int] = None, temperature: Optional[float] = None, use_colpali: bool = True, graph_name: Optional[str] = None, hop_depth: int = 1, include_paths: bool = False, prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None, ) -> CompletionResponse: """ Generate completion using relevant chunks as context as this end user. Args: query: Query text filters: Optional metadata filters k: Number of chunks to use as context (default: 4) min_score: Minimum similarity threshold (default: 0.0) max_tokens: Maximum tokens in completion temperature: Model temperature use_colpali: Whether to use ColPali-style embedding model graph_name: Optional name of the graph to use for knowledge graph-enhanced retrieval hop_depth: Number of relationship hops to traverse in the graph (1-3) include_paths: Whether to include relationship paths in the response prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts Returns: CompletionResponse: Generated completion """ payload = self._client._logic._prepare_query_request( query, filters, k, min_score, max_tokens, temperature, use_colpali, graph_name, hop_depth, include_paths, prompt_overrides, self._folder_name, self._end_user_id, ) response = self._client._request("POST", "query", data=payload) return self._client._logic._parse_completion_response(response) def list_documents( self, skip: int = 0, limit: int = 100, filters: Optional[Dict[str, Any]] = None ) -> List[Document]: """ List accessible documents for this end user. Args: skip: Number of documents to skip limit: Maximum number of documents to return filters: Optional filters Returns: List[Document]: List of documents """ # Add end_user_id and folder_name to params params = {"skip": skip, "limit": limit, "end_user_id": self._end_user_id} # Add folder name if scoped to a folder if self._folder_name: params["folder_name"] = self._folder_name response = self._client._request("POST", f"documents", data=filters or {}, params=params) docs = [self._client._logic._parse_document_response(doc) for doc in response] for doc in docs: doc._client = self._client return docs def batch_get_documents(self, document_ids: List[str]) -> List[Document]: """ Retrieve multiple documents by their IDs in a single batch operation for this end user. Args: document_ids: List of document IDs to retrieve Returns: List[Document]: List of document metadata for found documents """ request = {"document_ids": document_ids, "end_user_id": self._end_user_id} # Add folder name if scoped to a folder if self._folder_name: request["folder_name"] = self._folder_name response = self._client._request("POST", "batch/documents", data=request) docs = [self._client._logic._parse_document_response(doc) for doc in response] for doc in docs: doc._client = self._client return docs def batch_get_chunks( self, sources: List[Union[ChunkSource, Dict[str, Any]]] ) -> List[FinalChunkResult]: """ Retrieve specific chunks by their document ID and chunk number in a single batch operation for this end user. Args: sources: List of ChunkSource objects or dictionaries with document_id and chunk_number Returns: List[FinalChunkResult]: List of chunk results """ # Convert to list of dictionaries if needed source_dicts = [] for source in sources: if isinstance(source, dict): source_dicts.append(source) else: source_dicts.append(source.model_dump()) # Add end_user_id and folder_name to request request = {"sources": source_dicts, "end_user_id": self._end_user_id} # Add folder name if scoped to a folder if self._folder_name: request["folder_name"] = self._folder_name response = self._client._request("POST", "batch/chunks", data=request) return self._client._logic._parse_chunk_result_list_response(response) def create_graph( self, name: str, filters: Optional[Dict[str, Any]] = None, documents: Optional[List[str]] = None, prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None, ) -> Graph: """ Create a graph from documents for this end user. Args: name: Name of the graph to create filters: Optional metadata filters to determine which documents to include documents: Optional list of specific document IDs to include prompt_overrides: Optional customizations for entity extraction and resolution prompts Returns: Graph: The created graph object """ # Convert prompt_overrides to dict if it's a model if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides): prompt_overrides = prompt_overrides.model_dump(exclude_none=True) request = { "name": name, "filters": filters, "documents": documents, "prompt_overrides": prompt_overrides, "end_user_id": self._end_user_id, # Add end user ID here } # Add folder name if scoped to a folder if self._folder_name: request["folder_name"] = self._folder_name response = self._client._request("POST", "graph/create", request) return self._client._logic._parse_graph_response(response) def update_graph( self, name: str, additional_filters: Optional[Dict[str, Any]] = None, additional_documents: Optional[List[str]] = None, prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None, ) -> Graph: """ Update an existing graph with new documents for this end user. Args: name: Name of the graph to update additional_filters: Optional additional metadata filters to determine which new documents to include additional_documents: Optional list of additional document IDs to include prompt_overrides: Optional customizations for entity extraction and resolution prompts Returns: Graph: The updated graph """ # Convert prompt_overrides to dict if it's a model if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides): prompt_overrides = prompt_overrides.model_dump(exclude_none=True) request = { "additional_filters": additional_filters, "additional_documents": additional_documents, "prompt_overrides": prompt_overrides, "end_user_id": self._end_user_id, # Add end user ID here } # Add folder name if scoped to a folder if self._folder_name: request["folder_name"] = self._folder_name response = self._client._request("POST", f"graph/{name}/update", request) return self._client._logic._parse_graph_response(response) def delete_document_by_filename(self, filename: str) -> Dict[str, str]: """ Delete a document by its filename for this end user. Args: filename: Filename of the document to delete Returns: Dict[str, str]: Deletion status """ # Build parameters for the filename lookup params = {"end_user_id": self._end_user_id} # Add folder name if scoped to a folder if self._folder_name: params["folder_name"] = self._folder_name # First get the document ID response = self._client._request("GET", f"documents/filename/{filename}", params=params) doc = self._client._logic._parse_document_response(response) # Then delete by ID return self._client.delete_document(doc.external_id) class Morphik: """ Morphik client for document operations. Args: uri (str, optional): Morphik URI in format "morphik://:@". If not provided, connects to http://localhost:8000 without authentication. timeout (int, optional): Request timeout in seconds. Defaults to 30. is_local (bool, optional): Whether connecting to local development server. Defaults to False. Examples: ```python # Without authentication db = Morphik() # With authentication db = Morphik("morphik://owner_id:token@api.morphik.ai") ``` """ def __init__(self, uri: Optional[str] = None, timeout: int = 30, is_local: bool = False): self._logic = _MorphikClientLogic(uri, timeout, is_local) self._client = httpx.Client(timeout=self._logic._timeout, verify=not self._logic._is_local) def _request( self, method: str, endpoint: str, data: Optional[Dict[str, Any]] = None, files: Optional[Dict[str, Any]] = None, params: Optional[Dict[str, Any]] = None, ) -> Dict[str, Any]: """Make HTTP request""" url = self._logic._get_url(endpoint) headers = self._logic._get_headers() if self._logic._auth_token: # Only add auth header if we have a token headers["Authorization"] = f"Bearer {self._logic._auth_token}" # Configure request data based on type if files: # Multipart form data for files request_data = {"files": files, "data": data} # Don't set Content-Type, let httpx handle it else: # JSON for everything else headers["Content-Type"] = "application/json" request_data = {"json": data} response = self._client.request( method, url, headers=headers, params=params, **request_data, ) response.raise_for_status() return response.json() def _convert_rule(self, rule: RuleOrDict) -> Dict[str, Any]: """Convert a rule to a dictionary format""" return self._logic._convert_rule(rule) def create_folder(self, name: str) -> Folder: """ Create a folder to scope operations. Args: name: The name of the folder Returns: Folder: A folder object for scoped operations """ return Folder(self, name) def get_folder(self, name: str) -> Folder: """ Get a folder by name to scope operations. Args: name: The name of the folder Returns: Folder: A folder object for scoped operations """ return Folder(self, name) def signin(self, end_user_id: str) -> UserScope: """ Sign in as an end user to scope operations. Args: end_user_id: The ID of the end user Returns: UserScope: A user scope object for scoped operations """ return UserScope(self, end_user_id) def ingest_text( self, content: str, filename: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, rules: Optional[List[RuleOrDict]] = None, use_colpali: bool = True, ) -> Document: """ Ingest a text document into Morphik. Args: content: Text content to ingest metadata: Optional metadata dictionary rules: Optional list of rules to apply during ingestion. Can be: - MetadataExtractionRule: Extract metadata using a schema - NaturalLanguageRule: Transform content using natural language use_colpali: Whether to use ColPali-style embedding model to ingest the text (slower, but significantly better retrieval accuracy for text and images) Returns: Document: Metadata of the ingested document Example: ```python from morphik.rules import MetadataExtractionRule, NaturalLanguageRule from pydantic import BaseModel class DocumentInfo(BaseModel): title: str author: str date: str doc = db.ingest_text( "Machine learning is fascinating...", metadata={"category": "tech"}, rules=[ # Extract metadata using schema MetadataExtractionRule(schema=DocumentInfo), # Transform content NaturalLanguageRule(prompt="Shorten the content, use keywords") ] ) ``` """ rules_list = [self._convert_rule(r) for r in (rules or [])] payload = self._logic._prepare_ingest_text_request( content, filename, metadata, rules_list, use_colpali, None, None ) response = self._request("POST", "ingest/text", data=payload) doc = self._logic._parse_document_response(response) doc._client = self return doc def ingest_file( self, file: Union[str, bytes, BinaryIO, Path], filename: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, rules: Optional[List[RuleOrDict]] = None, use_colpali: bool = True, ) -> Document: """ Ingest a file document into Morphik. Args: file: File to ingest (path string, bytes, file object, or Path) filename: Name of the file metadata: Optional metadata dictionary rules: Optional list of rules to apply during ingestion. Can be: - MetadataExtractionRule: Extract metadata using a schema - NaturalLanguageRule: Transform content using natural language use_colpali: Whether to use ColPali-style embedding model to ingest the file (slower, but significantly better retrieval accuracy for images) Returns: Document: Metadata of the ingested document Example: ```python from morphik.rules import MetadataExtractionRule, NaturalLanguageRule from pydantic import BaseModel class DocumentInfo(BaseModel): title: str author: str department: str doc = db.ingest_file( "document.pdf", filename="document.pdf", metadata={"category": "research"}, rules=[ MetadataExtractionRule(schema=DocumentInfo), NaturalLanguageRule(prompt="Extract key points only") ], # Optional use_colpali=True, # Optional ) ``` """ # Process file input file_obj, filename = self._logic._prepare_file_for_upload(file, filename) try: # Prepare multipart form data files = {"file": (filename, file_obj)} # Create form data form_data = self._logic._prepare_ingest_file_form_data(metadata, rules, None, None) response = self._request( "POST", f"ingest/file?use_colpali={str(use_colpali).lower()}", data=form_data, files=files, ) doc = self._logic._parse_document_response(response) doc._client = self return doc finally: # Close file if we opened it if isinstance(file, (str, Path)): file_obj.close() def ingest_files( self, files: List[Union[str, bytes, BinaryIO, Path]], metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, rules: Optional[List[RuleOrDict]] = None, use_colpali: bool = True, parallel: bool = True, ) -> List[Document]: """ Ingest multiple files into Morphik. Args: files: List of files to ingest (path strings, bytes, file objects, or Paths) metadata: Optional metadata (single dict for all files or list of dicts) rules: Optional list of rules to apply use_colpali: Whether to use ColPali-style embedding parallel: Whether to process files in parallel Returns: List[Document]: List of successfully ingested documents Raises: ValueError: If metadata list length doesn't match files length """ # Convert files to format expected by API file_objects = self._logic._prepare_files_for_upload(files) try: # Prepare form data data = self._logic._prepare_ingest_files_form_data( metadata, rules, use_colpali, parallel, None, None ) response = self._request("POST", "ingest/files", data=data, files=file_objects) if response.get("errors"): # Log errors but don't raise exception for error in response["errors"]: logger.error(f"Failed to ingest {error['filename']}: {error['error']}") docs = [self._logic._parse_document_response(doc) for doc in response["documents"]] for doc in docs: doc._client = self return docs finally: # Clean up file objects for _, (_, file_obj) in file_objects: if isinstance(file_obj, (IOBase, BytesIO)) and not file_obj.closed: file_obj.close() def ingest_directory( self, directory: Union[str, Path], recursive: bool = False, pattern: str = "*", metadata: Optional[Dict[str, Any]] = None, rules: Optional[List[RuleOrDict]] = None, use_colpali: bool = True, parallel: bool = True, ) -> List[Document]: """ Ingest all files in a directory into Morphik. Args: directory: Path to directory containing files to ingest recursive: Whether to recursively process subdirectories pattern: Optional glob pattern to filter files (e.g. "*.pdf") metadata: Optional metadata dictionary to apply to all files rules: Optional list of rules to apply use_colpali: Whether to use ColPali-style embedding parallel: Whether to process files in parallel Returns: List[Document]: List of ingested documents Raises: ValueError: If directory not found """ directory = Path(directory) if not directory.is_dir(): raise ValueError(f"Directory not found: {directory}") # Collect all files matching pattern if recursive: files = list(directory.rglob(pattern)) else: files = list(directory.glob(pattern)) # Filter out directories files = [f for f in files if f.is_file()] if not files: return [] # Use ingest_files with collected paths return self.ingest_files( files=files, metadata=metadata, rules=rules, use_colpali=use_colpali, parallel=parallel ) def retrieve_chunks( self, query: str, filters: Optional[Dict[str, Any]] = None, k: int = 4, min_score: float = 0.0, use_colpali: bool = True, ) -> List[FinalChunkResult]: """ Retrieve relevant chunks. Args: query: Search query text filters: Optional metadata filters k: Number of results (default: 4) min_score: Minimum similarity threshold (default: 0.0) use_colpali: Whether to use ColPali-style embedding model to retrieve the chunks (only works for documents ingested with `use_colpali=True`) Returns: List[ChunkResult] Example: ```python chunks = db.retrieve_chunks( "What are the key findings?", filters={"department": "research"} ) ``` """ payload = self._logic._prepare_retrieve_chunks_request( query, filters, k, min_score, use_colpali, None, None ) response = self._request("POST", "retrieve/chunks", data=payload) return self._logic._parse_chunk_result_list_response(response) def retrieve_docs( self, query: str, filters: Optional[Dict[str, Any]] = None, k: int = 4, min_score: float = 0.0, use_colpali: bool = True, ) -> List[DocumentResult]: """ Retrieve relevant documents. Args: query: Search query text filters: Optional metadata filters k: Number of results (default: 4) min_score: Minimum similarity threshold (default: 0.0) use_colpali: Whether to use ColPali-style embedding model to retrieve the documents (only works for documents ingested with `use_colpali=True`) Returns: List[DocumentResult] Example: ```python docs = db.retrieve_docs( "machine learning", k=5 ) ``` """ payload = self._logic._prepare_retrieve_docs_request( query, filters, k, min_score, use_colpali, None, None ) response = self._request("POST", "retrieve/docs", data=payload) return self._logic._parse_document_result_list_response(response) def query( self, query: str, filters: Optional[Dict[str, Any]] = None, k: int = 4, min_score: float = 0.0, max_tokens: Optional[int] = None, temperature: Optional[float] = None, use_colpali: bool = True, graph_name: Optional[str] = None, hop_depth: int = 1, include_paths: bool = False, prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None, ) -> CompletionResponse: """ Generate completion using relevant chunks as context. Args: query: Query text filters: Optional metadata filters k: Number of chunks to use as context (default: 4) min_score: Minimum similarity threshold (default: 0.0) max_tokens: Maximum tokens in completion temperature: Model temperature use_colpali: Whether to use ColPali-style embedding model to generate the completion (only works for documents ingested with `use_colpali=True`) graph_name: Optional name of the graph to use for knowledge graph-enhanced retrieval hop_depth: Number of relationship hops to traverse in the graph (1-3) include_paths: Whether to include relationship paths in the response prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts Either a QueryPromptOverrides object or a dictionary with the same structure Returns: CompletionResponse Example: ```python # Standard query response = db.query( "What are the key findings about customer satisfaction?", filters={"department": "research"}, temperature=0.7 ) # Knowledge graph enhanced query response = db.query( "How does product X relate to customer segment Y?", graph_name="market_graph", hop_depth=2, include_paths=True ) # With prompt customization from morphik.models import QueryPromptOverride, QueryPromptOverrides response = db.query( "What are the key findings?", prompt_overrides=QueryPromptOverrides( query=QueryPromptOverride( prompt_template="Answer the question in a formal, academic tone: {question}" ) ) ) # Or using a dictionary response = db.query( "What are the key findings?", prompt_overrides={ "query": { "prompt_template": "Answer the question in a formal, academic tone: {question}" } } ) print(response.completion) # If include_paths=True, you can inspect the graph paths if response.metadata and "graph" in response.metadata: for path in response.metadata["graph"]["paths"]: print(" -> ".join(path)) ``` """ payload = self._logic._prepare_query_request( query, filters, k, min_score, max_tokens, temperature, use_colpali, graph_name, hop_depth, include_paths, prompt_overrides, None, None, ) response = self._request("POST", "query", data=payload) return self._logic._parse_completion_response(response) def list_documents( self, skip: int = 0, limit: int = 100, filters: Optional[Dict[str, Any]] = None ) -> List[Document]: """ List accessible documents. Args: skip: Number of documents to skip limit: Maximum number of documents to return filters: Optional filters Returns: List[Document]: List of accessible documents Example: ```python # Get first page docs = db.list_documents(limit=10) # Get next page next_page = db.list_documents(skip=10, limit=10, filters={"department": "research"}) ``` """ params, data = self._logic._prepare_list_documents_request(skip, limit, filters, None, None) response = self._request("POST", "documents", data=data, params=params) docs = self._logic._parse_document_list_response(response) for doc in docs: doc._client = self return docs def get_document(self, document_id: str) -> Document: """ Get document metadata by ID. Args: document_id: ID of the document Returns: Document: Document metadata Example: ```python doc = db.get_document("doc_123") print(f"Title: {doc.metadata.get('title')}") ``` """ response = self._request("GET", f"documents/{document_id}") doc = self._logic._parse_document_response(response) doc._client = self return doc def get_document_by_filename(self, filename: str) -> Document: """ Get document metadata by filename. If multiple documents have the same filename, returns the most recently updated one. Args: filename: Filename of the document to retrieve Returns: Document: Document metadata Example: ```python doc = db.get_document_by_filename("report.pdf") print(f"Document ID: {doc.external_id}") ``` """ response = self._request("GET", f"documents/filename/{filename}") doc = self._logic._parse_document_response(response) doc._client = self return doc def update_document_with_text( self, document_id: str, content: str, filename: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, rules: Optional[List] = None, update_strategy: str = "add", use_colpali: Optional[bool] = None, ) -> Document: """ Update a document with new text content using the specified strategy. Args: document_id: ID of the document to update content: The new content to add filename: Optional new filename for the document metadata: Additional metadata to update (optional) rules: Optional list of rules to apply to the content update_strategy: Strategy for updating the document (currently only 'add' is supported) use_colpali: Whether to use multi-vector embedding Returns: Document: Updated document metadata Example: ```python # Add new content to an existing document updated_doc = db.update_document_with_text( document_id="doc_123", content="This is additional content that will be appended to the document.", filename="updated_document.txt", metadata={"category": "updated"}, update_strategy="add" ) print(f"Document version: {updated_doc.system_metadata.get('version')}") ``` """ # Use the dedicated text update endpoint request = IngestTextRequest( content=content, filename=filename, metadata=metadata or {}, rules=[self._convert_rule(r) for r in (rules or [])], use_colpali=use_colpali if use_colpali is not None else True, ) params = {} if update_strategy != "add": params["update_strategy"] = update_strategy response = self._request( "POST", f"documents/{document_id}/update_text", data=request.model_dump(), params=params ) doc = self._logic._parse_document_response(response) doc._client = self return doc def update_document_with_file( self, document_id: str, file: Union[str, bytes, BinaryIO, Path], filename: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, rules: Optional[List] = None, update_strategy: str = "add", use_colpali: Optional[bool] = None, ) -> Document: """ Update a document with content from a file using the specified strategy. Args: document_id: ID of the document to update file: File to add (path string, bytes, file object, or Path) filename: Name of the file metadata: Additional metadata to update (optional) rules: Optional list of rules to apply to the content update_strategy: Strategy for updating the document (currently only 'add' is supported) use_colpali: Whether to use multi-vector embedding Returns: Document: Updated document metadata Example: ```python # Add content from a file to an existing document updated_doc = db.update_document_with_file( document_id="doc_123", file="path/to/update.pdf", metadata={"status": "updated"}, update_strategy="add" ) print(f"Document version: {updated_doc.system_metadata.get('version')}") ``` """ # Handle different file input types if isinstance(file, (str, Path)): file_path = Path(file) if not file_path.exists(): raise ValueError(f"File not found: {file}") filename = file_path.name if filename is None else filename with open(file_path, "rb") as f: content = f.read() file_obj = BytesIO(content) elif isinstance(file, bytes): if filename is None: raise ValueError("filename is required when updating with bytes") file_obj = BytesIO(file) else: if filename is None: raise ValueError("filename is required when updating with file object") file_obj = file try: # Prepare multipart form data files = {"file": (filename, file_obj)} # Convert metadata and rules to JSON strings form_data = { "metadata": json.dumps(metadata or {}), "rules": json.dumps([self._convert_rule(r) for r in (rules or [])]), "update_strategy": update_strategy, } if use_colpali is not None: form_data["use_colpali"] = str(use_colpali).lower() # Use the dedicated file update endpoint response = self._request( "POST", f"documents/{document_id}/update_file", data=form_data, files=files ) doc = self._logic._parse_document_response(response) doc._client = self return doc finally: # Close file if we opened it if isinstance(file, (str, Path)): file_obj.close() def update_document_metadata( self, document_id: str, metadata: Dict[str, Any], ) -> Document: """ Update a document's metadata only. Args: document_id: ID of the document to update metadata: Metadata to update Returns: Document: Updated document metadata Example: ```python # Update just the metadata of a document updated_doc = db.update_document_metadata( document_id="doc_123", metadata={"status": "reviewed", "reviewer": "Jane Smith"} ) print(f"Updated metadata: {updated_doc.metadata}") ``` """ # Use the dedicated metadata update endpoint response = self._request("POST", f"documents/{document_id}/update_metadata", data=metadata) doc = self._logic._parse_document_response(response) doc._client = self return doc def update_document_by_filename_with_text( self, filename: str, content: str, new_filename: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, rules: Optional[List] = None, update_strategy: str = "add", use_colpali: Optional[bool] = None, ) -> Document: """ Update a document identified by filename with new text content using the specified strategy. Args: filename: Filename of the document to update content: The new content to add new_filename: Optional new filename for the document metadata: Additional metadata to update (optional) rules: Optional list of rules to apply to the content update_strategy: Strategy for updating the document (currently only 'add' is supported) use_colpali: Whether to use multi-vector embedding Returns: Document: Updated document metadata Example: ```python # Add new content to an existing document identified by filename updated_doc = db.update_document_by_filename_with_text( filename="report.pdf", content="This is additional content that will be appended to the document.", new_filename="updated_report.pdf", metadata={"category": "updated"}, update_strategy="add" ) print(f"Document version: {updated_doc.system_metadata.get('version')}") ``` """ # First get the document by filename to obtain its ID doc = self.get_document_by_filename(filename) # Then use the regular update_document_with_text endpoint with the document ID return self.update_document_with_text( document_id=doc.external_id, content=content, filename=new_filename, metadata=metadata, rules=rules, update_strategy=update_strategy, use_colpali=use_colpali, ) def update_document_by_filename_with_file( self, filename: str, file: Union[str, bytes, BinaryIO, Path], new_filename: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, rules: Optional[List] = None, update_strategy: str = "add", use_colpali: Optional[bool] = None, ) -> Document: """ Update a document identified by filename with content from a file using the specified strategy. Args: filename: Filename of the document to update file: File to add (path string, bytes, file object, or Path) new_filename: Optional new filename for the document (defaults to the filename of the file) metadata: Additional metadata to update (optional) rules: Optional list of rules to apply to the content update_strategy: Strategy for updating the document (currently only 'add' is supported) use_colpali: Whether to use multi-vector embedding Returns: Document: Updated document metadata Example: ```python # Add content from a file to an existing document identified by filename updated_doc = db.update_document_by_filename_with_file( filename="report.pdf", file="path/to/update.pdf", metadata={"status": "updated"}, update_strategy="add" ) print(f"Document version: {updated_doc.system_metadata.get('version')}") ``` """ # First get the document by filename to obtain its ID doc = self.get_document_by_filename(filename) # Then use the regular update_document_with_file endpoint with the document ID return self.update_document_with_file( document_id=doc.external_id, file=file, filename=new_filename, metadata=metadata, rules=rules, update_strategy=update_strategy, use_colpali=use_colpali, ) def update_document_by_filename_metadata( self, filename: str, metadata: Dict[str, Any], new_filename: Optional[str] = None, ) -> Document: """ Update a document's metadata using filename to identify the document. Args: filename: Filename of the document to update metadata: Metadata to update new_filename: Optional new filename to assign to the document Returns: Document: Updated document metadata Example: ```python # Update just the metadata of a document identified by filename updated_doc = db.update_document_by_filename_metadata( filename="report.pdf", metadata={"status": "reviewed", "reviewer": "Jane Smith"}, new_filename="reviewed_report.pdf" # Optional: rename the file ) print(f"Updated metadata: {updated_doc.metadata}") ``` """ # First get the document by filename to obtain its ID doc = self.get_document_by_filename(filename) # Update the metadata result = self.update_document_metadata( document_id=doc.external_id, metadata=metadata, ) # If new_filename is provided, update the filename as well if new_filename: # Create a request that retains the just-updated metadata but also changes filename combined_metadata = result.metadata.copy() # Update the document again with filename change and the same metadata response = self._request( "POST", f"documents/{doc.external_id}/update_text", data={ "content": "", "filename": new_filename, "metadata": combined_metadata, "rules": [], }, ) result = self._logic._parse_document_response(response) result._client = self return result def batch_get_documents(self, document_ids: List[str]) -> List[Document]: """ Retrieve multiple documents by their IDs in a single batch operation. Args: document_ids: List of document IDs to retrieve Returns: List[Document]: List of document metadata for found documents Example: ```python docs = db.batch_get_documents(["doc_123", "doc_456", "doc_789"]) for doc in docs: print(f"Document {doc.external_id}: {doc.metadata.get('title')}") ``` """ response = self._request("POST", "batch/documents", data=document_ids) docs = self._logic._parse_document_list_response(response) for doc in docs: doc._client = self return docs def batch_get_chunks( self, sources: List[Union[ChunkSource, Dict[str, Any]]] ) -> List[FinalChunkResult]: """ Retrieve specific chunks by their document ID and chunk number in a single batch operation. Args: sources: List of ChunkSource objects or dictionaries with document_id and chunk_number Returns: List[FinalChunkResult]: List of chunk results Example: ```python # Using dictionaries sources = [ {"document_id": "doc_123", "chunk_number": 0}, {"document_id": "doc_456", "chunk_number": 2} ] # Or using ChunkSource objects from morphik.models import ChunkSource sources = [ ChunkSource(document_id="doc_123", chunk_number=0), ChunkSource(document_id="doc_456", chunk_number=2) ] chunks = db.batch_get_chunks(sources) for chunk in chunks: print(f"Chunk from {chunk.document_id}, number {chunk.chunk_number}: {chunk.content[:50]}...") ``` """ # Convert to list of dictionaries if needed source_dicts = [] for source in sources: if isinstance(source, dict): source_dicts.append(source) else: source_dicts.append(source.model_dump()) response = self._request("POST", "batch/chunks", data=source_dicts) return self._logic._parse_chunk_result_list_response(response) def create_cache( self, name: str, model: str, gguf_file: str, filters: Optional[Dict[str, Any]] = None, docs: Optional[List[str]] = None, ) -> Dict[str, Any]: """ Create a new cache with specified configuration. Args: name: Name of the cache to create model: Name of the model to use (e.g. "llama2") gguf_file: Name of the GGUF file to use for the model filters: Optional metadata filters to determine which documents to include. These filters will be applied in addition to any specific docs provided. docs: Optional list of specific document IDs to include. These docs will be included in addition to any documents matching the filters. Returns: Dict[str, Any]: Created cache configuration Example: ```python # This will include both: # 1. Any documents with category="programming" # 2. The specific documents "doc1" and "doc2" (regardless of their category) cache = db.create_cache( name="programming_cache", model="llama2", gguf_file="llama-2-7b-chat.Q4_K_M.gguf", filters={"category": "programming"}, docs=["doc1", "doc2"] ) ``` """ # Build query parameters for name, model and gguf_file params = {"name": name, "model": model, "gguf_file": gguf_file} # Build request body for filters and docs request = {"filters": filters, "docs": docs} response = self._request("POST", "cache/create", request, params=params) return response def get_cache(self, name: str) -> Cache: """ Get a cache by name. Args: name: Name of the cache to retrieve Returns: cache: A cache object that is used to interact with the cache. Example: ```python cache = db.get_cache("programming_cache") ``` """ response = self._request("GET", f"cache/{name}") if response.get("exists", False): return Cache(self, name) raise ValueError(f"Cache '{name}' not found") def create_graph( self, name: str, filters: Optional[Dict[str, Any]] = None, documents: Optional[List[str]] = None, prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None, ) -> Graph: """ Create a graph from documents. This method extracts entities and relationships from documents matching the specified filters or document IDs and creates a graph. Args: name: Name of the graph to create filters: Optional metadata filters to determine which documents to include documents: Optional list of specific document IDs to include prompt_overrides: Optional customizations for entity extraction and resolution prompts Either a GraphPromptOverrides object or a dictionary with the same structure Returns: Graph: The created graph object Example: ```python # Create a graph from documents with category="research" graph = db.create_graph( name="research_graph", filters={"category": "research"} ) # Create a graph from specific documents graph = db.create_graph( name="custom_graph", documents=["doc1", "doc2", "doc3"] ) # With custom entity extraction examples from morphik.models import EntityExtractionPromptOverride, EntityExtractionExample, GraphPromptOverrides graph = db.create_graph( name="medical_graph", filters={"category": "medical"}, prompt_overrides=GraphPromptOverrides( entity_extraction=EntityExtractionPromptOverride( examples=[ EntityExtractionExample(label="Insulin", type="MEDICATION"), EntityExtractionExample(label="Diabetes", type="CONDITION") ] ) ) ) ``` """ # Convert prompt_overrides to dict if it's a model if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides): prompt_overrides = prompt_overrides.model_dump(exclude_none=True) request = { "name": name, "filters": filters, "documents": documents, "prompt_overrides": prompt_overrides, } response = self._request("POST", "graph/create", request) return self._logic._parse_graph_response(response) def get_graph(self, name: str) -> Graph: """ Get a graph by name. Args: name: Name of the graph to retrieve Returns: Graph: The requested graph object Example: ```python # Get a graph by name graph = db.get_graph("finance_graph") print(f"Graph has {len(graph.entities)} entities and {len(graph.relationships)} relationships") ``` """ response = self._request("GET", f"graph/{name}") return self._logic._parse_graph_response(response) def list_graphs(self) -> List[Graph]: """ List all graphs the user has access to. Returns: List[Graph]: List of graph objects Example: ```python # List all accessible graphs graphs = db.list_graphs() for graph in graphs: print(f"Graph: {graph.name}, Entities: {len(graph.entities)}") ``` """ response = self._request("GET", "graphs") return self._logic._parse_graph_list_response(response) def update_graph( self, name: str, additional_filters: Optional[Dict[str, Any]] = None, additional_documents: Optional[List[str]] = None, prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None, ) -> Graph: """ Update an existing graph with new documents. This method processes additional documents matching the original or new filters, extracts entities and relationships, and updates the graph with new information. Args: name: Name of the graph to update additional_filters: Optional additional metadata filters to determine which new documents to include additional_documents: Optional list of additional document IDs to include prompt_overrides: Optional customizations for entity extraction and resolution prompts Either a GraphPromptOverrides object or a dictionary with the same structure Returns: Graph: The updated graph Example: ```python # Update a graph with new documents updated_graph = db.update_graph( name="research_graph", additional_filters={"category": "new_research"}, additional_documents=["doc4", "doc5"] ) print(f"Graph now has {len(updated_graph.entities)} entities") # With entity resolution examples from morphik.models import EntityResolutionPromptOverride, EntityResolutionExample, GraphPromptOverrides updated_graph = db.update_graph( name="research_graph", additional_documents=["doc4"], prompt_overrides=GraphPromptOverrides( entity_resolution=EntityResolutionPromptOverride( examples=[ EntityResolutionExample( canonical="Machine Learning", variants=["ML", "machine learning", "AI/ML"] ) ] ) ) ) ``` """ # Convert prompt_overrides to dict if it's a model if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides): prompt_overrides = prompt_overrides.model_dump(exclude_none=True) request = { "additional_filters": additional_filters, "additional_documents": additional_documents, "prompt_overrides": prompt_overrides, } response = self._request("POST", f"graph/{name}/update", request) return self._logic._parse_graph_response(response) def delete_document(self, document_id: str) -> Dict[str, str]: """ Delete a document and all its associated data. This method deletes a document and all its associated data, including: - Document metadata - Document content in storage - Document chunks and embeddings in vector store Args: document_id: ID of the document to delete Returns: Dict[str, str]: Deletion status Example: ```python # Delete a document result = db.delete_document("doc_123") print(result["message"]) # Document doc_123 deleted successfully ``` """ response = self._request("DELETE", f"documents/{document_id}") return response def delete_document_by_filename(self, filename: str) -> Dict[str, str]: """ Delete a document by its filename. This is a convenience method that first retrieves the document ID by filename and then deletes the document by ID. Args: filename: Filename of the document to delete Returns: Dict[str, str]: Deletion status Example: ```python # Delete a document by filename result = db.delete_document_by_filename("report.pdf") print(result["message"]) ``` """ # First get the document by filename to obtain its ID doc = self.get_document_by_filename(filename) # Then delete the document by ID return self.delete_document(doc.external_id) def close(self): """Close the HTTP client""" self._client.close() def __enter__(self): return self def __exit__(self, exc_type, exc_val, exc_tb): self.close()