From 75556c924aefa3fc33d7f5fac9250487e60e70be Mon Sep 17 00:00:00 2001 From: Adityavardhan Agrawal Date: Sun, 13 Apr 2025 14:52:26 -0700 Subject: [PATCH] Add folders and user scopes (#82) --- .env.example | 1 - README.md | 6 +- core/api.py | 338 ++++- core/config.py | 62 +- core/database/base_database.py | 37 +- core/database/mongo_database.py | 329 ----- core/database/postgres_database.py | 291 ++++- core/models/completion.py | 2 + core/models/documents.py | 4 +- core/models/graph.py | 8 + core/models/request.py | 8 + core/services/document_service.py | 72 +- core/services/graph_service.py | 100 +- core/tests/integration/test_api.py | 650 +++++++++- core/vector_store/mongo_vector_store.py | 183 --- examples/multi_app_user_scoping.py | 82 ++ quick_setup.py | 93 +- requirements.txt | 2 - sanity_checks/mongo.py | 72 -- sdks/python/morphik/__init__.py | 2 +- sdks/python/morphik/_internal.py | 507 ++++++++ sdks/python/morphik/async_.py | 1510 ++++++++++++++++------ sdks/python/morphik/models.py | 63 +- sdks/python/morphik/sync.py | 1576 ++++++++++++++++++----- sdks/python/pyproject.toml | 2 +- 25 files changed, 4417 insertions(+), 1583 deletions(-) delete mode 100644 core/database/mongo_database.py delete mode 100644 core/vector_store/mongo_vector_store.py create mode 100644 examples/multi_app_user_scoping.py delete mode 100644 sanity_checks/mongo.py create mode 100644 sdks/python/morphik/_internal.py diff --git a/.env.example b/.env.example index 0e9bb34..74ccc03 100644 --- a/.env.example +++ b/.env.example @@ -1,6 +1,5 @@ JWT_SECRET_KEY="..." # Required in production, optional in dev mode (dev_mode=true in morphik.toml) POSTGRES_URI="postgresql+asyncpg://postgres:postgres@localhost:5432/morphik" # Required for PostgreSQL database -MONGODB_URI="..." # Optional: Only needed if using MongoDB UNSTRUCTURED_API_KEY="..." # Optional: Needed for parsing via unstructured API OPENAI_API_KEY="..." # Optional: Needed for OpenAI embeddings and completions diff --git a/README.md b/README.md index 082185c..96ffabb 100644 --- a/README.md +++ b/README.md @@ -46,7 +46,7 @@ Built for scale and performance, Morphik can handle millions of documents while - 🧩 **Extensible Architecture** - Support for custom parsers and embedding models - Multiple storage backends (S3, local) - - Vector store integrations (PostgreSQL/pgvector, MongoDB) + - Vector store integration with PostgreSQL/pgvector ## Quick Start @@ -162,7 +162,7 @@ for chunk in chunks: | **Knowledge Graphs** | ✅ Automated extraction & enhanced retrieval | ❌ | ❌ | ❌ | | **Rules Engine** | ✅ Natural language rules & schema definition | ❌ | ❌ | Limited | | **Caching** | ✅ Persistent KV-caching with selective updates | ❌ | ❌ | Limited | -| **Scalability** | ✅ Millions of documents with PostgreSQL/MongoDB | ✅ | ✅ | Limited | +| **Scalability** | ✅ Millions of documents with PostgreSQL | ✅ | ✅ | Limited | | **Video Content** | ✅ Native video parsing & transcription | ❌ | ❌ | ❌ | | **Deployment Options** | ✅ Self-hosted, cloud, or hybrid | Varies | Varies | Limited | | **Open Source** | ✅ MIT License | Varies | Varies | Varies | @@ -176,7 +176,7 @@ for chunk in chunks: - **Schema-like Rules for Unstructured Data**: Define rules to extract consistent metadata from unstructured content, bringing database-like queryability to any document format. -- **Enterprise-grade Scalability**: Built on proven database technologies (PostgreSQL/MongoDB) that can scale to millions of documents while maintaining sub-second retrieval times. 
+- **Enterprise-grade Scalability**: Built on proven PostgreSQL database technology that can scale to millions of documents while maintaining sub-second retrieval times. ## Documentation diff --git a/core/api.py b/core/api.py index 0b48122..0bcdaa3 100644 --- a/core/api.py +++ b/core/api.py @@ -20,9 +20,7 @@ from core.parser.morphik_parser import MorphikParser from core.services.document_service import DocumentService from core.services.telemetry import TelemetryService from core.config import get_settings -from core.database.mongo_database import MongoDatabase from core.database.postgres_database import PostgresDatabase -from core.vector_store.mongo_vector_store import MongoDBAtlasVectorStore from core.vector_store.multi_vector_store import MultiVectorStore from core.embedding.colpali_embedding_model import ColpaliEmbeddingModel from core.storage.s3_storage import S3Storage @@ -77,21 +75,9 @@ app.add_middleware( settings = get_settings() # Initialize database -match settings.DATABASE_PROVIDER: - case "postgres": - if not settings.POSTGRES_URI: - raise ValueError("PostgreSQL URI is required for PostgreSQL database") - database = PostgresDatabase(uri=settings.POSTGRES_URI) - case "mongodb": - if not settings.MONGODB_URI: - raise ValueError("MongoDB URI is required for MongoDB database") - database = MongoDatabase( - uri=settings.MONGODB_URI, - db_name=settings.DATABRIDGE_DB, - collection_name=settings.DOCUMENTS_COLLECTION, - ) - case _: - raise ValueError(f"Unsupported database provider: {settings.DATABASE_PROVIDER}") +if not settings.POSTGRES_URI: + raise ValueError("PostgreSQL URI is required for PostgreSQL database") +database = PostgresDatabase(uri=settings.POSTGRES_URI) @app.on_event("startup") @@ -144,24 +130,13 @@ async def initialize_user_limits_database(): await user_limits_db.initialize() # Initialize vector store -match settings.VECTOR_STORE_PROVIDER: - case "mongodb": - vector_store = MongoDBAtlasVectorStore( - uri=settings.MONGODB_URI, - database_name=settings.DATABRIDGE_DB, - collection_name=settings.CHUNKS_COLLECTION, - index_name=settings.VECTOR_INDEX_NAME, - ) - case "pgvector": - if not settings.POSTGRES_URI: - raise ValueError("PostgreSQL URI is required for pgvector store") - from core.vector_store.pgvector_store import PGVectorStore +if not settings.POSTGRES_URI: + raise ValueError("PostgreSQL URI is required for pgvector store") +from core.vector_store.pgvector_store import PGVectorStore - vector_store = PGVectorStore( - uri=settings.POSTGRES_URI, - ) - case _: - raise ValueError(f"Unsupported vector store provider: {settings.VECTOR_STORE_PROVIDER}") +vector_store = PGVectorStore( + uri=settings.POSTGRES_URI, +) # Initialize storage match settings.STORAGE_PROVIDER: @@ -310,6 +285,8 @@ async def ingest_text( - rules: Optional list of rules. 
Each rule should be either: - MetadataExtractionRule: {"type": "metadata_extraction", "schema": {...}} - NaturalLanguageRule: {"type": "natural_language", "prompt": "..."} + - folder_name: Optional folder to scope the document to + - end_user_id: Optional end-user ID to scope the document to auth: Authentication context Returns: @@ -324,6 +301,8 @@ async def ingest_text( "metadata": request.metadata, "rules": request.rules, "use_colpali": request.use_colpali, + "folder_name": request.folder_name, + "end_user_id": request.end_user_id, }, ): return await document_service.ingest_text( @@ -333,6 +312,8 @@ async def ingest_text( rules=request.rules, use_colpali=request.use_colpali, auth=auth, + folder_name=request.folder_name, + end_user_id=request.end_user_id, ) except PermissionError as e: raise HTTPException(status_code=403, detail=str(e)) @@ -345,6 +326,8 @@ async def ingest_file( rules: str = Form("[]"), auth: AuthContext = Depends(verify_token), use_colpali: Optional[bool] = None, + folder_name: Optional[str] = Form(None), + end_user_id: Optional[str] = Form(None), ) -> Document: """ Ingest a file document. @@ -356,6 +339,9 @@ async def ingest_file( - MetadataExtractionRule: {"type": "metadata_extraction", "schema": {...}} - NaturalLanguageRule: {"type": "natural_language", "prompt": "..."} auth: Authentication context + use_colpali: Whether to use ColPali embedding model + folder_name: Optional folder to scope the document to + end_user_id: Optional end-user ID to scope the document to Returns: Document: Metadata of ingested document @@ -374,15 +360,20 @@ async def ingest_file( "metadata": metadata_dict, "rules": rules_list, "use_colpali": use_colpali, + "folder_name": folder_name, + "end_user_id": end_user_id, }, ): logger.debug(f"API: Ingesting file with use_colpali: {use_colpali}") + return await document_service.ingest_file( file=file, metadata=metadata_dict, auth=auth, rules=rules_list, use_colpali=use_colpali, + folder_name=folder_name, + end_user_id=end_user_id, ) except json.JSONDecodeError as e: raise HTTPException(status_code=400, detail=f"Invalid JSON: {str(e)}") @@ -397,6 +388,8 @@ async def batch_ingest_files( rules: str = Form("[]"), use_colpali: Optional[bool] = Form(None), parallel: bool = Form(True), + folder_name: Optional[str] = Form(None), + end_user_id: Optional[str] = Form(None), auth: AuthContext = Depends(verify_token), ) -> BatchIngestResponse: """ @@ -410,6 +403,8 @@ async def batch_ingest_files( - A list of rule lists, one per file use_colpali: Whether to use ColPali-style embedding parallel: Whether to process files in parallel + folder_name: Optional folder to scope the documents to + end_user_id: Optional end-user ID to scope the documents to auth: Authentication context Returns: @@ -447,6 +442,8 @@ async def batch_ingest_files( documents = [] errors = [] + # We'll pass folder_name and end_user_id directly to the ingest_file functions + async with telemetry.track_operation( operation_type="batch_ingest", user_id=auth.entity_id, @@ -454,6 +451,8 @@ async def batch_ingest_files( "file_count": len(files), "metadata_type": "list" if isinstance(metadata_value, list) else "single", "rules_type": "per_file" if isinstance(rules_list, list) and rules_list and isinstance(rules_list[0], list) else "shared", + "folder_name": folder_name, + "end_user_id": end_user_id, }, ): if parallel: @@ -466,7 +465,9 @@ async def batch_ingest_files( metadata=metadata_item, auth=auth, rules=file_rules, - use_colpali=use_colpali + use_colpali=use_colpali, + folder_name=folder_name, + 
end_user_id=end_user_id ) tasks.append(task) @@ -490,7 +491,9 @@ async def batch_ingest_files( metadata=metadata_item, auth=auth, rules=file_rules, - use_colpali=use_colpali + use_colpali=use_colpali, + folder_name=folder_name, + end_user_id=end_user_id ) documents.append(doc) except Exception as e: @@ -504,7 +507,24 @@ async def batch_ingest_files( @app.post("/retrieve/chunks", response_model=List[ChunkResult]) async def retrieve_chunks(request: RetrieveRequest, auth: AuthContext = Depends(verify_token)): - """Retrieve relevant chunks.""" + """ + Retrieve relevant chunks. + + Args: + request: RetrieveRequest containing: + - query: Search query text + - filters: Optional metadata filters + - k: Number of results (default: 4) + - min_score: Minimum similarity threshold (default: 0.0) + - use_reranking: Whether to use reranking + - use_colpali: Whether to use ColPali-style embedding model + - folder_name: Optional folder to scope the search to + - end_user_id: Optional end-user ID to scope the search to + auth: Authentication context + + Returns: + List[ChunkResult]: List of relevant chunks + """ try: async with telemetry.track_operation( operation_type="retrieve_chunks", @@ -514,6 +534,8 @@ async def retrieve_chunks(request: RetrieveRequest, auth: AuthContext = Depends( "min_score": request.min_score, "use_reranking": request.use_reranking, "use_colpali": request.use_colpali, + "folder_name": request.folder_name, + "end_user_id": request.end_user_id, }, ): return await document_service.retrieve_chunks( @@ -524,6 +546,8 @@ async def retrieve_chunks(request: RetrieveRequest, auth: AuthContext = Depends( request.min_score, request.use_reranking, request.use_colpali, + request.folder_name, + request.end_user_id, ) except PermissionError as e: raise HTTPException(status_code=403, detail=str(e)) @@ -531,7 +555,24 @@ async def retrieve_chunks(request: RetrieveRequest, auth: AuthContext = Depends( @app.post("/retrieve/docs", response_model=List[DocumentResult]) async def retrieve_documents(request: RetrieveRequest, auth: AuthContext = Depends(verify_token)): - """Retrieve relevant documents.""" + """ + Retrieve relevant documents. 
+ + Args: + request: RetrieveRequest containing: + - query: Search query text + - filters: Optional metadata filters + - k: Number of results (default: 4) + - min_score: Minimum similarity threshold (default: 0.0) + - use_reranking: Whether to use reranking + - use_colpali: Whether to use ColPali-style embedding model + - folder_name: Optional folder to scope the search to + - end_user_id: Optional end-user ID to scope the search to + auth: Authentication context + + Returns: + List[DocumentResult]: List of relevant documents + """ try: async with telemetry.track_operation( operation_type="retrieve_docs", @@ -541,6 +582,8 @@ async def retrieve_documents(request: RetrieveRequest, auth: AuthContext = Depen "min_score": request.min_score, "use_reranking": request.use_reranking, "use_colpali": request.use_colpali, + "folder_name": request.folder_name, + "end_user_id": request.end_user_id, }, ): return await document_service.retrieve_docs( @@ -551,39 +594,99 @@ async def retrieve_documents(request: RetrieveRequest, auth: AuthContext = Depen request.min_score, request.use_reranking, request.use_colpali, + request.folder_name, + request.end_user_id, ) except PermissionError as e: raise HTTPException(status_code=403, detail=str(e)) @app.post("/batch/documents", response_model=List[Document]) -async def batch_get_documents(document_ids: List[str], auth: AuthContext = Depends(verify_token)): - """Retrieve multiple documents by their IDs in a single batch operation.""" +async def batch_get_documents( + request: Dict[str, Any], + auth: AuthContext = Depends(verify_token) +): + """ + Retrieve multiple documents by their IDs in a single batch operation. + + Args: + request: Dictionary containing: + - document_ids: List of document IDs to retrieve + - folder_name: Optional folder to scope the operation to + - end_user_id: Optional end-user ID to scope the operation to + auth: Authentication context + + Returns: + List[Document]: List of documents matching the IDs + """ try: + # Extract document_ids from request + document_ids = request.get("document_ids", []) + folder_name = request.get("folder_name") + end_user_id = request.get("end_user_id") + + if not document_ids: + return [] + async with telemetry.track_operation( operation_type="batch_get_documents", user_id=auth.entity_id, metadata={ "document_count": len(document_ids), + "folder_name": folder_name, + "end_user_id": end_user_id, }, ): - return await document_service.batch_retrieve_documents(document_ids, auth) + return await document_service.batch_retrieve_documents(document_ids, auth, folder_name, end_user_id) except PermissionError as e: raise HTTPException(status_code=403, detail=str(e)) @app.post("/batch/chunks", response_model=List[ChunkResult]) -async def batch_get_chunks(chunk_ids: List[ChunkSource], auth: AuthContext = Depends(verify_token)): - """Retrieve specific chunks by their document ID and chunk number in a single batch operation.""" +async def batch_get_chunks( + request: Dict[str, Any], + auth: AuthContext = Depends(verify_token) +): + """ + Retrieve specific chunks by their document ID and chunk number in a single batch operation. 
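Both batch endpoints now take a JSON object rather than a bare list, so the optional scoping fields can travel alongside the IDs. A minimal client-side sketch, illustrative only and not part of this patch (the server URL and token are assumptions):

import requests  # any HTTP client works; requests is assumed here

BASE_URL = "http://localhost:8000"                 # assumed Morphik server address
HEADERS = {"Authorization": "Bearer <jwt-token>"}  # assumed bearer token

# Body keys mirror the request dictionary documented above.
resp = requests.post(
    f"{BASE_URL}/batch/documents",
    headers=HEADERS,
    json={
        "document_ids": ["doc-1", "doc-2"],   # hypothetical document IDs
        "folder_name": "project-x",           # optional folder scope
        "end_user_id": "user-123",            # optional end-user scope
    },
)
resp.raise_for_status()
documents = resp.json()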
+ + Args: + request: Dictionary containing: + - sources: List of ChunkSource objects (with document_id and chunk_number) + - folder_name: Optional folder to scope the operation to + - end_user_id: Optional end-user ID to scope the operation to + auth: Authentication context + + Returns: + List[ChunkResult]: List of chunk results + """ try: + # Extract sources from request + sources = request.get("sources", []) + folder_name = request.get("folder_name") + end_user_id = request.get("end_user_id") + + if not sources: + return [] + async with telemetry.track_operation( operation_type="batch_get_chunks", user_id=auth.entity_id, metadata={ - "chunk_count": len(chunk_ids), + "chunk_count": len(sources), + "folder_name": folder_name, + "end_user_id": end_user_id, }, ): - return await document_service.batch_retrieve_chunks(chunk_ids, auth) + # Convert sources to ChunkSource objects if needed + chunk_sources = [] + for source in sources: + if isinstance(source, dict): + chunk_sources.append(ChunkSource(**source)) + else: + chunk_sources.append(source) + + return await document_service.batch_retrieve_chunks(chunk_sources, auth, folder_name, end_user_id) except PermissionError as e: raise HTTPException(status_code=403, detail=str(e)) @@ -592,10 +695,32 @@ async def batch_get_chunks(chunk_ids: List[ChunkSource], auth: AuthContext = Dep async def query_completion( request: CompletionQueryRequest, auth: AuthContext = Depends(verify_token) ): - """Generate completion using relevant chunks as context. + """ + Generate completion using relevant chunks as context. When graph_name is provided, the query will leverage the knowledge graph to enhance retrieval by finding relevant entities and their connected documents. + + Args: + request: CompletionQueryRequest containing: + - query: Query text + - filters: Optional metadata filters + - k: Number of chunks to use as context (default: 4) + - min_score: Minimum similarity threshold (default: 0.0) + - max_tokens: Maximum tokens in completion + - temperature: Model temperature + - use_reranking: Whether to use reranking + - use_colpali: Whether to use ColPali-style embedding model + - graph_name: Optional name of the graph to use for knowledge graph-enhanced retrieval + - hop_depth: Number of relationship hops to traverse in the graph (1-3) + - include_paths: Whether to include relationship paths in the response + - prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts + - folder_name: Optional folder to scope the operation to + - end_user_id: Optional end-user ID to scope the operation to + auth: Authentication context + + Returns: + CompletionResponse: Generated completion """ try: # Validate prompt overrides before proceeding @@ -620,6 +745,8 @@ async def query_completion( "graph_name": request.graph_name, "hop_depth": request.hop_depth, "include_paths": request.include_paths, + "folder_name": request.folder_name, + "end_user_id": request.end_user_id, }, ): return await document_service.query( @@ -636,6 +763,8 @@ async def query_completion( request.hop_depth, request.include_paths, request.prompt_overrides, + request.folder_name, + request.end_user_id, ) except ValueError as e: validate_prompt_overrides_with_http_exception(operation_type="query", error=e) @@ -649,9 +778,31 @@ async def list_documents( skip: int = 0, limit: int = 10000, filters: Optional[Dict[str, Any]] = None, + folder_name: Optional[str] = None, + end_user_id: Optional[str] = None, ): - """List accessible documents.""" - return await 
document_service.db.get_documents(auth, skip, limit, filters) + """ + List accessible documents. + + Args: + auth: Authentication context + skip: Number of documents to skip + limit: Maximum number of documents to return + filters: Optional metadata filters + folder_name: Optional folder to scope the operation to + end_user_id: Optional end-user ID to scope the operation to + + Returns: + List[Document]: List of accessible documents + """ + # Create system filters for folder and user scoping + system_filters = {} + if folder_name: + system_filters["folder_name"] = folder_name + if end_user_id: + system_filters["end_user_id"] = end_user_id + + return await document_service.db.get_documents(auth, skip, limit, filters, system_filters) @app.get("/documents/{document_id}", response_model=Document) @@ -700,10 +851,33 @@ async def delete_document(document_id: str, auth: AuthContext = Depends(verify_t @app.get("/documents/filename/{filename}", response_model=Document) -async def get_document_by_filename(filename: str, auth: AuthContext = Depends(verify_token)): - """Get document by filename.""" +async def get_document_by_filename( + filename: str, + auth: AuthContext = Depends(verify_token), + folder_name: Optional[str] = None, + end_user_id: Optional[str] = None, +): + """ + Get document by filename. + + Args: + filename: Filename of the document to retrieve + auth: Authentication context + folder_name: Optional folder to scope the operation to + end_user_id: Optional end-user ID to scope the operation to + + Returns: + Document: Document metadata if found and accessible + """ try: - doc = await document_service.db.get_document_by_filename(filename, auth) + # Create system filters for folder and user scoping + system_filters = {} + if folder_name: + system_filters["folder_name"] = folder_name + if end_user_id: + system_filters["end_user_id"] = end_user_id + + doc = await document_service.db.get_document_by_filename(filename, auth, system_filters) logger.debug(f"Found document by filename: {doc}") if not doc: raise HTTPException(status_code=404, detail=f"Document with filename '{filename}' not found") @@ -1071,6 +1245,9 @@ async def create_graph( - name: Name of the graph to create - filters: Optional metadata filters to determine which documents to include - documents: Optional list of specific document IDs to include + - prompt_overrides: Optional customizations for entity extraction and resolution prompts + - folder_name: Optional folder to scope the operation to + - end_user_id: Optional end-user ID to scope the operation to auth: Authentication context Returns: @@ -1093,14 +1270,24 @@ async def create_graph( "name": request.name, "filters": request.filters, "documents": request.documents, + "folder_name": request.folder_name, + "end_user_id": request.end_user_id, }, ): + # Create system filters for folder and user scoping + system_filters = {} + if request.folder_name: + system_filters["folder_name"] = request.folder_name + if request.end_user_id: + system_filters["end_user_id"] = request.end_user_id + return await document_service.create_graph( name=request.name, auth=auth, filters=request.filters, documents=request.documents, prompt_overrides=request.prompt_overrides, + system_filters=system_filters, ) except PermissionError as e: raise HTTPException(status_code=403, detail=str(e)) @@ -1112,6 +1299,8 @@ async def create_graph( async def get_graph( name: str, auth: AuthContext = Depends(verify_token), + folder_name: Optional[str] = None, + end_user_id: Optional[str] = None, ) -> Graph: """ 
Get a graph by name. @@ -1121,6 +1310,8 @@ async def get_graph( Args: name: Name of the graph to retrieve auth: Authentication context + folder_name: Optional folder to scope the operation to + end_user_id: Optional end-user ID to scope the operation to Returns: Graph: The requested graph object @@ -1129,9 +1320,20 @@ async def get_graph( async with telemetry.track_operation( operation_type="get_graph", user_id=auth.entity_id, - metadata={"name": name}, + metadata={ + "name": name, + "folder_name": folder_name, + "end_user_id": end_user_id + }, ): - graph = await document_service.db.get_graph(name, auth) + # Create system filters for folder and user scoping + system_filters = {} + if folder_name: + system_filters["folder_name"] = folder_name + if end_user_id: + system_filters["end_user_id"] = end_user_id + + graph = await document_service.db.get_graph(name, auth, system_filters) if not graph: raise HTTPException(status_code=404, detail=f"Graph '{name}' not found") return graph @@ -1144,6 +1346,8 @@ async def get_graph( @app.get("/graphs", response_model=List[Graph]) async def list_graphs( auth: AuthContext = Depends(verify_token), + folder_name: Optional[str] = None, + end_user_id: Optional[str] = None, ) -> List[Graph]: """ List all graphs the user has access to. @@ -1152,6 +1356,8 @@ async def list_graphs( Args: auth: Authentication context + folder_name: Optional folder to scope the operation to + end_user_id: Optional end-user ID to scope the operation to Returns: List[Graph]: List of graph objects @@ -1160,8 +1366,19 @@ async def list_graphs( async with telemetry.track_operation( operation_type="list_graphs", user_id=auth.entity_id, + metadata={ + "folder_name": folder_name, + "end_user_id": end_user_id + }, ): - return await document_service.db.list_graphs(auth) + # Create system filters for folder and user scoping + system_filters = {} + if folder_name: + system_filters["folder_name"] = folder_name + if end_user_id: + system_filters["end_user_id"] = end_user_id + + return await document_service.db.list_graphs(auth, system_filters) except PermissionError as e: raise HTTPException(status_code=403, detail=str(e)) except Exception as e: @@ -1186,6 +1403,9 @@ async def update_graph( request: UpdateGraphRequest containing: - additional_filters: Optional additional metadata filters to determine which new documents to include - additional_documents: Optional list of additional document IDs to include + - prompt_overrides: Optional customizations for entity extraction and resolution prompts + - folder_name: Optional folder to scope the operation to + - end_user_id: Optional end-user ID to scope the operation to auth: Authentication context Returns: @@ -1203,14 +1423,24 @@ async def update_graph( "name": name, "additional_filters": request.additional_filters, "additional_documents": request.additional_documents, + "folder_name": request.folder_name, + "end_user_id": request.end_user_id, }, ): + # Create system filters for folder and user scoping + system_filters = {} + if request.folder_name: + system_filters["folder_name"] = request.folder_name + if request.end_user_id: + system_filters["end_user_id"] = request.end_user_id + return await document_service.update_graph( name=name, auth=auth, additional_filters=request.additional_filters, additional_documents=request.additional_documents, prompt_overrides=request.prompt_overrides, + system_filters=system_filters, ) except PermissionError as e: raise HTTPException(status_code=403, detail=str(e)) diff --git a/core/config.py b/core/config.py index 
5afdaf2..4ca6dad 100644 --- a/core/config.py +++ b/core/config.py @@ -13,7 +13,6 @@ class Settings(BaseSettings): # Environment variables JWT_SECRET_KEY: str POSTGRES_URI: Optional[str] = None - MONGODB_URI: Optional[str] = None UNSTRUCTURED_API_KEY: Optional[str] = None AWS_ACCESS_KEY: Optional[str] = None AWS_SECRET_ACCESS_KEY: Optional[str] = None @@ -42,9 +41,8 @@ class Settings(BaseSettings): # Database configuration - DATABASE_PROVIDER: Literal["postgres", "mongodb"] + DATABASE_PROVIDER: Literal["postgres"] DATABASE_NAME: Optional[str] = None - DOCUMENTS_COLLECTION: Optional[str] = None # Embedding configuration EMBEDDING_PROVIDER: Literal["litellm"] = "litellm" @@ -85,9 +83,8 @@ class Settings(BaseSettings): S3_BUCKET: Optional[str] = None # Vector store configuration - VECTOR_STORE_PROVIDER: Literal["pgvector", "mongodb"] + VECTOR_STORE_PROVIDER: Literal["pgvector"] VECTOR_STORE_DATABASE_NAME: Optional[str] = None - VECTOR_STORE_COLLECTION_NAME: Optional[str] = None # Colpali configuration ENABLE_COLPALI: bool @@ -164,24 +161,17 @@ def get_settings() -> Settings: # load database config database_config = {"DATABASE_PROVIDER": config["database"]["provider"]} - match database_config["DATABASE_PROVIDER"]: - case "mongodb": - database_config.update( - { - "DATABASE_NAME": config["database"]["database_name"], - "COLLECTION_NAME": config["database"]["collection_name"], - } - ) - case "postgres" if "POSTGRES_URI" in os.environ: - database_config.update({"POSTGRES_URI": os.environ["POSTGRES_URI"]}) - case "postgres": - msg = em.format( - missing_value="POSTGRES_URI", field="database.provider", value="postgres" - ) - raise ValueError(msg) - case _: - prov = database_config["DATABASE_PROVIDER"] - raise ValueError(f"Unknown database provider selected: '{prov}'") + if database_config["DATABASE_PROVIDER"] != "postgres": + prov = database_config["DATABASE_PROVIDER"] + raise ValueError(f"Unknown database provider selected: '{prov}'") + + if "POSTGRES_URI" in os.environ: + database_config.update({"POSTGRES_URI": os.environ["POSTGRES_URI"]}) + else: + msg = em.format( + missing_value="POSTGRES_URI", field="database.provider", value="postgres" + ) + raise ValueError(msg) # load embedding config embedding_config = { @@ -251,23 +241,15 @@ def get_settings() -> Settings: # load vector store config vector_store_config = {"VECTOR_STORE_PROVIDER": config["vector_store"]["provider"]} - match vector_store_config["VECTOR_STORE_PROVIDER"]: - case "mongodb": - vector_store_config.update( - { - "VECTOR_STORE_DATABASE_NAME": config["vector_store"]["database_name"], - "VECTOR_STORE_COLLECTION_NAME": config["vector_store"]["collection_name"], - } - ) - case "pgvector": - if "POSTGRES_URI" not in os.environ: - msg = em.format( - missing_value="POSTGRES_URI", field="vector_store.provider", value="pgvector" - ) - raise ValueError(msg) - case _: - prov = vector_store_config["VECTOR_STORE_PROVIDER"] - raise ValueError(f"Unknown vector store provider selected: '{prov}'") + if vector_store_config["VECTOR_STORE_PROVIDER"] != "pgvector": + prov = vector_store_config["VECTOR_STORE_PROVIDER"] + raise ValueError(f"Unknown vector store provider selected: '{prov}'") + + if "POSTGRES_URI" not in os.environ: + msg = em.format( + missing_value="POSTGRES_URI", field="vector_store.provider", value="pgvector" + ) + raise ValueError(msg) # load rules config rules_config = { diff --git a/core/database/base_database.py b/core/database/base_database.py index 9e1b807..01ba9e6 100644 --- a/core/database/base_database.py +++ 
b/core/database/base_database.py @@ -26,7 +26,7 @@ class BaseDatabase(ABC): pass @abstractmethod - async def get_document_by_filename(self, filename: str, auth: AuthContext) -> Optional[Document]: + async def get_document_by_filename(self, filename: str, auth: AuthContext, system_filters: Optional[Dict[str, Any]] = None) -> Optional[Document]: """ Retrieve document metadata by filename if user has access. If multiple documents have the same filename, returns the most recently updated one. @@ -34,6 +34,7 @@ class BaseDatabase(ABC): Args: filename: The filename to search for auth: Authentication context + system_filters: Optional system metadata filters (e.g. folder_name, end_user_id) Returns: Document if found and accessible, None otherwise @@ -41,14 +42,16 @@ class BaseDatabase(ABC): pass @abstractmethod - async def get_documents_by_id(self, document_ids: List[str], auth: AuthContext) -> List[Document]: + async def get_documents_by_id(self, document_ids: List[str], auth: AuthContext, system_filters: Optional[Dict[str, Any]] = None) -> List[Document]: """ Retrieve multiple documents by their IDs in a single batch operation. Only returns documents the user has access to. + Can filter by system metadata fields like folder_name and end_user_id. Args: document_ids: List of document IDs to retrieve auth: Authentication context + system_filters: Optional filters for system metadata fields Returns: List of Document objects that were found and user has access to @@ -62,10 +65,21 @@ class BaseDatabase(ABC): skip: int = 0, limit: int = 100, filters: Optional[Dict[str, Any]] = None, + system_filters: Optional[Dict[str, Any]] = None, ) -> List[Document]: """ List documents the user has access to. Supports pagination and filtering. + + Args: + auth: Authentication context + skip: Number of documents to skip (for pagination) + limit: Maximum number of documents to return + filters: Optional metadata filters + system_filters: Optional system metadata filters (e.g. folder_name, end_user_id) + + Returns: + List of documents matching the criteria """ pass @@ -89,9 +103,18 @@ class BaseDatabase(ABC): @abstractmethod async def find_authorized_and_filtered_documents( - self, auth: AuthContext, filters: Optional[Dict[str, Any]] = None + self, auth: AuthContext, filters: Optional[Dict[str, Any]] = None, system_filters: Optional[Dict[str, Any]] = None ) -> List[str]: - """Find document IDs matching filters that user has access to.""" + """Find document IDs matching filters that user has access to. + + Args: + auth: Authentication context + filters: Optional metadata filters + system_filters: Optional system metadata filters (e.g. folder_name, end_user_id) + + Returns: + List of document IDs matching the criteria + """ pass @abstractmethod @@ -142,12 +165,13 @@ class BaseDatabase(ABC): pass @abstractmethod - async def get_graph(self, name: str, auth: AuthContext) -> Optional[Graph]: + async def get_graph(self, name: str, auth: AuthContext, system_filters: Optional[Dict[str, Any]] = None) -> Optional[Graph]: """Get a graph by name. Args: name: Name of the graph auth: Authentication context + system_filters: Optional system metadata filters (e.g. 
folder_name, end_user_id) Returns: Optional[Graph]: Graph if found and accessible, None otherwise @@ -155,11 +179,12 @@ class BaseDatabase(ABC): pass @abstractmethod - async def list_graphs(self, auth: AuthContext) -> List[Graph]: + async def list_graphs(self, auth: AuthContext, system_filters: Optional[Dict[str, Any]] = None) -> List[Graph]: """List all graphs the user has access to. Args: auth: Authentication context + system_filters: Optional system metadata filters (e.g. folder_name, end_user_id) Returns: List[Graph]: List of graphs diff --git a/core/database/mongo_database.py b/core/database/mongo_database.py deleted file mode 100644 index d779332..0000000 --- a/core/database/mongo_database.py +++ /dev/null @@ -1,329 +0,0 @@ -from datetime import UTC, datetime -import logging -from typing import Dict, List, Optional, Any - -from motor.motor_asyncio import AsyncIOMotorClient -from pymongo import ReturnDocument -from pymongo.errors import PyMongoError - -from .base_database import BaseDatabase -from ..models.documents import Document -from ..models.auth import AuthContext, EntityType - -logger = logging.getLogger(__name__) - - -class MongoDatabase(BaseDatabase): - """MongoDB implementation for document metadata storage.""" - - def __init__( - self, - uri: str, - db_name: str, - collection_name: str, - ): - """Initialize MongoDB connection for document storage.""" - self.client = AsyncIOMotorClient(uri) - self.db = self.client[db_name] - self.collection = self.db[collection_name] - self.caches = self.db["caches"] # Collection for cache metadata - - async def initialize(self): - """Initialize database indexes.""" - try: - # Create indexes for common queries - await self.collection.create_index("external_id", unique=True) - await self.collection.create_index("owner.id") - await self.collection.create_index("access_control.readers") - await self.collection.create_index("access_control.writers") - await self.collection.create_index("access_control.admins") - await self.collection.create_index("system_metadata.created_at") - - logger.info("MongoDB indexes created successfully") - return True - except PyMongoError as e: - logger.error(f"Error creating MongoDB indexes: {str(e)}") - return False - - async def store_document(self, document: Document) -> bool: - """Store document metadata.""" - try: - doc_dict = document.model_dump() - - # Ensure system metadata - doc_dict["system_metadata"]["created_at"] = datetime.now(UTC) - doc_dict["system_metadata"]["updated_at"] = datetime.now(UTC) - doc_dict["metadata"]["external_id"] = doc_dict["external_id"] - - result = await self.collection.insert_one(doc_dict) - return bool(result.inserted_id) - - except PyMongoError as e: - logger.error(f"Error storing document metadata: {str(e)}") - return False - - async def get_document(self, document_id: str, auth: AuthContext) -> Optional[Document]: - """Retrieve document metadata by ID if user has access.""" - try: - # Build access filter - access_filter = self._build_access_filter(auth) - - # Query document - query = {"$and": [{"external_id": document_id}, access_filter]} - logger.debug(f"Querying document with query: {query}") - - doc_dict = await self.collection.find_one(query) - logger.debug(f"Found document: {doc_dict}") - return Document(**doc_dict) if doc_dict else None - - except PyMongoError as e: - logger.error(f"Error retrieving document metadata: {str(e)}") - raise e - - async def get_document_by_filename(self, filename: str, auth: AuthContext) -> Optional[Document]: - """Retrieve document metadata 
by filename if user has access. - If multiple documents have the same filename, returns the most recently updated one. - """ - try: - # Build access filter - access_filter = self._build_access_filter(auth) - - # Query document - query = {"$and": [{"filename": filename}, access_filter]} - logger.debug(f"Querying document by filename with query: {query}") - - # Sort by updated_at in descending order to get the most recent one - sort_criteria = [("system_metadata.updated_at", -1)] - - doc_dict = await self.collection.find_one(query, sort=sort_criteria) - logger.debug(f"Found document by filename: {doc_dict}") - - return Document(**doc_dict) if doc_dict else None - - except PyMongoError as e: - logger.error(f"Error retrieving document metadata by filename: {str(e)}") - raise e - - async def get_documents_by_id(self, document_ids: List[str], auth: AuthContext) -> List[Document]: - """ - Retrieve multiple documents by their IDs in a single batch operation. - Only returns documents the user has access to. - - Args: - document_ids: List of document IDs to retrieve - auth: Authentication context - - Returns: - List of Document objects that were found and user has access to - """ - try: - if not document_ids: - return [] - - # Build access filter - access_filter = self._build_access_filter(auth) - - # Query documents with both document IDs and access check in a single query - query = { - "$and": [ - {"external_id": {"$in": document_ids}}, - access_filter - ] - } - - logger.info(f"Batch retrieving {len(document_ids)} documents with a single query") - - # Execute batch query - cursor = self.collection.find(query) - - documents = [] - async for doc_dict in cursor: - documents.append(Document(**doc_dict)) - - logger.info(f"Found {len(documents)} documents in batch retrieval") - return documents - - except PyMongoError as e: - logger.error(f"Error batch retrieving documents: {str(e)}") - return [] - - async def get_documents( - self, - auth: AuthContext, - skip: int = 0, - limit: int = 100, - filters: Optional[Dict[str, Any]] = None, - ) -> List[Document]: - """List accessible documents with pagination and filtering.""" - try: - # Build query - auth_filter = self._build_access_filter(auth) - metadata_filter = self._build_metadata_filter(filters) - query = {"$and": [auth_filter, metadata_filter]} if metadata_filter else auth_filter - - # Execute paginated query - cursor = self.collection.find(query).skip(skip).limit(limit) - - documents = [] - async for doc_dict in cursor: - documents.append(Document(**doc_dict)) - - return documents - - except PyMongoError as e: - logger.error(f"Error listing documents: {str(e)}") - return [] - - async def update_document( - self, document_id: str, updates: Dict[str, Any], auth: AuthContext - ) -> bool: - """Update document metadata if user has write access.""" - try: - # Verify write access - if not await self.check_access(document_id, auth, "write"): - return False - - # Update system metadata - updates.setdefault("system_metadata", {}) - updates["system_metadata"]["updated_at"] = datetime.now(UTC) - - result = await self.collection.find_one_and_update( - {"external_id": document_id}, - {"$set": updates}, - return_document=ReturnDocument.AFTER, - ) - - return bool(result) - - except PyMongoError as e: - logger.error(f"Error updating document metadata: {str(e)}") - return False - - async def delete_document(self, document_id: str, auth: AuthContext) -> bool: - """Delete document if user has admin access.""" - try: - # Verify admin access - if not await 
self.check_access(document_id, auth, "admin"): - return False - - result = await self.collection.delete_one({"external_id": document_id}) - return bool(result.deleted_count) - - except PyMongoError as e: - logger.error(f"Error deleting document: {str(e)}") - return False - - async def find_authorized_and_filtered_documents( - self, auth: AuthContext, filters: Optional[Dict[str, Any]] = None - ) -> List[str]: - """Find document IDs matching filters and access permissions.""" - # Build query - auth_filter = self._build_access_filter(auth) - metadata_filter = self._build_metadata_filter(filters) - query = {"$and": [auth_filter, metadata_filter]} if metadata_filter else auth_filter - - # Get matching document IDs - cursor = self.collection.find(query, {"external_id": 1}) - - document_ids = [] - async for doc in cursor: - document_ids.append(doc["external_id"]) - - return document_ids - - async def check_access( - self, document_id: str, auth: AuthContext, required_permission: str = "read" - ) -> bool: - """Check if user has required permission for document.""" - try: - doc = await self.collection.find_one({"external_id": document_id}) - if not doc: - return False - - access_control = doc.get("access_control", {}) - - # Check owner access - owner = doc.get("owner", {}) - if owner.get("type") == auth.entity_type and owner.get("id") == auth.entity_id: - return True - - # Check permission-specific access - permission_map = {"read": "readers", "write": "writers", "admin": "admins"} - - permission_set = permission_map.get(required_permission) - if not permission_set: - return False - - return auth.entity_id in access_control.get(permission_set, set()) - - except PyMongoError as e: - logger.error(f"Error checking document access: {str(e)}") - return False - - def _build_access_filter(self, auth: AuthContext) -> Dict[str, Any]: - """Build MongoDB filter for access control.""" - base_filter = { - "$or": [ - {"owner.id": auth.entity_id}, - {"access_control.readers": auth.entity_id}, - {"access_control.writers": auth.entity_id}, - {"access_control.admins": auth.entity_id}, - ] - } - - if auth.entity_type == EntityType.DEVELOPER: - # Add app-specific access for developers - base_filter["$or"].append({"access_control.app_access": auth.app_id}) - - return base_filter - - def _build_metadata_filter(self, filters: Dict[str, Any]) -> Dict[str, Any]: - """Build MongoDB filter for metadata.""" - if not filters: - return {} - filter_dict = {} - for key, value in filters.items(): - filter_dict[f"metadata.{key}"] = value - return filter_dict - - async def store_cache_metadata(self, name: str, metadata: Dict[str, Any]) -> bool: - """Store metadata for a cache in MongoDB. - - Args: - name: Name of the cache - metadata: Cache metadata including model info and storage location - - Returns: - bool: Whether the operation was successful - """ - try: - # Add timestamp and ensure name is included - doc = { - "name": name, - "metadata": metadata, - "created_at": datetime.now(UTC), - "updated_at": datetime.now(UTC), - } - - # Upsert the document - result = await self.caches.update_one({"name": name}, {"$set": doc}, upsert=True) - return bool(result.modified_count or result.upserted_id) - except Exception as e: - logger.error(f"Failed to store cache metadata: {e}") - return False - - async def get_cache_metadata(self, name: str) -> Optional[Dict[str, Any]]: - """Get metadata for a cache from MongoDB. 
- - Args: - name: Name of the cache - - Returns: - Optional[Dict[str, Any]]: Cache metadata if found, None otherwise - """ - try: - doc = await self.caches.find_one({"name": name}) - return doc["metadata"] if doc else None - except Exception as e: - logger.error(f"Failed to get cache metadata: {e}") - return None diff --git a/core/database/postgres_database.py b/core/database/postgres_database.py index 150e59e..99a5bc9 100644 --- a/core/database/postgres_database.py +++ b/core/database/postgres_database.py @@ -51,6 +51,7 @@ class GraphModel(Base): entities = Column(JSONB, default=list) relationships = Column(JSONB, default=list) graph_metadata = Column(JSONB, default=dict) # Renamed from 'metadata' to avoid conflict + system_metadata = Column(JSONB, default=dict) # For folder_name and end_user_id document_ids = Column(JSONB, default=list) filters = Column(JSONB, nullable=True) created_at = Column(String) # ISO format string @@ -63,6 +64,7 @@ class GraphModel(Base): Index("idx_graph_name", "name"), Index("idx_graph_owner", "owner", postgresql_using="gin"), Index("idx_graph_access_control", "access_control", postgresql_using="gin"), + Index("idx_graph_system_metadata", "system_metadata", postgresql_using="gin"), ) @@ -139,6 +141,68 @@ class PostgresDatabase(BaseDatabase): ) ) logger.info("Added storage_files column to documents table") + + # Create indexes for folder_name and end_user_id in system_metadata for documents + await conn.execute( + text( + """ + CREATE INDEX IF NOT EXISTS idx_system_metadata_folder_name + ON documents ((system_metadata->>'folder_name')); + """ + ) + ) + + await conn.execute( + text( + """ + CREATE INDEX IF NOT EXISTS idx_system_metadata_end_user_id + ON documents ((system_metadata->>'end_user_id')); + """ + ) + ) + + # Check if system_metadata column exists in graphs table + result = await conn.execute( + text( + """ + SELECT column_name + FROM information_schema.columns + WHERE table_name = 'graphs' AND column_name = 'system_metadata' + """ + ) + ) + if not result.first(): + # Add system_metadata column to graphs table + await conn.execute( + text( + """ + ALTER TABLE graphs + ADD COLUMN IF NOT EXISTS system_metadata JSONB DEFAULT '{}'::jsonb + """ + ) + ) + logger.info("Added system_metadata column to graphs table") + + # Create indexes for folder_name and end_user_id in system_metadata for graphs + await conn.execute( + text( + """ + CREATE INDEX IF NOT EXISTS idx_graph_system_metadata_folder_name + ON graphs ((system_metadata->>'folder_name')); + """ + ) + ) + + await conn.execute( + text( + """ + CREATE INDEX IF NOT EXISTS idx_graph_system_metadata_end_user_id + ON graphs ((system_metadata->>'end_user_id')); + """ + ) + ) + + logger.info("Created indexes for folder_name and end_user_id in system_metadata") logger.info("PostgreSQL tables and indexes created successfully") self._initialized = True @@ -221,24 +285,42 @@ class PostgresDatabase(BaseDatabase): logger.error(f"Error retrieving document metadata: {str(e)}") return None - async def get_document_by_filename(self, filename: str, auth: AuthContext) -> Optional[Document]: + async def get_document_by_filename(self, filename: str, auth: AuthContext, system_filters: Optional[Dict[str, Any]] = None) -> Optional[Document]: """Retrieve document metadata by filename if user has access. If multiple documents have the same filename, returns the most recently updated one. + + Args: + filename: The filename to search for + auth: Authentication context + system_filters: Optional system metadata filters (e.g. 
folder_name, end_user_id) """ try: async with self.async_session() as session: # Build access filter access_filter = self._build_access_filter(auth) + system_metadata_filter = self._build_system_metadata_filter(system_filters) - # Query document + # Construct where clauses + where_clauses = [ + f"({access_filter})", + f"filename = '{filename.replace('\'', '\'\'')}'" # Escape single quotes + ] + + if system_metadata_filter: + where_clauses.append(f"({system_metadata_filter})") + + final_where_clause = " AND ".join(where_clauses) + + # Query document with system filters query = ( select(DocumentModel) - .where(DocumentModel.filename == filename) - .where(text(f"({access_filter})")) + .where(text(final_where_clause)) # Order by updated_at in system_metadata to get the most recent document .order_by(text("system_metadata->>'updated_at' DESC")) ) + logger.debug(f"Querying document by filename with system filters: {system_filters}") + result = await session.execute(query) doc_model = result.scalar_one_or_none() @@ -264,14 +346,16 @@ class PostgresDatabase(BaseDatabase): logger.error(f"Error retrieving document metadata by filename: {str(e)}") return None - async def get_documents_by_id(self, document_ids: List[str], auth: AuthContext) -> List[Document]: + async def get_documents_by_id(self, document_ids: List[str], auth: AuthContext, system_filters: Optional[Dict[str, Any]] = None) -> List[Document]: """ Retrieve multiple documents by their IDs in a single batch operation. Only returns documents the user has access to. + Can filter by system metadata fields like folder_name and end_user_id. Args: document_ids: List of document IDs to retrieve auth: Authentication context + system_filters: Optional filters for system metadata fields Returns: List of Document objects that were found and user has access to @@ -283,13 +367,21 @@ class PostgresDatabase(BaseDatabase): async with self.async_session() as session: # Build access filter access_filter = self._build_access_filter(auth) + system_metadata_filter = self._build_system_metadata_filter(system_filters) - # Query documents with both document IDs and access check in a single query - query = ( - select(DocumentModel) - .where(DocumentModel.external_id.in_(document_ids)) - .where(text(f"({access_filter})")) - ) + # Construct where clauses + where_clauses = [ + f"({access_filter})", + f"external_id IN ({', '.join([f'\'{doc_id}\'' for doc_id in document_ids])})" + ] + + if system_metadata_filter: + where_clauses.append(f"({system_metadata_filter})") + + final_where_clause = " AND ".join(where_clauses) + + # Query documents with document IDs, access check, and system filters in a single query + query = select(DocumentModel).where(text(final_where_clause)) logger.info(f"Batch retrieving {len(document_ids)} documents with a single query") @@ -328,6 +420,7 @@ class PostgresDatabase(BaseDatabase): skip: int = 0, limit: int = 10000, filters: Optional[Dict[str, Any]] = None, + system_filters: Optional[Dict[str, Any]] = None, ) -> List[Document]: """List documents the user has access to.""" try: @@ -335,10 +428,18 @@ class PostgresDatabase(BaseDatabase): # Build query access_filter = self._build_access_filter(auth) metadata_filter = self._build_metadata_filter(filters) + system_metadata_filter = self._build_system_metadata_filter(system_filters) - query = select(DocumentModel).where(text(f"({access_filter})")) + where_clauses = [f"({access_filter})"] + if metadata_filter: - query = query.where(text(metadata_filter)) + 
where_clauses.append(f"({metadata_filter})") + + if system_metadata_filter: + where_clauses.append(f"({system_metadata_filter})") + + final_where_clause = " AND ".join(where_clauses) + query = select(DocumentModel).where(text(final_where_clause)) query = query.offset(skip).limit(limit) @@ -373,9 +474,23 @@ class PostgresDatabase(BaseDatabase): try: if not await self.check_access(document_id, auth, "write"): return False + + # Get existing document to preserve system_metadata + existing_doc = await self.get_document(document_id, auth) + if not existing_doc: + return False # Update system metadata updates.setdefault("system_metadata", {}) + + # Preserve folder_name and end_user_id if not explicitly overridden + if existing_doc.system_metadata: + if "folder_name" in existing_doc.system_metadata and "folder_name" not in updates["system_metadata"]: + updates["system_metadata"]["folder_name"] = existing_doc.system_metadata["folder_name"] + + if "end_user_id" in existing_doc.system_metadata and "end_user_id" not in updates["system_metadata"]: + updates["system_metadata"]["end_user_id"] = existing_doc.system_metadata["end_user_id"] + updates["system_metadata"]["updated_at"] = datetime.now(UTC) # Serialize datetime objects to ISO format strings @@ -421,7 +536,7 @@ class PostgresDatabase(BaseDatabase): return False async def find_authorized_and_filtered_documents( - self, auth: AuthContext, filters: Optional[Dict[str, Any]] = None + self, auth: AuthContext, filters: Optional[Dict[str, Any]] = None, system_filters: Optional[Dict[str, Any]] = None ) -> List[str]: """Find document IDs matching filters and access permissions.""" try: @@ -429,14 +544,24 @@ class PostgresDatabase(BaseDatabase): # Build query access_filter = self._build_access_filter(auth) metadata_filter = self._build_metadata_filter(filters) + system_metadata_filter = self._build_system_metadata_filter(system_filters) logger.debug(f"Access filter: {access_filter}") logger.debug(f"Metadata filter: {metadata_filter}") + logger.debug(f"System metadata filter: {system_metadata_filter}") logger.debug(f"Original filters: {filters}") + logger.debug(f"System filters: {system_filters}") - query = select(DocumentModel.external_id).where(text(f"({access_filter})")) + where_clauses = [f"({access_filter})"] + if metadata_filter: - query = query.where(text(metadata_filter)) + where_clauses.append(f"({metadata_filter})") + + if system_metadata_filter: + where_clauses.append(f"({system_metadata_filter})") + + final_where_clause = " AND ".join(where_clauses) + query = select(DocumentModel.external_id).where(text(final_where_clause)) logger.debug(f"Final query: {query}") @@ -525,6 +650,25 @@ class PostgresDatabase(BaseDatabase): filter_conditions.append(f"doc_metadata->>'{key}' = '{value}'") return " AND ".join(filter_conditions) + + def _build_system_metadata_filter(self, system_filters: Optional[Dict[str, Any]]) -> str: + """Build PostgreSQL filter for system metadata.""" + if not system_filters: + return "" + + conditions = [] + for key, value in system_filters.items(): + if value is None: + continue + + if isinstance(value, str): + # Replace single quotes with double single quotes to escape them + escaped_value = value.replace("'", "''") + conditions.append(f"system_metadata->>'{key}' = '{escaped_value}'") + else: + conditions.append(f"system_metadata->>'{key}' = '{value}'") + + return " AND ".join(conditions) async def store_cache_metadata(self, name: str, metadata: Dict[str, Any]) -> bool: """Store metadata for a cache in PostgreSQL. 
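For illustration, a standalone sketch of the system-metadata filter logic introduced above; it mirrors _build_system_metadata_filter and only shows the WHERE-clause fragment that folder and end-user scoping contribute:

from typing import Any, Dict, Optional

def build_system_metadata_filter(system_filters: Optional[Dict[str, Any]]) -> str:
    # Mirrors the new helper: each non-None entry becomes an equality check
    # against the JSONB system_metadata column.
    if not system_filters:
        return ""
    conditions = []
    for key, value in system_filters.items():
        if value is None:
            continue
        if isinstance(value, str):
            escaped_value = value.replace("'", "''")  # escape single quotes for SQL
            conditions.append(f"system_metadata->>'{key}' = '{escaped_value}'")
        else:
            conditions.append(f"system_metadata->>'{key}' = '{value}'")
    return " AND ".join(conditions)

print(build_system_metadata_filter({"folder_name": "project-x", "end_user_id": "user-123"}))
# -> system_metadata->>'folder_name' = 'project-x' AND system_metadata->>'end_user_id' = 'user-123'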
@@ -618,12 +762,13 @@ class PostgresDatabase(BaseDatabase): logger.error(f"Error storing graph: {str(e)}") return False - async def get_graph(self, name: str, auth: AuthContext) -> Optional[Graph]: + async def get_graph(self, name: str, auth: AuthContext, system_filters: Optional[Dict[str, Any]] = None) -> Optional[Graph]: """Get a graph by name. Args: name: Name of the graph auth: Authentication context + system_filters: Optional system metadata filters (e.g. folder_name, end_user_id) Returns: Optional[Graph]: Graph if found and accessible, None otherwise @@ -637,7 +782,8 @@ class PostgresDatabase(BaseDatabase): # Build access filter access_filter = self._build_access_filter(auth) - # Query graph + # We need to check if the documents in the graph match the system filters + # First get the graph without system filters query = ( select(GraphModel) .where(GraphModel.name == name) @@ -648,6 +794,32 @@ class PostgresDatabase(BaseDatabase): graph_model = result.scalar_one_or_none() if graph_model: + # If system filters are provided, we need to filter the document_ids + document_ids = graph_model.document_ids + + if system_filters and document_ids: + # Apply system_filters to document_ids + system_metadata_filter = self._build_system_metadata_filter(system_filters) + + if system_metadata_filter: + # Get document IDs with system filters + doc_id_placeholders = ", ".join([f"'{doc_id}'" for doc_id in document_ids]) + filter_query = f""" + SELECT external_id FROM documents + WHERE external_id IN ({doc_id_placeholders}) + AND ({system_metadata_filter}) + """ + + filter_result = await session.execute(text(filter_query)) + filtered_doc_ids = [row[0] for row in filter_result.all()] + + # If no documents match system filters, return None + if not filtered_doc_ids: + return None + + # Update document_ids with filtered results + document_ids = filtered_doc_ids + # Convert to Graph model graph_dict = { "id": graph_model.id, @@ -655,7 +827,8 @@ class PostgresDatabase(BaseDatabase): "entities": graph_model.entities, "relationships": graph_model.relationships, "metadata": graph_model.graph_metadata, # Reference the renamed column - "document_ids": graph_model.document_ids, + "system_metadata": graph_model.system_metadata or {}, # Include system_metadata + "document_ids": document_ids, # Use possibly filtered document_ids "filters": graph_model.filters, "created_at": graph_model.created_at, "updated_at": graph_model.updated_at, @@ -670,11 +843,12 @@ class PostgresDatabase(BaseDatabase): logger.error(f"Error retrieving graph: {str(e)}") return None - async def list_graphs(self, auth: AuthContext) -> List[Graph]: + async def list_graphs(self, auth: AuthContext, system_filters: Optional[Dict[str, Any]] = None) -> List[Graph]: """List all graphs the user has access to. Args: auth: Authentication context + system_filters: Optional system metadata filters (e.g. 
folder_name, end_user_id) Returns: List[Graph]: List of graphs @@ -693,23 +867,66 @@ class PostgresDatabase(BaseDatabase): result = await session.execute(query) graph_models = result.scalars().all() - - return [ - Graph( - id=graph.id, - name=graph.name, - entities=graph.entities, - relationships=graph.relationships, - metadata=graph.graph_metadata, # Reference the renamed column - document_ids=graph.document_ids, - filters=graph.filters, - created_at=graph.created_at, - updated_at=graph.updated_at, - owner=graph.owner, - access_control=graph.access_control, - ) - for graph in graph_models - ] + + graphs = [] + + # If system filters are provided, we need to filter each graph's document_ids + if system_filters: + system_metadata_filter = self._build_system_metadata_filter(system_filters) + + for graph_model in graph_models: + document_ids = graph_model.document_ids + + if document_ids and system_metadata_filter: + # Get document IDs with system filters + doc_id_placeholders = ", ".join([f"'{doc_id}'" for doc_id in document_ids]) + filter_query = f""" + SELECT external_id FROM documents + WHERE external_id IN ({doc_id_placeholders}) + AND ({system_metadata_filter}) + """ + + filter_result = await session.execute(text(filter_query)) + filtered_doc_ids = [row[0] for row in filter_result.all()] + + # Only include graphs that have documents matching the system filters + if filtered_doc_ids: + graph = Graph( + id=graph_model.id, + name=graph_model.name, + entities=graph_model.entities, + relationships=graph_model.relationships, + metadata=graph_model.graph_metadata, # Reference the renamed column + system_metadata=graph_model.system_metadata or {}, # Include system_metadata + document_ids=filtered_doc_ids, # Use filtered document_ids + filters=graph_model.filters, + created_at=graph_model.created_at, + updated_at=graph_model.updated_at, + owner=graph_model.owner, + access_control=graph_model.access_control, + ) + graphs.append(graph) + else: + # No system filters, include all graphs + graphs = [ + Graph( + id=graph.id, + name=graph.name, + entities=graph.entities, + relationships=graph.relationships, + metadata=graph.graph_metadata, # Reference the renamed column + system_metadata=graph.system_metadata or {}, # Include system_metadata + document_ids=graph.document_ids, + filters=graph.filters, + created_at=graph.created_at, + updated_at=graph.updated_at, + owner=graph.owner, + access_control=graph.access_control, + ) + for graph in graph_models + ] + + return graphs except Exception as e: logger.error(f"Error listing graphs: {str(e)}") diff --git a/core/models/completion.py b/core/models/completion.py index 70633db..894012f 100644 --- a/core/models/completion.py +++ b/core/models/completion.py @@ -28,3 +28,5 @@ class CompletionRequest(BaseModel): max_tokens: Optional[int] = 1000 temperature: Optional[float] = 0.7 prompt_template: Optional[str] = None + folder_name: Optional[str] = None + end_user_id: Optional[str] = None diff --git a/core/models/documents.py b/core/models/documents.py index 96c80d4..5454ffb 100644 --- a/core/models/documents.py +++ b/core/models/documents.py @@ -27,7 +27,7 @@ class StorageFileInfo(BaseModel): class Document(BaseModel): - """Represents a document stored in MongoDB documents collection""" + """Represents a document stored in the database documents collection""" external_id: str = Field(default_factory=lambda: str(uuid.uuid4())) owner: Dict[str, str] @@ -44,6 +44,8 @@ class Document(BaseModel): "created_at": datetime.now(UTC), "updated_at": datetime.now(UTC), 
"version": 1, + "folder_name": None, + "end_user_id": None, } ) """metadata such as creation date etc.""" diff --git a/core/models/graph.py b/core/models/graph.py index aefa163..8325461 100644 --- a/core/models/graph.py +++ b/core/models/graph.py @@ -50,6 +50,14 @@ class Graph(BaseModel): entities: List[Entity] = Field(default_factory=list) relationships: List[Relationship] = Field(default_factory=list) metadata: Dict[str, Any] = Field(default_factory=dict) + system_metadata: Dict[str, Any] = Field( + default_factory=lambda: { + "created_at": datetime.now(UTC), + "updated_at": datetime.now(UTC), + "folder_name": None, + "end_user_id": None, + } + ) document_ids: List[str] = Field(default_factory=list) filters: Optional[Dict[str, Any]] = None created_at: datetime = Field(default_factory=lambda: datetime.now(UTC)) diff --git a/core/models/request.py b/core/models/request.py index 00d75e9..6ff3500 100644 --- a/core/models/request.py +++ b/core/models/request.py @@ -23,6 +23,8 @@ class RetrieveRequest(BaseModel): include_paths: Optional[bool] = Field( False, description="Whether to include relationship paths in the response" ) + folder_name: Optional[str] = Field(None, description="Optional folder scope for the operation") + end_user_id: Optional[str] = Field(None, description="Optional end-user scope for the operation") class CompletionQueryRequest(RetrieveRequest): @@ -44,6 +46,8 @@ class IngestTextRequest(BaseModel): metadata: Dict[str, Any] = Field(default_factory=dict) rules: List[Dict[str, Any]] = Field(default_factory=list) use_colpali: Optional[bool] = None + folder_name: Optional[str] = Field(None, description="Optional folder scope for the operation") + end_user_id: Optional[str] = Field(None, description="Optional end-user scope for the operation") class CreateGraphRequest(BaseModel): @@ -66,6 +70,8 @@ class CreateGraphRequest(BaseModel): } }} ) + folder_name: Optional[str] = Field(None, description="Optional folder scope for the operation") + end_user_id: Optional[str] = Field(None, description="Optional end-user scope for the operation") class UpdateGraphRequest(BaseModel): @@ -81,6 +87,8 @@ class UpdateGraphRequest(BaseModel): None, description="Optional customizations for entity extraction and resolution prompts" ) + folder_name: Optional[str] = Field(None, description="Optional folder scope for the operation") + end_user_id: Optional[str] = Field(None, description="Optional end-user scope for the operation") class BatchIngestResponse(BaseModel): diff --git a/core/services/document_service.py b/core/services/document_service.py index d635951..f55b500 100644 --- a/core/services/document_service.py +++ b/core/services/document_service.py @@ -95,6 +95,8 @@ class DocumentService: min_score: float = 0.0, use_reranking: Optional[bool] = None, use_colpali: Optional[bool] = None, + folder_name: Optional[str] = None, + end_user_id: Optional[str] = None, ) -> List[ChunkResult]: """Retrieve relevant chunks.""" settings = get_settings() @@ -106,7 +108,14 @@ class DocumentService: logger.info("Generated query embedding") # Find authorized documents - doc_ids = await self.db.find_authorized_and_filtered_documents(auth, filters) + # Build system filters for folder_name and end_user_id + system_filters = {} + if folder_name: + system_filters["folder_name"] = folder_name + if end_user_id: + system_filters["end_user_id"] = end_user_id + + doc_ids = await self.db.find_authorized_and_filtered_documents(auth, filters, system_filters) if not doc_ids: logger.info("No authorized documents found") 
return [] @@ -194,11 +203,13 @@ class DocumentService: min_score: float = 0.0, use_reranking: Optional[bool] = None, use_colpali: Optional[bool] = None, + folder_name: Optional[str] = None, + end_user_id: Optional[str] = None, ) -> List[DocumentResult]: """Retrieve relevant documents.""" # Get chunks first chunks = await self.retrieve_chunks( - query, auth, filters, k, min_score, use_reranking, use_colpali + query, auth, filters, k, min_score, use_reranking, use_colpali, folder_name, end_user_id ) # Convert to document results results = await self._create_document_results(auth, chunks) @@ -209,7 +220,9 @@ class DocumentService: async def batch_retrieve_documents( self, document_ids: List[str], - auth: AuthContext + auth: AuthContext, + folder_name: Optional[str] = None, + end_user_id: Optional[str] = None ) -> List[Document]: """ Retrieve multiple documents by their IDs in a single batch operation. @@ -224,15 +237,24 @@ class DocumentService: if not document_ids: return [] + # Build system filters for folder_name and end_user_id + system_filters = {} + if folder_name: + system_filters["folder_name"] = folder_name + if end_user_id: + system_filters["end_user_id"] = end_user_id + # Use the database's batch retrieval method - documents = await self.db.get_documents_by_id(document_ids, auth) + documents = await self.db.get_documents_by_id(document_ids, auth, system_filters) logger.info(f"Batch retrieved {len(documents)} documents out of {len(document_ids)} requested") return documents async def batch_retrieve_chunks( self, chunk_ids: List[ChunkSource], - auth: AuthContext + auth: AuthContext, + folder_name: Optional[str] = None, + end_user_id: Optional[str] = None ) -> List[ChunkResult]: """ Retrieve specific chunks by their document ID and chunk number in a single batch operation. @@ -251,7 +273,7 @@ class DocumentService: doc_ids = list({source.document_id for source in chunk_ids}) # Find authorized documents in a single query - authorized_docs = await self.batch_retrieve_documents(doc_ids, auth) + authorized_docs = await self.batch_retrieve_documents(doc_ids, auth, folder_name, end_user_id) authorized_doc_ids = {doc.external_id for doc in authorized_docs} # Filter sources to only include authorized documents @@ -292,6 +314,8 @@ class DocumentService: hop_depth: int = 1, include_paths: bool = False, prompt_overrides: Optional["QueryPromptOverrides"] = None, + folder_name: Optional[str] = None, + end_user_id: Optional[str] = None, ) -> CompletionResponse: """Generate completion using relevant chunks as context. 
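
Across the retrieval methods above, the scoping pattern is identical: folder_name and end_user_id are collected into a system_filters dict and pushed down to the database layer (find_authorized_and_filtered_documents here, and _build_system_metadata_filter in the PostgresDatabase hunks earlier in this patch). The body of that helper is not shown in this excerpt, so the following is only a sketch, under stated assumptions, of how such a dict could be turned into a parameterized predicate over the JSONB system_metadata column; everything except the two filter keys is assumed.

    from typing import Any, Dict, List, Tuple

    def build_system_metadata_predicate(system_filters: Dict[str, Any]) -> Tuple[str, Dict[str, Any]]:
        """Turn {"folder_name": ..., "end_user_id": ...} into a WHERE fragment plus bind params."""
        allowed_keys = {"folder_name", "end_user_id"}  # only known system metadata keys
        clauses: List[str] = []
        params: Dict[str, Any] = {}
        for i, (key, value) in enumerate(sorted(system_filters.items())):
            if key not in allowed_keys or value is None:
                continue
            param = f"sys_{i}"
            # system_metadata is a JSONB column; ->> extracts the value as text
            clauses.append(f"system_metadata->>'{key}' = :{param}")
            params[param] = str(value)
        return (" AND ".join(clauses) or "TRUE", params)

    # e.g. build_system_metadata_predicate({"folder_name": "test_folder"})
    # -> ("system_metadata->>'folder_name' = :sys_0", {"sys_0": "test_folder"})

Binding the values as parameters rather than interpolating them keeps the filter safe for arbitrary folder names and end-user identifiers.
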
@@ -329,11 +353,13 @@ class DocumentService: hop_depth=hop_depth, include_paths=include_paths, prompt_overrides=prompt_overrides, + folder_name=folder_name, + end_user_id=end_user_id ) # Standard retrieval without graph chunks = await self.retrieve_chunks( - query, auth, filters, k, min_score, use_reranking, use_colpali + query, auth, filters, k, min_score, use_reranking, use_colpali, folder_name, end_user_id ) documents = await self._create_document_results(auth, chunks) @@ -374,6 +400,8 @@ class DocumentService: auth: AuthContext = None, rules: Optional[List[str]] = None, use_colpali: Optional[bool] = None, + folder_name: Optional[str] = None, + end_user_id: Optional[str] = None, ) -> Document: """Ingest a text document.""" if "write" not in auth.permissions: @@ -396,6 +424,12 @@ class DocumentService: "user_id": [auth.user_id] if auth.user_id else [], # Add user_id to access control for filtering (as a list) }, ) + + # Add folder_name and end_user_id to system_metadata if provided + if folder_name: + doc.system_metadata["folder_name"] = folder_name + if end_user_id: + doc.system_metadata["end_user_id"] = end_user_id logger.debug(f"Created text document record with ID {doc.external_id}") if settings.MODE == "cloud" and auth.user_id: @@ -459,6 +493,8 @@ class DocumentService: auth: AuthContext, rules: Optional[List[str]] = None, use_colpali: Optional[bool] = None, + folder_name: Optional[str] = None, + end_user_id: Optional[str] = None, ) -> Document: """Ingest a file document.""" if "write" not in auth.permissions: @@ -527,6 +563,12 @@ class DocumentService: }, additional_metadata=additional_metadata, ) + + # Add folder_name and end_user_id to system_metadata if provided + if folder_name: + doc.system_metadata["folder_name"] = folder_name + if end_user_id: + doc.system_metadata["end_user_id"] = end_user_id if settings.MODE == "cloud" and auth.user_id: # Check limits before proceeding with parsing @@ -730,7 +772,13 @@ class DocumentService: chunks: List[Chunk], embeddings: List[List[float]], ) -> List[DocumentChunk]: - """Helper to create chunk objects""" + """Helper to create chunk objects + + Note: folder_name and end_user_id are not needed in chunk metadata because: + 1. Filtering by these values happens at the document level in find_authorized_and_filtered_documents + 2. Vector search is only performed on already authorized and filtered documents + 3. This approach is more efficient as it reduces the size of chunk metadata + """ return [ c.to_document_chunk(chunk_number=i, embedding=embedding, document_id=doc_id) for i, (embedding, c) in enumerate(zip(embeddings, chunks)) @@ -1341,6 +1389,7 @@ class DocumentService: filters: Optional[Dict[str, Any]] = None, documents: Optional[List[str]] = None, prompt_overrides: Optional[GraphPromptOverrides] = None, + system_filters: Optional[Dict[str, Any]] = None, ) -> Graph: """Create a graph from documents. 
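
The ingest_text and ingest_file hunks above stamp folder_name and end_user_id into the document's system_metadata, and the note on _create_chunk_objects explains why the chunks themselves do not carry these fields. A rough usage sketch of the resulting service API follows; it assumes an already-constructed DocumentService and AuthContext, and the keyword names for the earlier ingest_text parameters (content, metadata) are taken from the request models rather than shown in this hunk.

    # Rough usage sketch (assumes `service: DocumentService` and `auth: AuthContext`
    # already exist; the content/metadata keyword names are assumptions).
    async def scoped_ingest_and_search(service, auth):
        doc = await service.ingest_text(
            content="Login bug reported by a customer on the checkout page.",
            metadata={"category": "bug"},
            auth=auth,
            folder_name="customer-support",    # written into doc.system_metadata
            end_user_id="agent@example.com",   # written into doc.system_metadata
        )

        # Retrieval with the same scope only sees documents whose system_metadata
        # matches folder_name / end_user_id; the filtering happens in
        # find_authorized_and_filtered_documents, not per chunk.
        chunks = await service.retrieve_chunks(
            "What bugs were reported?",
            auth,
            folder_name="customer-support",
            end_user_id="agent@example.com",
        )
        return doc, chunks
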
@@ -1352,6 +1401,8 @@ class DocumentService: auth: Authentication context filters: Optional metadata filters to determine which documents to include documents: Optional list of specific document IDs to include + prompt_overrides: Optional customizations for entity extraction and resolution prompts + system_filters: Optional system filters like folder_name and end_user_id for scoping Returns: Graph: The created graph @@ -1364,6 +1415,7 @@ class DocumentService: filters=filters, documents=documents, prompt_overrides=prompt_overrides, + system_filters=system_filters, ) async def update_graph( @@ -1373,6 +1425,7 @@ class DocumentService: additional_filters: Optional[Dict[str, Any]] = None, additional_documents: Optional[List[str]] = None, prompt_overrides: Optional[GraphPromptOverrides] = None, + system_filters: Optional[Dict[str, Any]] = None, ) -> Graph: """Update an existing graph with new documents. @@ -1384,6 +1437,8 @@ class DocumentService: auth: Authentication context additional_filters: Optional additional metadata filters to determine which new documents to include additional_documents: Optional list of additional document IDs to include + prompt_overrides: Optional customizations for entity extraction and resolution prompts + system_filters: Optional system filters like folder_name and end_user_id for scoping Returns: Graph: The updated graph @@ -1396,6 +1451,7 @@ class DocumentService: additional_filters=additional_filters, additional_documents=additional_documents, prompt_overrides=prompt_overrides, + system_filters=system_filters, ) async def delete_document(self, document_id: str, auth: AuthContext) -> bool: diff --git a/core/services/graph_service.py b/core/services/graph_service.py index 2368267..0ebcea5 100644 --- a/core/services/graph_service.py +++ b/core/services/graph_service.py @@ -68,6 +68,7 @@ class GraphService: additional_filters: Optional[Dict[str, Any]] = None, additional_documents: Optional[List[str]] = None, prompt_overrides: Optional[GraphPromptOverrides] = None, + system_filters: Optional[Dict[str, Any]] = None, ) -> Graph: """Update an existing graph with new documents. @@ -81,10 +82,15 @@ class GraphService: additional_filters: Optional additional metadata filters to determine which new documents to include additional_documents: Optional list of specific additional document IDs to include prompt_overrides: Optional GraphPromptOverrides with customizations for prompts + system_filters: Optional system metadata filters (e.g. 
folder_name, end_user_id) to determine which documents to include Returns: Graph: The updated graph """ + # Initialize system_filters if None + if system_filters is None: + system_filters = {} + if "write" not in auth.permissions: raise PermissionError("User does not have write permission") @@ -99,7 +105,7 @@ class GraphService: # Find new documents to process document_ids = await self._get_new_document_ids( - auth, existing_graph, additional_filters, additional_documents + auth, existing_graph, additional_filters, additional_documents, system_filters ) if not document_ids and not explicit_doc_ids: @@ -123,7 +129,7 @@ class GraphService: # Batch retrieve all documents in a single call document_objects = await document_service.batch_retrieve_documents( - all_ids_to_retrieve, auth + all_ids_to_retrieve, auth, system_filters.get("folder_name", None), system_filters.get("end_user_id", None) ) # Process explicit documents if needed @@ -150,6 +156,8 @@ class GraphService: if doc_id not in {d.external_id for d in document_objects} ], auth, + system_filters.get("folder_name", None), + system_filters.get("end_user_id", None) ) logger.info(f"Additional filtered documents to include: {len(filtered_docs)}") document_objects.extend(filtered_docs) @@ -190,23 +198,28 @@ class GraphService: existing_graph: Graph, additional_filters: Optional[Dict[str, Any]] = None, additional_documents: Optional[List[str]] = None, + system_filters: Optional[Dict[str, Any]] = None, ) -> Set[str]: """Get IDs of new documents to add to the graph.""" + # Initialize system_filters if None + if system_filters is None: + system_filters = {} # Initialize with explicitly specified documents, ensuring it's a set document_ids = set(additional_documents or []) # Process documents matching additional filters - if additional_filters: - filtered_docs = await self.db.get_documents(auth, filters=additional_filters) + if additional_filters or system_filters: + filtered_docs = await self.db.get_documents(auth, filters=additional_filters, system_filters=system_filters) filter_doc_ids = {doc.external_id for doc in filtered_docs} - logger.info(f"Found {len(filter_doc_ids)} documents matching additional filters") + logger.info(f"Found {len(filter_doc_ids)} documents matching additional filters and system filters") document_ids.update(filter_doc_ids) # Process documents matching the original filters if existing_graph.filters: - filtered_docs = await self.db.get_documents(auth, filters=existing_graph.filters) + # Original filters shouldn't include system filters, as we're applying them separately + filtered_docs = await self.db.get_documents(auth, filters=existing_graph.filters, system_filters=system_filters) orig_filter_doc_ids = {doc.external_id for doc in filtered_docs} - logger.info(f"Found {len(orig_filter_doc_ids)} documents matching original filters") + logger.info(f"Found {len(orig_filter_doc_ids)} documents matching original filters and system filters") document_ids.update(orig_filter_doc_ids) # Get only the document IDs that are not already in the graph @@ -384,6 +397,7 @@ class GraphService: filters: Optional[Dict[str, Any]] = None, documents: Optional[List[str]] = None, prompt_overrides: Optional[GraphPromptOverrides] = None, + system_filters: Optional[Dict[str, Any]] = None, ) -> Graph: """Create a graph from documents. 
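
The system_filters dict consumed by GraphService is produced one layer up: the core/api.py changes in this PR (outside this excerpt) translate the folder_name and end_user_id fields on CreateGraphRequest / UpdateGraphRequest into the dict that DocumentService.create_graph and update_graph forward here. A sketch of that glue is below; request fields other than folder_name / end_user_id, and the name keyword, are assumptions based on the signatures added above.

    # Sketch of the API-side glue (the real wiring lives in core/api.py).
    async def create_scoped_graph(document_service, request, auth):
        system_filters = {}
        if request.folder_name:
            system_filters["folder_name"] = request.folder_name
        if request.end_user_id:
            system_filters["end_user_id"] = request.end_user_id

        # Delegates to GraphService.create_graph, which applies system_filters
        # when selecting documents and records the scope in graph.system_metadata.
        return await document_service.create_graph(
            name=request.name,
            auth=auth,
            filters=request.filters,
            documents=request.documents,
            prompt_overrides=request.prompt_overrides,
            system_filters=system_filters,
        )
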
@@ -397,10 +411,15 @@ class GraphService: filters: Optional metadata filters to determine which documents to include documents: Optional list of specific document IDs to include prompt_overrides: Optional GraphPromptOverrides with customizations for prompts + system_filters: Optional system metadata filters (e.g. folder_name, end_user_id) to determine which documents to include Returns: Graph: The created graph """ + # Initialize system_filters if None + if system_filters is None: + system_filters = {} + if "write" not in auth.permissions: raise PermissionError("User does not have write permission") @@ -408,15 +427,28 @@ class GraphService: document_ids = set(documents or []) # If filters were provided, get matching documents - if filters: - filtered_docs = await self.db.get_documents(auth, filters=filters) + if filters or system_filters: + filtered_docs = await self.db.get_documents(auth, filters=filters, system_filters=system_filters) document_ids.update(doc.external_id for doc in filtered_docs) if not document_ids: raise ValueError("No documents found matching criteria") + # Convert system_filters for document retrieval + folder_name = system_filters.get("folder_name") if system_filters else None + end_user_id = system_filters.get("end_user_id") if system_filters else None + # Batch retrieve documents for authorization check - document_objects = await document_service.batch_retrieve_documents(list(document_ids), auth) + document_objects = await document_service.batch_retrieve_documents( + list(document_ids), + auth, + folder_name, + end_user_id + ) + + # Log for debugging + logger.info(f"Graph creation with folder_name={folder_name}, end_user_id={end_user_id}") + logger.info(f"Documents retrieved: {len(document_objects)} out of {len(document_ids)} requested") if not document_objects: raise ValueError("No authorized documents found matching criteria") @@ -434,6 +466,13 @@ class GraphService: "admins": [auth.entity_id], }, ) + + # Add folder_name and end_user_id to system_metadata if provided + if system_filters: + if "folder_name" in system_filters: + graph.system_metadata["folder_name"] = system_filters["folder_name"] + if "end_user_id" in system_filters: + graph.system_metadata["end_user_id"] = system_filters["end_user_id"] # Extract entities and relationships entities, relationships = await self._process_documents_for_entities( @@ -868,6 +907,8 @@ class GraphService: hop_depth: int = 1, include_paths: bool = False, prompt_overrides: Optional[QueryPromptOverrides] = None, + folder_name: Optional[str] = None, + end_user_id: Optional[str] = None, ) -> CompletionResponse: """Generate completion using knowledge graph-enhanced retrieval. @@ -899,8 +940,15 @@ class GraphService: # Validation is now handled by type annotations - # Get the knowledge graph - graph = await self.db.get_graph(graph_name, auth) + # Build system filters for scoping + system_filters = {} + if folder_name: + system_filters["folder_name"] = folder_name + if end_user_id: + system_filters["end_user_id"] = end_user_id + + logger.info(f"Querying graph with system_filters: {system_filters}") + graph = await self.db.get_graph(graph_name, auth, system_filters=system_filters) if not graph: logger.warning(f"Graph '{graph_name}' not found or not accessible") # Fall back to standard retrieval if graph not found @@ -915,12 +963,14 @@ class GraphService: use_reranking=use_reranking, use_colpali=use_colpali, graph_name=None, + folder_name=folder_name, + end_user_id=end_user_id, ) # Parallel approach # 1. 
Standard vector search vector_chunks = await document_service.retrieve_chunks( - query, auth, filters, k, min_score, use_reranking, use_colpali + query, auth, filters, k, min_score, use_reranking, use_colpali, folder_name, end_user_id ) logger.info(f"Vector search retrieved {len(vector_chunks)} chunks") @@ -990,7 +1040,7 @@ class GraphService: # Get specific chunks containing these entities graph_chunks = await self._retrieve_entity_chunks( - expanded_entities, auth, filters, document_service + expanded_entities, auth, filters, document_service, folder_name, end_user_id ) logger.info(f"Retrieved {len(graph_chunks)} chunks containing relevant entities") @@ -1015,6 +1065,8 @@ class GraphService: auth, graph_name, prompt_overrides, + folder_name=folder_name, + end_user_id=end_user_id, ) return completion_response @@ -1143,8 +1195,13 @@ class GraphService: auth: AuthContext, filters: Optional[Dict[str, Any]], document_service, + folder_name: Optional[str] = None, + end_user_id: Optional[str] = None, ) -> List[ChunkResult]: """Retrieve chunks containing the specified entities.""" + # Initialize filters if None + if filters is None: + filters = {} if not entities: return [] @@ -1158,9 +1215,9 @@ class GraphService: # Get unique document IDs for authorization check doc_ids = {doc_id for doc_id, _ in entity_chunk_sources} - - # Check document authorization - documents = await document_service.batch_retrieve_documents(list(doc_ids), auth) + + # Check document authorization with system filters + documents = await document_service.batch_retrieve_documents(list(doc_ids), auth, folder_name, end_user_id) # Apply filters if needed authorized_doc_ids = { @@ -1178,7 +1235,7 @@ class GraphService: # Retrieve and return chunks if we have any valid sources return ( - await document_service.batch_retrieve_chunks(chunk_sources, auth) + await document_service.batch_retrieve_chunks(chunk_sources, auth, folder_name=folder_name, end_user_id=end_user_id) if chunk_sources else [] ) @@ -1198,7 +1255,7 @@ class GraphService: chunk.score = min(1.0, (getattr(chunk, "score", 0.7) or 0.7) * 1.05) # Keep the higher-scored version - if chunk_key not in all_chunks or chunk.score > all_chunks[chunk_key].score: + if chunk_key not in all_chunks or chunk.score > (getattr(all_chunks.get(chunk_key), "score", 0) or 0): all_chunks[chunk_key] = chunk # Convert to list, sort by score, and return top k @@ -1330,6 +1387,8 @@ class GraphService: auth: Optional[AuthContext] = None, graph_name: Optional[str] = None, prompt_overrides: Optional[QueryPromptOverrides] = None, + folder_name: Optional[str] = None, + end_user_id: Optional[str] = None, ) -> CompletionResponse: """Generate completion using the retrieved chunks and optional path information.""" if not chunks: @@ -1370,6 +1429,8 @@ class GraphService: max_tokens=max_tokens, temperature=temperature, prompt_template=custom_prompt_template, + folder_name=folder_name, + end_user_id=end_user_id, ) # Get completion from model @@ -1387,6 +1448,7 @@ class GraphService: # Include graph metadata if paths were requested if include_paths: + # Initialize metadata if it doesn't exist if not hasattr(response, "metadata") or response.metadata is None: response.metadata = {} diff --git a/core/tests/integration/test_api.py b/core/tests/integration/test_api.py index 4cab9bb..d5bdfa6 100644 --- a/core/tests/integration/test_api.py +++ b/core/tests/integration/test_api.py @@ -29,7 +29,7 @@ logger = logging.getLogger(__name__) TEST_DATA_DIR = Path(__file__).parent / "test_data" JWT_SECRET = 
"your-secret-key-for-signing-tokens" TEST_USER_ID = "test_user" -TEST_POSTGRES_URI = "postgresql+asyncpg://postgres:postgres@localhost:5432/morphik_test" +TEST_POSTGRES_URI = "postgresql+asyncpg://morphik@localhost:5432/morphik_test" @pytest.fixture(scope="session") @@ -261,6 +261,41 @@ async def test_ingest_text_document_with_metadata(client: AsyncClient, content: return data["external_id"] +@pytest.mark.asyncio +async def test_ingest_text_document_folder_user( + client: AsyncClient, + content: str = "Test content for document ingestion with folder and user scoping", + metadata: dict = None, + folder_name: str = "test_folder", + end_user_id: str = "test_user@example.com" +): + """Test ingesting a text document with folder and user scoping""" + headers = create_auth_header() + + response = await client.post( + "/ingest/text", + json={ + "content": content, + "metadata": metadata or {}, + "folder_name": folder_name, + "end_user_id": end_user_id + }, + headers=headers, + ) + + assert response.status_code == 200 + data = response.json() + assert "external_id" in data + assert data["content_type"] == "text/plain" + assert data["system_metadata"]["folder_name"] == folder_name + assert data["system_metadata"]["end_user_id"] == end_user_id + + for key, value in (metadata or {}).items(): + assert data["metadata"][key] == value + + return data["external_id"] + + @pytest.mark.asyncio async def test_ingest_pdf(client: AsyncClient): """Test ingesting a pdf""" @@ -1507,6 +1542,196 @@ async def test_query_with_graph(client: AsyncClient): assert response_no_graph.status_code == 200 +@pytest.mark.asyncio +async def test_graph_with_folder_and_user_scope(client: AsyncClient): + """Test knowledge graph with folder and user scoping.""" + headers = create_auth_header() + + # Test folder + folder_name = "test_graph_folder" + + # Test user + user_id = "graph_test_user@example.com" + + # Ingest documents into folder with user scope using our helper function + doc_id1 = await test_ingest_text_document_folder_user( + client, + content="Tesla is an electric vehicle manufacturer. Elon Musk is the CEO of Tesla.", + metadata={"graph_scope_test": True}, + folder_name=folder_name, + end_user_id=user_id + ) + + doc_id2 = await test_ingest_text_document_folder_user( + client, + content="SpaceX develops spacecraft and rockets. 
Elon Musk is also the CEO of SpaceX.", + metadata={"graph_scope_test": True}, + folder_name=folder_name, + end_user_id=user_id + ) + + # Also ingest a document outside the folder/user scope + _ = await test_ingest_text_document_with_metadata( + client, + content="Elon Musk also founded Neuralink, a neurotechnology company.", + metadata={"graph_scope_test": True} + ) + + # Create a graph with folder and user scope + graph_name = "test_scoped_graph" + response = await client.post( + "/graph/create", + json={ + "name": graph_name, + "folder_name": folder_name, + "end_user_id": user_id + }, + headers=headers, + ) + + assert response.status_code == 200 + graph = response.json() + + # Verify graph was created with proper scoping + assert graph["name"] == graph_name + assert len(graph["document_ids"]) == 2 + assert all(doc_id in graph["document_ids"] for doc_id in [doc_id1, doc_id2]) + + # Verify we have the expected entities + entity_labels = [entity["label"].lower() for entity in graph["entities"]] + assert any("tesla" in label for label in entity_labels) + assert any("spacex" in label for label in entity_labels) + assert any("elon musk" in label for label in entity_labels) + + # First, let's check the retrieved chunks directly to verify scope is working + retrieve_response = await client.post( + "/retrieve/chunks", + json={ + "query": "What companies does Elon Musk lead?", + "folder_name": folder_name, + "end_user_id": user_id + }, + headers=headers, + ) + + assert retrieve_response.status_code == 200 + retrieved_chunks = retrieve_response.json() + + # Verify that none of the retrieved chunks contain "Neuralink" + for chunk in retrieved_chunks: + assert "neuralink" not in chunk["content"].lower() + + # First try querying without a graph to see if RAG works with just folder/user scope + response_no_graph = await client.post( + "/query", + json={ + "query": "What companies does Elon Musk lead?", + "folder_name": folder_name, + "end_user_id": user_id + }, + headers=headers, + ) + + assert response_no_graph.status_code == 200 + result_no_graph = response_no_graph.json() + + # Verify the completion has the expected content + completion_no_graph = result_no_graph["completion"].lower() + print("Completion without graph:") + print(completion_no_graph) + assert "tesla" in completion_no_graph + assert "spacex" in completion_no_graph + assert "neuralink" not in completion_no_graph + + # Now test querying with graph and folder/user scope + response = await client.post( + "/query", + json={ + "query": "What companies does Elon Musk lead?", + "graph_name": graph_name, + "folder_name": folder_name, + "end_user_id": user_id + }, + headers=headers, + ) + + assert response.status_code == 200 + result = response.json() + + # Log source chunks and graph information used for completion + print("\nSource chunks for graph-based completion:") + for source in result["sources"]: + print(f"Document ID: {source['document_id']}, Chunk: {source['chunk_number']}") + + # Check if there's graph metadata in the response + if result.get("metadata") and "graph" in result.get("metadata", {}): + print("\nGraph metadata used:") + print(result["metadata"]["graph"]) + + # Verify the completion has the expected content + completion = result["completion"].lower() + print("\nCompletion with graph:") + print(completion) + assert "tesla" in completion + assert "spacex" in completion + + # Verify Neuralink isn't included (it was outside folder/user scope) + assert "neuralink" not in completion + + # Test updating the graph with folder and 
user scope + doc_id3 = await test_ingest_text_document_folder_user( + client, + content="The Boring Company was founded by Elon Musk in 2016.", + metadata={"graph_scope_test": True}, + folder_name=folder_name, + end_user_id=user_id + ) + + # Update the graph + update_response = await client.post( + f"/graph/{graph_name}/update", + json={ + "additional_documents": [doc_id3], + "folder_name": folder_name, + "end_user_id": user_id + }, + headers=headers, + ) + + assert update_response.status_code == 200 + updated_graph = update_response.json() + + # Verify graph was updated + assert updated_graph["name"] == graph_name + assert len(updated_graph["document_ids"]) == 3 + assert all(doc_id in updated_graph["document_ids"] for doc_id in [doc_id1, doc_id2, doc_id3]) + + # Verify new entity was added + updated_entity_labels = [entity["label"].lower() for entity in updated_graph["entities"]] + assert any("boring company" in label for label in updated_entity_labels) + + # Test querying with updated graph + response = await client.post( + "/query", + json={ + "query": "List all companies founded or led by Elon Musk", + "graph_name": graph_name, + "folder_name": folder_name, + "end_user_id": user_id + }, + headers=headers, + ) + + assert response.status_code == 200 + updated_result = response.json() + + # Verify the completion includes the new company + updated_completion = updated_result["completion"].lower() + assert "tesla" in updated_completion + assert "spacex" in updated_completion + assert "boring company" in updated_completion + + @pytest.mark.asyncio async def test_batch_ingest_with_shared_metadata( client: AsyncClient @@ -1804,6 +2029,429 @@ async def test_batch_ingest_sequential_vs_parallel( assert len(result["errors"]) == 0 +@pytest.mark.asyncio +async def test_folder_scoping(client: AsyncClient): + """Test document operations with folder scoping.""" + headers = create_auth_header() + + # Test folder 1 + folder1_name = "test_folder_1" + folder1_content = "This is content in test folder 1." + + # Test folder 2 + folder2_name = "test_folder_2" + folder2_content = "This is different content in test folder 2." 
+ + # Ingest document into folder 1 using our helper function + doc1_id = await test_ingest_text_document_folder_user( + client, + content=folder1_content, + metadata={"folder_test": True}, + folder_name=folder1_name, + end_user_id=None + ) + + # Get the document to verify + response = await client.get(f"/documents/{doc1_id}", headers=headers) + assert response.status_code == 200 + doc1 = response.json() + assert doc1["system_metadata"]["folder_name"] == folder1_name + + # Ingest document into folder 2 using our helper function + doc2_id = await test_ingest_text_document_folder_user( + client, + content=folder2_content, + metadata={"folder_test": True}, + folder_name=folder2_name, + end_user_id=None + ) + + # Get the document to verify + response = await client.get(f"/documents/{doc2_id}", headers=headers) + assert response.status_code == 200 + doc2 = response.json() + assert doc2["system_metadata"]["folder_name"] == folder2_name + + # Verify we can get documents by folder + response = await client.post( + "/documents", + json={"folder_test": True}, + headers=headers, + params={"folder_name": folder1_name} + ) + + assert response.status_code == 200 + folder1_docs = response.json() + assert len(folder1_docs) == 1 + assert folder1_docs[0]["external_id"] == doc1_id + + # Verify other folder's document isn't in results + assert not any(doc["external_id"] == doc2_id for doc in folder1_docs) + + # Test querying with folder scope + response = await client.post( + "/query", + json={ + "query": "What folder is this content in?", + "folder_name": folder1_name + }, + headers=headers, + ) + + assert response.status_code == 200 + result = response.json() + assert "completion" in result + + # Test folder-specific chunk retrieval + response = await client.post( + "/retrieve/chunks", + json={ + "query": "folder content", + "folder_name": folder2_name + }, + headers=headers, + ) + + assert response.status_code == 200 + chunks = response.json() + assert len(chunks) > 0 + assert folder2_content in chunks[0]["content"] + + # Test document update with folder preservation + updated_content = "This is updated content in test folder 1." + response = await client.post( + f"/documents/{doc1_id}/update_text", + json={ + "content": updated_content, + "metadata": {"updated": True}, + "folder_name": folder1_name # This should match original folder + }, + headers=headers, + ) + + assert response.status_code == 200 + updated_doc = response.json() + assert updated_doc["system_metadata"]["folder_name"] == folder1_name + + +@pytest.mark.asyncio +async def test_user_scoping(client: AsyncClient): + """Test document operations with end-user scoping.""" + headers = create_auth_header() + + # Test user 1 + user1_id = "test_user_1@example.com" + user1_content = "This is content created by test user 1." + + # Test user 2 + user2_id = "test_user_2@example.com" + user2_content = "This is different content created by test user 2." 
+ + # Ingest document for user 1 using our helper function + doc1_id = await test_ingest_text_document_folder_user( + client, + content=user1_content, + metadata={"user_test": True}, + folder_name=None, + end_user_id=user1_id + ) + + # Get the document to verify + response = await client.get(f"/documents/{doc1_id}", headers=headers) + assert response.status_code == 200 + doc1 = response.json() + assert doc1["system_metadata"]["end_user_id"] == user1_id + + # Ingest document for user 2 using our helper function + doc2_id = await test_ingest_text_document_folder_user( + client, + content=user2_content, + metadata={"user_test": True}, + folder_name=None, + end_user_id=user2_id + ) + + # Get the document to verify + response = await client.get(f"/documents/{doc2_id}", headers=headers) + assert response.status_code == 200 + doc2 = response.json() + assert doc2["system_metadata"]["end_user_id"] == user2_id + + # Verify we can get documents by user + response = await client.post( + "/documents", + json={"user_test": True}, + headers=headers, + params={"end_user_id": user1_id} + ) + + assert response.status_code == 200 + user1_docs = response.json() + assert len(user1_docs) == 1 + assert user1_docs[0]["external_id"] == doc1_id + + # Verify other user's document isn't in results + assert not any(doc["external_id"] == doc2_id for doc in user1_docs) + + # Test querying with user scope + response = await client.post( + "/query", + json={ + "query": "What is my content?", + "end_user_id": user1_id + }, + headers=headers, + ) + + assert response.status_code == 200 + result = response.json() + assert "completion" in result + + # Test updating document with user preservation + updated_content = "This is updated content by test user 1." + response = await client.post( + f"/documents/{doc1_id}/update_text", + json={ + "content": updated_content, + "metadata": {"updated": True}, + "end_user_id": user1_id # Should preserve the user + }, + headers=headers, + ) + + assert response.status_code == 200 + updated_doc = response.json() + assert updated_doc["system_metadata"]["end_user_id"] == user1_id + + +@pytest.mark.asyncio +async def test_combined_folder_and_user_scoping(client: AsyncClient): + """Test document operations with combined folder and user scoping.""" + headers = create_auth_header() + + # Test folder + folder_name = "test_combined_folder" + + # Test users + user1_id = "combined_test_user_1@example.com" + user2_id = "combined_test_user_2@example.com" + + # Ingest document for user 1 in folder using our new helper function + user1_content = "This is content by user 1 in the combined test folder." + doc1_id = await test_ingest_text_document_folder_user( + client, + content=user1_content, + metadata={"combined_test": True}, + folder_name=folder_name, + end_user_id=user1_id + ) + + # Get the document to verify + response = await client.get(f"/documents/{doc1_id}", headers=headers) + assert response.status_code == 200 + doc1 = response.json() + assert doc1["system_metadata"]["folder_name"] == folder_name + assert doc1["system_metadata"]["end_user_id"] == user1_id + + # Ingest document for user 2 in folder using our new helper function + user2_content = "This is content by user 2 in the combined test folder." 
+ doc2_id = await test_ingest_text_document_folder_user( + client, + content=user2_content, + metadata={"combined_test": True}, + folder_name=folder_name, + end_user_id=user2_id + ) + + # Get the document to verify + response = await client.get(f"/documents/{doc2_id}", headers=headers) + assert response.status_code == 200 + doc2 = response.json() + assert doc2["system_metadata"]["folder_name"] == folder_name + assert doc2["system_metadata"]["end_user_id"] == user2_id + + # Get all documents in folder + response = await client.post( + "/documents", + json={"combined_test": True}, + headers=headers, + params={"folder_name": folder_name} + ) + + assert response.status_code == 200 + folder_docs = response.json() + assert len(folder_docs) == 2 + + # Get user 1's documents in the folder + response = await client.post( + "/documents", + json={"combined_test": True}, + headers=headers, + params={"folder_name": folder_name, "end_user_id": user1_id} + ) + + assert response.status_code == 200 + user1_folder_docs = response.json() + assert len(user1_folder_docs) == 1 + assert user1_folder_docs[0]["external_id"] == doc1_id + + # Test querying with combined scope + response = await client.post( + "/query", + json={ + "query": "What is in this folder for this user?", + "folder_name": folder_name, + "end_user_id": user2_id + }, + headers=headers, + ) + + assert response.status_code == 200 + result = response.json() + assert "completion" in result + + # Test retrieving chunks with combined scope + response = await client.post( + "/retrieve/chunks", + json={ + "query": "combined test folder", + "folder_name": folder_name, + "end_user_id": user1_id + }, + headers=headers, + ) + + assert response.status_code == 200 + chunks = response.json() + assert len(chunks) > 0 + # Should only have user 1's content + assert any(user1_content in chunk["content"] for chunk in chunks) + assert not any(user2_content in chunk["content"] for chunk in chunks) + + +@pytest.mark.asyncio +async def test_system_metadata_filter_behavior(client: AsyncClient): + """Test detailed behavior of system_metadata filtering.""" + headers = create_auth_header() + + # Create documents with different system metadata combinations + + # Document with folder only + folder_only_content = "This document has only folder in system metadata." + folder_only_id = await test_ingest_text_document_folder_user( + client, + content=folder_only_content, + metadata={"filter_test": True}, + folder_name="test_filter_folder", + end_user_id=None # Only folder, no user + ) + + # Get the document to verify + response = await client.get(f"/documents/{folder_only_id}", headers=headers) + assert response.status_code == 200 + folder_only_doc = response.json() + + # Document with user only + user_only_content = "This document has only user in system metadata." + user_only_id = await test_ingest_text_document_folder_user( + client, + content=user_only_content, + metadata={"filter_test": True}, + folder_name=None, # No folder, only user + end_user_id="test_filter_user@example.com" + ) + + # Get the document to verify + response = await client.get(f"/documents/{user_only_id}", headers=headers) + assert response.status_code == 200 + user_only_doc = response.json() + + # Document with both folder and user + combined_content = "This document has both folder and user in system metadata." 
+ combined_id = await test_ingest_text_document_folder_user( + client, + content=combined_content, + metadata={"filter_test": True}, + folder_name="test_filter_folder", + end_user_id="test_filter_user@example.com" + ) + + # Get the document to verify + response = await client.get(f"/documents/{combined_id}", headers=headers) + assert response.status_code == 200 + combined_doc = response.json() + + # Test queries with different filter combinations + + # Filter by folder only + response = await client.post( + "/documents", + json={"filter_test": True}, + headers=headers, + params={"folder_name": "test_filter_folder"} + ) + + assert response.status_code == 200 + folder_filtered_docs = response.json() + folder_doc_ids = [doc["external_id"] for doc in folder_filtered_docs] + assert folder_only_id in folder_doc_ids + assert combined_id in folder_doc_ids + assert user_only_id not in folder_doc_ids + + # Filter by user only + response = await client.post( + "/documents", + json={"filter_test": True}, + headers=headers, + params={"end_user_id": "test_filter_user@example.com"} + ) + + assert response.status_code == 200 + user_filtered_docs = response.json() + user_doc_ids = [doc["external_id"] for doc in user_filtered_docs] + assert user_only_id in user_doc_ids + assert combined_id in user_doc_ids + assert folder_only_id not in user_doc_ids + + # Filter by both folder and user + response = await client.post( + "/documents", + json={"filter_test": True}, + headers=headers, + params={ + "folder_name": "test_filter_folder", + "end_user_id": "test_filter_user@example.com" + } + ) + + assert response.status_code == 200 + combined_filtered_docs = response.json() + combined_doc_ids = [doc["external_id"] for doc in combined_filtered_docs] + assert len(combined_filtered_docs) == 1 + assert combined_id in combined_doc_ids + assert folder_only_id not in combined_doc_ids + assert user_only_id not in combined_doc_ids + + # Test with chunk retrieval + response = await client.post( + "/retrieve/chunks", + json={ + "query": "system metadata", + "folder_name": "test_filter_folder", + "end_user_id": "test_filter_user@example.com" + }, + headers=headers, + ) + + assert response.status_code == 200 + chunks = response.json() + assert len(chunks) > 0 + # Should only have the combined document content + assert any(combined_content in chunk["content"] for chunk in chunks) + assert not any(folder_only_content in chunk["content"] for chunk in chunks) + assert not any(user_only_content in chunk["content"] for chunk in chunks) + + @pytest.mark.asyncio async def test_delete_document(client: AsyncClient): """Test deleting a document and verifying it's gone.""" diff --git a/core/vector_store/mongo_vector_store.py b/core/vector_store/mongo_vector_store.py deleted file mode 100644 index 5873a58..0000000 --- a/core/vector_store/mongo_vector_store.py +++ /dev/null @@ -1,183 +0,0 @@ -from typing import List, Optional, Tuple -import logging -from motor.motor_asyncio import AsyncIOMotorClient -from pymongo.errors import PyMongoError - -from .base_vector_store import BaseVectorStore -from core.models.chunk import DocumentChunk - -logger = logging.getLogger(__name__) - - -class MongoDBAtlasVectorStore(BaseVectorStore): - """MongoDB Atlas Vector Search implementation.""" - - def __init__( - self, - uri: str, - database_name: str, - collection_name: str = "document_chunks", - index_name: str = "vector_index", - ): - """Initialize MongoDB connection for vector storage.""" - self.client = AsyncIOMotorClient(uri) - self.db = 
self.client[database_name] - self.collection = self.db[collection_name] - self.index_name = index_name - - async def initialize(self): - """Initialize vector search index if needed.""" - try: - # Create basic indexes - await self.collection.create_index("document_id") - await self.collection.create_index("chunk_number") - - # Note: Vector search index must be created via Atlas UI or API - # as it requires specific configuration - - logger.info("MongoDB vector store indexes initialized") - return True - except PyMongoError as e: - logger.error(f"Error initializing vector store indexes: {str(e)}") - return False - - async def store_embeddings(self, chunks: List[DocumentChunk]) -> Tuple[bool, List[str]]: - """Store document chunks with their embeddings.""" - try: - if not chunks: - return True, [] - - # Convert chunks to dicts - documents = [] - for chunk in chunks: - doc = chunk.model_dump() - # Ensure we have required fields - if not doc.get("embedding"): - logger.error( - f"Missing embedding for chunk " f"{chunk.document_id}-{chunk.chunk_number}" - ) - continue - documents.append(doc) - - if documents: - # Use ordered=False to continue even if some inserts fail - result = await self.collection.insert_many(documents, ordered=False) - return len(result.inserted_ids) > 0, [str(id) for id in result.inserted_ids] - else: - logger.error(f"No documents to store - here is the input: {chunks}") - return False, [] - - except PyMongoError as e: - logger.error(f"Error storing embeddings: {str(e)}") - return False, [] - - async def query_similar( - self, - query_embedding: List[float], - k: int, - doc_ids: Optional[List[str]] = None, - ) -> List[DocumentChunk]: - """Find similar chunks using MongoDB Atlas Vector Search.""" - try: - logger.debug( - f"Searching in database {self.db.name} " f"collection {self.collection.name}" - ) - logger.debug(f"Query vector looks like: {query_embedding}") - logger.debug(f"Doc IDs: {doc_ids}") - logger.debug(f"K is: {k}") - logger.debug(f"Index is: {self.index_name}") - - # Vector search pipeline - pipeline = [ - { - "$vectorSearch": { - "index": self.index_name, - "path": "embedding", - "queryVector": query_embedding, - "numCandidates": k * 40, # Get more candidates - "limit": k, - "filter": {"document_id": {"$in": doc_ids}} if doc_ids else {}, - } - }, - { - "$project": { - "score": {"$meta": "vectorSearchScore"}, - "document_id": 1, - "chunk_number": 1, - "content": 1, - "metadata": 1, - "_id": 0, - } - }, - ] - - # Execute search - cursor = self.collection.aggregate(pipeline) - chunks = [] - - async for result in cursor: - chunk = DocumentChunk( - document_id=result["document_id"], - chunk_number=result["chunk_number"], - content=result["content"], - embedding=[], # Don't send embeddings back - metadata=result.get("metadata", {}), - score=result.get("score", 0.0), - ) - chunks.append(chunk) - - return chunks - - except PyMongoError as e: - logger.error(f"MongoDB error: {e._message}") - logger.error(f"Error querying similar chunks: {str(e)}") - raise e - - async def get_chunks_by_id( - self, - chunk_identifiers: List[Tuple[str, int]], - ) -> List[DocumentChunk]: - """ - Retrieve specific chunks by document ID and chunk number in a single database query. 
- - Args: - chunk_identifiers: List of (document_id, chunk_number) tuples - - Returns: - List of DocumentChunk objects - """ - try: - if not chunk_identifiers: - return [] - - # Create a query with $or to find multiple chunks in a single query - query = {"$or": []} - for doc_id, chunk_num in chunk_identifiers: - query["$or"].append({ - "document_id": doc_id, - "chunk_number": chunk_num - }) - - logger.info(f"Batch retrieving {len(chunk_identifiers)} chunks with a single query") - - # Find all matching chunks in a single database query - cursor = self.collection.find(query) - chunks = [] - - async for result in cursor: - chunk = DocumentChunk( - document_id=result["document_id"], - chunk_number=result["chunk_number"], - content=result["content"], - embedding=[], # Don't send embeddings back - metadata=result.get("metadata", {}), - score=0.0, # No relevance score for direct retrieval - ) - chunks.append(chunk) - - logger.info(f"Found {len(chunks)} chunks in batch retrieval") - return chunks - - except PyMongoError as e: - logger.error(f"Error retrieving chunks by ID: {str(e)}") - return [] diff --git a/examples/multi_app_user_scoping.py b/examples/multi_app_user_scoping.py new file mode 100644 index 0000000..35425a0 --- /dev/null +++ b/examples/multi_app_user_scoping.py @@ -0,0 +1,82 @@ +import os +from dotenv import load_dotenv +from morphik import Morphik + +# Load environment variables +load_dotenv() + +# Connect to Morphik +db = Morphik(os.getenv("MORPHIK_URI"), timeout=10000, is_local=True) + +print("========== Customer Support Example ==========") +# Create a folder for application data +app_folder = db.create_folder("customer-support") +print(f"Created folder: {app_folder.name}") + +# Ingest documents into the folder +folder_doc = app_folder.ingest_text( + "Customer reported an issue with login functionality. Steps to reproduce: " + "1. Go to login page, 2. Enter credentials, 3. Click login button.", + filename="ticket-001.txt", + metadata={"category": "bug", "priority": "high", "status": "open"} +) +print(f"Ingested document into folder: {folder_doc.external_id}") + +# Perform a query in the folder context +folder_response = app_folder.query( + "What issues have been reported?", + k=2 +) +print("\nFolder Query Results:") +print(folder_response.completion) + +# Get statistics for the folder +folder_docs = app_folder.list_documents() +print(f"\nFolder Statistics: {len(folder_docs)} documents in '{app_folder.name}'") + +print("\n========== User Scoping Example ==========") +# Create a user scope +user_email = "support@example.com" +user = db.signin(user_email) +print(f"Created user scope for: {user.end_user_id}") + +# Ingest a document as this user +user_doc = user.ingest_text( + "User requested information about premium features. 
They are interested in the collaboration tools.", + filename="inquiry-001.txt", + metadata={"category": "inquiry", "priority": "medium", "status": "open"} +) +print(f"Ingested document as user: {user_doc.external_id}") + +# Query as this user +user_response = user.query( + "What customer inquiries do we have?", + k=2 +) +print("\nUser Query Results:") +print(user_response.completion) + +# Get documents for this user +user_docs = user.list_documents() +print(f"\nUser Statistics: {len(user_docs)} documents for user '{user.end_user_id}'") + +print("\n========== Combined Folder and User Scoping ==========") +# Create a user scoped to a specific folder +folder_user = app_folder.signin(user_email) +print(f"Created user scope for {folder_user.end_user_id} in folder {folder_user.folder_name}") + +# Ingest a document as this user in the folder context +folder_user_doc = folder_user.ingest_text( + "Customer called to follow up on ticket-001. They are still experiencing the login issue on Chrome.", + filename="ticket-002.txt", + metadata={"category": "follow-up", "priority": "high", "status": "open"} +) +print(f"Ingested document as user in folder: {folder_user_doc.external_id}") + +# Query as this user in the folder context +folder_user_response = folder_user.query( + "What high priority issues require attention?", + k=2 +) +print("\nFolder User Query Results:") +print(folder_user_response.completion) diff --git a/quick_setup.py b/quick_setup.py index bb9afb6..e42d488 100644 --- a/quick_setup.py +++ b/quick_setup.py @@ -9,15 +9,12 @@ import boto3 import botocore import tomli # for reading toml files from dotenv import find_dotenv, load_dotenv -from pymongo import MongoClient -from pymongo.errors import ConnectionFailure, OperationFailure -from pymongo.operations import SearchIndexModel # Force reload of environment variables load_dotenv(find_dotenv(), override=True) # Set up argument parser -parser = argparse.ArgumentParser(description="Setup S3 bucket and MongoDB collections") +parser = argparse.ArgumentParser(description="Setup S3 bucket") parser.add_argument("--debug", action="store_true", help="Enable debug logging") parser.add_argument("--quiet", action="store_true", help="Only show warning and error logs") args = parser.parse_args() @@ -48,16 +45,6 @@ with open(config_path, "rb") as f: STORAGE_PROVIDER = CONFIG["storage"]["provider"] DATABASE_PROVIDER = CONFIG["database"]["provider"] -# MongoDB specific config -if "mongodb" in CONFIG["database"]: - DATABASE_NAME = CONFIG["database"]["mongodb"]["database_name"] - DOCUMENTS_COLLECTION = "documents" - CHUNKS_COLLECTION = "document_chunks" - if "mongodb" in CONFIG["vector_store"]: - VECTOR_DIMENSIONS = CONFIG["embedding"]["dimensions"] - VECTOR_INDEX_NAME = "vector_index" - SIMILARITY_METRIC = CONFIG["embedding"]["similarity_metric"] - # Extract storage-specific configuration if STORAGE_PROVIDER == "aws-s3": DEFAULT_REGION = CONFIG["storage"]["region"] @@ -117,69 +104,6 @@ def bucket_exists(s3_client, bucket_name): # raise e -def setup_mongodb(): - """ - Set up MongoDB database, documents collection, and vector index on documents_chunk collection. 
- """ - # Load MongoDB URI from .env file - mongo_uri = os.getenv("MONGODB_URI") - if not mongo_uri: - raise ValueError("MONGODB_URI not found in .env file.") - - try: - # Connect to MongoDB - client = MongoClient(mongo_uri) - client.admin.command("ping") # Check connection - LOGGER.info("Connected to MongoDB successfully.") - - # Create or access the database - db = client[DATABASE_NAME] - LOGGER.info(f"Database '{DATABASE_NAME}' ready.") - - # Create 'documents' collection - if DOCUMENTS_COLLECTION not in db.list_collection_names(): - db.create_collection(DOCUMENTS_COLLECTION) - LOGGER.info(f"Collection '{DOCUMENTS_COLLECTION}' created.") - else: - LOGGER.info(f"Collection '{DOCUMENTS_COLLECTION}' already exists.") - - # Create 'documents_chunk' collection with vector index - if CHUNKS_COLLECTION not in db.list_collection_names(): - db.create_collection(CHUNKS_COLLECTION) - LOGGER.info(f"Collection '{CHUNKS_COLLECTION}' created.") - else: - LOGGER.info(f"Collection '{CHUNKS_COLLECTION}' already exists.") - - vector_index_definition = { - "fields": [ - { - "numDimensions": VECTOR_DIMENSIONS, - "path": "embedding", - "similarity": SIMILARITY_METRIC, - "type": "vector", - }, - {"path": "document_id", "type": "filter"}, - ] - } - vector_index = SearchIndexModel( - name=VECTOR_INDEX_NAME, - definition=vector_index_definition, - type="vectorSearch", - ) - db[CHUNKS_COLLECTION].create_search_index(model=vector_index) - LOGGER.info("Vector index 'vector_index' created on 'documents_chunk' collection.") - - except ConnectionFailure: - LOGGER.error("Failed to connect to MongoDB. Check your MongoDB URI and network connection.") - except OperationFailure as e: - LOGGER.error(f"MongoDB operation failed: {e}") - except Exception as e: - LOGGER.error(f"Unexpected error: {e}") - finally: - client.close() - LOGGER.info("MongoDB connection closed.") - - def setup(): # Setup S3 if configured if STORAGE_PROVIDER == "aws-s3": @@ -188,16 +112,11 @@ def setup(): LOGGER.info("S3 bucket setup completed.") # Setup database based on provider - match DATABASE_PROVIDER: - case "mongodb": - LOGGER.info("Setting up MongoDB...") - setup_mongodb() - LOGGER.info("MongoDB setup completed.") - case "postgres": - LOGGER.info("Postgres is setup on database intialization - nothing to do here!") - case _: - LOGGER.error(f"Unsupported database provider: {DATABASE_PROVIDER}") - raise ValueError(f"Unsupported database provider: {DATABASE_PROVIDER}") + if DATABASE_PROVIDER != "postgres": + LOGGER.error(f"Unsupported database provider: {DATABASE_PROVIDER}") + raise ValueError(f"Unsupported database provider: {DATABASE_PROVIDER}") + + LOGGER.info("Postgres is setup on database initialization - nothing to do here!") LOGGER.info("Setup completed successfully. 
Feel free to start the server now!") diff --git a/requirements.txt b/requirements.txt index 1721b48..7800ea7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -151,7 +151,6 @@ matplotlib-inline==0.1.7 mdurl==0.1.2 monotonic==1.6 more-itertools==10.5.0 -motor==3.4.0 mpmath==1.3.0 multidict==6.0.5 multiprocess==0.70.16 @@ -237,7 +236,6 @@ pydeck==0.9.1 pyee==12.1.1 Pygments==2.18.0 PyJWT==2.9.0 -pymongo==4.7.1 pypandoc==1.13 pyparsing==3.1.2 pypdf==4.3.1 diff --git a/sanity_checks/mongo.py b/sanity_checks/mongo.py deleted file mode 100644 index 45ab7d9..0000000 --- a/sanity_checks/mongo.py +++ /dev/null @@ -1,72 +0,0 @@ -from pymongo import MongoClient -from dotenv import load_dotenv -import os -import datetime - - -def test_mongo_operations(): - # Load environment variables - load_dotenv() - - # Get MongoDB URI from environment variable - mongo_uri = os.getenv("MONGODB_URI") - if not mongo_uri: - raise ValueError("MONGODB_URI environment variable not set") - - try: - # Connect to MongoDB - client = MongoClient(mongo_uri) - - # Test connection - client.admin.command("ping") - print("✅ Connected successfully to MongoDB") - - # Get database and collection - db = client.brandsyncaidb # Using a test database - collection = db.kb_chunked_embeddings - - # Insert a single document - test_doc = { - "name": "Test Document", - "timestamp": datetime.datetime.now(), - "value": 42, - } - - result = collection.insert_one(test_doc) - print(f"✅ Inserted document with ID: {result.inserted_id}") - - # Insert multiple documents - test_docs = [ - {"name": "Doc 1", "value": 1}, - {"name": "Doc 2", "value": 2}, - {"name": "Doc 3", "value": 3}, - ] - - result = collection.insert_many(test_docs) - print(f"✅ Inserted {len(result.inserted_ids)} documents") - - # Retrieve documents - print("\nRetrieving documents:") - for doc in collection.find(): - print(f"Found document: {doc}") - - # Find specific documents - print("\nFinding documents with value >= 2:") - query = {"value": {"$gte": 2}} - for doc in collection.find(query): - print(f"Found document: {doc}") - - # Clean up - delete all test documents - # DON'T DELETE IF It'S BRANDSYNCAI - # result = collection.delete_many({}) - print(f"\n✅ Cleaned up {result.deleted_count} test documents") - - except Exception as e: - print(f"❌ Error: {str(e)}") - finally: - client.close() - print("\n✅ Connection closed") - - -if __name__ == "__main__": - test_mongo_operations() diff --git a/sdks/python/morphik/__init__.py b/sdks/python/morphik/__init__.py index 2cd6621..eeb0cd6 100644 --- a/sdks/python/morphik/__init__.py +++ b/sdks/python/morphik/__init__.py @@ -12,4 +12,4 @@ __all__ = [ "Document", ] -__version__ = "0.1.0" +__version__ = "0.1.2" diff --git a/sdks/python/morphik/_internal.py b/sdks/python/morphik/_internal.py new file mode 100644 index 0000000..3d8637a --- /dev/null +++ b/sdks/python/morphik/_internal.py @@ -0,0 +1,507 @@ +import base64 +import io +import json +from io import BytesIO, IOBase +from PIL import Image +from PIL.Image import Image as PILImage +from pathlib import Path +from typing import Dict, Any, List, Optional, Union, Tuple, BinaryIO +from urllib.parse import urlparse + +import jwt +from pydantic import BaseModel, Field + +from .models import ( + Document, + ChunkResult, + DocumentResult, + CompletionResponse, + IngestTextRequest, + ChunkSource, + Graph, + # Prompt override models + GraphPromptOverrides, +) +from .rules import Rule + +# Type alias for rules +RuleOrDict = Union[Rule, Dict[str, Any]] + + +class FinalChunkResult(BaseModel): + 
content: str | PILImage = Field(..., description="Chunk content") + score: float = Field(..., description="Relevance score") + document_id: str = Field(..., description="Parent document ID") + chunk_number: int = Field(..., description="Chunk sequence number") + metadata: Dict[str, Any] = Field(default_factory=dict, description="Document metadata") + content_type: str = Field(..., description="Content type") + filename: Optional[str] = Field(None, description="Original filename") + download_url: Optional[str] = Field(None, description="URL to download full document") + + class Config: + arbitrary_types_allowed = True + + +class _MorphikClientLogic: + """ + Internal shared logic for Morphik clients. + + This class contains the shared logic between synchronous and asynchronous clients. + It handles URL generation, request preparation, and response parsing. + """ + + def __init__(self, uri: Optional[str] = None, timeout: int = 30, is_local: bool = False): + """Initialize shared client logic""" + self._timeout = timeout + self._is_local = is_local + + if uri: + self._setup_auth(uri) + else: + self._base_url = "http://localhost:8000" + self._auth_token = None + + def _setup_auth(self, uri: str) -> None: + """Setup authentication from URI""" + parsed = urlparse(uri) + if not parsed.netloc: + raise ValueError("Invalid URI format") + + # Split host and auth parts + auth, host = parsed.netloc.split("@") + _, self._auth_token = auth.split(":") + + # Set base URL + self._base_url = f"{'http' if self._is_local else 'https'}://{host}" + + # Basic token validation + jwt.decode(self._auth_token, options={"verify_signature": False}) + + def _convert_rule(self, rule: RuleOrDict) -> Dict[str, Any]: + """Convert a rule to a dictionary format""" + if hasattr(rule, "to_dict"): + return rule.to_dict() + return rule + + def _get_url(self, endpoint: str) -> str: + """Get the full URL for an API endpoint""" + return f"{self._base_url}/{endpoint.lstrip('/')}" + + def _get_headers(self) -> Dict[str, str]: + """Get base headers for API requests""" + headers = {"Content-Type": "application/json"} + return headers + + # Request preparation methods + + def _prepare_ingest_text_request( + self, + content: str, + filename: Optional[str], + metadata: Optional[Dict[str, Any]], + rules: Optional[List[RuleOrDict]], + use_colpali: bool, + folder_name: Optional[str], + end_user_id: Optional[str], + ) -> Dict[str, Any]: + """Prepare request for ingest_text endpoint""" + rules_dict = [self._convert_rule(r) for r in (rules or [])] + payload = { + "content": content, + "filename": filename, + "metadata": metadata or {}, + "rules": rules_dict, + "use_colpali": use_colpali, + } + if folder_name: + payload["folder_name"] = folder_name + if end_user_id: + payload["end_user_id"] = end_user_id + return payload + + def _prepare_file_for_upload( + self, + file: Union[str, bytes, BinaryIO, Path], + filename: Optional[str] = None, + ) -> Tuple[BinaryIO, str]: + """ + Process file input and return file object and filename. + Handles different file input types (str, Path, bytes, file-like object). 
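+        Paths are read fully into memory and wrapped in a BytesIO, while bytes
+        and file-like objects require an explicit filename (a ValueError is
+        raised otherwise).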
+ """ + if isinstance(file, (str, Path)): + file_path = Path(file) + if not file_path.exists(): + raise ValueError(f"File not found: {file}") + filename = file_path.name if filename is None else filename + with open(file_path, "rb") as f: + content = f.read() + file_obj = BytesIO(content) + elif isinstance(file, bytes): + if filename is None: + raise ValueError("filename is required when ingesting bytes") + file_obj = BytesIO(file) + else: + if filename is None: + raise ValueError("filename is required when ingesting file object") + file_obj = file + + return file_obj, filename + + def _prepare_files_for_upload( + self, + files: List[Union[str, bytes, BinaryIO, Path]], + ) -> List[Tuple[str, Tuple[str, BinaryIO]]]: + """ + Process multiple files and return a list of file objects in the format + expected by the API: [("files", (filename, file_obj)), ...] + """ + file_objects = [] + for file in files: + if isinstance(file, (str, Path)): + path = Path(file) + file_objects.append(("files", (path.name, open(path, "rb")))) + elif isinstance(file, bytes): + file_objects.append(("files", ("file.bin", BytesIO(file)))) + else: + file_objects.append(("files", (getattr(file, "name", "file.bin"), file))) + + return file_objects + + def _prepare_ingest_file_form_data( + self, + metadata: Optional[Dict[str, Any]], + rules: Optional[List[RuleOrDict]], + folder_name: Optional[str], + end_user_id: Optional[str], + ) -> Dict[str, Any]: + """Prepare form data for ingest_file endpoint""" + form_data = { + "metadata": json.dumps(metadata or {}), + "rules": json.dumps([self._convert_rule(r) for r in (rules or [])]), + } + if folder_name: + form_data["folder_name"] = folder_name + if end_user_id: + form_data["end_user_id"] = end_user_id + return form_data + + def _prepare_ingest_files_form_data( + self, + metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]], + rules: Optional[List[RuleOrDict]], + use_colpali: bool, + parallel: bool, + folder_name: Optional[str], + end_user_id: Optional[str], + ) -> Dict[str, Any]: + """Prepare form data for ingest_files endpoint""" + # Convert rules appropriately based on whether it's a flat list or list of lists + if rules: + if all(isinstance(r, list) for r in rules): + # List of lists - per-file rules + converted_rules = [ + [self._convert_rule(r) for r in rule_list] for rule_list in rules + ] + else: + # Flat list - shared rules for all files + converted_rules = [self._convert_rule(r) for r in rules] + else: + converted_rules = [] + + data = { + "metadata": json.dumps(metadata or {}), + "rules": json.dumps(converted_rules), + "use_colpali": str(use_colpali).lower() if use_colpali is not None else None, + "parallel": str(parallel).lower(), + } + + if folder_name: + data["folder_name"] = folder_name + if end_user_id: + data["end_user_id"] = end_user_id + + return data + + def _prepare_query_request( + self, + query: str, + filters: Optional[Dict[str, Any]], + k: int, + min_score: float, + max_tokens: Optional[int], + temperature: Optional[float], + use_colpali: bool, + graph_name: Optional[str], + hop_depth: int, + include_paths: bool, + prompt_overrides: Optional[Dict], + folder_name: Optional[str], + end_user_id: Optional[str], + ) -> Dict[str, Any]: + """Prepare request for query endpoint""" + payload = { + "query": query, + "filters": filters, + "k": k, + "min_score": min_score, + "max_tokens": max_tokens, + "temperature": temperature, + "use_colpali": use_colpali, + "graph_name": graph_name, + "hop_depth": hop_depth, + "include_paths": include_paths, + 
"prompt_overrides": prompt_overrides, + } + if folder_name: + payload["folder_name"] = folder_name + if end_user_id: + payload["end_user_id"] = end_user_id + # Filter out None values before sending + return {k_p: v_p for k_p, v_p in payload.items() if v_p is not None} + + def _prepare_retrieve_chunks_request( + self, + query: str, + filters: Optional[Dict[str, Any]], + k: int, + min_score: float, + use_colpali: bool, + folder_name: Optional[str], + end_user_id: Optional[str], + ) -> Dict[str, Any]: + """Prepare request for retrieve_chunks endpoint""" + request = { + "query": query, + "filters": filters, + "k": k, + "min_score": min_score, + "use_colpali": use_colpali, + } + if folder_name: + request["folder_name"] = folder_name + if end_user_id: + request["end_user_id"] = end_user_id + return request + + def _prepare_retrieve_docs_request( + self, + query: str, + filters: Optional[Dict[str, Any]], + k: int, + min_score: float, + use_colpali: bool, + folder_name: Optional[str], + end_user_id: Optional[str], + ) -> Dict[str, Any]: + """Prepare request for retrieve_docs endpoint""" + request = { + "query": query, + "filters": filters, + "k": k, + "min_score": min_score, + "use_colpali": use_colpali, + } + if folder_name: + request["folder_name"] = folder_name + if end_user_id: + request["end_user_id"] = end_user_id + return request + + def _prepare_list_documents_request( + self, + skip: int, + limit: int, + filters: Optional[Dict[str, Any]], + folder_name: Optional[str], + end_user_id: Optional[str], + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: + """Prepare request for list_documents endpoint""" + params = { + "skip": skip, + "limit": limit, + } + if folder_name: + params["folder_name"] = folder_name + if end_user_id: + params["end_user_id"] = end_user_id + data = filters or {} + return params, data + + def _prepare_batch_get_documents_request( + self, document_ids: List[str], folder_name: Optional[str], end_user_id: Optional[str] + ) -> Dict[str, Any]: + """Prepare request for batch_get_documents endpoint""" + if folder_name or end_user_id: + request = {"document_ids": document_ids} + if folder_name: + request["folder_name"] = folder_name + if end_user_id: + request["end_user_id"] = end_user_id + return request + return document_ids # Return just IDs list if no scoping is needed + + def _prepare_batch_get_chunks_request( + self, + sources: List[Union[ChunkSource, Dict[str, Any]]], + folder_name: Optional[str], + end_user_id: Optional[str], + ) -> Dict[str, Any]: + """Prepare request for batch_get_chunks endpoint""" + source_dicts = [] + for source in sources: + if isinstance(source, dict): + source_dicts.append(source) + else: + source_dicts.append(source.model_dump()) + + if folder_name or end_user_id: + request = {"sources": source_dicts} + if folder_name: + request["folder_name"] = folder_name + if end_user_id: + request["end_user_id"] = end_user_id + return request + return source_dicts # Return just sources list if no scoping is needed + + def _prepare_create_graph_request( + self, + name: str, + filters: Optional[Dict[str, Any]], + documents: Optional[List[str]], + prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]], + folder_name: Optional[str], + end_user_id: Optional[str], + ) -> Dict[str, Any]: + """Prepare request for create_graph endpoint""" + # Convert prompt_overrides to dict if it's a model + if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides): + prompt_overrides = prompt_overrides.model_dump(exclude_none=True) + + request = { + 
"name": name, + "filters": filters, + "documents": documents, + "prompt_overrides": prompt_overrides, + } + if folder_name: + request["folder_name"] = folder_name + if end_user_id: + request["end_user_id"] = end_user_id + return request + + def _prepare_update_graph_request( + self, + name: str, + additional_filters: Optional[Dict[str, Any]], + additional_documents: Optional[List[str]], + prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]], + folder_name: Optional[str], + end_user_id: Optional[str], + ) -> Dict[str, Any]: + """Prepare request for update_graph endpoint""" + # Convert prompt_overrides to dict if it's a model + if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides): + prompt_overrides = prompt_overrides.model_dump(exclude_none=True) + + request = { + "additional_filters": additional_filters, + "additional_documents": additional_documents, + "prompt_overrides": prompt_overrides, + } + if folder_name: + request["folder_name"] = folder_name + if end_user_id: + request["end_user_id"] = end_user_id + return request + + def _prepare_update_document_with_text_request( + self, + document_id: str, + content: str, + filename: Optional[str], + metadata: Optional[Dict[str, Any]], + rules: Optional[List], + update_strategy: str, + use_colpali: Optional[bool], + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: + """Prepare request for update_document_with_text endpoint""" + request = IngestTextRequest( + content=content, + filename=filename, + metadata=metadata or {}, + rules=[self._convert_rule(r) for r in (rules or [])], + use_colpali=use_colpali if use_colpali is not None else True, + ) + + params = {} + if update_strategy != "add": + params["update_strategy"] = update_strategy + + return params, request.model_dump() + + # Response parsing methods + + def _parse_document_response(self, response_json: Dict[str, Any]) -> Document: + """Parse document response""" + return Document(**response_json) + + def _parse_completion_response(self, response_json: Dict[str, Any]) -> CompletionResponse: + """Parse completion response""" + return CompletionResponse(**response_json) + + def _parse_document_list_response(self, response_json: List[Dict[str, Any]]) -> List[Document]: + """Parse document list response""" + docs = [Document(**doc) for doc in response_json] + return docs + + def _parse_document_result_list_response( + self, response_json: List[Dict[str, Any]] + ) -> List[DocumentResult]: + """Parse document result list response""" + return [DocumentResult(**r) for r in response_json] + + def _parse_chunk_result_list_response( + self, response_json: List[Dict[str, Any]] + ) -> List[FinalChunkResult]: + """Parse chunk result list response""" + chunks = [ChunkResult(**r) for r in response_json] + + final_chunks = [] + for chunk in chunks: + content = chunk.content + if chunk.metadata.get("is_image"): + try: + # Handle data URI format "data:image/png;base64,..." 
+ if content.startswith("data:"): + # Extract the base64 part after the comma + content = content.split(",", 1)[1] + + # Now decode the base64 string + image_bytes = base64.b64decode(content) + content = Image.open(io.BytesIO(image_bytes)) + except Exception: + # Fall back to using the content as text + content = chunk.content + + final_chunks.append( + FinalChunkResult( + content=content, + score=chunk.score, + document_id=chunk.document_id, + chunk_number=chunk.chunk_number, + metadata=chunk.metadata, + content_type=chunk.content_type, + filename=chunk.filename, + download_url=chunk.download_url, + ) + ) + + return final_chunks + + def _parse_graph_response(self, response_json: Dict[str, Any]) -> Graph: + """Parse graph response""" + return Graph(**response_json) + + def _parse_graph_list_response(self, response_json: List[Dict[str, Any]]) -> List[Graph]: + """Parse graph list response""" + return [Graph(**graph) for graph in response_json] diff --git a/sdks/python/morphik/async_.py b/sdks/python/morphik/async_.py index a70d6f8..7b14816 100644 --- a/sdks/python/morphik/async_.py +++ b/sdks/python/morphik/async_.py @@ -1,39 +1,28 @@ -from io import BytesIO, IOBase import json import logging +from io import BytesIO, IOBase from pathlib import Path from typing import Dict, Any, List, Optional, Union, BinaryIO -from urllib.parse import urlparse import httpx -import jwt from PIL.Image import Image as PILImage -from pydantic import BaseModel, Field from .models import ( Document, - ChunkResult, DocumentResult, CompletionResponse, IngestTextRequest, ChunkSource, Graph, # Prompt override models - EntityExtractionExample, - EntityResolutionExample, - EntityExtractionPromptOverride, - EntityResolutionPromptOverride, - QueryPromptOverride, GraphPromptOverrides, - QueryPromptOverrides + QueryPromptOverrides, ) from .rules import Rule +from ._internal import _MorphikClientLogic, FinalChunkResult, RuleOrDict logger = logging.getLogger(__name__) -# Type alias for rules -RuleOrDict = Union[Rule, Dict[str, Any]] - class AsyncCache: def __init__(self, db: "AsyncMorphik", name: str): @@ -60,18 +49,941 @@ class AsyncCache: return CompletionResponse(**response) -class FinalChunkResult(BaseModel): - content: str | PILImage = Field(..., description="Chunk content") - score: float = Field(..., description="Relevance score") - document_id: str = Field(..., description="Parent document ID") - chunk_number: int = Field(..., description="Chunk sequence number") - metadata: Dict[str, Any] = Field(default_factory=dict, description="Document metadata") - content_type: str = Field(..., description="Content type") - filename: Optional[str] = Field(None, description="Original filename") - download_url: Optional[str] = Field(None, description="URL to download full document") +class AsyncFolder: + """ + A folder that allows operations to be scoped to a specific folder. - class Config: - arbitrary_types_allowed = True + Args: + client: The AsyncMorphik client instance + name: The name of the folder + """ + + def __init__(self, client: "AsyncMorphik", name: str): + self._client = client + self._name = name + + @property + def name(self) -> str: + """Returns the folder name.""" + return self._name + + def signin(self, end_user_id: str) -> "AsyncUserScope": + """ + Returns an AsyncUserScope object scoped to this folder and the end user. 
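+        Every request made through the returned scope is tagged with both this
+        folder's name and the given end user ID.
+
+        Example (illustrative; db is an existing AsyncMorphik client):
+            ```python
+            folder = db.create_folder("acme-docs")
+            scope = folder.signin("user-123")
+            doc = await scope.ingest_text("hello world", filename="greeting.txt")
+            ```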
+ + Args: + end_user_id: The ID of the end user + + Returns: + AsyncUserScope: A user scope scoped to this folder and the end user + """ + return AsyncUserScope(client=self._client, end_user_id=end_user_id, folder_name=self._name) + + async def ingest_text( + self, + content: str, + filename: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + rules: Optional[List[RuleOrDict]] = None, + use_colpali: bool = True, + ) -> Document: + """ + Ingest a text document into Morphik within this folder. + + Args: + content: Text content to ingest + filename: Optional file name + metadata: Optional metadata dictionary + rules: Optional list of rules to apply during ingestion + use_colpali: Whether to use ColPali-style embedding model + + Returns: + Document: Metadata of the ingested document + """ + rules_list = [self._client._convert_rule(r) for r in (rules or [])] + payload = self._client._logic._prepare_ingest_text_request( + content, filename, metadata, rules_list, use_colpali, self._name, None + ) + response = await self._client._request("POST", "ingest/text", data=payload) + doc = self._client._logic._parse_document_response(response) + doc._client = self._client + return doc + + async def ingest_file( + self, + file: Union[str, bytes, BinaryIO, Path], + filename: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + rules: Optional[List[RuleOrDict]] = None, + use_colpali: bool = True, + ) -> Document: + """ + Ingest a file document into Morphik within this folder. + + Args: + file: File to ingest (path string, bytes, file object, or Path) + filename: Name of the file + metadata: Optional metadata dictionary + rules: Optional list of rules to apply during ingestion + use_colpali: Whether to use ColPali-style embedding model + + Returns: + Document: Metadata of the ingested document + """ + # Process file input + file_obj, filename = self._client._logic._prepare_file_for_upload(file, filename) + + try: + # Prepare multipart form data + files = {"file": (filename, file_obj)} + + # Create form data + form_data = self._client._logic._prepare_ingest_file_form_data( + metadata, rules, self._name, None + ) + + response = await self._client._request( + "POST", + f"ingest/file?use_colpali={str(use_colpali).lower()}", + data=form_data, + files=files, + ) + doc = self._client._logic._parse_document_response(response) + doc._client = self._client + return doc + finally: + # Close file if we opened it + if isinstance(file, (str, Path)): + file_obj.close() + + async def ingest_files( + self, + files: List[Union[str, bytes, BinaryIO, Path]], + metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, + rules: Optional[List[RuleOrDict]] = None, + use_colpali: bool = True, + parallel: bool = True, + ) -> List[Document]: + """ + Ingest multiple files into Morphik within this folder. 
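+        metadata may be a single dict applied to every file or a list with one
+        dict per file; rules may be a flat list shared by all files or a list
+        of per-file rule lists.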
+ + Args: + files: List of files to ingest + metadata: Optional metadata + rules: Optional list of rules to apply + use_colpali: Whether to use ColPali-style embedding + parallel: Whether to process files in parallel + + Returns: + List[Document]: List of ingested documents + """ + # Convert files to format expected by API + file_objects = self._client._logic._prepare_files_for_upload(files) + + try: + # Prepare form data + data = self._client._logic._prepare_ingest_files_form_data( + metadata, rules, use_colpali, parallel, self._name, None + ) + + response = await self._client._request( + "POST", "ingest/files", data=data, files=file_objects + ) + + if response.get("errors"): + # Log errors but don't raise exception + for error in response["errors"]: + logger.error(f"Failed to ingest {error['filename']}: {error['error']}") + + docs = [ + self._client._logic._parse_document_response(doc) for doc in response["documents"] + ] + for doc in docs: + doc._client = self._client + return docs + finally: + # Clean up file objects + for _, (_, file_obj) in file_objects: + if isinstance(file_obj, (IOBase, BytesIO)) and not file_obj.closed: + file_obj.close() + + async def ingest_directory( + self, + directory: Union[str, Path], + recursive: bool = False, + pattern: str = "*", + metadata: Optional[Dict[str, Any]] = None, + rules: Optional[List[RuleOrDict]] = None, + use_colpali: bool = True, + parallel: bool = True, + ) -> List[Document]: + """ + Ingest all files in a directory into Morphik within this folder. + + Args: + directory: Path to directory containing files to ingest + recursive: Whether to recursively process subdirectories + pattern: Optional glob pattern to filter files + metadata: Optional metadata dictionary to apply to all files + rules: Optional list of rules to apply + use_colpali: Whether to use ColPali-style embedding + parallel: Whether to process files in parallel + + Returns: + List[Document]: List of ingested documents + """ + directory = Path(directory) + if not directory.is_dir(): + raise ValueError(f"Directory not found: {directory}") + + # Collect all files matching pattern + if recursive: + files = list(directory.rglob(pattern)) + else: + files = list(directory.glob(pattern)) + + # Filter out directories + files = [f for f in files if f.is_file()] + + if not files: + return [] + + # Use ingest_files with collected paths + return await self.ingest_files( + files=files, metadata=metadata, rules=rules, use_colpali=use_colpali, parallel=parallel + ) + + async def retrieve_chunks( + self, + query: str, + filters: Optional[Dict[str, Any]] = None, + k: int = 4, + min_score: float = 0.0, + use_colpali: bool = True, + ) -> List[FinalChunkResult]: + """ + Retrieve relevant chunks within this folder. 
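+        Only chunks from documents ingested into this folder are searched.
+
+        Example (illustrative):
+            ```python
+            chunks = await folder.retrieve_chunks("quarterly revenue", k=3)
+            for chunk in chunks:
+                print(chunk.score, chunk.document_id)
+            ```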
+ + Args: + query: Search query text + filters: Optional metadata filters + k: Number of results (default: 4) + min_score: Minimum similarity threshold (default: 0.0) + use_colpali: Whether to use ColPali-style embedding model + + Returns: + List[FinalChunkResult]: List of relevant chunks + """ + payload = self._client._logic._prepare_retrieve_chunks_request( + query, filters, k, min_score, use_colpali, self._name, None + ) + response = await self._client._request("POST", "retrieve/chunks", data=payload) + return self._client._logic._parse_chunk_result_list_response(response) + + async def retrieve_docs( + self, + query: str, + filters: Optional[Dict[str, Any]] = None, + k: int = 4, + min_score: float = 0.0, + use_colpali: bool = True, + ) -> List[DocumentResult]: + """ + Retrieve relevant documents within this folder. + + Args: + query: Search query text + filters: Optional metadata filters + k: Number of results (default: 4) + min_score: Minimum similarity threshold (default: 0.0) + use_colpali: Whether to use ColPali-style embedding model + + Returns: + List[DocumentResult]: List of relevant documents + """ + payload = self._client._logic._prepare_retrieve_docs_request( + query, filters, k, min_score, use_colpali, self._name, None + ) + response = await self._client._request("POST", "retrieve/docs", data=payload) + return self._client._logic._parse_document_result_list_response(response) + + async def query( + self, + query: str, + filters: Optional[Dict[str, Any]] = None, + k: int = 4, + min_score: float = 0.0, + max_tokens: Optional[int] = None, + temperature: Optional[float] = None, + use_colpali: bool = True, + graph_name: Optional[str] = None, + hop_depth: int = 1, + include_paths: bool = False, + prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None, + ) -> CompletionResponse: + """ + Generate completion using relevant chunks as context within this folder. + + Args: + query: Query text + filters: Optional metadata filters + k: Number of chunks to use as context (default: 4) + min_score: Minimum similarity threshold (default: 0.0) + max_tokens: Maximum tokens in completion + temperature: Model temperature + use_colpali: Whether to use ColPali-style embedding model + graph_name: Optional name of the graph to use for knowledge graph-enhanced retrieval + hop_depth: Number of relationship hops to traverse in the graph (1-3) + include_paths: Whether to include relationship paths in the response + prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts + + Returns: + CompletionResponse: Generated completion + """ + payload = self._client._logic._prepare_query_request( + query, + filters, + k, + min_score, + max_tokens, + temperature, + use_colpali, + graph_name, + hop_depth, + include_paths, + prompt_overrides, + self._name, + None, + ) + response = await self._client._request("POST", "query", data=payload) + return self._client._logic._parse_completion_response(response) + + async def list_documents( + self, skip: int = 0, limit: int = 100, filters: Optional[Dict[str, Any]] = None + ) -> List[Document]: + """ + List accessible documents within this folder. 
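+        Pagination is controlled by skip and limit (sent as query parameters),
+        while filters are sent in the request body.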
+ + Args: + skip: Number of documents to skip + limit: Maximum number of documents to return + filters: Optional filters + + Returns: + List[Document]: List of documents + """ + params, data = self._client._logic._prepare_list_documents_request( + skip, limit, filters, self._name, None + ) + response = await self._client._request("POST", "documents", data=data, params=params) + docs = self._client._logic._parse_document_list_response(response) + for doc in docs: + doc._client = self._client + return docs + + async def batch_get_documents(self, document_ids: List[str]) -> List[Document]: + """ + Retrieve multiple documents by their IDs in a single batch operation within this folder. + + Args: + document_ids: List of document IDs to retrieve + + Returns: + List[Document]: List of document metadata for found documents + """ + request = self._client._logic._prepare_batch_get_documents_request( + document_ids, self._name, None + ) + response = await self._client._request("POST", "batch/documents", data=request) + docs = self._client._logic._parse_document_list_response(response) + for doc in docs: + doc._client = self._client + return docs + + async def batch_get_chunks( + self, sources: List[Union[ChunkSource, Dict[str, Any]]] + ) -> List[FinalChunkResult]: + """ + Retrieve specific chunks by their document ID and chunk number in a single batch operation within this folder. + + Args: + sources: List of ChunkSource objects or dictionaries with document_id and chunk_number + + Returns: + List[FinalChunkResult]: List of chunk results + """ + request = self._client._logic._prepare_batch_get_chunks_request(sources, self._name, None) + response = await self._client._request("POST", "batch/chunks", data=request) + return self._client._logic._parse_chunk_result_list_response(response) + + async def create_graph( + self, + name: str, + filters: Optional[Dict[str, Any]] = None, + documents: Optional[List[str]] = None, + prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None, + ) -> Graph: + """ + Create a graph from documents within this folder. + + Args: + name: Name of the graph to create + filters: Optional metadata filters to determine which documents to include + documents: Optional list of specific document IDs to include + prompt_overrides: Optional customizations for entity extraction and resolution prompts + + Returns: + Graph: The created graph object + """ + request = self._client._logic._prepare_create_graph_request( + name, filters, documents, prompt_overrides, self._name, None + ) + response = await self._client._request("POST", "graph/create", data=request) + return self._client._logic._parse_graph_response(response) + + async def update_graph( + self, + name: str, + additional_filters: Optional[Dict[str, Any]] = None, + additional_documents: Optional[List[str]] = None, + prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None, + ) -> Graph: + """ + Update an existing graph with new documents from this folder. 
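+        The folder scope is sent with the update, so the additional documents
+        are drawn from this folder.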
+ + Args: + name: Name of the graph to update + additional_filters: Optional additional metadata filters to determine which new documents to include + additional_documents: Optional list of additional document IDs to include + prompt_overrides: Optional customizations for entity extraction and resolution prompts + + Returns: + Graph: The updated graph + """ + request = self._client._logic._prepare_update_graph_request( + name, additional_filters, additional_documents, prompt_overrides, self._name, None + ) + response = await self._client._request("POST", f"graph/{name}/update", data=request) + return self._client._logic._parse_graph_response(response) + + async def delete_document_by_filename(self, filename: str) -> Dict[str, str]: + """ + Delete a document by its filename within this folder. + + Args: + filename: Filename of the document to delete + + Returns: + Dict[str, str]: Deletion status + """ + # Get the document by filename with folder scope + request = {"filename": filename, "folder_name": self._name} + + # First get the document ID + response = await self._client._request( + "GET", f"documents/filename/{filename}", params={"folder_name": self._name} + ) + doc = self._client._logic._parse_document_response(response) + + # Then delete by ID + return await self._client.delete_document(doc.external_id) + + +class AsyncUserScope: + """ + A user scope that allows operations to be scoped to a specific end user and optionally a folder. + + Args: + client: The AsyncMorphik client instance + end_user_id: The ID of the end user + folder_name: Optional folder name to further scope operations + """ + + def __init__(self, client: "AsyncMorphik", end_user_id: str, folder_name: Optional[str] = None): + self._client = client + self._end_user_id = end_user_id + self._folder_name = folder_name + + @property + def end_user_id(self) -> str: + """Returns the end user ID.""" + return self._end_user_id + + @property + def folder_name(self) -> Optional[str]: + """Returns the folder name if any.""" + return self._folder_name + + async def ingest_text( + self, + content: str, + filename: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + rules: Optional[List[RuleOrDict]] = None, + use_colpali: bool = True, + ) -> Document: + """ + Ingest a text document into Morphik as this end user. + + Args: + content: Text content to ingest + filename: Optional file name + metadata: Optional metadata dictionary + rules: Optional list of rules to apply during ingestion + use_colpali: Whether to use ColPali-style embedding model + + Returns: + Document: Metadata of the ingested document + """ + rules_list = [self._client._convert_rule(r) for r in (rules or [])] + payload = self._client._logic._prepare_ingest_text_request( + content, + filename, + metadata, + rules_list, + use_colpali, + self._folder_name, + self._end_user_id, + ) + response = await self._client._request("POST", "ingest/text", data=payload) + doc = self._client._logic._parse_document_response(response) + doc._client = self._client + return doc + + async def ingest_file( + self, + file: Union[str, bytes, BinaryIO, Path], + filename: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + rules: Optional[List[RuleOrDict]] = None, + use_colpali: bool = True, + ) -> Document: + """ + Ingest a file document into Morphik as this end user. 
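+        The upload is tagged with this scope's end_user_id and, if the scope
+        was created from a folder, its folder_name.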
+ + Args: + file: File to ingest (path string, bytes, file object, or Path) + filename: Name of the file + metadata: Optional metadata dictionary + rules: Optional list of rules to apply during ingestion + use_colpali: Whether to use ColPali-style embedding model + + Returns: + Document: Metadata of the ingested document + """ + # Handle different file input types + if isinstance(file, (str, Path)): + file_path = Path(file) + if not file_path.exists(): + raise ValueError(f"File not found: {file}") + filename = file_path.name if filename is None else filename + with open(file_path, "rb") as f: + content = f.read() + file_obj = BytesIO(content) + elif isinstance(file, bytes): + if filename is None: + raise ValueError("filename is required when ingesting bytes") + file_obj = BytesIO(file) + else: + if filename is None: + raise ValueError("filename is required when ingesting file object") + file_obj = file + + try: + # Prepare multipart form data + files = {"file": (filename, file_obj)} + + # Add metadata and rules + data = { + "metadata": json.dumps(metadata or {}), + "rules": json.dumps([self._client._convert_rule(r) for r in (rules or [])]), + "end_user_id": self._end_user_id, # Add end user ID here + } + + # Add folder name if scoped to a folder + if self._folder_name: + data["folder_name"] = self._folder_name + + response = await self._client._request("POST", "ingest/file", data=data, files=files) + doc = self._client._logic._parse_document_response(response) + doc._client = self._client + return doc + finally: + # Close file if we opened it + if isinstance(file, (str, Path)): + file_obj.close() + + async def ingest_files( + self, + files: List[Union[str, bytes, BinaryIO, Path]], + metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, + rules: Optional[List[RuleOrDict]] = None, + use_colpali: bool = True, + parallel: bool = True, + ) -> List[Document]: + """ + Ingest multiple files into Morphik as this end user. 
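+        Per-file failures reported by the server are logged and skipped;
+        documents that were ingested successfully are still returned.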
+ + Args: + files: List of files to ingest + metadata: Optional metadata + rules: Optional list of rules to apply + use_colpali: Whether to use ColPali-style embedding + parallel: Whether to process files in parallel + + Returns: + List[Document]: List of ingested documents + """ + # Convert files to format expected by API + file_objects = [] + for file in files: + if isinstance(file, (str, Path)): + path = Path(file) + file_objects.append(("files", (path.name, open(path, "rb")))) + elif isinstance(file, bytes): + file_objects.append(("files", ("file.bin", file))) + else: + file_objects.append(("files", (getattr(file, "name", "file.bin"), file))) + + try: + # Prepare request data + # Convert rules appropriately + if rules: + if all(isinstance(r, list) for r in rules): + # List of lists - per-file rules + converted_rules = [ + [self._client._convert_rule(r) for r in rule_list] for rule_list in rules + ] + else: + # Flat list - shared rules for all files + converted_rules = [self._client._convert_rule(r) for r in rules] + else: + converted_rules = [] + + data = { + "metadata": json.dumps(metadata or {}), + "rules": json.dumps(converted_rules), + "use_colpali": str(use_colpali).lower() if use_colpali is not None else None, + "parallel": str(parallel).lower(), + "end_user_id": self._end_user_id, # Add end user ID here + } + + # Add folder name if scoped to a folder + if self._folder_name: + data["folder_name"] = self._folder_name + + response = await self._client._request( + "POST", "ingest/files", data=data, files=file_objects + ) + + if response.get("errors"): + # Log errors but don't raise exception + for error in response["errors"]: + logger.error(f"Failed to ingest {error['filename']}: {error['error']}") + + docs = [ + self._client._logic._parse_document_response(doc) for doc in response["documents"] + ] + for doc in docs: + doc._client = self._client + return docs + finally: + # Clean up file objects + for _, (_, file_obj) in file_objects: + if isinstance(file_obj, (IOBase, BytesIO)) and not file_obj.closed: + file_obj.close() + + async def ingest_directory( + self, + directory: Union[str, Path], + recursive: bool = False, + pattern: str = "*", + metadata: Optional[Dict[str, Any]] = None, + rules: Optional[List[RuleOrDict]] = None, + use_colpali: bool = True, + parallel: bool = True, + ) -> List[Document]: + """ + Ingest all files in a directory into Morphik as this end user. 
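+        Files are collected with pattern (recursing into subdirectories when
+        recursive is True), directories themselves are skipped, and the matches
+        are passed to ingest_files.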
+ + Args: + directory: Path to directory containing files to ingest + recursive: Whether to recursively process subdirectories + pattern: Optional glob pattern to filter files + metadata: Optional metadata dictionary to apply to all files + rules: Optional list of rules to apply + use_colpali: Whether to use ColPali-style embedding + parallel: Whether to process files in parallel + + Returns: + List[Document]: List of ingested documents + """ + directory = Path(directory) + if not directory.is_dir(): + raise ValueError(f"Directory not found: {directory}") + + # Collect all files matching pattern + if recursive: + files = list(directory.rglob(pattern)) + else: + files = list(directory.glob(pattern)) + + # Filter out directories + files = [f for f in files if f.is_file()] + + if not files: + return [] + + # Use ingest_files with collected paths + return await self.ingest_files( + files=files, metadata=metadata, rules=rules, use_colpali=use_colpali, parallel=parallel + ) + + async def retrieve_chunks( + self, + query: str, + filters: Optional[Dict[str, Any]] = None, + k: int = 4, + min_score: float = 0.0, + use_colpali: bool = True, + ) -> List[FinalChunkResult]: + """ + Retrieve relevant chunks as this end user. + + Args: + query: Search query text + filters: Optional metadata filters + k: Number of results (default: 4) + min_score: Minimum similarity threshold (default: 0.0) + use_colpali: Whether to use ColPali-style embedding model + + Returns: + List[FinalChunkResult]: List of relevant chunks + """ + payload = self._client._logic._prepare_retrieve_chunks_request( + query, filters, k, min_score, use_colpali, self._folder_name, self._end_user_id + ) + response = await self._client._request("POST", "retrieve/chunks", data=payload) + return self._client._logic._parse_chunk_result_list_response(response) + + async def retrieve_docs( + self, + query: str, + filters: Optional[Dict[str, Any]] = None, + k: int = 4, + min_score: float = 0.0, + use_colpali: bool = True, + ) -> List[DocumentResult]: + """ + Retrieve relevant documents as this end user. + + Args: + query: Search query text + filters: Optional metadata filters + k: Number of results (default: 4) + min_score: Minimum similarity threshold (default: 0.0) + use_colpali: Whether to use ColPali-style embedding model + + Returns: + List[DocumentResult]: List of relevant documents + """ + payload = self._client._logic._prepare_retrieve_docs_request( + query, filters, k, min_score, use_colpali, self._folder_name, self._end_user_id + ) + response = await self._client._request("POST", "retrieve/docs", data=payload) + return self._client._logic._parse_document_result_list_response(response) + + async def query( + self, + query: str, + filters: Optional[Dict[str, Any]] = None, + k: int = 4, + min_score: float = 0.0, + max_tokens: Optional[int] = None, + temperature: Optional[float] = None, + use_colpali: bool = True, + graph_name: Optional[str] = None, + hop_depth: int = 1, + include_paths: bool = False, + prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None, + ) -> CompletionResponse: + """ + Generate completion using relevant chunks as context as this end user. 
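+        Retrieval and completion are restricted to this end user's documents
+        (and to the folder, when the scope was created from one).
+
+        Example (illustrative; scope comes from db.signin("user-123")):
+            ```python
+            response = await scope.query("Summarize my latest report", k=6)
+            print(response.completion)
+            ```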
+ + Args: + query: Query text + filters: Optional metadata filters + k: Number of chunks to use as context (default: 4) + min_score: Minimum similarity threshold (default: 0.0) + max_tokens: Maximum tokens in completion + temperature: Model temperature + use_colpali: Whether to use ColPali-style embedding model + graph_name: Optional name of the graph to use for knowledge graph-enhanced retrieval + hop_depth: Number of relationship hops to traverse in the graph (1-3) + include_paths: Whether to include relationship paths in the response + prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts + + Returns: + CompletionResponse: Generated completion + """ + payload = self._client._logic._prepare_query_request( + query, + filters, + k, + min_score, + max_tokens, + temperature, + use_colpali, + graph_name, + hop_depth, + include_paths, + prompt_overrides, + self._folder_name, + self._end_user_id, + ) + response = await self._client._request("POST", "query", data=payload) + return self._client._logic._parse_completion_response(response) + + async def list_documents( + self, skip: int = 0, limit: int = 100, filters: Optional[Dict[str, Any]] = None + ) -> List[Document]: + """ + List accessible documents for this end user. + + Args: + skip: Number of documents to skip + limit: Maximum number of documents to return + filters: Optional filters + + Returns: + List[Document]: List of documents + """ + params, data = self._client._logic._prepare_list_documents_request( + skip, limit, filters, self._folder_name, self._end_user_id + ) + response = await self._client._request("POST", "documents", data=data, params=params) + docs = self._client._logic._parse_document_list_response(response) + for doc in docs: + doc._client = self._client + return docs + + async def batch_get_documents(self, document_ids: List[str]) -> List[Document]: + """ + Retrieve multiple documents by their IDs in a single batch operation for this end user. + + Args: + document_ids: List of document IDs to retrieve + + Returns: + List[Document]: List of document metadata for found documents + """ + request = self._client._logic._prepare_batch_get_documents_request( + document_ids, self._folder_name, self._end_user_id + ) + response = await self._client._request("POST", "batch/documents", data=request) + docs = self._client._logic._parse_document_list_response(response) + for doc in docs: + doc._client = self._client + return docs + + async def batch_get_chunks( + self, sources: List[Union[ChunkSource, Dict[str, Any]]] + ) -> List[FinalChunkResult]: + """ + Retrieve specific chunks by their document ID and chunk number in a single batch operation for this end user. + + Args: + sources: List of ChunkSource objects or dictionaries with document_id and chunk_number + + Returns: + List[FinalChunkResult]: List of chunk results + """ + request = self._client._logic._prepare_batch_get_chunks_request( + sources, self._folder_name, self._end_user_id + ) + response = await self._client._request("POST", "batch/chunks", data=request) + return self._client._logic._parse_chunk_result_list_response(response) + + async def create_graph( + self, + name: str, + filters: Optional[Dict[str, Any]] = None, + documents: Optional[List[str]] = None, + prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None, + ) -> Graph: + """ + Create a graph from documents for this end user. 
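+        The graph is built only from documents visible to this end user (and
+        folder, if set); filters and documents further narrow the selection.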
+ + Args: + name: Name of the graph to create + filters: Optional metadata filters to determine which documents to include + documents: Optional list of specific document IDs to include + prompt_overrides: Optional customizations for entity extraction and resolution prompts + + Returns: + Graph: The created graph object + """ + request = self._client._logic._prepare_create_graph_request( + name, filters, documents, prompt_overrides, self._folder_name, self._end_user_id + ) + response = await self._client._request("POST", "graph/create", data=request) + return self._client._logic._parse_graph_response(response) + + async def update_graph( + self, + name: str, + additional_filters: Optional[Dict[str, Any]] = None, + additional_documents: Optional[List[str]] = None, + prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None, + ) -> Graph: + """ + Update an existing graph with new documents for this end user. + + Args: + name: Name of the graph to update + additional_filters: Optional additional metadata filters to determine which new documents to include + additional_documents: Optional list of additional document IDs to include + prompt_overrides: Optional customizations for entity extraction and resolution prompts + + Returns: + Graph: The updated graph + """ + request = self._client._logic._prepare_update_graph_request( + name, + additional_filters, + additional_documents, + prompt_overrides, + self._folder_name, + self._end_user_id, + ) + response = await self._client._request("POST", f"graph/{name}/update", data=request) + return self._client._logic._parse_graph_response(response) + + async def delete_document_by_filename(self, filename: str) -> Dict[str, str]: + """ + Delete a document by its filename for this end user. + + Args: + filename: Filename of the document to delete + + Returns: + Dict[str, str]: Deletion status + """ + # Build parameters for the filename lookup + params = {"end_user_id": self._end_user_id} + + # Add folder name if scoped to a folder + if self._folder_name: + params["folder_name"] = self._folder_name + + # First get the document ID + response = await self._client._request( + "GET", f"documents/filename/{filename}", params=params + ) + doc = self._client._logic._parse_document_response(response) + + # Then delete by ID + return await self._client.delete_document(doc.external_id) class AsyncMorphik: @@ -97,39 +1009,12 @@ class AsyncMorphik: """ def __init__(self, uri: Optional[str] = None, timeout: int = 30, is_local: bool = False): - self._timeout = timeout - self._client = ( - httpx.AsyncClient(timeout=timeout) - if not is_local - else httpx.AsyncClient( - timeout=timeout, - verify=False, # Disable SSL for localhost - http2=False, # Force HTTP/1.1 - ) + self._logic = _MorphikClientLogic(uri, timeout, is_local) + self._client = httpx.AsyncClient( + timeout=self._logic._timeout, + verify=not self._logic._is_local, + http2=False if self._logic._is_local else True, ) - self._is_local = is_local - - if uri: - self._setup_auth(uri) - else: - self._base_url = "http://localhost:8000" - self._auth_token = None - - def _setup_auth(self, uri: str) -> None: - """Setup authentication from URI""" - parsed = urlparse(uri) - if not parsed.netloc: - raise ValueError("Invalid URI format") - - # Split host and auth parts - auth, host = parsed.netloc.split("@") - _, self._auth_token = auth.split(":") - - # Set base URL - self._base_url = f"{'http' if self._is_local else 'https'}://{host}" - - # Basic token validation - jwt.decode(self._auth_token, 
options={"verify_signature": False}) async def _request( self, @@ -140,9 +1025,10 @@ class AsyncMorphik: params: Optional[Dict[str, Any]] = None, ) -> Dict[str, Any]: """Make HTTP request""" - headers = {} - if self._auth_token: # Only add auth header if we have a token - headers["Authorization"] = f"Bearer {self._auth_token}" + url = self._logic._get_url(endpoint) + headers = self._logic._get_headers() + if self._logic._auth_token: # Only add auth header if we have a token + headers["Authorization"] = f"Bearer {self._logic._auth_token}" # Configure request data based on type if files: @@ -156,7 +1042,7 @@ class AsyncMorphik: response = await self._client.request( method, - f"{self._base_url}/{endpoint.lstrip('/')}", + url, headers=headers, params=params, **request_data, @@ -166,9 +1052,43 @@ class AsyncMorphik: def _convert_rule(self, rule: RuleOrDict) -> Dict[str, Any]: """Convert a rule to a dictionary format""" - if hasattr(rule, "to_dict"): - return rule.to_dict() - return rule + return self._logic._convert_rule(rule) + + def create_folder(self, name: str) -> AsyncFolder: + """ + Create a folder to scope operations. + + Args: + name: The name of the folder + + Returns: + AsyncFolder: A folder object for scoped operations + """ + return AsyncFolder(self, name) + + def get_folder(self, name: str) -> AsyncFolder: + """ + Get a folder by name to scope operations. + + Args: + name: The name of the folder + + Returns: + AsyncFolder: A folder object for scoped operations + """ + return AsyncFolder(self, name) + + def signin(self, end_user_id: str) -> AsyncUserScope: + """ + Sign in as an end user to scope operations. + + Args: + end_user_id: The ID of the end user + + Returns: + AsyncUserScope: A user scope object for scoped operations + """ + return AsyncUserScope(self, end_user_id) async def ingest_text( self, @@ -213,53 +1133,41 @@ class AsyncMorphik: ) ``` """ - request = IngestTextRequest( - content=content, - filename=filename, - metadata=metadata or {}, - rules=[self._convert_rule(r) for r in (rules or [])], - use_colpali=use_colpali, + rules_list = [self._convert_rule(r) for r in (rules or [])] + payload = self._logic._prepare_ingest_text_request( + content, filename, metadata, rules_list, use_colpali, None, None ) - response = await self._request("POST", "ingest/text", data=request.model_dump()) - doc = Document(**response) + response = await self._request("POST", "ingest/text", data=payload) + doc = self._logic._parse_document_response(response) doc._client = self return doc async def ingest_file( self, file: Union[str, bytes, BinaryIO, Path], - filename: str, + filename: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, rules: Optional[List[RuleOrDict]] = None, use_colpali: bool = True, ) -> Document: """Ingest a file document into Morphik.""" - # Handle different file input types - if isinstance(file, (str, Path)): - file_path = Path(file) - if not file_path.exists(): - raise ValueError(f"File not found: {file}") - with open(file_path, "rb") as f: - content = f.read() - file_obj = BytesIO(content) - elif isinstance(file, bytes): - file_obj = BytesIO(file) - else: - file_obj = file + # Process file input + file_obj, filename = self._logic._prepare_file_for_upload(file, filename) try: # Prepare multipart form data files = {"file": (filename, file_obj)} - # Add metadata and rules - data = { - "metadata": json.dumps(metadata or {}), - "rules": json.dumps([self._convert_rule(r) for r in (rules or [])]), - "use_colpali": json.dumps(use_colpali), - } + # Create form data 
+ form_data = self._logic._prepare_ingest_file_form_data(metadata, rules, None, None) - response = await self._request("POST", "ingest/file", data=data, files=files) - doc = Document(**response) + response = await self._request( + "POST", + f"ingest/file?use_colpali={str(use_colpali).lower()}", + data=form_data, + files=files, + ) + doc = self._logic._parse_document_response(response) doc._client = self return doc finally: @@ -292,44 +1200,23 @@ class AsyncMorphik: ValueError: If metadata list length doesn't match files length """ # Convert files to format expected by API - file_objects = [] - for file in files: - if isinstance(file, (str, Path)): - path = Path(file) - file_objects.append(("files", (path.name, open(path, "rb")))) - elif isinstance(file, bytes): - file_objects.append(("files", ("file.bin", file))) - else: - file_objects.append(("files", (getattr(file, "name", "file.bin"), file))) + file_objects = self._logic._prepare_files_for_upload(files) try: - # Prepare request data - # Convert rules appropriately based on whether it's a flat list or list of lists - if rules: - if all(isinstance(r, list) for r in rules): - # List of lists - per-file rules - converted_rules = [[self._convert_rule(r) for r in rule_list] for rule_list in rules] - else: - # Flat list - shared rules for all files - converted_rules = [self._convert_rule(r) for r in rules] - else: - converted_rules = [] - - data = { - "metadata": json.dumps(metadata or {}), - "rules": json.dumps(converted_rules), - "use_colpali": str(use_colpali).lower() if use_colpali is not None else None, - "parallel": str(parallel).lower(), - } + # Prepare form data + data = self._logic._prepare_ingest_files_form_data( + metadata, rules, use_colpali, parallel, None, None + ) response = await self._request("POST", "ingest/files", data=data, files=file_objects) - + if response.get("errors"): # Log errors but don't raise exception for error in response["errors"]: logger.error(f"Failed to ingest {error['filename']}: {error['error']}") - - docs = [Document(**doc) for doc in response["documents"]] + + # Parse the documents from the response + docs = [self._client._logic._parse_document_response(doc) for doc in response["documents"]] for doc in docs: doc._client = self return docs @@ -379,17 +1266,13 @@ class AsyncMorphik: # Filter out directories files = [f for f in files if f.is_file()] - + if not files: return [] # Use ingest_files with collected paths return await self.ingest_files( - files=files, - metadata=metadata, - rules=rules, - use_colpali=use_colpali, - parallel=parallel + files=files, metadata=metadata, rules=rules, use_colpali=use_colpali, parallel=parallel ) async def retrieve_chunks( @@ -420,54 +1303,11 @@ class AsyncMorphik: ) ``` """ - request = { - "query": query, - "filters": filters, - "k": k, - "min_score": min_score, - "use_colpali": use_colpali, - } - - response = await self._request("POST", "retrieve/chunks", data=request) - chunks = [ChunkResult(**r) for r in response] - - final_chunks = [] - for chunk in chunks: - if chunk.metadata.get("is_image"): - try: - # Handle data URI format "data:image/png;base64,..." 
- content = chunk.content - if content.startswith("data:"): - # Extract the base64 part after the comma - content = content.split(",", 1)[1] - - # Now decode the base64 string - import base64 - import io - from PIL import Image - image_bytes = base64.b64decode(content) - content = Image.open(io.BytesIO(image_bytes)) - except Exception as e: - print(f"Error processing image: {str(e)}") - # Fall back to using the content as text - content = chunk.content - else: - content = chunk.content - - final_chunks.append( - FinalChunkResult( - content=content, - score=chunk.score, - document_id=chunk.document_id, - chunk_number=chunk.chunk_number, - metadata=chunk.metadata, - content_type=chunk.content_type, - filename=chunk.filename, - download_url=chunk.download_url, - ) - ) - - return final_chunks + payload = self._logic._prepare_retrieve_chunks_request( + query, filters, k, min_score, use_colpali, None, None + ) + response = await self._request("POST", "retrieve/chunks", data=payload) + return self._logic._parse_chunk_result_list_response(response) async def retrieve_docs( self, @@ -497,16 +1337,11 @@ class AsyncMorphik: ) ``` """ - request = { - "query": query, - "filters": filters, - "k": k, - "min_score": min_score, - "use_colpali": use_colpali, - } - - response = await self._request("POST", "retrieve/docs", data=request) - return [DocumentResult(**r) for r in response] + payload = self._logic._prepare_retrieve_docs_request( + query, filters, k, min_score, use_colpali, None, None + ) + response = await self._request("POST", "retrieve/docs", data=payload) + return self._logic._parse_document_result_list_response(response) async def query( self, @@ -549,7 +1384,7 @@ class AsyncMorphik: filters={"department": "research"}, temperature=0.7 ) - + # Knowledge graph enhanced query response = await db.query( "How does product X relate to customer segment Y?", @@ -557,7 +1392,7 @@ class AsyncMorphik: hop_depth=2, include_paths=True ) - + # With prompt customization from morphik.models import QueryPromptOverride, QueryPromptOverrides response = await db.query( @@ -568,7 +1403,7 @@ class AsyncMorphik: ) ) ) - + # Or using a dictionary response = await db.query( "What are the key findings?", @@ -578,35 +1413,32 @@ class AsyncMorphik: } } ) - + print(response.completion) - + # If include_paths=True, you can inspect the graph paths if response.metadata and "graph" in response.metadata: for path in response.metadata["graph"]["paths"]: print(" -> ".join(path)) ``` """ - # Convert prompt_overrides to dict if it's a model - if prompt_overrides and isinstance(prompt_overrides, QueryPromptOverrides): - prompt_overrides = prompt_overrides.model_dump(exclude_none=True) - - request = { - "query": query, - "filters": filters, - "k": k, - "min_score": min_score, - "max_tokens": max_tokens, - "temperature": temperature, - "use_colpali": use_colpali, - "graph_name": graph_name, - "hop_depth": hop_depth, - "include_paths": include_paths, - "prompt_overrides": prompt_overrides, - } - - response = await self._request("POST", "query", data=request) - return CompletionResponse(**response) + payload = self._logic._prepare_query_request( + query, + filters, + k, + min_score, + max_tokens, + temperature, + use_colpali, + graph_name, + hop_depth, + include_paths, + prompt_overrides, + None, + None, + ) + response = await self._request("POST", "query", data=payload) + return self._logic._parse_completion_response(response) async def list_documents( self, skip: int = 0, limit: int = 100, filters: Optional[Dict[str, Any]] = None @@ 
-631,11 +1463,9 @@ class AsyncMorphik: next_page = await db.list_documents(skip=10, limit=10, filters={"department": "research"}) ``` """ - # Use query params for pagination and POST body for filters - response = await self._request( - "POST", f"documents?skip={skip}&limit={limit}", data=filters or {} - ) - docs = [Document(**doc) for doc in response] + params, data = self._logic._prepare_list_documents_request(skip, limit, filters, None, None) + response = await self._request("POST", "documents", data=data, params=params) + docs = self._logic._parse_document_list_response(response) for doc in docs: doc._client = self return docs @@ -657,10 +1487,10 @@ class AsyncMorphik: ``` """ response = await self._request("GET", f"documents/{document_id}") - doc = Document(**response) + doc = self._logic._parse_document_response(response) doc._client = self return doc - + async def get_document_by_filename(self, filename: str) -> Document: """ Get document metadata by filename. @@ -679,10 +1509,10 @@ class AsyncMorphik: ``` """ response = await self._request("GET", f"documents/filename/{filename}") - doc = Document(**response) + doc = self._logic._parse_document_response(response) doc._client = self return doc - + async def update_document_with_text( self, document_id: str, @@ -695,7 +1525,7 @@ class AsyncMorphik: ) -> Document: """ Update a document with new text content using the specified strategy. - + Args: document_id: ID of the document to update content: The new content to add @@ -704,10 +1534,10 @@ class AsyncMorphik: rules: Optional list of rules to apply to the content update_strategy: Strategy for updating the document (currently only 'add' is supported) use_colpali: Whether to use multi-vector embedding - + Returns: Document: Updated document metadata - + Example: ```python # Add new content to an existing document @@ -729,22 +1559,19 @@ class AsyncMorphik: rules=[self._convert_rule(r) for r in (rules or [])], use_colpali=use_colpali if use_colpali is not None else True, ) - + params = {} if update_strategy != "add": params["update_strategy"] = update_strategy - + response = await self._request( - "POST", - f"documents/{document_id}/update_text", - data=request.model_dump(), - params=params + "POST", f"documents/{document_id}/update_text", data=request.model_dump(), params=params ) - - doc = Document(**response) + + doc = self._logic._parse_document_response(response) doc._client = self return doc - + async def update_document_with_file( self, document_id: str, @@ -757,7 +1584,7 @@ class AsyncMorphik: ) -> Document: """ Update a document with content from a file using the specified strategy. 
- + Args: document_id: ID of the document to update file: File to add (path string, bytes, file object, or Path) @@ -766,10 +1593,10 @@ class AsyncMorphik: rules: Optional list of rules to apply to the content update_strategy: Strategy for updating the document (currently only 'add' is supported) use_colpali: Whether to use multi-vector embedding - + Returns: Document: Updated document metadata - + Example: ```python # Add content from a file to an existing document @@ -799,34 +1626,34 @@ class AsyncMorphik: if filename is None: raise ValueError("filename is required when updating with file object") file_obj = file - + try: # Prepare multipart form data files = {"file": (filename, file_obj)} - + # Convert metadata and rules to JSON strings form_data = { "metadata": json.dumps(metadata or {}), "rules": json.dumps([self._convert_rule(r) for r in (rules or [])]), "update_strategy": update_strategy, } - + if use_colpali is not None: form_data["use_colpali"] = str(use_colpali).lower() - + # Use the dedicated file update endpoint response = await self._request( "POST", f"documents/{document_id}/update_file", data=form_data, files=files ) - - doc = Document(**response) + + doc = self._logic._parse_document_response(response) doc._client = self return doc finally: # Close file if we opened it if isinstance(file, (str, Path)): file_obj.close() - + async def update_document_metadata( self, document_id: str, @@ -834,14 +1661,14 @@ class AsyncMorphik: ) -> Document: """ Update a document's metadata only. - + Args: document_id: ID of the document to update metadata: Metadata to update - + Returns: Document: Updated document metadata - + Example: ```python # Update just the metadata of a document @@ -853,11 +1680,13 @@ class AsyncMorphik: ``` """ # Use the dedicated metadata update endpoint - response = await self._request("POST", f"documents/{document_id}/update_metadata", data=metadata) - doc = Document(**response) + response = await self._request( + "POST", f"documents/{document_id}/update_metadata", data=metadata + ) + doc = self._logic._parse_document_response(response) doc._client = self return doc - + async def update_document_by_filename_with_text( self, filename: str, @@ -898,7 +1727,7 @@ class AsyncMorphik: """ # First get the document by filename to obtain its ID doc = await self.get_document_by_filename(filename) - + # Then use the regular update_document_with_text endpoint with the document ID return await self.update_document_with_text( document_id=doc.external_id, @@ -907,9 +1736,9 @@ class AsyncMorphik: metadata=metadata, rules=rules, update_strategy=update_strategy, - use_colpali=use_colpali + use_colpali=use_colpali, ) - + async def update_document_by_filename_with_file( self, filename: str, @@ -949,7 +1778,7 @@ class AsyncMorphik: """ # First get the document by filename to obtain its ID doc = await self.get_document_by_filename(filename) - + # Then use the regular update_document_with_file endpoint with the document ID return await self.update_document_with_file( document_id=doc.external_id, @@ -958,9 +1787,9 @@ class AsyncMorphik: metadata=metadata, rules=rules, update_strategy=update_strategy, - use_colpali=use_colpali + use_colpali=use_colpali, ) - + async def update_document_by_filename_metadata( self, filename: str, @@ -969,15 +1798,15 @@ class AsyncMorphik: ) -> Document: """ Update a document's metadata using filename to identify the document. 
- + Args: filename: Filename of the document to update metadata: Metadata to update new_filename: Optional new filename to assign to the document - + Returns: Document: Updated document metadata - + Example: ```python # Update just the metadata of a document identified by filename @@ -991,44 +1820,44 @@ class AsyncMorphik: """ # First get the document by filename to obtain its ID doc = await self.get_document_by_filename(filename) - + # Update the metadata result = await self.update_document_metadata( document_id=doc.external_id, metadata=metadata, ) - + # If new_filename is provided, update the filename as well if new_filename: # Create a request that retains the just-updated metadata but also changes filename combined_metadata = result.metadata.copy() - + # Update the document again with filename change and the same metadata response = await self._request( - "POST", - f"documents/{doc.external_id}/update_text", + "POST", + f"documents/{doc.external_id}/update_text", data={ - "content": "", + "content": "", "filename": new_filename, "metadata": combined_metadata, - "rules": [] - } + "rules": [], + }, ) - result = Document(**response) + result = self._logic._parse_document_response(response) result._client = self - + return result - + async def batch_get_documents(self, document_ids: List[str]) -> List[Document]: """ Retrieve multiple documents by their IDs in a single batch operation. - + Args: document_ids: List of document IDs to retrieve - + Returns: List[Document]: List of document metadata for found documents - + Example: ```python docs = await db.batch_get_documents(["doc_123", "doc_456", "doc_789"]) @@ -1036,22 +1865,25 @@ class AsyncMorphik: print(f"Document {doc.external_id}: {doc.metadata.get('title')}") ``` """ - response = await self._request("POST", "batch/documents", data=document_ids) - docs = [Document(**doc) for doc in response] + request = self._logic._prepare_batch_get_documents_request(document_ids, None, None) + response = await self._request("POST", "batch/documents", data=request) + docs = self._logic._parse_document_list_response(response) for doc in docs: doc._client = self return docs - - async def batch_get_chunks(self, sources: List[Union[ChunkSource, Dict[str, Any]]]) -> List[FinalChunkResult]: + + async def batch_get_chunks( + self, sources: List[Union[ChunkSource, Dict[str, Any]]] + ) -> List[FinalChunkResult]: """ Retrieve specific chunks by their document ID and chunk number in a single batch operation. 
- + Args: sources: List of ChunkSource objects or dictionaries with document_id and chunk_number - + Returns: List[FinalChunkResult]: List of chunk results - + Example: ```python # Using dictionaries @@ -1059,67 +1891,22 @@ class AsyncMorphik: {"document_id": "doc_123", "chunk_number": 0}, {"document_id": "doc_456", "chunk_number": 2} ] - + # Or using ChunkSource objects from morphik.models import ChunkSource sources = [ ChunkSource(document_id="doc_123", chunk_number=0), ChunkSource(document_id="doc_456", chunk_number=2) ] - + chunks = await db.batch_get_chunks(sources) for chunk in chunks: print(f"Chunk from {chunk.document_id}, number {chunk.chunk_number}: {chunk.content[:50]}...") ``` """ - # Convert to list of dictionaries if needed - source_dicts = [] - for source in sources: - if isinstance(source, dict): - source_dicts.append(source) - else: - source_dicts.append(source.model_dump()) - - response = await self._request("POST", "batch/chunks", data=source_dicts) - chunks = [ChunkResult(**r) for r in response] - - final_chunks = [] - for chunk in chunks: - if chunk.metadata.get("is_image"): - try: - # Handle data URI format "data:image/png;base64,..." - content = chunk.content - if content.startswith("data:"): - # Extract the base64 part after the comma - content = content.split(",", 1)[1] - - # Now decode the base64 string - import base64 - import io - from PIL import Image - image_bytes = base64.b64decode(content) - content = Image.open(io.BytesIO(image_bytes)) - except Exception as e: - print(f"Error processing image: {str(e)}") - # Fall back to using the content as text - content = chunk.content - else: - content = chunk.content - - final_chunks.append( - FinalChunkResult( - content=content, - score=chunk.score, - document_id=chunk.document_id, - chunk_number=chunk.chunk_number, - metadata=chunk.metadata, - content_type=chunk.content_type, - filename=chunk.filename, - download_url=chunk.download_url, - ) - ) - - return final_chunks + request = self._logic._prepare_batch_get_chunks_request(sources, None, None) + response = await self._request("POST", "batch/chunks", data=request) + return self._logic._parse_chunk_result_list_response(response) async def create_cache( self, @@ -1221,11 +2008,11 @@ class AsyncMorphik: name="custom_graph", documents=["doc1", "doc2", "doc3"] ) - + # With custom entity extraction examples from morphik.models import EntityExtractionPromptOverride, EntityExtractionExample, GraphPromptOverrides graph = await db.create_graph( - name="medical_graph", + name="medical_graph", filters={"category": "medical"}, prompt_overrides=GraphPromptOverrides( entity_extraction=EntityExtractionPromptOverride( @@ -1238,19 +2025,11 @@ class AsyncMorphik: ) ``` """ - # Convert prompt_overrides to dict if it's a model - if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides): - prompt_overrides = prompt_overrides.model_dump(exclude_none=True) - - request = { - "name": name, - "filters": filters, - "documents": documents, - "prompt_overrides": prompt_overrides, - } - - response = await self._request("POST", "graph/create", request) - return Graph(**response) + request = self._logic._prepare_create_graph_request( + name, filters, documents, prompt_overrides, None, None + ) + response = await self._request("POST", "graph/create", data=request) + return self._logic._parse_graph_response(response) async def get_graph(self, name: str) -> Graph: """ @@ -1270,7 +2049,7 @@ class AsyncMorphik: ``` """ response = await self._request("GET", f"graph/{name}") - return 
Graph(**response) + return self._logic._parse_graph_response(response) async def list_graphs(self) -> List[Graph]: """ @@ -1288,7 +2067,7 @@ class AsyncMorphik: ``` """ response = await self._request("GET", "graphs") - return [Graph(**graph) for graph in response] + return self._logic._parse_graph_list_response(response) async def update_graph( self, @@ -1332,7 +2111,7 @@ class AsyncMorphik: entity_resolution=EntityResolutionPromptOverride( examples=[ EntityResolutionExample( - canonical="Machine Learning", + canonical="Machine Learning", variants=["ML", "machine learning", "AI/ML"] ) ] @@ -1341,34 +2120,27 @@ class AsyncMorphik: ) ``` """ - # Convert prompt_overrides to dict if it's a model - if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides): - prompt_overrides = prompt_overrides.model_dump(exclude_none=True) - - request = { - "additional_filters": additional_filters, - "additional_documents": additional_documents, - "prompt_overrides": prompt_overrides, - } + request = self._logic._prepare_update_graph_request( + name, additional_filters, additional_documents, prompt_overrides, None, None + ) + response = await self._request("POST", f"graph/{name}/update", data=request) + return self._logic._parse_graph_response(response) - response = await self._request("POST", f"graph/{name}/update", request) - return Graph(**response) - async def delete_document(self, document_id: str) -> Dict[str, str]: """ Delete a document and all its associated data. - + This method deletes a document and all its associated data, including: - Document metadata - Document content in storage - Document chunks and embeddings in vector store - + Args: document_id: ID of the document to delete - + Returns: Dict[str, str]: Deletion status - + Example: ```python # Delete a document @@ -1378,20 +2150,20 @@ class AsyncMorphik: """ response = await self._request("DELETE", f"documents/{document_id}") return response - + async def delete_document_by_filename(self, filename: str) -> Dict[str, str]: """ Delete a document by its filename. - + This is a convenience method that first retrieves the document ID by filename and then deletes the document by ID. - + Args: filename: Filename of the document to delete - + Returns: Dict[str, str]: Deletion status - + Example: ```python # Delete a document by filename @@ -1401,7 +2173,7 @@ class AsyncMorphik: """ # First get the document by filename to obtain its ID doc = await self.get_document_by_filename(filename) - + # Then delete the document by ID return await self.delete_document(doc.external_id) diff --git a/sdks/python/morphik/models.py b/sdks/python/morphik/models.py index 611323f..2ef449f 100644 --- a/sdks/python/morphik/models.py +++ b/sdks/python/morphik/models.py @@ -21,10 +21,10 @@ class Document(BaseModel): default_factory=dict, description="Access control information" ) chunk_ids: List[str] = Field(default_factory=list, description="IDs of document chunks") - + # Client reference for update methods _client = None - + def update_with_text( self, content: str, @@ -36,7 +36,7 @@ class Document(BaseModel): ) -> "Document": """ Update this document with new text content using the specified strategy. 
- + Args: content: The new content to add filename: Optional new filename for the document @@ -44,13 +44,15 @@ class Document(BaseModel): rules: Optional list of rules to apply to the content update_strategy: Strategy for updating the document (currently only 'add' is supported) use_colpali: Whether to use multi-vector embedding - + Returns: Document: Updated document metadata """ if self._client is None: - raise ValueError("Document instance not connected to a client. Use a document returned from a Morphik client method.") - + raise ValueError( + "Document instance not connected to a client. Use a document returned from a Morphik client method." + ) + return self._client.update_document_with_text( document_id=self.external_id, content=content, @@ -58,9 +60,9 @@ class Document(BaseModel): metadata=metadata, rules=rules, update_strategy=update_strategy, - use_colpali=use_colpali + use_colpali=use_colpali, ) - + def update_with_file( self, file: "Union[str, bytes, BinaryIO, Path]", @@ -72,7 +74,7 @@ class Document(BaseModel): ) -> "Document": """ Update this document with content from a file using the specified strategy. - + Args: file: File to add (path string, bytes, file object, or Path) filename: Name of the file @@ -80,13 +82,15 @@ class Document(BaseModel): rules: Optional list of rules to apply to the content update_strategy: Strategy for updating the document (currently only 'add' is supported) use_colpali: Whether to use multi-vector embedding - + Returns: Document: Updated document metadata """ if self._client is None: - raise ValueError("Document instance not connected to a client. Use a document returned from a Morphik client method.") - + raise ValueError( + "Document instance not connected to a client. Use a document returned from a Morphik client method." + ) + return self._client.update_document_with_file( document_id=self.external_id, file=file, @@ -94,28 +98,29 @@ class Document(BaseModel): metadata=metadata, rules=rules, update_strategy=update_strategy, - use_colpali=use_colpali + use_colpali=use_colpali, ) - + def update_metadata( self, metadata: Dict[str, Any], ) -> "Document": """ Update this document's metadata only. - + Args: metadata: Metadata to update - + Returns: Document: Updated document metadata """ if self._client is None: - raise ValueError("Document instance not connected to a client. Use a document returned from a Morphik client method.") - + raise ValueError( + "Document instance not connected to a client. Use a document returned from a Morphik client method." 
+ ) + return self._client.update_document_metadata( - document_id=self.external_id, - metadata=metadata + document_id=self.external_id, metadata=metadata ) @@ -159,7 +164,7 @@ class DocumentResult(BaseModel): class ChunkSource(BaseModel): """Source information for a chunk used in completion""" - + document_id: str = Field(..., description="ID of the source document") chunk_number: int = Field(..., description="Chunk number within the document") score: Optional[float] = Field(None, description="Relevance score") @@ -194,7 +199,9 @@ class Entity(BaseModel): type: str = Field(..., description="Entity type") properties: Dict[str, Any] = Field(default_factory=dict, description="Entity properties") document_ids: List[str] = Field(default_factory=list, description="Source document IDs") - chunk_sources: Dict[str, List[int]] = Field(default_factory=dict, description="Source chunk numbers by document ID") + chunk_sources: Dict[str, List[int]] = Field( + default_factory=dict, description="Source chunk numbers by document ID" + ) def __hash__(self): return hash(self.id) @@ -213,7 +220,9 @@ class Relationship(BaseModel): target_id: str = Field(..., description="Target entity ID") type: str = Field(..., description="Relationship type") document_ids: List[str] = Field(default_factory=list, description="Source document IDs") - chunk_sources: Dict[str, List[int]] = Field(default_factory=dict, description="Source chunk numbers by document ID") + chunk_sources: Dict[str, List[int]] = Field( + default_factory=dict, description="Source chunk numbers by document ID" + ) def __hash__(self): return hash(self.id) @@ -230,10 +239,14 @@ class Graph(BaseModel): id: str = Field(..., description="Unique graph identifier") name: str = Field(..., description="Graph name") entities: List[Entity] = Field(default_factory=list, description="Entities in the graph") - relationships: List[Relationship] = Field(default_factory=list, description="Relationships in the graph") + relationships: List[Relationship] = Field( + default_factory=list, description="Relationships in the graph" + ) metadata: Dict[str, Any] = Field(default_factory=dict, description="Graph metadata") document_ids: List[str] = Field(default_factory=list, description="Source document IDs") - filters: Optional[Dict[str, Any]] = Field(None, description="Document filters used to create the graph") + filters: Optional[Dict[str, Any]] = Field( + None, description="Document filters used to create the graph" + ) created_at: datetime = Field(..., description="Creation timestamp") updated_at: datetime = Field(..., description="Last update timestamp") owner: Dict[str, str] = Field(default_factory=dict, description="Graph owner information") diff --git a/sdks/python/morphik/sync.py b/sdks/python/morphik/sync.py index f05e258..a084983 100644 --- a/sdks/python/morphik/sync.py +++ b/sdks/python/morphik/sync.py @@ -1,42 +1,30 @@ -import base64 -from io import BytesIO, IOBase -import io -from PIL.Image import Image as PILImage -from PIL import Image import json import logging +from io import BytesIO, IOBase from pathlib import Path from typing import Dict, Any, List, Optional, Union, BinaryIO -from urllib.parse import urlparse -import jwt -from pydantic import BaseModel, Field -import requests +from PIL import Image +from PIL.Image import Image as PILImage + +import httpx from .models import ( - Document, - ChunkResult, - DocumentResult, - CompletionResponse, - IngestTextRequest, + Document, + DocumentResult, + CompletionResponse, + IngestTextRequest, ChunkSource, Graph, # 
Prompt override models - EntityExtractionExample, - EntityResolutionExample, - EntityExtractionPromptOverride, - EntityResolutionPromptOverride, - QueryPromptOverride, GraphPromptOverrides, - QueryPromptOverrides + QueryPromptOverrides, ) from .rules import Rule +from ._internal import _MorphikClientLogic, FinalChunkResult, RuleOrDict logger = logging.getLogger(__name__) -# Type alias for rules -RuleOrDict = Union[Rule, Dict[str, Any]] - class Cache: def __init__(self, db: "Morphik", name: str): @@ -63,18 +51,1038 @@ class Cache: return CompletionResponse(**response) -class FinalChunkResult(BaseModel): - content: str | PILImage = Field(..., description="Chunk content") - score: float = Field(..., description="Relevance score") - document_id: str = Field(..., description="Parent document ID") - chunk_number: int = Field(..., description="Chunk sequence number") - metadata: Dict[str, Any] = Field(default_factory=dict, description="Document metadata") - content_type: str = Field(..., description="Content type") - filename: Optional[str] = Field(None, description="Original filename") - download_url: Optional[str] = Field(None, description="URL to download full document") +class Folder: + """ + A folder that allows operations to be scoped to a specific folder. - class Config: - arbitrary_types_allowed = True + Args: + client: The Morphik client instance + name: The name of the folder + """ + + def __init__(self, client: "Morphik", name: str): + self._client = client + self._name = name + + @property + def name(self) -> str: + """Returns the folder name.""" + return self._name + + def signin(self, end_user_id: str) -> "UserScope": + """ + Returns a UserScope object scoped to this folder and the end user. + + Args: + end_user_id: The ID of the end user + + Returns: + UserScope: A user scope scoped to this folder and the end user + """ + return UserScope(client=self._client, end_user_id=end_user_id, folder_name=self._name) + + def ingest_text( + self, + content: str, + filename: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + rules: Optional[List[RuleOrDict]] = None, + use_colpali: bool = True, + ) -> Document: + """ + Ingest a text document into Morphik within this folder. + + Args: + content: Text content to ingest + filename: Optional file name + metadata: Optional metadata dictionary + rules: Optional list of rules to apply during ingestion + use_colpali: Whether to use ColPali-style embedding model + + Returns: + Document: Metadata of the ingested document + """ + rules_list = [self._client._convert_rule(r) for r in (rules or [])] + payload = self._client._logic._prepare_ingest_text_request( + content, filename, metadata, rules_list, use_colpali, self._name, None + ) + response = self._client._request("POST", "ingest/text", data=payload) + doc = self._client._logic._parse_document_response(response) + doc._client = self._client + return doc + + def ingest_file( + self, + file: Union[str, bytes, BinaryIO, Path], + filename: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + rules: Optional[List[RuleOrDict]] = None, + use_colpali: bool = True, + ) -> Document: + """ + Ingest a file document into Morphik within this folder. 
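+
+        Example (illustrative sketch; the folder name, file path, and metadata values are placeholders):
+            ```python
+            folder = db.create_folder("project_x")
+            doc = folder.ingest_file("report.pdf", metadata={"category": "research"})
+            print(doc.external_id)
+            ```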
+ + Args: + file: File to ingest (path string, bytes, file object, or Path) + filename: Name of the file + metadata: Optional metadata dictionary + rules: Optional list of rules to apply during ingestion + use_colpali: Whether to use ColPali-style embedding model + + Returns: + Document: Metadata of the ingested document + """ + # Process file input + file_obj, filename = self._client._logic._prepare_file_for_upload(file, filename) + + try: + # Prepare multipart form data + files = {"file": (filename, file_obj)} + + # Create form data + form_data = self._client._logic._prepare_ingest_file_form_data( + metadata, rules, self._name, None + ) + + response = self._client._request( + "POST", + f"ingest/file?use_colpali={str(use_colpali).lower()}", + data=form_data, + files=files, + ) + doc = self._client._logic._parse_document_response(response) + doc._client = self._client + return doc + finally: + # Close file if we opened it + if isinstance(file, (str, Path)): + file_obj.close() + + def ingest_files( + self, + files: List[Union[str, bytes, BinaryIO, Path]], + metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, + rules: Optional[List[RuleOrDict]] = None, + use_colpali: bool = True, + parallel: bool = True, + ) -> List[Document]: + """ + Ingest multiple files into Morphik within this folder. + + Args: + files: List of files to ingest + metadata: Optional metadata + rules: Optional list of rules to apply + use_colpali: Whether to use ColPali-style embedding + parallel: Whether to process files in parallel + + Returns: + List[Document]: List of ingested documents + """ + # Convert files to format expected by API + file_objects = self._client._logic._prepare_files_for_upload(files) + + try: + # Prepare form data + data = self._client._logic._prepare_ingest_files_form_data( + metadata, rules, use_colpali, parallel, self._name, None + ) + + response = self._client._request("POST", "ingest/files", data=data, files=file_objects) + + if response.get("errors"): + # Log errors but don't raise exception + for error in response["errors"]: + logger.error(f"Failed to ingest {error['filename']}: {error['error']}") + + docs = [ + self._client._logic._parse_document_response(doc) for doc in response["documents"] + ] + for doc in docs: + doc._client = self._client + return docs + finally: + # Clean up file objects + for _, (_, file_obj) in file_objects: + if isinstance(file_obj, (IOBase, BytesIO)) and not file_obj.closed: + file_obj.close() + + def ingest_directory( + self, + directory: Union[str, Path], + recursive: bool = False, + pattern: str = "*", + metadata: Optional[Dict[str, Any]] = None, + rules: Optional[List[RuleOrDict]] = None, + use_colpali: bool = True, + parallel: bool = True, + ) -> List[Document]: + """ + Ingest all files in a directory into Morphik within this folder. 
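+
+        Example (illustrative sketch; the directory path and glob pattern are placeholders):
+            ```python
+            folder = db.get_folder("project_x")
+            docs = folder.ingest_directory("data/reports", recursive=True, pattern="*.pdf")
+            print(f"Ingested {len(docs)} documents into {folder.name}")
+            ```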
+ + Args: + directory: Path to directory containing files to ingest + recursive: Whether to recursively process subdirectories + pattern: Optional glob pattern to filter files + metadata: Optional metadata dictionary to apply to all files + rules: Optional list of rules to apply + use_colpali: Whether to use ColPali-style embedding + parallel: Whether to process files in parallel + + Returns: + List[Document]: List of ingested documents + """ + directory = Path(directory) + if not directory.is_dir(): + raise ValueError(f"Directory not found: {directory}") + + # Collect all files matching pattern + if recursive: + files = list(directory.rglob(pattern)) + else: + files = list(directory.glob(pattern)) + + # Filter out directories + files = [f for f in files if f.is_file()] + + if not files: + return [] + + # Use ingest_files with collected paths + return self.ingest_files( + files=files, metadata=metadata, rules=rules, use_colpali=use_colpali, parallel=parallel + ) + + def retrieve_chunks( + self, + query: str, + filters: Optional[Dict[str, Any]] = None, + k: int = 4, + min_score: float = 0.0, + use_colpali: bool = True, + ) -> List[FinalChunkResult]: + """ + Retrieve relevant chunks within this folder. + + Args: + query: Search query text + filters: Optional metadata filters + k: Number of results (default: 4) + min_score: Minimum similarity threshold (default: 0.0) + use_colpali: Whether to use ColPali-style embedding model + + Returns: + List[FinalChunkResult]: List of relevant chunks + """ + request = { + "query": query, + "filters": filters, + "k": k, + "min_score": min_score, + "use_colpali": use_colpali, + "folder_name": self._name, # Add folder name here + } + + response = self._client._request("POST", "retrieve/chunks", request) + return self._client._logic._parse_chunk_result_list_response(response) + + def retrieve_docs( + self, + query: str, + filters: Optional[Dict[str, Any]] = None, + k: int = 4, + min_score: float = 0.0, + use_colpali: bool = True, + ) -> List[DocumentResult]: + """ + Retrieve relevant documents within this folder. + + Args: + query: Search query text + filters: Optional metadata filters + k: Number of results (default: 4) + min_score: Minimum similarity threshold (default: 0.0) + use_colpali: Whether to use ColPali-style embedding model + + Returns: + List[DocumentResult]: List of relevant documents + """ + request = { + "query": query, + "filters": filters, + "k": k, + "min_score": min_score, + "use_colpali": use_colpali, + "folder_name": self._name, # Add folder name here + } + + response = self._client._request("POST", "retrieve/docs", request) + return self._client._logic._parse_document_result_list_response(response) + + def query( + self, + query: str, + filters: Optional[Dict[str, Any]] = None, + k: int = 4, + min_score: float = 0.0, + max_tokens: Optional[int] = None, + temperature: Optional[float] = None, + use_colpali: bool = True, + graph_name: Optional[str] = None, + hop_depth: int = 1, + include_paths: bool = False, + prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None, + ) -> CompletionResponse: + """ + Generate completion using relevant chunks as context within this folder. 
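+
+        Example (illustrative sketch; the query text and filter values are placeholders):
+            ```python
+            folder = db.get_folder("project_x")
+            response = folder.query(
+                "What are the key findings?",
+                filters={"category": "research"},
+                k=4,
+            )
+            print(response.completion)
+            ```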
+ + Args: + query: Query text + filters: Optional metadata filters + k: Number of chunks to use as context (default: 4) + min_score: Minimum similarity threshold (default: 0.0) + max_tokens: Maximum tokens in completion + temperature: Model temperature + use_colpali: Whether to use ColPali-style embedding model + graph_name: Optional name of the graph to use for knowledge graph-enhanced retrieval + hop_depth: Number of relationship hops to traverse in the graph (1-3) + include_paths: Whether to include relationship paths in the response + prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts + + Returns: + CompletionResponse: Generated completion + """ + payload = self._client._logic._prepare_query_request( + query, + filters, + k, + min_score, + max_tokens, + temperature, + use_colpali, + graph_name, + hop_depth, + include_paths, + prompt_overrides, + self._name, + None, + ) + response = self._client._request("POST", "query", data=payload) + return self._client._logic._parse_completion_response(response) + + def list_documents( + self, skip: int = 0, limit: int = 100, filters: Optional[Dict[str, Any]] = None + ) -> List[Document]: + """ + List accessible documents within this folder. + + Args: + skip: Number of documents to skip + limit: Maximum number of documents to return + filters: Optional filters + + Returns: + List[Document]: List of documents + """ + params, data = self._client._logic._prepare_list_documents_request( + skip, limit, filters, self._name, None + ) + response = self._client._request("POST", "documents", data=data, params=params) + docs = self._client._logic._parse_document_list_response(response) + for doc in docs: + doc._client = self._client + return docs + + def batch_get_documents(self, document_ids: List[str]) -> List[Document]: + """ + Retrieve multiple documents by their IDs in a single batch operation within this folder. + + Args: + document_ids: List of document IDs to retrieve + + Returns: + List[Document]: List of document metadata for found documents + """ + request = {"document_ids": document_ids, "folder_name": self._name} + + response = self._client._request("POST", "batch/documents", data=request) + docs = [self._client._logic._parse_document_response(doc) for doc in response] + for doc in docs: + doc._client = self._client + return docs + + def batch_get_chunks( + self, sources: List[Union[ChunkSource, Dict[str, Any]]] + ) -> List[FinalChunkResult]: + """ + Retrieve specific chunks by their document ID and chunk number in a single batch operation within this folder. + + Args: + sources: List of ChunkSource objects or dictionaries with document_id and chunk_number + + Returns: + List[FinalChunkResult]: List of chunk results + """ + # Convert to list of dictionaries if needed + source_dicts = [] + for source in sources: + if isinstance(source, dict): + source_dicts.append(source) + else: + source_dicts.append(source.model_dump()) + + # Add folder_name to request + request = {"sources": source_dicts, "folder_name": self._name} + + response = self._client._request("POST", "batch/chunks", data=request) + return self._client._logic._parse_chunk_result_list_response(response) + + def create_graph( + self, + name: str, + filters: Optional[Dict[str, Any]] = None, + documents: Optional[List[str]] = None, + prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None, + ) -> Graph: + """ + Create a graph from documents within this folder. 
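+
+        Example (illustrative sketch; the graph name and filters are placeholders):
+            ```python
+            folder = db.get_folder("project_x")
+            graph = folder.create_graph("project_graph", filters={"category": "research"})
+            print(f"Created graph with {len(graph.entities)} entities")
+            ```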
+ + Args: + name: Name of the graph to create + filters: Optional metadata filters to determine which documents to include + documents: Optional list of specific document IDs to include + prompt_overrides: Optional customizations for entity extraction and resolution prompts + + Returns: + Graph: The created graph object + """ + # Convert prompt_overrides to dict if it's a model + if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides): + prompt_overrides = prompt_overrides.model_dump(exclude_none=True) + + request = { + "name": name, + "filters": filters, + "documents": documents, + "prompt_overrides": prompt_overrides, + "folder_name": self._name, # Add folder name here + } + + response = self._client._request("POST", "graph/create", request) + return self._client._logic._parse_graph_response(response) + + def update_graph( + self, + name: str, + additional_filters: Optional[Dict[str, Any]] = None, + additional_documents: Optional[List[str]] = None, + prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None, + ) -> Graph: + """ + Update an existing graph with new documents from this folder. + + Args: + name: Name of the graph to update + additional_filters: Optional additional metadata filters to determine which new documents to include + additional_documents: Optional list of additional document IDs to include + prompt_overrides: Optional customizations for entity extraction and resolution prompts + + Returns: + Graph: The updated graph + """ + # Convert prompt_overrides to dict if it's a model + if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides): + prompt_overrides = prompt_overrides.model_dump(exclude_none=True) + + request = { + "additional_filters": additional_filters, + "additional_documents": additional_documents, + "prompt_overrides": prompt_overrides, + "folder_name": self._name, # Add folder name here + } + + response = self._client._request("POST", f"graph/{name}/update", request) + return self._client._logic._parse_graph_response(response) + + def delete_document_by_filename(self, filename: str) -> Dict[str, str]: + """ + Delete a document by its filename within this folder. + + Args: + filename: Filename of the document to delete + + Returns: + Dict[str, str]: Deletion status + """ + # Get the document by filename with folder scope + request = {"filename": filename, "folder_name": self._name} + + # First get the document ID + response = self._client._request( + "GET", f"documents/filename/{filename}", params={"folder_name": self._name} + ) + doc = self._client._logic._parse_document_response(response) + + # Then delete by ID + return self._client.delete_document(doc.external_id) + + +class UserScope: + """ + A user scope that allows operations to be scoped to a specific end user and optionally a folder. 
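+
+    Example (illustrative sketch; the folder name and end user ID are placeholders):
+        ```python
+        # Scope operations to an end user, optionally within a folder
+        user_scope = db.signin("user@example.com")
+        folder_scope = db.create_folder("project_x").signin("user@example.com")
+        ```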
+ + Args: + client: The Morphik client instance + end_user_id: The ID of the end user + folder_name: Optional folder name to further scope operations + """ + + def __init__(self, client: "Morphik", end_user_id: str, folder_name: Optional[str] = None): + self._client = client + self._end_user_id = end_user_id + self._folder_name = folder_name + + @property + def end_user_id(self) -> str: + """Returns the end user ID.""" + return self._end_user_id + + @property + def folder_name(self) -> Optional[str]: + """Returns the folder name if any.""" + return self._folder_name + + def ingest_text( + self, + content: str, + filename: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + rules: Optional[List[RuleOrDict]] = None, + use_colpali: bool = True, + ) -> Document: + """ + Ingest a text document into Morphik as this end user. + + Args: + content: Text content to ingest + filename: Optional file name + metadata: Optional metadata dictionary + rules: Optional list of rules to apply during ingestion + use_colpali: Whether to use ColPali-style embedding model + + Returns: + Document: Metadata of the ingested document + """ + rules_list = [self._client._convert_rule(r) for r in (rules or [])] + payload = self._client._logic._prepare_ingest_text_request( + content, + filename, + metadata, + rules_list, + use_colpali, + self._folder_name, + self._end_user_id, + ) + response = self._client._request("POST", "ingest/text", data=payload) + doc = self._client._logic._parse_document_response(response) + doc._client = self._client + return doc + + def ingest_file( + self, + file: Union[str, bytes, BinaryIO, Path], + filename: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + rules: Optional[List[RuleOrDict]] = None, + use_colpali: bool = True, + ) -> Document: + """ + Ingest a file document into Morphik as this end user. 
+ + Args: + file: File to ingest (path string, bytes, file object, or Path) + filename: Name of the file + metadata: Optional metadata dictionary + rules: Optional list of rules to apply during ingestion + use_colpali: Whether to use ColPali-style embedding model + + Returns: + Document: Metadata of the ingested document + """ + # Handle different file input types + if isinstance(file, (str, Path)): + file_path = Path(file) + if not file_path.exists(): + raise ValueError(f"File not found: {file}") + filename = file_path.name if filename is None else filename + with open(file_path, "rb") as f: + content = f.read() + file_obj = BytesIO(content) + elif isinstance(file, bytes): + if filename is None: + raise ValueError("filename is required when ingesting bytes") + file_obj = BytesIO(file) + else: + if filename is None: + raise ValueError("filename is required when ingesting file object") + file_obj = file + + try: + # Prepare multipart form data + files = {"file": (filename, file_obj)} + + # Add metadata and rules + form_data = { + "metadata": json.dumps(metadata or {}), + "rules": json.dumps([self._client._convert_rule(r) for r in (rules or [])]), + "end_user_id": self._end_user_id, # Add end user ID here + } + + # Add folder name if scoped to a folder + if self._folder_name: + form_data["folder_name"] = self._folder_name + + response = self._client._request( + "POST", + f"ingest/file?use_colpali={str(use_colpali).lower()}", + data=form_data, + files=files, + ) + doc = self._client._logic._parse_document_response(response) + doc._client = self._client + return doc + finally: + # Close file if we opened it + if isinstance(file, (str, Path)): + file_obj.close() + + def ingest_files( + self, + files: List[Union[str, bytes, BinaryIO, Path]], + metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, + rules: Optional[List[RuleOrDict]] = None, + use_colpali: bool = True, + parallel: bool = True, + ) -> List[Document]: + """ + Ingest multiple files into Morphik as this end user. 
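+
+        Example (illustrative sketch; file paths and metadata are placeholders):
+            ```python
+            user_scope = db.signin("user@example.com")
+            docs = user_scope.ingest_files(
+                ["notes.txt", "report.pdf"],
+                metadata={"project": "demo"},
+            )
+            ```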
+ + Args: + files: List of files to ingest + metadata: Optional metadata + rules: Optional list of rules to apply + use_colpali: Whether to use ColPali-style embedding + parallel: Whether to process files in parallel + + Returns: + List[Document]: List of ingested documents + """ + # Convert files to format expected by API + file_objects = [] + for file in files: + if isinstance(file, (str, Path)): + path = Path(file) + file_objects.append(("files", (path.name, open(path, "rb")))) + elif isinstance(file, bytes): + file_objects.append(("files", ("file.bin", file))) + else: + file_objects.append(("files", (getattr(file, "name", "file.bin"), file))) + + try: + # Prepare request data + # Convert rules appropriately + if rules: + if all(isinstance(r, list) for r in rules): + # List of lists - per-file rules + converted_rules = [ + [self._client._convert_rule(r) for r in rule_list] for rule_list in rules + ] + else: + # Flat list - shared rules for all files + converted_rules = [self._client._convert_rule(r) for r in rules] + else: + converted_rules = [] + + data = { + "metadata": json.dumps(metadata or {}), + "rules": json.dumps(converted_rules), + "use_colpali": str(use_colpali).lower() if use_colpali is not None else None, + "parallel": str(parallel).lower(), + "end_user_id": self._end_user_id, # Add end user ID here + } + + # Add folder name if scoped to a folder + if self._folder_name: + data["folder_name"] = self._folder_name + + response = self._client._request("POST", "ingest/files", data=data, files=file_objects) + + if response.get("errors"): + # Log errors but don't raise exception + for error in response["errors"]: + logger.error(f"Failed to ingest {error['filename']}: {error['error']}") + + docs = [ + self._client._logic._parse_document_response(doc) for doc in response["documents"] + ] + for doc in docs: + doc._client = self._client + return docs + finally: + # Clean up file objects + for _, (_, file_obj) in file_objects: + if isinstance(file_obj, (IOBase, BytesIO)) and not file_obj.closed: + file_obj.close() + + def ingest_directory( + self, + directory: Union[str, Path], + recursive: bool = False, + pattern: str = "*", + metadata: Optional[Dict[str, Any]] = None, + rules: Optional[List[RuleOrDict]] = None, + use_colpali: bool = True, + parallel: bool = True, + ) -> List[Document]: + """ + Ingest all files in a directory into Morphik as this end user. 
+ + Args: + directory: Path to directory containing files to ingest + recursive: Whether to recursively process subdirectories + pattern: Optional glob pattern to filter files + metadata: Optional metadata dictionary to apply to all files + rules: Optional list of rules to apply + use_colpali: Whether to use ColPali-style embedding + parallel: Whether to process files in parallel + + Returns: + List[Document]: List of ingested documents + """ + directory = Path(directory) + if not directory.is_dir(): + raise ValueError(f"Directory not found: {directory}") + + # Collect all files matching pattern + if recursive: + files = list(directory.rglob(pattern)) + else: + files = list(directory.glob(pattern)) + + # Filter out directories + files = [f for f in files if f.is_file()] + + if not files: + return [] + + # Use ingest_files with collected paths + return self.ingest_files( + files=files, metadata=metadata, rules=rules, use_colpali=use_colpali, parallel=parallel + ) + + def retrieve_chunks( + self, + query: str, + filters: Optional[Dict[str, Any]] = None, + k: int = 4, + min_score: float = 0.0, + use_colpali: bool = True, + ) -> List[FinalChunkResult]: + """ + Retrieve relevant chunks as this end user. + + Args: + query: Search query text + filters: Optional metadata filters + k: Number of results (default: 4) + min_score: Minimum similarity threshold (default: 0.0) + use_colpali: Whether to use ColPali-style embedding model + + Returns: + List[FinalChunkResult]: List of relevant chunks + """ + request = { + "query": query, + "filters": filters, + "k": k, + "min_score": min_score, + "use_colpali": use_colpali, + "end_user_id": self._end_user_id, # Add end user ID here + } + + # Add folder name if scoped to a folder + if self._folder_name: + request["folder_name"] = self._folder_name + + response = self._client._request("POST", "retrieve/chunks", request) + return self._client._logic._parse_chunk_result_list_response(response) + + def retrieve_docs( + self, + query: str, + filters: Optional[Dict[str, Any]] = None, + k: int = 4, + min_score: float = 0.0, + use_colpali: bool = True, + ) -> List[DocumentResult]: + """ + Retrieve relevant documents as this end user. + + Args: + query: Search query text + filters: Optional metadata filters + k: Number of results (default: 4) + min_score: Minimum similarity threshold (default: 0.0) + use_colpali: Whether to use ColPali-style embedding model + + Returns: + List[DocumentResult]: List of relevant documents + """ + request = { + "query": query, + "filters": filters, + "k": k, + "min_score": min_score, + "use_colpali": use_colpali, + "end_user_id": self._end_user_id, # Add end user ID here + } + + # Add folder name if scoped to a folder + if self._folder_name: + request["folder_name"] = self._folder_name + + response = self._client._request("POST", "retrieve/docs", request) + return self._client._logic._parse_document_result_list_response(response) + + def query( + self, + query: str, + filters: Optional[Dict[str, Any]] = None, + k: int = 4, + min_score: float = 0.0, + max_tokens: Optional[int] = None, + temperature: Optional[float] = None, + use_colpali: bool = True, + graph_name: Optional[str] = None, + hop_depth: int = 1, + include_paths: bool = False, + prompt_overrides: Optional[Union[QueryPromptOverrides, Dict[str, Any]]] = None, + ) -> CompletionResponse: + """ + Generate completion using relevant chunks as context as this end user. 
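+
+        Example (illustrative sketch; the query text is a placeholder):
+            ```python
+            user_scope = db.signin("user@example.com")
+            response = user_scope.query(
+                "Summarize this user's documents",
+                k=4,
+            )
+            print(response.completion)
+            ```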
+ + Args: + query: Query text + filters: Optional metadata filters + k: Number of chunks to use as context (default: 4) + min_score: Minimum similarity threshold (default: 0.0) + max_tokens: Maximum tokens in completion + temperature: Model temperature + use_colpali: Whether to use ColPali-style embedding model + graph_name: Optional name of the graph to use for knowledge graph-enhanced retrieval + hop_depth: Number of relationship hops to traverse in the graph (1-3) + include_paths: Whether to include relationship paths in the response + prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts + + Returns: + CompletionResponse: Generated completion + """ + payload = self._client._logic._prepare_query_request( + query, + filters, + k, + min_score, + max_tokens, + temperature, + use_colpali, + graph_name, + hop_depth, + include_paths, + prompt_overrides, + self._folder_name, + self._end_user_id, + ) + response = self._client._request("POST", "query", data=payload) + return self._client._logic._parse_completion_response(response) + + def list_documents( + self, skip: int = 0, limit: int = 100, filters: Optional[Dict[str, Any]] = None + ) -> List[Document]: + """ + List accessible documents for this end user. + + Args: + skip: Number of documents to skip + limit: Maximum number of documents to return + filters: Optional filters + + Returns: + List[Document]: List of documents + """ + # Add end_user_id and folder_name to params + params = {"skip": skip, "limit": limit, "end_user_id": self._end_user_id} + + # Add folder name if scoped to a folder + if self._folder_name: + params["folder_name"] = self._folder_name + + response = self._client._request("POST", f"documents", data=filters or {}, params=params) + + docs = [self._client._logic._parse_document_response(doc) for doc in response] + for doc in docs: + doc._client = self._client + return docs + + def batch_get_documents(self, document_ids: List[str]) -> List[Document]: + """ + Retrieve multiple documents by their IDs in a single batch operation for this end user. + + Args: + document_ids: List of document IDs to retrieve + + Returns: + List[Document]: List of document metadata for found documents + """ + request = {"document_ids": document_ids, "end_user_id": self._end_user_id} + + # Add folder name if scoped to a folder + if self._folder_name: + request["folder_name"] = self._folder_name + + response = self._client._request("POST", "batch/documents", data=request) + docs = [self._client._logic._parse_document_response(doc) for doc in response] + for doc in docs: + doc._client = self._client + return docs + + def batch_get_chunks( + self, sources: List[Union[ChunkSource, Dict[str, Any]]] + ) -> List[FinalChunkResult]: + """ + Retrieve specific chunks by their document ID and chunk number in a single batch operation for this end user. 
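+
+        Example (illustrative sketch; the document ID and chunk number are placeholders):
+            ```python
+            user_scope = db.signin("user@example.com")
+            chunks = user_scope.batch_get_chunks(
+                [{"document_id": "doc_123", "chunk_number": 0}]
+            )
+            ```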
+ + Args: + sources: List of ChunkSource objects or dictionaries with document_id and chunk_number + + Returns: + List[FinalChunkResult]: List of chunk results + """ + # Convert to list of dictionaries if needed + source_dicts = [] + for source in sources: + if isinstance(source, dict): + source_dicts.append(source) + else: + source_dicts.append(source.model_dump()) + + # Add end_user_id and folder_name to request + request = {"sources": source_dicts, "end_user_id": self._end_user_id} + + # Add folder name if scoped to a folder + if self._folder_name: + request["folder_name"] = self._folder_name + + response = self._client._request("POST", "batch/chunks", data=request) + return self._client._logic._parse_chunk_result_list_response(response) + + def create_graph( + self, + name: str, + filters: Optional[Dict[str, Any]] = None, + documents: Optional[List[str]] = None, + prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None, + ) -> Graph: + """ + Create a graph from documents for this end user. + + Args: + name: Name of the graph to create + filters: Optional metadata filters to determine which documents to include + documents: Optional list of specific document IDs to include + prompt_overrides: Optional customizations for entity extraction and resolution prompts + + Returns: + Graph: The created graph object + """ + # Convert prompt_overrides to dict if it's a model + if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides): + prompt_overrides = prompt_overrides.model_dump(exclude_none=True) + + request = { + "name": name, + "filters": filters, + "documents": documents, + "prompt_overrides": prompt_overrides, + "end_user_id": self._end_user_id, # Add end user ID here + } + + # Add folder name if scoped to a folder + if self._folder_name: + request["folder_name"] = self._folder_name + + response = self._client._request("POST", "graph/create", request) + return self._client._logic._parse_graph_response(response) + + def update_graph( + self, + name: str, + additional_filters: Optional[Dict[str, Any]] = None, + additional_documents: Optional[List[str]] = None, + prompt_overrides: Optional[Union[GraphPromptOverrides, Dict[str, Any]]] = None, + ) -> Graph: + """ + Update an existing graph with new documents for this end user. + + Args: + name: Name of the graph to update + additional_filters: Optional additional metadata filters to determine which new documents to include + additional_documents: Optional list of additional document IDs to include + prompt_overrides: Optional customizations for entity extraction and resolution prompts + + Returns: + Graph: The updated graph + """ + # Convert prompt_overrides to dict if it's a model + if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides): + prompt_overrides = prompt_overrides.model_dump(exclude_none=True) + + request = { + "additional_filters": additional_filters, + "additional_documents": additional_documents, + "prompt_overrides": prompt_overrides, + "end_user_id": self._end_user_id, # Add end user ID here + } + + # Add folder name if scoped to a folder + if self._folder_name: + request["folder_name"] = self._folder_name + + response = self._client._request("POST", f"graph/{name}/update", request) + return self._client._logic._parse_graph_response(response) + + def delete_document_by_filename(self, filename: str) -> Dict[str, str]: + """ + Delete a document by its filename for this end user. 
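+
+        Example (illustrative sketch; the filename is a placeholder):
+            ```python
+            user_scope = db.signin("user@example.com")
+            result = user_scope.delete_document_by_filename("old_report.pdf")
+            print(result["message"])
+            ```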
+ + Args: + filename: Filename of the document to delete + + Returns: + Dict[str, str]: Deletion status + """ + # Build parameters for the filename lookup + params = {"end_user_id": self._end_user_id} + + # Add folder name if scoped to a folder + if self._folder_name: + params["folder_name"] = self._folder_name + + # First get the document ID + response = self._client._request("GET", f"documents/filename/{filename}", params=params) + doc = self._client._logic._parse_document_response(response) + + # Then delete by ID + return self._client.delete_document(doc.external_id) class Morphik: @@ -98,33 +1106,8 @@ class Morphik: """ def __init__(self, uri: Optional[str] = None, timeout: int = 30, is_local: bool = False): - self._timeout = timeout - self._session = requests.Session() - if is_local: - self._session.verify = False # Disable SSL for localhost - self._is_local = is_local - - if uri: - self._setup_auth(uri) - else: - self._base_url = "http://localhost:8000" - self._auth_token = None - - def _setup_auth(self, uri: str) -> None: - """Setup authentication from URI""" - parsed = urlparse(uri) - if not parsed.netloc: - raise ValueError("Invalid URI format") - - # Split host and auth parts - auth, host = parsed.netloc.split("@") - _, self._auth_token = auth.split(":") - - # Set base URL - self._base_url = f"{'http' if self._is_local else 'https'}://{host}" - - # Basic token validation - jwt.decode(self._auth_token, options={"verify_signature": False}) + self._logic = _MorphikClientLogic(uri, timeout, is_local) + self._client = httpx.Client(timeout=self._logic._timeout, verify=not self._logic._is_local) def _request( self, @@ -135,25 +1118,25 @@ class Morphik: params: Optional[Dict[str, Any]] = None, ) -> Dict[str, Any]: """Make HTTP request""" - headers = {} - if self._auth_token: # Only add auth header if we have a token - headers["Authorization"] = f"Bearer {self._auth_token}" + url = self._logic._get_url(endpoint) + headers = self._logic._get_headers() + if self._logic._auth_token: # Only add auth header if we have a token + headers["Authorization"] = f"Bearer {self._logic._auth_token}" # Configure request data based on type if files: # Multipart form data for files request_data = {"files": files, "data": data} - # Don't set Content-Type, let requests handle it + # Don't set Content-Type, let httpx handle it else: # JSON for everything else headers["Content-Type"] = "application/json" request_data = {"json": data} - response = self._session.request( + response = self._client.request( method, - f"{self._base_url}/{endpoint.lstrip('/')}", + url, headers=headers, - timeout=self._timeout, params=params, **request_data, ) @@ -162,9 +1145,43 @@ class Morphik: def _convert_rule(self, rule: RuleOrDict) -> Dict[str, Any]: """Convert a rule to a dictionary format""" - if hasattr(rule, "to_dict"): - return rule.to_dict() - return rule + return self._logic._convert_rule(rule) + + def create_folder(self, name: str) -> Folder: + """ + Create a folder to scope operations. + + Args: + name: The name of the folder + + Returns: + Folder: A folder object for scoped operations + """ + return Folder(self, name) + + def get_folder(self, name: str) -> Folder: + """ + Get a folder by name to scope operations. + + Args: + name: The name of the folder + + Returns: + Folder: A folder object for scoped operations + """ + return Folder(self, name) + + def signin(self, end_user_id: str) -> UserScope: + """ + Sign in as an end user to scope operations. 
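+
+        Example (illustrative sketch; the end user ID and content are placeholders):
+            ```python
+            user_scope = db.signin("user@example.com")
+            doc = user_scope.ingest_text("Private note", filename="note.txt")
+            ```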
+ + Args: + end_user_id: The ID of the end user + + Returns: + UserScope: A user scope object for scoped operations + """ + return UserScope(self, end_user_id) def ingest_text( self, @@ -209,15 +1226,12 @@ class Morphik: ) ``` """ - request = IngestTextRequest( - content=content, - filename=filename, - metadata=metadata or {}, - rules=[self._convert_rule(r) for r in (rules or [])], - use_colpali=use_colpali, + rules_list = [self._convert_rule(r) for r in (rules or [])] + payload = self._logic._prepare_ingest_text_request( + content, filename, metadata, rules_list, use_colpali, None, None ) - response = self._request("POST", "ingest/text", data=request.model_dump()) - doc = Document(**response) + response = self._request("POST", "ingest/text", data=payload) + doc = self._logic._parse_document_response(response) doc._client = self return doc @@ -266,38 +1280,23 @@ class Morphik: ) ``` """ - # Handle different file input types - if isinstance(file, (str, Path)): - file_path = Path(file) - if not file_path.exists(): - raise ValueError(f"File not found: {file}") - filename = file_path.name if filename is None else filename - with open(file_path, "rb") as f: - content = f.read() - file_obj = BytesIO(content) - elif isinstance(file, bytes): - if filename is None: - raise ValueError("filename is required when ingesting bytes") - file_obj = BytesIO(file) - else: - if filename is None: - raise ValueError("filename is required when ingesting file object") - file_obj = file + # Process file input + file_obj, filename = self._logic._prepare_file_for_upload(file, filename) try: # Prepare multipart form data files = {"file": (filename, file_obj)} - # Add metadata and rules - form_data = { - "metadata": json.dumps(metadata or {}), - "rules": json.dumps([self._convert_rule(r) for r in (rules or [])]), - } + # Create form data + form_data = self._logic._prepare_ingest_file_form_data(metadata, rules, None, None) response = self._request( - "POST", f"ingest/file?use_colpali={str(use_colpali).lower()}", data=form_data, files=files + "POST", + f"ingest/file?use_colpali={str(use_colpali).lower()}", + data=form_data, + files=files, ) - doc = Document(**response) + doc = self._logic._parse_document_response(response) doc._client = self return doc finally: @@ -330,44 +1329,22 @@ class Morphik: ValueError: If metadata list length doesn't match files length """ # Convert files to format expected by API - file_objects = [] - for file in files: - if isinstance(file, (str, Path)): - path = Path(file) - file_objects.append(("files", (path.name, open(path, "rb")))) - elif isinstance(file, bytes): - file_objects.append(("files", ("file.bin", file))) - else: - file_objects.append(("files", (getattr(file, "name", "file.bin"), file))) + file_objects = self._logic._prepare_files_for_upload(files) try: - # Prepare request data - # Convert rules appropriately based on whether it's a flat list or list of lists - if rules: - if all(isinstance(r, list) for r in rules): - # List of lists - per-file rules - converted_rules = [[self._convert_rule(r) for r in rule_list] for rule_list in rules] - else: - # Flat list - shared rules for all files - converted_rules = [self._convert_rule(r) for r in rules] - else: - converted_rules = [] - - data = { - "metadata": json.dumps(metadata or {}), - "rules": json.dumps(converted_rules), - "use_colpali": str(use_colpali).lower() if use_colpali is not None else None, - "parallel": str(parallel).lower(), - } + # Prepare form data + data = self._logic._prepare_ingest_files_form_data( + metadata, 
rules, use_colpali, parallel, None, None + ) response = self._request("POST", "ingest/files", data=data, files=file_objects) - + if response.get("errors"): # Log errors but don't raise exception for error in response["errors"]: logger.error(f"Failed to ingest {error['filename']}: {error['error']}") - - docs = [Document(**doc) for doc in response["documents"]] + + docs = [self._logic._parse_document_response(doc) for doc in response["documents"]] for doc in docs: doc._client = self return docs @@ -417,17 +1394,13 @@ class Morphik: # Filter out directories files = [f for f in files if f.is_file()] - + if not files: return [] # Use ingest_files with collected paths return self.ingest_files( - files=files, - metadata=metadata, - rules=rules, - use_colpali=use_colpali, - parallel=parallel + files=files, metadata=metadata, rules=rules, use_colpali=use_colpali, parallel=parallel ) def retrieve_chunks( @@ -458,52 +1431,11 @@ class Morphik: ) ``` """ - request = { - "query": query, - "filters": filters, - "k": k, - "min_score": min_score, - "use_colpali": use_colpali, - } - - response = self._request("POST", "retrieve/chunks", request) - chunks = [ChunkResult(**r) for r in response] - - final_chunks = [] - - for chunk in chunks: - if chunk.metadata.get("is_image"): - try: - # Handle data URI format "data:image/png;base64,..." - content = chunk.content - if content.startswith("data:"): - # Extract the base64 part after the comma - content = content.split(",", 1)[1] - - # Now decode the base64 string - image_bytes = base64.b64decode(content) - content = Image.open(io.BytesIO(image_bytes)) - except Exception as e: - print(f"Error processing image: {str(e)}") - # Fall back to using the content as text - print(chunk.content) - else: - content = chunk.content - - final_chunks.append( - FinalChunkResult( - content=content, - score=chunk.score, - document_id=chunk.document_id, - chunk_number=chunk.chunk_number, - metadata=chunk.metadata, - content_type=chunk.content_type, - filename=chunk.filename, - download_url=chunk.download_url, - ) - ) - - return final_chunks + payload = self._logic._prepare_retrieve_chunks_request( + query, filters, k, min_score, use_colpali, None, None + ) + response = self._request("POST", "retrieve/chunks", data=payload) + return self._logic._parse_chunk_result_list_response(response) def retrieve_docs( self, @@ -533,16 +1465,11 @@ class Morphik: ) ``` """ - request = { - "query": query, - "filters": filters, - "k": k, - "min_score": min_score, - "use_colpali": use_colpali, - } - - response = self._request("POST", "retrieve/docs", request) - return [DocumentResult(**r) for r in response] + payload = self._logic._prepare_retrieve_docs_request( + query, filters, k, min_score, use_colpali, None, None + ) + response = self._request("POST", "retrieve/docs", data=payload) + return self._logic._parse_document_result_list_response(response) def query( self, @@ -585,7 +1512,7 @@ class Morphik: filters={"department": "research"}, temperature=0.7 ) - + # Knowledge graph enhanced query response = db.query( "How does product X relate to customer segment Y?", @@ -593,7 +1520,7 @@ class Morphik: hop_depth=2, include_paths=True ) - + # With prompt customization from morphik.models import QueryPromptOverride, QueryPromptOverrides response = db.query( @@ -604,7 +1531,7 @@ class Morphik: ) ) ) - + # Or using a dictionary response = db.query( "What are the key findings?", @@ -614,35 +1541,32 @@ class Morphik: } } ) - + print(response.completion) - + # If include_paths=True, you can inspect the graph 
paths if response.metadata and "graph" in response.metadata: for path in response.metadata["graph"]["paths"]: print(" -> ".join(path)) ``` """ - # Convert prompt_overrides to dict if it's a model - if prompt_overrides and isinstance(prompt_overrides, QueryPromptOverrides): - prompt_overrides = prompt_overrides.model_dump(exclude_none=True) - - request = { - "query": query, - "filters": filters, - "k": k, - "min_score": min_score, - "max_tokens": max_tokens, - "temperature": temperature, - "use_colpali": use_colpali, - "graph_name": graph_name, - "hop_depth": hop_depth, - "include_paths": include_paths, - "prompt_overrides": prompt_overrides, - } - - response = self._request("POST", "query", request) - return CompletionResponse(**response) + payload = self._logic._prepare_query_request( + query, + filters, + k, + min_score, + max_tokens, + temperature, + use_colpali, + graph_name, + hop_depth, + include_paths, + prompt_overrides, + None, + None, + ) + response = self._request("POST", "query", data=payload) + return self._logic._parse_completion_response(response) def list_documents( self, skip: int = 0, limit: int = 100, filters: Optional[Dict[str, Any]] = None @@ -667,9 +1591,9 @@ class Morphik: next_page = db.list_documents(skip=10, limit=10, filters={"department": "research"}) ``` """ - # Use query params for pagination and POST body for filters - response = self._request("POST", f"documents?skip={skip}&limit={limit}", data=filters or {}) - docs = [Document(**doc) for doc in response] + params, data = self._logic._prepare_list_documents_request(skip, limit, filters, None, None) + response = self._request("POST", "documents", data=data, params=params) + docs = self._logic._parse_document_list_response(response) for doc in docs: doc._client = self return docs @@ -691,10 +1615,10 @@ class Morphik: ``` """ response = self._request("GET", f"documents/{document_id}") - doc = Document(**response) + doc = self._logic._parse_document_response(response) doc._client = self return doc - + def get_document_by_filename(self, filename: str) -> Document: """ Get document metadata by filename. 
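The create_folder, get_folder, and signin helpers above are the entry points for the new scoping API. A minimal sketch of how they compose, assuming the returned Folder and UserScope objects expose the same ingest/query helpers as the client (the scoped delete_document_by_filename earlier in this diff follows that pattern); the URI and names below are placeholders:

```python
from morphik import Morphik

db = Morphik("morphik://owner_id:token@api.morphik.ai")  # placeholder URI

# Folder scope: operations made through `folder` are confined to "project-x"
folder = db.create_folder("project-x")
folder_doc = folder.ingest_text("Q1 planning notes", filename="q1.txt")  # assumed Folder helper

# User scope: requests made through `user` carry the end user's ID
user = db.signin("user-123")
print(user.query("What did I upload about Q1?").completion)  # assumed UserScope helper

db.close()
```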
@@ -713,10 +1637,10 @@ class Morphik: ``` """ response = self._request("GET", f"documents/filename/{filename}") - doc = Document(**response) + doc = self._logic._parse_document_response(response) doc._client = self return doc - + def update_document_with_text( self, document_id: str, @@ -763,19 +1687,16 @@ class Morphik: rules=[self._convert_rule(r) for r in (rules or [])], use_colpali=use_colpali if use_colpali is not None else True, ) - + params = {} if update_strategy != "add": params["update_strategy"] = update_strategy - + response = self._request( - "POST", - f"documents/{document_id}/update_text", - data=request.model_dump(), - params=params + "POST", f"documents/{document_id}/update_text", data=request.model_dump(), params=params ) - - doc = Document(**response) + + doc = self._logic._parse_document_response(response) doc._client = self return doc @@ -833,34 +1754,34 @@ class Morphik: if filename is None: raise ValueError("filename is required when updating with file object") file_obj = file - + try: # Prepare multipart form data files = {"file": (filename, file_obj)} - + # Convert metadata and rules to JSON strings form_data = { "metadata": json.dumps(metadata or {}), "rules": json.dumps([self._convert_rule(r) for r in (rules or [])]), "update_strategy": update_strategy, } - + if use_colpali is not None: form_data["use_colpali"] = str(use_colpali).lower() - + # Use the dedicated file update endpoint response = self._request( "POST", f"documents/{document_id}/update_file", data=form_data, files=files ) - - doc = Document(**response) + + doc = self._logic._parse_document_response(response) doc._client = self return doc finally: # Close file if we opened it if isinstance(file, (str, Path)): file_obj.close() - + def update_document_metadata( self, document_id: str, @@ -868,14 +1789,14 @@ class Morphik: ) -> Document: """ Update a document's metadata only. 
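+        Only the metadata is sent (via the documents/{document_id}/update_metadata
+        endpoint), so the document's content does not need to be re-uploaded.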
- + Args: document_id: ID of the document to update metadata: Metadata to update - + Returns: Document: Updated document metadata - + Example: ```python # Update just the metadata of a document @@ -888,10 +1809,10 @@ class Morphik: """ # Use the dedicated metadata update endpoint response = self._request("POST", f"documents/{document_id}/update_metadata", data=metadata) - doc = Document(**response) + doc = self._logic._parse_document_response(response) doc._client = self return doc - + def update_document_by_filename_with_text( self, filename: str, @@ -932,7 +1853,7 @@ class Morphik: """ # First get the document by filename to obtain its ID doc = self.get_document_by_filename(filename) - + # Then use the regular update_document_with_text endpoint with the document ID return self.update_document_with_text( document_id=doc.external_id, @@ -941,9 +1862,9 @@ class Morphik: metadata=metadata, rules=rules, update_strategy=update_strategy, - use_colpali=use_colpali + use_colpali=use_colpali, ) - + def update_document_by_filename_with_file( self, filename: str, @@ -983,7 +1904,7 @@ class Morphik: """ # First get the document by filename to obtain its ID doc = self.get_document_by_filename(filename) - + # Then use the regular update_document_with_file endpoint with the document ID return self.update_document_with_file( document_id=doc.external_id, @@ -992,9 +1913,9 @@ class Morphik: metadata=metadata, rules=rules, update_strategy=update_strategy, - use_colpali=use_colpali + use_colpali=use_colpali, ) - + def update_document_by_filename_metadata( self, filename: str, @@ -1003,15 +1924,15 @@ class Morphik: ) -> Document: """ Update a document's metadata using filename to identify the document. - + Args: filename: Filename of the document to update metadata: Metadata to update new_filename: Optional new filename to assign to the document - + Returns: Document: Updated document metadata - + Example: ```python # Update just the metadata of a document identified by filename @@ -1025,44 +1946,44 @@ class Morphik: """ # First get the document by filename to obtain its ID doc = self.get_document_by_filename(filename) - + # Update the metadata result = self.update_document_metadata( document_id=doc.external_id, metadata=metadata, ) - + # If new_filename is provided, update the filename as well if new_filename: # Create a request that retains the just-updated metadata but also changes filename combined_metadata = result.metadata.copy() - + # Update the document again with filename change and the same metadata response = self._request( - "POST", - f"documents/{doc.external_id}/update_text", + "POST", + f"documents/{doc.external_id}/update_text", data={ - "content": "", + "content": "", "filename": new_filename, "metadata": combined_metadata, - "rules": [] - } + "rules": [], + }, ) - result = Document(**response) + result = self._logic._parse_document_response(response) result._client = self - + return result - + def batch_get_documents(self, document_ids: List[str]) -> List[Document]: """ Retrieve multiple documents by their IDs in a single batch operation. 
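+        All requested IDs are sent in a single batch/documents request, which is
+        cheaper than calling get_document once per document.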
- + Args: document_ids: List of document IDs to retrieve - + Returns: List[Document]: List of document metadata for found documents - + Example: ```python docs = db.batch_get_documents(["doc_123", "doc_456", "doc_789"]) @@ -1071,21 +1992,23 @@ class Morphik: ``` """ response = self._request("POST", "batch/documents", data=document_ids) - docs = [Document(**doc) for doc in response] + docs = self._logic._parse_document_list_response(response) for doc in docs: doc._client = self return docs - - def batch_get_chunks(self, sources: List[Union[ChunkSource, Dict[str, Any]]]) -> List[FinalChunkResult]: + + def batch_get_chunks( + self, sources: List[Union[ChunkSource, Dict[str, Any]]] + ) -> List[FinalChunkResult]: """ Retrieve specific chunks by their document ID and chunk number in a single batch operation. - + Args: sources: List of ChunkSource objects or dictionaries with document_id and chunk_number - + Returns: List[FinalChunkResult]: List of chunk results - + Example: ```python # Using dictionaries @@ -1093,14 +2016,14 @@ class Morphik: {"document_id": "doc_123", "chunk_number": 0}, {"document_id": "doc_456", "chunk_number": 2} ] - + # Or using ChunkSource objects from morphik.models import ChunkSource sources = [ ChunkSource(document_id="doc_123", chunk_number=0), ChunkSource(document_id="doc_456", chunk_number=2) ] - + chunks = db.batch_get_chunks(sources) for chunk in chunks: print(f"Chunk from {chunk.document_id}, number {chunk.chunk_number}: {chunk.content[:50]}...") @@ -1113,44 +2036,9 @@ class Morphik: source_dicts.append(source) else: source_dicts.append(source.model_dump()) - + response = self._request("POST", "batch/chunks", data=source_dicts) - chunks = [ChunkResult(**r) for r in response] - - final_chunks = [] - for chunk in chunks: - if chunk.metadata.get("is_image"): - try: - # Handle data URI format "data:image/png;base64,..." 
- content = chunk.content - if content.startswith("data:"): - # Extract the base64 part after the comma - content = content.split(",", 1)[1] - - # Now decode the base64 string - image_bytes = base64.b64decode(content) - content = Image.open(io.BytesIO(image_bytes)) - except Exception as e: - print(f"Error processing image: {str(e)}") - # Fall back to using the content as text - content = chunk.content - else: - content = chunk.content - - final_chunks.append( - FinalChunkResult( - content=content, - score=chunk.score, - document_id=chunk.document_id, - chunk_number=chunk.chunk_number, - metadata=chunk.metadata, - content_type=chunk.content_type, - filename=chunk.filename, - download_url=chunk.download_url, - ) - ) - - return final_chunks + return self._logic._parse_chunk_result_list_response(response) def create_cache( self, @@ -1252,11 +2140,11 @@ class Morphik: name="custom_graph", documents=["doc1", "doc2", "doc3"] ) - + # With custom entity extraction examples from morphik.models import EntityExtractionPromptOverride, EntityExtractionExample, GraphPromptOverrides graph = db.create_graph( - name="medical_graph", + name="medical_graph", filters={"category": "medical"}, prompt_overrides=GraphPromptOverrides( entity_extraction=EntityExtractionPromptOverride( @@ -1272,7 +2160,7 @@ class Morphik: # Convert prompt_overrides to dict if it's a model if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides): prompt_overrides = prompt_overrides.model_dump(exclude_none=True) - + request = { "name": name, "filters": filters, @@ -1281,8 +2169,8 @@ class Morphik: } response = self._request("POST", "graph/create", request) - return Graph(**response) - + return self._logic._parse_graph_response(response) + def get_graph(self, name: str) -> Graph: """ Get a graph by name. @@ -1301,7 +2189,7 @@ class Morphik: ``` """ response = self._request("GET", f"graph/{name}") - return Graph(**response) + return self._logic._parse_graph_response(response) def list_graphs(self) -> List[Graph]: """ @@ -1319,8 +2207,8 @@ class Morphik: ``` """ response = self._request("GET", "graphs") - return [Graph(**graph) for graph in response] - + return self._logic._parse_graph_list_response(response) + def update_graph( self, name: str, @@ -1330,20 +2218,20 @@ class Morphik: ) -> Graph: """ Update an existing graph with new documents. - + This method processes additional documents matching the original or new filters, extracts entities and relationships, and updates the graph with new information. 
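+        Entities and relationships found in the additional documents are merged
+        into the existing graph rather than replacing it.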
- + Args: name: Name of the graph to update additional_filters: Optional additional metadata filters to determine which new documents to include additional_documents: Optional list of additional document IDs to include prompt_overrides: Optional customizations for entity extraction and resolution prompts Either a GraphPromptOverrides object or a dictionary with the same structure - + Returns: Graph: The updated graph - + Example: ```python # Update a graph with new documents @@ -1353,7 +2241,7 @@ class Morphik: additional_documents=["doc4", "doc5"] ) print(f"Graph now has {len(updated_graph.entities)} entities") - + # With entity resolution examples from morphik.models import EntityResolutionPromptOverride, EntityResolutionExample, GraphPromptOverrides updated_graph = db.update_graph( @@ -1363,7 +2251,7 @@ class Morphik: entity_resolution=EntityResolutionPromptOverride( examples=[ EntityResolutionExample( - canonical="Machine Learning", + canonical="Machine Learning", variants=["ML", "machine learning", "AI/ML"] ) ] @@ -1375,7 +2263,7 @@ class Morphik: # Convert prompt_overrides to dict if it's a model if prompt_overrides and isinstance(prompt_overrides, GraphPromptOverrides): prompt_overrides = prompt_overrides.model_dump(exclude_none=True) - + request = { "additional_filters": additional_filters, "additional_documents": additional_documents, @@ -1383,23 +2271,23 @@ class Morphik: } response = self._request("POST", f"graph/{name}/update", request) - return Graph(**response) - + return self._logic._parse_graph_response(response) + def delete_document(self, document_id: str) -> Dict[str, str]: """ Delete a document and all its associated data. - + This method deletes a document and all its associated data, including: - Document metadata - Document content in storage - Document chunks and embeddings in vector store - + Args: document_id: ID of the document to delete - + Returns: Dict[str, str]: Deletion status - + Example: ```python # Delete a document @@ -1409,20 +2297,20 @@ class Morphik: """ response = self._request("DELETE", f"documents/{document_id}") return response - + def delete_document_by_filename(self, filename: str) -> Dict[str, str]: """ Delete a document by its filename. - + This is a convenience method that first retrieves the document ID by filename and then deletes the document by ID. - + Args: filename: Filename of the document to delete - + Returns: Dict[str, str]: Deletion status - + Example: ```python # Delete a document by filename @@ -1432,13 +2320,13 @@ class Morphik: """ # First get the document by filename to obtain its ID doc = self.get_document_by_filename(filename) - + # Then delete the document by ID return self.delete_document(doc.external_id) def close(self): - """Close the HTTP session""" - self._session.close() + """Close the HTTP client""" + self._client.close() def __enter__(self): return self diff --git a/sdks/python/pyproject.toml b/sdks/python/pyproject.toml index 25d0b2a..bd12d00 100644 --- a/sdks/python/pyproject.toml +++ b/sdks/python/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "morphik" -version = "0.1.0" +version = "0.1.2" authors = [ { name = "Morphik", email = "founders@morphik.ai" }, ]
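Taken together, the sync client keeps its public surface while request building moves into _MorphikClientLogic and transport moves to httpx. A short end-to-end sketch using only methods shown in this diff (URI, filename, and metadata values are placeholders):

```python
from morphik import Morphik

db = Morphik("morphik://owner_id:token@api.morphik.ai")  # placeholder URI
try:
    # Ingest a local file with queryable metadata
    doc = db.ingest_file("report.pdf", metadata={"department": "research"})
    print(f"Ingested {doc.external_id}")

    # Retrieve the most relevant chunks for a question
    for chunk in db.retrieve_chunks("What were the key findings?", k=3):
        print(chunk.document_id, chunk.score)

    # Ask for a completion over the same documents
    response = db.query("Summarize the key findings", filters={"department": "research"})
    print(response.completion)
finally:
    db.close()
```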