import asyncio
import base64
import json
import logging
import uuid
from datetime import UTC, datetime, timedelta
from pathlib import Path
from typing import Any, Dict, List, Optional
import arq
import jwt
import tomli
from fastapi import Depends, FastAPI, File, Form, Header, HTTPException, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from core.agent import MorphikAgent
from core.auth_utils import verify_token
from core.cache.llama_cache_factory import LlamaCacheFactory
from core.completion.litellm_completion import LiteLLMCompletionModel
from core.config import get_settings
from core.database.postgres_database import PostgresDatabase
from core.embedding.colpali_embedding_model import ColpaliEmbeddingModel
from core.embedding.litellm_embedding import LiteLLMEmbeddingModel
from core.limits_utils import check_and_increment_limits
from core.models.auth import AuthContext
from core.models.completion import ChunkSource, CompletionResponse
from core.models.documents import ChunkResult, Document, DocumentResult
from core.models.folders import Folder, FolderCreate
from core.models.graph import Graph
from core.models.prompts import validate_prompt_overrides_with_http_exception
from core.models.request import (
AgentQueryRequest,
BatchIngestResponse,
CompletionQueryRequest,
CreateGraphRequest,
GenerateUriRequest,
IngestTextRequest,
RetrieveRequest,
SetFolderRuleRequest,
UpdateGraphRequest,
)
from core.parser.morphik_parser import MorphikParser
from core.reranker.flag_reranker import FlagReranker
from core.services.document_service import DocumentService
from core.services.telemetry import TelemetryService
from core.storage.local_storage import LocalStorage
from core.storage.s3_storage import S3Storage
from core.vector_store.multi_vector_store import MultiVectorStore
from core.vector_store.pgvector_store import PGVectorStore
# Initialize FastAPI app
app = FastAPI(title="Morphik API")
logger = logging.getLogger(__name__)
# Add health check endpoints
@app.get("/health")
async def health_check():
"""Basic health check endpoint."""
return {"status": "healthy"}
@app.get("/health/ready")
async def readiness_check():
"""Readiness check that verifies the application is initialized."""
return {
"status": "ready",
"components": {
"database": settings.DATABASE_PROVIDER,
"vector_store": settings.VECTOR_STORE_PROVIDER,
"embedding": settings.EMBEDDING_PROVIDER,
"completion": settings.COMPLETION_PROVIDER,
"storage": settings.STORAGE_PROVIDER,
},
}
# Initialize telemetry
telemetry = TelemetryService()
# Add OpenTelemetry instrumentation - exclude HTTP send/receive spans
FastAPIInstrumentor.instrument_app(
app,
excluded_urls="health,health/.*", # Exclude health check endpoints
exclude_spans=["send", "receive"], # Exclude HTTP send/receive spans to reduce telemetry volume
http_capture_headers_server_request=None, # Don't capture request headers
http_capture_headers_server_response=None, # Don't capture response headers
tracer_provider=None, # Use the global tracer provider
)
# Add CORS middleware
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# Initialize service
settings = get_settings()
# Initialize database
if not settings.POSTGRES_URI:
raise ValueError("PostgreSQL URI is required for PostgreSQL database")
database = PostgresDatabase(uri=settings.POSTGRES_URI)
# Note: arq (which provides RedisSettings) is imported at the top of the file
@app.on_event("startup")
async def initialize_database():
"""Initialize database tables and indexes on application startup."""
logger.info("Initializing database...")
success = await database.initialize()
if success:
logger.info("Database initialization successful")
else:
logger.error("Database initialization failed")
# We don't raise an exception here to allow the app to continue starting
# even if there are initialization errors
@app.on_event("startup")
async def initialize_vector_store():
"""Initialize vector store tables and indexes on application startup."""
# First initialize the primary vector store (PGVectorStore if using pgvector)
logger.info("Initializing primary vector store...")
if hasattr(vector_store, "initialize"):
success = await vector_store.initialize()
if success:
logger.info("Primary vector store initialization successful")
else:
logger.error("Primary vector store initialization failed")
else:
logger.warning("Primary vector store does not have an initialize method")
# Then initialize the multivector store if enabled
if settings.ENABLE_COLPALI and colpali_vector_store:
logger.info("Initializing multivector store...")
# Handle both synchronous and asynchronous initialize methods
        if asyncio.iscoroutinefunction(colpali_vector_store.initialize):
            success = await colpali_vector_store.initialize()
        else:
            success = colpali_vector_store.initialize()
if success:
logger.info("Multivector store initialization successful")
else:
logger.error("Multivector store initialization failed")
@app.on_event("startup")
async def initialize_user_limits_database():
"""Initialize user service on application startup."""
logger.info("Initializing user service...")
if settings.MODE == "cloud":
from core.database.user_limits_db import UserLimitsDatabase
user_limits_db = UserLimitsDatabase(uri=settings.POSTGRES_URI)
await user_limits_db.initialize()
@app.on_event("startup")
async def initialize_redis_pool():
"""Initialize the Redis connection pool for background tasks."""
global redis_pool
logger.info("Initializing Redis connection pool...")
# Get Redis settings from configuration
redis_host = settings.REDIS_HOST
redis_port = settings.REDIS_PORT
# Log the Redis connection details
logger.info(f"Connecting to Redis at {redis_host}:{redis_port}")
redis_settings = arq.connections.RedisSettings(
host=redis_host,
port=redis_port,
)
redis_pool = await arq.create_pool(redis_settings)
logger.info("Redis connection pool initialized successfully")
@app.on_event("shutdown")
async def close_redis_pool():
"""Close the Redis connection pool on application shutdown."""
global redis_pool
if redis_pool:
logger.info("Closing Redis connection pool...")
redis_pool.close()
await redis_pool.wait_closed()
logger.info("Redis connection pool closed")
# Initialize vector store
if not settings.POSTGRES_URI:
raise ValueError("PostgreSQL URI is required for pgvector store")
vector_store = PGVectorStore(
uri=settings.POSTGRES_URI,
)
# Initialize storage
match settings.STORAGE_PROVIDER:
case "local":
storage = LocalStorage(storage_path=settings.STORAGE_PATH)
case "aws-s3":
if not settings.AWS_ACCESS_KEY or not settings.AWS_SECRET_ACCESS_KEY:
raise ValueError("AWS credentials are required for S3 storage")
storage = S3Storage(
aws_access_key=settings.AWS_ACCESS_KEY,
aws_secret_key=settings.AWS_SECRET_ACCESS_KEY,
region_name=settings.AWS_REGION,
default_bucket=settings.S3_BUCKET,
)
case _:
raise ValueError(f"Unsupported storage provider: {settings.STORAGE_PROVIDER}")
# Initialize parser
parser = MorphikParser(
chunk_size=settings.CHUNK_SIZE,
chunk_overlap=settings.CHUNK_OVERLAP,
use_unstructured_api=settings.USE_UNSTRUCTURED_API,
unstructured_api_key=settings.UNSTRUCTURED_API_KEY,
assemblyai_api_key=settings.ASSEMBLYAI_API_KEY,
anthropic_api_key=settings.ANTHROPIC_API_KEY,
use_contextual_chunking=settings.USE_CONTEXTUAL_CHUNKING,
)
# Initialize embedding model
# Create a LiteLLM model using the registered model config
embedding_model = LiteLLMEmbeddingModel(
model_key=settings.EMBEDDING_MODEL,
)
logger.info(f"Initialized LiteLLM embedding model with model key: {settings.EMBEDDING_MODEL}")
# Initialize completion model
# Create a LiteLLM model using the registered model config
completion_model = LiteLLMCompletionModel(
model_key=settings.COMPLETION_MODEL,
)
logger.info(f"Initialized LiteLLM completion model with model key: {settings.COMPLETION_MODEL}")
# Initialize reranker
reranker = None
if settings.USE_RERANKING:
match settings.RERANKER_PROVIDER:
case "flag":
reranker = FlagReranker(
model_name=settings.RERANKER_MODEL,
device=settings.RERANKER_DEVICE,
use_fp16=settings.RERANKER_USE_FP16,
query_max_length=settings.RERANKER_QUERY_MAX_LENGTH,
passage_max_length=settings.RERANKER_PASSAGE_MAX_LENGTH,
)
case _:
raise ValueError(f"Unsupported reranker provider: {settings.RERANKER_PROVIDER}")
# Initialize cache factory
cache_factory = LlamaCacheFactory(Path(settings.STORAGE_PATH))
# Initialize ColPali embedding model if enabled
colpali_embedding_model = ColpaliEmbeddingModel() if settings.ENABLE_COLPALI else None
colpali_vector_store = MultiVectorStore(uri=settings.POSTGRES_URI) if settings.ENABLE_COLPALI else None
# Initialize document service with configured components
document_service = DocumentService(
storage=storage,
database=database,
vector_store=vector_store,
embedding_model=embedding_model,
completion_model=completion_model,
parser=parser,
reranker=reranker,
cache_factory=cache_factory,
enable_colpali=settings.ENABLE_COLPALI,
colpali_embedding_model=colpali_embedding_model,
colpali_vector_store=colpali_vector_store,
)
# Initialize the MorphikAgent once to load tool definitions and avoid repeated I/O
morphik_agent = MorphikAgent(document_service=document_service)
# ---------------------------------------------------------------------------
# Mount enterprise-only routes when the proprietary ``ee`` package
# is present.
# ---------------------------------------------------------------------------
try:
from ee.routers import init_app as _init_ee_app # type: ignore
    _init_ee_app(app)  # noqa: SLF001 - runtime extension
logger.info("Enterprise routes mounted (ee package detected).")
except ModuleNotFoundError:
    # Expected in OSS builds; silently ignore.
    logger.debug("Enterprise package not found; running in community mode.")
@app.post("/ingest/text", response_model=Document)
@telemetry.track(operation_type="ingest_text", metadata_resolver=telemetry.ingest_text_metadata)
async def ingest_text(
request: IngestTextRequest,
auth: AuthContext = Depends(verify_token),
) -> Document:
"""
Ingest a text document.
Args:
request: IngestTextRequest containing:
- content: Text content to ingest
- filename: Optional filename to help determine content type
- metadata: Optional metadata dictionary
- rules: Optional list of rules. Each rule should be either:
- MetadataExtractionRule: {"type": "metadata_extraction", "schema": {...}}
- NaturalLanguageRule: {"type": "natural_language", "prompt": "..."}
- folder_name: Optional folder to scope the document to
- end_user_id: Optional end-user ID to scope the document to
auth: Authentication context
Returns:
Document: Metadata of ingested document
"""
try:
return await document_service.ingest_text(
content=request.content,
filename=request.filename,
metadata=request.metadata,
rules=request.rules,
use_colpali=request.use_colpali,
auth=auth,
folder_name=request.folder_name,
end_user_id=request.end_user_id,
)
except PermissionError as e:
raise HTTPException(status_code=403, detail=str(e))
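# Illustrative request body for POST /ingest/text, a sketch based on the docstring
# above. The field names come from IngestTextRequest; the values and the example
# extraction schema are assumptions, not fixtures used by the service.
_EXAMPLE_INGEST_TEXT_BODY = {
    "content": "Morphik ingests text and files into a searchable store.",
    "filename": "notes.txt",
    "metadata": {"source": "example"},
    "rules": [
        {"type": "metadata_extraction", "schema": {"title": {"type": "string"}}},
        {"type": "natural_language", "prompt": "Remove any personal information."},
    ],
    "folder_name": "demo-folder",
    "end_user_id": "user-123",
}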
# Redis pool for background tasks
redis_pool = None
def get_redis_pool():
"""Get the global Redis connection pool for background tasks."""
return redis_pool
@app.post("/ingest/file", response_model=Document)
@telemetry.track(operation_type="queue_ingest_file", metadata_resolver=telemetry.ingest_file_metadata)
async def ingest_file(
file: UploadFile,
metadata: str = Form("{}"),
rules: str = Form("[]"),
auth: AuthContext = Depends(verify_token),
use_colpali: Optional[bool] = None,
folder_name: Optional[str] = Form(None),
end_user_id: Optional[str] = Form(None),
redis: arq.ArqRedis = Depends(get_redis_pool),
) -> Document:
"""
Ingest a file document asynchronously.
Args:
file: File to ingest
metadata: JSON string of metadata
rules: JSON string of rules list. Each rule should be either:
- MetadataExtractionRule: {"type": "metadata_extraction", "schema": {...}}
- NaturalLanguageRule: {"type": "natural_language", "prompt": "..."}
auth: Authentication context
use_colpali: Whether to use ColPali embedding model
folder_name: Optional folder to scope the document to
end_user_id: Optional end-user ID to scope the document to
redis: Redis connection pool for background tasks
Returns:
Document with processing status that can be used to check progress
"""
try:
# Parse metadata and rules
metadata_dict = json.loads(metadata)
rules_list = json.loads(rules)
# Fix bool conversion: ensure string "false" is properly converted to False
def str2bool(v):
return v if isinstance(v, bool) else str(v).lower() in {"true", "1", "yes"}
use_colpali = str2bool(use_colpali)
# Ensure user has write permission
if "write" not in auth.permissions:
raise PermissionError("User does not have write permission")
logger.debug(f"API: Queueing file ingestion with use_colpali: {use_colpali}")
# Create a document with processing status
doc = Document(
content_type=file.content_type,
filename=file.filename,
metadata=metadata_dict,
owner={"type": auth.entity_type.value, "id": auth.entity_id},
access_control={
"readers": [auth.entity_id],
"writers": [auth.entity_id],
"admins": [auth.entity_id],
"user_id": [auth.user_id] if auth.user_id else [],
"app_access": ([auth.app_id] if auth.app_id else []),
},
system_metadata={"status": "processing"},
)
# Add folder_name and end_user_id to system_metadata if provided
if folder_name:
doc.system_metadata["folder_name"] = folder_name
if end_user_id:
doc.system_metadata["end_user_id"] = end_user_id
if auth.app_id:
doc.system_metadata["app_id"] = auth.app_id
# Set processing status
doc.system_metadata["status"] = "processing"
# Store the document in the database
success = await database.store_document(doc)
if not success:
raise Exception("Failed to store document metadata")
# If folder_name is provided, ensure the folder exists and add document to it
if folder_name:
try:
await document_service._ensure_folder_exists(folder_name, doc.external_id, auth)
logger.debug(f"Ensured folder '{folder_name}' exists and contains document {doc.external_id}")
except Exception as e:
# Log error but don't raise - we want document ingestion to continue even if folder operation fails
logger.error(f"Error ensuring folder exists: {e}")
# Read file content
file_content = await file.read()
# Generate a unique key for the file
file_key = f"ingest_uploads/{uuid.uuid4()}/{file.filename}"
# Store the file in the configured storage
file_content_base64 = base64.b64encode(file_content).decode()
bucket, stored_key = await storage.upload_from_base64(file_content_base64, file_key, file.content_type)
logger.debug(f"Stored file in bucket {bucket} with key {stored_key}")
# Update document with storage info
doc.storage_info = {"bucket": bucket, "key": stored_key}
# Initialize storage_files array with the first file
from core.models.documents import StorageFileInfo
# Create a StorageFileInfo for the initial file
initial_file_info = StorageFileInfo(
bucket=bucket,
key=stored_key,
version=1,
filename=file.filename,
content_type=file.content_type,
timestamp=datetime.now(UTC),
)
doc.storage_files = [initial_file_info]
# Log storage files
logger.debug(f"Initial storage_files for {doc.external_id}: {doc.storage_files}")
# Update both storage_info and storage_files
await database.update_document(
document_id=doc.external_id,
updates={"storage_info": doc.storage_info, "storage_files": doc.storage_files},
auth=auth,
)
# Convert auth context to a dictionary for serialization
auth_dict = {
"entity_type": auth.entity_type.value,
"entity_id": auth.entity_id,
"app_id": auth.app_id,
"permissions": list(auth.permissions),
"user_id": auth.user_id,
}
# Enqueue the background job
job = await redis.enqueue_job(
"process_ingestion_job",
document_id=doc.external_id,
file_key=stored_key,
bucket=bucket,
original_filename=file.filename,
content_type=file.content_type,
metadata_json=metadata,
auth_dict=auth_dict,
rules_list=rules_list,
use_colpali=use_colpali,
folder_name=folder_name,
end_user_id=end_user_id,
)
logger.info(f"File ingestion job queued with ID: {job.job_id} for document: {doc.external_id}")
return doc
except json.JSONDecodeError as e:
raise HTTPException(status_code=400, detail=f"Invalid JSON: {str(e)}")
except PermissionError as e:
raise HTTPException(status_code=403, detail=str(e))
except Exception as e:
logger.error(f"Error during file ingestion: {str(e)}")
raise HTTPException(status_code=500, detail=f"Error during file ingestion: {str(e)}")
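# Minimal sketch of how a client might prepare the multipart form fields for
# POST /ingest/file. The helper name and values are illustrative; only the field
# names (metadata, rules, folder_name, end_user_id) come from the endpoint above,
# and metadata/rules must be JSON-encoded strings.
def _example_ingest_file_form_fields() -> Dict[str, str]:
    return {
        "metadata": json.dumps({"source": "example"}),
        "rules": json.dumps([{"type": "natural_language", "prompt": "Summarize the document."}]),
        "folder_name": "demo-folder",
        "end_user_id": "user-123",
    }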
@app.post("/ingest/files", response_model=BatchIngestResponse)
@telemetry.track(operation_type="queue_batch_ingest", metadata_resolver=telemetry.batch_ingest_metadata)
async def batch_ingest_files(
files: List[UploadFile] = File(...),
metadata: str = Form("{}"),
rules: str = Form("[]"),
use_colpali: Optional[bool] = Form(None),
parallel: Optional[bool] = Form(True),
folder_name: Optional[str] = Form(None),
end_user_id: Optional[str] = Form(None),
auth: AuthContext = Depends(verify_token),
redis: arq.ArqRedis = Depends(get_redis_pool),
) -> BatchIngestResponse:
"""
Batch ingest multiple files using the task queue.
Args:
files: List of files to ingest
metadata: JSON string of metadata (either a single dict or list of dicts)
rules: JSON string of rules list. Can be either:
- A single list of rules to apply to all files
- A list of rule lists, one per file
use_colpali: Whether to use ColPali-style embedding
folder_name: Optional folder to scope the documents to
end_user_id: Optional end-user ID to scope the documents to
auth: Authentication context
redis: Redis connection pool for background tasks
Returns:
BatchIngestResponse containing:
- documents: List of created documents with processing status
- errors: List of errors that occurred during the batch operation
"""
if not files:
raise HTTPException(status_code=400, detail="No files provided for batch ingestion")
try:
metadata_value = json.loads(metadata)
rules_list = json.loads(rules)
# Fix bool conversion: ensure string "false" is properly converted to False
def str2bool(v):
return str(v).lower() in {"true", "1", "yes"}
use_colpali = str2bool(use_colpali)
# Ensure user has write permission
if "write" not in auth.permissions:
raise PermissionError("User does not have write permission")
except json.JSONDecodeError as e:
raise HTTPException(status_code=400, detail=f"Invalid JSON: {str(e)}")
except PermissionError as e:
raise HTTPException(status_code=403, detail=str(e))
# Validate metadata if it's a list
if isinstance(metadata_value, list) and len(metadata_value) != len(files):
raise HTTPException(
status_code=400,
detail=f"Number of metadata items ({len(metadata_value)}) must match number of files ({len(files)})",
)
# Validate rules if it's a list of lists
if isinstance(rules_list, list) and rules_list and isinstance(rules_list[0], list):
if len(rules_list) != len(files):
raise HTTPException(
status_code=400,
detail=f"Number of rule lists ({len(rules_list)}) must match number of files ({len(files)})",
)
# Convert auth context to a dictionary for serialization
auth_dict = {
"entity_type": auth.entity_type.value,
"entity_id": auth.entity_id,
"app_id": auth.app_id,
"permissions": list(auth.permissions),
"user_id": auth.user_id,
}
created_documents = []
try:
for i, file in enumerate(files):
# Get the metadata and rules for this file
metadata_item = metadata_value[i] if isinstance(metadata_value, list) else metadata_value
file_rules = (
rules_list[i]
if isinstance(rules_list, list) and rules_list and isinstance(rules_list[0], list)
else rules_list
)
# Create a document with processing status
doc = Document(
content_type=file.content_type,
filename=file.filename,
metadata=metadata_item,
owner={"type": auth.entity_type.value, "id": auth.entity_id},
access_control={
"readers": [auth.entity_id],
"writers": [auth.entity_id],
"admins": [auth.entity_id],
"user_id": [auth.user_id] if auth.user_id else [],
"app_access": ([auth.app_id] if auth.app_id else []),
},
)
# Add folder_name and end_user_id to system_metadata if provided
if folder_name:
doc.system_metadata["folder_name"] = folder_name
if end_user_id:
doc.system_metadata["end_user_id"] = end_user_id
if auth.app_id:
doc.system_metadata["app_id"] = auth.app_id
# Set processing status
doc.system_metadata["status"] = "processing"
# Store the document in the database
success = await database.store_document(doc)
if not success:
raise Exception(f"Failed to store document metadata for {file.filename}")
# If folder_name is provided, ensure the folder exists and add document to it
if folder_name:
try:
await document_service._ensure_folder_exists(folder_name, doc.external_id, auth)
logger.debug(f"Ensured folder '{folder_name}' exists and contains document {doc.external_id}")
except Exception as e:
# Log error but don't raise - we want document ingestion to continue even if folder operation fails
logger.error(f"Error ensuring folder exists: {e}")
# Read file content
file_content = await file.read()
# Generate a unique key for the file
file_key = f"ingest_uploads/{uuid.uuid4()}/{file.filename}"
# Store the file in the configured storage
file_content_base64 = base64.b64encode(file_content).decode()
bucket, stored_key = await storage.upload_from_base64(file_content_base64, file_key, file.content_type)
logger.debug(f"Stored file in bucket {bucket} with key {stored_key}")
# Update document with storage info
doc.storage_info = {"bucket": bucket, "key": stored_key}
await database.update_document(
document_id=doc.external_id, updates={"storage_info": doc.storage_info}, auth=auth
)
# Convert metadata to JSON string for job
metadata_json = json.dumps(metadata_item)
# Enqueue the background job
job = await redis.enqueue_job(
"process_ingestion_job",
document_id=doc.external_id,
file_key=stored_key,
bucket=bucket,
original_filename=file.filename,
content_type=file.content_type,
metadata_json=metadata_json,
auth_dict=auth_dict,
rules_list=file_rules,
use_colpali=use_colpali,
folder_name=folder_name,
end_user_id=end_user_id,
)
logger.info(f"File ingestion job queued with ID: {job.job_id} for document: {doc.external_id}")
# Add document to the list
created_documents.append(doc)
# Return information about created documents
return BatchIngestResponse(documents=created_documents, errors=[])
except Exception as e:
logger.error(f"Error queueing batch file ingestion: {str(e)}")
raise HTTPException(status_code=500, detail=f"Error queueing batch file ingestion: {str(e)}")
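# Sketch of valid batch-ingest form values for three files, following the
# validation above: metadata may be a single dict applied to every file or a list
# with one entry per file, and rules may be a single rule list or a list of rule
# lists (one per file). Counts must match the number of files; values are made up.
_EXAMPLE_BATCH_METADATA = json.dumps([{"page": 1}, {"page": 2}, {"page": 3}])
_EXAMPLE_BATCH_RULES = json.dumps(
    [
        [{"type": "natural_language", "prompt": "Summarize."}],
        [],
        [{"type": "metadata_extraction", "schema": {"title": {"type": "string"}}}],
    ]
)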
@app.post("/retrieve/chunks", response_model=List[ChunkResult])
@telemetry.track(operation_type="retrieve_chunks", metadata_resolver=telemetry.retrieve_chunks_metadata)
async def retrieve_chunks(request: RetrieveRequest, auth: AuthContext = Depends(verify_token)):
"""
Retrieve relevant chunks.
Args:
request: RetrieveRequest containing:
- query: Search query text
- filters: Optional metadata filters
- k: Number of results (default: 4)
- min_score: Minimum similarity threshold (default: 0.0)
- use_reranking: Whether to use reranking
- use_colpali: Whether to use ColPali-style embedding model
- folder_name: Optional folder to scope the search to
- end_user_id: Optional end-user ID to scope the search to
auth: Authentication context
Returns:
List[ChunkResult]: List of relevant chunks
"""
try:
return await document_service.retrieve_chunks(
request.query,
auth,
request.filters,
request.k,
request.min_score,
request.use_reranking,
request.use_colpali,
request.folder_name,
request.end_user_id,
)
except PermissionError as e:
raise HTTPException(status_code=403, detail=str(e))
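# Illustrative request body for POST /retrieve/chunks; the same shape is accepted
# by POST /retrieve/docs. Field names follow the docstring above; values are made up.
_EXAMPLE_RETRIEVE_BODY = {
    "query": "What drove revenue growth last quarter?",
    "filters": {"department": "finance"},
    "k": 4,
    "min_score": 0.0,
    "use_reranking": True,
    "use_colpali": False,
    "folder_name": "demo-folder",
    "end_user_id": "user-123",
}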
@app.post("/retrieve/docs", response_model=List[DocumentResult])
@telemetry.track(operation_type="retrieve_docs", metadata_resolver=telemetry.retrieve_docs_metadata)
async def retrieve_documents(request: RetrieveRequest, auth: AuthContext = Depends(verify_token)):
"""
Retrieve relevant documents.
Args:
request: RetrieveRequest containing:
- query: Search query text
- filters: Optional metadata filters
- k: Number of results (default: 4)
- min_score: Minimum similarity threshold (default: 0.0)
- use_reranking: Whether to use reranking
- use_colpali: Whether to use ColPali-style embedding model
- folder_name: Optional folder to scope the search to
- end_user_id: Optional end-user ID to scope the search to
auth: Authentication context
Returns:
List[DocumentResult]: List of relevant documents
"""
try:
return await document_service.retrieve_docs(
request.query,
auth,
request.filters,
request.k,
request.min_score,
request.use_reranking,
request.use_colpali,
request.folder_name,
request.end_user_id,
)
except PermissionError as e:
raise HTTPException(status_code=403, detail=str(e))
@app.post("/batch/documents", response_model=List[Document])
@telemetry.track(operation_type="batch_get_documents", metadata_resolver=telemetry.batch_documents_metadata)
async def batch_get_documents(request: Dict[str, Any], auth: AuthContext = Depends(verify_token)):
"""
Retrieve multiple documents by their IDs in a single batch operation.
Args:
request: Dictionary containing:
- document_ids: List of document IDs to retrieve
- folder_name: Optional folder to scope the operation to
- end_user_id: Optional end-user ID to scope the operation to
auth: Authentication context
Returns:
List[Document]: List of documents matching the IDs
"""
try:
# Extract document_ids from request
document_ids = request.get("document_ids", [])
folder_name = request.get("folder_name")
end_user_id = request.get("end_user_id")
if not document_ids:
return []
# Create system filters for folder and user scoping
system_filters = {}
if folder_name:
system_filters["folder_name"] = folder_name
if end_user_id:
system_filters["end_user_id"] = end_user_id
if auth.app_id:
system_filters["app_id"] = auth.app_id
return await document_service.batch_retrieve_documents(document_ids, auth, folder_name, end_user_id)
except PermissionError as e:
raise HTTPException(status_code=403, detail=str(e))
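# Illustrative request body for POST /batch/documents. Only document_ids is
# required; folder_name and end_user_id optionally scope the lookup. IDs are made up.
_EXAMPLE_BATCH_DOCUMENTS_BODY = {
    "document_ids": ["doc_abc123", "doc_def456"],
    "folder_name": "demo-folder",
    "end_user_id": "user-123",
}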
@app.post("/batch/chunks", response_model=List[ChunkResult])
@telemetry.track(operation_type="batch_get_chunks", metadata_resolver=telemetry.batch_chunks_metadata)
async def batch_get_chunks(request: Dict[str, Any], auth: AuthContext = Depends(verify_token)):
"""
Retrieve specific chunks by their document ID and chunk number in a single batch operation.
Args:
request: Dictionary containing:
- sources: List of ChunkSource objects (with document_id and chunk_number)
- folder_name: Optional folder to scope the operation to
- end_user_id: Optional end-user ID to scope the operation to
auth: Authentication context
Returns:
List[ChunkResult]: List of chunk results
"""
try:
# Extract sources from request
sources = request.get("sources", [])
folder_name = request.get("folder_name")
end_user_id = request.get("end_user_id")
use_colpali = request.get("use_colpali")
if not sources:
return []
# Convert sources to ChunkSource objects if needed
chunk_sources = []
for source in sources:
if isinstance(source, dict):
chunk_sources.append(ChunkSource(**source))
else:
chunk_sources.append(source)
# Create system filters for folder and user scoping
system_filters = {}
if folder_name:
system_filters["folder_name"] = folder_name
if end_user_id:
system_filters["end_user_id"] = end_user_id
if auth.app_id:
system_filters["app_id"] = auth.app_id
return await document_service.batch_retrieve_chunks(chunk_sources, auth, folder_name, end_user_id, use_colpali)
except PermissionError as e:
raise HTTPException(status_code=403, detail=str(e))
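# Illustrative request body for POST /batch/chunks. Each source names a document
# and a chunk number within it, matching the ChunkSource fields described above.
_EXAMPLE_BATCH_CHUNKS_BODY = {
    "sources": [
        {"document_id": "doc_abc123", "chunk_number": 0},
        {"document_id": "doc_abc123", "chunk_number": 3},
    ],
    "use_colpali": False,
}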
@app.post("/query", response_model=CompletionResponse)
@telemetry.track(operation_type="query", metadata_resolver=telemetry.query_metadata)
async def query_completion(request: CompletionQueryRequest, auth: AuthContext = Depends(verify_token)):
"""
Generate completion using relevant chunks as context.
When graph_name is provided, the query will leverage the knowledge graph
to enhance retrieval by finding relevant entities and their connected documents.
Args:
request: CompletionQueryRequest containing:
- query: Query text
- filters: Optional metadata filters
- k: Number of chunks to use as context (default: 4)
- min_score: Minimum similarity threshold (default: 0.0)
- max_tokens: Maximum tokens in completion
- temperature: Model temperature
- use_reranking: Whether to use reranking
- use_colpali: Whether to use ColPali-style embedding model
- graph_name: Optional name of the graph to use for knowledge graph-enhanced retrieval
- hop_depth: Number of relationship hops to traverse in the graph (1-3)
- include_paths: Whether to include relationship paths in the response
- prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
- folder_name: Optional folder to scope the operation to
- end_user_id: Optional end-user ID to scope the operation to
- schema: Optional schema for structured output
auth: Authentication context
Returns:
CompletionResponse: Generated text completion or structured output
"""
try:
# Validate prompt overrides before proceeding
if request.prompt_overrides:
validate_prompt_overrides_with_http_exception(request.prompt_overrides, operation_type="query")
# Check query limits if in cloud mode
if settings.MODE == "cloud" and auth.user_id:
# Check limits before proceeding
await check_and_increment_limits(auth, "query", 1)
return await document_service.query(
request.query,
auth,
request.filters,
request.k,
request.min_score,
request.max_tokens,
request.temperature,
request.use_reranking,
request.use_colpali,
request.graph_name,
request.hop_depth,
request.include_paths,
request.prompt_overrides,
request.folder_name,
request.end_user_id,
request.schema,
)
except ValueError as e:
validate_prompt_overrides_with_http_exception(operation_type="query", error=e)
except PermissionError as e:
raise HTTPException(status_code=403, detail=str(e))
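# Illustrative request body for POST /query showing graph-augmented retrieval.
# Field names follow the docstring above; the graph name and filter values are
# assumptions.
_EXAMPLE_QUERY_BODY = {
    "query": "How are our suppliers related to the delayed shipments?",
    "filters": {"category": "logistics"},
    "k": 6,
    "max_tokens": 512,
    "temperature": 0.2,
    "graph_name": "supply_chain",
    "hop_depth": 2,
    "include_paths": True,
}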
@app.post("/agent", response_model=Dict[str, Any])
@telemetry.track(operation_type="agent_query")
async def agent_query(request: AgentQueryRequest, auth: AuthContext = Depends(verify_token)):
"""
Process a natural language query using the MorphikAgent and return the response.
"""
# Check free-tier agent call limits in cloud mode
if settings.MODE == "cloud" and auth.user_id:
await check_and_increment_limits(auth, "agent", 1)
# Use shared agent instance and pass auth to run
response_content, tool_history = await morphik_agent.run(request.query, auth)
# Return both in the response dictionary
return {"response": response_content, "tool_history": tool_history}
@app.post("/documents", response_model=List[Document])
async def list_documents(
auth: AuthContext = Depends(verify_token),
skip: int = 0,
limit: int = 10000,
filters: Optional[Dict[str, Any]] = None,
folder_name: Optional[str] = None,
end_user_id: Optional[str] = None,
):
"""
List accessible documents.
Args:
auth: Authentication context
skip: Number of documents to skip
limit: Maximum number of documents to return
filters: Optional metadata filters
folder_name: Optional folder to scope the operation to
end_user_id: Optional end-user ID to scope the operation to
Returns:
List[Document]: List of accessible documents
"""
# Create system filters for folder and user scoping
system_filters = {}
if folder_name:
system_filters["folder_name"] = folder_name
if end_user_id:
system_filters["end_user_id"] = end_user_id
if auth.app_id:
system_filters["app_id"] = auth.app_id
return await document_service.db.get_documents(auth, skip, limit, filters, system_filters)
@app.get("/documents/{document_id}", response_model=Document)
async def get_document(document_id: str, auth: AuthContext = Depends(verify_token)):
"""Get document by ID."""
try:
doc = await document_service.db.get_document(document_id, auth)
logger.debug(f"Found document: {doc}")
if not doc:
raise HTTPException(status_code=404, detail="Document not found")
return doc
except HTTPException as e:
logger.error(f"Error getting document: {e}")
raise e
@app.get("/documents/{document_id}/status", response_model=Dict[str, Any])
async def get_document_status(document_id: str, auth: AuthContext = Depends(verify_token)):
"""
Get the processing status of a document.
Args:
document_id: ID of the document to check
auth: Authentication context
Returns:
Dict containing status information for the document
"""
try:
doc = await document_service.db.get_document(document_id, auth)
if not doc:
raise HTTPException(status_code=404, detail="Document not found")
# Extract status information
status = doc.system_metadata.get("status", "unknown")
response = {
"document_id": doc.external_id,
"status": status,
"filename": doc.filename,
"created_at": doc.system_metadata.get("created_at"),
"updated_at": doc.system_metadata.get("updated_at"),
}
# Add error information if failed
if status == "failed":
response["error"] = doc.system_metadata.get("error", "Unknown error")
return response
except HTTPException:
raise
except Exception as e:
logger.error(f"Error getting document status: {str(e)}")
raise HTTPException(status_code=500, detail=f"Error getting document status: {str(e)}")
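# Illustrative response from GET /documents/{document_id}/status; keys mirror the
# dict built above. The "error" key is present only when status == "failed", and
# all values here are made up.
_EXAMPLE_STATUS_RESPONSE = {
    "document_id": "doc_abc123",
    "status": "failed",
    "filename": "report.pdf",
    "created_at": "2025-01-01T00:00:00+00:00",
    "updated_at": "2025-01-01T00:05:00+00:00",
    "error": "Unknown error",
}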
@app.delete("/documents/{document_id}")
@telemetry.track(operation_type="delete_document", metadata_resolver=telemetry.document_delete_metadata)
async def delete_document(document_id: str, auth: AuthContext = Depends(verify_token)):
"""
Delete a document and all associated data.
This endpoint deletes a document and all its associated data, including:
- Document metadata
- Document content in storage
- Document chunks and embeddings in vector store
Args:
document_id: ID of the document to delete
auth: Authentication context (must have write access to the document)
Returns:
Deletion status
"""
try:
success = await document_service.delete_document(document_id, auth)
if not success:
raise HTTPException(status_code=404, detail="Document not found or delete failed")
return {"status": "success", "message": f"Document {document_id} deleted successfully"}
except PermissionError as e:
raise HTTPException(status_code=403, detail=str(e))
@app.get("/documents/filename/{filename}", response_model=Document)
async def get_document_by_filename(
filename: str,
auth: AuthContext = Depends(verify_token),
folder_name: Optional[str] = None,
end_user_id: Optional[str] = None,
):
"""
Get document by filename.
Args:
filename: Filename of the document to retrieve
auth: Authentication context
folder_name: Optional folder to scope the operation to
end_user_id: Optional end-user ID to scope the operation to
Returns:
Document: Document metadata if found and accessible
"""
try:
# Create system filters for folder and user scoping
system_filters = {}
if folder_name:
system_filters["folder_name"] = folder_name
if end_user_id:
system_filters["end_user_id"] = end_user_id
doc = await document_service.db.get_document_by_filename(filename, auth, system_filters)
logger.debug(f"Found document by filename: {doc}")
if not doc:
raise HTTPException(status_code=404, detail=f"Document with filename '{filename}' not found")
return doc
except HTTPException as e:
logger.error(f"Error getting document by filename: {e}")
raise e
@app.post("/documents/{document_id}/update_text", response_model=Document)
@telemetry.track(operation_type="update_document_text", metadata_resolver=telemetry.document_update_text_metadata)
async def update_document_text(
document_id: str,
request: IngestTextRequest,
update_strategy: str = "add",
auth: AuthContext = Depends(verify_token),
):
"""
Update a document with new text content using the specified strategy.
Args:
document_id: ID of the document to update
request: Text content and metadata for the update
update_strategy: Strategy for updating the document (default: 'add')
Returns:
Document: Updated document metadata
"""
try:
doc = await document_service.update_document(
document_id=document_id,
auth=auth,
content=request.content,
file=None,
filename=request.filename,
metadata=request.metadata,
rules=request.rules,
update_strategy=update_strategy,
use_colpali=request.use_colpali,
)
if not doc:
raise HTTPException(status_code=404, detail="Document not found or update failed")
return doc
except PermissionError as e:
raise HTTPException(status_code=403, detail=str(e))
@app.post("/documents/{document_id}/update_file", response_model=Document)
@telemetry.track(operation_type="update_document_file", metadata_resolver=telemetry.document_update_file_metadata)
async def update_document_file(
document_id: str,
file: UploadFile,
metadata: str = Form("{}"),
rules: str = Form("[]"),
update_strategy: str = Form("add"),
use_colpali: Optional[bool] = None,
auth: AuthContext = Depends(verify_token),
):
"""
Update a document with content from a file using the specified strategy.
Args:
document_id: ID of the document to update
file: File to add to the document
metadata: JSON string of metadata to merge with existing metadata
rules: JSON string of rules to apply to the content
update_strategy: Strategy for updating the document (default: 'add')
use_colpali: Whether to use multi-vector embedding
Returns:
Document: Updated document metadata
"""
try:
metadata_dict = json.loads(metadata)
rules_list = json.loads(rules)
doc = await document_service.update_document(
document_id=document_id,
auth=auth,
content=None,
file=file,
filename=file.filename,
metadata=metadata_dict,
rules=rules_list,
update_strategy=update_strategy,
use_colpali=use_colpali,
)
if not doc:
raise HTTPException(status_code=404, detail="Document not found or update failed")
return doc
except json.JSONDecodeError as e:
raise HTTPException(status_code=400, detail=f"Invalid JSON: {str(e)}")
except PermissionError as e:
raise HTTPException(status_code=403, detail=str(e))
@app.post("/documents/{document_id}/update_metadata", response_model=Document)
@telemetry.track(
operation_type="update_document_metadata",
metadata_resolver=telemetry.document_update_metadata_resolver,
)
async def update_document_metadata(
document_id: str, metadata: Dict[str, Any], auth: AuthContext = Depends(verify_token)
):
"""
Update only a document's metadata.
Args:
document_id: ID of the document to update
metadata: New metadata to merge with existing metadata
Returns:
Document: Updated document metadata
"""
try:
doc = await document_service.update_document(
document_id=document_id,
auth=auth,
content=None,
file=None,
filename=None,
metadata=metadata,
rules=[],
update_strategy="add",
use_colpali=None,
)
if not doc:
raise HTTPException(status_code=404, detail="Document not found or update failed")
return doc
except PermissionError as e:
raise HTTPException(status_code=403, detail=str(e))
# Usage tracking endpoints
@app.get("/usage/stats")
@telemetry.track(operation_type="get_usage_stats", metadata_resolver=telemetry.usage_stats_metadata)
async def get_usage_stats(auth: AuthContext = Depends(verify_token)) -> Dict[str, int]:
"""Get usage statistics for the authenticated user."""
    # Admin and non-admin callers currently receive the same per-user stats.
    return telemetry.get_user_usage(auth.entity_id)
@app.get("/usage/recent")
@telemetry.track(operation_type="get_recent_usage", metadata_resolver=telemetry.recent_usage_metadata)
async def get_recent_usage(
auth: AuthContext = Depends(verify_token),
operation_type: Optional[str] = None,
since: Optional[datetime] = None,
status: Optional[str] = None,
) -> List[Dict]:
"""Get recent usage records."""
if not auth.permissions or "admin" not in auth.permissions:
records = telemetry.get_recent_usage(
user_id=auth.entity_id, operation_type=operation_type, since=since, status=status
)
else:
records = telemetry.get_recent_usage(operation_type=operation_type, since=since, status=status)
return [
{
"timestamp": record.timestamp,
"operation_type": record.operation_type,
"tokens_used": record.tokens_used,
"user_id": record.user_id,
"duration_ms": record.duration_ms,
"status": record.status,
"metadata": record.metadata,
}
for record in records
]
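# Illustrative single record as returned by GET /usage/recent; keys mirror the
# dict built above, values are made up.
_EXAMPLE_USAGE_RECORD = {
    "timestamp": "2025-01-01T00:00:00+00:00",
    "operation_type": "query",
    "tokens_used": 1234,
    "user_id": "user-123",
    "duration_ms": 87.5,
    "status": "success",
    "metadata": {"k": 4},
}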
# Cache endpoints
@app.post("/cache/create")
@telemetry.track(operation_type="create_cache", metadata_resolver=telemetry.cache_create_metadata)
async def create_cache(
name: str,
model: str,
gguf_file: str,
filters: Optional[Dict[str, Any]] = None,
docs: Optional[List[str]] = None,
auth: AuthContext = Depends(verify_token),
) -> Dict[str, Any]:
"""Create a new cache with specified configuration."""
try:
# Check cache creation limits if in cloud mode
if settings.MODE == "cloud" and auth.user_id:
# Check limits before proceeding
await check_and_increment_limits(auth, "cache", 1)
filter_docs = set(await document_service.db.get_documents(auth, filters=filters))
additional_docs = (
{await document_service.db.get_document(document_id=doc_id, auth=auth) for doc_id in docs}
if docs
else set()
)
docs_to_add = list(filter_docs.union(additional_docs))
if not docs_to_add:
raise HTTPException(status_code=400, detail="No documents to add to cache")
response = await document_service.create_cache(name, model, gguf_file, docs_to_add, filters)
return response
except PermissionError as e:
raise HTTPException(status_code=403, detail=str(e))
@app.get("/cache/{name}")
@telemetry.track(operation_type="get_cache", metadata_resolver=telemetry.cache_get_metadata)
async def get_cache(name: str, auth: AuthContext = Depends(verify_token)) -> Dict[str, Any]:
"""Get cache configuration by name."""
try:
exists = await document_service.load_cache(name)
return {"exists": exists}
except PermissionError as e:
raise HTTPException(status_code=403, detail=str(e))
@app.post("/cache/{name}/update")
@telemetry.track(operation_type="update_cache", metadata_resolver=telemetry.cache_update_metadata)
async def update_cache(name: str, auth: AuthContext = Depends(verify_token)) -> Dict[str, bool]:
"""Update cache with new documents matching its filter."""
try:
if name not in document_service.active_caches:
exists = await document_service.load_cache(name)
if not exists:
raise HTTPException(status_code=404, detail=f"Cache '{name}' not found")
cache = document_service.active_caches[name]
docs = await document_service.db.get_documents(auth, filters=cache.filters)
        docs_to_add = [doc for doc in docs if doc.external_id not in cache.docs]
return cache.add_docs(docs_to_add)
except PermissionError as e:
raise HTTPException(status_code=403, detail=str(e))
@app.post("/cache/{name}/add_docs")
@telemetry.track(operation_type="add_docs_to_cache", metadata_resolver=telemetry.cache_add_docs_metadata)
async def add_docs_to_cache(name: str, docs: List[str], auth: AuthContext = Depends(verify_token)) -> Dict[str, bool]:
"""Add specific documents to the cache."""
try:
cache = document_service.active_caches[name]
docs_to_add = [
await document_service.db.get_document(doc_id, auth) for doc_id in docs if doc_id not in cache.docs
]
return cache.add_docs(docs_to_add)
except PermissionError as e:
raise HTTPException(status_code=403, detail=str(e))
@app.post("/cache/{name}/query")
@telemetry.track(operation_type="query_cache", metadata_resolver=telemetry.cache_query_metadata)
async def query_cache(
name: str,
query: str,
max_tokens: Optional[int] = None,
temperature: Optional[float] = None,
auth: AuthContext = Depends(verify_token),
) -> CompletionResponse:
"""Query the cache with a prompt."""
try:
# Check cache query limits if in cloud mode
if settings.MODE == "cloud" and auth.user_id:
# Check limits before proceeding
await check_and_increment_limits(auth, "cache_query", 1)
cache = document_service.active_caches[name]
logger.info(f"Cache state: {cache.state.n_tokens}")
return cache.query(query) # , max_tokens, temperature)
except PermissionError as e:
raise HTTPException(status_code=403, detail=str(e))
@app.post("/graph/create", response_model=Graph)
@telemetry.track(operation_type="create_graph", metadata_resolver=telemetry.create_graph_metadata)
async def create_graph(
request: CreateGraphRequest,
auth: AuthContext = Depends(verify_token),
) -> Graph:
"""
Create a graph from documents.
This endpoint extracts entities and relationships from documents
matching the specified filters or document IDs and creates a graph.
Args:
request: CreateGraphRequest containing:
- name: Name of the graph to create
- filters: Optional metadata filters to determine which documents to include
- documents: Optional list of specific document IDs to include
- prompt_overrides: Optional customizations for entity extraction and resolution prompts
- folder_name: Optional folder to scope the operation to
- end_user_id: Optional end-user ID to scope the operation to
auth: Authentication context
Returns:
Graph: The created graph object
"""
try:
# Validate prompt overrides before proceeding
if request.prompt_overrides:
validate_prompt_overrides_with_http_exception(request.prompt_overrides, operation_type="graph")
# Check graph creation limits if in cloud mode
if settings.MODE == "cloud" and auth.user_id:
# Check limits before proceeding
await check_and_increment_limits(auth, "graph", 1)
# Create system filters for folder and user scoping
system_filters = {}
if request.folder_name:
system_filters["folder_name"] = request.folder_name
if request.end_user_id:
system_filters["end_user_id"] = request.end_user_id
return await document_service.create_graph(
name=request.name,
auth=auth,
filters=request.filters,
documents=request.documents,
prompt_overrides=request.prompt_overrides,
system_filters=system_filters,
)
except PermissionError as e:
raise HTTPException(status_code=403, detail=str(e))
except ValueError as e:
validate_prompt_overrides_with_http_exception(operation_type="graph", error=e)
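# Illustrative request body for POST /graph/create. Documents can be selected by
# metadata filters, by explicit IDs, or both; the graph name, filters, and IDs
# below are assumptions.
_EXAMPLE_CREATE_GRAPH_BODY = {
    "name": "supply_chain",
    "filters": {"category": "logistics"},
    "documents": ["doc_abc123"],
    "folder_name": "demo-folder",
}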
@app.post("/folders", response_model=Folder)
async def create_folder(
folder_create: FolderCreate,
auth: AuthContext = Depends(verify_token),
) -> Folder:
"""
Create a new folder.
Args:
folder_create: Folder creation request containing name and optional description
auth: Authentication context
Returns:
Folder: Created folder
"""
try:
async with telemetry.track_operation(
operation_type="create_folder",
user_id=auth.entity_id,
metadata={
"name": folder_create.name,
},
):
            # Create a folder object with an explicit ID (uuid is imported at module level)
            folder_id = str(uuid.uuid4())
logger.info(f"Creating folder with ID: {folder_id}, auth.user_id: {auth.user_id}")
# Set up access control with user_id
access_control = {
"readers": [auth.entity_id],
"writers": [auth.entity_id],
"admins": [auth.entity_id],
}
if auth.user_id:
access_control["user_id"] = [auth.user_id]
logger.info(f"Adding user_id {auth.user_id} to folder access control")
folder = Folder(
id=folder_id,
name=folder_create.name,
description=folder_create.description,
owner={
"type": auth.entity_type.value,
"id": auth.entity_id,
},
access_control=access_control,
)
# Scope folder to the application ID for developer tokens
if auth.app_id:
folder.system_metadata["app_id"] = auth.app_id
# Store in database
success = await document_service.db.create_folder(folder)
if not success:
raise HTTPException(status_code=500, detail="Failed to create folder")
return folder
except Exception as e:
logger.error(f"Error creating folder: {e}")
raise HTTPException(status_code=500, detail=str(e))
@app.get("/folders", response_model=List[Folder])
async def list_folders(
auth: AuthContext = Depends(verify_token),
) -> List[Folder]:
"""
List all folders the user has access to.
Args:
auth: Authentication context
Returns:
List[Folder]: List of folders
"""
try:
async with telemetry.track_operation(
operation_type="list_folders",
user_id=auth.entity_id,
):
folders = await document_service.db.list_folders(auth)
return folders
except Exception as e:
logger.error(f"Error listing folders: {e}")
raise HTTPException(status_code=500, detail=str(e))
@app.get("/folders/{folder_id}", response_model=Folder)
async def get_folder(
folder_id: str,
auth: AuthContext = Depends(verify_token),
) -> Folder:
"""
Get a folder by ID.
Args:
folder_id: ID of the folder
auth: Authentication context
Returns:
Folder: Folder if found and accessible
"""
try:
async with telemetry.track_operation(
operation_type="get_folder",
user_id=auth.entity_id,
metadata={
"folder_id": folder_id,
},
):
folder = await document_service.db.get_folder(folder_id, auth)
if not folder:
raise HTTPException(status_code=404, detail=f"Folder {folder_id} not found")
return folder
except HTTPException:
raise
except Exception as e:
logger.error(f"Error getting folder: {e}")
raise HTTPException(status_code=500, detail=str(e))
@app.post("/folders/{folder_id}/documents/{document_id}")
async def add_document_to_folder(
folder_id: str,
document_id: str,
auth: AuthContext = Depends(verify_token),
):
"""
Add a document to a folder.
Args:
folder_id: ID of the folder
document_id: ID of the document
auth: Authentication context
Returns:
Success status
"""
try:
async with telemetry.track_operation(
operation_type="add_document_to_folder",
user_id=auth.entity_id,
metadata={
"folder_id": folder_id,
"document_id": document_id,
},
):
success = await document_service.db.add_document_to_folder(folder_id, document_id, auth)
if not success:
raise HTTPException(status_code=500, detail="Failed to add document to folder")
return {"status": "success"}
except Exception as e:
logger.error(f"Error adding document to folder: {e}")
raise HTTPException(status_code=500, detail=str(e))
@app.delete("/folders/{folder_id}/documents/{document_id}")
async def remove_document_from_folder(
folder_id: str,
document_id: str,
auth: AuthContext = Depends(verify_token),
):
"""
Remove a document from a folder.
Args:
folder_id: ID of the folder
document_id: ID of the document
auth: Authentication context
Returns:
Success status
"""
try:
async with telemetry.track_operation(
operation_type="remove_document_from_folder",
user_id=auth.entity_id,
metadata={
"folder_id": folder_id,
"document_id": document_id,
},
):
success = await document_service.db.remove_document_from_folder(folder_id, document_id, auth)
if not success:
raise HTTPException(status_code=500, detail="Failed to remove document from folder")
return {"status": "success"}
except Exception as e:
logger.error(f"Error removing document from folder: {e}")
raise HTTPException(status_code=500, detail=str(e))
@app.get("/graph/{name}", response_model=Graph)
@telemetry.track(operation_type="get_graph", metadata_resolver=telemetry.get_graph_metadata)
async def get_graph(
name: str,
auth: AuthContext = Depends(verify_token),
folder_name: Optional[str] = None,
end_user_id: Optional[str] = None,
) -> Graph:
"""
Get a graph by name.
This endpoint retrieves a graph by its name if the user has access to it.
Args:
name: Name of the graph to retrieve
auth: Authentication context
folder_name: Optional folder to scope the operation to
end_user_id: Optional end-user ID to scope the operation to
Returns:
Graph: The requested graph object
"""
try:
# Create system filters for folder and user scoping
system_filters = {}
if folder_name:
system_filters["folder_name"] = folder_name
if end_user_id:
system_filters["end_user_id"] = end_user_id
graph = await document_service.db.get_graph(name, auth, system_filters)
if not graph:
raise HTTPException(status_code=404, detail=f"Graph '{name}' not found")
return graph
except PermissionError as e:
raise HTTPException(status_code=403, detail=str(e))
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@app.get("/graphs", response_model=List[Graph])
@telemetry.track(operation_type="list_graphs", metadata_resolver=telemetry.list_graphs_metadata)
async def list_graphs(
auth: AuthContext = Depends(verify_token),
folder_name: Optional[str] = None,
end_user_id: Optional[str] = None,
) -> List[Graph]:
"""
List all graphs the user has access to.
This endpoint retrieves all graphs the user has access to.
Args:
auth: Authentication context
folder_name: Optional folder to scope the operation to
end_user_id: Optional end-user ID to scope the operation to
Returns:
List[Graph]: List of graph objects
"""
try:
# Create system filters for folder and user scoping
system_filters = {}
if folder_name:
system_filters["folder_name"] = folder_name
if end_user_id:
system_filters["end_user_id"] = end_user_id
return await document_service.db.list_graphs(auth, system_filters)
except PermissionError as e:
raise HTTPException(status_code=403, detail=str(e))
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@app.post("/graph/{name}/update", response_model=Graph)
@telemetry.track(operation_type="update_graph", metadata_resolver=telemetry.update_graph_metadata)
async def update_graph(
name: str,
request: UpdateGraphRequest,
auth: AuthContext = Depends(verify_token),
) -> Graph:
"""
Update an existing graph with new documents.
This endpoint processes additional documents based on the original graph filters
and/or new filters/document IDs, extracts entities and relationships, and
updates the graph with new information.
Args:
name: Name of the graph to update
request: UpdateGraphRequest containing:
- additional_filters: Optional additional metadata filters to determine which new documents to include
- additional_documents: Optional list of additional document IDs to include
- prompt_overrides: Optional customizations for entity extraction and resolution prompts
- folder_name: Optional folder to scope the operation to
- end_user_id: Optional end-user ID to scope the operation to
auth: Authentication context
Returns:
Graph: The updated graph object
"""
try:
# Validate prompt overrides before proceeding
if request.prompt_overrides:
validate_prompt_overrides_with_http_exception(request.prompt_overrides, operation_type="graph")
# Create system filters for folder and user scoping
system_filters = {}
if request.folder_name:
system_filters["folder_name"] = request.folder_name
if request.end_user_id:
system_filters["end_user_id"] = request.end_user_id
return await document_service.update_graph(
name=name,
auth=auth,
additional_filters=request.additional_filters,
additional_documents=request.additional_documents,
prompt_overrides=request.prompt_overrides,
system_filters=system_filters,
)
except PermissionError as e:
raise HTTPException(status_code=403, detail=str(e))
except ValueError as e:
validate_prompt_overrides_with_http_exception(operation_type="graph", error=e)
except Exception as e:
logger.error(f"Error updating graph: {e}")
raise HTTPException(status_code=500, detail=str(e))
@app.post("/local/generate_uri", include_in_schema=True)
async def generate_local_uri(
name: str = Form("admin"),
expiry_days: int = Form(30),
) -> Dict[str, str]:
"""Generate a local URI for development. This endpoint is unprotected."""
try:
# Clean name
name = name.replace(" ", "_").lower()
# Create payload
payload = {
"type": "developer",
"entity_id": name,
"permissions": ["read", "write", "admin"],
"exp": datetime.now(UTC) + timedelta(days=expiry_days),
}
# Generate token
token = jwt.encode(payload, settings.JWT_SECRET_KEY, algorithm=settings.JWT_ALGORITHM)
# Read config for host/port
with open("morphik.toml", "rb") as f:
config = tomli.load(f)
base_url = f"{config['api']['host']}:{config['api']['port']}".replace("localhost", "127.0.0.1")
# Generate URI
uri = f"morphik://{name}:{token}@{base_url}"
return {"uri": uri}
except Exception as e:
logger.error(f"Error generating local URI: {e}")
raise HTTPException(status_code=500, detail=str(e))
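# Sketch of the URI this endpoint returns: a morphik:// URI embedding the signed
# developer token, e.g. "morphik://admin:<jwt>@127.0.0.1:8000". The host and port
# actually come from morphik.toml; the defaults below are assumptions for
# illustration only.
def _example_local_uri(name: str, token: str, host: str = "127.0.0.1", port: int = 8000) -> str:
    return f"morphik://{name}:{token}@{host}:{port}"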
@app.post("/cloud/generate_uri", include_in_schema=True)
async def generate_cloud_uri(
request: GenerateUriRequest,
authorization: str = Header(None),
) -> Dict[str, str]:
"""Generate a URI for cloud hosted applications."""
try:
app_id = request.app_id
name = request.name
user_id = request.user_id
expiry_days = request.expiry_days
logger.debug(f"Generating cloud URI for app_id={app_id}, name={name}, user_id={user_id}")
# Verify authorization header before proceeding
if not authorization:
logger.warning("Missing authorization header")
raise HTTPException(
status_code=401,
detail="Missing authorization header",
headers={"WWW-Authenticate": "Bearer"},
)
# Verify the token is valid
if not authorization.startswith("Bearer "):
raise HTTPException(status_code=401, detail="Invalid authorization header")
token = authorization[7:] # Remove "Bearer "
try:
# Decode the token to ensure it's valid
payload = jwt.decode(token, settings.JWT_SECRET_KEY, algorithms=[settings.JWT_ALGORITHM])
# Only allow users to create apps for themselves (or admin)
token_user_id = payload.get("user_id")
logger.debug(f"Token user ID: {token_user_id}")
logger.debug(f"User ID: {user_id}")
if not (token_user_id == user_id or "admin" in payload.get("permissions", [])):
raise HTTPException(
status_code=403,
detail="You can only create apps for your own account unless you have admin permissions",
)
except jwt.InvalidTokenError as e:
raise HTTPException(status_code=401, detail=str(e))
# Import UserService here to avoid circular imports
from core.services.user_service import UserService
user_service = UserService()
# Initialize user service if needed
await user_service.initialize()
# Clean name
name = name.replace(" ", "_").lower()
# Check if the user is within app limit and generate URI
uri = await user_service.generate_cloud_uri(user_id, app_id, name, expiry_days)
if not uri:
logger.debug("Application limit reached for this account tier with user_id: %s", user_id)
raise HTTPException(status_code=403, detail="Application limit reached for this account tier")
return {"uri": uri, "app_id": app_id}
except HTTPException:
# Re-raise HTTP exceptions
raise
except Exception as e:
logger.error(f"Error generating cloud URI: {e}")
        raise HTTPException(status_code=500, detail=str(e))


@app.post("/folders/{folder_id}/set_rule")
@telemetry.track(operation_type="set_folder_rule", metadata_resolver=telemetry.set_folder_rule_metadata)
async def set_folder_rule(
folder_id: str,
request: SetFolderRuleRequest,
auth: AuthContext = Depends(verify_token),
apply_to_existing: bool = True,
):
"""
Set extraction rules for a folder.
Args:
folder_id: ID of the folder to set rules for
request: SetFolderRuleRequest containing metadata extraction rules
auth: Authentication context
apply_to_existing: Whether to apply rules to existing documents in the folder
Returns:
Success status with processing results
"""
# Import text here to ensure it's available in this function's scope
from sqlalchemy import text
try:
# Log detailed information about the rules
logger.debug(f"Setting rules for folder {folder_id}")
logger.debug(f"Number of rules: {len(request.rules)}")
for i, rule in enumerate(request.rules):
logger.debug(f"\nRule {i + 1}:")
logger.debug(f"Type: {rule.type}")
logger.debug("Schema:")
for field_name, field_config in rule.schema.items():
logger.debug(f" Field: {field_name}")
logger.debug(f" Type: {field_config.get('type', 'unknown')}")
logger.debug(f" Description: {field_config.get('description', 'No description')}")
if "schema" in field_config:
logger.debug(" Has JSON schema: Yes")
logger.debug(f" Schema: {field_config['schema']}")
# Get the folder
folder = await document_service.db.get_folder(folder_id, auth)
if not folder:
raise HTTPException(status_code=404, detail=f"Folder {folder_id} not found")
# Check if user has write access to the folder
if not document_service.db._check_folder_access(folder, auth, "write"):
raise HTTPException(status_code=403, detail="You don't have write access to this folder")
# Update folder with rules
# Convert rules to dicts for JSON serialization
rules_dicts = [rule.model_dump() for rule in request.rules]
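        # model_dump() yields plain JSON-serializable dicts, so the list can be json.dumps'd
        # into the folders.rules column in the raw UPDATE below.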
# Update the folder in the database
async with document_service.db.async_session() as session:
# Execute update query
await session.execute(
text(
"""
UPDATE folders
SET rules = :rules
WHERE id = :folder_id
"""
),
{"folder_id": folder_id, "rules": json.dumps(rules_dicts)},
)
await session.commit()
logger.info(f"Successfully updated folder {folder_id} with {len(request.rules)} rules")
# Get updated folder
updated_folder = await document_service.db.get_folder(folder_id, auth)
# If apply_to_existing is True, apply these rules to all existing documents in the folder
processing_results = {"processed": 0, "errors": []}
if apply_to_existing and folder.document_ids:
logger.info(f"Applying rules to {len(folder.document_ids)} existing documents in folder")
            # Get all documents in the folder so the new rules can be applied to each one
documents = await document_service.db.get_documents_by_id(folder.document_ids, auth)
# Process each document
for doc in documents:
try:
# Get document content
logger.info(f"Processing document {doc.external_id}")
# For each document, apply the rules from the folder
doc_content = None
# Get content from system_metadata if available
if doc.system_metadata and "content" in doc.system_metadata:
doc_content = doc.system_metadata["content"]
logger.info(f"Retrieved content from system_metadata for document {doc.external_id}")
# If we still have no content, log error and continue
if not doc_content:
error_msg = f"No content found in system_metadata for document {doc.external_id}"
logger.error(error_msg)
processing_results["errors"].append({"document_id": doc.external_id, "error": error_msg})
continue
# Process document with rules
try:
# Convert request rules to actual rule models and apply them
from core.models.rules import MetadataExtractionRule
for rule_request in request.rules:
if rule_request.type == "metadata_extraction":
# Create the actual rule model
rule = MetadataExtractionRule(type=rule_request.type, schema=rule_request.schema)
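                                # rule.apply() (awaited below) returns a 2-tuple; only the first element,
                                # the extracted-metadata dict, is used here and the second is discarded.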
# Apply the rule with retries
max_retries = 3
base_delay = 1 # seconds
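                                # Backoff schedule with these values: 1s before attempt 2, 2s before attempt 3
                                # (delay = base_delay * 2 ** (retry_count - 1)).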
extracted_metadata = None
last_error = None
for retry_count in range(max_retries):
try:
if retry_count > 0:
# Exponential backoff
delay = base_delay * (2 ** (retry_count - 1))
logger.info(f"Retry {retry_count}/{max_retries} after {delay}s delay")
await asyncio.sleep(delay)
extracted_metadata, _ = await rule.apply(doc_content, {})
logger.info(
f"Successfully extracted metadata on attempt {retry_count + 1}: "
f"{extracted_metadata}"
)
break # Success, exit retry loop
except Exception as rule_apply_error:
last_error = rule_apply_error
logger.warning(
f"Metadata extraction attempt {retry_count + 1} failed: "
f"{rule_apply_error}"
)
if retry_count == max_retries - 1: # Last attempt
logger.error(f"All {max_retries} metadata extraction attempts failed")
processing_results["errors"].append(
{
"document_id": doc.external_id,
"error": f"Failed to extract metadata after {max_retries} "
f"attempts: {str(last_error)}",
}
)
                                            continue  # last attempt failed; extracted_metadata stays None, so no update happens below
# Update document metadata if extraction succeeded
if extracted_metadata:
# Merge new metadata with existing
doc.metadata.update(extracted_metadata)
# Create an updates dict that only updates metadata
# We need to create system_metadata with all preserved fields
# Note: In the database, metadata is stored as 'doc_metadata', not 'metadata'
updates = {
"doc_metadata": doc.metadata, # Use doc_metadata for the database
"system_metadata": {}, # Will be merged with existing in update_document
}
# Explicitly preserve the content field in system_metadata
if "content" in doc.system_metadata:
updates["system_metadata"]["content"] = doc.system_metadata["content"]
# Log the updates we're making
logger.info(
f"Updating document {doc.external_id} with metadata: {extracted_metadata}"
)
logger.info(f"Full metadata being updated: {doc.metadata}")
logger.info(f"Update object being sent to database: {updates}")
logger.info(
f"Preserving content in system_metadata: {'content' in doc.system_metadata}"
)
# Update document in database
success = await document_service.db.update_document(doc.external_id, updates, auth)
if success:
logger.info(f"Updated metadata for document {doc.external_id}")
processing_results["processed"] += 1
else:
logger.error(f"Failed to update metadata for document {doc.external_id}")
processing_results["errors"].append(
{
"document_id": doc.external_id,
"error": "Failed to update document metadata",
}
)
except Exception as rule_error:
logger.error(f"Error processing rules for document {doc.external_id}: {rule_error}")
processing_results["errors"].append(
{
"document_id": doc.external_id,
"error": f"Error processing rules: {str(rule_error)}",
}
)
except Exception as doc_error:
logger.error(f"Error processing document {doc.external_id}: {doc_error}")
processing_results["errors"].append({"document_id": doc.external_id, "error": str(doc_error)})
return {
"status": "success",
"message": "Rules set successfully",
"folder_id": folder_id,
"rules": updated_folder.rules,
"processing_results": processing_results,
}
except HTTPException:
# Re-raise HTTP exceptions
raise
except Exception as e:
logger.error(f"Error setting folder rules: {e}")
raise HTTPException(status_code=500, detail=str(e))