import logging
from typing import Dict, List, Union

from core.api import IngestRequest, QueryRequest
from core.database.base_database import BaseDatabase
from core.embedding_model.base_embedding_model import BaseEmbeddingModel
from core.parser.base_parser import BaseParser
from core.storage.base_storage import BaseStorage
from core.vector_store.base_vector_store import BaseVectorStore

from ..models.auth import AuthContext
from ..models.documents import (
    Document,
    DocumentChunk,
    ChunkResult,
    DocumentContent,
    DocumentResult,
    QueryReturnType,
)

logger = logging.getLogger(__name__)


class DocumentService:
    def __init__(
        self,
        database: BaseDatabase,
        vector_store: BaseVectorStore,
        storage: BaseStorage,
        parser: BaseParser,
        embedding_model: BaseEmbeddingModel,
    ):
        self.db = database
        self.vector_store = vector_store
        self.storage = storage
        self.parser = parser
        self.embedding_model = embedding_model

    async def ingest_document(self, request: IngestRequest, auth: AuthContext) -> Document:
        """Ingest a new document with chunks."""
        try:
            # 1. Create the document record; the caller becomes owner with
            # full read/write/admin access
            doc = Document(
                content_type=request.content_type,
                filename=request.filename,
                metadata=request.metadata,
                access_control={
                    "owner": {"type": auth.entity_type, "id": auth.entity_id},
                    "readers": {auth.entity_id},
                    "writers": {auth.entity_id},
                    "admins": {auth.entity_id},
                },
            )

            # 2. Store the raw file in object storage if it's not plain text
            if request.content_type != "text/plain":
                storage_info = await self.storage.upload_from_base64(
                    request.content, doc.external_id, request.content_type
                )
                doc.storage_info = {
                    "bucket": storage_info[0],
                    "key": storage_info[1],
                }

            # 3. Parse content into chunks
            chunks = await self.parser.parse(request.content)

            # 4. Generate embeddings for the chunks
            embeddings = await self.embedding_model.embed_for_ingestion(chunks)

            # 5. Pair each chunk with its embedding
            chunk_objects = []
            for i, (content, embedding) in enumerate(zip(chunks, embeddings)):
                chunk = DocumentChunk(
                    document_id=doc.external_id,
                    content=content,
                    embedding=embedding,
                    chunk_number=i,
                    metadata=doc.metadata,  # Inherit document metadata
                )
                chunk_objects.append(chunk)

            # 6. Store chunks in the vector store
            success = await self.vector_store.store_embeddings(chunk_objects)
            if not success:
                raise Exception("Failed to store chunk embeddings")

            # 7. Store document metadata
            if not await self.db.store_document(doc):
                raise Exception("Failed to store document metadata")

            return doc
        except Exception as e:
            # TODO: Clean up any stored data on failure
            logger.error(f"Document ingestion failed: {str(e)}")
            raise Exception(f"Document ingestion failed: {str(e)}") from e

    async def query(
        self, request: QueryRequest, auth: AuthContext
    ) -> Union[List[ChunkResult], List[DocumentResult]]:
        """Query documents with specified return type."""
        try:
            # 1. Embed the query text
            query_embedding = await self.embedding_model.embed_for_query(request.query)

            # 2. Find documents the caller is authorized to read
            doc_ids = await self.db.find_documents(auth, request.filters)
            if not doc_ids:
                return []

            # 3. Vector-similarity search over chunks, scoped to those documents
            chunks = await self.vector_store.query_similar(
                query_embedding,
                k=request.k,
                auth=auth,
                filters={"document_id": {"$in": doc_ids}},
            )

            # 4. Return results in the requested format
            if request.return_type == QueryReturnType.CHUNKS:
                return await self._create_chunk_results(auth, chunks)
            return await self._create_document_results(auth, chunks)
        except Exception as e:
            logger.error(f"Query failed: {str(e)}")
            raise

    async def _create_chunk_results(
        self, auth: AuthContext, chunks: List[DocumentChunk]
    ) -> List[ChunkResult]:
        """Create ChunkResult objects with document metadata."""
        results = []
        for chunk in chunks:
            # Skip chunks whose parent document is missing or not readable
            doc = await self.db.get_document(chunk.document_id, auth)
            if not doc:
                continue

            # Generate a download URL if the raw file lives in object storage
            download_url = None
            if doc.storage_info:
                download_url = await self.storage.get_download_url(
                    doc.storage_info["bucket"], doc.storage_info["key"]
                )

            results.append(
                ChunkResult(
                    content=chunk.content,
                    score=chunk.score,
                    document_id=chunk.document_id,
                    chunk_number=chunk.chunk_number,
                    metadata=doc.metadata,
                    content_type=doc.content_type,
                    filename=doc.filename,
                    download_url=download_url,
                )
            )
        return results

    async def _create_document_results(
        self, auth: AuthContext, chunks: List[DocumentChunk]
    ) -> List[DocumentResult]:
        """Group chunks by document and create DocumentResult objects."""
        # Keep only the highest-scoring chunk per document
        doc_chunks: Dict[str, DocumentChunk] = {}
        for chunk in chunks:
            if (
                chunk.document_id not in doc_chunks
                or chunk.score > doc_chunks[chunk.document_id].score
            ):
                doc_chunks[chunk.document_id] = chunk

        results = []
        for doc_id, chunk in doc_chunks.items():
            doc = await self.db.get_document(doc_id, auth)
            if not doc:
                continue

            # Build DocumentContent based on content type: inline text for
            # plain-text documents, a download URL for file-backed ones
            if doc.content_type == "text/plain":
                content = DocumentContent(
                    type="string", value=chunk.content, filename=None
                )
            else:
                download_url = await self.storage.get_download_url(
                    doc.storage_info["bucket"], doc.storage_info["key"]
                )
                content = DocumentContent(
                    type="url", value=download_url, filename=doc.filename
                )

            results.append(
                DocumentResult(
                    score=chunk.score,
                    document_id=doc_id,
                    metadata=doc.metadata,
                    content=content,
                )
            )
        return results
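
# Usage sketch (illustrative only): DocumentService is constructed with
# concrete implementations of the base interfaces imported above. The class
# names below are hypothetical placeholders, not classes defined in this
# module, and stand in for whatever subclasses a deployment provides.
#
#     service = DocumentService(
#         database=SomeDatabase(...),          # BaseDatabase subclass
#         vector_store=SomeVectorStore(...),   # BaseVectorStore subclass
#         storage=SomeStorage(...),            # BaseStorage subclass
#         parser=SomeParser(...),              # BaseParser subclass
#         embedding_model=SomeEmbedder(...),   # BaseEmbeddingModel subclass
#     )
#
#     doc = await service.ingest_document(ingest_request, auth)
#     hits = await service.query(query_request, auth)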