"""Service layer coordinating document ingestion, retrieval, and completion."""

import base64
import logging
from typing import Any, Dict, List, Optional

from fastapi import UploadFile

from core.models.request import IngestTextRequest
from ..models.documents import Document, DocumentChunk, ChunkResult, DocumentContent, DocumentResult
from ..models.auth import AuthContext
from core.database.base_database import BaseDatabase
from core.storage.base_storage import BaseStorage
from core.vector_store.base_vector_store import BaseVectorStore
from core.embedding_model.base_embedding_model import BaseEmbeddingModel
from core.parser.base_parser import BaseParser
from core.completion.base_completion import (
    BaseCompletionModel,
    CompletionRequest,
    CompletionResponse,
)

logger = logging.getLogger(__name__)


class DocumentService:
    def __init__(
        self,
        database: BaseDatabase,
        vector_store: BaseVectorStore,
        storage: BaseStorage,
        parser: BaseParser,
        embedding_model: BaseEmbeddingModel,
        completion_model: BaseCompletionModel,
    ):
        self.db = database
        self.vector_store = vector_store
        self.storage = storage
        self.parser = parser
        self.embedding_model = embedding_model
        self.completion_model = completion_model

    async def retrieve_chunks(
        self,
        query: str,
        auth: AuthContext,
        filters: Optional[Dict[str, Any]] = None,
        k: int = 4,
        min_score: float = 0.0,
    ) -> List[ChunkResult]:
        """Retrieve relevant chunks."""
        # Get embedding for query
        query_embedding = await self.embedding_model.embed_for_query(query)
        logger.info("Generated query embedding")

        # Find authorized documents
        doc_ids = await self.db.find_authorized_and_filtered_documents(auth, filters)
        if not doc_ids:
            logger.info("No authorized documents found")
            return []
        logger.info(f"Found {len(doc_ids)} authorized documents")

        # Search chunks with vector similarity
        chunks = await self.vector_store.query_similar(
            query_embedding,
            k=k,
            doc_ids=doc_ids,
        )
        logger.info(f"Found {len(chunks)} similar chunks")

        # Drop chunks below the minimum similarity score
        chunks = [chunk for chunk in chunks if chunk.score >= min_score]

        # Create and return chunk results
        results = await self._create_chunk_results(auth, chunks)
        logger.info(f"Returning {len(results)} chunk results")
        return results

    async def retrieve_docs(
        self,
        query: str,
        auth: AuthContext,
        filters: Optional[Dict[str, Any]] = None,
        k: int = 4,
        min_score: float = 0.0,
    ) -> List[DocumentResult]:
        """Retrieve relevant documents."""
        # Get chunks first
        chunks = await self.retrieve_chunks(query, auth, filters, k, min_score)

        # Convert to document results
        results = await self._create_document_results(auth, chunks)
        logger.info(f"Returning {len(results)} document results")
        return results

    async def query(
        self,
        query: str,
        auth: AuthContext,
        filters: Optional[Dict[str, Any]] = None,
        k: int = 4,
        min_score: float = 0.0,
        max_tokens: Optional[int] = None,
        temperature: Optional[float] = None,
    ) -> CompletionResponse:
        """Generate completion using relevant chunks as context."""
        # Get relevant chunks
        chunks = await self.retrieve_chunks(query, auth, filters, k, min_score)
        chunk_contents = [chunk.content for chunk in chunks]

        # Generate completion
        request = CompletionRequest(
            query=query,
            context_chunks=chunk_contents,
            max_tokens=max_tokens,
            temperature=temperature,
        )
        response = await self.completion_model.complete(request)
        return response

    async def ingest_text(
        self,
        request: IngestTextRequest,
        auth: AuthContext,
    ) -> Document:
        """Ingest a text document."""
        if "write" not in auth.permissions:
            logger.error(f"User {auth.entity_id} does not have write permission")
            raise PermissionError("User does not have write permission")

        # 1. Create document record
        doc = Document(
            content_type="text/plain",
            metadata=request.metadata,
            owner={"type": auth.entity_type, "id": auth.entity_id},
            access_control={
                "readers": [auth.entity_id],
                "writers": [auth.entity_id],
                "admins": [auth.entity_id],
            },
        )
        logger.info(f"Created text document record with ID {doc.external_id}")

        # 2. Parse content into chunks
        chunks = await self.parser.split_text(request.content)
        if not chunks:
            raise ValueError("No content chunks extracted from text")
        logger.info(f"Split text into {len(chunks)} chunks")

        # 3. Generate embeddings for chunks
        embeddings = await self.embedding_model.embed_for_ingestion(chunks)
        logger.info(f"Generated {len(embeddings)} embeddings")

        # 4. Create and store chunk objects
        chunk_objects = self._create_chunk_objects(
            doc.external_id, chunks, embeddings, doc.metadata
        )
        logger.info(f"Created {len(chunk_objects)} chunk objects")

        # 5. Store everything
        await self._store_chunks_and_doc(chunk_objects, doc)
        logger.info(f"Successfully stored text document {doc.external_id}")

        return doc

    async def ingest_file(
        self,
        file: UploadFile,
        metadata: Dict[str, Any],
        auth: AuthContext,
    ) -> Document:
        """Ingest a file document."""
        if "write" not in auth.permissions:
            raise PermissionError("User does not have write permission")

        # 1. Create document record
        doc = Document(
            content_type=file.content_type,
            filename=file.filename,
            metadata=metadata,
            owner={"type": auth.entity_type, "id": auth.entity_id},
            access_control={
                "readers": [auth.entity_id],
                "writers": [auth.entity_id],
                "admins": [auth.entity_id],
            },
        )
        logger.info(f"Created file document record with ID {doc.external_id}")

        # 2. Read and store file
        file_content = await file.read()
        storage_info = await self.storage.upload_from_base64(
            base64.b64encode(file_content).decode(), doc.external_id, file.content_type
        )
        doc.storage_info = {"bucket": storage_info[0], "key": storage_info[1]}
        logger.info(
            f"Stored file in bucket `{storage_info[0]}` with key `{storage_info[1]}`"
        )

        # 3. Parse content into chunks
        chunks = await self.parser.parse_file(file_content, file.content_type)
        if not chunks:
            raise ValueError("No content chunks extracted from file")
        logger.info(f"Parsed file into {len(chunks)} chunks")

        # 4. Generate embeddings for chunks
        embeddings = await self.embedding_model.embed_for_ingestion(chunks)
        logger.info(f"Generated {len(embeddings)} embeddings")

        # 5. Create and store chunk objects
        chunk_objects = self._create_chunk_objects(
            doc.external_id, chunks, embeddings, doc.metadata
        )
        logger.info(f"Created {len(chunk_objects)} chunk objects")

        # 6. Store everything
        doc.chunk_ids = await self._store_chunks_and_doc(chunk_objects, doc)
        logger.info(f"Successfully stored file document {doc.external_id}")

        return doc

    def _create_chunk_objects(
        self,
        doc_id: str,
        chunks: List[str],
        embeddings: List[List[float]],
        metadata: Dict[str, Any],
    ) -> List[DocumentChunk]:
        """Helper to create chunk objects."""
        return [
            DocumentChunk(
                document_id=doc_id,
                content=content,
                embedding=embedding,
                chunk_number=i,
                metadata=metadata,
            )
            for i, (content, embedding) in enumerate(zip(chunks, embeddings))
        ]

    async def _store_chunks_and_doc(
        self, chunk_objects: List[DocumentChunk], doc: Document
    ) -> List[str]:
        """Helper to store chunks and document."""
        # Store chunks in vector store
        success, result = await self.vector_store.store_embeddings(chunk_objects)
        if not success:
            raise Exception("Failed to store chunk embeddings")
        logger.debug("Stored chunk embeddings in vector store")
        doc.chunk_ids = result

        # Store document metadata
        if not await self.db.store_document(doc):
            raise Exception("Failed to store document metadata")
        logger.debug("Stored document metadata in database")
        logger.debug(f"Chunk IDs stored: {result}")
        return result

    async def _create_chunk_results(
        self, auth: AuthContext, chunks: List[DocumentChunk]
    ) -> List[ChunkResult]:
        """Create ChunkResult objects with document metadata."""
        results = []
        for chunk in chunks:
            # Get document metadata
            doc = await self.db.get_document(chunk.document_id, auth)
            if not doc:
                logger.warning(f"Document {chunk.document_id} not found")
                continue
            logger.debug(f"Retrieved metadata for document {chunk.document_id}")

            # Generate download URL if needed
            download_url = None
            if doc.storage_info:
                download_url = await self.storage.get_download_url(
                    doc.storage_info["bucket"], doc.storage_info["key"]
                )
                logger.debug(
                    f"Generated download URL for document {chunk.document_id}"
                )

            results.append(
                ChunkResult(
                    content=chunk.content,
                    score=chunk.score,
                    document_id=chunk.document_id,
                    chunk_number=chunk.chunk_number,
                    metadata=doc.metadata,
                    content_type=doc.content_type,
                    filename=doc.filename,
                    download_url=download_url,
                )
            )

        logger.info(f"Created {len(results)} chunk results")
        return results

    async def _create_document_results(
        self, auth: AuthContext, chunks: List[ChunkResult]
    ) -> List[DocumentResult]:
        """Group chunks by document and create DocumentResult objects."""
        # Group chunks by document and keep the highest-scoring chunk per doc
        doc_chunks: Dict[str, ChunkResult] = {}
        for chunk in chunks:
            if (
                chunk.document_id not in doc_chunks
                or chunk.score > doc_chunks[chunk.document_id].score
            ):
                doc_chunks[chunk.document_id] = chunk
        logger.info(f"Grouped chunks into {len(doc_chunks)} documents")
        logger.debug(f"Document chunks: {doc_chunks}")

        results = []
        for doc_id, chunk in doc_chunks.items():
            # Get document metadata
            doc = await self.db.get_document(doc_id, auth)
            if not doc:
                logger.warning(f"Document {doc_id} not found")
                continue
            logger.info(f"Retrieved metadata for document {doc_id}")

            # Create DocumentContent based on content type
            if doc.content_type == "text/plain":
                content = DocumentContent(
                    type="string",
                    value=chunk.content,
                    filename=None,
                )
                logger.debug(f"Created text content for document {doc_id}")
            else:
                # Generate download URL for file types
                download_url = await self.storage.get_download_url(
                    doc.storage_info["bucket"], doc.storage_info["key"]
                )
                content = DocumentContent(
                    type="url",
                    value=download_url,
                    filename=doc.filename,
                )
                logger.debug(f"Created URL content for document {doc_id}")

            results.append(
                DocumentResult(
                    score=chunk.score,
                    document_id=doc_id,
                    metadata=doc.metadata,
                    content=content,
                )
            )

        logger.info(f"Created {len(results)} document results")
        return results
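

# Usage sketch (illustrative only): DocumentService is constructed with concrete
# implementations of the Base* interfaces imported above. The variable names
# below are hypothetical placeholders, not objects defined in this repo; `auth`
# must be an AuthContext whose permissions include "write" for ingestion.
#
#     service = DocumentService(
#         database=my_database,            # any BaseDatabase implementation
#         vector_store=my_vector_store,    # any BaseVectorStore implementation
#         storage=my_storage,              # any BaseStorage implementation
#         parser=my_parser,                # any BaseParser implementation
#         embedding_model=my_embedder,     # any BaseEmbeddingModel implementation
#         completion_model=my_completer,   # any BaseCompletionModel implementation
#     )
#
#     # Ingest first, then retrieve or answer questions over the stored content:
#     doc = await service.ingest_text(
#         IngestTextRequest(content="Hello world", metadata={}), auth
#     )
#     response = await service.query("What does the document say?", auth, k=4)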