morphik-core/core/services/document_service.py


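"""Document ingestion and query service.

Coordinates the parser, embedding model, vector store, object storage, and
metadata database behind a single interface for ingesting text and file
documents and for running similarity queries over their chunks.
"""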
import base64
import logging
from typing import Any, Dict, List, Union

from fastapi import UploadFile

from core.database.base_database import BaseDatabase
from core.embedding_model.base_embedding_model import BaseEmbeddingModel
from core.models.request import IngestTextRequest, QueryRequest
from core.parser.base_parser import BaseParser
from core.storage.base_storage import BaseStorage
from core.vector_store.base_vector_store import BaseVectorStore

from ..models.auth import AuthContext
from ..models.documents import (
    ChunkResult,
    Document,
    DocumentChunk,
    DocumentContent,
    DocumentResult,
    QueryReturnType,
)

logger = logging.getLogger(__name__)


class DocumentService:
    def __init__(
        self,
        database: BaseDatabase,
        vector_store: BaseVectorStore,
        storage: BaseStorage,
        parser: BaseParser,
        embedding_model: BaseEmbeddingModel,
    ):
        self.db = database
        self.vector_store = vector_store
        self.storage = storage
        self.parser = parser
        self.embedding_model = embedding_model

    async def ingest_text(
        self,
        request: IngestTextRequest,
        auth: AuthContext,
    ) -> Document:
        """Ingest a text document."""
        if "write" not in auth.permissions:
            raise PermissionError("User does not have write permission")

        try:
            # 1. Create document record
            doc = Document(
                content_type="text/plain",
                metadata=request.metadata,
                owner={
                    "type": auth.entity_type,
                    "id": auth.entity_id,
                },
                # The ingesting entity starts with full access.
                access_control={
                    "readers": [auth.entity_id],
                    "writers": [auth.entity_id],
                    "admins": [auth.entity_id],
                },
            )
            logger.info(f"Created text document record with ID {doc.external_id}")

            # 2. Parse content into chunks
            chunks = await self.parser.split_text(request.content)
            if not chunks:
                raise ValueError("No content chunks extracted from text")
            logger.info(f"Split text into {len(chunks)} chunks")

            # 3. Generate embeddings for chunks
            embeddings = await self.embedding_model.embed_for_ingestion(chunks)
            logger.info(f"Generated {len(embeddings)} embeddings")

            # 4. Create and store chunk objects
            chunk_objects = self._create_chunk_objects(
                doc.external_id,
                chunks,
                embeddings,
                doc.metadata,
            )
            logger.info(f"Created {len(chunk_objects)} chunk objects")

            # 5. Store everything
            doc.chunk_ids = await self._store_chunks_and_doc(chunk_objects, doc)
            logger.info(f"Successfully stored text document {doc.external_id}")

            return doc
        except Exception as e:
            logger.error(f"Text document ingestion failed: {e}")
            # TODO: Clean up any stored data on failure
            raise

    async def ingest_file(
        self,
        file: UploadFile,
        metadata: Dict[str, Any],
        auth: AuthContext,
    ) -> Document:
        """Ingest a file document."""
        if "write" not in auth.permissions:
            raise PermissionError("User does not have write permission")

        try:
            # 1. Create document record
            doc = Document(
                content_type=file.content_type,
                filename=file.filename,
                metadata=metadata,
                owner={
                    "type": auth.entity_type,
                    "id": auth.entity_id,
                },
                access_control={
                    "readers": [auth.entity_id],
                    "writers": [auth.entity_id],
                    "admins": [auth.entity_id],
                },
            )
            logger.info(f"Created file document record with ID {doc.external_id}")

            # 2. Read and store file
            file_content = await file.read()
            storage_info = await self.storage.upload_from_base64(
                base64.b64encode(file_content).decode(),
                doc.external_id,
                file.content_type,
            )
            doc.storage_info = {
                "bucket": storage_info[0],
                "key": storage_info[1],
            }
            logger.info(
                f"Stored file in bucket `{storage_info[0]}` with key `{storage_info[1]}`"
            )

            # 3. Parse content into chunks
            chunks = await self.parser.parse_file(file_content, file.content_type)
            if not chunks:
                raise ValueError("No content chunks extracted from file")
            logger.info(f"Parsed file into {len(chunks)} chunks")

            # 4. Generate embeddings for chunks
            embeddings = await self.embedding_model.embed_for_ingestion(chunks)
            logger.info(f"Generated {len(embeddings)} embeddings")

            # 5. Create and store chunk objects
            chunk_objects = self._create_chunk_objects(
                doc.external_id,
                chunks,
                embeddings,
                doc.metadata,
            )
            logger.info(f"Created {len(chunk_objects)} chunk objects")

            # 6. Store everything
            doc.chunk_ids = await self._store_chunks_and_doc(chunk_objects, doc)
            logger.info(f"Successfully stored file document {doc.external_id}")

            return doc
        except Exception as e:
            logger.error(f"File document ingestion failed: {e}")
            # TODO: Clean up any stored data on failure
            raise

    async def query(
        self,
        request: QueryRequest,
        auth: AuthContext,
    ) -> Union[List[ChunkResult], List[DocumentResult]]:
        """Query documents with specified return type."""
        try:
            # 1. Get embedding for query
            query_embedding = await self.embedding_model.embed_for_query(request.query)
            logger.info("Generated query embedding")

            # 2. Find authorized documents
            doc_ids = await self.db.find_documents(auth, request.filters)
            if not doc_ids:
                logger.info("No authorized documents found")
                return []
            logger.info(f"Found {len(doc_ids)} authorized documents")

            # 3. Search chunks with vector similarity
            chunks = await self.vector_store.query_similar(
                query_embedding,
                k=request.k,
                auth=auth,
                filters={"document_id": {"$in": doc_ids}},
            )
            logger.info(f"Found {len(chunks)} similar chunks")

            # 4. Return results in requested format
            if request.return_type == QueryReturnType.CHUNKS:
                results = await self._create_chunk_results(auth, chunks)
                logger.info(f"Returning {len(results)} chunk results")
                return results
            else:
                results = await self._create_document_results(auth, chunks)
                logger.info(f"Returning {len(results)} document results")
                return results
        except Exception as e:
            logger.error(f"Query failed: {e}")
            raise

    def _create_chunk_objects(
        self,
        doc_id: str,
        chunks: List[str],
        embeddings: List[List[float]],
        metadata: Dict[str, Any],
    ) -> List[DocumentChunk]:
        """Helper to create chunk objects."""
        return [
            DocumentChunk(
                document_id=doc_id,
                content=content,
                embedding=embedding,
                chunk_number=i,
                metadata=metadata,
            )
            for i, (content, embedding) in enumerate(zip(chunks, embeddings))
        ]

    async def _store_chunks_and_doc(
        self,
        chunk_objects: List[DocumentChunk],
        doc: Document,
    ) -> List[str]:
        """Helper to store chunks and document; returns the stored chunk IDs."""
        # Store chunks in vector store. Note: chunks are written before the
        # document record, so a failed metadata write can leave orphaned
        # embeddings (see the cleanup TODOs in the ingest paths).
        success, result = await self.vector_store.store_embeddings(chunk_objects)
        if not success:
            raise Exception("Failed to store chunk embeddings")
        logger.debug("Stored chunk embeddings in vector store")

        # Store document metadata
        if not await self.db.store_document(doc):
            raise Exception("Failed to store document metadata")
        logger.debug("Stored document metadata in database")

        return [str(chunk_id) for chunk_id in result.inserted_ids]

    async def _create_chunk_results(
        self,
        auth: AuthContext,
        chunks: List[DocumentChunk],
    ) -> List[ChunkResult]:
        """Create ChunkResult objects with document metadata."""
        results = []
        for chunk in chunks:
            # Get document metadata
            doc = await self.db.get_document(chunk.document_id, auth)
            if not doc:
                logger.warning(f"Document {chunk.document_id} not found")
                continue
            logger.debug(f"Retrieved metadata for document {chunk.document_id}")

            # Generate download URL if needed
            download_url = None
            if doc.storage_info:
                download_url = await self.storage.get_download_url(
                    doc.storage_info["bucket"],
                    doc.storage_info["key"],
                )
                logger.debug(f"Generated download URL for document {chunk.document_id}")

            results.append(ChunkResult(
                content=chunk.content,
                score=chunk.score,
                document_id=chunk.document_id,
                chunk_number=chunk.chunk_number,
                metadata=doc.metadata,
                content_type=doc.content_type,
                filename=doc.filename,
                download_url=download_url,
            ))

        logger.info(f"Created {len(results)} chunk results")
        return results

    async def _create_document_results(
        self,
        auth: AuthContext,
        chunks: List[DocumentChunk],
    ) -> List[DocumentResult]:
        """Group chunks by document and create DocumentResult objects."""
        # Group chunks by document, keeping the highest-scoring chunk per doc
        doc_chunks: Dict[str, DocumentChunk] = {}
        for chunk in chunks:
            if (chunk.document_id not in doc_chunks or
                    chunk.score > doc_chunks[chunk.document_id].score):
                doc_chunks[chunk.document_id] = chunk
        logger.info(f"Grouped chunks into {len(doc_chunks)} documents")

        results = []
        for doc_id, chunk in doc_chunks.items():
            # Get document metadata
            doc = await self.db.get_document(doc_id, auth)
            if not doc:
                logger.warning(f"Document {doc_id} not found")
                continue
            logger.debug(f"Retrieved metadata for document {doc_id}")

            # Create DocumentContent based on content type
            if doc.content_type == "text/plain":
                content = DocumentContent(
                    type="string",
                    value=chunk.content,
                    filename=None,
                )
                logger.debug(f"Created text content for document {doc_id}")
            else:
                # Generate download URL for file types
                download_url = await self.storage.get_download_url(
                    doc.storage_info["bucket"],
                    doc.storage_info["key"],
                )
                content = DocumentContent(
                    type="url",
                    value=download_url,
                    filename=doc.filename,
                )
                logger.debug(f"Created URL content for document {doc_id}")

            results.append(DocumentResult(
                score=chunk.score,
                document_id=doc_id,
                metadata=doc.metadata,
                content=content,
            ))

        logger.info(f"Created {len(results)} document results")
        return results
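

# ---------------------------------------------------------------------------
# Illustrative usage sketch, not part of the module. The backend objects are
# hypothetical stand-ins for whatever concrete implementations of the Base*
# interfaces a deployment wires in, and the request fields shown are the ones
# this service actually reads (any other fields are assumed to have defaults).
#
#     service = DocumentService(
#         database=my_database,          # a BaseDatabase implementation
#         vector_store=my_vector_store,  # a BaseVectorStore implementation
#         storage=my_storage,            # a BaseStorage implementation
#         parser=my_parser,              # a BaseParser implementation
#         embedding_model=my_embedder,   # a BaseEmbeddingModel implementation
#     )
#     doc = await service.ingest_text(
#         IngestTextRequest(content="hello world", metadata={"tag": "demo"}),
#         auth,  # an AuthContext whose permissions include "write"
#     )
#     results = await service.query(QueryRequest(query="hello", k=4), auth)
# ---------------------------------------------------------------------------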