import asyncio
import base64
import logging
import os
import tempfile
from datetime import UTC, datetime
from io import BytesIO
from typing import Any, Dict, List, Optional, Type, Union

import filetype
import pdf2image
import torch
from colpali_engine.models import ColIdefics3, ColIdefics3Processor
from fastapi import UploadFile
from filetype.types import IMAGE  # , DOCUMENT, document
from PIL.Image import Image
from pydantic import BaseModel

from core.cache.base_cache import BaseCache
from core.cache.base_cache_factory import BaseCacheFactory
from core.completion.base_completion import BaseCompletionModel
from core.config import get_settings
from core.database.base_database import BaseDatabase
from core.embedding.base_embedding_model import BaseEmbeddingModel
from core.embedding.colpali_embedding_model import ColpaliEmbeddingModel
from core.models.chunk import Chunk, DocumentChunk
from core.models.completion import ChunkSource, CompletionRequest, CompletionResponse
from core.models.documents import ChunkResult, Document, DocumentContent, DocumentResult, StorageFileInfo
from core.models.prompts import GraphPromptOverrides, QueryPromptOverrides
from core.parser.base_parser import BaseParser
from core.reranker.base_reranker import BaseReranker
from core.services.graph_service import GraphService
from core.services.rules_processor import RulesProcessor
from core.storage.base_storage import BaseStorage
from core.vector_store.base_vector_store import BaseVectorStore
from core.vector_store.multi_vector_store import MultiVectorStore

from ..models.auth import AuthContext
from ..models.folders import Folder
from ..models.graph import Graph

logger = logging.getLogger(__name__)

IMAGE = {im.mime for im in IMAGE}

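# Rough heuristics (assumed averages) used to estimate page counts for the
# ingest limit checks in ingest_text: ~4 characters per token and ~630 tokens per page.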
CHARS_PER_TOKEN = 4
TOKENS_PER_PAGE = 630


class DocumentService:
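    """Coordinates document ingestion, retrieval, and completion.

    Wires together the database, vector store(s), storage, parser, embedding
    model(s), and optional completion/reranking/colpali components.

    Minimal illustrative usage inside an async context (assuming concrete
    implementations for each dependency are available):

        service = DocumentService(
            database=database,
            vector_store=vector_store,
            storage=storage,
            parser=parser,
            embedding_model=embedding_model,
            completion_model=completion_model,
        )
        chunks = await service.retrieve_chunks("query text", auth, k=5)
    """
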
    async def _ensure_folder_exists(self, folder_name: str, document_id: str, auth: AuthContext) -> Optional[Folder]:
        """
        Check if a folder exists; if not, create it. Also add the document to the folder.

        Args:
            folder_name: Name of the folder
            document_id: ID of the document to add to the folder
            auth: Authentication context

        Returns:
            Folder object if found or created, None on error
        """
        try:
            # First check if the folder already exists
            folder = await self.db.get_folder_by_name(folder_name, auth)
            if folder:
                # Add document to existing folder
                if document_id not in folder.document_ids:
                    success = await self.db.add_document_to_folder(folder.id, document_id, auth)
                    if not success:
                        logger.warning(f"Failed to add document {document_id} to existing folder {folder.name}")
                return folder  # Folder already exists

            # Create a new folder
            folder = Folder(
                name=folder_name,
                owner={
                    "type": auth.entity_type.value,
                    "id": auth.entity_id,
                },
                document_ids=[document_id],  # Add document_id to the new folder
            )

            await self.db.create_folder(folder)
            return folder

        except Exception as e:
            # Log error but don't raise - we want document ingestion to continue even if folder creation fails
            logger.error(f"Error ensuring folder exists: {e}")
            return None

    def __init__(
        self,
        database: BaseDatabase,
        vector_store: BaseVectorStore,
        storage: BaseStorage,
        parser: BaseParser,
        embedding_model: BaseEmbeddingModel,
        completion_model: Optional[BaseCompletionModel] = None,
        cache_factory: Optional[BaseCacheFactory] = None,
        reranker: Optional[BaseReranker] = None,
        enable_colpali: bool = False,
        colpali_embedding_model: Optional[ColpaliEmbeddingModel] = None,
        colpali_vector_store: Optional[MultiVectorStore] = None,
    ):
        self.db = database
        self.vector_store = vector_store
        self.storage = storage
        self.parser = parser
        self.embedding_model = embedding_model
        self.completion_model = completion_model
        self.reranker = reranker
        self.cache_factory = cache_factory
        self.rules_processor = RulesProcessor()
        self.colpali_embedding_model = colpali_embedding_model
        self.colpali_vector_store = colpali_vector_store

        # Initialize the graph service only if completion_model is provided
        # (e.g., not needed for ingestion worker)
        if completion_model is not None:
            self.graph_service = GraphService(
                db=database,
                embedding_model=embedding_model,
                completion_model=completion_model,
            )
        else:
            self.graph_service = None

        # MultiVectorStore initialization is now handled in the FastAPI startup event
        # so we don't need to initialize it here again

        # Cache-related data structures
        # Maps cache name to active cache object
        self.active_caches: Dict[str, BaseCache] = {}

        # Store for aggregated metadata from chunk rules
        self._last_aggregated_metadata: Dict[str, Any] = {}

    async def retrieve_chunks(
        self,
        query: str,
        auth: AuthContext,
        filters: Optional[Dict[str, Any]] = None,
        k: int = 5,
        min_score: float = 0.0,
        use_reranking: Optional[bool] = None,
        use_colpali: Optional[bool] = None,
        folder_name: Optional[str] = None,
        end_user_id: Optional[str] = None,
    ) -> List[ChunkResult]:
        """Retrieve the chunks most relevant to the query that the caller is authorized to see."""

        # 4 configurations:
        # 1. No reranking, no colpali -> just return regular chunks
        # 2. No reranking, colpali -> return colpali chunks + regular chunks - no need to run smaller colpali model
        # 3. Reranking, no colpali -> sort regular chunks by re-ranker score
        # 4. Reranking, colpali -> return merged chunks sorted by smaller colpali model score

        settings = get_settings()
        should_rerank = use_reranking if use_reranking is not None else settings.USE_RERANKING
        using_colpali = use_colpali if use_colpali is not None else False

        # Launch embedding queries concurrently
        embedding_tasks = [self.embedding_model.embed_for_query(query)]
        if using_colpali and self.colpali_embedding_model:
            embedding_tasks.append(self.colpali_embedding_model.embed_for_query(query))

        # Build system filters for folder_name and end_user_id
        system_filters = {}
        if folder_name:
            system_filters["folder_name"] = folder_name
        if end_user_id:
            system_filters["end_user_id"] = end_user_id

        # Run embeddings and document authorization in parallel
        results = await asyncio.gather(
            asyncio.gather(*embedding_tasks),
            self.db.find_authorized_and_filtered_documents(auth, filters, system_filters),
        )

        embedding_results, doc_ids = results
        query_embedding_regular = embedding_results[0]
        query_embedding_multivector = embedding_results[1] if len(embedding_results) > 1 else None

        logger.info("Generated query embedding")

        if not doc_ids:
            logger.info("No authorized documents found")
            return []
        logger.info(f"Found {len(doc_ids)} authorized documents")

        # Check if we're using colpali multivector search
        search_multi = using_colpali and self.colpali_vector_store and query_embedding_multivector is not None

        # For regular reranking (without colpali), we'll use the existing reranker if available
        # For colpali reranking, we'll handle it in _combine_multi_and_regular_chunks
        use_standard_reranker = should_rerank and (not search_multi) and self.reranker is not None

        # Search chunks with vector similarity in parallel
        # When using the standard reranker, we fetch more chunks initially to improve reranking quality
        search_tasks = [
            self.vector_store.query_similar(
                query_embedding_regular, k=10 * k if use_standard_reranker else k, doc_ids=doc_ids
            )
        ]

        if search_multi:
            search_tasks.append(
                self.colpali_vector_store.query_similar(query_embedding_multivector, k=k, doc_ids=doc_ids)
            )

        search_results = await asyncio.gather(*search_tasks)
        chunks = search_results[0]
        chunks_multivector = search_results[1] if len(search_results) > 1 else []

        logger.debug(f"Found {len(chunks)} similar chunks via regular embedding")
        if using_colpali:
            logger.debug(
                f"Found {len(chunks_multivector)} similar chunks via multivector embedding "
                "since we are also using colpali"
            )

        # Rerank chunks using the standard reranker if enabled and available
        # This handles configuration 3: Reranking without colpali
        if chunks and use_standard_reranker:
            chunks = await self.reranker.rerank(query, chunks)
            chunks.sort(key=lambda x: x.score, reverse=True)
            chunks = chunks[:k]
            logger.debug(f"Reranked {10 * k} chunks and selected the top {k}")

        # Combine multiple chunk sources if needed
        chunks = await self._combine_multi_and_regular_chunks(
            query, chunks, chunks_multivector, should_rerank=should_rerank
        )

        # Create and return chunk results
        results = await self._create_chunk_results(auth, chunks)
        logger.info(f"Returning {len(results)} chunk results")
        return results

    async def _combine_multi_and_regular_chunks(
        self,
        query: str,
        chunks: List[DocumentChunk],
        chunks_multivector: List[DocumentChunk],
        should_rerank: Optional[bool] = None,
    ):
        """Combine and potentially rerank regular and colpali chunks based on configuration.

        # 4 configurations:
        # 1. No reranking, no colpali -> just return regular chunks - this already happens upstream, correctly
        # 2. No reranking, colpali -> return colpali chunks + regular chunks - no need to run smaller colpali model
        # 3. Reranking, no colpali -> sort regular chunks by re-ranker score - this already happens upstream, correctly
        # 4. Reranking, colpali -> return merged chunks sorted by smaller colpali model score

        Args:
            query: The user query
            chunks: Regular chunks with embeddings
            chunks_multivector: Colpali multi-vector chunks
            should_rerank: Whether reranking is enabled
        """
        # Handle simple cases first
        if len(chunks_multivector) == 0:
            return chunks
        if len(chunks) == 0:
            return chunks_multivector

        # Use global setting if not provided
        if should_rerank is None:
            settings = get_settings()
            should_rerank = settings.USE_RERANKING

        # Check if we need to run the reranking - if reranking is disabled, we just combine the chunks
        # This is Configuration 2: No reranking, with colpali
        if not should_rerank:
            # For configuration 2, simply combine the chunks with multivector chunks first
            # since they are generally higher quality
            logger.debug("Using configuration 2: No reranking, with colpali - combining chunks without rescoring")
            combined_chunks = chunks_multivector + chunks
            return combined_chunks

        # Configuration 4: Reranking with colpali
        # Use colpali as a reranker to get consistent similarity scores for both types of chunks
        logger.debug("Using configuration 4: Reranking with colpali - rescoring chunks with colpali model")

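        # NOTE: the colSmol scoring model and its processor are loaded from scratch
        # on every call to this method; this keeps the service stateless but adds
        # per-query latency whenever configuration 4 is active.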
        model_name = "vidore/colSmol-256M"
        device = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"

        model = ColIdefics3.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
            device_map=device,  # "cuda:0", # or "mps" if on Apple Silicon
            attn_implementation="eager",  # "flash_attention_2" if is_flash_attn_2_available() else None,
            # or "eager" if "mps"
        ).eval()
        processor = ColIdefics3Processor.from_pretrained(model_name)

        # Score regular chunks with colpali model for consistent comparison
        batch_chunks = processor.process_queries([chunk.content for chunk in chunks]).to(device)
        query_rep = processor.process_queries([query]).to(device)
        multi_vec_representations = model(**batch_chunks)
        query_rep = model(**query_rep)
        scores = processor.score_multi_vector(query_rep, multi_vec_representations)
        for chunk, score in zip(chunks, scores[0]):
            chunk.score = score

        # Combine and sort all chunks
        full_chunks = chunks + chunks_multivector
        full_chunks.sort(key=lambda x: x.score, reverse=True)
        return full_chunks

    async def retrieve_docs(
        self,
        query: str,
        auth: AuthContext,
        filters: Optional[Dict[str, Any]] = None,
        k: int = 5,
        min_score: float = 0.0,
        use_reranking: Optional[bool] = None,
        use_colpali: Optional[bool] = None,
        folder_name: Optional[str] = None,
        end_user_id: Optional[str] = None,
    ) -> List[DocumentResult]:
        """Retrieve relevant documents."""
        # Get chunks first
        chunks = await self.retrieve_chunks(
            query, auth, filters, k, min_score, use_reranking, use_colpali, folder_name, end_user_id
        )
        # Convert to document results
        results = await self._create_document_results(auth, chunks)
        documents = list(results.values())
        logger.info(f"Returning {len(documents)} document results")
        return documents

    async def batch_retrieve_documents(
        self,
        document_ids: List[str],
        auth: AuthContext,
        folder_name: Optional[str] = None,
        end_user_id: Optional[str] = None,
    ) -> List[Document]:
        """
        Retrieve multiple documents by their IDs in a single batch operation.

        Args:
            document_ids: List of document IDs to retrieve
            auth: Authentication context
            folder_name: Optional folder to scope the operation to
            end_user_id: Optional end-user ID to scope the operation to

        Returns:
            List of Document objects that the user has access to
        """
        if not document_ids:
            return []

        # Build system filters for folder_name and end_user_id
        system_filters = {}
        if folder_name:
            system_filters["folder_name"] = folder_name
        if end_user_id:
            system_filters["end_user_id"] = end_user_id

        # Use the database's batch retrieval method
        documents = await self.db.get_documents_by_id(document_ids, auth, system_filters)
        logger.info(f"Batch retrieved {len(documents)} documents out of {len(document_ids)} requested")
        return documents

    async def batch_retrieve_chunks(
        self,
        chunk_ids: List[ChunkSource],
        auth: AuthContext,
        folder_name: Optional[str] = None,
        end_user_id: Optional[str] = None,
        use_colpali: Optional[bool] = None,
    ) -> List[ChunkResult]:
        """
        Retrieve specific chunks by their document ID and chunk number in a single batch operation.

        Args:
            chunk_ids: List of ChunkSource objects with document_id and chunk_number
            auth: Authentication context
            folder_name: Optional folder to scope the operation to
            end_user_id: Optional end-user ID to scope the operation to
            use_colpali: Whether to use colpali multimodal features for image chunks

        Returns:
            List of ChunkResult objects
        """
        if not chunk_ids:
            return []

        # Collect unique document IDs to check authorization in a single query
        doc_ids = list({source.document_id for source in chunk_ids})

        # Find authorized documents in a single query
        authorized_docs = await self.batch_retrieve_documents(doc_ids, auth, folder_name, end_user_id)
        authorized_doc_ids = {doc.external_id for doc in authorized_docs}

        # Filter sources to only include authorized documents
        authorized_sources = [source for source in chunk_ids if source.document_id in authorized_doc_ids]

        if not authorized_sources:
            return []

        # Create list of (document_id, chunk_number) tuples for vector store query
        chunk_identifiers = [(source.document_id, source.chunk_number) for source in authorized_sources]

        # Set up vector store retrieval tasks
        retrieval_tasks = [self.vector_store.get_chunks_by_id(chunk_identifiers)]

        # Add colpali vector store task if needed
        if use_colpali and self.colpali_vector_store:
            logger.info("Preparing to retrieve chunks from both regular and colpali vector stores")
            retrieval_tasks.append(self.colpali_vector_store.get_chunks_by_id(chunk_identifiers))

        # Execute vector store retrievals in parallel
        try:
            vector_results = await asyncio.gather(*retrieval_tasks, return_exceptions=True)

            # Process regular chunks
            chunks = vector_results[0] if not isinstance(vector_results[0], Exception) else []

            # Process colpali chunks if available
            if len(vector_results) > 1 and not isinstance(vector_results[1], Exception):
                colpali_chunks = vector_results[1]

                if colpali_chunks:
                    # Create a dictionary of (doc_id, chunk_number) -> chunk for fast lookup
                    chunk_dict = {(c.document_id, c.chunk_number): c for c in chunks}

                    logger.debug(f"Found {len(colpali_chunks)} chunks in colpali store")
                    for colpali_chunk in colpali_chunks:
                        key = (colpali_chunk.document_id, colpali_chunk.chunk_number)
                        # Replace chunks with colpali chunks when available
                        chunk_dict[key] = colpali_chunk

                    # Update chunks list with the combined/replaced chunks
                    chunks = list(chunk_dict.values())
                    logger.info(f"Enhanced {len(colpali_chunks)} chunks with colpali/multimodal data")

            # Handle any exceptions that occurred during retrieval
            for i, result in enumerate(vector_results):
                if isinstance(result, Exception):
                    store_type = "regular" if i == 0 else "colpali"
                    logger.error(f"Error retrieving chunks from {store_type} vector store: {result}", exc_info=True)
                    if i == 0:  # If regular store failed, we can't proceed
                        return []

        except Exception as e:
            logger.error(f"Error during parallel chunk retrieval: {e}", exc_info=True)
            return []

        # Convert to chunk results
        results = await self._create_chunk_results(auth, chunks)
        logger.info(f"Batch retrieved {len(results)} chunks out of {len(chunk_ids)} requested")
        return results

    async def query(
        self,
        query: str,
        auth: AuthContext,
        filters: Optional[Dict[str, Any]] = None,
        k: int = 20,  # from contextual embedding paper
        min_score: float = 0.0,
        max_tokens: Optional[int] = None,
        temperature: Optional[float] = None,
        use_reranking: Optional[bool] = None,
        use_colpali: Optional[bool] = None,
        graph_name: Optional[str] = None,
        hop_depth: int = 1,
        include_paths: bool = False,
        prompt_overrides: Optional["QueryPromptOverrides"] = None,
        folder_name: Optional[str] = None,
        end_user_id: Optional[str] = None,
        schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
    ) -> CompletionResponse:
        """Generate completion using relevant chunks as context.

        When graph_name is provided, the query will leverage the knowledge graph
        to enhance retrieval by finding relevant entities and their connected documents.

        Args:
            query: The query text
            auth: Authentication context
            filters: Optional metadata filters for documents
            k: Number of chunks to retrieve
            min_score: Minimum similarity score
            max_tokens: Maximum tokens for completion
            temperature: Temperature for completion
            use_reranking: Whether to use reranking
            use_colpali: Whether to use colpali embedding
            graph_name: Optional name of the graph to use for knowledge graph-enhanced retrieval
            hop_depth: Number of relationship hops to traverse in the graph (1-3)
            include_paths: Whether to include relationship paths in the response
            prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
            folder_name: Optional folder to scope the operation to
            end_user_id: Optional end-user ID to scope the operation to
            schema: Optional schema for structured output
        """
        if graph_name:
            # Use knowledge graph enhanced retrieval via GraphService
            return await self.graph_service.query_with_graph(
                query=query,
                graph_name=graph_name,
                auth=auth,
                document_service=self,
                filters=filters,
                k=k,
                min_score=min_score,
                max_tokens=max_tokens,
                temperature=temperature,
                use_reranking=use_reranking,
                use_colpali=use_colpali,
                hop_depth=hop_depth,
                include_paths=include_paths,
                prompt_overrides=prompt_overrides,
                folder_name=folder_name,
                end_user_id=end_user_id,
            )

        # Standard retrieval without graph
        chunks = await self.retrieve_chunks(
            query, auth, filters, k, min_score, use_reranking, use_colpali, folder_name, end_user_id
        )
        documents = await self._create_document_results(auth, chunks)

        # Create augmented chunk contents
        chunk_contents = [chunk.augmented_content(documents[chunk.document_id]) for chunk in chunks]

        # Collect sources information
        sources = [
            ChunkSource(document_id=chunk.document_id, chunk_number=chunk.chunk_number, score=chunk.score)
            for chunk in chunks
        ]

        # Generate completion with prompt override if provided
        custom_prompt_template = None
        if prompt_overrides and prompt_overrides.query:
            custom_prompt_template = prompt_overrides.query.prompt_template

        request = CompletionRequest(
            query=query,
            context_chunks=chunk_contents,
            max_tokens=max_tokens,
            temperature=temperature,
            prompt_template=custom_prompt_template,
            schema=schema,
        )

        response = await self.completion_model.complete(request)

        # Add sources information at the document service level
        response.sources = sources

        return response

    async def ingest_text(
        self,
        content: str,
        filename: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        auth: AuthContext = None,
        rules: Optional[List[str]] = None,
        use_colpali: Optional[bool] = None,
        folder_name: Optional[str] = None,
        end_user_id: Optional[str] = None,
    ) -> Document:
        """Ingest a text document."""
        if "write" not in auth.permissions:
            logger.error(f"User {auth.entity_id} does not have write permission")
            raise PermissionError("User does not have write permission")

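        # Overall flow: create the document record, check ingest limits (cloud mode),
        # apply document-level rules, chunk the content, apply chunk-level rules,
        # embed the chunks (plus colpali embeddings if enabled), store everything,
        # and finally mark the document as completed.
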
        # First check ingest limits if in cloud mode
        from core.config import get_settings

        settings = get_settings()

        doc = Document(
            content_type="text/plain",
            filename=filename,
            metadata=metadata or {},
            owner={"type": auth.entity_type, "id": auth.entity_id},
            access_control={
                "readers": [auth.entity_id],
                "writers": [auth.entity_id],
                "admins": [auth.entity_id],
                "user_id": (
                    [auth.user_id] if auth.user_id else []
                ),  # Add user_id to access control for filtering (as a list)
            },
        )

        # Add folder_name and end_user_id to system_metadata if provided
        if folder_name:
            doc.system_metadata["folder_name"] = folder_name

            # Check if the folder exists, if not create it
            await self._ensure_folder_exists(folder_name, doc.external_id, auth)

        if end_user_id:
            doc.system_metadata["end_user_id"] = end_user_id
        logger.debug(f"Created text document record with ID {doc.external_id}")

        if settings.MODE == "cloud" and auth.user_id:
            # Check limits before proceeding
            from core.api import check_and_increment_limits

            num_pages = int(len(content) / (CHARS_PER_TOKEN * TOKENS_PER_PAGE))
            await check_and_increment_limits(auth, "ingest", num_pages, doc.external_id)

        # === Apply post_parsing rules ===
        document_rule_metadata = {}
        if rules:
            logger.info("Applying post-parsing rules...")
            document_rule_metadata, content = await self.rules_processor.process_document_rules(content, rules)
            # Update document metadata with extracted metadata from rules
            metadata = metadata or {}  # Guard against a None metadata argument
            metadata.update(document_rule_metadata)
            doc.metadata = metadata  # Update doc metadata after rules
            logger.info(f"Document metadata after post-parsing rules: {metadata}")
            logger.info(f"Content length after post-parsing rules: {len(content)}")

        # Store full content before chunking
        doc.system_metadata["content"] = content

        # Split text into chunks
        parsed_chunks = await self.parser.split_text(content)
        if not parsed_chunks:
            raise ValueError("No content chunks extracted after rules processing")
        logger.debug(f"Split processed text into {len(parsed_chunks)} chunks")

        # === Apply post_chunking rules and aggregate metadata ===
        processed_chunks = []
        aggregated_chunk_metadata: Dict[str, Any] = {}  # Initialize dict for aggregated metadata
        chunk_contents = []  # Initialize list to collect chunk contents efficiently

        if rules:
            logger.info("Applying post-chunking rules...")

            for chunk_obj in parsed_chunks:
                # Get metadata *and* the potentially modified chunk
                chunk_rule_metadata, processed_chunk = await self.rules_processor.process_chunk_rules(chunk_obj, rules)
                processed_chunks.append(processed_chunk)
                chunk_contents.append(processed_chunk.content)  # Collect content as we process
                # Aggregate the metadata extracted from this chunk
                aggregated_chunk_metadata.update(chunk_rule_metadata)
            logger.info(f"Finished applying post-chunking rules to {len(processed_chunks)} chunks.")
            logger.info(f"Aggregated metadata from all chunks: {aggregated_chunk_metadata}")

            # Update the document content with the stitched content from processed chunks
            if processed_chunks:
                logger.info("Updating document content with processed chunks...")
                stitched_content = "\n".join(chunk_contents)
                doc.system_metadata["content"] = stitched_content
                logger.info(f"Updated document content with stitched chunks (length: {len(stitched_content)})")
        else:
            processed_chunks = parsed_chunks  # No rules, use original chunks

        # Generate embeddings for processed chunks
        embeddings = await self.embedding_model.embed_for_ingestion(processed_chunks)
        logger.debug(f"Generated {len(embeddings)} embeddings")

        # Create chunk objects with processed chunk content
        chunk_objects = self._create_chunk_objects(doc.external_id, processed_chunks, embeddings)
        logger.debug(f"Created {len(chunk_objects)} chunk objects")

        chunk_objects_multivector = []

        if use_colpali and self.colpali_embedding_model:
            embeddings_multivector = await self.colpali_embedding_model.embed_for_ingestion(processed_chunks)
            logger.info(f"Generated {len(embeddings_multivector)} embeddings for multivector embedding")
            chunk_objects_multivector = self._create_chunk_objects(
                doc.external_id, processed_chunks, embeddings_multivector
            )
            logger.info(f"Created {len(chunk_objects_multivector)} chunk objects for multivector embedding")

        # Create and store chunk objects

        # === Merge aggregated chunk metadata into document metadata ===
        if aggregated_chunk_metadata:
            logger.info("Merging aggregated chunk metadata into document metadata...")
            # Make sure doc.metadata exists
            if not hasattr(doc, "metadata") or doc.metadata is None:
                doc.metadata = {}
            doc.metadata.update(aggregated_chunk_metadata)
            logger.info(f"Final document metadata after merge: {doc.metadata}")
        # ===========================================================

        # Store everything
        await self._store_chunks_and_doc(chunk_objects, doc, use_colpali, chunk_objects_multivector)
        logger.debug(f"Successfully stored text document {doc.external_id}")

        # Update the document status to completed after successful storage
        # This matches the behavior in ingestion_worker.py
        doc.system_metadata["status"] = "completed"
        doc.system_metadata["updated_at"] = datetime.now(UTC)
        await self.db.update_document(
            document_id=doc.external_id, updates={"system_metadata": doc.system_metadata}, auth=auth
        )
        logger.debug(f"Updated document status to 'completed' for {doc.external_id}")

        return doc

    def img_to_base64_str(self, img: Image):
        """Encode a PIL image as a PNG data URI (base64) string."""
        buffered = BytesIO()
        img.save(buffered, format="PNG")
        buffered.seek(0)
        img_byte = buffered.getvalue()
        img_str = "data:image/png;base64," + base64.b64encode(img_byte).decode()
        return img_str

    def _create_chunks_multivector(self, file_type, file_content_base64: str, file_content: bytes, chunks: List[Chunk]):
        # Handle the case where file_type is None
        mime_type = file_type.mime if file_type is not None else "text/plain"
        logger.info(f"Creating chunks for multivector embedding for file type {mime_type}")

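        # Strategy: images become a single base64 image chunk; PDFs are rasterized
        # into one image chunk per page; Word documents are converted to PDF via
        # LibreOffice and then rasterized; anything else falls back to the regular
        # text chunks with is_image=False.
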
        # If file_type is None, treat it as a text file
        if file_type is None:
            logger.info("File type is None, treating as text")
            return [Chunk(content=chunk.content, metadata=(chunk.metadata | {"is_image": False})) for chunk in chunks]

        match mime_type:
            case file_type if file_type in IMAGE:
                return [Chunk(content=file_content_base64, metadata={"is_image": True})]
            case "application/pdf":
                logger.info("Working with PDF file!")
                images = pdf2image.convert_from_bytes(file_content)
                images_b64 = [self.img_to_base64_str(image) for image in images]
                return [Chunk(content=image_b64, metadata={"is_image": True}) for image_b64 in images_b64]
            case "application/vnd.openxmlformats-officedocument.wordprocessingml.document" | "application/msword":
                logger.info("Working with Word document!")
                # Check if file content is empty
                if not file_content or len(file_content) == 0:
                    logger.error("Word document content is empty")
                    return [
                        Chunk(content=chunk.content, metadata=(chunk.metadata | {"is_image": False}))
                        for chunk in chunks
                    ]

                # Convert Word document to PDF first
                with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as temp_docx:
                    temp_docx.write(file_content)
                    temp_docx_path = temp_docx.name

                with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
                    temp_pdf_path = temp_pdf.name

                try:
                    # Convert Word to PDF
                    import subprocess

                    # Get the base filename without extension
                    base_filename = os.path.splitext(os.path.basename(temp_docx_path))[0]
                    output_dir = os.path.dirname(temp_pdf_path)
                    expected_pdf_path = os.path.join(output_dir, f"{base_filename}.pdf")

                    result = subprocess.run(
                        [
                            "soffice",
                            "--headless",
                            "--convert-to",
                            "pdf",
                            "--outdir",
                            output_dir,
                            temp_docx_path,
                        ],
                        capture_output=True,
                        text=True,
                    )

                    if result.returncode != 0:
                        logger.error(f"Failed to convert Word to PDF: {result.stderr}")
                        return [
                            Chunk(
                                content=chunk.content,
                                metadata=(chunk.metadata | {"is_image": False}),
                            )
                            for chunk in chunks
                        ]

                    # LibreOffice creates the PDF with the same base name in the output directory
                    # Check if the expected PDF file exists
                    if not os.path.exists(expected_pdf_path) or os.path.getsize(expected_pdf_path) == 0:
                        logger.error(f"Generated PDF is empty or doesn't exist at expected path: {expected_pdf_path}")
                        return [
                            Chunk(
                                content=chunk.content,
                                metadata=(chunk.metadata | {"is_image": False}),
                            )
                            for chunk in chunks
                        ]

                    # Now process the PDF using the correct path
                    with open(expected_pdf_path, "rb") as pdf_file:
                        pdf_content = pdf_file.read()

                    try:
                        images = pdf2image.convert_from_bytes(pdf_content)
                        if not images:
                            logger.warning("No images extracted from PDF")
                            return [
                                Chunk(
                                    content=chunk.content,
                                    metadata=(chunk.metadata | {"is_image": False}),
                                )
                                for chunk in chunks
                            ]

                        images_b64 = [self.img_to_base64_str(image) for image in images]
                        return [Chunk(content=image_b64, metadata={"is_image": True}) for image_b64 in images_b64]
                    except Exception as pdf_error:
                        logger.error(f"Error converting PDF to images: {str(pdf_error)}")
                        return [
                            Chunk(
                                content=chunk.content,
                                metadata=(chunk.metadata | {"is_image": False}),
                            )
                            for chunk in chunks
                        ]
                except Exception as e:
                    logger.error(f"Error processing Word document: {str(e)}")
                    return [
                        Chunk(content=chunk.content, metadata=(chunk.metadata | {"is_image": False}))
                        for chunk in chunks
                    ]
                finally:
                    # Clean up temporary files
                    if os.path.exists(temp_docx_path):
                        os.unlink(temp_docx_path)
                    if os.path.exists(temp_pdf_path):
                        os.unlink(temp_pdf_path)
                    # Also clean up the expected PDF path if it exists and is different from temp_pdf_path
                    if (
                        "expected_pdf_path" in locals()
                        and os.path.exists(expected_pdf_path)
                        and expected_pdf_path != temp_pdf_path
                    ):
                        os.unlink(expected_pdf_path)

            # case filetype.get_type(ext="txt"):
            #     logger.info(f"Found text input: chunks for multivector embedding")
            #     return chunks.copy()
            # TODO: Add support for office documents
            # case document.Xls | document.Xlsx | document.Ods | document.Odp:
            #     logger.warning(f"Colpali is not supported for file type {file_type.mime} - skipping")
            # case file_type if file_type in DOCUMENT:
            #     pass
            case _:
                logger.warning(f"Colpali is not supported for file type {file_type.mime} - skipping")
                return [
                    Chunk(content=chunk.content, metadata=(chunk.metadata | {"is_image": False})) for chunk in chunks
                ]

    def _create_chunk_objects(
        self,
        doc_id: str,
        chunks: List[Chunk],
        embeddings: List[List[float]],
    ) -> List[DocumentChunk]:
        """Helper to create chunk objects.

        Note: folder_name and end_user_id are not needed in chunk metadata because:
        1. Filtering by these values happens at the document level in find_authorized_and_filtered_documents
        2. Vector search is only performed on already authorized and filtered documents
        3. This approach is more efficient as it reduces the size of chunk metadata
        """
        return [
            c.to_document_chunk(chunk_number=i, embedding=embedding, document_id=doc_id)
            for i, (embedding, c) in enumerate(zip(embeddings, chunks))
        ]

    async def _store_chunks_and_doc(
        self,
        chunk_objects: List[DocumentChunk],
        doc: Document,
        use_colpali: bool = False,
        chunk_objects_multivector: Optional[List[DocumentChunk]] = None,
        is_update: bool = False,
        auth: Optional[AuthContext] = None,
    ) -> List[str]:
        """Helper to store chunks and document"""
        # Add retry logic for vector store operations
        max_retries = 3
        retry_delay = 1.0

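        # Only transient database connection errors are retried (with exponential
        # backoff); any other exception is re-raised immediately.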
        # Helper function to store embeddings with retry
        async def store_with_retry(store, objects, store_name="regular"):
            attempt = 0
            success = False
            result = None
            current_retry_delay = retry_delay

            while attempt < max_retries and not success:
                try:
                    success, result = await store.store_embeddings(objects)
                    if not success:
                        raise Exception(f"Failed to store {store_name} chunk embeddings")
                    return result
                except Exception as e:
                    attempt += 1
                    error_msg = str(e)
                    if "connection was closed" in error_msg or "ConnectionDoesNotExistError" in error_msg:
                        if attempt < max_retries:
                            logger.warning(
                                f"Database connection error during {store_name} embeddings storage "
                                f"(attempt {attempt}/{max_retries}): {error_msg}. "
                                f"Retrying in {current_retry_delay}s..."
                            )
                            await asyncio.sleep(current_retry_delay)
                            # Increase delay for next retry (exponential backoff)
                            current_retry_delay *= 2
                        else:
                            logger.error(
                                f"All {store_name} database connection attempts failed "
                                f"after {max_retries} retries: {error_msg}"
                            )
                            raise Exception(f"Failed to store {store_name} chunk embeddings after multiple retries")
                    else:
                        # For other exceptions, don't retry
                        logger.error(f"Error storing {store_name} embeddings: {error_msg}")
                        raise

        # Store document metadata with retry
        async def store_document_with_retry():
            attempt = 0
            success = False
            current_retry_delay = retry_delay

            while attempt < max_retries and not success:
                try:
                    if is_update and auth:
                        # For updates, use update_document and serialize StorageFileInfo objects into plain dicts
                        updates = {
                            "chunk_ids": doc.chunk_ids,
                            "metadata": doc.metadata,
                            "system_metadata": doc.system_metadata,
                            "filename": doc.filename,
                            "content_type": doc.content_type,
                            "storage_info": doc.storage_info,
                            "storage_files": (
                                [
                                    (
                                        file.model_dump()
                                        if hasattr(file, "model_dump")
                                        else (file.dict() if hasattr(file, "dict") else file)
                                    )
                                    for file in doc.storage_files
                                ]
                                if doc.storage_files
                                else []
                            ),
                        }
                        success = await self.db.update_document(doc.external_id, updates, auth)
                        if not success:
                            raise Exception("Failed to update document metadata")
                    else:
                        # For new documents, use store_document
                        success = await self.db.store_document(doc)
                        if not success:
                            raise Exception("Failed to store document metadata")
                    return success
                except Exception as e:
                    attempt += 1
                    error_msg = str(e)
                    if "connection was closed" in error_msg or "ConnectionDoesNotExistError" in error_msg:
                        if attempt < max_retries:
                            logger.warning(
                                f"Database connection error during document metadata storage "
                                f"(attempt {attempt}/{max_retries}): {error_msg}. "
                                f"Retrying in {current_retry_delay}s..."
                            )
                            await asyncio.sleep(current_retry_delay)
                            # Increase delay for next retry (exponential backoff)
                            current_retry_delay *= 2
                        else:
                            logger.error(
                                f"All database connection attempts failed after {max_retries} retries: {error_msg}"
                            )
                            raise Exception("Failed to store document metadata after multiple retries")
                    else:
                        # For other exceptions, don't retry
                        logger.error(f"Error storing document metadata: {error_msg}")
                        raise

        # Run storage operations in parallel when possible
        storage_tasks = [store_with_retry(self.vector_store, chunk_objects, "regular")]

        # Add colpali storage task if needed
        if use_colpali and self.colpali_vector_store and chunk_objects_multivector:
            storage_tasks.append(store_with_retry(self.colpali_vector_store, chunk_objects_multivector, "colpali"))

        # Execute storage tasks concurrently
        storage_results = await asyncio.gather(*storage_tasks)

        # Combine chunk IDs
        regular_chunk_ids = storage_results[0]
        colpali_chunk_ids = storage_results[1] if len(storage_results) > 1 else []
        doc.chunk_ids = regular_chunk_ids + colpali_chunk_ids

        logger.debug(f"Stored chunk embeddings in vector stores: {len(doc.chunk_ids)} chunks total")

        # Store document metadata (this must be done after chunk storage)
        await store_document_with_retry()

        logger.debug("Stored document metadata in database")
        logger.debug(f"Chunk IDs stored: {doc.chunk_ids}")
        return doc.chunk_ids

    async def _create_chunk_results(self, auth: AuthContext, chunks: List[DocumentChunk]) -> List[ChunkResult]:
        """Create ChunkResult objects with document metadata."""
        results = []
        if not chunks:
            logger.info("No chunks provided, returning empty results")
            return results

        # Collect all unique document IDs from chunks
        unique_doc_ids = list({chunk.document_id for chunk in chunks})

        # Fetch all required documents in a single batch query
        docs = await self.batch_retrieve_documents(unique_doc_ids, auth)

        # Create a lookup dictionary of documents by ID
        doc_map = {doc.external_id: doc for doc in docs}
        logger.debug(f"Retrieved metadata for {len(doc_map)} unique documents in a single batch")

        # Generate download URLs for all documents that have storage info
        download_urls = {}
        for doc_id, doc in doc_map.items():
            if doc.storage_info:
                download_urls[doc_id] = await self.storage.get_download_url(
                    doc.storage_info["bucket"], doc.storage_info["key"]
                )
                logger.debug(f"Generated download URL for document {doc_id}")

        # Create chunk results using the lookup dictionaries
        for chunk in chunks:
            doc = doc_map.get(chunk.document_id)
            if not doc:
                logger.warning(f"Document {chunk.document_id} not found")
                continue

            metadata = doc.metadata.copy()
            metadata["is_image"] = chunk.metadata.get("is_image", False)
            results.append(
                ChunkResult(
                    content=chunk.content,
                    score=chunk.score,
                    document_id=chunk.document_id,
                    chunk_number=chunk.chunk_number,
                    metadata=metadata,
                    content_type=doc.content_type,
                    filename=doc.filename,
                    download_url=download_urls.get(chunk.document_id),
                )
            )

        logger.info(f"Created {len(results)} chunk results")
        return results

    async def _create_document_results(self, auth: AuthContext, chunks: List[ChunkResult]) -> Dict[str, DocumentResult]:
        """Group chunks by document and create DocumentResult objects."""
        if not chunks:
            logger.info("No chunks provided, returning empty results")
            return {}

        # Group chunks by document and get highest scoring chunk per doc
        doc_chunks: Dict[str, ChunkResult] = {}
        for chunk in chunks:
            if chunk.document_id not in doc_chunks or chunk.score > doc_chunks[chunk.document_id].score:
                doc_chunks[chunk.document_id] = chunk
        logger.info(f"Grouped chunks into {len(doc_chunks)} documents")

        # Get unique document IDs
        unique_doc_ids = list(doc_chunks.keys())

        # Fetch all documents in a single batch query
        docs = await self.batch_retrieve_documents(unique_doc_ids, auth)

        # Create a lookup dictionary of documents by ID
        doc_map = {doc.external_id: doc for doc in docs}
        logger.debug(f"Retrieved metadata for {len(doc_map)} unique documents in a single batch")

        # Generate download URLs for non-text documents in a single loop
        download_urls = {}
        for doc_id, doc in doc_map.items():
            if doc.content_type != "text/plain" and doc.storage_info:
                download_urls[doc_id] = await self.storage.get_download_url(
                    doc.storage_info["bucket"], doc.storage_info["key"]
                )
                logger.debug(f"Generated download URL for document {doc_id}")

        # Create document results using the lookup dictionaries
        results = {}
        for doc_id, chunk in doc_chunks.items():
            doc = doc_map.get(doc_id)
            if not doc:
                logger.warning(f"Document {doc_id} not found")
                continue

            # Create DocumentContent based on content type
            if doc.content_type == "text/plain":
                content = DocumentContent(type="string", value=chunk.content, filename=None)
                logger.debug(f"Created text content for document {doc_id}")
            else:
                # Use pre-generated download URL for file types
                content = DocumentContent(type="url", value=download_urls.get(doc_id), filename=doc.filename)
                logger.debug(f"Created URL content for document {doc_id}")

            results[doc_id] = DocumentResult(
                score=chunk.score,
                document_id=doc_id,
                metadata=doc.metadata,
                content=content,
                additional_metadata=doc.additional_metadata,
            )

        logger.info(f"Created {len(results)} document results")
        return results

    async def create_cache(
        self,
        name: str,
        model: str,
        gguf_file: str,
        docs: List[Document | None],
        filters: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """Create a new cache with the specified configuration.

        Args:
            name: Name of the cache to create
            model: Name of the model to use
            gguf_file: Name of the GGUF file to use
            docs: List of documents to include in the cache
            filters: Optional metadata filters for documents to include
        """
        # Create cache metadata
        metadata = {
            "model": model,
            "model_file": gguf_file,
            "filters": filters,
            "docs": [doc.model_dump_json() for doc in docs],
            "storage_info": {
                "bucket": "caches",
                "key": f"{name}_state.pkl",
            },
        }

        # Store metadata in database
        success = await self.db.store_cache_metadata(name, metadata)
        if not success:
            logger.error(f"Failed to store cache metadata for cache {name}")
            return {"success": False, "message": f"Failed to store cache metadata for cache {name}"}

        # Create cache instance
        cache = self.cache_factory.create_new_cache(
            name=name, model=model, model_file=gguf_file, filters=filters, docs=docs
        )
        cache_bytes = cache.saveable_state
        base64_cache_bytes = base64.b64encode(cache_bytes).decode()
        bucket, key = await self.storage.upload_from_base64(
            base64_cache_bytes,
            key=metadata["storage_info"]["key"],
            bucket=metadata["storage_info"]["bucket"],
        )
        return {
            "success": True,
            "message": f"Cache created successfully, state stored in bucket `{bucket}` with key `{key}`",
        }

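    # Illustrative sketch (not executed): creating a cache from documents already retrieved for
    # the authenticated user. `service`, `auth`, and `doc_ids` are assumed names, and the model
    # and GGUF identifiers are placeholders rather than values this module prescribes.
    #
    #     my_docs = await service.batch_retrieve_documents(doc_ids, auth)
    #     result = await service.create_cache(
    #         name="reports-cache",
    #         model="llama_cpp",
    #         gguf_file="model.Q4_K_M.gguf",
    #         docs=my_docs,
    #         filters={"category": "reports"},
    #     )
    #     # On success: result -> {"success": True, "message": "Cache created successfully, ..."}
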
    async def load_cache(self, name: str) -> Dict[str, Any]:
        """Load a cache into memory.

        Args:
            name: Name of the cache to load

        Returns:
            Dict with a "success" flag and a "message" describing whether the cache
            exists and was loaded successfully.
        """
        try:
            # Get cache metadata from database
            metadata = await self.db.get_cache_metadata(name)
            if not metadata:
                logger.error(f"No metadata found for cache {name}")
                return {"success": False, "message": f"No metadata found for cache {name}"}

            # Get cache bytes from storage
            cache_bytes = await self.storage.download_file(
                metadata["storage_info"]["bucket"], "caches/" + metadata["storage_info"]["key"]
            )
            cache_bytes = cache_bytes.read()
            cache = self.cache_factory.load_cache_from_bytes(name=name, cache_bytes=cache_bytes, metadata=metadata)
            self.active_caches[name] = cache
            return {"success": True, "message": "Cache loaded successfully"}
        except Exception as e:
            logger.error(f"Failed to load cache {name}: {e}")
            return {"success": False, "message": f"Failed to load cache {name}: {e}"}

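    # Illustrative sketch (not executed): loading a previously created cache and checking the
    # outcome before using it. `service` is an assumed, already configured DocumentService.
    #
    #     status = await service.load_cache("reports-cache")
    #     if status["success"]:
    #         cache = service.active_caches["reports-cache"]
    #     else:
    #         logger.warning(status["message"])
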
    async def update_document(
        self,
        document_id: str,
        auth: AuthContext,
        content: Optional[str] = None,
        file: Optional[UploadFile] = None,
        filename: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        rules: Optional[List] = None,
        update_strategy: str = "add",
        use_colpali: Optional[bool] = None,
    ) -> Optional[Document]:
        """
        Update a document with new content and/or metadata using the specified strategy.

        Args:
            document_id: ID of the document to update
            auth: Authentication context
            content: The new text content to add (either content or file must be provided)
            file: File to add (either content or file must be provided)
            filename: Optional new filename for the document
            metadata: Additional metadata to update
            rules: Optional list of rules to apply to the content
            update_strategy: Strategy for updating the document ('add' to append content)
            use_colpali: Whether to use multi-vector embedding

        Returns:
            Updated document if successful, None if failed
        """
        # Validate permissions and get document
        doc = await self._validate_update_access(document_id, auth)
        if not doc:
            return None

        # Get current content and determine update type
        current_content = doc.system_metadata.get("content", "")
        metadata_only_update = content is None and file is None and metadata is not None

        # Process content based on update type
        update_content = None
        file_content = None
        file_type = None
        file_content_base64 = None
        if content is not None:
            update_content = await self._process_text_update(content, doc, filename, metadata, rules)
        elif file is not None:
            # _process_file_update parses the file, applies rules, and uploads it to storage
            # (updating doc.storage_info / doc.storage_files), so no extra storage call is needed here.
            update_content, file_content, file_type, file_content_base64 = await self._process_file_update(
                file, doc, metadata, rules
            )
        elif not metadata_only_update:
            logger.error("Neither content nor file provided for document update")
            return None

        # Apply content update strategy if we have new content
        if update_content:
            # Fix for initial file upload - if current_content is empty, just use the update_content
            # without trying to use the update strategy (since there's nothing to update)
            if not current_content:
                logger.info(f"No current content found, using only new content of length {len(update_content)}")
                updated_content = update_content
            else:
                updated_content = self._apply_update_strategy(current_content, update_content, update_strategy)
                logger.info(
                    f"Applied update strategy '{update_strategy}': original length={len(current_content)}, "
                    f"new length={len(updated_content)}"
                )

            # Always update the content in system_metadata
            doc.system_metadata["content"] = updated_content
            logger.info(f"Updated system_metadata['content'] with content of length {len(updated_content)}")
        else:
            updated_content = current_content
            logger.info(f"No content update - keeping current content of length {len(current_content)}")

        # Update metadata and version information
        self._update_metadata_and_version(doc, metadata, update_strategy, file)

        # For metadata-only updates, we don't need to re-process chunks
        if metadata_only_update:
            return await self._update_document_metadata_only(doc, auth)

        # Process content into chunks and generate embeddings
        chunks, chunk_objects = await self._process_chunks_and_embeddings(doc.external_id, updated_content, rules)
        if not chunks:
            return None

        # If we have rules processing, the chunks may have modified content
        # Update document content with stitched content from processed chunks
        if rules and chunks:
            chunk_contents = [chunk.content for chunk in chunks]
            stitched_content = "\n".join(chunk_contents)
            # Check if content actually changed
            if stitched_content != updated_content:
                logger.info("Updating document content with stitched content from processed chunks...")
                doc.system_metadata["content"] = stitched_content
                logger.info(f"Updated document content with stitched chunks (length: {len(stitched_content)})")

        # Merge any aggregated metadata from chunk rules
        if hasattr(self, "_last_aggregated_metadata") and self._last_aggregated_metadata:
            logger.info("Merging aggregated chunk metadata into document metadata...")
            # Make sure doc.metadata exists
            if not hasattr(doc, "metadata") or doc.metadata is None:
                doc.metadata = {}
            doc.metadata.update(self._last_aggregated_metadata)
            logger.info(f"Final document metadata after merge: {doc.metadata}")
            # Clear the temporary metadata
            self._last_aggregated_metadata = {}

        # Handle colpali (multi-vector) embeddings if needed
        chunk_objects_multivector = await self._process_colpali_embeddings(
            use_colpali, doc.external_id, chunks, file, file_type, file_content, file_content_base64
        )

        # Store everything - this will replace existing chunks with new ones
        await self._store_chunks_and_doc(
            chunk_objects, doc, use_colpali, chunk_objects_multivector, is_update=True, auth=auth
        )
        logger.info(f"Successfully updated document {doc.external_id}")

        return doc

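    # Illustrative sketch (not executed): appending new text to an existing document versus a
    # metadata-only update. `service` and `auth` are assumed to be a configured DocumentService
    # instance and a write-capable AuthContext.
    #
    #     # Append content (the default "add" strategy concatenates old and new text)
    #     doc = await service.update_document(
    #         document_id="doc-123",
    #         auth=auth,
    #         content="New findings from the Q3 review...",
    #         metadata={"quarter": "Q3"},
    #     )
    #
    #     # Metadata-only update: no content or file, so chunks are not re-embedded
    #     doc = await service.update_document(document_id="doc-123", auth=auth, metadata={"reviewed": True})
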
    async def _validate_update_access(self, document_id: str, auth: AuthContext) -> Optional[Document]:
        """Validate user permissions and document access."""
        if "write" not in auth.permissions:
            logger.error(f"User {auth.entity_id} does not have write permission")
            raise PermissionError("User does not have write permission")

        # Check if document exists and user has write access
        doc = await self.db.get_document(document_id, auth)
        if not doc:
            logger.error(f"Document {document_id} not found or not accessible")
            return None

        if not await self.db.check_access(document_id, auth, "write"):
            logger.error(f"User {auth.entity_id} does not have write permission for document {document_id}")
            raise PermissionError(f"User does not have write permission for document {document_id}")

        return doc

    async def _process_text_update(
        self,
        content: str,
        doc: Document,
        filename: Optional[str],
        metadata: Optional[Dict[str, Any]],
        rules: Optional[List],
    ) -> str:
        """Process text content updates."""
        update_content = content

        # Update filename if provided
        if filename:
            doc.filename = filename

        # Apply post_parsing rules if provided
        if rules:
            logger.info("Applying post-parsing rules to text update...")
            rule_metadata, modified_content = await self.rules_processor.process_document_rules(content, rules)
            # Update metadata with extracted metadata from rules
            if metadata is not None:
                metadata.update(rule_metadata)

            update_content = modified_content
            logger.info(f"Content length after post-parsing rules: {len(update_content)}")

        return update_content

    async def _process_file_update(
        self,
        file: UploadFile,
        doc: Document,
        metadata: Optional[Dict[str, Any]],
        rules: Optional[List],
    ) -> tuple[str, bytes, Any, str]:
        """Process file content updates."""
        # Read file content
        file_content = await file.read()

        # Parse the file content
        additional_file_metadata, file_text = await self.parser.parse_file_to_text(file_content, file.filename)
        logger.info(f"Parsed file into text of length {len(file_text)}")

        # Apply post_parsing rules if provided for file content
        if rules:
            logger.info("Applying post-parsing rules to file update...")
            rule_metadata, modified_text = await self.rules_processor.process_document_rules(file_text, rules)
            # Update metadata with extracted metadata from rules
            if metadata is not None:
                metadata.update(rule_metadata)

            file_text = modified_text
            logger.info(f"File content length after post-parsing rules: {len(file_text)}")

        # Add additional metadata from file if available
        if additional_file_metadata:
            if not doc.additional_metadata:
                doc.additional_metadata = {}
            doc.additional_metadata.update(additional_file_metadata)

        # Store file in storage if needed
        file_content_base64 = base64.b64encode(file_content).decode()

        # Store file in storage and update storage info
        await self._update_storage_info(doc, file, file_content_base64)

        # Store file type
        file_type = filetype.guess(file_content)
        if file_type:
            doc.content_type = file_type.mime
        else:
            # If filetype.guess failed, try to determine from filename
            import mimetypes

            guessed_type = mimetypes.guess_type(file.filename)[0]
            if guessed_type:
                doc.content_type = guessed_type
            else:
                # Default fallback
                doc.content_type = "text/plain" if file.filename.endswith(".txt") else "application/octet-stream"

        # Update filename
        doc.filename = file.filename

        return file_text, file_content, file_type, file_content_base64

    async def _update_storage_info(self, doc: Document, file: UploadFile, file_content_base64: str):
        """Update document storage information for file content."""
        # Initialize storage_files array if needed - using the passed doc object directly
        # No need to refetch from the database as we already have the full document state
        if not hasattr(doc, "storage_files") or not doc.storage_files:
            # Initialize empty list
            doc.storage_files = []

            # If storage_files is empty but we have storage_info, migrate legacy data
            if doc.storage_info and doc.storage_info.get("bucket") and doc.storage_info.get("key"):
                # Create StorageFileInfo from storage_info
                legacy_file_info = StorageFileInfo(
                    bucket=doc.storage_info.get("bucket", ""),
                    key=doc.storage_info.get("key", ""),
                    version=1,
                    filename=doc.filename,
                    content_type=doc.content_type,
                    timestamp=doc.system_metadata.get("updated_at", datetime.now(UTC)),
                )
                doc.storage_files.append(legacy_file_info)
                logger.info(f"Migrated legacy storage_info to storage_files: {doc.storage_files}")

        # Upload the new file with a unique key including version number
        # The version is based on the current length of storage_files to ensure correct versioning
        version = len(doc.storage_files) + 1
        file_extension = os.path.splitext(file.filename)[1] if file.filename else ""
        storage_info = await self.storage.upload_from_base64(
            file_content_base64, f"{doc.external_id}_{version}{file_extension}", file.content_type
        )

        # Add the new file to storage_files
        new_file_info = StorageFileInfo(
            bucket=storage_info[0],
            key=storage_info[1],
            version=version,
            filename=file.filename,
            content_type=file.content_type,
            timestamp=datetime.now(UTC),
        )
        doc.storage_files.append(new_file_info)

        # Still update legacy storage_info with the latest file for backward compatibility
        doc.storage_info = {"bucket": storage_info[0], "key": storage_info[1]}
        logger.info(f"Stored file in bucket `{storage_info[0]}` with key `{storage_info[1]}`")

    def _apply_update_strategy(self, current_content: str, update_content: str, update_strategy: str) -> str:
        """Apply the update strategy to combine current and new content."""
        if update_strategy == "add":
            # Append the new content
            return current_content + "\n\n" + update_content
        else:
            # For now, just use 'add' as default strategy
            logger.warning(f"Unknown update strategy '{update_strategy}', defaulting to 'add'")
            return current_content + "\n\n" + update_content

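    # Illustrative sketch (not executed): the "add" strategy joins existing and new content with a
    # blank line, so repeated updates accumulate as appended sections.
    #
    #     service._apply_update_strategy("Section 1", "Section 2", "add")
    #     # -> "Section 1\n\nSection 2"
    #     service._apply_update_strategy("Section 1", "Section 2", "replace")
    #     # -> logs a warning about the unknown strategy and still appends
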
    def _update_metadata_and_version(
        self,
        doc: Document,
        metadata: Optional[Dict[str, Any]],
        update_strategy: str,
        file: Optional[UploadFile],
    ):
        """Update document metadata and version tracking."""
        # Update metadata if provided - additive but replacing existing keys
        if metadata:
            doc.metadata.update(metadata)

        # Ensure external_id is preserved in metadata
        doc.metadata["external_id"] = doc.external_id

        # Increment version
        current_version = doc.system_metadata.get("version", 1)
        doc.system_metadata["version"] = current_version + 1
        doc.system_metadata["updated_at"] = datetime.now(UTC)

        # Track update history
        if "update_history" not in doc.system_metadata:
            doc.system_metadata["update_history"] = []

        update_entry = {
            "timestamp": datetime.now(UTC).isoformat(),
            "version": current_version + 1,
            "strategy": update_strategy,
        }

        if file:
            update_entry["filename"] = file.filename

        if metadata:
            update_entry["metadata_updated"] = True

        doc.system_metadata["update_history"].append(update_entry)

        # Ensure storage_files models are properly typed as StorageFileInfo objects
        if hasattr(doc, "storage_files") and doc.storage_files:
            # Convert to StorageFileInfo objects if they're dicts or ensure they're properly serializable
            doc.storage_files = [
                (
                    StorageFileInfo(**sf)
                    if isinstance(sf, dict)
                    else (
                        sf
                        if isinstance(sf, StorageFileInfo)
                        else (
                            StorageFileInfo(**sf.model_dump())
                            if hasattr(sf, "model_dump")
                            else StorageFileInfo(**sf.dict()) if hasattr(sf, "dict") else sf
                        )
                    )
                )
                for sf in doc.storage_files
            ]

    async def _update_document_metadata_only(self, doc: Document, auth: AuthContext) -> Optional[Document]:
        """Update document metadata without reprocessing chunks."""
        updates = {
            "metadata": doc.metadata,
            "system_metadata": doc.system_metadata,
            "filename": doc.filename,
            "storage_files": doc.storage_files if hasattr(doc, "storage_files") else None,
            "storage_info": doc.storage_info if hasattr(doc, "storage_info") else None,
        }
        # Remove None values
        updates = {k: v for k, v in updates.items() if v is not None}

        success = await self.db.update_document(doc.external_id, updates, auth)
        if not success:
            logger.error(f"Failed to update document {doc.external_id} metadata")
            return None

        logger.info(f"Successfully updated document metadata for {doc.external_id}")
        return doc

    async def _process_chunks_and_embeddings(
        self, doc_id: str, content: str, rules: Optional[List[Dict[str, Any]]] = None
    ) -> tuple[Optional[List[Chunk]], Optional[List[DocumentChunk]]]:
        """Process content into chunks and generate embeddings."""
        # Split content into chunks
        parsed_chunks = await self.parser.split_text(content)
        if not parsed_chunks:
            logger.error("No content chunks extracted after update")
            return None, None

        logger.info(f"Split updated text into {len(parsed_chunks)} chunks")

        # Apply post_chunking rules and aggregate metadata if provided
        processed_chunks = []
        aggregated_chunk_metadata: Dict[str, Any] = {}  # Initialize dict for aggregated metadata
        chunk_contents = []  # Initialize list to collect chunk contents efficiently

        if rules:
            logger.info("Applying post-chunking rules...")

            for chunk_obj in parsed_chunks:
                # Get metadata *and* the potentially modified chunk
                chunk_rule_metadata, processed_chunk = await self.rules_processor.process_chunk_rules(chunk_obj, rules)
                processed_chunks.append(processed_chunk)
                chunk_contents.append(processed_chunk.content)  # Collect content as we process
                # Aggregate the metadata extracted from this chunk
                aggregated_chunk_metadata.update(chunk_rule_metadata)
            logger.info(f"Finished applying post-chunking rules to {len(processed_chunks)} chunks.")
            logger.info(f"Aggregated metadata from all chunks: {aggregated_chunk_metadata}")

            # Return this metadata so the calling method can update the document metadata
            self._last_aggregated_metadata = aggregated_chunk_metadata
        else:
            processed_chunks = parsed_chunks  # No rules, use original chunks
            self._last_aggregated_metadata = {}

        # Generate embeddings for processed chunks
        embeddings = await self.embedding_model.embed_for_ingestion(processed_chunks)
        logger.info(f"Generated {len(embeddings)} embeddings")

        # Create new chunk objects
        chunk_objects = self._create_chunk_objects(doc_id, processed_chunks, embeddings)
        logger.info(f"Created {len(chunk_objects)} chunk objects")

        return processed_chunks, chunk_objects

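    # Illustrative sketch (not executed): how per-chunk rule metadata is merged. Each call to
    # process_chunk_rules returns (metadata_dict, possibly_modified_chunk); later chunks win on
    # key collisions because a plain dict.update() is used for aggregation.
    #
    #     aggregated = {}
    #     aggregated.update({"language": "en"})                   # from chunk 1
    #     aggregated.update({"language": "de", "topic": "tax"})   # chunk 2 overrides "language"
    #     # aggregated -> {"language": "de", "topic": "tax"}
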
    async def _process_colpali_embeddings(
        self,
        use_colpali: bool,
        doc_id: str,
        chunks: List[Chunk],
        file: Optional[UploadFile],
        file_type: Any,
        file_content: Optional[bytes],
        file_content_base64: Optional[str],
    ) -> List[DocumentChunk]:
        """Process colpali multi-vector embeddings if enabled."""
        chunk_objects_multivector = []

        if not (use_colpali and self.colpali_embedding_model and self.colpali_vector_store):
            return chunk_objects_multivector

        # For file updates, we need special handling for images and PDFs
        if file and file_type and (file_type.mime in IMAGE or file_type.mime == "application/pdf"):
            # Rewind the file and read it again if needed
            if hasattr(file, "seek") and callable(file.seek) and not file_content:
                await file.seek(0)
                file_content = await file.read()
                file_content_base64 = base64.b64encode(file_content).decode()

            chunks_multivector = self._create_chunks_multivector(file_type, file_content_base64, file_content, chunks)
            logger.info(f"Created {len(chunks_multivector)} chunks for multivector embedding")
            colpali_embeddings = await self.colpali_embedding_model.embed_for_ingestion(chunks_multivector)
            logger.info(f"Generated {len(colpali_embeddings)} embeddings for multivector embedding")
            chunk_objects_multivector = self._create_chunk_objects(doc_id, chunks_multivector, colpali_embeddings)
        else:
            # For text updates or non-image/PDF files
            embeddings_multivector = await self.colpali_embedding_model.embed_for_ingestion(chunks)
            logger.info(f"Generated {len(embeddings_multivector)} embeddings for multivector embedding")
            chunk_objects_multivector = self._create_chunk_objects(doc_id, chunks, embeddings_multivector)

        logger.info(f"Created {len(chunk_objects_multivector)} chunk objects for multivector embedding")
        return chunk_objects_multivector

    async def create_graph(
        self,
        name: str,
        auth: AuthContext,
        filters: Optional[Dict[str, Any]] = None,
        documents: Optional[List[str]] = None,
        prompt_overrides: Optional[GraphPromptOverrides] = None,
        system_filters: Optional[Dict[str, Any]] = None,
    ) -> Graph:
        """Create a graph from documents.

        This function processes documents matching filters or specific document IDs,
        extracts entities and relationships from document chunks, and saves them as a graph.

        Args:
            name: Name of the graph to create
            auth: Authentication context
            filters: Optional metadata filters to determine which documents to include
            documents: Optional list of specific document IDs to include
            prompt_overrides: Optional customizations for entity extraction and resolution prompts
            system_filters: Optional system filters like folder_name and end_user_id for scoping

        Returns:
            Graph: The created graph
        """
        # Delegate to the GraphService
        return await self.graph_service.create_graph(
            name=name,
            auth=auth,
            document_service=self,
            filters=filters,
            documents=documents,
            prompt_overrides=prompt_overrides,
            system_filters=system_filters,
        )

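    # Illustrative sketch (not executed): building a graph over every document tagged with a
    # given category, then folding newer documents into it. `service` and `auth` are assumed
    # names for a configured DocumentService and an authenticated context.
    #
    #     graph = await service.create_graph(
    #         name="finance-graph",
    #         auth=auth,
    #         filters={"category": "finance"},
    #     )
    #     graph = await service.update_graph(
    #         name="finance-graph",
    #         auth=auth,
    #         additional_filters={"quarter": "Q4"},
    #     )
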
    async def update_graph(
        self,
        name: str,
        auth: AuthContext,
        additional_filters: Optional[Dict[str, Any]] = None,
        additional_documents: Optional[List[str]] = None,
        prompt_overrides: Optional[GraphPromptOverrides] = None,
        system_filters: Optional[Dict[str, Any]] = None,
    ) -> Graph:
        """Update an existing graph with new documents.

        This function processes additional documents matching the original or new filters,
        extracts entities and relationships, and updates the graph with new information.

        Args:
            name: Name of the graph to update
            auth: Authentication context
            additional_filters: Optional additional metadata filters to determine which new documents to include
            additional_documents: Optional list of additional document IDs to include
            prompt_overrides: Optional customizations for entity extraction and resolution prompts
            system_filters: Optional system filters like folder_name and end_user_id for scoping

        Returns:
            Graph: The updated graph
        """
        # Delegate to the GraphService
        return await self.graph_service.update_graph(
            name=name,
            auth=auth,
            document_service=self,
            additional_filters=additional_filters,
            additional_documents=additional_documents,
            prompt_overrides=prompt_overrides,
            system_filters=system_filters,
        )

    async def delete_document(self, document_id: str, auth: AuthContext) -> bool:
        """
        Delete a document and all its associated data.

        This method:
        1. Checks if the user has write access to the document
        2. Gets the document to retrieve its chunk IDs
        3. Deletes the document from the database
        4. Deletes all associated chunks from the vector store (if possible)
        5. Deletes the original file from storage if present

        Args:
            document_id: ID of the document to delete
            auth: Authentication context

        Returns:
            bool: True if deletion was successful, False otherwise

        Raises:
            PermissionError: If the user doesn't have write access
        """
        # First get the document to retrieve its chunk IDs
        document = await self.db.get_document(document_id, auth)

        if not document:
            logger.error(f"Document {document_id} not found")
            return False

        # Verify write access - the database layer also checks this, but we check here too
        # to avoid unnecessary operations if the user doesn't have permission
        if not await self.db.check_access(document_id, auth, "write"):
            logger.error(f"User {auth.entity_id} doesn't have write access to document {document_id}")
            raise PermissionError(f"User doesn't have write access to document {document_id}")

        # Delete document from database
        db_success = await self.db.delete_document(document_id, auth)
        if not db_success:
            logger.error(f"Failed to delete document {document_id} from database")
            return False

        logger.info(f"Deleted document {document_id} from database")

        # Collect storage deletion tasks
        storage_deletion_tasks = []

        # Collect vector store deletion tasks
        vector_deletion_tasks = []

        # Add vector store deletion tasks if chunks exist
        if hasattr(document, "chunk_ids") and document.chunk_ids:
            # Try to delete chunks by document ID
            # Note: Some vector stores may not implement this method
            if hasattr(self.vector_store, "delete_chunks_by_document_id"):
                vector_deletion_tasks.append(self.vector_store.delete_chunks_by_document_id(document_id))

            # Try to delete from colpali vector store as well
            if self.colpali_vector_store and hasattr(self.colpali_vector_store, "delete_chunks_by_document_id"):
                vector_deletion_tasks.append(self.colpali_vector_store.delete_chunks_by_document_id(document_id))

        # Collect storage file deletion tasks
        if hasattr(document, "storage_info") and document.storage_info:
            bucket = document.storage_info.get("bucket")
            key = document.storage_info.get("key")
            if bucket and key and hasattr(self.storage, "delete_file"):
                storage_deletion_tasks.append(self.storage.delete_file(bucket, key))

        # Also handle the case of multiple file versions in storage_files
        if hasattr(document, "storage_files") and document.storage_files:
            for file_info in document.storage_files:
                bucket = file_info.bucket
                key = file_info.key
                if bucket and key and hasattr(self.storage, "delete_file"):
                    storage_deletion_tasks.append(self.storage.delete_file(bucket, key))

        # Execute deletion tasks in parallel
        if vector_deletion_tasks or storage_deletion_tasks:
            try:
                # Run all deletion tasks concurrently
                all_deletion_results = await asyncio.gather(
                    *vector_deletion_tasks, *storage_deletion_tasks, return_exceptions=True
                )

                # Log any errors but continue with deletion
                for i, result in enumerate(all_deletion_results):
                    if isinstance(result, Exception):
                        # Determine if this was a vector store or storage deletion
                        task_type = "vector store" if i < len(vector_deletion_tasks) else "storage"
                        logger.error(f"Error during {task_type} deletion for document {document_id}: {result}")

            except Exception as e:
                logger.error(f"Error during parallel deletion operations for document {document_id}: {e}")
                # We continue even if deletions fail - document is already deleted from DB

        logger.info(f"Successfully deleted document {document_id} and all associated data")
        return True

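    # Illustrative sketch (not executed): deleting a document and handling the permission error
    # raised when the caller lacks write access. `service` and `auth` are assumed names.
    #
    #     try:
    #         deleted = await service.delete_document("doc-123", auth)
    #     except PermissionError:
    #         deleted = False
    #     if not deleted:
    #         logger.warning("Document doc-123 was not deleted")
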
    def close(self):
        """Close all resources."""
        # Close any active caches
        self.active_caches.clear()