Pipe through video timestamps on query

Arnav Agrawal 2024-12-28 19:41:05 +05:30
parent 16e5decc4b
commit 196655fea3
10 changed files with 96 additions and 85 deletions

View File

@@ -1,10 +1,14 @@
 from abc import ABC, abstractmethod
 from typing import List, Union

+from core.models.documents import Chunk
+

 class BaseEmbeddingModel(ABC):
     @abstractmethod
-    async def embed_for_ingestion(self, text: Union[str, List[str]]) -> List[float]:
+    async def embed_for_ingestion(
+        self, chunks: Union[Chunk, List[Chunk]]
+    ) -> List[List[float]]:
         """Generate embeddings for input text"""
         pass
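For orientation, a minimal usage sketch of the new contract (not part of the commit; the chunk contents are invented and `model` stands for any BaseEmbeddingModel implementation):

from typing import List
from core.models.documents import Chunk

async def embed_video_chunks(model) -> List[List[float]]:
    # Hypothetical chunks as produced from a video: each carries its timestamp.
    chunks = [
        Chunk(content="a person waves at the camera", metadata={"timestamp": 12.0}),
        Chunk(content="hello and welcome", metadata={"timestamp": 12.5}),
    ]
    # One embedding vector per chunk; the timestamps stay on the Chunk metadata.
    return await model.embed_for_ingestion(chunks)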

View File

@@ -1,6 +1,7 @@
 from typing import List, Union
 from ollama import AsyncClient
 from core.embedding.base_embedding_model import BaseEmbeddingModel
+from core.models.documents import Chunk


 class OllamaEmbeddingModel(BaseEmbeddingModel):
@@ -9,14 +10,16 @@ class OllamaEmbeddingModel(BaseEmbeddingModel):
         self.client = AsyncClient(host=base_url)

     async def embed_for_ingestion(
-        self, text: Union[str, List[str]]
+        self, chunks: Union[Chunk, List[Chunk]]
     ) -> List[List[float]]:
-        if isinstance(text, str):
-            text = [text]
+        if isinstance(chunks, Chunk):
+            chunks = [chunks]

         embeddings: List[List[float]] = []
-        for t in text:
-            response = await self.client.embeddings(model=self.model_name, prompt=t)
+        for c in chunks:
+            response = await self.client.embeddings(
+                model=self.model_name, prompt=c.content
+            )
             embedding = list(response["embedding"])
             embeddings.append(embedding)
@@ -24,4 +27,4 @@ class OllamaEmbeddingModel(BaseEmbeddingModel):

     async def embed_for_query(self, text: str) -> List[float]:
         response = await self.client.embeddings(model=self.model_name, prompt=text)
-        return response["embedding"]
+        return list(response["embedding"])

View File

@@ -1,6 +1,8 @@
 from typing import List, Union
 from openai import OpenAI
-from .base_embedding_model import BaseEmbeddingModel
+from core.models.documents import Chunk
+from core.embedding.base_embedding_model import BaseEmbeddingModel


 class OpenAIEmbeddingModel(BaseEmbeddingModel):
@@ -9,8 +11,10 @@ class OpenAIEmbeddingModel(BaseEmbeddingModel):
         self.model_name = model_name

     async def embed_for_ingestion(
-        self, text: Union[str, List[str]]
+        self, chunks: Union[Chunk, List[Chunk]]
     ) -> List[List[float]]:
+        chunks = [chunks] if isinstance(chunks, Chunk) else chunks
+        text = [c.content for c in chunks]
         response = self.client.embeddings.create(model=self.model_name, input=text)
         return [item.embedding for item in response.data]

View File

@@ -39,15 +39,32 @@ class DocumentChunk(BaseModel):
     """Represents a chunk stored in VectorStore"""

     document_id: str  # external_id of parent document
+    # TODO: This might be suboptimal due to storage size. consider moving to separate store.
     content: str
     embedding: List[float]
     chunk_number: int
-    version: int = 1
+    # chunk-specific metadata
     metadata: Dict[str, Any] = Field(default_factory=dict)
     score: float = 0.0


+class Chunk(BaseModel):
+    """Represents a chunk containing content and metadata"""
+
+    content: str
+    metadata: Dict[str, Any] = Field(default_factory=dict)
+
+    def to_document_chunk(
+        self, document_id: str, chunk_number: int, embedding: List[float]
+    ) -> DocumentChunk:
+        return DocumentChunk(
+            document_id=document_id,
+            content=self.content,
+            embedding=embedding,
+            chunk_number=chunk_number,
+            metadata=self.metadata,
+        )
+
+
 class ChunkResult(BaseModel):
     """Query result at chunk level"""

View File

@@ -3,6 +3,8 @@ from typing import List, Tuple, Optional, Union, Dict
 from bisect import bisect_left
 import logging

+from core.models.documents import Chunk
+
 logger = logging.getLogger(__name__)
@@ -82,3 +84,9 @@ class TimeSeriesData:
     def times_for_content(self, content: str) -> List[float]:
         """Get all timestamps where this content appears"""
         return self.content_to_times[content]
+
+    def to_chunks(self) -> List[Chunk]:
+        return [
+            Chunk(content=content, metadata={"timestamp": timestamp})
+            for content, timestamp in zip(self.contents, self.timestamps)
+        ]
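Assuming `contents` and `timestamps` are parallel lists on TimeSeriesData (as the zip above implies), to_chunks behaves like this sketch with invented values:

from core.models.documents import Chunk

contents = ["intro music plays", "speaker introduces the topic"]
timestamps = [0.0, 15.0]

# Each piece of text becomes a Chunk that remembers where in the video it occurred.
chunks = [
    Chunk(content=content, metadata={"timestamp": timestamp})
    for content, timestamp in zip(contents, timestamps)
]
# chunks[1].metadata == {"timestamp": 15.0}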

View File

@@ -1,19 +1,18 @@
 from abc import ABC, abstractmethod
 from typing import List, Union
-from fastapi import UploadFile
+from core.models.documents import Chunk


 class BaseParser(ABC):
     """Base class for document parsing"""

     @abstractmethod
-    async def split_text(self, text: str) -> List[str]:
+    async def split_text(self, text: str) -> List[Chunk]:
         """Split plain text into chunks"""
         pass

     @abstractmethod
-    async def parse_file(
-        self, file: Union[UploadFile, bytes], content_type: str
-    ) -> List[str]:
+    async def parse_file(self, file: bytes, content_type: str) -> List[Chunk]:
         """Parse file content into text chunks"""
         pass

View File

@@ -1,17 +1,13 @@
-from typing import List, Union
+from typing import List, Optional
 import logging
 import os
-from fastapi import UploadFile
 import tempfile
 import magic
-from core.models.time_series import (
-    TimeSeriesData,
-)  # python-magic library for file type detection
-from .base_parser import BaseParser
-from .unstructured_parser import UnstructuredAPIParser
-from .video.parse_video import VideoParser
+from core.models.documents import Chunk
+from core.parser.base_parser import BaseParser
+from core.parser.unstructured_parser import UnstructuredAPIParser
+from core.parser.video.parse_video import VideoParser

 logger = logging.getLogger(__name__)
@@ -35,7 +31,10 @@ class CombinedParser(BaseParser):
         self.magic = magic.Magic(mime=True)

     def _is_video_file(
-        self, file_path: str = None, file_bytes: bytes = None, filename: str = None
+        self,
+        file_path: Optional[str] = None,
+        file_bytes: Optional[bytes] = None,
+        filename: Optional[str] = None,
     ) -> bool:
         """
         Detect if a file is a video using multiple methods:
@@ -73,22 +72,12 @@ class CombinedParser(BaseParser):
         return False

-    async def split_text(self, text: str) -> List[str]:
+    async def split_text(self, text: str) -> List[Chunk]:
         """Split plain text into chunks using unstructured parser"""
         return await self.unstructured_parser.split_text(text)

-    async def parse_file(
-        self, file: Union[UploadFile, bytes], content_type: str
-    ) -> List[str]:
+    async def parse_file(self, file: bytes, content_type: str) -> List[Chunk]:
         """Parse file content into text chunks"""
-        # For UploadFile, check both filename and content
-        if isinstance(file, UploadFile):
-            content = await file.read()
-            is_video = self._is_video_file(file_bytes=content, filename=file.filename)
-            # Reset file position for later use
-            await file.seek(0)
-        else:
-            # For bytes, we can only check content
-            is_video = self._is_video_file(file_bytes=file)
+        is_video = self._is_video_file(file_bytes=file)

         if is_video:
@@ -96,21 +85,13 @@ class CombinedParser(BaseParser):
         else:
             return await self.unstructured_parser.parse_file(file, content_type)

-    async def _parse_video(self, file: Union[UploadFile, bytes]) -> List[str]:
+    async def _parse_video(self, file: bytes) -> List[Chunk]:
         """Parse video file and combine transcript and frame descriptions into chunks"""
         # Save video to temporary file if needed
-        if isinstance(file, bytes):
-            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
-            temp_file.write(file)
-            temp_file.close()
-            video_path = temp_file.name
-        else:
-            # For UploadFile, save to temp file
-            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
-            content = await file.read()
-            temp_file.write(content)
-            temp_file.close()
-            video_path = temp_file.name
+        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
+        temp_file.write(file)
+        temp_file.close()
+        video_path = temp_file.name

         try:
             # Process video
@@ -121,10 +102,10 @@ class CombinedParser(BaseParser):
             )
             results = parser.process_video()
             # Get all frame descriptions
-            frame_descriptions: TimeSeriesData = results["frame_descriptions"]
+            frame_chunks = results["frame_descriptions"].to_chunks()
             # Get all transcript text
-            transcript: TimeSeriesData = results["transcript"]
-            return frame_descriptions.contents + transcript.contents
+            transcript_chunks = results["transcript"].to_chunks()
+            return frame_chunks + transcript_chunks
         finally:
             # Clean up temporary file

View File

@@ -2,6 +2,7 @@ from typing import List
 import io
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_unstructured import UnstructuredLoader
+from core.models.documents import Chunk
 import logging

 from .base_parser import BaseParser
@@ -24,11 +25,14 @@ class UnstructuredAPIParser(BaseParser):
             separators=["\n\n", "\n", ". ", " ", ""],
         )

-    async def split_text(self, text: str) -> List[str]:
+    async def split_text(self, text: str) -> List[Chunk]:
         """Split plain text into chunks"""
-        return self.text_splitter.split_text(text)
+        return [
+            Chunk(content=chunk, metadata={})
+            for chunk in self.text_splitter.split_text(text)
+        ]

-    async def parse_file(self, file: bytes, content_type: str) -> List[str]:
+    async def parse_file(self, file: bytes, content_type: str) -> List[Chunk]:
         """Parse file content using unstructured"""
         # Parse with unstructured
         loader = UnstructuredLoader(
@@ -38,4 +42,6 @@ class UnstructuredAPIParser(BaseParser):
             chunking_strategy="by_title",
         )
         elements = loader.load()
-        return [element.page_content for element in elements]
+        return [
+            Chunk(content=element.page_content, metadata={}) for element in elements
+        ]

View File

@@ -3,7 +3,8 @@ from typing import Dict, Any, List, Optional
 from fastapi import UploadFile
 from core.models.request import IngestTextRequest
-from ..models.documents import (
+from core.models.documents import (
+    Chunk,
     Document,
     DocumentChunk,
     ChunkResult,
@@ -62,9 +63,7 @@ class DocumentService:
         # Search chunks with vector similarity
         chunks = await self.vector_store.query_similar(
-            query_embedding,
-            k=k,
-            doc_ids=doc_ids,
+            query_embedding, k=k, doc_ids=doc_ids
         )
         logger.info(f"Found {len(chunks)} similar chunks")
@@ -148,9 +147,7 @@ class DocumentService:
         logger.info(f"Generated {len(embeddings)} embeddings")

         # 4. Create and store chunk objects
-        chunk_objects = self._create_chunk_objects(
-            doc.external_id, chunks, embeddings, doc.metadata
-        )
+        chunk_objects = self._create_chunk_objects(doc.external_id, chunks, embeddings)
         logger.info(f"Created {len(chunk_objects)} chunk objects")

         # 5. Store everything
@@ -168,7 +165,7 @@ class DocumentService:
         # 1. Create document record
         doc = Document(
-            content_type=file.content_type,
+            content_type=file.content_type or "",
             filename=file.filename,
             metadata=metadata,
             owner={"type": auth.entity_type, "id": auth.entity_id},
@@ -191,7 +188,7 @@ class DocumentService:
         )

         # 3. Parse content into chunks
-        chunks = await self.parser.parse_file(file_content, file.content_type)
+        chunks = await self.parser.parse_file(file_content, file.content_type or "")
         if not chunks:
             raise ValueError("No content chunks extracted from file")
         logger.info(f"Parsed file into {len(chunks)} chunks")
@@ -201,9 +198,7 @@ class DocumentService:
         logger.info(f"Generated {len(embeddings)} embeddings")

         # 5. Create and store chunk objects
-        chunk_objects = self._create_chunk_objects(
-            doc.external_id, chunks, embeddings, doc.metadata
-        )
+        chunk_objects = self._create_chunk_objects(doc.external_id, chunks, embeddings)
         logger.info(f"Created {len(chunk_objects)} chunk objects")

         # 6. Store everything
@@ -215,20 +210,13 @@ class DocumentService:
     def _create_chunk_objects(
         self,
         doc_id: str,
-        chunks: List[str],
+        chunks: List[Chunk],
         embeddings: List[List[float]],
-        metadata: Dict[str, Any],
     ) -> List[DocumentChunk]:
         """Helper to create chunk objects"""
         return [
-            DocumentChunk(
-                document_id=doc_id,
-                content=content,
-                embedding=embedding,
-                chunk_number=i,
-                metadata=metadata,
-            )
-            for i, (content, embedding) in enumerate(zip(chunks, embeddings))
+            c.to_document_chunk(chunk_number=i, embedding=embedding, document_id=doc_id)
+            for i, (embedding, c) in enumerate(zip(embeddings, chunks))
         ]

     async def _store_chunks_and_doc(
@@ -287,11 +275,11 @@ class DocumentService:
         return results

     async def _create_document_results(
-        self, auth: AuthContext, chunks: List[DocumentChunk]
+        self, auth: AuthContext, chunks: List[ChunkResult]
     ) -> List[DocumentResult]:
         """Group chunks by document and create DocumentResult objects."""
         # Group chunks by document and get highest scoring chunk per doc
-        doc_chunks: Dict[str, DocumentChunk] = {}
+        doc_chunks: Dict[str, ChunkResult] = {}
         for chunk in chunks:
             if (
                 chunk.document_id not in doc_chunks
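The net effect of the _create_chunk_objects change is that chunk-level metadata (such as video timestamps) is what gets stored, rather than a copy of the parent document's metadata. A rough sketch with invented values:

from core.models.documents import Chunk

chunks = [
    Chunk(content="frame description near 12s", metadata={"timestamp": 12.0}),
    Chunk(content="transcript line near 13s", metadata={"timestamp": 13.0}),
]
embeddings = [[0.1, 0.2], [0.3, 0.4]]

# Mirrors _create_chunk_objects: each stored DocumentChunk keeps its own metadata.
chunk_objects = [
    c.to_document_chunk(chunk_number=i, embedding=e, document_id="doc-123")
    for i, (e, c) in enumerate(zip(embeddings, chunks))
]
# chunk_objects[0].metadata == {"timestamp": 12.0}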

View File

@@ -6,16 +6,17 @@ from core.models.documents import DocumentChunk

 class BaseVectorStore(ABC):
     @abstractmethod
-    def store_embeddings(self, chunks: List[DocumentChunk]) -> Tuple[bool, List[str]]:
+    async def store_embeddings(
+        self, chunks: List[DocumentChunk]
+    ) -> Tuple[bool, List[str]]:
         """Store document chunks and their embeddings"""
         pass

     @abstractmethod
-    def query_similar(
+    async def query_similar(
         self,
         query_embedding: List[float],
         k: int,
-        auth: AuthContext,
         doc_ids: Optional[List[str]] = None,
     ) -> List[DocumentChunk]:
         """Find similar chunks"""