mirror of https://github.com/james-m-jordan/morphik-core.git (synced 2025-05-09 19:32:38 +00:00)

pipethrough video timestamps on query

commit 196655fea3, parent 16e5decc4b
@@ -1,10 +1,14 @@
 from abc import ABC, abstractmethod
 from typing import List, Union
 
+from core.models.documents import Chunk
+
 
 class BaseEmbeddingModel(ABC):
     @abstractmethod
-    async def embed_for_ingestion(self, text: Union[str, List[str]]) -> List[float]:
+    async def embed_for_ingestion(
+        self, chunks: Union[Chunk, List[Chunk]]
+    ) -> List[List[float]]:
         """Generate embeddings for input text"""
         pass
 
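For orientation, a minimal usage sketch of the new ingestion contract (hypothetical caller code, not part of this commit; `model` stands in for any BaseEmbeddingModel implementation): Chunk objects go in, one vector per chunk comes out.

    from core.models.documents import Chunk

    # inside an async context
    chunks = [Chunk(content="first passage"), Chunk(content="second passage")]
    vectors = await model.embed_for_ingestion(chunks)  # List[List[float]], one vector per chunk
    assert len(vectors) == len(chunks)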
@@ -1,6 +1,7 @@
 from typing import List, Union
 from ollama import AsyncClient
 from core.embedding.base_embedding_model import BaseEmbeddingModel
+from core.models.documents import Chunk
 
 
 class OllamaEmbeddingModel(BaseEmbeddingModel):
@@ -9,14 +10,16 @@ class OllamaEmbeddingModel(BaseEmbeddingModel):
         self.client = AsyncClient(host=base_url)
 
     async def embed_for_ingestion(
-        self, text: Union[str, List[str]]
+        self, chunks: Union[Chunk, List[Chunk]]
     ) -> List[List[float]]:
-        if isinstance(text, str):
-            text = [text]
+        if isinstance(chunks, Chunk):
+            chunks = [chunks]
 
         embeddings: List[List[float]] = []
-        for t in text:
-            response = await self.client.embeddings(model=self.model_name, prompt=t)
+        for c in chunks:
+            response = await self.client.embeddings(
+                model=self.model_name, prompt=c.content
+            )
             embedding = list(response["embedding"])
             embeddings.append(embedding)
 
@@ -24,4 +27,4 @@ class OllamaEmbeddingModel(BaseEmbeddingModel):
 
     async def embed_for_query(self, text: str) -> List[float]:
         response = await self.client.embeddings(model=self.model_name, prompt=text)
-        return response["embedding"]
+        return list(response["embedding"])
@@ -1,6 +1,8 @@
 from typing import List, Union
 from openai import OpenAI
-from .base_embedding_model import BaseEmbeddingModel
+
+from core.models.documents import Chunk
+from core.embedding.base_embedding_model import BaseEmbeddingModel
 
 
 class OpenAIEmbeddingModel(BaseEmbeddingModel):
@@ -9,8 +11,10 @@ class OpenAIEmbeddingModel(BaseEmbeddingModel):
         self.model_name = model_name
 
     async def embed_for_ingestion(
-        self, text: Union[str, List[str]]
+        self, chunks: Union[Chunk, List[Chunk]]
     ) -> List[List[float]]:
+        chunks = [chunks] if isinstance(chunks, Chunk) else chunks
+        text = [c.content for c in chunks]
         response = self.client.embeddings.create(model=self.model_name, input=text)
 
         return [item.embedding for item in response.data]
@@ -39,15 +39,32 @@ class DocumentChunk(BaseModel):
     """Represents a chunk stored in VectorStore"""
 
     document_id: str  # external_id of parent document
-    # TODO: This might be suboptimal due to storage size. consider moving to separate store.
     content: str
     embedding: List[float]
     chunk_number: int
-    version: int = 1
+    # chunk-specific metadata
     metadata: Dict[str, Any] = Field(default_factory=dict)
     score: float = 0.0
 
 
+class Chunk(BaseModel):
+    """Represents a chunk containing content and metadata"""
+
+    content: str
+    metadata: Dict[str, Any] = Field(default_factory=dict)
+
+    def to_document_chunk(
+        self, document_id: str, chunk_number: int, embedding: List[float]
+    ) -> DocumentChunk:
+        return DocumentChunk(
+            document_id=document_id,
+            content=self.content,
+            embedding=embedding,
+            chunk_number=chunk_number,
+            metadata=self.metadata,
+        )
+
+
 class ChunkResult(BaseModel):
     """Query result at chunk level"""
 
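A small sketch of the new Chunk model with hypothetical values, showing how chunk-level metadata (such as a video timestamp) is carried into the stored DocumentChunk:

    chunk = Chunk(content="a frame description", metadata={"timestamp": 12.5})
    doc_chunk = chunk.to_document_chunk(document_id="doc-1", chunk_number=0, embedding=[0.1, 0.2])
    assert doc_chunk.metadata == {"timestamp": 12.5}  # the timestamp survives into the vector store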
@@ -3,6 +3,8 @@ from typing import List, Tuple, Optional, Union, Dict
 from bisect import bisect_left
 import logging
 
+from core.models.documents import Chunk
+
 logger = logging.getLogger(__name__)
 
 
@@ -82,3 +84,9 @@ class TimeSeriesData:
     def times_for_content(self, content: str) -> List[float]:
         """Get all timestamps where this content appears"""
         return self.content_to_times[content]
+
+    def to_chunks(self) -> List[Chunk]:
+        return [
+            Chunk(content=content, metadata={"timestamp": timestamp})
+            for content, timestamp in zip(self.contents, self.timestamps)
+        ]
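Assuming `contents` and `timestamps` are the parallel lists TimeSeriesData already maintains, to_chunks() pairs them up so each chunk remembers when it occurred; with hypothetical values:

    # contents   = ["intro", "product demo"]
    # timestamps = [0.0, 42.0]
    # time_series.to_chunks() returns:
    #   [Chunk(content="intro", metadata={"timestamp": 0.0}),
    #    Chunk(content="product demo", metadata={"timestamp": 42.0})]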
@@ -1,19 +1,18 @@
 from abc import ABC, abstractmethod
 from typing import List, Union
 from fastapi import UploadFile
+from core.models.documents import Chunk
 
 
 class BaseParser(ABC):
     """Base class for document parsing"""
 
     @abstractmethod
-    async def split_text(self, text: str) -> List[str]:
+    async def split_text(self, text: str) -> List[Chunk]:
         """Split plain text into chunks"""
         pass
 
     @abstractmethod
-    async def parse_file(
-        self, file: Union[UploadFile, bytes], content_type: str
-    ) -> List[str]:
+    async def parse_file(self, file: bytes, content_type: str) -> List[Chunk]:
         """Parse file content into text chunks"""
         pass
@@ -1,17 +1,13 @@
-from typing import List, Union
+from typing import List, Optional
 import logging
 import os
-from fastapi import UploadFile
 import tempfile
 import magic
+from core.models.documents import Chunk
 
-from core.models.time_series import (
-    TimeSeriesData,
-) # python-magic library for file type detection
-
-from .base_parser import BaseParser
-from .unstructured_parser import UnstructuredAPIParser
-from .video.parse_video import VideoParser
+from core.parser.base_parser import BaseParser
+from core.parser.unstructured_parser import UnstructuredAPIParser
+from core.parser.video.parse_video import VideoParser
 
 logger = logging.getLogger(__name__)
 
@@ -35,7 +31,10 @@ class CombinedParser(BaseParser):
         self.magic = magic.Magic(mime=True)
 
     def _is_video_file(
-        self, file_path: str = None, file_bytes: bytes = None, filename: str = None
+        self,
+        file_path: Optional[str] = None,
+        file_bytes: Optional[bytes] = None,
+        filename: Optional[str] = None,
     ) -> bool:
         """
         Detect if a file is a video using multiple methods:
@@ -73,22 +72,12 @@ class CombinedParser(BaseParser):
 
         return False
 
-    async def split_text(self, text: str) -> List[str]:
+    async def split_text(self, text: str) -> List[Chunk]:
         """Split plain text into chunks using unstructured parser"""
         return await self.unstructured_parser.split_text(text)
 
-    async def parse_file(
-        self, file: Union[UploadFile, bytes], content_type: str
-    ) -> List[str]:
+    async def parse_file(self, file: bytes, content_type: str) -> List[Chunk]:
         """Parse file content into text chunks"""
-        # For UploadFile, check both filename and content
-        if isinstance(file, UploadFile):
-            content = await file.read()
-            is_video = self._is_video_file(file_bytes=content, filename=file.filename)
-            # Reset file position for later use
-            await file.seek(0)
-        else:
-            # For bytes, we can only check content
-            is_video = self._is_video_file(file_bytes=file)
+        is_video = self._is_video_file(file_bytes=file)
 
         if is_video:
@@ -96,21 +85,13 @@ class CombinedParser(BaseParser):
         else:
             return await self.unstructured_parser.parse_file(file, content_type)
 
-    async def _parse_video(self, file: Union[UploadFile, bytes]) -> List[str]:
+    async def _parse_video(self, file: bytes) -> List[Chunk]:
         """Parse video file and combine transcript and frame descriptions into chunks"""
         # Save video to temporary file if needed
-        if isinstance(file, bytes):
-            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
-            temp_file.write(file)
-            temp_file.close()
-            video_path = temp_file.name
-        else:
-            # For UploadFile, save to temp file
-            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
-            content = await file.read()
-            temp_file.write(content)
-            temp_file.close()
-            video_path = temp_file.name
+        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
+        temp_file.write(file)
+        temp_file.close()
+        video_path = temp_file.name
 
         try:
             # Process video
@@ -121,10 +102,10 @@ class CombinedParser(BaseParser):
             )
             results = parser.process_video()
             # Get all frame descriptions
-            frame_descriptions: TimeSeriesData = results["frame_descriptions"]
+            frame_chunks = results["frame_descriptions"].to_chunks()
             # Get all transcript text
-            transcript: TimeSeriesData = results["transcript"]
-            return frame_descriptions.contents + transcript.contents
+            transcript_chunks = results["transcript"].to_chunks()
+            return frame_chunks + transcript_chunks
 
         finally:
             # Clean up temporary file
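Since parse_file now takes raw bytes only, an UploadFile has to be read by the caller first; a brief sketch of the call site (hypothetical variable names, mirroring what DocumentService does below):

    file_content = await file.read()  # file: fastapi.UploadFile
    chunks = await parser.parse_file(file_content, file.content_type or "")
    # for a video, chunks is frame-description chunks plus transcript chunks,
    # each carrying {"timestamp": ...} in its metadata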
@@ -2,6 +2,7 @@ from typing import List
 import io
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_unstructured import UnstructuredLoader
+from core.models.documents import Chunk
 import logging
 
 from .base_parser import BaseParser
@@ -24,11 +25,14 @@ class UnstructuredAPIParser(BaseParser):
             separators=["\n\n", "\n", ". ", " ", ""],
         )
 
-    async def split_text(self, text: str) -> List[str]:
+    async def split_text(self, text: str) -> List[Chunk]:
         """Split plain text into chunks"""
-        return self.text_splitter.split_text(text)
+        return [
+            Chunk(content=chunk, metadata={})
+            for chunk in self.text_splitter.split_text(text)
+        ]
 
-    async def parse_file(self, file: bytes, content_type: str) -> List[str]:
+    async def parse_file(self, file: bytes, content_type: str) -> List[Chunk]:
         """Parse file content using unstructured"""
         # Parse with unstructured
         loader = UnstructuredLoader(
@@ -38,4 +42,6 @@ class UnstructuredAPIParser(BaseParser):
             chunking_strategy="by_title",
         )
         elements = loader.load()
-        return [element.page_content for element in elements]
+        return [
+            Chunk(content=element.page_content, metadata={}) for element in elements
+        ]
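Plain-text chunks also come back as Chunk objects now, just with empty metadata; a brief sketch with a hypothetical input:

    chunks = await parser.split_text("some long document text ...")
    # chunks[0].content  -> the first text segment
    # chunks[0].metadata -> {}  (no timestamps for plain text)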
@@ -3,7 +3,8 @@ from typing import Dict, Any, List, Optional
 from fastapi import UploadFile
 
 from core.models.request import IngestTextRequest
-from ..models.documents import (
+from core.models.documents import (
+    Chunk,
     Document,
     DocumentChunk,
     ChunkResult,
@@ -62,9 +63,7 @@ class DocumentService:
 
         # Search chunks with vector similarity
         chunks = await self.vector_store.query_similar(
-            query_embedding,
-            k=k,
-            doc_ids=doc_ids,
+            query_embedding, k=k, doc_ids=doc_ids
         )
         logger.info(f"Found {len(chunks)} similar chunks")
 
@@ -148,9 +147,7 @@ class DocumentService:
         logger.info(f"Generated {len(embeddings)} embeddings")
 
         # 4. Create and store chunk objects
-        chunk_objects = self._create_chunk_objects(
-            doc.external_id, chunks, embeddings, doc.metadata
-        )
+        chunk_objects = self._create_chunk_objects(doc.external_id, chunks, embeddings)
         logger.info(f"Created {len(chunk_objects)} chunk objects")
 
         # 5. Store everything
@@ -168,7 +165,7 @@ class DocumentService:
 
         # 1. Create document record
         doc = Document(
-            content_type=file.content_type,
+            content_type=file.content_type or "",
             filename=file.filename,
             metadata=metadata,
             owner={"type": auth.entity_type, "id": auth.entity_id},
@@ -191,7 +188,7 @@ class DocumentService:
         )
 
         # 3. Parse content into chunks
-        chunks = await self.parser.parse_file(file_content, file.content_type)
+        chunks = await self.parser.parse_file(file_content, file.content_type or "")
         if not chunks:
             raise ValueError("No content chunks extracted from file")
         logger.info(f"Parsed file into {len(chunks)} chunks")
@@ -201,9 +198,7 @@ class DocumentService:
         logger.info(f"Generated {len(embeddings)} embeddings")
 
         # 5. Create and store chunk objects
-        chunk_objects = self._create_chunk_objects(
-            doc.external_id, chunks, embeddings, doc.metadata
-        )
+        chunk_objects = self._create_chunk_objects(doc.external_id, chunks, embeddings)
        logger.info(f"Created {len(chunk_objects)} chunk objects")
 
         # 6. Store everything
@@ -215,20 +210,13 @@ class DocumentService:
     def _create_chunk_objects(
         self,
         doc_id: str,
-        chunks: List[str],
+        chunks: List[Chunk],
         embeddings: List[List[float]],
-        metadata: Dict[str, Any],
     ) -> List[DocumentChunk]:
         """Helper to create chunk objects"""
         return [
-            DocumentChunk(
-                document_id=doc_id,
-                content=content,
-                embedding=embedding,
-                chunk_number=i,
-                metadata=metadata,
-            )
-            for i, (content, embedding) in enumerate(zip(chunks, embeddings))
+            c.to_document_chunk(chunk_number=i, embedding=embedding, document_id=doc_id)
+            for i, (embedding, c) in enumerate(zip(embeddings, chunks))
         ]
 
     async def _store_chunks_and_doc(
|
|||||||
return results
|
return results
|
||||||
|
|
||||||
async def _create_document_results(
|
async def _create_document_results(
|
||||||
self, auth: AuthContext, chunks: List[DocumentChunk]
|
self, auth: AuthContext, chunks: List[ChunkResult]
|
||||||
) -> List[DocumentResult]:
|
) -> List[DocumentResult]:
|
||||||
"""Group chunks by document and create DocumentResult objects."""
|
"""Group chunks by document and create DocumentResult objects."""
|
||||||
# Group chunks by document and get highest scoring chunk per doc
|
# Group chunks by document and get highest scoring chunk per doc
|
||||||
doc_chunks: Dict[str, DocumentChunk] = {}
|
doc_chunks: Dict[str, ChunkResult] = {}
|
||||||
for chunk in chunks:
|
for chunk in chunks:
|
||||||
if (
|
if (
|
||||||
chunk.document_id not in doc_chunks
|
chunk.document_id not in doc_chunks
|
||||||
|
@@ -6,16 +6,17 @@ from core.models.documents import DocumentChunk
 
 class BaseVectorStore(ABC):
     @abstractmethod
-    def store_embeddings(self, chunks: List[DocumentChunk]) -> Tuple[bool, List[str]]:
+    async def store_embeddings(
+        self, chunks: List[DocumentChunk]
+    ) -> Tuple[bool, List[str]]:
         """Store document chunks and their embeddings"""
         pass
 
     @abstractmethod
-    def query_similar(
+    async def query_similar(
         self,
         query_embedding: List[float],
         k: int,
-        auth: AuthContext,
         doc_ids: Optional[List[str]] = None,
     ) -> List[DocumentChunk]:
         """Find similar chunks"""
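Both methods are now async and query_similar no longer receives auth; a minimal skeleton of a conforming implementation (hypothetical class and assumed module path, not part of this commit):

    from typing import List, Optional, Tuple

    from core.models.documents import DocumentChunk
    from core.vector_store.base_vector_store import BaseVectorStore  # assumed module path


    class InMemoryVectorStore(BaseVectorStore):
        def __init__(self):
            self._chunks: List[DocumentChunk] = []

        async def store_embeddings(self, chunks: List[DocumentChunk]) -> Tuple[bool, List[str]]:
            self._chunks.extend(chunks)
            return True, [c.document_id for c in chunks]

        async def query_similar(
            self, query_embedding: List[float], k: int, doc_ids: Optional[List[str]] = None
        ) -> List[DocumentChunk]:
            candidates = [c for c in self._chunks if doc_ids is None or c.document_id in doc_ids]
            # rank by dot product against each stored chunk's embedding
            candidates.sort(
                key=lambda c: sum(a * b for a, b in zip(c.embedding, query_embedding)),
                reverse=True,
            )
            return candidates[:k]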