fix ingestion bug (#8)

2025-05-09 19:32:38 +00:00 · 2024-12-30 11:46:42 -05:00 · 2024-12-30 11:46:42 -05:00 · 848198f478
commit 848198f478
parent 478c1acd8d
3 changed files with 26 additions and 29 deletions
--- a/core/models/chunk.py
+++ b/core/models/chunk.py
@@ -2,8 +2,31 @@ from typing import Dict, Any, List
 from pydantic import BaseModel, Field


+class DocumentChunk(BaseModel):
+    """Represents a chunk stored in VectorStore"""
+
+    document_id: str  # external_id of parent document
+    content: str
+    embedding: List[float]
+    chunk_number: int
+    # chunk-specific metadata
+    metadata: Dict[str, Any] = Field(default_factory=dict)
+    score: float = 0.0
+
+
 class Chunk(BaseModel):
    """Represents a chunk containing content and metadata"""

    content: str
    metadata: Dict[str, Any] = Field(default_factory=dict)
+
+    def to_document_chunk(
+        self, document_id: str, chunk_number: int, embedding: List[float]
+    ) -> DocumentChunk:
+        return DocumentChunk(
+            document_id=document_id,
+            content=self.content,
+            embedding=embedding,
+            chunk_number=chunk_number,
+            metadata=self.metadata,
+        )
--- a/core/models/documents.py
+++ b/core/models/documents.py
@@ -6,7 +6,6 @@ import uuid
 import logging

 from core.models.video import TimeSeriesData
-from core.models.chunk import Chunk

 logger = logging.getLogger(__name__)

@@ -42,30 +41,6 @@ class Document(BaseModel):
    chunk_ids: List[str] = Field(default_factory=list)


-class DocumentChunk(BaseModel):
-    """Represents a chunk stored in VectorStore"""
-
-    document_id: str  # external_id of parent document
-    content: str
-    embedding: List[float]
-    chunk_number: int
-    # chunk-specific metadata
-    metadata: Dict[str, Any] = Field(default_factory=dict)
-    score: float = 0.0
-
-
-def to_document_chunk(
-    chunk: Chunk, document_id: str, chunk_number: int, embedding: List[float]
-) -> DocumentChunk:
-    return DocumentChunk(
-        document_id=document_id,
-        content=chunk.content,
-        embedding=embedding,
-        chunk_number=chunk_number,
-        metadata=chunk.metadata,
-    )
-
-
 class DocumentContent(BaseModel):
    """Represents either a URL or content string"""

@@ -116,8 +91,8 @@ class ChunkResult(BaseModel):
                if not isinstance(frame_description, dict) or not isinstance(transcript, dict):
                    logger.warning("Invalid frame description or transcript - not a dictionary")
                    return self.content
-                ts_frame = TimeSeriesData(frame_description)
-                ts_transcript = TimeSeriesData(transcript)
+                ts_frame = TimeSeriesData(time_to_content=frame_description)
+                ts_transcript = TimeSeriesData(time_to_content=transcript)
                timestamps = (
                    ts_frame.content_to_times[self.content]
                    + ts_transcript.content_to_times[self.content]
--- a/core/models/video.py
+++ b/core/models/video.py
@@ -1,10 +1,9 @@
 from collections import defaultdict
-from numbers import Number
 from typing import List, Tuple, Optional, Union, Dict
 from bisect import bisect_left
 import logging

-from pydantic import BaseModel, Field, computed_field
+from pydantic import BaseModel, computed_field

 from core.models.chunk import Chunk