mirror of
https://github.com/james-m-jordan/morphik-core.git
synced 2025-05-09 19:32:38 +00:00
39 lines
1.0 KiB
Python
39 lines
1.0 KiB
Python
from typing import Any, Dict, List
|
|
|
|
import numpy as np
|
|
from pydantic import BaseModel, Field
|
|
|
|
Embedding = List[float] | List[List[float]] | np.ndarray
|
|
|
|
|
|
class DocumentChunk(BaseModel):
|
|
"""Represents a chunk stored in VectorStore"""
|
|
|
|
document_id: str # external_id of parent document
|
|
content: str
|
|
embedding: Embedding
|
|
chunk_number: int
|
|
# chunk-specific metadata
|
|
metadata: Dict[str, Any] = Field(default_factory=dict)
|
|
score: float = 0.0
|
|
|
|
model_config = {"arbitrary_types_allowed": True}
|
|
|
|
|
|
class Chunk(BaseModel):
|
|
"""Represents a chunk containing content and metadata"""
|
|
|
|
content: str
|
|
metadata: Dict[str, Any] = Field(default_factory=dict)
|
|
|
|
model_config = {"arbitrary_types_allowed": True}
|
|
|
|
def to_document_chunk(self, document_id: str, chunk_number: int, embedding: Embedding) -> DocumentChunk:
|
|
return DocumentChunk(
|
|
document_id=document_id,
|
|
content=self.content,
|
|
embedding=embedding,
|
|
chunk_number=chunk_number,
|
|
metadata=self.metadata,
|
|
)
|