# core/models/documents.py — pydantic models for documents, chunks, and query results.
from typing import Dict, Any, List, Optional, Literal
from enum import Enum
from datetime import UTC, datetime
from PIL import Image
from pydantic import BaseModel, Field, field_validator
import uuid
import logging
from core.models.video import TimeSeriesData
logger = logging.getLogger(__name__)
class QueryReturnType(str, Enum):
    """Granularity of results a query can return: raw chunks or whole documents."""

    CHUNKS = "chunks"        # return individual matching chunks
    DOCUMENTS = "documents"  # return results aggregated at the document level
class StorageFileInfo(BaseModel):
    """Describes a single file object held in backing storage."""

    bucket: str  # storage bucket the object lives in
    key: str  # object key within the bucket
    version: int = 1  # starts at 1; presumably bumped on re-upload — confirm with writer code
    filename: Optional[str] = None  # original filename, when known
    content_type: Optional[str] = None  # MIME type, when known
    # Timezone-aware creation time of this record (UTC).
    timestamp: datetime = Field(default_factory=lambda: datetime.now(UTC))
class Document(BaseModel):
    """Represents a document stored in the database documents collection"""

    external_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    owner: Dict[str, str]
    content_type: str
    filename: Optional[str] = None
    # User-defined metadata.
    metadata: Dict[str, Any] = Field(default_factory=dict)
    # Legacy field for backwards compatibility - for single file storage.
    storage_info: Dict[str, str] = Field(default_factory=dict)
    # List of files associated with this document.
    storage_files: List[StorageFileInfo] = Field(default_factory=list)
    # System-managed metadata such as creation date, version, etc.
    system_metadata: Dict[str, Any] = Field(
        default_factory=lambda: {
            "created_at": datetime.now(UTC),
            "updated_at": datetime.now(UTC),
            "version": 1,
            "folder_name": None,
            "end_user_id": None,
        }
    )
    # Metadata to help with querying, e.g. frame descriptions and
    # time-stamped transcripts for videos.
    additional_metadata: Dict[str, Any] = Field(default_factory=dict)
    access_control: Dict[str, List[str]] = Field(
        default_factory=lambda: {"readers": [], "writers": [], "admins": []}
    )
    chunk_ids: List[str] = Field(default_factory=list)

    def __hash__(self):
        # Identity is the external_id, so documents dedupe correctly in sets/dicts.
        return hash(self.external_id)

    def __eq__(self, other):
        # Two documents are equal iff they share an external_id.
        return isinstance(other, Document) and self.external_id == other.external_id
class DocumentContent(BaseModel):
    """Represents either a URL or content string"""

    type: Literal["url", "string"]
    value: str
    filename: Optional[str] = Field(None, description="Filename when type is url")

    @field_validator("filename")
    @classmethod  # pydantic v2 validators are classmethods; implicit wrapping is deprecated
    def filename_only_for_url(cls, v, info):
        """Enforce that `filename` is present exactly when `type` is "url".

        Raises:
            ValueError: if filename is set for a string, or missing for a url.
        """
        # Lazy %-formatting so the repr is only built when DEBUG is enabled.
        logger.debug("Value looks like: %s", info)
        if info.data.get("type") == "string" and v is not None:
            raise ValueError("filename can only be set when type is url")
        if info.data.get("type") == "url" and v is None:
            raise ValueError("filename is required when type is url")
        return v
class DocumentResult(BaseModel):
    """Query result at document level"""

    score: float  # highest chunk score within the document
    document_id: str  # external_id of the matching document
    metadata: Dict[str, Any]  # user-defined metadata
    content: DocumentContent  # resolved content (url or inline string)
    additional_metadata: Dict[str, Any]  # extra query-time metadata (e.g. video data)
class ChunkResult(BaseModel):
    """Query result at chunk level"""

    content: str  # chunk payload (text, or a data URI for image chunks)
    score: float  # similarity score for this chunk
    document_id: str  # external_id of the parent document
    chunk_number: int
    metadata: Dict[str, Any]
    content_type: str
    filename: Optional[str] = None
    download_url: Optional[str] = None

    def augmented_content(self, doc: DocumentResult) -> str | Image.Image:
        """Return this chunk's content, enriched with video context when available.

        A "timestamp" key in the chunk metadata marks the chunk as coming from
        a video. In that case the parent document's frame descriptions and
        transcript are used to build a richer text block; otherwise the raw
        chunk content is returned unchanged.
        """
        if "timestamp" not in self.metadata:
            # NOTE(review): a disabled code path used to decode "is_image"
            # chunks from base64 into a PIL image here; image chunks currently
            # fall through and return their raw content.
            return self.content

        # Video chunk: pull time-aligned frame/transcript data off the parent doc.
        frame_description = doc.additional_metadata.get("frame_description")
        transcript = doc.additional_metadata.get("transcript")
        if not isinstance(frame_description, dict) or not isinstance(transcript, dict):
            logger.warning("Invalid frame description or transcript - not a dictionary")
            return self.content

        ts_frame = TimeSeriesData(time_to_content=frame_description)
        ts_transcript = TimeSeriesData(time_to_content=transcript)
        # Every time this chunk's content appears, in either stream.
        timestamps = (
            ts_frame.content_to_times[self.content]
            + ts_transcript.content_to_times[self.content]
        )
        return "\n\n".join(
            f"Frame description: {ts_frame.at_time(t)} \n \n Transcript: {ts_transcript.at_time(t)}"
            for t in timestamps
        )