2024-11-22 18:56:22 -05:00
|
|
|
from typing import Dict, Any, List, Optional, Literal
|
|
|
|
from enum import Enum
|
2024-11-24 14:29:25 -05:00
|
|
|
from datetime import UTC, datetime
|
2024-11-22 18:56:22 -05:00
|
|
|
from pydantic import BaseModel, Field, field_validator
|
|
|
|
import uuid
|
2024-12-02 20:03:35 -05:00
|
|
|
import logging
|
2024-11-22 18:56:22 -05:00
|
|
|
|
2024-12-02 20:03:35 -05:00
|
|
|
# Module-level logger namespaced by module path, per stdlib logging convention.
logger = logging.getLogger(__name__)
|
2024-11-22 18:56:22 -05:00
|
|
|
|
|
|
|
class EntityType(str, Enum):
    """Closed set of entity kinds; str mixin keeps values JSON/DB-friendly."""

    USER = "user"
    DEVELOPER = "developer"
|
|
|
|
|
|
|
|
|
|
|
|
class QueryReturnType(str, Enum):
    """Granularity of query results: individual chunks or whole documents."""

    CHUNKS = "chunks"
    DOCUMENTS = "documents"
|
|
|
|
|
|
|
|
|
|
|
|
class Document(BaseModel):
    """Represents a document stored in MongoDB documents collection"""

    # Stable public identifier; a fresh uuid4 is generated per instance.
    external_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    # Owner descriptor as a str->str mapping — exact keys not shown here;
    # NOTE(review): confirm expected schema against the callers that set it.
    owner: Dict[str, str]
    # Content type string (presumably a MIME type — TODO confirm).
    content_type: str
    filename: Optional[str] = None
    # Free-form caller-supplied metadata.
    metadata: Dict[str, Any] = Field(default_factory=dict)
    # Storage backend details (str->str); empty until populated elsewhere.
    storage_info: Dict[str, str] = Field(default_factory=dict)
    # System-managed bookkeeping. The lambda runs once per instance, so the
    # timestamps are timezone-aware (UTC) values captured at model creation.
    system_metadata: Dict[str, Any] = Field(
        default_factory=lambda: {
            "created_at": datetime.now(UTC),
            "updated_at": datetime.now(UTC),
            "version": 1
        }
    )
    # Access-control lists keyed by role; all roles start empty.
    access_control: Dict[str, List[str]] = Field(
        default_factory=lambda: {
            "readers": [],
            "writers": [],
            "admins": []
        }
    )
    # IDs of chunks derived from this document (presumably DocumentChunk
    # records in the vector store — verify against the ingestion path).
    chunk_ids: List[str] = Field(default_factory=list)
|
|
|
|
|
|
|
|
|
|
|
|
class DocumentChunk(BaseModel):
    """Represents a chunk stored in VectorStore"""

    # external_id of parent document
    document_id: str
    # TODO: This might be suboptimal due to storage size. consider moving to separate store.
    content: str
    # Dense vector for similarity search; dimensionality set by the
    # embedding model used upstream (not visible here).
    embedding: List[float]
    # Ordinal position of this chunk within the parent document.
    chunk_number: int
    version: int = 1
    metadata: Dict[str, Any] = Field(default_factory=dict)
    # Similarity score filled in at query time; 0.0 when not from a query.
    score: float = 0.0
|
2024-11-22 18:56:22 -05:00
|
|
|
|
|
|
|
|
|
|
|
class ChunkResult(BaseModel):
    """Query result at chunk level"""

    content: str
    # Relevance score; semantics depend on the vector store's metric.
    score: float
    # external_id of the source Document.
    document_id: str
    chunk_number: int
    metadata: Dict[str, Any]
    content_type: str
    filename: Optional[str] = None
    # Link for retrieving the source content when available
    # (presumably a presigned URL — TODO confirm).
    download_url: Optional[str] = None
|
|
|
|
|
|
|
|
|
|
|
|
class DocumentContent(BaseModel):
    """Represents either a URL or content string.

    When ``type`` is ``"url"``, ``value`` is the URL and ``filename`` is
    required; when ``type`` is ``"string"``, ``value`` is the raw content
    and ``filename`` must be left unset.
    """

    type: Literal["url", "string"]
    value: str
    filename: Optional[str] = Field(None, description="Filename when type is url")

    @field_validator('filename')
    @classmethod  # pydantic v2 convention: field validators are classmethods
    def filename_only_for_url(cls, v, values):
        """Enforce that `filename` is set iff `type` is 'url'."""
        # Lazy %-style args avoid building the message when DEBUG is off.
        logger.debug("Value looks like: %s", values)
        # `values` is pydantic-v2 ValidationInfo; `.data` holds the fields
        # already validated before `filename` (field-definition order).
        if values.data.get('type') == 'string' and v is not None:
            raise ValueError('filename can only be set when type is url')
        if values.data.get('type') == 'url' and v is None:
            raise ValueError('filename is required when type is url')
        return v
|
|
|
|
|
|
|
|
|
|
|
|
class DocumentResult(BaseModel):
    """Query result at document level"""

    # Highest chunk score among the document's matching chunks.
    score: float
    # external_id of the matched Document.
    document_id: str
    metadata: Dict[str, Any]
    # Either a URL or the inline content string (see DocumentContent).
    content: DocumentContent
|