244 lines
9.6 KiB
Python
Raw Normal View History

from typing import Dict, Any, List, Literal, Optional, Union
from io import BinaryIO
from pathlib import Path
from datetime import datetime
2024-12-22 19:46:53 -05:00
from pydantic import BaseModel, Field, field_validator
class Document(BaseModel):
    """Metadata record describing a stored document.

    Besides the declared fields, an instance may carry a private ``_client``
    reference. The ``update_*`` helpers delegate to that client and raise
    ``ValueError`` while it is still ``None``.

    Attributes:
        external_id: Unique document identifier.
        content_type: Content type of the document.
        filename: Original filename if available.
        metadata: User-defined metadata.
        storage_info: Storage-related information.
        system_metadata: System-managed metadata.
        access_control: Access control information.
        chunk_ids: IDs of document chunks.
    """

    external_id: str = Field(
        ...,
        description="Unique document identifier",
    )
    content_type: str = Field(
        ...,
        description="Content type of the document",
    )
    filename: Optional[str] = Field(
        default=None,
        description="Original filename if available",
    )
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="User-defined metadata",
    )
    storage_info: Dict[str, str] = Field(
        default_factory=dict,
        description="Storage-related information",
    )
    system_metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="System-managed metadata",
    )
    access_control: Dict[str, Any] = Field(
        default_factory=dict,
        description="Access control information",
    )
    chunk_ids: List[str] = Field(
        default_factory=list,
        description="IDs of document chunks",
    )

    # Client reference used by the update methods below; None until attached.
    _client = None

    def update_with_text(
        self,
        content: str,
        filename: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        rules: Optional[List] = None,
        update_strategy: str = "add",
        use_colpali: Optional[bool] = None,
    ) -> "Document":
        """Update this document with new text content via the attached client.

        Args:
            content: The new content to add.
            filename: Optional new filename for the document.
            metadata: Additional metadata to update (optional).
            rules: Optional list of rules to apply to the content.
            update_strategy: Strategy for updating the document
                (currently only 'add' is supported).
            use_colpali: Whether to use multi-vector embedding.

        Returns:
            Whatever the client's ``update_document_with_text`` returns
            (annotated as the updated ``Document``).

        Raises:
            ValueError: If no client is attached to this instance.
        """
        client = self._client
        if client is None:
            raise ValueError(
                "Document instance not connected to a client. "
                "Use a document returned from a DataBridge client method."
            )
        # This document's own ID is always the update target.
        call_kwargs = {
            "document_id": self.external_id,
            "content": content,
            "filename": filename,
            "metadata": metadata,
            "rules": rules,
            "update_strategy": update_strategy,
            "use_colpali": use_colpali,
        }
        return client.update_document_with_text(**call_kwargs)

    def update_with_file(
        self,
        file: "Union[str, bytes, BinaryIO, Path]",
        filename: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        rules: Optional[List] = None,
        update_strategy: str = "add",
        use_colpali: Optional[bool] = None,
    ) -> "Document":
        """Update this document with content from a file via the attached client.

        Args:
            file: File to add (path string, bytes, file object, or Path).
            filename: Name of the file.
            metadata: Additional metadata to update (optional).
            rules: Optional list of rules to apply to the content.
            update_strategy: Strategy for updating the document
                (currently only 'add' is supported).
            use_colpali: Whether to use multi-vector embedding.

        Returns:
            Whatever the client's ``update_document_with_file`` returns
            (annotated as the updated ``Document``).

        Raises:
            ValueError: If no client is attached to this instance.
        """
        client = self._client
        if client is None:
            raise ValueError(
                "Document instance not connected to a client. "
                "Use a document returned from a DataBridge client method."
            )
        # This document's own ID is always the update target.
        call_kwargs = {
            "document_id": self.external_id,
            "file": file,
            "filename": filename,
            "metadata": metadata,
            "rules": rules,
            "update_strategy": update_strategy,
            "use_colpali": use_colpali,
        }
        return client.update_document_with_file(**call_kwargs)

    def update_metadata(
        self,
        metadata: Dict[str, Any],
    ) -> "Document":
        """Update only this document's metadata via the attached client.

        Args:
            metadata: Metadata to update.

        Returns:
            Whatever the client's ``update_document_metadata`` returns
            (annotated as the updated ``Document``).

        Raises:
            ValueError: If no client is attached to this instance.
        """
        client = self._client
        if client is None:
            raise ValueError(
                "Document instance not connected to a client. "
                "Use a document returned from a DataBridge client method."
            )
        return client.update_document_metadata(
            document_id=self.external_id,
            metadata=metadata,
        )
2024-12-22 19:46:53 -05:00
class ChunkResult(BaseModel):
    """One chunk-level hit returned by a query.

    Attributes:
        content: Chunk content.
        score: Relevance score.
        document_id: Parent document ID.
        chunk_number: Chunk sequence number.
        metadata: Document metadata; empty dict when not supplied.
        content_type: Content type.
        filename: Original filename, or None.
        download_url: URL to download the full document, or None.
    """

    # Required identification and ranking fields.
    content: str = Field(
        ...,
        description="Chunk content",
    )
    score: float = Field(
        ...,
        description="Relevance score",
    )
    document_id: str = Field(
        ...,
        description="Parent document ID",
    )
    chunk_number: int = Field(
        ...,
        description="Chunk sequence number",
    )

    # Descriptive fields about the parent document.
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Document metadata",
    )
    content_type: str = Field(
        ...,
        description="Content type",
    )
    filename: Optional[str] = Field(
        default=None,
        description="Original filename",
    )
    download_url: Optional[str] = Field(
        default=None,
        description="URL to download full document",
    )
2024-12-22 19:46:53 -05:00
class DocumentContent(BaseModel):
    """Represents either a URL or a content string.

    ``filename`` is tied to ``type``: it must be provided when ``type`` is
    "url" and must be absent (None) when ``type`` is "string".

    Attributes:
        type: Content type, either "url" or "string".
        value: The actual content or URL.
        filename: Filename when type is url.
    """

    type: Literal["url", "string"] = Field(..., description="Content type (url or string)")
    value: str = Field(..., description="The actual content or URL")
    # validate_default=True makes the validator below run even when the caller
    # omits ``filename``. Pydantic skips validators for untouched defaults, so
    # without it a "url" entry lacking a filename would pass unnoticed.
    filename: Optional[str] = Field(
        None,
        validate_default=True,
        description="Filename when type is url",
    )

    @field_validator("filename")
    @classmethod
    def filename_only_for_url(cls, v, values):
        """Enforce the filename/type pairing.

        Args:
            v: The candidate ``filename`` value.
            values: Validation info; ``values.data`` holds the fields that
                were already validated (``type`` is declared before
                ``filename``).

        Returns:
            ``v`` unchanged when the pairing is valid.

        Raises:
            ValueError: If a filename is given for "string" content, or is
                missing for "url" content.
        """
        # ``type`` is missing from ``data`` if it failed its own validation;
        # ``.get`` then yields None and neither rule fires.
        content_kind = values.data.get("type")
        if content_kind == "string" and v is not None:
            raise ValueError("filename can only be set when type is url")
        if content_kind == "url" and v is None:
            raise ValueError("filename is required when type is url")
        return v
class DocumentResult(BaseModel):
    """One document-level hit returned by a query.

    Attributes:
        score: Relevance score.
        document_id: Document ID.
        metadata: Document metadata; empty dict when not supplied.
        content: Document content or URL, as a ``DocumentContent``.
    """

    score: float = Field(
        ...,
        description="Relevance score",
    )
    document_id: str = Field(
        ...,
        description="Document ID",
    )
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Document metadata",
    )
    content: DocumentContent = Field(
        ...,
        description="Document content or URL",
    )
2024-12-26 08:52:25 -05:00
class ChunkSource(BaseModel):
    """Identifies a chunk that was used as a source for a completion.

    Attributes:
        document_id: ID of the source document.
        chunk_number: Chunk number within the document.
        score: Relevance score, or None when not provided.
    """

    document_id: str = Field(
        ...,
        description="ID of the source document",
    )
    chunk_number: int = Field(
        ...,
        description="Chunk number within the document",
    )
    score: Optional[float] = Field(
        default=None,
        description="Relevance score",
    )
2024-12-26 08:52:25 -05:00
class CompletionResponse(BaseModel):
    """Response payload for a completion request.

    Attributes:
        completion: The completion text (required).
        usage: Integer counters keyed by name (required); presumably token
            usage — confirm against the server response.
        sources: Sources of chunks used in the completion; empty list when
            not supplied.
        metadata: Optional free-form metadata; None when not supplied.
    """

    completion: str
    usage: Dict[str, int]
    sources: List[ChunkSource] = Field(
        default_factory=list,
        description="Sources of chunks used in the completion",
    )
    metadata: Optional[Dict[str, Any]] = None
class IngestTextRequest(BaseModel):
    """Request body used when ingesting text content.

    Attributes:
        content: The text to ingest (required).
        filename: Optional filename; None when not supplied.
        metadata: Free-form metadata; empty dict when not supplied.
        rules: List of rule dicts; empty list when not supplied.
        use_colpali: Boolean flag, False by default.
    """

    content: str
    filename: Optional[str] = None
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
    )
    rules: List[Dict[str, Any]] = Field(
        default_factory=list,
    )
    use_colpali: bool = Field(
        default=False,
    )
class Entity(BaseModel):
    """Represents an entity in a knowledge graph.

    Equality and hashing are based solely on ``id``, so two ``Entity``
    objects with the same ID compare equal and collapse to one element in
    sets and dict keys, regardless of their other fields.

    Attributes:
        id: Unique entity identifier.
        label: Display label for the entity.
        type: Entity type.
        properties: Entity properties.
        document_ids: Source document IDs.
        chunk_sources: Source chunk numbers by document ID.
    """

    id: str = Field(..., description="Unique entity identifier")
    label: str = Field(..., description="Display label for the entity")
    type: str = Field(..., description="Entity type")
    properties: Dict[str, Any] = Field(default_factory=dict, description="Entity properties")
    document_ids: List[str] = Field(default_factory=list, description="Source document IDs")
    chunk_sources: Dict[str, List[int]] = Field(
        default_factory=dict, description="Source chunk numbers by document ID"
    )

    def __hash__(self) -> int:
        """Hash on ``id`` only, consistent with ``__eq__``."""
        return hash(self.id)

    def __eq__(self, other: object) -> bool:
        """Return True when ``other`` is an ``Entity`` with the same ``id``."""
        if not isinstance(other, Entity):
            # Defer to the other operand's comparison instead of forcing
            # False; Python still falls back to False if nobody handles it.
            return NotImplemented
        return self.id == other.id
class Relationship(BaseModel):
    """Represents a relationship between entities in a knowledge graph.

    Equality and hashing are based solely on ``id``, so two ``Relationship``
    objects with the same ID compare equal and collapse to one element in
    sets and dict keys, regardless of their other fields.

    Attributes:
        id: Unique relationship identifier.
        source_id: Source entity ID.
        target_id: Target entity ID.
        type: Relationship type.
        document_ids: Source document IDs.
        chunk_sources: Source chunk numbers by document ID.
    """

    id: str = Field(..., description="Unique relationship identifier")
    source_id: str = Field(..., description="Source entity ID")
    target_id: str = Field(..., description="Target entity ID")
    type: str = Field(..., description="Relationship type")
    document_ids: List[str] = Field(default_factory=list, description="Source document IDs")
    chunk_sources: Dict[str, List[int]] = Field(
        default_factory=dict, description="Source chunk numbers by document ID"
    )

    def __hash__(self) -> int:
        """Hash on ``id`` only, consistent with ``__eq__``."""
        return hash(self.id)

    def __eq__(self, other: object) -> bool:
        """Return True when ``other`` is a ``Relationship`` with the same ``id``."""
        if not isinstance(other, Relationship):
            # Defer to the other operand's comparison instead of forcing
            # False; Python still falls back to False if nobody handles it.
            return NotImplemented
        return self.id == other.id
class Graph(BaseModel):
    """A knowledge graph: its entities, relationships and bookkeeping data.

    Attributes:
        id: Unique graph identifier.
        name: Graph name.
        entities: Entities in the graph.
        relationships: Relationships in the graph.
        metadata: Graph metadata.
        document_ids: Source document IDs.
        filters: Document filters used to create the graph, or None.
        created_at: Creation timestamp.
        updated_at: Last update timestamp.
        owner: Graph owner information.
        access_control: Access control information.
    """

    # Identity.
    id: str = Field(
        ...,
        description="Unique graph identifier",
    )
    name: str = Field(
        ...,
        description="Graph name",
    )

    # Graph contents.
    entities: List[Entity] = Field(
        default_factory=list,
        description="Entities in the graph",
    )
    relationships: List[Relationship] = Field(
        default_factory=list,
        description="Relationships in the graph",
    )

    # Provenance.
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Graph metadata",
    )
    document_ids: List[str] = Field(
        default_factory=list,
        description="Source document IDs",
    )
    filters: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Document filters used to create the graph",
    )

    # Timestamps are required; no default is generated here.
    created_at: datetime = Field(
        ...,
        description="Creation timestamp",
    )
    updated_at: datetime = Field(
        ...,
        description="Last update timestamp",
    )

    # Ownership and permissions.
    owner: Dict[str, str] = Field(
        default_factory=dict,
        description="Graph owner information",
    )
    access_control: Dict[str, List[str]] = Field(
        default_factory=dict,
        description="Access control information",
    )