225 lines
6.5 KiB
Python
Raw Normal View History

from typing import Dict, Any, List, Optional, Union
import httpx
from urllib.parse import urlparse
import jwt
2024-11-23 13:32:47 -05:00
from pydantic import BaseModel
import logging
2024-11-22 18:56:22 -05:00
import base64
2024-11-23 13:32:47 -05:00
logger = logging.getLogger(__name__)
2024-11-23 13:32:47 -05:00
class IngestRequest(BaseModel):
    """Request payload describing a single document to ingest."""

    # Document body as text. DataBridge.ingest_document forwards ``str``
    # input unchanged and base64-encodes ``bytes`` input before setting this.
    content: str
    # Content type string supplied by the caller (the client documents it
    # as a MIME type).
    content_type: str
    # Free-form, caller-supplied metadata; empty when none is given.
    metadata: Dict[str, Any] = {}
    # Optional file name supplied by the caller.
    filename: Optional[str] = None
2024-11-22 18:56:22 -05:00
2024-11-18 18:41:23 -05:00
class Document(BaseModel):
    """Metadata record for a document, as returned by the document endpoints."""

    # Identifier of the document (presumably the ID accepted by
    # DataBridge.get_document -- confirm against the server API).
    external_id: str
    content_type: str
    filename: Optional[str] = None

    # The mappings below default to empty; their key schema is defined by
    # the server and is not visible in this module.
    metadata: Dict[str, Any] = {}
    storage_info: Dict[str, str] = {}
    system_metadata: Dict[str, Any] = {}
    access_control: Dict[str, Any] = {}

    # Identifiers of the chunks associated with this document; empty by default.
    chunk_ids: List[str] = []
class ChunkResult(BaseModel):
    """One chunk-level hit returned by a query."""

    # Text of the matching chunk.
    content: str
    # Score reported by the server for this hit (DataBridge.query describes
    # its threshold as a similarity score).
    score: float
    # Identifier of the document the chunk belongs to, and the chunk's
    # number within it (numbering base is defined by the server).
    document_id: str
    chunk_number: int
    # Metadata mapping; required, unlike the defaulted mappings on Document.
    metadata: Dict[str, Any]
    content_type: str
    # Optional extras; absent values are None.
    filename: Optional[str] = None
    download_url: Optional[str] = None
class DocumentResult(BaseModel):
    """One document-level hit returned by a query."""

    # Score reported by the server for this document.
    score: float
    # Identifier of the matching document.
    document_id: str
    # Metadata mapping attached to the document (required).
    metadata: Dict[str, Any]
    # String-to-string mapping describing the document content; the key
    # names are defined by the server and are not visible in this module.
    content: Dict[str, str]
2024-11-18 18:41:23 -05:00
class DataBridge:
    """
    DataBridge client for document operations.

    The client is configured from a single connection URI of the form
    ``databridge://<owner_id>:<auth_token>@<host>[:<port>]``. Requests are
    sent over ``http`` when the host is ``localhost`` (or a ``*.localhost``
    name) and over ``https`` for every other host.

    Usage:
        async with DataBridge("databridge://owner123:token@databridge.local") as db:
            doc = await db.ingest_document("content", content_type="text/plain")
            results = await db.query("What is...")
    """

    def __init__(self, uri: str, timeout: int = 30):
        """
        Create a client from a connection URI.

        Args:
            uri: Connection URI carrying the owner id, auth token and host.
            timeout: Timeout passed to the underlying ``httpx.AsyncClient``.

        Raises:
            ValueError: If the URI or the token it carries is malformed.
        """
        self._timeout = timeout
        # Validate the URI *before* opening the HTTP client, so that a bad
        # URI cannot leave an unclosed AsyncClient behind.
        self._setup_auth(uri)
        self._client = httpx.AsyncClient(timeout=timeout)

    def _setup_auth(self, uri: str) -> None:
        """
        Parse the connection URI and store base URL, owner id and token.

        Sets ``_base_url``, ``_owner_id`` and ``_auth_token`` and then checks
        that the token is structurally a JWT (the signature is NOT verified
        here).

        Raises:
            ValueError: For any parsing or token-format failure. The message
                is always prefixed with "Failed to setup authentication: ".
        """
        try:
            parsed = urlparse(uri)
            if not parsed.netloc:
                raise ValueError("Invalid URI format")

            # Split on the LAST '@': everything before it is the credential
            # pair, everything after it is host[:port].
            userinfo, separator, host = parsed.netloc.rpartition("@")
            auth_parts = userinfo.split(":")
            if not separator or len(auth_parts) != 2 or not all(auth_parts):
                raise ValueError("URI must include owner_id and auth_token")
            if not host:
                raise ValueError("Invalid URI format")

            # Compare the actual hostname instead of substring-matching, so
            # a remote host such as "notlocalhost.example.com" is never
            # downgraded to plaintext http.
            hostname = parsed.hostname or ""
            is_local = hostname == "localhost" or hostname.endswith(".localhost")
            scheme = "http" if is_local else "https"

            self._base_url = f"{scheme}://{host}"
            self._owner_id, self._auth_token = auth_parts

            # Basic token validation: structure only, no signature check.
            try:
                jwt.decode(self._auth_token, options={"verify_signature": False})
            except jwt.InvalidTokenError as e:
                raise ValueError(f"Invalid auth token format: {str(e)}") from e
        except Exception as e:
            raise ValueError(f"Failed to setup authentication: {str(e)}") from e

    async def _request(
        self,
        method: str,
        endpoint: str,
        data: Optional[Dict[str, Any]] = None,
        params: Optional[Dict[str, Any]] = None,
    ) -> Any:
        """
        Make an authenticated HTTP request and return the decoded JSON body.

        Args:
            method: HTTP method name, e.g. "GET" or "POST".
            endpoint: Path relative to the base URL; a leading "/" is ignored.
            data: Optional JSON-serialisable request body.
            params: Optional query-string parameters.

        Returns:
            The parsed JSON response (a dict or a list, depending on endpoint).

        Raises:
            ValueError: If the server answers with HTTP 401.
            ConnectionError: For any other HTTP error status, transport
                failure, or undecodable response body.
        """
        headers = {
            "Authorization": f"Bearer {self._auth_token}",
            "Content-Type": "application/json",
        }
        url = f"{self._base_url}/{endpoint.lstrip('/')}"

        try:
            response = await self._client.request(
                method,
                url,
                json=data,
                params=params,
                headers=headers,
            )
            response.raise_for_status()
            return response.json()
        except httpx.HTTPStatusError as e:
            if e.response.status_code == 401:
                raise ValueError("Authentication failed") from e
            raise ConnectionError(f"Request failed: {e.response.text}") from e
        except Exception as e:
            raise ConnectionError(f"Request failed: {str(e)}") from e

    async def ingest_document(
        self,
        content: Union[str, bytes],
        content_type: str,
        metadata: Optional[Dict[str, Any]] = None,
        filename: Optional[str] = None,
    ) -> "Document":
        """
        Ingest a document into DataBridge.

        Args:
            content: Document content (string or bytes). Bytes are
                base64-encoded before sending; strings are sent unchanged.
            content_type: MIME type of the content
            metadata: Optional document metadata
            filename: Optional filename

        Returns:
            Document object with metadata
        """
        # The request model carries text only, so binary input is base64-encoded.
        if isinstance(content, bytes):
            encoded_content = base64.b64encode(content).decode("utf-8")
        else:
            encoded_content = content

        request = IngestRequest(
            content=encoded_content,
            content_type=content_type,
            metadata=metadata or {},
            filename=filename,
        )

        response = await self._request("POST", "documents", request.model_dump())
        return Document(**response)

    async def query(
        self,
        query: str,
        return_type: str = "chunks",
        filters: Optional[Dict[str, Any]] = None,
        k: int = 4,
        min_score: float = 0.0,
    ) -> Union[List["ChunkResult"], List["DocumentResult"]]:
        """
        Query documents in DataBridge.

        Args:
            query: Query string
            return_type: Type of results ("chunks" or "documents"). Any value
                other than "chunks" is parsed as document-level results.
            filters: Optional metadata filters
            k: Number of results to return
            min_score: Minimum similarity score threshold

        Returns:
            List of ChunkResult or DocumentResult objects
        """
        request = {
            "query": query,
            "return_type": return_type,
            "filters": filters,
            "k": k,
            "min_score": min_score,
        }

        response = await self._request("POST", "query", request)

        if return_type == "chunks":
            return [ChunkResult(**r) for r in response]
        return [DocumentResult(**r) for r in response]

    async def list_documents(
        self,
        skip: int = 0,
        limit: int = 100,
    ) -> List["Document"]:
        """List accessible documents with pagination"""
        # Let the HTTP client build and encode the query string.
        response = await self._request(
            "GET",
            "documents",
            params={"skip": skip, "limit": limit},
        )
        return [Document(**doc) for doc in response]

    async def get_document(self, document_id: str) -> "Document":
        """Get document by ID"""
        response = await self._request("GET", f"documents/{document_id}")
        return Document(**response)

    async def close(self) -> None:
        """Close the HTTP client"""
        await self._client.aclose()

    async def __aenter__(self) -> "DataBridge":
        """Context manager entry"""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
        """Context manager exit"""
        await self.close()