from typing import Dict, Any, List, Optional, Union
from urllib.parse import urlparse
import base64
import logging

import httpx
import jwt
from pydantic import BaseModel

logger = logging.getLogger(__name__)


class IngestRequest(BaseModel):
    """Structure for document ingestion"""
    content: str
    content_type: str
    metadata: Dict[str, Any] = {}
    filename: Optional[str] = None


class Document(BaseModel):
    """Document metadata model"""
    external_id: str
    content_type: str
    filename: Optional[str] = None
    metadata: Dict[str, Any] = {}
    storage_info: Dict[str, str] = {}
    system_metadata: Dict[str, Any] = {}
    access_control: Dict[str, Any] = {}
    chunk_ids: List[str] = []


class ChunkResult(BaseModel):
    """Query result at chunk level"""
    content: str
    score: float
    document_id: str
    chunk_number: int
    metadata: Dict[str, Any]
    content_type: str
    filename: Optional[str] = None
    download_url: Optional[str] = None


class DocumentResult(BaseModel):
    """Query result at document level"""
    score: float
    document_id: str
    metadata: Dict[str, Any]
    content: Dict[str, str]


class DataBridge:
    """
    DataBridge client for document operations.

    Usage:
        async with DataBridge("databridge://owner123:token@databridge.local") as db:
            doc = await db.ingest_document("content", content_type="text/plain")
            results = await db.query("What is...")
    """

    def __init__(self, uri: str, timeout: int = 30):
        self._timeout = timeout
        self._client = httpx.AsyncClient(timeout=timeout)
        self._setup_auth(uri)

    def _setup_auth(self, uri: str) -> None:
        """Setup authentication from URI"""
        try:
            parsed = urlparse(uri)
            if not parsed.netloc:
                raise ValueError("Invalid URI format")

            split_uri = parsed.netloc.split('@')
            self._base_url = (
                f"{'http' if 'localhost' in split_uri[1] else 'https'}"
                f"://{split_uri[1]}"
            )

            auth_parts = split_uri[0].split(':')
            if len(auth_parts) != 2:
                raise ValueError("URI must include owner_id and auth_token")

            self._owner_id = auth_parts[0]
            self._auth_token = auth_parts[1]

            # Basic token validation
            try:
                jwt.decode(self._auth_token, options={"verify_signature": False})
            except jwt.InvalidTokenError as e:
                raise ValueError(f"Invalid auth token format: {str(e)}")

        except Exception as e:
            raise ValueError(f"Failed to setup authentication: {str(e)}")

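    # Illustrative note (example values taken from the class docstring): a URI
    # like "databridge://owner123:token@databridge.local" is split into
    # owner_id "owner123", auth_token "token", and base URL
    # "https://databridge.local" (plain "http" is used only when the host part
    # contains "localhost").
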
    async def _request(
        self,
        method: str,
        endpoint: str,
        data: Optional[Dict[str, Any]] = None
    ) -> Dict[str, Any]:
        """Make authenticated HTTP request"""
        headers = {
            "Authorization": f"Bearer {self._auth_token}",
            "Content-Type": "application/json"
        }

        try:
            response = await self._client.request(
                method,
                f"{self._base_url}/{endpoint.lstrip('/')}",
                json=data,
                headers=headers
            )
            response.raise_for_status()
            return response.json()

        except httpx.HTTPStatusError as e:
            if e.response.status_code == 401:
                raise ValueError("Authentication failed")
            raise ConnectionError(f"Request failed: {e.response.text}")

        except Exception as e:
            raise ConnectionError(f"Request failed: {str(e)}")

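    # Design note: query parameters (see list_documents) are embedded directly
    # in the endpoint string rather than passed via httpx's `params=` argument,
    # so every call goes through this single JSON-oriented request path.
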
    async def ingest_document(
        self,
        content: Union[str, bytes],
        content_type: str,
        metadata: Optional[Dict[str, Any]] = None,
        filename: Optional[str] = None
    ) -> Document:
        """
        Ingest a document into DataBridge.

        Args:
            content: Document content (string or bytes)
            content_type: MIME type of the content
            metadata: Optional document metadata
            filename: Optional filename

        Returns:
            Document object with metadata
        """
        # Handle content encoding
        if isinstance(content, bytes):
            encoded_content = base64.b64encode(content).decode('utf-8')
        else:
            encoded_content = content

        request = IngestRequest(
            content=encoded_content,
            content_type=content_type,
            metadata=metadata or {},
            filename=filename
        )

        response = await self._request("POST", "documents", request.model_dump())
        return Document(**response)

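    # Illustrative sketch (hypothetical filename): binary content such as a PDF
    # is accepted as bytes and base64-encoded before upload, while plain strings
    # are sent as-is:
    #
    #     with open("report.pdf", "rb") as f:
    #         doc = await db.ingest_document(
    #             f.read(),
    #             content_type="application/pdf",
    #             filename="report.pdf",
    #         )
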
    async def query(
        self,
        query: str,
        return_type: str = "chunks",
        filters: Optional[Dict[str, Any]] = None,
        k: int = 4,
        min_score: float = 0.0
    ) -> Union[List[ChunkResult], List[DocumentResult]]:
        """
        Query documents in DataBridge.

        Args:
            query: Query string
            return_type: Type of results ("chunks" or "documents")
            filters: Optional metadata filters
            k: Number of results to return
            min_score: Minimum similarity score threshold

        Returns:
            List of ChunkResult or DocumentResult objects
        """
        request = {
            "query": query,
            "return_type": return_type,
            "filters": filters,
            "k": k,
            "min_score": min_score
        }

        response = await self._request("POST", "query", request)

        if return_type == "chunks":
            return [ChunkResult(**r) for r in response]
        return [DocumentResult(**r) for r in response]

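    # Illustrative sketch (the filter key is hypothetical and depends on the
    # metadata supplied at ingestion time):
    #
    #     chunks = await db.query(
    #         "What is DataBridge?",
    #         return_type="chunks",
    #         filters={"department": "engineering"},
    #         k=10,
    #         min_score=0.5,
    #     )
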
    async def list_documents(
        self,
        skip: int = 0,
        limit: int = 100
    ) -> List[Document]:
        """List accessible documents with pagination"""
        response = await self._request(
            "GET",
            f"documents?skip={skip}&limit={limit}"
        )
        return [Document(**doc) for doc in response]

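    # Illustrative sketch: paging through all accessible documents in batches:
    #
    #     skip = 0
    #     while True:
    #         page = await db.list_documents(skip=skip, limit=100)
    #         if not page:
    #             break
    #         skip += len(page)
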
    async def get_document(self, document_id: str) -> Document:
        """Get document by ID"""
        response = await self._request("GET", f"documents/{document_id}")
        return Document(**response)

    async def close(self):
        """Close the HTTP client"""
        await self._client.aclose()

    async def __aenter__(self):
        """Context manager entry"""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit"""
        await self.close()
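

# Minimal usage sketch. The URI below is the placeholder from the class
# docstring; a running DataBridge server and valid credentials are assumed.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        async with DataBridge("databridge://owner123:token@databridge.local") as db:
            # Ingest a small text document and print its server-assigned ID.
            doc = await db.ingest_document(
                "DataBridge stores and queries documents.",
                content_type="text/plain",
                metadata={"source": "demo"},
            )
            print(f"Ingested document {doc.external_id}")

            # Default return_type is "chunks", so results are ChunkResult objects.
            results = await db.query("What does DataBridge do?", k=2)
            for chunk in results:
                print(f"{chunk.score:.3f}: {chunk.content[:80]}")

    asyncio.run(_demo())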