Adityavardhan Agrawal df8d7fcdd0
refactor some stuff (#2)
* refactor some stuff, remove bare try catches
2024-12-15 14:31:25 -05:00

334 lines
9.5 KiB
Python

import json
from typing import Dict, Any, List, Optional, Union, BinaryIO
import httpx
from urllib.parse import urlparse
import jwt
from pydantic import BaseModel
from pathlib import Path
from io import BytesIO
class IngestTextRequest(BaseModel):
    """Payload describing a piece of text to be ingested."""

    # Raw text to ingest (required).
    content: str
    # Arbitrary key/value metadata sent along with the text; empty when omitted.
    metadata: Dict[str, Any] = {}
class Document(BaseModel):
    """Metadata record describing a single document."""

    # --- required fields -------------------------------------------------
    external_id: str
    content_type: str

    # --- optional fields (each falls back to None / an empty container) --
    filename: Optional[str] = None
    metadata: Dict[str, Any] = {}
    storage_info: Dict[str, str] = {}
    system_metadata: Dict[str, Any] = {}
    access_control: Dict[str, Any] = {}
    chunk_ids: List[str] = []
class ChunkResult(BaseModel):
    """One chunk-level hit returned by a query."""

    # --- required fields -------------------------------------------------
    content: str
    score: float  # presumably the similarity score of this chunk -- confirm with the API
    document_id: str
    chunk_number: int
    metadata: Dict[str, Any]
    content_type: str

    # --- optional fields (None when absent from the response) ------------
    filename: Optional[str] = None
    download_url: Optional[str] = None
class DocumentResult(BaseModel):
    """One document-level hit returned by a query."""

    # All four fields are required; none has a default.
    score: float  # presumably the similarity score of this document -- confirm with the API
    document_id: str
    metadata: Dict[str, Any]
    content: Dict[str, str]
class DataBridge:
    """
    DataBridge client for document operations.

    Args:
        uri (str): DataBridge URI in the format "databridge://<owner_id>:<token>@<host>"
        timeout (int, optional): Request timeout in seconds. Defaults to 30.

    Raises:
        ValueError: If the URI does not contain an owner id, a token and a host.
        Exception: PyJWT's own decode error propagates if the token is not a
            well-formed JWT (the signature itself is not verified).

    Examples:
        ```python
        async with DataBridge("databridge://owner_id:token@api.databridge.ai") as db:
            # Ingest text
            doc = await db.ingest_text(
                "Sample content",
                metadata={"category": "sample"}
            )
            # Query documents
            results = await db.query("search query")
        ```
    """

    def __init__(self, uri: str, timeout: int = 30):
        self._timeout = timeout
        # Validate the URI *before* allocating the HTTP client, so that a
        # malformed URI cannot leave an unclosed AsyncClient behind.
        self._setup_auth(uri)
        self._client = httpx.AsyncClient(timeout=timeout)

    def _setup_auth(self, uri: str) -> None:
        """
        Extract owner id, token and base URL from the connection URI.

        Sets ``self._owner_id``, ``self._auth_token`` and ``self._base_url``.

        Raises:
            ValueError: If the URI is not of the form
                "databridge://<owner_id>:<token>@<host>".
        """
        parsed = urlparse(uri)
        if not parsed.netloc:
            raise ValueError("Invalid URI format")

        # Split on the LAST '@' so the host is isolated even when the
        # credentials themselves contain an '@'.
        auth, at_sign, host = parsed.netloc.rpartition("@")
        if not at_sign or not host:
            raise ValueError("Invalid URI format")

        # Split on the FIRST ':' -- everything after it belongs to the token.
        owner_id, colon, token = auth.partition(":")
        if not colon or not owner_id or not token:
            raise ValueError("Invalid URI format")

        # Basic token validation: only checks that the token parses as a JWT;
        # the signature is deliberately not verified on the client side.
        jwt.decode(token, options={"verify_signature": False})

        self._owner_id = owner_id
        self._auth_token = token
        # Plain HTTP is used only for hosts containing "localhost".
        self._base_url = f"{'http' if 'localhost' in host else 'https'}://{host}"

    async def _request(
        self,
        method: str,
        endpoint: str,
        data: Optional[Dict[str, Any]] = None,
        files: Optional[Dict[str, Any]] = None,
        params: Optional[Dict[str, Any]] = None,
    ) -> Any:
        """
        Make an authenticated HTTP request and return the decoded JSON body.

        Args:
            method: HTTP method, e.g. "GET" or "POST".
            endpoint: Path relative to the base URL (a leading "/" is ignored).
            data: Request payload. Sent as a JSON body, or as form fields when
                ``files`` is given.
            files: Multipart file parts, passed through to httpx.
            params: Query-string parameters; httpx URL-encodes them.

        Returns:
            The parsed JSON response (a dict or a list, depending on the endpoint).

        Raises:
            httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
        """
        headers = {"Authorization": f"Bearer {self._auth_token}"}
        if not files:
            # For multipart uploads httpx must generate the Content-Type
            # itself (it carries the boundary), so it is only set here.
            headers["Content-Type"] = "application/json"

        response = await self._client.request(
            method,
            f"{self._base_url}/{endpoint.lstrip('/')}",
            params=params,
            json=data if not files else None,
            files=files,
            data=data if files else None,
            headers=headers,
        )
        response.raise_for_status()
        return response.json()

    async def ingest_text(
        self,
        content: str,
        metadata: Optional[Dict[str, Any]] = None
    ) -> Document:
        """
        Ingest a text document into DataBridge.

        Args:
            content: Text content to ingest
            metadata: Optional metadata dictionary

        Returns:
            Document: Metadata of the ingested document

        Example:
            ```python
            doc = await db.ingest_text(
                "Machine learning is fascinating...",
                metadata={
                    "title": "ML Introduction",
                    "category": "tech"
                }
            )
            ```
        """
        request = IngestTextRequest(
            content=content,
            metadata=metadata or {}
        )
        response = await self._request(
            "POST",
            "ingest/text",
            request.model_dump()
        )
        return Document(**response)

    async def ingest_file(
        self,
        file: Union[str, bytes, BinaryIO, Path],
        filename: str,
        content_type: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None
    ) -> Document:
        """
        Ingest a file document into DataBridge.

        Args:
            file: File to ingest (path string, bytes, file object, or Path).
                A ``str`` is always interpreted as a filesystem path.
            filename: Name of the file
            content_type: MIME type (optional, defaults to
                "application/octet-stream" when not provided)
            metadata: Optional metadata dictionary

        Returns:
            Document: Metadata of the ingested document

        Raises:
            ValueError: If ``file`` is a path that does not exist.

        Example:
            ```python
            # From file path
            doc = await db.ingest_file(
                "document.pdf",
                filename="document.pdf",
                content_type="application/pdf",
                metadata={"department": "research"}
            )
            # From file object
            with open("document.pdf", "rb") as f:
                doc = await db.ingest_file(f, "document.pdf")
            ```
        """
        # Normalise the supported input types to a binary file-like object.
        # Buffers created here are owned (and closed) by this method; a file
        # object supplied by the caller is left open for the caller to close.
        owns_buffer = True
        if isinstance(file, (str, Path)):
            file_path = Path(file)
            if not file_path.exists():
                raise ValueError(f"File not found: {file}")
            file_obj = BytesIO(file_path.read_bytes())
        elif isinstance(file, bytes):
            file_obj = BytesIO(file)
        else:
            file_obj = file
            owns_buffer = False

        try:
            # Prepare multipart form data
            files = {
                "file": (filename, file_obj, content_type or "application/octet-stream")
            }
            # Metadata travels as a JSON-encoded form field next to the file
            data = {"metadata": json.dumps(metadata or {})}
            response = await self._request(
                "POST",
                "ingest/file",
                data=data,
                files=files
            )
            return Document(**response)
        finally:
            if owns_buffer:
                file_obj.close()

    async def query(
        self,
        query: str,
        return_type: str = "chunks",
        filters: Optional[Dict[str, Any]] = None,
        k: int = 4,
        min_score: float = 0.0
    ) -> Union[List[ChunkResult], List[DocumentResult]]:
        """
        Query documents in DataBridge.

        Args:
            query: Search query text
            return_type: Type of results ("chunks" or "documents")
            filters: Optional metadata filters
            k: Number of results (default: 4)
            min_score: Minimum similarity threshold (default: 0.0)

        Returns:
            List[ChunkResult] when return_type is "chunks"; otherwise the
            response is parsed as List[DocumentResult].

        Example:
            ```python
            # Query for chunks
            chunks = await db.query(
                "What are the key findings?",
                return_type="chunks",
                filters={"department": "research"}
            )
            # Query for documents
            docs = await db.query(
                "machine learning",
                return_type="documents",
                k=5
            )
            ```
        """
        request = {
            "query": query,
            "return_type": return_type,
            "filters": filters,
            "k": k,
            "min_score": min_score
        }
        response = await self._request("POST", "query", request)
        if return_type == "chunks":
            return [ChunkResult(**r) for r in response]
        return [DocumentResult(**r) for r in response]

    async def list_documents(
        self,
        skip: int = 0,
        limit: int = 100,
        filters: Optional[Dict[str, Any]] = None
    ) -> List[Document]:
        """
        List accessible documents.

        Args:
            skip: Number of documents to skip
            limit: Maximum number of documents to return
            filters: Optional filters

        Returns:
            List[Document]: List of accessible documents

        Example:
            ```python
            # Get first page
            docs = await db.list_documents(limit=10)
            # Get next page
            next_page = await db.list_documents(skip=10, limit=10, filters={"department": "research"})
            ```
        """
        # Build the query string through httpx so every value is URL-encoded.
        params: Dict[str, Any] = {"skip": skip, "limit": limit}
        if filters is not None:
            # Send the filters as JSON rather than a Python repr, and omit the
            # parameter entirely when no filters were given.
            # NOTE(review): assumes the server parses `filters` as a JSON
            # object -- confirm against the API.
            params["filters"] = json.dumps(filters)

        response = await self._request("GET", "documents", params=params)
        return [Document(**doc) for doc in response]

    async def get_document(self, document_id: str) -> Document:
        """
        Get document metadata by ID.

        Args:
            document_id: ID of the document

        Returns:
            Document: Document metadata

        Example:
            ```python
            doc = await db.get_document("doc_123")
            print(f"Title: {doc.metadata.get('title')}")
            ```
        """
        response = await self._request("GET", f"documents/{document_id}")
        return Document(**response)

    async def close(self):
        """Close the HTTP client"""
        await self._client.aclose()

    async def __aenter__(self):
        """Enter the async context; the client itself is the context value."""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Leave the async context, closing the HTTP client."""
        await self.close()