334 lines
9.5 KiB
Python
Raw Normal View History

import json
import mimetypes
from io import BytesIO
from pathlib import Path
from typing import Dict, Any, List, Optional, Union, BinaryIO
from urllib.parse import quote, urlparse

import httpx
import jwt
from pydantic import BaseModel
class IngestTextRequest(BaseModel):
    """Request model for text ingestion."""

    # Raw text to ingest.
    content: str
    # Arbitrary metadata sent alongside the content. A mutable default is
    # safe here: pydantic copies field defaults for every instance.
    metadata: Dict[str, Any] = {}
class Document(BaseModel):
    """Document metadata model."""

    # Only `external_id` and `content_type` are required; every other field
    # has a default. Mutable defaults are safe on a pydantic model because
    # pydantic copies field defaults for every instance.
    external_id: str
    content_type: str
    filename: Optional[str] = None
    metadata: Dict[str, Any] = {}
    storage_info: Dict[str, str] = {}
    system_metadata: Dict[str, Any] = {}
    access_control: Dict[str, Any] = {}
    chunk_ids: List[str] = []
class ChunkResult(BaseModel):
    """Query result at chunk level."""

    # Text of the matched chunk and its score for the query.
    content: str
    score: float
    # Identifies the chunk: owning document id and chunk number.
    document_id: str
    chunk_number: int
    metadata: Dict[str, Any]
    content_type: str
    # Optional fields; both default to None.
    filename: Optional[str] = None
    download_url: Optional[str] = None
class DocumentResult(BaseModel):
    """Query result at document level."""

    score: float
    document_id: str
    metadata: Dict[str, Any]
    # String-to-string mapping describing the document content.
    # NOTE(review): the expected keys are defined by the server — confirm.
    content: Dict[str, str]
class DataBridge:
    """
    DataBridge client for document operations.

    Args:
        uri (str): DataBridge URI in the format "databridge://<owner_id>:<token>@<host>"
        timeout (int, optional): Request timeout in seconds. Defaults to 30.

    Examples:
        ```python
        async with DataBridge("databridge://owner_id:token@api.databridge.ai") as db:
            # Ingest text
            doc = await db.ingest_text(
                "Sample content",
                metadata={"category": "sample"}
            )

            # Query documents
            results = await db.query("search query")
        ```
    """

    def __init__(self, uri: str, timeout: int = 30):
        self._timeout = timeout
        # Validate the URI before creating the HTTP client so that a bad URI
        # cannot leave an unclosed AsyncClient behind.
        self._setup_auth(uri)
        self._client = httpx.AsyncClient(timeout=timeout)

    def _setup_auth(self, uri: str) -> None:
        """
        Setup authentication from URI.

        Populates ``_owner_id``, ``_auth_token`` and ``_base_url``.

        Args:
            uri: URI in the format "databridge://<owner_id>:<token>@<host>"

        Raises:
            ValueError: If the URI does not match the expected format.
            jwt.exceptions.DecodeError: If the token is not a well-formed JWT.
        """
        parsed = urlparse(uri)
        if not parsed.netloc:
            raise ValueError("Invalid URI format")

        # Split on the LAST "@" and the FIRST ":" so each separator is
        # checked explicitly instead of failing with an opaque unpacking
        # error. The message never echoes the URI: it contains the token.
        auth, at_sign, host = parsed.netloc.rpartition("@")
        owner_id, colon, token = auth.partition(":")
        if not (at_sign and colon and token and host):
            raise ValueError(
                'Invalid URI format, expected "databridge://<owner_id>:<token>@<host>"'
            )

        self._owner_id = owner_id
        self._auth_token = token

        # Plain HTTP is used only for hosts containing "localhost".
        scheme = "http" if "localhost" in host else "https"
        self._base_url = f"{scheme}://{host}"

        # Basic token validation: checks the token is a decodable JWT.
        # The signature is deliberately NOT verified here.
        jwt.decode(self._auth_token, options={"verify_signature": False})

    async def _request(
        self,
        method: str,
        endpoint: str,
        data: Optional[Dict[str, Any]] = None,
        files: Optional[Dict[str, Any]] = None,
        params: Optional[Dict[str, Any]] = None
    ) -> Any:
        """
        Make authenticated HTTP request.

        Args:
            method: HTTP method, e.g. "GET" or "POST"
            endpoint: Path relative to the base URL (leading "/" is optional)
            data: Payload; sent as JSON, or as form fields when `files` is given
            files: Optional multipart file fields
            params: Optional query-string parameters (URL-encoded by httpx)

        Returns:
            The decoded JSON response body (a dict or a list, depending on
            the endpoint).

        Raises:
            httpx.HTTPStatusError: If the response status is 4xx/5xx.
        """
        headers = {"Authorization": f"Bearer {self._auth_token}"}
        if not files:
            # For multipart uploads httpx must generate the Content-Type
            # itself (it carries the boundary), so only set it here.
            headers["Content-Type"] = "application/json"

        response = await self._client.request(
            method,
            f"{self._base_url}/{endpoint.lstrip('/')}",
            json=data if not files else None,
            files=files,
            data=data if files else None,
            params=params,
            headers=headers
        )
        response.raise_for_status()
        return response.json()

    async def ingest_text(
        self,
        content: str,
        metadata: Optional[Dict[str, Any]] = None
    ) -> Document:
        """
        Ingest a text document into DataBridge.

        Args:
            content: Text content to ingest
            metadata: Optional metadata dictionary

        Returns:
            Document: Metadata of the ingested document

        Example:
            ```python
            doc = await db.ingest_text(
                "Machine learning is fascinating...",
                metadata={
                    "title": "ML Introduction",
                    "category": "tech"
                }
            )
            ```
        """
        request = IngestTextRequest(
            content=content,
            metadata=metadata or {}
        )

        response = await self._request(
            "POST",
            "ingest/text",
            request.model_dump()
        )
        return Document(**response)

    async def ingest_file(
        self,
        file: Union[str, bytes, BinaryIO, Path],
        filename: str,
        content_type: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None
    ) -> Document:
        """
        Ingest a file document into DataBridge.

        Args:
            file: File to ingest (path string, bytes, file object, or Path)
            filename: Name of the file
            content_type: MIME type (optional, guessed from `filename` if not
                provided, falling back to "application/octet-stream")
            metadata: Optional metadata dictionary

        Returns:
            Document: Metadata of the ingested document

        Raises:
            ValueError: If `file` is a path that does not exist.

        Example:
            ```python
            # From file path
            doc = await db.ingest_file(
                "document.pdf",
                filename="document.pdf",
                content_type="application/pdf",
                metadata={"department": "research"}
            )

            # From file object
            with open("document.pdf", "rb") as f:
                doc = await db.ingest_file(f, "document.pdf")
            ```
        """
        # Handle different file input types. Buffers created here are owned
        # (and closed) by this method; caller-supplied file objects are not.
        owns_file = True
        if isinstance(file, (str, Path)):
            try:
                file_obj = BytesIO(Path(file).read_bytes())
            except FileNotFoundError as err:
                raise ValueError(f"File not found: {file}") from err
        elif isinstance(file, bytes):
            file_obj = BytesIO(file)
        else:
            file_obj = file
            owns_file = False

        # Guess the MIME type from the filename when none was given.
        resolved_type = (
            content_type
            or mimetypes.guess_type(filename)[0]
            or "application/octet-stream"
        )

        try:
            # Prepare multipart form data
            files = {"file": (filename, file_obj, resolved_type)}

            # Metadata travels as a JSON-encoded form field
            data = {"metadata": json.dumps(metadata or {})}

            response = await self._request(
                "POST",
                "ingest/file",
                data=data,
                files=files
            )
            return Document(**response)
        finally:
            # Close the buffer only if we created it
            if owns_file:
                file_obj.close()

    async def query(
        self,
        query: str,
        return_type: str = "chunks",
        filters: Optional[Dict[str, Any]] = None,
        k: int = 4,
        min_score: float = 0.0
    ) -> Union[List[ChunkResult], List[DocumentResult]]:
        """
        Query documents in DataBridge.

        Args:
            query: Search query text
            return_type: Type of results ("chunks" or "documents")
            filters: Optional metadata filters
            k: Number of results (default: 4)
            min_score: Minimum similarity threshold (default: 0.0)

        Returns:
            List[ChunkResult] or List[DocumentResult] depending on return_type

        Example:
            ```python
            # Query for chunks
            chunks = await db.query(
                "What are the key findings?",
                return_type="chunks",
                filters={"department": "research"}
            )

            # Query for documents
            docs = await db.query(
                "machine learning",
                return_type="documents",
                k=5
            )
            ```
        """
        request = {
            "query": query,
            "return_type": return_type,
            "filters": filters,
            "k": k,
            "min_score": min_score
        }

        response = await self._request("POST", "query", request)

        # Any return_type other than "chunks" is parsed as document results.
        if return_type == "chunks":
            return [ChunkResult(**r) for r in response]
        return [DocumentResult(**r) for r in response]

    async def list_documents(
        self,
        skip: int = 0,
        limit: int = 100,
        filters: Optional[Dict[str, Any]] = None
    ) -> List[Document]:
        """
        List accessible documents.

        Args:
            skip: Number of documents to skip
            limit: Maximum number of documents to return
            filters: Optional filters

        Returns:
            List[Document]: List of accessible documents

        Example:
            ```python
            # Get first page
            docs = await db.list_documents(limit=10)

            # Get next page
            next_page = await db.list_documents(skip=10, limit=10, filters={"department": "research"})
            ```
        """
        # Pass query parameters through httpx so they are URL-encoded.
        params: Dict[str, Any] = {"skip": skip, "limit": limit}
        if filters is not None:
            # Send filters as JSON rather than a Python dict repr, and omit
            # the parameter entirely when no filters were given.
            # NOTE(review): confirm the server decodes `filters` as JSON.
            params["filters"] = json.dumps(filters)

        response = await self._request("GET", "documents", params=params)
        return [Document(**doc) for doc in response]

    async def get_document(self, document_id: str) -> Document:
        """
        Get document metadata by ID.

        Args:
            document_id: ID of the document

        Returns:
            Document: Document metadata

        Example:
            ```python
            doc = await db.get_document("doc_123")
            print(f"Title: {doc.metadata.get('title')}")
            ```
        """
        # Percent-encode the id so characters such as "/" or "?" cannot
        # change which URL path is requested.
        response = await self._request(
            "GET",
            f"documents/{quote(document_id, safe='')}"
        )
        return Document(**response)

    async def close(self):
        """Close the HTTP client"""
        await self._client.aclose()

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.close()