303 lines
8.8 KiB
Python
Raw Normal View History

2024-12-22 19:46:53 -05:00
from io import BytesIO
import json
2024-12-03 21:46:25 -05:00
from pathlib import Path
2024-12-22 19:46:53 -05:00
from typing import Dict, Any, List, Optional, Union, BinaryIO
from urllib.parse import urlparse
2024-12-17 21:40:38 -05:00
2024-12-22 19:46:53 -05:00
import jwt
import requests
2024-12-17 21:40:38 -05:00
2024-12-22 19:46:53 -05:00
from .models import Document, IngestTextRequest, ChunkResult, DocumentResult
2024-11-18 18:41:23 -05:00
class DataBridge:
    """
    DataBridge client for document operations.

    Args:
        uri (str): DataBridge URI in the format
            "databridge://<owner_id>:<token>@<host>"
        timeout (int, optional): Request timeout in seconds. Defaults to 30.
        is_local (bool, optional): Whether connecting to local development
            server. When True the base URL uses plain ``http`` and certificate
            verification is switched off on the session. Defaults to False.

    Raises:
        ValueError: If the URI does not contain an owner id, a token and a host.
        Exception: Whatever ``jwt.decode`` raises for a token it cannot decode
            is propagated unchanged.

    Examples:
        ```python
        with DataBridge("databridge://owner_id:token@api.databridge.ai") as db:
            # Ingest text
            doc = db.ingest_text(
                "Sample content",
                metadata={"category": "sample"}
            )

            # Query documents
            results = db.query("search query")
        ```
    """

    def __init__(self, uri: str, timeout: int = 30, is_local: bool = False):
        self._timeout = timeout
        # _setup_auth reads _is_local to choose the URL scheme, so set it first.
        self._is_local = is_local
        self._session = requests.Session()
        if is_local:
            self._session.verify = False  # Disable SSL for localhost

        try:
            self._setup_auth(uri)
        except Exception:
            # The caller never receives the instance when construction fails,
            # so release the session here instead of leaking it.
            self._session.close()
            raise

    @staticmethod
    def _parse_uri(uri: str) -> tuple:
        """
        Split a DataBridge URI into its components.

        Args:
            uri: URI in the format "databridge://<owner_id>:<token>@<host>"

        Returns:
            tuple: ``(owner_id, token, host)`` where ``host`` keeps any
            ``:port`` suffix.

        Raises:
            ValueError: If the network location, owner id, token or host is
                missing.
        """
        netloc = urlparse(uri).netloc
        if not netloc:
            raise ValueError("Invalid URI format")

        # Split on the LAST '@' so the host is always the trailing component,
        # and on the FIRST ':' so the remainder of the auth part is the token.
        auth, at_sign, host = netloc.rpartition("@")
        owner_id, colon, token = auth.partition(":")
        if not (at_sign and colon and owner_id and token and host):
            raise ValueError(
                "Invalid URI format, expected "
                '"databridge://<owner_id>:<token>@<host>"'
            )
        return owner_id, token, host

    def _setup_auth(self, uri: str) -> None:
        """Setup authentication from URI"""
        self._owner_id, self._auth_token, host = self._parse_uri(uri)

        # Set base URL
        scheme = "http" if self._is_local else "https"
        self._base_url = f"{scheme}://{host}"

        # Basic token validation: the signature is deliberately NOT verified,
        # so this only rejects tokens that cannot be decoded at all.
        jwt.decode(self._auth_token, options={"verify_signature": False})

    def _request(
        self,
        method: str,
        endpoint: str,
        data: Optional[Dict[str, Any]] = None,
        files: Optional[Dict[str, Any]] = None,
        params: Optional[Dict[str, Any]] = None
    ) -> Dict[str, Any]:
        """
        Make authenticated HTTP request.

        Args:
            method: HTTP method name
            endpoint: Path relative to the base URL (a leading "/" is ignored)
            data: Payload; sent as a JSON body, or as form fields when
                ``files`` is given
            files: Optional multipart file parts
            params: Optional query-string parameters, encoded by ``requests``

        Returns:
            The decoded JSON response body.

        Raises:
            requests.HTTPError: If the response status is 4xx/5xx.
        """
        headers = {"Authorization": f"Bearer {self._auth_token}"}
        if not files:
            # For multipart uploads requests must generate the Content-Type
            # itself (it carries the boundary), so only set it for JSON calls.
            headers["Content-Type"] = "application/json"

        response = self._session.request(
            method,
            f"{self._base_url}/{endpoint.lstrip('/')}",
            params=params,
            json=data if not files else None,
            files=files,
            data=data if files else None,
            headers=headers,
            timeout=self._timeout
        )
        response.raise_for_status()
        return response.json()

    def ingest_text(
        self,
        content: str,
        metadata: Optional[Dict[str, Any]] = None
    ) -> Document:
        """
        Ingest a text document into DataBridge.

        Args:
            content: Text content to ingest
            metadata: Optional metadata dictionary

        Returns:
            Document: Metadata of the ingested document

        Example:
            ```python
            doc = db.ingest_text(
                "Machine learning is fascinating...",
                metadata={
                    "title": "ML Introduction",
                    "category": "tech"
                }
            )
            ```
        """
        request = IngestTextRequest(
            content=content,
            metadata=metadata or {}
        )

        response = self._request(
            "POST",
            "ingest/text",
            request.model_dump()
        )
        return Document(**response)

    def ingest_file(
        self,
        file: Union[str, bytes, BinaryIO, Path],
        filename: str,
        content_type: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None
    ) -> Document:
        """
        Ingest a file document into DataBridge.

        Args:
            file: File to ingest (path string, bytes, file object, or Path)
            filename: Name of the file
            content_type: MIME type (optional, guessed from ``filename`` if not
                provided, falling back to "application/octet-stream")
            metadata: Optional metadata dictionary

        Returns:
            Document: Metadata of the ingested document

        Raises:
            ValueError: If ``file`` is a path that does not exist.

        Example:
            ```python
            # From file path
            doc = db.ingest_file(
                "document.pdf",
                filename="document.pdf",
                content_type="application/pdf",
                metadata={"department": "research"}
            )

            # From file object
            with open("document.pdf", "rb") as f:
                doc = db.ingest_file(f, "document.pdf")
            ```
        """
        import mimetypes

        # Handle different file input types. Only streams created here are
        # closed afterwards; a caller-supplied file object stays open.
        owns_stream = True
        if isinstance(file, (str, Path)):
            file_path = Path(file)
            if not file_path.exists():
                raise ValueError(f"File not found: {file}")
            file_obj = BytesIO(file_path.read_bytes())
        elif isinstance(file, bytes):
            file_obj = BytesIO(file)
        else:
            file_obj = file
            owns_stream = False

        try:
            mime_type = (
                content_type
                or mimetypes.guess_type(filename)[0]
                or "application/octet-stream"
            )

            # Prepare multipart form data
            files = {"file": (filename, file_obj, mime_type)}

            # Metadata travels as a JSON string in a regular form field
            data = {"metadata": json.dumps(metadata or {})}

            response = self._request(
                "POST",
                "ingest/file",
                data=data,
                files=files
            )
            return Document(**response)
        finally:
            if owns_stream:
                file_obj.close()

    def query(
        self,
        query: str,
        return_type: str = "chunks",
        filters: Optional[Dict[str, Any]] = None,
        k: int = 4,
        min_score: float = 0.0
    ) -> Union[List[ChunkResult], List[DocumentResult]]:
        """
        Query documents in DataBridge.

        Args:
            query: Search query text
            return_type: Type of results ("chunks" or "documents")
            filters: Optional metadata filters
            k: Number of results (default: 4)
            min_score: Minimum similarity threshold (default: 0.0)

        Returns:
            List[ChunkResult] when return_type is "chunks", otherwise
            List[DocumentResult]

        Example:
            ```python
            # Query for chunks
            chunks = db.query(
                "What are the key findings?",
                return_type="chunks",
                filters={"department": "research"}
            )

            # Query for documents
            docs = db.query(
                "machine learning",
                return_type="documents",
                k=5
            )
            ```
        """
        request = {
            "query": query,
            "return_type": return_type,
            "filters": filters,
            "k": k,
            "min_score": min_score
        }

        response = self._request("POST", "query", request)

        if return_type == "chunks":
            return [ChunkResult(**r) for r in response]
        return [DocumentResult(**r) for r in response]

    def list_documents(
        self,
        skip: int = 0,
        limit: int = 100,
        filters: Optional[Dict[str, Any]] = None
    ) -> List[Document]:
        """
        List accessible documents.

        Args:
            skip: Number of documents to skip
            limit: Maximum number of documents to return
            filters: Optional filters

        Returns:
            List[Document]: List of accessible documents

        Example:
            ```python
            # Get first page
            docs = db.list_documents(limit=10)

            # Get next page
            next_page = db.list_documents(skip=10, limit=10, filters={"department": "research"})
            ```
        """
        params: Dict[str, Any] = {"skip": skip, "limit": limit}
        if filters is not None:
            # Send real JSON rather than a Python repr, and omit the parameter
            # entirely when there is no filter (instead of the text "None").
            # NOTE(review): assumes the server reads `filters` as a
            # JSON-encoded query parameter - confirm against the API.
            params["filters"] = json.dumps(filters)

        response = self._request("GET", "documents", params=params)
        return [Document(**doc) for doc in response]

    def get_document(self, document_id: str) -> Document:
        """
        Get document metadata by ID.

        Args:
            document_id: ID of the document

        Returns:
            Document: Document metadata

        Example:
            ```python
            doc = db.get_document("doc_123")
            print(f"Title: {doc.metadata.get('title')}")
            ```
        """
        from urllib.parse import quote

        # Percent-encode the id so characters such as "/" or "?" cannot change
        # which path is requested.
        encoded_id = quote(str(document_id), safe="")
        response = self._request("GET", f"documents/{encoded_id}")
        return Document(**response)

    def close(self):
        """Close the HTTP session"""
        self._session.close()

    def __enter__(self) -> "DataBridge":
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Returns None, so an exception raised inside the `with` body propagates.
        self.close()