346 lines
10 KiB
Python
Raw Normal View History

2024-12-22 19:46:53 -05:00
from io import BytesIO
import json
2024-12-03 21:46:25 -05:00
from pathlib import Path
2024-12-22 19:46:53 -05:00
from typing import Dict, Any, List, Optional, Union, BinaryIO
from urllib.parse import urlparse
2024-12-17 21:40:38 -05:00
2024-12-22 19:46:53 -05:00
import jwt
import requests
2024-12-17 21:40:38 -05:00
from .models import (
Document,
IngestTextRequest,
ChunkResult,
DocumentResult,
CompletionResponse,
)
2024-11-18 18:41:23 -05:00
class DataBridge:
    """
    DataBridge client for document operations.

    Args:
        uri (str, optional): DataBridge URI in format "databridge://<owner_id>:<token>@<host>".
            If not provided, connects to http://localhost:8000 without authentication.
        timeout (int, optional): Request timeout in seconds. Defaults to 30.
        is_local (bool, optional): Whether connecting to local development server.
            When True, TLS certificate verification is disabled and a URI-derived
            base URL uses ``http`` instead of ``https``. Defaults to False.

    Examples:
        ```python
        # Without authentication
        db = DataBridge()

        # With authentication
        db = DataBridge("databridge://owner_id:token@api.databridge.ai")
        ```
    """

    def __init__(self, uri: Optional[str] = None, timeout: int = 30, is_local: bool = False):
        self._timeout = timeout
        self._session = requests.Session()
        if is_local:
            # Local development servers typically lack a trusted certificate,
            # so certificate verification is switched off for this session.
            self._session.verify = False
        self._is_local = is_local

        if uri:
            self._setup_auth(uri)
        else:
            self._base_url = "http://localhost:8000"
            self._auth_token = None

    def _setup_auth(self, uri: str) -> None:
        """
        Derive the base URL and bearer token from a DataBridge URI.

        Args:
            uri: URI in format "databridge://<owner_id>:<token>@<host>"

        Raises:
            ValueError: If the URI has no network location, no "@" separator,
                no ":" between owner id and token, or an empty host.
            Exception: Whatever ``jwt.decode`` raises when the token is not a
                decodable JWT.
        """
        parsed = urlparse(uri)
        if not parsed.netloc:
            raise ValueError("Invalid URI format")

        # Split on the LAST "@" so that only the trailing component is the host.
        auth, at_sign, host = parsed.netloc.rpartition("@")
        if not at_sign or not host:
            raise ValueError("Invalid URI format: expected '<owner_id>:<token>@<host>'")

        # Split on the FIRST ":"; everything after it is the token.
        _owner_id, colon, token = auth.partition(":")
        if not colon:
            raise ValueError("Invalid URI format: expected '<owner_id>:<token>' before '@'")

        # Basic token validation: only checks that the token decodes as a JWT;
        # the signature is deliberately NOT verified on the client side.
        jwt.decode(token, options={"verify_signature": False})

        self._auth_token = token
        self._base_url = f"{'http' if self._is_local else 'https'}://{host}"

    def _request(
        self,
        method: str,
        endpoint: str,
        data: Optional[Dict[str, Any]] = None,
        files: Optional[Dict[str, Any]] = None,
        params: Optional[Dict[str, Any]] = None,
    ) -> Any:
        """
        Make an HTTP request and return the decoded JSON body.

        Args:
            method: HTTP method (e.g. "GET", "POST")
            endpoint: Path relative to the base URL; leading slashes are ignored
            data: Payload; sent as a JSON body, or as form fields when `files` is given
            files: Optional multipart file mapping passed through to `requests`
            params: Optional query-string parameters (URL-encoded by `requests`)

        Returns:
            The parsed JSON response (an object or an array, depending on the endpoint)

        Raises:
            requests.HTTPError: If the response status indicates an error
        """
        headers = {}
        if self._auth_token:  # Only add auth header if we have a token
            headers["Authorization"] = f"Bearer {self._auth_token}"

        if not files:
            # For multipart uploads `requests` must generate the Content-Type
            # itself (it carries the boundary), so it is only set here otherwise.
            headers["Content-Type"] = "application/json"

        response = self._session.request(
            method,
            f"{self._base_url}/{endpoint.lstrip('/')}",
            json=data if not files else None,
            files=files,
            data=data if files else None,
            params=params,
            headers=headers,
            timeout=self._timeout,
        )
        response.raise_for_status()
        return response.json()

    def ingest_text(self, content: str, metadata: Optional[Dict[str, Any]] = None) -> Document:
        """
        Ingest a text document into DataBridge.

        Args:
            content: Text content to ingest
            metadata: Optional metadata dictionary

        Returns:
            Document: Metadata of the ingested document

        Example:
            ```python
            doc = db.ingest_text(
                "Machine learning is fascinating...",
                metadata={
                    "title": "ML Introduction",
                    "category": "tech"
                }
            )
            ```
        """
        request = IngestTextRequest(content=content, metadata=metadata or {})
        response = self._request("POST", "ingest/text", request.model_dump())
        return Document(**response)

    def ingest_file(
        self,
        file: Union[str, bytes, BinaryIO, Path],
        filename: str,
        content_type: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> Document:
        """
        Ingest a file document into DataBridge.

        Args:
            file: File to ingest (path string, bytes, file object, or Path)
            filename: Name of the file
            content_type: MIME type (optional, defaults to "application/octet-stream")
            metadata: Optional metadata dictionary

        Returns:
            Document: Metadata of the ingested document

        Raises:
            ValueError: If `file` is a path that does not exist

        Example:
            ```python
            # From file path
            doc = db.ingest_file(
                "document.pdf",
                filename="document.pdf",
                content_type="application/pdf",
                metadata={"department": "research"}
            )

            # From file object
            with open("document.pdf", "rb") as f:
                doc = db.ingest_file(f, "document.pdf")
            ```
        """
        if isinstance(file, (str, Path)):
            file_path = Path(file)
            if not file_path.exists():
                raise ValueError(f"File not found: {file}")
            # We opened this handle, so we close it; caller-supplied file
            # objects below are left open for the caller to manage.
            with file_path.open("rb") as handle:
                return self._upload_file(handle, filename, content_type, metadata)

        if isinstance(file, bytes):
            return self._upload_file(BytesIO(file), filename, content_type, metadata)

        return self._upload_file(file, filename, content_type, metadata)

    def _upload_file(
        self,
        file_obj: Any,
        filename: str,
        content_type: Optional[str],
        metadata: Optional[Dict[str, Any]],
    ) -> Document:
        """Send one multipart upload to ``ingest/file`` and wrap the response."""
        # Prepare multipart form data
        files = {"file": (filename, file_obj, content_type or "application/octet-stream")}
        # Metadata travels as a JSON-encoded form field next to the file part
        data = {"metadata": json.dumps(metadata or {})}
        response = self._request("POST", "ingest/file", data=data, files=files)
        return Document(**response)

    def retrieve_chunks(
        self,
        query: str,
        filters: Optional[Dict[str, Any]] = None,
        k: int = 4,
        min_score: float = 0.0,
    ) -> List[ChunkResult]:
        """
        Retrieve relevant chunks.

        Args:
            query: Search query text
            filters: Optional metadata filters
            k: Number of results (default: 4)
            min_score: Minimum similarity threshold (default: 0.0)

        Returns:
            List[ChunkResult]

        Example:
            ```python
            chunks = db.retrieve_chunks(
                "What are the key findings?",
                filters={"department": "research"}
            )
            ```
        """
        request = {"query": query, "filters": filters, "k": k, "min_score": min_score}
        response = self._request("POST", "retrieve/chunks", request)
        return [ChunkResult(**r) for r in response]

    def retrieve_docs(
        self,
        query: str,
        filters: Optional[Dict[str, Any]] = None,
        k: int = 4,
        min_score: float = 0.0,
    ) -> List[DocumentResult]:
        """
        Retrieve relevant documents.

        Args:
            query: Search query text
            filters: Optional metadata filters
            k: Number of results (default: 4)
            min_score: Minimum similarity threshold (default: 0.0)

        Returns:
            List[DocumentResult]

        Example:
            ```python
            docs = db.retrieve_docs(
                "machine learning",
                k=5
            )
            ```
        """
        request = {"query": query, "filters": filters, "k": k, "min_score": min_score}
        response = self._request("POST", "retrieve/docs", request)
        return [DocumentResult(**r) for r in response]

    def query(
        self,
        query: str,
        filters: Optional[Dict[str, Any]] = None,
        k: int = 4,
        min_score: float = 0.0,
        max_tokens: Optional[int] = None,
        temperature: Optional[float] = None,
    ) -> CompletionResponse:
        """
        Generate completion using relevant chunks as context.

        Args:
            query: Query text
            filters: Optional metadata filters
            k: Number of chunks to use as context (default: 4)
            min_score: Minimum similarity threshold (default: 0.0)
            max_tokens: Maximum tokens in completion
            temperature: Model temperature

        Returns:
            CompletionResponse

        Example:
            ```python
            response = db.query(
                "What are the key findings about customer satisfaction?",
                filters={"department": "research"},
                temperature=0.7
            )
            print(response.completion)
            ```
        """
        request = {
            "query": query,
            "filters": filters,
            "k": k,
            "min_score": min_score,
            "max_tokens": max_tokens,
            "temperature": temperature,
        }
        response = self._request("POST", "query", request)
        return CompletionResponse(**response)

    def list_documents(
        self, skip: int = 0, limit: int = 100, filters: Optional[Dict[str, Any]] = None
    ) -> List[Document]:
        """
        List accessible documents.

        Args:
            skip: Number of documents to skip
            limit: Maximum number of documents to return
            filters: Optional filters; sent as a JSON-encoded ``filters`` query
                parameter, and omitted from the request when not provided

        Returns:
            List[Document]: List of accessible documents

        Example:
            ```python
            # Get first page
            docs = db.list_documents(limit=10)

            # Get next page
            next_page = db.list_documents(skip=10, limit=10, filters={"department": "research"})
            ```
        """
        params: Dict[str, Any] = {"skip": skip, "limit": limit}
        if filters is not None:
            # Serialize as JSON rather than interpolating the dict into the URL,
            # which would send its Python repr (or the literal text "None").
            # NOTE(review): assumes the server parses `filters` as JSON - confirm.
            params["filters"] = json.dumps(filters)

        response = self._request("GET", "documents", params=params)
        return [Document(**doc) for doc in response]

    def get_document(self, document_id: str) -> Document:
        """
        Get document metadata by ID.

        Args:
            document_id: ID of the document

        Returns:
            Document: Document metadata

        Example:
            ```python
            doc = db.get_document("doc_123")
            print(f"Title: {doc.metadata.get('title')}")
            ```
        """
        response = self._request("GET", f"documents/{document_id}")
        return Document(**response)

    def close(self):
        """Close the HTTP session"""
        self._session.close()

    def __enter__(self):
        """Enter a ``with`` block; the client itself is the context value."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the HTTP session on leaving the ``with`` block."""
        self.close()