SDK changes, add a sync client

2025-05-09 19:32:38 +00:00 · 2024-12-22 19:46:53 -05:00 · 2024-12-22 19:46:53 -05:00 · a925dcd7ac
commit a925dcd7ac
parent 58726e1f2f
5 changed files with 381 additions and 69 deletions
--- a/sdks/python/databridge/__init__.py
+++ b/sdks/python/databridge/__init__.py
@ -1,10 +1,15 @@
-from .client import DataBridge
-from .exceptions import DataBridgeError, AuthenticationError
-
-__version__ = "0.1.4"
+"""
+DataBridge Python SDK for document ingestion and querying.
+"""

+from .sync import DataBridge
+from .async_ import AsyncDataBridge
+from .models import Document, IngestTextRequest
 __all__ = [
    "DataBridge",
-    "DataBridgeError",
-    "AuthenticationError",
+    "AsyncDataBridge",
+    "Document",
+    "IngestTextRequest",
 ]
+
+__version__ = "0.1.5"
--- a/sdks/python/databridge/async_.py
+++ b/sdks/python/databridge/async_.py
@ -1,77 +1,27 @@
-import json
-from typing import Dict, Any, List, Literal, Optional, Union, BinaryIO
-import httpx
-from urllib.parse import urlparse
-import jwt
-from pydantic import BaseModel, Field, field_validator
-from pathlib import Path
 from io import BytesIO
+import json
+from pathlib import Path
+from typing import Dict, Any, List, Optional, Union, BinaryIO
+from urllib.parse import urlparse
+
+import httpx
+import jwt
+
+from .models import Document, IngestTextRequest, ChunkResult, DocumentResult


-class IngestTextRequest(BaseModel):
-    """Request model for text ingestion"""
-    content: str
-    metadata: Dict[str, Any] = {}
-
-
-class Document(BaseModel):
-    """Document metadata model"""
-    external_id: str
-    content_type: str
-    filename: Optional[str] = None
-    metadata: Dict[str, Any] = {}
-    storage_info: Dict[str, str] = {}
-    system_metadata: Dict[str, Any] = {}
-    access_control: Dict[str, Any] = {}
-    chunk_ids: List[str] = []
-
-
-class ChunkResult(BaseModel):
-    """Query result at chunk level"""
-    content: str
-    score: float
-    document_id: str
-    chunk_number: int
-    metadata: Dict[str, Any]
-    content_type: str
-    filename: Optional[str] = None
-    download_url: Optional[str] = None
-
-
-class DocumentContent(BaseModel):
-    """Represents either a URL or content string"""
-    type: Literal["url", "string"]
-    value: str
-    filename: Optional[str] = Field(None, description="Filename when type is url")
-
-    @field_validator('filename')
-    def filename_only_for_url(cls, v, values):
-        if values.data.get('type') == 'string' and v is not None:
-            raise ValueError('filename can only be set when type is url')
-        if values.data.get('type') == 'url' and v is None:
-            raise ValueError('filename is required when type is url')
-        return v
-
-
-class DocumentResult(BaseModel):
-    """Query result at document level"""
-    score: float
-    document_id: str
-    metadata: Dict[str, Any]
-    content: DocumentContent
-
-
-class DataBridge:
+class AsyncDataBridge:
    """
    DataBridge client for document operations.
    
    Args:
        uri (str): DataBridge URI in the format "databridge://<owner_id>:<token>@<host>"
        timeout (int, optional): Request timeout in seconds. Defaults to 30.
+        is_local (bool, optional): Whether to connect to a local server. Defaults to False.
    
    Examples:
        ```python
-        async with DataBridge("databridge://owner_id:token@api.databridge.ai") as db:
+        async with AsyncDataBridge("databridge://owner_id:token@api.databridge.ai") as db:
            # Ingest text
            doc = await db.ingest_text(
                "Sample content",
--- a/sdks/python/databridge/models.py
+++ b/sdks/python/databridge/models.py
@ -0,0 +1,55 @@
+from typing import Dict, Any, List, Literal, Optional
+from pydantic import BaseModel, Field, field_validator
+
+
+class Document(BaseModel):
+    """Document metadata model"""
+    # Only external_id and content_type are required (Field(...)). Every
+    # other field falls back to None or to a fresh empty container built by
+    # default_factory, so instances never share a mutable default.
+    external_id: str = Field(..., description="Unique document identifier")
+    content_type: str = Field(..., description="Content type of the document")
+    filename: Optional[str] = Field(None, description="Original filename if available")
+    metadata: Dict[str, Any] = Field(default_factory=dict, description="User-defined metadata")
+    storage_info: Dict[str, str] = Field(default_factory=dict, description="Storage-related information")
+    system_metadata: Dict[str, Any] = Field(default_factory=dict, description="System-managed metadata")
+    access_control: Dict[str, Any] = Field(default_factory=dict, description="Access control information")
+    chunk_ids: List[str] = Field(default_factory=list, description="IDs of document chunks")
+
+
+class IngestTextRequest(BaseModel):
+    """Request model for text ingestion"""
+    # content is required; metadata defaults to a new empty dict per instance.
+    content: str = Field(..., description="Text content to ingest")
+    metadata: Dict[str, Any] = Field(default_factory=dict, description="Optional metadata")
+
+
+class ChunkResult(BaseModel):
+    """Query result at chunk level"""
+    # content, score, document_id, chunk_number and content_type are required;
+    # metadata defaults to an empty dict and filename/download_url to None.
+    content: str = Field(..., description="Chunk content")
+    score: float = Field(..., description="Relevance score")
+    document_id: str = Field(..., description="Parent document ID")
+    chunk_number: int = Field(..., description="Chunk sequence number")
+    metadata: Dict[str, Any] = Field(default_factory=dict, description="Document metadata")
+    content_type: str = Field(..., description="Content type")
+    filename: Optional[str] = Field(None, description="Original filename")
+    download_url: Optional[str] = Field(None, description="URL to download full document")
+
+
+class DocumentContent(BaseModel):
+    """Represents either a URL or content string"""
+    type: Literal["url", "string"] = Field(..., description="Content type (url or string)")
+    value: str = Field(..., description="The actual content or URL")
+    # validate_default=True is required for the rule below to be enforced:
+    # pydantic does not run field validators on default values, so without it
+    # a "url" payload that omits ``filename`` would be accepted silently.
+    filename: Optional[str] = Field(
+        None,
+        description="Filename when type is url",
+        validate_default=True,
+    )
+
+    @field_validator('filename')
+    @classmethod
+    def filename_only_for_url(cls, v, info):
+        """
+        Enforce that ``filename`` is present exactly when ``type`` is "url".
+
+        Args:
+            v: The candidate ``filename`` value (may be None).
+            info: Validation info; ``info.data`` holds the fields validated
+                so far. ``type`` is declared before ``filename``, so it is
+                available here unless it failed its own validation.
+
+        Returns:
+            The unchanged ``filename`` value.
+
+        Raises:
+            ValueError: If a filename is given for a "string" payload, or is
+                missing for a "url" payload.
+        """
+        content_kind = info.data.get('type')
+        if content_kind == 'string' and v is not None:
+            raise ValueError('filename can only be set when type is url')
+        if content_kind == 'url' and v is None:
+            raise ValueError('filename is required when type is url')
+        return v
+
+
+class DocumentResult(BaseModel):
+    """Query result at document level"""
+    # score, document_id and content are required; metadata defaults to an
+    # empty dict. content is a nested DocumentContent model.
+    score: float = Field(..., description="Relevance score")
+    document_id: str = Field(..., description="Document ID")
+    metadata: Dict[str, Any] = Field(default_factory=dict, description="Document metadata")
+    content: DocumentContent = Field(..., description="Document content or URL")
--- a/sdks/python/databridge/sync.py
+++ b/sdks/python/databridge/sync.py
@ -0,0 +1,302 @@
+from io import BytesIO
+import json
+from pathlib import Path
+from typing import Dict, Any, List, Optional, Union, BinaryIO
+from urllib.parse import urlparse
+
+import jwt
+import requests
+
+from .models import Document, IngestTextRequest, ChunkResult, DocumentResult
+
+
+class DataBridge:
+    """
+    Synchronous DataBridge client for document operations.
+
+    All requests go through a single ``requests.Session`` that is closed by
+    ``close()`` or on leaving the ``with`` block.
+
+    Args:
+        uri (str): DataBridge URI in the format "databridge://<owner_id>:<token>@<host>"
+        timeout (int, optional): Request timeout in seconds. Defaults to 30.
+        is_local (bool, optional): Whether connecting to local development server.
+            When True the base URL uses ``http`` and certificate verification
+            is disabled on the session. Defaults to False.
+
+    Raises:
+        ValueError: If the URI does not match the expected format.
+
+    Examples:
+        ```python
+        with DataBridge("databridge://owner_id:token@api.databridge.ai") as db:
+            # Ingest text
+            doc = db.ingest_text(
+                "Sample content",
+                metadata={"category": "sample"}
+            )
+
+            # Query documents
+            results = db.query("search query")
+        ```
+    """
+
+    def __init__(self, uri: str, timeout: int = 30, is_local: bool = False):
+        self._timeout = timeout
+        self._session = requests.Session()
+        if is_local:
+            self._session.verify = False  # Disable SSL for localhost
+        # _setup_auth reads _is_local to pick the URL scheme, so set it first.
+        self._is_local = is_local
+        try:
+            self._setup_auth(uri)
+        except Exception:
+            # Do not leak the session when the URI or token is rejected.
+            self._session.close()
+            raise
+
+    def _setup_auth(self, uri: str) -> None:
+        """
+        Parse the connection URI into owner id, token and base URL.
+
+        Args:
+            uri: URI in the format "databridge://<owner_id>:<token>@<host>"
+
+        Raises:
+            ValueError: If the URI has no network location, or is missing the
+                "<owner_id>:<token>@<host>" structure.
+        """
+        parsed = urlparse(uri)
+        if not parsed.netloc:
+            raise ValueError("Invalid URI format")
+
+        # Split host and auth parts. The host is whatever follows the last
+        # '@'; the token is whatever follows the first ':' of the auth part.
+        auth, at_sign, host = parsed.netloc.rpartition('@')
+        owner_id, colon, auth_token = auth.partition(':')
+        if not (at_sign and colon and host):
+            # Deliberately does not echo the URI: it contains the token.
+            raise ValueError(
+                "Invalid URI format: expected databridge://<owner_id>:<token>@<host>"
+            )
+        self._owner_id, self._auth_token = owner_id, auth_token
+
+        # Set base URL
+        self._base_url = f"{'http' if self._is_local else 'https'}://{host}"
+
+        # Basic token validation: only checks that the token decodes as a
+        # JWT; the signature is not verified on the client.
+        jwt.decode(self._auth_token, options={"verify_signature": False})
+
+    def _request(
+        self,
+        method: str,
+        endpoint: str,
+        data: Optional[Dict[str, Any]] = None,
+        files: Optional[Dict[str, Any]] = None,
+        params: Optional[Dict[str, Any]] = None
+    ) -> Any:
+        """
+        Make an authenticated HTTP request and return the decoded JSON body.
+
+        Args:
+            method: HTTP method name, e.g. "GET" or "POST"
+            endpoint: Path relative to the base URL (leading "/" is ignored)
+            data: Payload; sent as JSON, or as form fields when ``files`` is given
+            files: Multipart file parts, if any
+            params: Query-string parameters; encoded by ``requests``
+
+        Returns:
+            The parsed JSON response (a dict or a list, depending on endpoint)
+
+        Raises:
+            requests.HTTPError: If the server answers with an error status.
+        """
+        headers = {"Authorization": f"Bearer {self._auth_token}"}
+
+        # For multipart uploads requests must generate the Content-Type
+        # itself (it carries the boundary), so only set it for JSON calls.
+        if not files:
+            headers["Content-Type"] = "application/json"
+
+        response = self._session.request(
+            method,
+            f"{self._base_url}/{endpoint.lstrip('/')}",
+            json=data if not files else None,
+            files=files,
+            data=data if files else None,
+            params=params,
+            headers=headers,
+            timeout=self._timeout
+        )
+        response.raise_for_status()
+        return response.json()
+
+    def ingest_text(
+        self,
+        content: str,
+        metadata: Optional[Dict[str, Any]] = None
+    ) -> Document:
+        """
+        Ingest a text document into DataBridge.
+
+        Args:
+            content: Text content to ingest
+            metadata: Optional metadata dictionary
+
+        Returns:
+            Document: Metadata of the ingested document
+
+        Example:
+            ```python
+            doc = db.ingest_text(
+                "Machine learning is fascinating...",
+                metadata={
+                    "title": "ML Introduction",
+                    "category": "tech"
+                }
+            )
+            ```
+        """
+        request = IngestTextRequest(
+            content=content,
+            metadata=metadata or {}
+        )
+
+        response = self._request(
+            "POST",
+            "ingest/text",
+            request.model_dump()
+        )
+        return Document(**response)
+
+    def ingest_file(
+        self,
+        file: Union[str, bytes, BinaryIO, Path],
+        filename: str,
+        content_type: Optional[str] = None,
+        metadata: Optional[Dict[str, Any]] = None
+    ) -> Document:
+        """
+        Ingest a file document into DataBridge.
+
+        Args:
+            file: File to ingest (path string, bytes, file object, or Path)
+            filename: Name of the file
+            content_type: MIME type (optional, sent as
+                "application/octet-stream" if not provided)
+            metadata: Optional metadata dictionary
+
+        Returns:
+            Document: Metadata of the ingested document
+
+        Raises:
+            ValueError: If ``file`` is a path that does not exist.
+
+        Example:
+            ```python
+            # From file path
+            doc = db.ingest_file(
+                "document.pdf",
+                filename="document.pdf",
+                content_type="application/pdf",
+                metadata={"department": "research"}
+            )
+
+            # From file object
+            with open("document.pdf", "rb") as f:
+                doc = db.ingest_file(f, "document.pdf")
+            ```
+        """
+        # Handle different file input types. Paths are read fully into
+        # memory so the on-disk handle is released before the upload starts.
+        opened_here = isinstance(file, (str, Path))
+        if opened_here:
+            file_path = Path(file)
+            if not file_path.exists():
+                raise ValueError(f"File not found: {file}")
+            file_obj = BytesIO(file_path.read_bytes())
+        elif isinstance(file, bytes):
+            file_obj = BytesIO(file)
+        else:
+            # Caller-supplied file object: used as-is and left open.
+            file_obj = file
+
+        try:
+            # Prepare multipart form data
+            files = {
+                "file": (filename, file_obj, content_type or "application/octet-stream")
+            }
+
+            # Metadata travels as a JSON string in a regular form field
+            data = {"metadata": json.dumps(metadata or {})}
+
+            response = self._request(
+                "POST",
+                "ingest/file",
+                data=data,
+                files=files
+            )
+            return Document(**response)
+        finally:
+            # Close the buffer only if we created it from a path
+            if opened_here:
+                file_obj.close()
+
+    def query(
+        self,
+        query: str,
+        return_type: str = "chunks",
+        filters: Optional[Dict[str, Any]] = None,
+        k: int = 4,
+        min_score: float = 0.0
+    ) -> Union[List[ChunkResult], List[DocumentResult]]:
+        """
+        Query documents in DataBridge.
+
+        Args:
+            query: Search query text
+            return_type: Type of results ("chunks" or "documents"); any value
+                other than "chunks" is parsed as document results
+            filters: Optional metadata filters
+            k: Number of results (default: 4)
+            min_score: Minimum similarity threshold (default: 0.0)
+
+        Returns:
+            List[ChunkResult] or List[DocumentResult] depending on return_type
+
+        Example:
+            ```python
+            # Query for chunks
+            chunks = db.query(
+                "What are the key findings?",
+                return_type="chunks",
+                filters={"department": "research"}
+            )
+
+            # Query for documents
+            docs = db.query(
+                "machine learning",
+                return_type="documents",
+                k=5
+            )
+            ```
+        """
+        request = {
+            "query": query,
+            "return_type": return_type,
+            "filters": filters,
+            "k": k,
+            "min_score": min_score
+        }
+
+        response = self._request("POST", "query", request)
+
+        if return_type == "chunks":
+            return [ChunkResult(**r) for r in response]
+        return [DocumentResult(**r) for r in response]
+
+    def list_documents(
+        self,
+        skip: int = 0,
+        limit: int = 100,
+        filters: Optional[Dict[str, Any]] = None
+    ) -> List[Document]:
+        """
+        List accessible documents.
+
+        Args:
+            skip: Number of documents to skip
+            limit: Maximum number of documents to return
+            filters: Optional filters; sent JSON-encoded in the ``filters``
+                query parameter and omitted entirely when None
+
+        Returns:
+            List[Document]: List of accessible documents
+
+        Example:
+            ```python
+            # Get first page
+            docs = db.list_documents(limit=10)
+
+            # Get next page
+            next_page = db.list_documents(skip=10, limit=10, filters={"department": "research"})
+            ```
+        """
+        query_params: Dict[str, Any] = {"skip": skip, "limit": limit}
+        if filters is not None:
+            # Interpolating the dict into the URL would send its Python repr
+            # (single quotes, unescaped); JSON + URL-encoding is parseable.
+            # NOTE(review): confirm the server reads `filters` as JSON text.
+            query_params["filters"] = json.dumps(filters)
+
+        response = self._request("GET", "documents", params=query_params)
+        return [Document(**doc) for doc in response]
+
+    def get_document(self, document_id: str) -> Document:
+        """
+        Get document metadata by ID.
+
+        Args:
+            document_id: ID of the document
+
+        Returns:
+            Document: Document metadata
+
+        Example:
+            ```python
+            doc = db.get_document("doc_123")
+            print(f"Title: {doc.metadata.get('title')}")
+            ```
+        """
+        response = self._request("GET", f"documents/{document_id}")
+        return Document(**response)
+
+    def close(self) -> None:
+        """Close the HTTP session"""
+        self._session.close()
+
+    def __enter__(self):
+        """Return the client itself for use in a ``with`` block"""
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+        """Close the session on leaving the ``with`` block; exceptions propagate"""
+        self.close()
--- a/sdks/python/pyproject.toml
+++ b/sdks/python/pyproject.toml
@ -4,7 +4,7 @@ build-backend = "hatchling.build"

 [project]
 name = "databridge-client"
-version = "0.1.4"
+version = "0.1.5"
 authors = [
    { name = "DataBridge", email = "databridgesuperuser@gmail.com" },
 ]