Mirror of https://github.com/james-m-jordan/morphik-core.git (synced 2025-05-09 19:32:38 +00:00)

refactor some stuff (#2)

* refactor some stuff, remove bare try catches

This commit is contained in:
parent 0f492ed7c0
commit df8d7fcdd0

core/api.py (32 changed lines)
@@ -1,6 +1,6 @@
 import json
 from datetime import datetime, UTC
-from typing import List, Union
+from typing import Any, Dict, List, Optional, Union
 from fastapi import (
     FastAPI,
     Form,
@@ -16,10 +16,9 @@ from core.models.request import IngestTextRequest, QueryRequest
 from core.models.documents import (
     Document,
     DocumentResult,
-    ChunkResult,
-    EntityType
+    ChunkResult
 )
-from core.models.auth import AuthContext
+from core.models.auth import AuthContext, EntityType
 from core.services.document_service import DocumentService
 from core.config import get_settings
 from core.database.mongo_database import MongoDatabase
@@ -127,8 +126,6 @@ async def ingest_text(
         return await document_service.ingest_text(request, auth)
     except PermissionError as e:
         raise HTTPException(status_code=403, detail=str(e))
-    except Exception as e:
-        raise HTTPException(status_code=400, detail=str(e))


 @app.post("/ingest/file", response_model=Document)
@@ -141,13 +138,11 @@ async def ingest_file(
     try:
         metadata_dict = json.loads(metadata)
         doc = await document_service.ingest_file(file, metadata_dict, auth)
-        return doc  # Should just send a success response, not sure why we're sending a document #TODO: discuss with bhau
+        return doc  # TODO: Might be lighter on network to just send the document ID.
     except PermissionError as e:
         raise HTTPException(status_code=403, detail=str(e))
     except json.JSONDecodeError:
         raise HTTPException(400, "Invalid metadata JSON")
-    except Exception as e:
-        raise HTTPException(status_code=400, detail=str(e))


 @app.post("/query", response_model=Union[List[ChunkResult], List[DocumentResult]])
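For orientation, a quick way to exercise the `/ingest/file` route above from the outside. This is a sketch, not part of the commit: the host/port come from the local example URI elsewhere in this diff, the bearer-token header is an assumption about how `verify_token` expects credentials, and the response field relies on the default `Document` serialization (which does expose `external_id`). The `metadata` form field must be a JSON string, since the handler parses it with `json.loads`.

```python
# Sketch only: calling POST /ingest/file as refactored above.
import json

import httpx

with open("sample.pdf", "rb") as f:
    response = httpx.post(
        "http://127.0.0.1:8000/ingest/file",
        headers={"Authorization": "Bearer <token>"},  # auth scheme assumed
        files={"file": ("sample.pdf", f, "application/pdf")},
        data={"metadata": json.dumps({"category": "technical"})},
    )

response.raise_for_status()
print(response.json()["external_id"])
```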
@@ -156,24 +151,18 @@ async def query_documents(
     auth: AuthContext = Depends(verify_token)
 ):
     """Query documents with specified return type."""
-    try:
-        return await document_service.query(request, auth)
-    except Exception as e:
-        logger.error(f"Query failed: {str(e)}")
-        raise HTTPException(status_code=400, detail=str(e))
+    return await document_service.query(request, auth)


 @app.get("/documents", response_model=List[Document])
 async def list_documents(
     auth: AuthContext = Depends(verify_token),
     skip: int = 0,
-    limit: int = 100
+    limit: int = 100,
+    filters: Optional[Dict[str, Any]] = None
 ):
     """List accessible documents."""
-    try:
-        return await document_service.db.get_documents(auth, skip, limit)
-    except Exception as e:
-        raise HTTPException(status_code=400, detail=str(e))
+    return await document_service.db.get_documents(auth, skip, limit, filters)


 @app.get("/documents/{document_id}", response_model=Document)
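With the per-route try/except gone from `query_documents` and `list_documents`, unexpected errors now propagate out of the handlers. A minimal sketch of how the old 403/400 behaviour could be recovered centrally with FastAPI exception handlers, if that is ever wanted; the handler names and messages are illustrative and not part of this commit:

```python
import logging

from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse

logger = logging.getLogger(__name__)
app = FastAPI()

# Translate the domain errors the routes used to handle by hand into HTTP codes.
@app.exception_handler(PermissionError)
async def permission_error_handler(request: Request, exc: PermissionError):
    return JSONResponse(status_code=403, content={"detail": str(exc)})

@app.exception_handler(ValueError)
async def value_error_handler(request: Request, exc: ValueError):
    # e.g. "No content chunks extracted from text" raised during ingestion
    logger.error("Request to %s failed: %s", request.url.path, exc)
    return JSONResponse(status_code=400, content={"detail": str(exc)})
```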
@@ -189,6 +178,5 @@ async def get_document(
             raise HTTPException(status_code=404, detail="Document not found")
         return doc
-    except HTTPException as e:
-        raise e  # Return the HTTPException as is
     except Exception as e:
-        raise HTTPException(status_code=400, detail=str(e))
+        logger.error(f"Error getting document: {e}")
+        raise e
@@ -62,17 +62,14 @@ class BaseDatabase(ABC):
         Returns: Success status
         """
         pass

     @abstractmethod
-    async def find_documents(
+    async def find_authorized_and_filtered_documents(
         self,
         auth: AuthContext,
         filters: Optional[Dict[str, Any]] = None
     ) -> List[str]:
-        """
-        Find document IDs matching filters that user has access to.
-        Returns: List of document IDs
-        """
+        """Find document IDs matching filters that user has access to."""
         pass

     @abstractmethod
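The renamed hook bundles two concerns, authorization and metadata filtering, into one call. A hypothetical in-memory subclass might satisfy it as sketched below; this is illustration only (the document-dict shape is assumed, the other abstract methods are omitted, and the real implementation is the MongoDatabase override in the next hunk):

```python
from typing import Any, Dict, List, Optional

class InMemoryDatabase(BaseDatabase):  # hypothetical, for illustration only
    def __init__(self) -> None:
        # doc_id -> {"metadata": {...}, "access_control": {"readers": [...]}}
        self._docs: Dict[str, Dict[str, Any]] = {}

    async def find_authorized_and_filtered_documents(
        self,
        auth: AuthContext,
        filters: Optional[Dict[str, Any]] = None,
    ) -> List[str]:
        """Find document IDs matching filters that user has access to."""
        filters = filters or {}
        return [
            doc_id
            for doc_id, doc in self._docs.items()
            if auth.entity_id in doc["access_control"]["readers"]
            and all(doc["metadata"].get(k) == v for k, v in filters.items())
        ]
```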
@@ -152,7 +152,7 @@ class MongoDatabase(BaseDatabase):
             logger.error(f"Error deleting document: {str(e)}")
             return False

-    async def find_documents(
+    async def find_authorized_and_filtered_documents(
         self,
         auth: AuthContext,
         filters: Optional[Dict[str, Any]] = None
@@ -13,4 +13,5 @@ class AuthContext(BaseModel):
     entity_type: EntityType
     entity_id: str  # uuid
     app_id: Optional[str] = None  # uuid, only for developers
+    # TODO: remove permissions, not required here.
     permissions: Set[str] = {"read"}
@@ -7,10 +7,6 @@ import logging

 logger = logging.getLogger(__name__)

-class EntityType(str, Enum):
-    USER = "user"
-    DEVELOPER = "developer"
-

 class QueryReturnType(str, Enum):
     CHUNKS = "chunks"
@@ -68,7 +64,7 @@ class ChunkResult(BaseModel):

 class DocumentContent(BaseModel):
     """Represents either a URL or content string"""
-    type: Literal["url", "string"]
+    type: Literal["url", "string"]
     value: str
     filename: Optional[str] = Field(None, description="Filename when type is url")

@@ -2,14 +2,15 @@ from abc import ABC, abstractmethod
 from typing import List, Union
 from fastapi import UploadFile


 class BaseParser(ABC):
     """Base class for document parsing"""

     @abstractmethod
     async def split_text(self, text: str) -> List[str]:
         """Split plain text into chunks"""
         pass

     @abstractmethod
     async def parse_file(self, file: Union[UploadFile, bytes], content_type: str) -> List[str]:
         """Parse file content into text chunks"""
@@ -1,7 +1,7 @@
 from typing import List
 import io
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from unstructured.partition.auto import partition
+from langchain_unstructured import UnstructuredLoader
 import logging

 from .base_parser import BaseParser
@@ -26,30 +26,17 @@ class UnstructuredAPIParser(BaseParser):

     async def split_text(self, text: str) -> List[str]:
         """Split plain text into chunks"""
-        try:
-            return self.text_splitter.split_text(text)
-        except Exception as e:
-            logger.error(f"Failed to split text: {str(e)}")
-            raise
+        return self.text_splitter.split_text(text)

     async def parse_file(self, file: bytes, content_type: str) -> List[str]:
         """Parse file content using unstructured"""
-        try:
-            # Parse with unstructured
-            elements = partition(
-                file=io.BytesIO(file),
-                content_type=content_type,
-                api_key=self.api_key
-            )
-
-            # Extract text from elements
-            chunks = []
-            for element in elements:
-                if hasattr(element, 'text') and element.text:
-                    chunks.append(element.text.strip())
-
-            return chunks
-
-        except Exception as e:
-            logger.error(f"Failed to parse file: {str(e)}")
-            raise e
+        # Parse with unstructured
+        loader = UnstructuredLoader(
+            file=io.BytesIO(file),
+            content_type=content_type,
+            partition_via_api=True,
+            api_key=self.api_key,
+            chunking_strategy="by_title"
+        )
+        elements = loader.load()
+        return [element.page_content for element in elements]
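A short usage sketch for the reworked parser. The module path and the constructor arguments of UnstructuredAPIParser are assumptions (the constructor is not shown in this diff); only the `parse_file(file: bytes, content_type: str)` signature comes from the hunk above.

```python
# Sketch only: exercising the loader-based parse_file.
import asyncio

from core.parser.unstructured_parser import UnstructuredAPIParser  # path assumed

async def main() -> None:
    parser = UnstructuredAPIParser(api_key="...")  # constructor args assumed
    with open("sample.pdf", "rb") as f:
        pdf_bytes = f.read()
    chunks = await parser.parse_file(pdf_bytes, "application/pdf")
    print(f"{len(chunks)} chunks, first chunk starts with: {chunks[0][:80]!r}")

asyncio.run(main())
```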
@@ -45,53 +45,49 @@ class DocumentService:
     ) -> Document:
         """Ingest a text document."""
         if "write" not in auth.permissions:
             logger.error(f"User {auth.entity_id} does not have write permission")
             raise PermissionError("User does not have write permission")
-        try:
-            # 1. Create document record
-            doc = Document(
-                content_type="text/plain",
-                metadata=request.metadata,
-                owner={
-                    "type": auth.entity_type,
-                    "id": auth.entity_id
-                },
-                access_control={
-                    "readers": [auth.entity_id],
-                    "writers": {auth.entity_id},
-                    "admins": {auth.entity_id}
-                }
-            )
-            logger.info(f"Created text document record with ID {doc.external_id}")
+        # 1. Create document record
+        doc = Document(
+            content_type="text/plain",
+            metadata=request.metadata,
+            owner={
+                "type": auth.entity_type,
+                "id": auth.entity_id
+            },
+            access_control={
+                "readers": [auth.entity_id],
+                "writers": [auth.entity_id],
+                "admins": [auth.entity_id]
+            }
+        )
+        logger.info(f"Created text document record with ID {doc.external_id}")

-            # 2. Parse content into chunks
-            chunks = await self.parser.split_text(request.content)
-            if not chunks:
-                raise ValueError("No content chunks extracted from text")
-            logger.info(f"Split text into {len(chunks)} chunks")
+        # 2. Parse content into chunks
+        chunks = await self.parser.split_text(request.content)
+        if not chunks:
+            raise ValueError("No content chunks extracted from text")
+        logger.info(f"Split text into {len(chunks)} chunks")

-            # 3. Generate embeddings for chunks
-            embeddings = await self.embedding_model.embed_for_ingestion(chunks)
-            logger.info(f"Generated {len(embeddings)} embeddings")
+        # 3. Generate embeddings for chunks
+        embeddings = await self.embedding_model.embed_for_ingestion(chunks)
+        logger.info(f"Generated {len(embeddings)} embeddings")

-            # 4. Create and store chunk objects
-            chunk_objects = self._create_chunk_objects(
-                doc.external_id,
-                chunks,
-                embeddings,
-                doc.metadata
-            )
-            logger.info(f"Created {len(chunk_objects)} chunk objects")
+        # 4. Create and store chunk objects
+        chunk_objects = self._create_chunk_objects(
+            doc.external_id,
+            chunks,
+            embeddings,
+            doc.metadata
+        )
+        logger.info(f"Created {len(chunk_objects)} chunk objects")

-            # 5. Store everything
-            await self._store_chunks_and_doc(chunk_objects, doc)
-            logger.info(f"Successfully stored text document {doc.external_id}")
+        # 5. Store everything
+        await self._store_chunks_and_doc(chunk_objects, doc)
+        logger.info(f"Successfully stored text document {doc.external_id}")

-            return doc
-
-        except Exception as e:
-            logger.error(f"Text document ingestion failed: {str(e)}")
-            # TODO: Clean up any stored data on failure
-            raise e
+        return doc

     async def ingest_file(
         self,
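One side effect of the hunk above worth noting: the `writers` and `admins` values in `access_control` switch from set literals to lists. Sets are not JSON-serializable (and cannot be BSON-encoded either), so the list form is presumably what lets the document record be dumped and stored cleanly. A quick standalone illustration, not part of the commit:

```python
import json

# Sets cannot be serialized by the stdlib json module...
try:
    json.dumps({"writers": {"user-123"}})
except TypeError as exc:
    print(exc)  # Object of type set is not JSON serializable

# ...while the list form used by the new code round-trips cleanly.
print(json.dumps({"writers": ["user-123"]}))  # {"writers": ["user-123"]}
```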
@@ -102,68 +98,63 @@ class DocumentService:
         """Ingest a file document."""
         if "write" not in auth.permissions:
             raise PermissionError("User does not have write permission")
-        try:
-            # 1. Create document record
-            doc = Document(
-                content_type=file.content_type,
-                filename=file.filename,
-                metadata=metadata,
-                owner={
-                    "type": auth.entity_type,
-                    "id": auth.entity_id
-                },
-                access_control={
-                    "readers": [auth.entity_id],
-                    "writers": {auth.entity_id},
-                    "admins": {auth.entity_id}
-                }
-            )
-            logger.info(f"Created file document record with ID {doc.external_id}")
+        # 1. Create document record
+        doc = Document(
+            content_type=file.content_type,
+            filename=file.filename,
+            metadata=metadata,
+            owner={
+                "type": auth.entity_type,
+                "id": auth.entity_id
+            },
+            access_control={
+                "readers": [auth.entity_id],
+                "writers": [auth.entity_id],
+                "admins": [auth.entity_id]
+            }
+        )
+        logger.info(f"Created file document record with ID {doc.external_id}")

-            # 2. Read and store file
-            file_content = await file.read()
-            storage_info = await self.storage.upload_from_base64(
-                base64.b64encode(file_content).decode(),
-                doc.external_id,
-                file.content_type
-            )
-            doc.storage_info = {
-                "bucket": storage_info[0],
-                "key": storage_info[1]
-            }
-            logger.info(
-                f"Stored file in bucket `{storage_info[0]}` with key `{storage_info[1]}`"
-            )
+        # 2. Read and store file
+        file_content = await file.read()
+        storage_info = await self.storage.upload_from_base64(
+            base64.b64encode(file_content).decode(),
+            doc.external_id,
+            file.content_type
+        )
+        doc.storage_info = {
+            "bucket": storage_info[0],
+            "key": storage_info[1]
+        }
+        logger.info(
+            f"Stored file in bucket `{storage_info[0]}` with key `{storage_info[1]}`"
+        )

-            # 3. Parse content into chunks
-            chunks = await self.parser.parse_file(file_content, file.content_type)
-            if not chunks:
-                raise ValueError("No content chunks extracted from file")
-            logger.info(f"Parsed file into {len(chunks)} chunks")
+        # 3. Parse content into chunks
+        chunks = await self.parser.parse_file(file_content, file.content_type)
+        if not chunks:
+            raise ValueError("No content chunks extracted from file")
+        logger.info(f"Parsed file into {len(chunks)} chunks")

-            # 4. Generate embeddings for chunks
-            embeddings = await self.embedding_model.embed_for_ingestion(chunks)
-            logger.info(f"Generated {len(embeddings)} embeddings")
+        # 4. Generate embeddings for chunks
+        embeddings = await self.embedding_model.embed_for_ingestion(chunks)
+        logger.info(f"Generated {len(embeddings)} embeddings")

-            # 5. Create and store chunk objects
-            chunk_objects = self._create_chunk_objects(
-                doc.external_id,
-                chunks,
-                embeddings,
-                doc.metadata
-            )
-            logger.info(f"Created {len(chunk_objects)} chunk objects")
+        # 5. Create and store chunk objects
+        chunk_objects = self._create_chunk_objects(
+            doc.external_id,
+            chunks,
+            embeddings,
+            doc.metadata
+        )
+        logger.info(f"Created {len(chunk_objects)} chunk objects")

-            # 6. Store everything
-            doc.chunk_ids = await self._store_chunks_and_doc(chunk_objects, doc)
-            logger.info(f"Successfully stored file document {doc.external_id}")
+        # 6. Store everything
+        doc.chunk_ids = await self._store_chunks_and_doc(chunk_objects, doc)
+        logger.info(f"Successfully stored file document {doc.external_id}")

-            return doc
-
-        except Exception as e:
-            logger.error(f"File document ingestion failed: {str(e)}")
-            # TODO: Clean up any stored data on failure
-            raise
+        return doc

     async def query(
         self,
@@ -171,39 +162,37 @@ class DocumentService:
         auth: AuthContext
     ) -> Union[List[ChunkResult], List[DocumentResult]]:
         """Query documents with specified return type."""
-        try:
-            # 1. Get embedding for query
-            query_embedding = await self.embedding_model.embed_for_query(request.query)
-            logger.info("Generated query embedding")
+        # TODO: k does not make sense for Documents, it's about chunks.
+        # We should also look into document ordering. Figure these out.

-            # 2. Find authorized documents
-            doc_ids = await self.db.find_documents(auth, request.filters)
-            if not doc_ids:
-                logger.info("No authorized documents found")
-                return []
-            logger.info(f"Found {len(doc_ids)} authorized documents")
+        # 1. Get embedding for query
+        query_embedding = await self.embedding_model.embed_for_query(request.query)
+        logger.info("Generated query embedding")

-            # 3. Search chunks with vector similarity
-            chunks = await self.vector_store.query_similar(
-                query_embedding,
-                k=request.k,
-                doc_ids=doc_ids,
-            )
-            logger.info(f"Found {len(chunks)} similar chunks")
+        # 2. Find authorized documents
+        doc_ids = await self.db.find_authorized_and_filtered_documents(auth, request.filters)
+        if not doc_ids:
+            logger.info("No authorized documents found")
+            return []
+        logger.info(f"Found {len(doc_ids)} authorized documents")

-            # 4. Return results in requested format
-            if request.return_type == QueryReturnType.CHUNKS:
-                results = await self._create_chunk_results(auth, chunks)
-                logger.info(f"Returning {len(results)} chunk results")
-                return results
-            else:
-                results = await self._create_document_results(auth, chunks)
-                logger.info(f"Returning {len(results)} document results")
-                return results
+        # 3. Search chunks with vector similarity
+        chunks = await self.vector_store.query_similar(
+            query_embedding,
+            k=request.k,
+            doc_ids=doc_ids,
+        )
+        logger.info(f"Found {len(chunks)} similar chunks")

-        except Exception as e:
-            logger.error(f"Query failed: {str(e)}")
-            raise e
+        # 4. Return results in requested format
+        if request.return_type == QueryReturnType.CHUNKS:
+            results = await self._create_chunk_results(auth, chunks)
+            logger.info(f"Returning {len(results)} chunk results")
+            return results
+        else:
+            results = await self._create_document_results(auth, chunks)
+            logger.info(f"Returning {len(results)} document results")
+            return results

     def _create_chunk_objects(
         self,
@@ -240,8 +229,8 @@ class DocumentService:
         if not await self.db.store_document(doc):
             raise Exception("Failed to store document metadata")
         logger.debug("Stored document metadata in database")

-        return [str(id) for id in result.inserted_ids]
+        logger.debug(f"Chunk IDs stored: {result}")
+        return result

     async def _create_chunk_results(
         self,
@@ -1,24 +1,23 @@
 from abc import ABC, abstractmethod
 from typing import Tuple, Optional, Union, BinaryIO
 from pathlib import Path


 class BaseStorage(ABC):
     """Base interface for storage providers."""

     @abstractmethod
-    async def upload_file(self,
-                          file: Union[str, bytes, BinaryIO],
-                          key: str,
-                          content_type: Optional[str] = None) -> Tuple[str, str]:
+    async def upload_file(self,
+                          file: Union[str, bytes, BinaryIO],
+                          key: str,
+                          content_type: Optional[str] = None) -> Tuple[str, str]:
         """
         Upload a file to storage.

         Args:
             file: File content as string, bytes or file object
             key: Storage key/path for the file
             content_type: Optional MIME type

         Returns:
             Tuple[str, str]: (bucket/container name, storage key)
         """
@@ -26,17 +25,17 @@ class BaseStorage(ABC):

     @abstractmethod
     async def upload_from_base64(self,
-                                 content: str,
-                                 key: str,
-                                 content_type: Optional[str] = None) -> Tuple[str, str]:
+                                 content: str,
+                                 key: str,
+                                 content_type: Optional[str] = None) -> Tuple[str, str]:
         """
         Upload base64 encoded content.

         Args:
             content: Base64 encoded content
             key: Storage key/path
             content_type: Optional MIME type

         Returns:
             Tuple[str, str]: (bucket/container name, storage key)
         """
@@ -46,11 +45,11 @@ class BaseStorage(ABC):
     async def download_file(self, bucket: str, key: str) -> bytes:
         """
         Download file from storage.

         Args:
             bucket: Bucket/container name
             key: Storage key/path

         Returns:
             bytes: File content
         """
@@ -60,12 +59,12 @@ class BaseStorage(ABC):
     async def get_download_url(self, bucket: str, key: str, expires_in: int = 3600) -> str:
         """
         Get temporary download URL.

         Args:
             bucket: Bucket/container name
             key: Storage key/path
             expires_in: URL expiration in seconds

         Returns:
             str: Presigned download URL
         """
@@ -75,11 +74,11 @@ class BaseStorage(ABC):
     async def delete_file(self, bucket: str, key: str) -> bool:
         """
         Delete file from storage.

         Args:
             bucket: Bucket/container name
             key: Storage key/path

         Returns:
             bool: True if successful
         """
@@ -17,6 +17,7 @@ logger = logging.getLogger(__name__)
 class S3Storage(BaseStorage):
     """AWS S3 storage implementation."""

+    # TODO: Remove hardcoded values.
     def __init__(
         self,
         aws_access_key: str,
@@ -40,4 +40,4 @@ def detect_file_type(content: str) -> str:
         'application/msword': '.doc',
         'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx'
     }
-    return extension_map.get(detected_type, '.bin')
+    return extension_map.get(detected_type, '.bin')
@@ -6,7 +6,7 @@ from core.models.documents import DocumentChunk

 class BaseVectorStore(ABC):
     @abstractmethod
-    def store_embeddings(self, chunks: List[DocumentChunk]) -> Tuple[bool, Optional[Any]]:
+    def store_embeddings(self, chunks: List[DocumentChunk]) -> Tuple[bool, List[str]]:
         """Store document chunks and their embeddings"""
         pass

@@ -1,4 +1,4 @@
-from typing import List, Optional
+from typing import List, Optional, Tuple
 import logging
 from motor.motor_asyncio import AsyncIOMotorClient
 from pymongo.errors import PyMongoError
@@ -41,11 +41,11 @@ class MongoDBAtlasVectorStore(BaseVectorStore):
             logger.error(f"Error initializing vector store indexes: {str(e)}")
             return False

-    async def store_embeddings(self, chunks: List[DocumentChunk]) -> bool:
+    async def store_embeddings(self, chunks: List[DocumentChunk]) -> Tuple[bool, List[str]]:
         """Store document chunks with their embeddings."""
         try:
             if not chunks:
-                return True
+                return True, []

             # Convert chunks to dicts
             documents = []
@@ -65,12 +65,12 @@ class MongoDBAtlasVectorStore(BaseVectorStore):
             result = await self.collection.insert_many(
                 documents, ordered=False
             )
-            return len(result.inserted_ids) > 0, result
-            return False, None
+            return len(result.inserted_ids) > 0, [str(id) for id in result.inserted_ids]
+            return False, []

         except PyMongoError as e:
             logger.error(f"Error storing embeddings: {str(e)}")
-            return False
+            return False, []

     async def query_similar(
         self,
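`store_embeddings` now returns a `(success, chunk_ids)` pair instead of a bare bool or a raw pymongo result, which is what lets `DocumentService.ingest_file` set `doc.chunk_ids` above. A minimal sketch of how a caller consumes the new contract; it assumes `vector_store`, `chunk_objects` and `doc` are in scope as they are inside `DocumentService`, and the helper body of `_store_chunks_and_doc` is not shown in this diff:

```python
# Sketch: threading the stored chunk IDs back onto the document record.
success, chunk_ids = await vector_store.store_embeddings(chunk_objects)
if not success:
    raise Exception("Failed to store chunk embeddings")
doc.chunk_ids = chunk_ids  # stringified ObjectIds in the MongoDB implementation
```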
@@ -23,4 +23,4 @@ token = jwt.encode(payload, jwt_secret, algorithm="HS256")

 # Create URI
 uri = f"databridge://test_dev:{token}@127.0.0.1:8000"
-print(uri)
+print(uri)
@@ -1,10 +1,11 @@
-import json
 from typing import Dict, Any, List, Optional, Union, BinaryIO
 import httpx
 from urllib.parse import urlparse
 import jwt
 from pydantic import BaseModel
+import json
 from pathlib import Path
+from io import BytesIO


 class IngestTextRequest(BaseModel):
@@ -189,9 +190,10 @@ class DataBridge:
             file_path = Path(file)
             if not file_path.exists():
                 raise ValueError(f"File not found: {file}")
-            file_obj = open(file_path, "rb")
+            with open(file_path, "rb") as f:
+                content = f.read()
+            file_obj = BytesIO(content)
         elif isinstance(file, bytes):
-            from io import BytesIO
             file_obj = BytesIO(file)
         else:
             file_obj = file
@@ -201,7 +203,7 @@ class DataBridge:
         files = {
             "file": (filename, file_obj, content_type or "application/octet-stream")
         }

         # Add metadata
         data = {"metadata": json.dumps(metadata or {})}

@@ -227,17 +229,17 @@ class DataBridge:
     ) -> Union[List[ChunkResult], List[DocumentResult]]:
         """
         Query documents in DataBridge.

         Args:
             query: Search query text
             return_type: Type of results ("chunks" or "documents")
             filters: Optional metadata filters
             k: Number of results (default: 4)
             min_score: Minimum similarity threshold (default: 0.0)

         Returns:
             List[ChunkResult] or List[DocumentResult] depending on return_type

         Example:
             ```python
             # Query for chunks
@@ -246,7 +248,7 @@ class DataBridge:
                 return_type="chunks",
                 filters={"department": "research"}
             )

             # Query for documents
             docs = await db.query(
                 "machine learning",
@@ -264,7 +266,7 @@ class DataBridge:
         }

         response = await self._request("POST", "query", request)

         if return_type == "chunks":
             return [ChunkResult(**r) for r in response]
         return [DocumentResult(**r) for r in response]
@@ -272,15 +274,17 @@ class DataBridge:
     async def list_documents(
         self,
         skip: int = 0,
-        limit: int = 100
+        limit: int = 100,
+        filters: Optional[Dict[str, Any]] = None
     ) -> List[Document]:
         """
-        List accessible documents with pagination.
+        List accessible documents.

         Args:
             skip: Number of documents to skip
             limit: Maximum number of documents to return
+            filters: Optional filters

         Returns:
             List[Document]: List of accessible documents

@@ -290,12 +294,12 @@ class DataBridge:
             docs = await db.list_documents(limit=10)

             # Get next page
-            next_page = await db.list_documents(skip=10, limit=10)
+            next_page = await db.list_documents(skip=10, limit=10, filters={"department": "research"})
             ```
         """
         response = await self._request(
             "GET",
-            f"documents?skip={skip}&limit={limit}"
+            f"documents?skip={skip}&limit={limit}&filters={filters}"
         )
         return [Document(**doc) for doc in response]

@@ -24,16 +24,14 @@ class LocalDataBridge(DataBridge):
         self._base_url = self._base_url.replace("https://", "http://")


 # Get DataBridge URI from environment
 # Format: databridge://owner_id:token@host
-DATABRIDGE_URI = "databridge://test_dev:eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ0eXBlIjoiZGV2ZWxvcGVyIiwiZW50aXR5X2lkIjoidGVzdF9kZXYiLCJwZXJtaXNzaW9ucyI6WyJyZWFkIiwid3JpdGUiLCJhZG1pbiJdLCJleHAiOjE3MzU3ODk3NTd9.lYL1czdIeclMIl8BohTd3C9lpnvMi1gdNGpiec9sXv4@127.0.0.1:8000"
-print(f"DATABRIDGE_URI: {DATABRIDGE_URI}")
+# Can be generated using generate_local_uri.py
+DATABRIDGE_URI = ""


 async def demonstrate_text_ingestion(db: LocalDataBridge):
     """Demonstrate text document ingestion"""
     print("\n=== Text Ingestion Example ===")

     # Ingest a text document with metadata
     doc = await db.ingest_text(
         content=(
@@ -49,25 +47,25 @@ async def demonstrate_text_ingestion(db: LocalDataBridge):
             "difficulty": "intermediate"
         }
     )

     print("✓ Text document ingested")
     print(f"  ID: {doc.external_id}")
     print(f"  Content Type: {doc.content_type}")
     print(f"  Tags: {doc.metadata.get('tags')}")

     return doc.external_id


 async def demonstrate_file_ingestion(db: LocalDataBridge):
     """Demonstrate file document ingestion"""
     print("\n=== File Ingestion Example ===")

     # Create a sample PDF file
     pdf_path = Path("sample.pdf")
     if not pdf_path.exists():
         print("× Sample PDF not found, skipping file ingestion")
         return

     # Method 1: Ingest using file path
     doc1 = await db.ingest_file(
         file=pdf_path,
@@ -82,7 +80,7 @@ async def demonstrate_file_ingestion(db: LocalDataBridge):
     print("✓ File ingested from path")
     print(f"  ID: {doc1.external_id}")
     print(f"  Storage Info: {doc1.storage_info}")

     # Method 2: Ingest using file object
     with open(pdf_path, "rb") as f:
         doc2 = await db.ingest_file(
@@ -92,14 +90,14 @@ async def demonstrate_file_ingestion(db: LocalDataBridge):
         )
     print("✓ File ingested from file object")
     print(f"  ID: {doc2.external_id}")

     return doc1.external_id


 async def demonstrate_querying(db: LocalDataBridge, doc_id: str):
     """Demonstrate document querying"""
     print("\n=== Querying Example ===")

     # Query 1: Search for chunks
     print("\nSearching for chunks about machine learning:")
     chunks = await db.query(
@@ -108,12 +106,12 @@ async def demonstrate_querying(db: LocalDataBridge, doc_id: str):
         k=2,
         filters={"category": "technical"}
     )

     for chunk in chunks:
         print(f"\nChunk from document {chunk.document_id}")
         print(f"Score: {chunk.score:.2f}")
         print(f"Content: {chunk.content[:200]}...")

     # Query 2: Search for documents
     print("\nSearching for documents about deep learning:")
     docs = await db.query(
@@ -121,7 +119,7 @@ async def demonstrate_querying(db: LocalDataBridge, doc_id: str):
         return_type="documents",
         filters={"tags": ["ML", "AI"]}
     )

     for doc in docs:
         print(f"\nDocument: {doc.document_id}")
         print(f"Score: {doc.score:.2f}")
@@ -131,13 +129,13 @@ async def demonstrate_querying(db: LocalDataBridge, doc_id: str):
 async def demonstrate_document_management(db: LocalDataBridge):
     """Demonstrate document management operations"""
     print("\n=== Document Management Example ===")

     # List documents with pagination
     print("\nListing first 5 documents:")
     docs = await db.list_documents(limit=5)
     for doc in docs:
         print(f"- {doc.external_id}: {doc.metadata.get('title', 'Untitled')}")

     if docs:
         # Get specific document details
         doc_id = docs[0].external_id