refactor some stuff (#2)

* refactor some stuff, remove bare try catches
Adityavardhan Agrawal 2024-12-15 14:31:25 -05:00 committed by GitHub
parent 0f492ed7c0
commit df8d7fcdd0
16 changed files with 207 additions and 246 deletions
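
The pattern removed throughout the routes below is the blanket except Exception that converted every failure into an HTTP 400. A minimal before/after sketch of that pattern, condensed from the query route rather than copied verbatim:

# Before: any error, expected or not, is swallowed and re-raised as a 400
@app.post("/query")
async def query_documents(request: QueryRequest, auth: AuthContext = Depends(verify_token)):
    try:
        return await document_service.query(request, auth)
    except Exception as e:
        logger.error(f"Query failed: {str(e)}")
        raise HTTPException(status_code=400, detail=str(e))

# After: only specific, expected errors are translated; anything else propagates
@app.post("/query")
async def query_documents(request: QueryRequest, auth: AuthContext = Depends(verify_token)):
    return await document_service.query(request, auth)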

View File

@ -1,6 +1,6 @@
import json
from datetime import datetime, UTC
from typing import List, Union
from typing import Any, Dict, List, Optional, Union
from fastapi import (
FastAPI,
Form,
@ -16,10 +16,9 @@ from core.models.request import IngestTextRequest, QueryRequest
from core.models.documents import (
Document,
DocumentResult,
ChunkResult,
EntityType
ChunkResult
)
from core.models.auth import AuthContext
from core.models.auth import AuthContext, EntityType
from core.services.document_service import DocumentService
from core.config import get_settings
from core.database.mongo_database import MongoDatabase
@ -127,8 +126,6 @@ async def ingest_text(
return await document_service.ingest_text(request, auth)
except PermissionError as e:
raise HTTPException(status_code=403, detail=str(e))
except Exception as e:
raise HTTPException(status_code=400, detail=str(e))
@app.post("/ingest/file", response_model=Document)
@ -141,13 +138,11 @@ async def ingest_file(
try:
metadata_dict = json.loads(metadata)
doc = await document_service.ingest_file(file, metadata_dict, auth)
return doc # Should just send a success response, not sure why we're sending a document #TODO: discuss with bhau
return doc # TODO: Might be lighter on network to just send the document ID.
except PermissionError as e:
raise HTTPException(status_code=403, detail=str(e))
except json.JSONDecodeError:
raise HTTPException(400, "Invalid metadata JSON")
except Exception as e:
raise HTTPException(status_code=400, detail=str(e))
@app.post("/query", response_model=Union[List[ChunkResult], List[DocumentResult]])
@ -156,24 +151,18 @@ async def query_documents(
auth: AuthContext = Depends(verify_token)
):
"""Query documents with specified return type."""
try:
return await document_service.query(request, auth)
except Exception as e:
logger.error(f"Query failed: {str(e)}")
raise HTTPException(status_code=400, detail=str(e))
return await document_service.query(request, auth)
@app.get("/documents", response_model=List[Document])
async def list_documents(
auth: AuthContext = Depends(verify_token),
skip: int = 0,
limit: int = 100
limit: int = 100,
filters: Optional[Dict[str, Any]] = None
):
"""List accessible documents."""
try:
return await document_service.db.get_documents(auth, skip, limit)
except Exception as e:
raise HTTPException(status_code=400, detail=str(e))
return await document_service.db.get_documents(auth, skip, limit, filters)
@app.get("/documents/{document_id}", response_model=Document)
@ -189,6 +178,5 @@ async def get_document(
raise HTTPException(status_code=404, detail="Document not found")
return doc
except HTTPException as e:
raise e # Return the HTTPException as is
except Exception as e:
raise HTTPException(status_code=400, detail=str(e))
logger.error(f"Error getting document: {e}")
raise e
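
With the per-route catch-alls gone, unhandled exceptions fall through to FastAPI's default 500 response. If centralized logging of those failures is still wanted, one option (not part of this commit, shown only as a hedged sketch) is an app-level handler:

from fastapi import Request
from fastapi.responses import JSONResponse

@app.exception_handler(Exception)
async def unhandled_exception_handler(request: Request, exc: Exception):
    # Log the failure once, centrally, instead of per route
    logger.error(f"Unhandled error on {request.url.path}: {exc}")
    return JSONResponse(status_code=500, content={"detail": "Internal server error"})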

View File

@ -62,17 +62,14 @@ class BaseDatabase(ABC):
Returns: Success status
"""
pass
@abstractmethod
async def find_documents(
async def find_authorized_and_filtered_documents(
self,
auth: AuthContext,
filters: Optional[Dict[str, Any]] = None
) -> List[str]:
"""
Find document IDs matching filters that user has access to.
Returns: List of document IDs
"""
"""Find document IDs matching filters that user has access to."""
pass
@abstractmethod

View File

@ -152,7 +152,7 @@ class MongoDatabase(BaseDatabase):
logger.error(f"Error deleting document: {str(e)}")
return False
async def find_documents(
async def find_authorized_and_filtered_documents(
self,
auth: AuthContext,
filters: Optional[Dict[str, Any]] = None

View File

@ -13,4 +13,5 @@ class AuthContext(BaseModel):
entity_type: EntityType
entity_id: str # uuid
app_id: Optional[str] = None # uuid, only for developers
# TODO: remove permissions, not required here.
permissions: Set[str] = {"read"}
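
With EntityType now exported from core.models.auth alongside AuthContext, constructing a context looks roughly like this (illustrative values; the UUIDs are placeholders):

from core.models.auth import AuthContext, EntityType

auth = AuthContext(
    entity_type=EntityType.DEVELOPER,
    entity_id="6f1c-placeholder",   # uuid of the developer
    app_id="9b2d-placeholder",      # uuid, only set for developers
    permissions={"read", "write"},
)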

View File

@ -7,10 +7,6 @@ import logging
logger = logging.getLogger(__name__)
class EntityType(str, Enum):
USER = "user"
DEVELOPER = "developer"
class QueryReturnType(str, Enum):
CHUNKS = "chunks"
@ -68,7 +64,7 @@ class ChunkResult(BaseModel):
class DocumentContent(BaseModel):
"""Represents either a URL or content string"""
type: Literal["url", "string"]
type: Literal["url", "string"]
value: str
filename: Optional[str] = Field(None, description="Filename when type is url")

View File

@ -2,14 +2,15 @@ from abc import ABC, abstractmethod
from typing import List, Union
from fastapi import UploadFile
class BaseParser(ABC):
"""Base class for document parsing"""
@abstractmethod
async def split_text(self, text: str) -> List[str]:
"""Split plain text into chunks"""
pass
@abstractmethod
async def parse_file(self, file: Union[UploadFile, bytes], content_type: str) -> List[str]:
"""Parse file content into text chunks"""

View File

@ -1,7 +1,7 @@
from typing import List
import io
from langchain.text_splitter import RecursiveCharacterTextSplitter
from unstructured.partition.auto import partition
from langchain_unstructured import UnstructuredLoader
import logging
from .base_parser import BaseParser
@ -26,30 +26,17 @@ class UnstructuredAPIParser(BaseParser):
async def split_text(self, text: str) -> List[str]:
"""Split plain text into chunks"""
try:
return self.text_splitter.split_text(text)
except Exception as e:
logger.error(f"Failed to split text: {str(e)}")
raise
return self.text_splitter.split_text(text)
async def parse_file(self, file: bytes, content_type: str) -> List[str]:
"""Parse file content using unstructured"""
try:
# Parse with unstructured
elements = partition(
file=io.BytesIO(file),
content_type=content_type,
api_key=self.api_key
)
# Extract text from elements
chunks = []
for element in elements:
if hasattr(element, 'text') and element.text:
chunks.append(element.text.strip())
return chunks
except Exception as e:
logger.error(f"Failed to parse file: {str(e)}")
raise e
# Parse with unstructured
loader = UnstructuredLoader(
file=io.BytesIO(file),
content_type=content_type,
partition_via_api=True,
api_key=self.api_key,
chunking_strategy="by_title"
)
elements = loader.load()
return [element.page_content for element in elements]
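
A hedged usage sketch of the new loader-based parse path; the import path and constructor arguments are assumptions, since only the method bodies appear in this diff:

import asyncio
from core.parser.unstructured_parser import UnstructuredAPIParser  # assumed module path

async def main():
    parser = UnstructuredAPIParser(api_key="...")  # assumed constructor; only self.api_key is visible above
    with open("report.pdf", "rb") as f:
        chunks = await parser.parse_file(f.read(), "application/pdf")
    # Each chunk is the page_content of one element produced by the "by_title" chunking strategy
    print(len(chunks))

asyncio.run(main())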

View File

@ -45,53 +45,49 @@ class DocumentService:
) -> Document:
"""Ingest a text document."""
if "write" not in auth.permissions:
logger.error(f"User {auth.entity_id} does not have write permission")
raise PermissionError("User does not have write permission")
try:
# 1. Create document record
doc = Document(
content_type="text/plain",
metadata=request.metadata,
owner={
"type": auth.entity_type,
"id": auth.entity_id
},
access_control={
"readers": [auth.entity_id],
"writers": {auth.entity_id},
"admins": {auth.entity_id}
}
)
logger.info(f"Created text document record with ID {doc.external_id}")
# 2. Parse content into chunks
chunks = await self.parser.split_text(request.content)
if not chunks:
raise ValueError("No content chunks extracted from text")
logger.info(f"Split text into {len(chunks)} chunks")
# 1. Create document record
doc = Document(
content_type="text/plain",
metadata=request.metadata,
owner={
"type": auth.entity_type,
"id": auth.entity_id
},
access_control={
"readers": [auth.entity_id],
"writers": [auth.entity_id],
"admins": [auth.entity_id]
}
)
logger.info(f"Created text document record with ID {doc.external_id}")
# 3. Generate embeddings for chunks
embeddings = await self.embedding_model.embed_for_ingestion(chunks)
logger.info(f"Generated {len(embeddings)} embeddings")
# 2. Parse content into chunks
chunks = await self.parser.split_text(request.content)
if not chunks:
raise ValueError("No content chunks extracted from text")
logger.info(f"Split text into {len(chunks)} chunks")
# 4. Create and store chunk objects
chunk_objects = self._create_chunk_objects(
doc.external_id,
chunks,
embeddings,
doc.metadata
)
logger.info(f"Created {len(chunk_objects)} chunk objects")
# 3. Generate embeddings for chunks
embeddings = await self.embedding_model.embed_for_ingestion(chunks)
logger.info(f"Generated {len(embeddings)} embeddings")
# 5. Store everything
await self._store_chunks_and_doc(chunk_objects, doc)
logger.info(f"Successfully stored text document {doc.external_id}")
# 4. Create and store chunk objects
chunk_objects = self._create_chunk_objects(
doc.external_id,
chunks,
embeddings,
doc.metadata
)
logger.info(f"Created {len(chunk_objects)} chunk objects")
return doc
# 5. Store everything
await self._store_chunks_and_doc(chunk_objects, doc)
logger.info(f"Successfully stored text document {doc.external_id}")
except Exception as e:
logger.error(f"Text document ingestion failed: {str(e)}")
# TODO: Clean up any stored data on failure
raise e
return doc
async def ingest_file(
self,
@ -102,68 +98,63 @@ class DocumentService:
"""Ingest a file document."""
if "write" not in auth.permissions:
raise PermissionError("User does not have write permission")
try:
# 1. Create document record
doc = Document(
content_type=file.content_type,
filename=file.filename,
metadata=metadata,
owner={
"type": auth.entity_type,
"id": auth.entity_id
},
access_control={
"readers": [auth.entity_id],
"writers": {auth.entity_id},
"admins": {auth.entity_id}
}
)
logger.info(f"Created file document record with ID {doc.external_id}")
# 2. Read and store file
file_content = await file.read()
storage_info = await self.storage.upload_from_base64(
base64.b64encode(file_content).decode(),
doc.external_id,
file.content_type
)
doc.storage_info = {
"bucket": storage_info[0],
"key": storage_info[1]
# 1. Create document record
doc = Document(
content_type=file.content_type,
filename=file.filename,
metadata=metadata,
owner={
"type": auth.entity_type,
"id": auth.entity_id
},
access_control={
"readers": [auth.entity_id],
"writers": [auth.entity_id],
"admins": [auth.entity_id]
}
logger.info(
f"Stored file in bucket `{storage_info[0]}` with key `{storage_info[1]}`"
)
)
logger.info(f"Created file document record with ID {doc.external_id}")
# 3. Parse content into chunks
chunks = await self.parser.parse_file(file_content, file.content_type)
if not chunks:
raise ValueError("No content chunks extracted from file")
logger.info(f"Parsed file into {len(chunks)} chunks")
# 2. Read and store file
file_content = await file.read()
storage_info = await self.storage.upload_from_base64(
base64.b64encode(file_content).decode(),
doc.external_id,
file.content_type
)
doc.storage_info = {
"bucket": storage_info[0],
"key": storage_info[1]
}
logger.info(
f"Stored file in bucket `{storage_info[0]}` with key `{storage_info[1]}`"
)
# 4. Generate embeddings for chunks
embeddings = await self.embedding_model.embed_for_ingestion(chunks)
logger.info(f"Generated {len(embeddings)} embeddings")
# 3. Parse content into chunks
chunks = await self.parser.parse_file(file_content, file.content_type)
if not chunks:
raise ValueError("No content chunks extracted from file")
logger.info(f"Parsed file into {len(chunks)} chunks")
# 5. Create and store chunk objects
chunk_objects = self._create_chunk_objects(
doc.external_id,
chunks,
embeddings,
doc.metadata
)
logger.info(f"Created {len(chunk_objects)} chunk objects")
# 4. Generate embeddings for chunks
embeddings = await self.embedding_model.embed_for_ingestion(chunks)
logger.info(f"Generated {len(embeddings)} embeddings")
# 6. Store everything
doc.chunk_ids = await self._store_chunks_and_doc(chunk_objects, doc)
logger.info(f"Successfully stored file document {doc.external_id}")
# 5. Create and store chunk objects
chunk_objects = self._create_chunk_objects(
doc.external_id,
chunks,
embeddings,
doc.metadata
)
logger.info(f"Created {len(chunk_objects)} chunk objects")
return doc
# 6. Store everything
doc.chunk_ids = await self._store_chunks_and_doc(chunk_objects, doc)
logger.info(f"Successfully stored file document {doc.external_id}")
except Exception as e:
logger.error(f"File document ingestion failed: {str(e)}")
# TODO: Clean up any stored data on failure
raise
return doc
async def query(
self,
@ -171,39 +162,37 @@ class DocumentService:
auth: AuthContext
) -> Union[List[ChunkResult], List[DocumentResult]]:
"""Query documents with specified return type."""
try:
# 1. Get embedding for query
query_embedding = await self.embedding_model.embed_for_query(request.query)
logger.info("Generated query embedding")
# TODO: k does not make sense for Documents, it's about chunks.
# We should also look into document ordering. Figure these out.
# 1. Get embedding for query
query_embedding = await self.embedding_model.embed_for_query(request.query)
logger.info("Generated query embedding")
# 2. Find authorized documents
doc_ids = await self.db.find_documents(auth, request.filters)
if not doc_ids:
logger.info("No authorized documents found")
return []
logger.info(f"Found {len(doc_ids)} authorized documents")
# 2. Find authorized documents
doc_ids = await self.db.find_authorized_and_filtered_documents(auth, request.filters)
if not doc_ids:
logger.info("No authorized documents found")
return []
logger.info(f"Found {len(doc_ids)} authorized documents")
# 3. Search chunks with vector similarity
chunks = await self.vector_store.query_similar(
query_embedding,
k=request.k,
doc_ids=doc_ids,
)
logger.info(f"Found {len(chunks)} similar chunks")
# 3. Search chunks with vector similarity
chunks = await self.vector_store.query_similar(
query_embedding,
k=request.k,
doc_ids=doc_ids,
)
logger.info(f"Found {len(chunks)} similar chunks")
# 4. Return results in requested format
if request.return_type == QueryReturnType.CHUNKS:
results = await self._create_chunk_results(auth, chunks)
logger.info(f"Returning {len(results)} chunk results")
return results
else:
results = await self._create_document_results(auth, chunks)
logger.info(f"Returning {len(results)} document results")
return results
except Exception as e:
logger.error(f"Query failed: {str(e)}")
raise e
# 4. Return results in requested format
if request.return_type == QueryReturnType.CHUNKS:
results = await self._create_chunk_results(auth, chunks)
logger.info(f"Returning {len(results)} chunk results")
return results
else:
results = await self._create_document_results(auth, chunks)
logger.info(f"Returning {len(results)} document results")
return results
def _create_chunk_objects(
self,
@ -240,8 +229,8 @@ class DocumentService:
if not await self.db.store_document(doc):
raise Exception("Failed to store document metadata")
logger.debug("Stored document metadata in database")
return [str(id) for id in result.inserted_ids]
logger.debug(f"Chunk IDs stored: {result}")
return result
async def _create_chunk_results(
self,
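
The tail of _store_chunks_and_doc above now logs and returns the IDs coming back from the vector store's new (success, ids) return shape. The full body is not shown in this diff; a hedged sketch of how it likely threads those IDs through:

async def _store_chunks_and_doc(self, chunk_objects, doc):
    # store_embeddings now returns (success, inserted_ids) instead of a bare bool
    success, result = await self.vector_store.store_embeddings(chunk_objects)
    if not success:
        raise Exception("Failed to store chunk embeddings")
    if not await self.db.store_document(doc):
        raise Exception("Failed to store document metadata")
    logger.debug("Stored document metadata in database")
    logger.debug(f"Chunk IDs stored: {result}")
    return result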

View File

@ -1,24 +1,23 @@
from abc import ABC, abstractmethod
from typing import Tuple, Optional, Union, BinaryIO
from pathlib import Path
class BaseStorage(ABC):
"""Base interface for storage providers."""
@abstractmethod
async def upload_file(self,
file: Union[str, bytes, BinaryIO],
key: str,
content_type: Optional[str] = None) -> Tuple[str, str]:
async def upload_file(self,
file: Union[str, bytes, BinaryIO],
key: str,
content_type: Optional[str] = None) -> Tuple[str, str]:
"""
Upload a file to storage.
Args:
file: File content as string, bytes or file object
key: Storage key/path for the file
content_type: Optional MIME type
Returns:
Tuple[str, str]: (bucket/container name, storage key)
"""
@ -26,17 +25,17 @@ class BaseStorage(ABC):
@abstractmethod
async def upload_from_base64(self,
content: str,
key: str,
content_type: Optional[str] = None) -> Tuple[str, str]:
content: str,
key: str,
content_type: Optional[str] = None) -> Tuple[str, str]:
"""
Upload base64 encoded content.
Args:
content: Base64 encoded content
key: Storage key/path
content_type: Optional MIME type
Returns:
Tuple[str, str]: (bucket/container name, storage key)
"""
@ -46,11 +45,11 @@ class BaseStorage(ABC):
async def download_file(self, bucket: str, key: str) -> bytes:
"""
Download file from storage.
Args:
bucket: Bucket/container name
key: Storage key/path
Returns:
bytes: File content
"""
@ -60,12 +59,12 @@ class BaseStorage(ABC):
async def get_download_url(self, bucket: str, key: str, expires_in: int = 3600) -> str:
"""
Get temporary download URL.
Args:
bucket: Bucket/container name
key: Storage key/path
expires_in: URL expiration in seconds
Returns:
str: Presigned download URL
"""
@ -75,11 +74,11 @@ class BaseStorage(ABC):
async def delete_file(self, bucket: str, key: str) -> bool:
"""
Delete file from storage.
Args:
bucket: Bucket/container name
key: Storage key/path
Returns:
bool: True if successful
"""

View File

@ -17,6 +17,7 @@ logger = logging.getLogger(__name__)
class S3Storage(BaseStorage):
"""AWS S3 storage implementation."""
# TODO: Remove hardcoded values.
def __init__(
self,
aws_access_key: str,

View File

@ -40,4 +40,4 @@ def detect_file_type(content: str) -> str:
'application/msword': '.doc',
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx'
}
return extension_map.get(detected_type, '.bin')
return extension_map.get(detected_type, '.bin')
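
Only the tail of detect_file_type appears here; per the signature in the hunk header it maps raw content to a file extension, falling back to '.bin'. A hedged usage sketch:

ext = detect_file_type(file_content_str)   # e.g. '.docx' for a Word document, '.bin' if unrecognized
filename = f"upload{ext}"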

View File

@ -6,7 +6,7 @@ from core.models.documents import DocumentChunk
class BaseVectorStore(ABC):
@abstractmethod
def store_embeddings(self, chunks: List[DocumentChunk]) -> Tuple[bool, Optional[Any]]:
def store_embeddings(self, chunks: List[DocumentChunk]) -> Tuple[bool, List[str]]:
"""Store document chunks and their embeddings"""
pass

View File

@ -1,4 +1,4 @@
from typing import List, Optional
from typing import List, Optional, Tuple
import logging
from motor.motor_asyncio import AsyncIOMotorClient
from pymongo.errors import PyMongoError
@ -41,11 +41,11 @@ class MongoDBAtlasVectorStore(BaseVectorStore):
logger.error(f"Error initializing vector store indexes: {str(e)}")
return False
async def store_embeddings(self, chunks: List[DocumentChunk]) -> bool:
async def store_embeddings(self, chunks: List[DocumentChunk]) -> Tuple[bool, List[str]]:
"""Store document chunks with their embeddings."""
try:
if not chunks:
return True
return True, []
# Convert chunks to dicts
documents = []
@ -65,12 +65,12 @@ class MongoDBAtlasVectorStore(BaseVectorStore):
result = await self.collection.insert_many(
documents, ordered=False
)
return len(result.inserted_ids) > 0, result
return False, None
return len(result.inserted_ids) > 0, [str(id) for id in result.inserted_ids]
return False, []
except PyMongoError as e:
logger.error(f"Error storing embeddings: {str(e)}")
return False
return False, []
async def query_similar(
self,

View File

@ -23,4 +23,4 @@ token = jwt.encode(payload, jwt_secret, algorithm="HS256")
# Create URI
uri = f"databridge://test_dev:{token}@127.0.0.1:8000"
print(uri)
print(uri)
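
Only the tail of the local-URI script is shown. The claims visible in the developer token previously hardcoded in the example file below (type, entity_id, permissions, expiry) suggest a payload roughly like the following; the secret and expiry window are assumptions:

import jwt
from datetime import datetime, timedelta, UTC

jwt_secret = "your-jwt-secret"  # assumption: must match the server's configured secret
payload = {
    "type": "developer",
    "entity_id": "test_dev",
    "permissions": ["read", "write", "admin"],
    "exp": datetime.now(UTC) + timedelta(days=30),  # assumed lifetime
}
token = jwt.encode(payload, jwt_secret, algorithm="HS256")

# Create URI
uri = f"databridge://test_dev:{token}@127.0.0.1:8000"
print(uri)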

View File

@ -1,10 +1,11 @@
import json
from typing import Dict, Any, List, Optional, Union, BinaryIO
import httpx
from urllib.parse import urlparse
import jwt
from pydantic import BaseModel
import json
from pathlib import Path
from io import BytesIO
class IngestTextRequest(BaseModel):
@ -189,9 +190,10 @@ class DataBridge:
file_path = Path(file)
if not file_path.exists():
raise ValueError(f"File not found: {file}")
file_obj = open(file_path, "rb")
with open(file_path, "rb") as f:
content = f.read()
file_obj = BytesIO(content)
elif isinstance(file, bytes):
from io import BytesIO
file_obj = BytesIO(file)
else:
file_obj = file
@ -201,7 +203,7 @@ class DataBridge:
files = {
"file": (filename, file_obj, content_type or "application/octet-stream")
}
# Add metadata
data = {"metadata": json.dumps(metadata or {})}
@ -227,17 +229,17 @@ class DataBridge:
) -> Union[List[ChunkResult], List[DocumentResult]]:
"""
Query documents in DataBridge.
Args:
query: Search query text
return_type: Type of results ("chunks" or "documents")
filters: Optional metadata filters
k: Number of results (default: 4)
min_score: Minimum similarity threshold (default: 0.0)
Returns:
List[ChunkResult] or List[DocumentResult] depending on return_type
Example:
```python
# Query for chunks
@ -246,7 +248,7 @@ class DataBridge:
return_type="chunks",
filters={"department": "research"}
)
# Query for documents
docs = await db.query(
"machine learning",
@ -264,7 +266,7 @@ class DataBridge:
}
response = await self._request("POST", "query", request)
if return_type == "chunks":
return [ChunkResult(**r) for r in response]
return [DocumentResult(**r) for r in response]
@ -272,15 +274,17 @@ class DataBridge:
async def list_documents(
self,
skip: int = 0,
limit: int = 100
limit: int = 100,
filters: Optional[Dict[str, Any]] = None
) -> List[Document]:
"""
List accessible documents with pagination.
List accessible documents.
Args:
skip: Number of documents to skip
limit: Maximum number of documents to return
filters: Optional filters
Returns:
List[Document]: List of accessible documents
@ -290,12 +294,12 @@ class DataBridge:
docs = await db.list_documents(limit=10)
# Get next page
next_page = await db.list_documents(skip=10, limit=10)
next_page = await db.list_documents(skip=10, limit=10, filters={"department": "research"})
```
"""
response = await self._request(
"GET",
f"documents?skip={skip}&limit={limit}"
f"documents?skip={skip}&limit={limit}&filters={filters}"
)
return [Document(**doc) for doc in response]

View File

@ -24,16 +24,14 @@ class LocalDataBridge(DataBridge):
self._base_url = self._base_url.replace("https://", "http://")
# Get DataBridge URI from environment
# Format: databridge://owner_id:token@host
DATABRIDGE_URI = "databridge://test_dev:eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ0eXBlIjoiZGV2ZWxvcGVyIiwiZW50aXR5X2lkIjoidGVzdF9kZXYiLCJwZXJtaXNzaW9ucyI6WyJyZWFkIiwid3JpdGUiLCJhZG1pbiJdLCJleHAiOjE3MzU3ODk3NTd9.lYL1czdIeclMIl8BohTd3C9lpnvMi1gdNGpiec9sXv4@127.0.0.1:8000"
print(f"DATABRIDGE_URI: {DATABRIDGE_URI}")
# Can be generated using generate_local_uri.py
DATABRIDGE_URI = ""
async def demonstrate_text_ingestion(db: LocalDataBridge):
"""Demonstrate text document ingestion"""
print("\n=== Text Ingestion Example ===")
# Ingest a text document with metadata
doc = await db.ingest_text(
content=(
@ -49,25 +47,25 @@ async def demonstrate_text_ingestion(db: LocalDataBridge):
"difficulty": "intermediate"
}
)
print("✓ Text document ingested")
print(f" ID: {doc.external_id}")
print(f" Content Type: {doc.content_type}")
print(f" Tags: {doc.metadata.get('tags')}")
return doc.external_id
async def demonstrate_file_ingestion(db: LocalDataBridge):
"""Demonstrate file document ingestion"""
print("\n=== File Ingestion Example ===")
# Create a sample PDF file
pdf_path = Path("sample.pdf")
if not pdf_path.exists():
print("× Sample PDF not found, skipping file ingestion")
return
# Method 1: Ingest using file path
doc1 = await db.ingest_file(
file=pdf_path,
@ -82,7 +80,7 @@ async def demonstrate_file_ingestion(db: LocalDataBridge):
print("✓ File ingested from path")
print(f" ID: {doc1.external_id}")
print(f" Storage Info: {doc1.storage_info}")
# Method 2: Ingest using file object
with open(pdf_path, "rb") as f:
doc2 = await db.ingest_file(
@ -92,14 +90,14 @@ async def demonstrate_file_ingestion(db: LocalDataBridge):
)
print("✓ File ingested from file object")
print(f" ID: {doc2.external_id}")
return doc1.external_id
async def demonstrate_querying(db: LocalDataBridge, doc_id: str):
"""Demonstrate document querying"""
print("\n=== Querying Example ===")
# Query 1: Search for chunks
print("\nSearching for chunks about machine learning:")
chunks = await db.query(
@ -108,12 +106,12 @@ async def demonstrate_querying(db: LocalDataBridge, doc_id: str):
k=2,
filters={"category": "technical"}
)
for chunk in chunks:
print(f"\nChunk from document {chunk.document_id}")
print(f"Score: {chunk.score:.2f}")
print(f"Content: {chunk.content[:200]}...")
# Query 2: Search for documents
print("\nSearching for documents about deep learning:")
docs = await db.query(
@ -121,7 +119,7 @@ async def demonstrate_querying(db: LocalDataBridge, doc_id: str):
return_type="documents",
filters={"tags": ["ML", "AI"]}
)
for doc in docs:
print(f"\nDocument: {doc.document_id}")
print(f"Score: {doc.score:.2f}")
@ -131,13 +129,13 @@ async def demonstrate_querying(db: LocalDataBridge, doc_id: str):
async def demonstrate_document_management(db: LocalDataBridge):
"""Demonstrate document management operations"""
print("\n=== Document Management Example ===")
# List documents with pagination
print("\nListing first 5 documents:")
docs = await db.list_documents(limit=5)
for doc in docs:
print(f"- {doc.external_id}: {doc.metadata.get('title', 'Untitled')}")
if docs:
# Get specific document details
doc_id = docs[0].external_id