mirror of
https://github.com/james-m-jordan/morphik-core.git
synced 2025-05-09 19:32:38 +00:00
restructuring and WIP api and sdk changes
This commit is contained in:
parent
30577dc0ff
commit
1a926c7be0
0
core/__init__.py
Normal file
0
core/__init__.py
Normal file
274
core/api.py
Normal file
274
core/api.py
Normal file
@ -0,0 +1,274 @@
|
||||
from fastapi import FastAPI, HTTPException, Depends, Header, Request, status
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import JSONResponse
|
||||
from typing import Dict, Any, List, Optional, Annotated
|
||||
from pydantic import BaseModel, Field
|
||||
import jwt
|
||||
import os
|
||||
from datetime import datetime, UTC
|
||||
import logging
|
||||
from .vector_store.mongo_vector_store import MongoDBAtlasVectorStore
|
||||
from .embedding_model.openai_embedding_model import OpenAIEmbeddingModel
|
||||
from .parser.unstructured_parser import UnstructuredAPIParser
|
||||
from .planner.simple_planner import SimpleRAGPlanner
|
||||
from .document import Document, DocumentChunk
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Initialize FastAPI app
|
||||
app = FastAPI(
|
||||
title="DataBridge API",
|
||||
description="REST API for DataBridge document ingestion and querying",
|
||||
version="1.0.0"
|
||||
)
|
||||
|
||||
# Add CORS middleware
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=["*"],
|
||||
allow_credentials=True,
|
||||
allow_methods=["*"],
|
||||
allow_headers=["*"],
|
||||
)
|
||||
|
||||
|
||||
class DataBridgeException(HTTPException):
|
||||
def __init__(self, detail: str, status_code: int = 400):
|
||||
super().__init__(status_code=status_code, detail=detail)
|
||||
|
||||
|
||||
class AuthenticationError(DataBridgeException):
|
||||
def __init__(self, detail: str = "Authentication failed"):
|
||||
super().__init__(detail=detail, status_code=status.HTTP_401_UNAUTHORIZED)
|
||||
|
||||
|
||||
class ServiceConfig:
|
||||
"""Service-wide configuration and component management"""
|
||||
def __init__(self):
|
||||
self.jwt_secret = os.getenv("JWT_SECRET_KEY")
|
||||
if not self.jwt_secret:
|
||||
raise ValueError("JWT_SECRET_KEY environment variable not set")
|
||||
|
||||
# Required environment variables
|
||||
required_vars = {
|
||||
"MONGODB_URI": "MongoDB connection string",
|
||||
"OPENAI_API_KEY": "OpenAI API key",
|
||||
"UNSTRUCTURED_API_KEY": "Unstructured API key"
|
||||
}
|
||||
|
||||
missing = [f"{var} ({desc})" for var, desc in required_vars.items() if not os.getenv(var)]
|
||||
if missing:
|
||||
raise ValueError(f"Missing required environment variables: {', '.join(missing)}")
|
||||
|
||||
# Initialize core components
|
||||
self._init_components()
|
||||
|
||||
def _init_components(self):
|
||||
"""Initialize service components"""
|
||||
try:
|
||||
self.vector_store = MongoDBAtlasVectorStore(
|
||||
connection_string=os.getenv("MONGODB_URI"),
|
||||
database_name=os.getenv("DB_NAME", "databridge"),
|
||||
collection_name=os.getenv("COLLECTION_NAME", "embeddings")
|
||||
)
|
||||
|
||||
self.embedding_model = OpenAIEmbeddingModel(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
model_name=os.getenv("EMBEDDING_MODEL", "text-embedding-3-small")
|
||||
)
|
||||
|
||||
self.parser = UnstructuredAPIParser(
|
||||
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
|
||||
chunk_size=int(os.getenv("CHUNK_SIZE", "1000")),
|
||||
chunk_overlap=int(os.getenv("CHUNK_OVERLAP", "200"))
|
||||
)
|
||||
|
||||
self.planner = SimpleRAGPlanner(
|
||||
default_k=int(os.getenv("DEFAULT_K", "4"))
|
||||
)
|
||||
except Exception as e:
|
||||
raise ValueError(f"Failed to initialize components: {str(e)}")
|
||||
|
||||
async def verify_token(self, token: str, owner_id: str) -> bool:
|
||||
"""Verify JWT token and owner_id"""
|
||||
try:
|
||||
payload = jwt.decode(token, self.jwt_secret, algorithms=["HS256"])
|
||||
if payload.get("owner_id") != owner_id:
|
||||
raise AuthenticationError("Owner ID mismatch")
|
||||
if datetime.fromtimestamp(payload["exp"], UTC) < datetime.now(UTC):
|
||||
raise AuthenticationError("Token has expired")
|
||||
return True
|
||||
except jwt.InvalidTokenError:
|
||||
raise AuthenticationError("Invalid token")
|
||||
except Exception as e:
|
||||
raise AuthenticationError(f"Authentication failed: {str(e)}")
|
||||
|
||||
|
||||
# Initialize service
|
||||
service = ServiceConfig()
|
||||
|
||||
|
||||
# Request/Response Models
|
||||
class IngestRequest(BaseModel):
|
||||
content: str = Field(..., description="Document content (text or base64)")
|
||||
metadata: Dict[str, Any] = Field(default_factory=dict, description="Document metadata")
|
||||
|
||||
|
||||
class QueryRequest(BaseModel):
|
||||
query: str = Field(..., description="Query string")
|
||||
k: Optional[int] = Field(default=4, description="Number of results to return")
|
||||
filters: Optional[Dict[str, Any]] = Field(default=None,
|
||||
description="Optional metadata filters")
|
||||
|
||||
|
||||
class IngestResponse(BaseModel):
|
||||
document_id: str = Field(..., description="Ingested document ID")
|
||||
message: str = Field(default="Document ingested successfully")
|
||||
|
||||
|
||||
class QueryResponse(BaseModel):
|
||||
results: List[Dict[str, Any]] = Field(..., description="Query results")
|
||||
total_results: int = Field(..., description="Total number of results")
|
||||
|
||||
|
||||
# Authentication dependency
|
||||
async def verify_auth(
|
||||
owner_id: Annotated[str, Header(alias="X-Owner-ID")],
|
||||
auth_token: Annotated[str, Header(alias="X-Auth-Token")]
|
||||
) -> str:
|
||||
"""Verify authentication headers"""
|
||||
await service.verify_token(auth_token, owner_id)
|
||||
return owner_id
|
||||
|
||||
|
||||
# Error handler middleware
|
||||
@app.middleware("http")
|
||||
async def error_handler(request: Request, call_next):
|
||||
try:
|
||||
return await call_next(request)
|
||||
except DataBridgeException as e:
|
||||
return JSONResponse(
|
||||
status_code=e.status_code,
|
||||
content={"error": e.detail}
|
||||
)
|
||||
except Exception as e:
|
||||
logger.exception("Unexpected error")
|
||||
return JSONResponse(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
content={"error": "Internal server error"}
|
||||
)
|
||||
|
||||
|
||||
# API Routes
|
||||
@app.post("/ingest", response_model=IngestResponse)
|
||||
async def ingest_document(
|
||||
request: IngestRequest,
|
||||
owner_id: str = Depends(verify_auth)
|
||||
) -> IngestResponse:
|
||||
"""
|
||||
Ingest a document into DataBridge.
|
||||
All configuration and credentials are handled server-side.
|
||||
"""
|
||||
logger.info(f"Ingesting document for owner {owner_id}")
|
||||
|
||||
# Add owner_id to metadata
|
||||
request.metadata['owner_id'] = owner_id
|
||||
|
||||
# Create document
|
||||
doc = Document(request.content, request.metadata, owner_id)
|
||||
|
||||
# Parse into chunks
|
||||
chunk_texts = service.parser.parse(request.content, request.metadata)
|
||||
# Create embeddings and chunks
|
||||
chunks = []
|
||||
for chunk_text in chunk_texts:
|
||||
embedding = await service.embedding_model.embed(chunk_text)
|
||||
chunk = DocumentChunk(chunk_text, embedding, doc.id)
|
||||
chunk.metadata = {
|
||||
'owner_id': owner_id,
|
||||
**request.metadata
|
||||
}
|
||||
chunks.append(chunk)
|
||||
|
||||
# Store in vector store
|
||||
if not service.vector_store.store_embeddings(chunks):
|
||||
raise DataBridgeException(
|
||||
"Failed to store embeddings",
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR
|
||||
)
|
||||
|
||||
return IngestResponse(document_id=doc.id)
|
||||
|
||||
|
||||
@app.post("/query", response_model=QueryResponse)
|
||||
async def query_documents(
|
||||
request: QueryRequest,
|
||||
owner_id: str = Depends(verify_auth)
|
||||
) -> QueryResponse:
|
||||
"""
|
||||
Query documents in DataBridge.
|
||||
All configuration and credentials are handled server-side.
|
||||
"""
|
||||
logger.info(f"Processing query for owner {owner_id}")
|
||||
print("ADILOG ")
|
||||
# Create plan
|
||||
plan = service.planner.plan_retrieval(request.query, k=request.k)
|
||||
|
||||
# Get query embedding
|
||||
query_embedding = await service.embedding_model.embed(request.query)
|
||||
|
||||
# Query vector store
|
||||
chunks = service.vector_store.query_similar(
|
||||
query_embedding,
|
||||
k=plan["k"],
|
||||
owner_id=owner_id,
|
||||
filters=request.filters
|
||||
)
|
||||
|
||||
# Format results
|
||||
results = [
|
||||
{
|
||||
"content": chunk.content,
|
||||
"doc_id": chunk.doc_id,
|
||||
"chunk_id": chunk.id,
|
||||
"score": getattr(chunk, "score", None),
|
||||
"metadata": {k:v for k,v in chunk.metadata.items() if k != 'owner_id'}
|
||||
}
|
||||
for chunk in chunks
|
||||
]
|
||||
|
||||
return QueryResponse(
|
||||
results=results,
|
||||
total_results=len(results)
|
||||
)
|
||||
|
||||
|
||||
# Health check endpoint
|
||||
@app.get("/health")
|
||||
async def health_check():
|
||||
"""Check service health"""
|
||||
try:
|
||||
# Verify MongoDB connection
|
||||
service.vector_store.collection.find_one({})
|
||||
return {"status": "healthy"}
|
||||
except Exception as e:
|
||||
raise DataBridgeException(
|
||||
f"Service unhealthy: {str(e)}",
|
||||
status_code=status.HTTP_503_SERVICE_UNAVAILABLE
|
||||
)
|
||||
|
||||
|
||||
# Startup and shutdown events
|
||||
@app.on_event("startup")
|
||||
async def startup_event():
|
||||
"""Verify all connections on startup"""
|
||||
logger.info("Starting DataBridge service")
|
||||
await health_check()
|
||||
|
||||
|
||||
@app.on_event("shutdown")
|
||||
async def shutdown_event():
|
||||
"""Cleanup on shutdown"""
|
||||
logger.info("Shutting down DataBridge service")
|
35
core/auth.py
Normal file
35
core/auth.py
Normal file
@ -0,0 +1,35 @@
|
||||
from fastapi import Request, HTTPException
|
||||
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
|
||||
import jwt
|
||||
|
||||
security = HTTPBearer()
|
||||
|
||||
|
||||
class DataBridgeAuth:
|
||||
def __init__(self, secret_key: str):
|
||||
self.secret_key = secret_key
|
||||
|
||||
async def __call__(self, request: Request, credentials: HTTPAuthorizationCredentials = Depends(security)) -> str:
|
||||
try:
|
||||
token = credentials.credentials
|
||||
payload = jwt.decode(token, self.secret_key, algorithms=["HS256"])
|
||||
|
||||
# Validate owner_id from token matches header
|
||||
owner_id = request.headers.get("X-Owner-ID")
|
||||
if owner_id != payload.get("owner_id"):
|
||||
raise HTTPException(
|
||||
status_code=401,
|
||||
detail="Owner ID mismatch"
|
||||
)
|
||||
|
||||
return owner_id
|
||||
except jwt.ExpiredSignatureError:
|
||||
raise HTTPException(
|
||||
status_code=401,
|
||||
detail="Token has expired"
|
||||
)
|
||||
except jwt.InvalidTokenError:
|
||||
raise HTTPException(
|
||||
status_code=401,
|
||||
detail="Invalid token"
|
||||
)
|
@ -1,10 +1,10 @@
|
||||
from typing import Dict, Any, List
|
||||
from databridge_uri import DataBridgeURI
|
||||
from document import Document, DocumentChunk
|
||||
from mongo_vector_store import MongoDBAtlasVectorStore
|
||||
from openai_embedding_model import OpenAIEmbeddingModel
|
||||
from unstructured_parser import UnstructuredAPIParser
|
||||
from simple_planner import SimpleRAGPlanner
|
||||
from .databridge_uri import DataBridgeURI
|
||||
from .document import Document, DocumentChunk
|
||||
from .vector_store.mongo_vector_store import MongoDBAtlasVectorStore
|
||||
from .embedding_model.openai_embedding_model import OpenAIEmbeddingModel
|
||||
from .parser.unstructured_parser import UnstructuredAPIParser
|
||||
from .planner.simple_planner import SimpleRAGPlanner
|
||||
|
||||
|
||||
class DataBridge:
|
0
core/embedding_model/__init__.py
Normal file
0
core/embedding_model/__init__.py
Normal file
@ -1,6 +1,6 @@
|
||||
from typing import List, Union
|
||||
import openai
|
||||
from base_embedding_model import BaseEmbeddingModel
|
||||
from .base_embedding_model import BaseEmbeddingModel
|
||||
|
||||
|
||||
class OpenAIEmbeddingModel(BaseEmbeddingModel):
|
27
core/main.py
Normal file
27
core/main.py
Normal file
@ -0,0 +1,27 @@
|
||||
from fastapi import FastAPI, Depends
|
||||
from .api import app as api_app
|
||||
from .auth import DataBridgeAuth
|
||||
import os
|
||||
|
||||
app = FastAPI()
|
||||
auth = DataBridgeAuth(secret_key=os.getenv("JWT_SECRET_KEY", "your-secret-key"))
|
||||
|
||||
# Mount the API with authentication
|
||||
app.mount("/api/v1", api_app)
|
||||
|
||||
# Add authentication middleware to all routes
|
||||
@app.middleware("http")
|
||||
async def authenticate_requests(request: Request, call_next):
|
||||
if request.url.path.startswith("/api/v1"):
|
||||
try:
|
||||
await auth(request)
|
||||
except HTTPException as e:
|
||||
return JSONResponse(
|
||||
status_code=e.status_code,
|
||||
content={"detail": e.detail}
|
||||
)
|
||||
return await call_next(request)
|
||||
|
||||
if __name__ == "__main__":
|
||||
import uvicorn
|
||||
uvicorn.run(app, host="0.0.0.0", port=8000)
|
0
core/parser/__init__.py
Normal file
0
core/parser/__init__.py
Normal file
@ -1,5 +1,5 @@
|
||||
from typing import Dict, Any, List
|
||||
from base_parser import BaseParser
|
||||
from .base_parser import BaseParser
|
||||
from unstructured.partition.auto import partition
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
import os
|
0
core/planner/__init__.py
Normal file
0
core/planner/__init__.py
Normal file
@ -1,5 +1,5 @@
|
||||
from typing import Dict, Any
|
||||
from base_planner import BasePlanner
|
||||
from .base_planner import BasePlanner
|
||||
|
||||
|
||||
class SimpleRAGPlanner(BasePlanner):
|
0
core/vector_store/__init__.py
Normal file
0
core/vector_store/__init__.py
Normal file
@ -1,6 +1,6 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import List
|
||||
from document import DocumentChunk
|
||||
from core.document import DocumentChunk
|
||||
|
||||
|
||||
class BaseVectorStore(ABC):
|
@ -1,7 +1,7 @@
|
||||
from typing import List, Dict, Any
|
||||
from pymongo import MongoClient
|
||||
from base_vector_store import BaseVectorStore
|
||||
from document import DocumentChunk
|
||||
from .base_vector_store import BaseVectorStore
|
||||
from core.document import DocumentChunk
|
||||
|
||||
|
||||
class MongoDBAtlasVectorStore(BaseVectorStore):
|
||||
@ -45,18 +45,21 @@ class MongoDBAtlasVectorStore(BaseVectorStore):
|
||||
documents = []
|
||||
for chunk in chunks:
|
||||
doc = {
|
||||
"_id": chunk.id, # Use chunk.id as MongoDB _id
|
||||
"_id": chunk.id,
|
||||
"text": chunk.content,
|
||||
"embedding": chunk.embedding,
|
||||
"doc_id": chunk.doc_id,
|
||||
"owner_id": chunk.metadata.get("owner_id"),
|
||||
"metadata": chunk.metadata
|
||||
}
|
||||
print("BHAU")
|
||||
print(doc)
|
||||
documents.append(doc)
|
||||
|
||||
if documents:
|
||||
# Use ordered=False to continue even if some inserts fail
|
||||
result = self.collection.insert_many(documents, ordered=False)
|
||||
print(result)
|
||||
return len(result.inserted_ids) > 0
|
||||
return True
|
||||
|
||||
@ -89,8 +92,11 @@ class MongoDBAtlasVectorStore(BaseVectorStore):
|
||||
}
|
||||
}
|
||||
]
|
||||
# print("ADILOG: " + str(pipeline))
|
||||
|
||||
results = list(self.collection.aggregate(pipeline))
|
||||
print("ADILOG")
|
||||
print(results)
|
||||
chunks = []
|
||||
|
||||
for result in results:
|
0
examples/__init__.py
Normal file
0
examples/__init__.py
Normal file
@ -1,6 +1,8 @@
|
||||
from datetime import datetime, timedelta, UTC # Note: using UTC for timezone awareness
|
||||
import sys; sys.path.append('.')
|
||||
|
||||
from datetime import datetime, timedelta, UTC
|
||||
import base64
|
||||
from databridge import DataBridge
|
||||
from core.databridge import DataBridge
|
||||
import jwt
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
@ -47,7 +49,7 @@ async def main():
|
||||
bridge = DataBridge(create_databridge_uri())
|
||||
|
||||
# Example: Ingest a PDF document
|
||||
with open("sample.pdf", "rb") as f:
|
||||
with open("examples/sample.pdf", "rb") as f:
|
||||
pdf_content = base64.b64encode(f.read()).decode()
|
||||
|
||||
await bridge.ingest_document(
|
89
printer.py
Normal file
89
printer.py
Normal file
@ -0,0 +1,89 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
def should_ignore_directory(dirname):
|
||||
"""
|
||||
Check if directory should be ignored.
|
||||
|
||||
Args:
|
||||
dirname (str): Name of the directory
|
||||
|
||||
Returns:
|
||||
bool: True if directory should be ignored, False otherwise
|
||||
"""
|
||||
# List of directory names to ignore
|
||||
ignore_dirs = {
|
||||
'venv',
|
||||
'env',
|
||||
'.venv',
|
||||
'virtualenv',
|
||||
'__pycache__',
|
||||
'.pytest_cache',
|
||||
'.mypy_cache',
|
||||
'.tox'
|
||||
}
|
||||
return dirname in ignore_dirs
|
||||
|
||||
def aggregate_python_files(root_dir, output_file, script_name):
|
||||
"""
|
||||
Recursively search through directories starting from root_dir,
|
||||
find all Python files, and write their contents to a single output file.
|
||||
Ignores virtual environment directories, __init__.py files and the script itself.
|
||||
|
||||
Args:
|
||||
root_dir (str): The root directory to start the search from
|
||||
output_file (str): The name of the output file to create
|
||||
script_name (str): Name of this script to ignore
|
||||
"""
|
||||
# Convert root_dir to absolute path
|
||||
root_dir = os.path.abspath(root_dir)
|
||||
|
||||
# Use with statement to properly handle file opening/closing
|
||||
with open(output_file, 'w', encoding='utf-8') as outfile:
|
||||
# Walk through all directories
|
||||
for dirpath, dirnames, filenames in os.walk(root_dir, topdown=True):
|
||||
# Modify dirnames in place to skip ignored directories
|
||||
dirnames[:] = [d for d in dirnames if not should_ignore_directory(d)]
|
||||
|
||||
# Filter for Python files, excluding __init__.py and this script
|
||||
python_files = [
|
||||
f for f in filenames
|
||||
if f.endswith('.py')
|
||||
and f != '__init__.py'
|
||||
and f != script_name
|
||||
and f != output_file
|
||||
]
|
||||
|
||||
for py_file in python_files:
|
||||
# Get the full file path
|
||||
file_path = os.path.join(dirpath, py_file)
|
||||
# Get relative path from root_dir
|
||||
rel_path = os.path.relpath(file_path, root_dir)
|
||||
|
||||
try:
|
||||
# Read the content of the Python file
|
||||
with open(file_path, 'r', encoding='utf-8') as infile:
|
||||
content = infile.read()
|
||||
|
||||
# Write the file path and contents to the output file
|
||||
outfile.write(f"{rel_path}\n")
|
||||
outfile.write(content)
|
||||
outfile.write("\n\n" + "="*80 + "\n\n") # Separator between files
|
||||
|
||||
except Exception as e:
|
||||
outfile.write(f"Error reading {rel_path}: {str(e)}\n\n")
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Get the name of this script
|
||||
script_name = os.path.basename(__file__)
|
||||
|
||||
# Get current directory as default root
|
||||
current_dir = os.getcwd()
|
||||
|
||||
# Output file name
|
||||
output_file = "python_files_contents.txt"
|
||||
|
||||
print(f"Starting to process Python files from: {current_dir}")
|
||||
print(f"Ignoring {script_name}, all __init__.py files, and virtual environment directories")
|
||||
aggregate_python_files(current_dir, output_file, script_name)
|
||||
print(f"Finished! Results written to: {output_file}")
|
0
sdks/python/README.md
Normal file
0
sdks/python/README.md
Normal file
5
sdks/python/databridge/__init__.py
Normal file
5
sdks/python/databridge/__init__.py
Normal file
@ -0,0 +1,5 @@
|
||||
from .client import DataBridge
|
||||
from .exceptions import DataBridgeError
|
||||
from .types import ContentType
|
||||
|
||||
__all__ = ['DataBridge', 'DataBridgeError', 'ContentType']
|
198
sdks/python/databridge/client.py
Normal file
198
sdks/python/databridge/client.py
Normal file
@ -0,0 +1,198 @@
|
||||
from typing import Dict, Any, List, Optional, Union
|
||||
import httpx
|
||||
from urllib.parse import urlparse
|
||||
import jwt
|
||||
from datetime import datetime, UTC
|
||||
import asyncio
|
||||
from dataclasses import dataclass
|
||||
from .exceptions import AuthenticationError
|
||||
from .types import ContentType
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class QueryResult:
|
||||
"""Structured query result"""
|
||||
content: str
|
||||
doc_id: str
|
||||
chunk_id: str
|
||||
score: Optional[float]
|
||||
metadata: Dict[str, Any]
|
||||
|
||||
|
||||
class DataBridge:
|
||||
"""
|
||||
DataBridge client for document ingestion and querying.
|
||||
|
||||
Usage:
|
||||
db = DataBridge("databridge://owner123:token@databridge.local")
|
||||
doc_id = await db.ingest_document("content", {"title": "My Doc"})
|
||||
results = await db.query("What is...")
|
||||
"""
|
||||
def __init__(
|
||||
self,
|
||||
uri: str,
|
||||
base_url: str = "https://api.databridge.ai/v1",
|
||||
timeout: int = 30,
|
||||
max_retries: int = 3
|
||||
):
|
||||
self._base_url = base_url.rstrip('/')
|
||||
self._timeout = timeout
|
||||
self._max_retries = max_retries
|
||||
self._client = httpx.AsyncClient(timeout=timeout)
|
||||
self._setup_auth(uri)
|
||||
|
||||
def _setup_auth(self, uri: str) -> None:
|
||||
"""Setup authentication from URI"""
|
||||
try:
|
||||
parsed = urlparse(uri)
|
||||
if not parsed.netloc:
|
||||
raise ValueError("Invalid URI format")
|
||||
|
||||
auth_parts = parsed.netloc.split('@')[0].split(':')
|
||||
if len(auth_parts) != 2:
|
||||
raise ValueError("URI must include owner_id and auth_token")
|
||||
|
||||
self._owner_id = auth_parts[0]
|
||||
self._auth_token = auth_parts[1]
|
||||
|
||||
# Validate token structure (not signature)
|
||||
try:
|
||||
decoded = jwt.decode(self._auth_token, options={"verify_signature": False})
|
||||
self._token_expiry = datetime.fromtimestamp(decoded['exp'], UTC)
|
||||
except jwt.InvalidTokenError as e:
|
||||
raise ValueError(f"Invalid auth token format: {str(e)}")
|
||||
|
||||
except Exception as e:
|
||||
raise AuthenticationError(f"Failed to setup authentication: {str(e)}")
|
||||
|
||||
async def _make_request(
|
||||
self,
|
||||
method: str,
|
||||
endpoint: str,
|
||||
data: Dict[str, Any] = None,
|
||||
retry_count: int = 0
|
||||
) -> Dict[str, Any]:
|
||||
"""Make authenticated HTTP request with retries"""
|
||||
# if datetime.now(UTC) > self._token_expiry:
|
||||
# raise AuthenticationError("Authentication token has expired")
|
||||
headers = {
|
||||
"X-Owner-ID": self._owner_id,
|
||||
"X-Auth-Token": self._auth_token,
|
||||
"Content-Type": "application/json"
|
||||
}
|
||||
|
||||
try:
|
||||
response = await self._client.request(
|
||||
method,
|
||||
f"http://localhost:8000/{endpoint.lstrip('/')}",
|
||||
json=data,
|
||||
headers=headers
|
||||
)
|
||||
|
||||
response.raise_for_status()
|
||||
return response.json()
|
||||
|
||||
except httpx.HTTPStatusError as e:
|
||||
if e.response.status_code == 401:
|
||||
raise AuthenticationError("Authentication failed: " + str(e))
|
||||
elif e.response.status_code >= 500 and retry_count < self._max_retries:
|
||||
await asyncio.sleep(2 ** retry_count) # Exponential backoff
|
||||
return await self._make_request(method, endpoint, data, retry_count + 1)
|
||||
else:
|
||||
raise ConnectionError(f"Request failed: {e.response.text}")
|
||||
except Exception as e:
|
||||
raise ConnectionError(f"Request failed: {str(e)}")
|
||||
|
||||
async def ingest_document(
|
||||
self,
|
||||
content: Union[str, bytes],
|
||||
metadata: Optional[Dict[str, Any]] = None,
|
||||
content_type: ContentType = ContentType.TEXT
|
||||
) -> str:
|
||||
"""
|
||||
Ingest a document into DataBridge.
|
||||
|
||||
Args:
|
||||
content: Document content (string or bytes)
|
||||
metadata: Optional document metadata
|
||||
content_type: Type of the content being ingested
|
||||
|
||||
Returns:
|
||||
Document ID of the ingested document
|
||||
"""
|
||||
if isinstance(content, bytes):
|
||||
import base64
|
||||
content = base64.b64encode(content).decode()
|
||||
metadata = metadata or {}
|
||||
metadata["is_base64"] = True
|
||||
|
||||
metadata = metadata or {}
|
||||
metadata["content_type"] = content_type
|
||||
|
||||
response = await self._make_request(
|
||||
"POST",
|
||||
"ingest",
|
||||
{
|
||||
"content": content,
|
||||
"metadata": metadata
|
||||
}
|
||||
)
|
||||
|
||||
return response["document_id"]
|
||||
|
||||
async def query(
|
||||
self,
|
||||
query: str,
|
||||
k: int = 4,
|
||||
filters: Optional[Dict[str, Any]] = None
|
||||
) -> List[QueryResult]:
|
||||
"""
|
||||
Query documents in DataBridge.
|
||||
|
||||
Args:
|
||||
query: Query string
|
||||
k: Number of results to return
|
||||
filters: Optional metadata filters
|
||||
|
||||
Returns:
|
||||
List of QueryResult objects
|
||||
"""
|
||||
response = await self._make_request(
|
||||
"POST",
|
||||
"query",
|
||||
{
|
||||
"query": query,
|
||||
"k": k,
|
||||
"filters": filters
|
||||
}
|
||||
)
|
||||
|
||||
return [
|
||||
QueryResult(
|
||||
content=result["content"],
|
||||
doc_id=result["doc_id"],
|
||||
chunk_id=result["chunk_id"],
|
||||
score=result.get("score"),
|
||||
metadata=result.get("metadata", {})
|
||||
)
|
||||
for result in response["results"]
|
||||
]
|
||||
|
||||
async def close(self):
|
||||
"""Close the HTTP client"""
|
||||
await self._client.aclose()
|
||||
|
||||
async def __aenter__(self):
|
||||
"""Async context manager entry"""
|
||||
return self
|
||||
|
||||
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
||||
"""Async context manager exit"""
|
||||
await self.close()
|
||||
|
||||
def __repr__(self) -> str:
|
||||
"""Safe string representation"""
|
||||
return f"DataBridge(owner_id='{self._owner_id}')"
|
13
sdks/python/databridge/exceptions.py
Normal file
13
sdks/python/databridge/exceptions.py
Normal file
@ -0,0 +1,13 @@
|
||||
class DataBridgeError(Exception):
|
||||
"""Base exception for DataBridge SDK"""
|
||||
pass
|
||||
|
||||
|
||||
class AuthenticationError(DataBridgeError):
|
||||
"""Authentication related errors"""
|
||||
pass
|
||||
|
||||
|
||||
class ConnectionError(DataBridgeError):
|
||||
"""Connection related errors"""
|
||||
pass
|
9
sdks/python/databridge/types.py
Normal file
9
sdks/python/databridge/types.py
Normal file
@ -0,0 +1,9 @@
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class ContentType(str, Enum):
|
||||
"""Supported content types"""
|
||||
TEXT = "text/plain"
|
||||
PDF = "application/pdf"
|
||||
DOCX = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
HTML = "text/html"
|
197
sdks/python/examples/basic_usage.py
Normal file
197
sdks/python/examples/basic_usage.py
Normal file
@ -0,0 +1,197 @@
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from datetime import UTC, datetime, timedelta
|
||||
|
||||
from dotenv import load_dotenv
|
||||
import jwt
|
||||
|
||||
# we can't pip install, this basically acts like pip install.
|
||||
sdk_root = Path(__file__).parent.parent
|
||||
sys.path.append(str(sdk_root))
|
||||
|
||||
from databridge import DataBridge, ContentType, DataBridgeError
|
||||
|
||||
|
||||
def create_test_uri():
|
||||
"""Create a test URI with a valid JWT token"""
|
||||
token = jwt.encode(
|
||||
{
|
||||
'owner_id': 'test_user_123',
|
||||
'exp': datetime.now(UTC) + timedelta(days=30)
|
||||
},
|
||||
"your-secret-key-for-signing-tokens",
|
||||
algorithm='HS256'
|
||||
)
|
||||
return f"databridge://test_user_123:{token}@localhost:8000"
|
||||
|
||||
|
||||
async def example_text():
|
||||
"""Example of ingesting and querying text documents"""
|
||||
print("\n=== Text Document Example ===")
|
||||
load_dotenv()
|
||||
uri = os.getenv("DATABRIDGE_URI")
|
||||
if not uri:
|
||||
raise ValueError("Please set DATABRIDGE_URI environment variable")
|
||||
|
||||
db = DataBridge(create_test_uri())
|
||||
|
||||
try:
|
||||
# Ingest a simple text document
|
||||
content = """
|
||||
Machine learning (ML) is a type of artificial intelligence (AI) that allows
|
||||
software applications to become more accurate at predicting outcomes without
|
||||
being explicitly programmed to do so. Machine learning algorithms use historical
|
||||
data as input to predict new output values.
|
||||
"""
|
||||
|
||||
doc_id = await db.ingest_document(
|
||||
content=content,
|
||||
metadata={
|
||||
"title": "ML Introduction",
|
||||
"category": "tech",
|
||||
"tags": ["ML", "AI", "technology"]
|
||||
}
|
||||
)
|
||||
print(f"✓ Document ingested successfully (ID: {doc_id})")
|
||||
|
||||
# Query the document
|
||||
results = await db.query(
|
||||
query="What is machine learning?",
|
||||
k=1 # Get top result
|
||||
)
|
||||
|
||||
print("\nQuery Results:")
|
||||
for result in results:
|
||||
print(f"Content: {result.content.strip()}")
|
||||
print(f"Score: {result.score:.2f}")
|
||||
print(f"Metadata: {result.metadata}")
|
||||
|
||||
except DataBridgeError as e:
|
||||
print(f"× Error: {str(e)}")
|
||||
|
||||
finally:
|
||||
await db.close()
|
||||
|
||||
|
||||
async def example_pdf():
|
||||
"""Example of ingesting and querying PDF documents"""
|
||||
print("\n=== PDF Document Example ===")
|
||||
|
||||
uri = os.getenv("DATABRIDGE_URI")
|
||||
if not uri:
|
||||
raise ValueError("Please set DATABRIDGE_URI environment variable")
|
||||
|
||||
# Path to a sample PDF in the examples directory
|
||||
pdf_path = Path(__file__).parent / "sample.pdf"
|
||||
if not pdf_path.exists():
|
||||
print("× sample.pdf not found in examples directory")
|
||||
return
|
||||
|
||||
db = DataBridge(uri)
|
||||
|
||||
try:
|
||||
# Read and ingest PDF
|
||||
with open(pdf_path, "rb") as f:
|
||||
pdf_content = f.read()
|
||||
|
||||
doc_id = await db.ingest_document(
|
||||
content=pdf_content,
|
||||
metadata={
|
||||
"title": "Sample Document",
|
||||
"source": "examples",
|
||||
"file_type": "pdf"
|
||||
},
|
||||
content_type=ContentType.PDF
|
||||
)
|
||||
print(f"✓ PDF ingested successfully (ID: {doc_id})")
|
||||
|
||||
# Query the PDF content
|
||||
results = await db.query(
|
||||
query="What is the main topic of this document?",
|
||||
k=2, # Get top 2 results
|
||||
filters={"file_type": "pdf"} # Only search PDF documents
|
||||
)
|
||||
|
||||
print("\nQuery Results:")
|
||||
for i, result in enumerate(results, 1):
|
||||
print(f"\nResult {i}:")
|
||||
print(f"Content: {result.content[:200]}...")
|
||||
print(f"Score: {result.score:.2f}")
|
||||
print(f"Document ID: {result.doc_id}")
|
||||
|
||||
except DataBridgeError as e:
|
||||
print(f"× Error: {str(e)}")
|
||||
|
||||
finally:
|
||||
await db.close()
|
||||
|
||||
|
||||
async def example_batch():
|
||||
"""Example of batch operations"""
|
||||
print("\n=== Batch Operations Example ===")
|
||||
|
||||
uri = os.getenv("DATABRIDGE_URI")
|
||||
if not uri:
|
||||
raise ValueError("Please set DATABRIDGE_URI environment variable")
|
||||
|
||||
db = DataBridge(uri)
|
||||
|
||||
try:
|
||||
# Prepare multiple documents
|
||||
documents = [
|
||||
{
|
||||
"content": "Python is a programming language.",
|
||||
"metadata": {"category": "programming", "level": "basic"}
|
||||
},
|
||||
{
|
||||
"content": "JavaScript runs in the browser.",
|
||||
"metadata": {"category": "programming", "level": "basic"}
|
||||
},
|
||||
{
|
||||
"content": "Docker containers package applications.",
|
||||
"metadata": {"category": "devops", "level": "intermediate"}
|
||||
}
|
||||
]
|
||||
|
||||
# Ingest multiple documents
|
||||
doc_ids = []
|
||||
for doc in documents:
|
||||
doc_id = await db.ingest_document(
|
||||
content=doc["content"],
|
||||
metadata=doc["metadata"]
|
||||
)
|
||||
doc_ids.append(doc_id)
|
||||
print(f"✓ Ingested {len(doc_ids)} documents")
|
||||
|
||||
# Query with filters
|
||||
results = await db.query(
|
||||
query="What is Python?",
|
||||
filters={"category": "programming"}
|
||||
)
|
||||
|
||||
print("\nQuery Results (Programming category only):")
|
||||
for result in results:
|
||||
print(f"\nContent: {result.content}")
|
||||
print(f"Category: {result.metadata['category']}")
|
||||
print(f"Level: {result.metadata['level']}")
|
||||
|
||||
except DataBridgeError as e:
|
||||
print(f"× Error: {str(e)}")
|
||||
|
||||
finally:
|
||||
await db.close()
|
||||
|
||||
|
||||
async def main():
|
||||
"""Run all examples"""
|
||||
try:
|
||||
await example_text()
|
||||
await example_pdf()
|
||||
await example_batch()
|
||||
except Exception as e:
|
||||
print(f"× Main error: {str(e)}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
17
sdks/python/pyproject.toml
Normal file
17
sdks/python/pyproject.toml
Normal file
@ -0,0 +1,17 @@
|
||||
[build-system]
|
||||
requires = ["hatchling"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
[project]
|
||||
name = "databridge-client"
|
||||
version = "0.1.0"
|
||||
description = "Python client for DataBridge RAG service"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.8"
|
||||
dependencies = [
|
||||
"httpx>=0.24.0",
|
||||
"pyjwt>=2.0.0"
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
dev = ["pytest", "pytest-asyncio", "black", "isort"]
|
12
sdks/python/setup.py
Normal file
12
sdks/python/setup.py
Normal file
@ -0,0 +1,12 @@
|
||||
from setuptools import setup, find_packages
|
||||
|
||||
setup(
|
||||
name="databridge",
|
||||
version="0.1.0",
|
||||
packages=find_packages(),
|
||||
install_requires=[
|
||||
"httpx",
|
||||
"pyjwt",
|
||||
],
|
||||
python_requires=">=3.7",
|
||||
)
|
30
start_server.py
Normal file
30
start_server.py
Normal file
@ -0,0 +1,30 @@
|
||||
import uvicorn
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
|
||||
def main():
|
||||
# Load environment variables from .env file
|
||||
load_dotenv()
|
||||
|
||||
# Verify required environment variables
|
||||
required_vars = [
|
||||
"MONGODB_URI",
|
||||
"OPENAI_API_KEY",
|
||||
"UNSTRUCTURED_API_KEY",
|
||||
"JWT_SECRET_KEY"
|
||||
]
|
||||
|
||||
missing = [var for var in required_vars if not os.getenv(var)]
|
||||
if missing:
|
||||
raise ValueError(f"Missing required environment variables: {', '.join(missing)}")
|
||||
|
||||
# Start server
|
||||
uvicorn.run(
|
||||
"core.api:app",
|
||||
host="0.0.0.0", # Listen on all available interfaces
|
||||
port=8000,
|
||||
reload=True # Enable auto-reload during development
|
||||
)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
Loading…
x
Reference in New Issue
Block a user