bug fixes and end-to-end testing

This commit is contained in:
Arnav Agrawal 2024-12-17 21:40:38 -05:00
parent df8d7fcdd0
commit 4f2f221d40
13 changed files with 139 additions and 146 deletions

View File

@ -1,32 +1,13 @@
# MongoDB settings
MONGODB_URI=your_mongodb_connection_string
DB_NAME=DataBridgeTest
COLLECTION_NAME=test
MONGODB_URI="mongodb+srv://..."
DATABRIDGE_DB="DataBridgeDB"
DOCUMENTS_COLLECTION='documents' # Optional, default is 'documents'
CHUNKS_COLLECTION='document_chunks' # Optional, default is 'document_chunks'
# OpenAI settings
OPENAI_API_KEY=your_openai_api_key
EMBEDDING_MODEL=text-embedding-3-small
OPENAI_API_KEY="sk-proj-..."
# Unstructured API settings
UNSTRUCTURED_API_KEY=your_unstructured_api_key
UNSTRUCTURED_API_URL=https://api.unstructured.io
UNSTRUCTURED_API_KEY="..."
# Document processing settings
CHUNK_SIZE=1000
CHUNK_OVERLAP=200
DEFAULT_K=4
AWS_ACCESS_KEY="..."
AWS_SECRET_ACCESS_KEY="..."
# Storage settings
AWS_ACCESS_KEY=your_aws_access_key
AWS_SECRET_KEY=your_aws_secret_key
AWS_REGION=us-east-2
S3_BUCKET=databridge-storage
# Auth settings
JWT_SECRET_KEY=your_jwt_secret_key
JWT_ALGORITHM=HS256
# Server settings
HOST=127.0.0.1
PORT=8000
RELOAD=true
JWT_SECRET_KEY="..."

104
README.md
View File

@ -2,66 +2,104 @@
DataBridge is an extensible, open-source document processing and retrieval system designed for building document-based applications. It provides a modular architecture for integrating document parsing, embedding generation, and vector search capabilities.
## Table of Contents
- [Features](#features)
- [Starting the Server](#starting-the-server)
- [Quick Start](#quick-start)
- [Architecture](#architecture)
- [Current Integrations](#current-integrations)
- [Adding New Components](#adding-new-components)
- [API Documentation](#api-documentation)
- [Key Endpoints](#key-endpoints)
- [License](#license)
- [Contributing](#contributing)
## Features
- 🔌 **Extensible Architecture**: Built with modularity in mind - easily extend or replace any component:
- Document Parsing: Currently integrated with Unstructured API
- Vector Store: Currently using MongoDB Atlas
- Embedding Model: Currently using OpenAI
- Storage: Currently using AWS S3
- 🔌 **Extensible Architecture**: Modular design for easy component extension or replacement
- 🔍 **Vector Search**: Semantic search capabilities
- 🔐 **Authentication**: JWT-based auth with developer and end-user access modes
- 📊 **Metadata**: Rich metadata filtering and organization
- 📊 **Components**: Document Parsing (Unstructured API), Vector Store (MongoDB Atlas), Embedding Model (OpenAI), Storage (AWS S3)
- 🚀 **Python SDK**: Simple client SDK for quick integration
## Starting the Server
1. Clone the repository:
```bash
git clone https://github.com/databridge-org/databridge-core.git
```
2. Set up your Python environment (Python 3.12 is supported, but other versions may work):
```bash
cd databridge-core
python -m venv .venv
source .venv/bin/activate
```
3. Install the required dependencies:
```bash
pip install -r requirements.txt
```
4. Set up your environment variables, using the `.env.example` file as a reference, and creating a `.env` file in the project directory:
```bash
cp .env.example .env
```
<!-- TODO: Add instructions for setting up the environment variables, like setting up a MongoDB account, an OpenAI account, etc. -->
5. Generate a local URI:
```bash
python generate_local_uri.py
```
Copy the output and save it for use with the client SDK.
6. Start the server:
```bash
python start_server.py
```
*Tip*: Visit `http://localhost:8000/docs` for the complete OpenAPI documentation.
## Quick Start
Ensure the server is running, then use the SDK to ingest and query documents.
1. Install the SDK:
```bash
pip install databridge-client
```
2. Set up your environment variables:
```env
MONGODB_URI=your_mongodb_connection_string
OPENAI_API_KEY=your_openai_api_key
UNSTRUCTURED_API_KEY=your_unstructured_api_key
JWT_SECRET_KEY=your_jwt_secret
AWS_ACCESS_KEY=your_aws_access_key
AWS_SECRET_ACCESS_KEY=your_aws_secret_key
```
3. Start the server:
```bash
python start_server.py
```
4. Use the SDK:
2. Use the SDK:
```python
import asyncio
from databridge import DataBridge
async def main():
# Initialize client
db = DataBridge("databridge://owner_id:auth_token@your-domain.com")
db = DataBridge("your_databridge_uri_here", is_local=True)
files = ["annual_report_2022.pdf", "marketing_strategy.docx", "product_launch_presentation.pptx", "company_logo.png"]
# Ingest a document
doc_id = await db.ingest_document(
content="Your document content",
metadata={"title": "My Document"}
)
for file in files:
await db.ingest_file(
file=file,
file_name=file,
metadata={"category": "Company Related"} # Optionally add any metadata
)
# Query documents
results = await db.query(
query="What is...",
k=4 # Number of results
query="What did our target market say about our product?",
return_type="chunks",
filters={"category": "Company Related"}
)
await db.close()
print(results)
asyncio.run(main())
```
For other examples <!-- -like how to make xyz in 10 lines of code- --> check out our [documentation](https://databridge.gitbook.io/databridge-docs)!
## Architecture
DataBridge uses a modular architecture with the following base components that can be extended or replaced:

View File

@ -46,18 +46,23 @@ settings = get_settings()
# Initialize components
database = MongoDatabase(
**settings.get_mongodb_settings()
uri=settings.MONGODB_URI,
db_name=settings.DATABRIDGE_DB,
collection_name=settings.DOCUMENTS_COLLECTION
)
vector_store = MongoDBAtlasVectorStore(
settings.MONGODB_URI,
settings.DATABRIDGE_DB,
settings.CHUNKS_COLLECTION,
settings.VECTOR_INDEX_NAME
uri=settings.MONGODB_URI,
database_name=settings.DATABRIDGE_DB,
collection_name=settings.CHUNKS_COLLECTION,
index_name=settings.VECTOR_INDEX_NAME
)
storage = S3Storage(
**settings.get_storage_settings()
aws_access_key=settings.AWS_ACCESS_KEY,
aws_secret_key=settings.AWS_SECRET_ACCESS_KEY,
region_name=settings.AWS_REGION,
default_bucket=settings.S3_BUCKET
)
parser = UnstructuredAPIParser(

View File

@ -53,62 +53,6 @@ class Settings(BaseSettings):
PORT: int = Field(8000, env="PORT")
RELOAD: bool = Field(False, env="RELOAD")
def get_mongodb_settings(self) -> Dict[str, Any]:
"""Get MongoDB related settings."""
return {
"uri": self.MONGODB_URI,
"db_name": self.DATABRIDGE_DB,
"collection_name": self.DOCUMENTS_COLLECTION
}
def get_vector_store_settings(self) -> Dict[str, Any]:
"""Get vector store related settings."""
return {
"uri": self.MONGODB_URI,
"database_name": self.DATABRIDGE_DB,
"collection_name": self.CHUNKS_COLLECTION,
"index_name": self.VECTOR_INDEX_NAME
}
def get_storage_settings(self) -> Dict[str, Any]:
"""Get storage related settings."""
return {
"aws_access_key": self.AWS_ACCESS_KEY,
"aws_secret_key": self.AWS_SECRET_ACCESS_KEY,
"region_name": self.AWS_REGION,
"default_bucket": self.S3_BUCKET
}
def get_parser_settings(self) -> Dict[str, Any]:
"""Get document parser settings."""
return {
"api_key": self.UNSTRUCTURED_API_KEY,
"chunk_size": self.CHUNK_SIZE,
"chunk_overlap": self.CHUNK_OVERLAP
}
def get_embedding_settings(self) -> Dict[str, Any]:
"""Get embedding model settings."""
return {
"api_key": self.OPENAI_API_KEY,
"model_name": self.EMBEDDING_MODEL
}
def get_server_settings(self) -> Dict[str, Any]:
"""Get server related settings."""
return {
"host": self.HOST,
"port": self.PORT,
"reload": self.RELOAD,
}
def get_auth_settings(self) -> Dict[str, Any]:
"""Get authentication related settings."""
return {
"secret_key": self.JWT_SECRET_KEY,
"algorithm": self.JWT_ALGORITHM
}
class Config:
env_file = ".env"
case_sensitive = True

View File

@ -33,7 +33,6 @@ class UnstructuredAPIParser(BaseParser):
# Parse with unstructured
loader = UnstructuredLoader(
file=io.BytesIO(file),
content_type=content_type,
partition_via_api=True,
api_key=self.api_key,
chunking_strategy="by_title"

View File

@ -225,6 +225,7 @@ class DocumentService:
raise Exception("Failed to store chunk embeddings")
logger.debug("Stored chunk embeddings in vector store")
doc.chunk_ids = result
# Store document metadata
if not await self.db.store_document(doc):
raise Exception("Failed to store document metadata")

View File

@ -2,7 +2,7 @@ from datetime import datetime, timedelta, UTC
import jwt
from dotenv import load_dotenv
import os
import random
load_dotenv()
# Get JWT secret from env
@ -13,7 +13,7 @@ if not jwt_secret:
# Create payload
payload = {
"type": "developer",
"entity_id": "test_dev",
"entity_id": f"test_dev_{random.randint(0, 1000000)}",
"permissions": ["read", "write", "admin"],
"exp": datetime.now(UTC) + timedelta(days=30)
}

15
sdks/python/PUBLISH.md Normal file
View File

@ -0,0 +1,15 @@
# Publish to PyPI
- `cd` into the `sdks/python` directory
- Update the package version in `pyproject.toml` and `databridge/__init__.py`.
- Ensure you have the correct PyPI API key/certificates/ssh keys installed
```bash
# ensure you've activated the correct python environment
pip install build twine
rm -rf dist
python -m build
twine check dist/*
twine upload dist/*
```

View File

@ -1,7 +1,7 @@
from .client import DataBridge
from .exceptions import DataBridgeError, AuthenticationError
__version__ = "0.1.2"
__version__ = "0.1.4"
__all__ = [
"DataBridge",

View File

@ -1,9 +1,9 @@
import json
from typing import Dict, Any, List, Optional, Union, BinaryIO
from typing import Dict, Any, List, Literal, Optional, Union, BinaryIO
import httpx
from urllib.parse import urlparse
import jwt
from pydantic import BaseModel
from pydantic import BaseModel, Field, field_validator
from pathlib import Path
from io import BytesIO
@ -38,12 +38,27 @@ class ChunkResult(BaseModel):
download_url: Optional[str] = None
class DocumentContent(BaseModel):
"""Represents either a URL or content string"""
type: Literal["url", "string"]
value: str
filename: Optional[str] = Field(None, description="Filename when type is url")
@field_validator('filename')
def filename_only_for_url(cls, v, values):
if values.data.get('type') == 'string' and v is not None:
raise ValueError('filename can only be set when type is url')
if values.data.get('type') == 'url' and v is None:
raise ValueError('filename is required when type is url')
return v
class DocumentResult(BaseModel):
"""Query result at document level"""
score: float
document_id: str
metadata: Dict[str, Any]
content: Dict[str, str]
content: DocumentContent
class DataBridge:
@ -68,9 +83,14 @@ class DataBridge:
```
"""
def __init__(self, uri: str, timeout: int = 30):
def __init__(self, uri: str, timeout: int = 30, is_local: bool = False):
self._timeout = timeout
self._client = httpx.AsyncClient(timeout=timeout)
self._client = httpx.AsyncClient(timeout=timeout) if not is_local else httpx.AsyncClient(
timeout=timeout,
verify=False, # Disable SSL for localhost
http2=False # Force HTTP/1.1
)
self._is_local = is_local
self._setup_auth(uri)
def _setup_auth(self, uri: str) -> None:
@ -84,7 +104,7 @@ class DataBridge:
self._owner_id, self._auth_token = auth.split(':')
# Set base URL
self._base_url = f"{'http' if 'localhost' in host else 'https'}://{host}"
self._base_url = f"{'http' if self._is_local else 'https'}://{host}"
# Basic token validation
jwt.decode(self._auth_token, options={"verify_signature": False})

View File

@ -4,7 +4,7 @@ build-backend = "hatchling.build"
[project]
name = "databridge-client"
version = "0.1.2"
version = "0.1.4"
authors = [
{ name = "DataBridge", email = "databridgesuperuser@gmail.com" },
]
@ -14,6 +14,7 @@ requires-python = ">=3.8"
dependencies = [
"httpx>=0.24.0",
"pyjwt>=2.0.0",
"pydantic==2.10.3",
]
[tool.hatch.build.targets.wheel]

View File

@ -1,12 +0,0 @@
from setuptools import setup, find_packages
setup(
name="databridge-client",
version="0.1.2",
packages=find_packages(),
install_requires=[
"httpx",
"pyjwt",
],
python_requires=">=3.7",
)

View File

@ -15,6 +15,7 @@ def main():
"core.api:app",
host=settings.HOST,
port=settings.PORT,
loop="asyncio",
# reload=settings.RELOAD
)