bug fixes and end-to-end testing

This commit is contained in:
Arnav Agrawal 2024-12-17 21:40:38 -05:00
parent df8d7fcdd0
commit 4f2f221d40
13 changed files with 139 additions and 146 deletions

View File

@ -1,32 +1,13 @@
# MongoDB settings
MONGODB_URI=your_mongodb_connection_string
DB_NAME=DataBridgeTest
COLLECTION_NAME=test
MONGODB_URI="mongodb+srv://..."
DATABRIDGE_DB="DataBridgeDB"
DOCUMENTS_COLLECTION='documents' # Optional, default is 'documents'
CHUNKS_COLLECTION='document_chunks' # Optional, default is 'document_chunks'
# OpenAI settings
OPENAI_API_KEY=your_openai_api_key
EMBEDDING_MODEL=text-embedding-3-small
OPENAI_API_KEY="sk-proj-..."
# Unstructured API settings
UNSTRUCTURED_API_KEY=your_unstructured_api_key
UNSTRUCTURED_API_URL=https://api.unstructured.io
UNSTRUCTURED_API_KEY="..."
# Document processing settings
CHUNK_SIZE=1000
CHUNK_OVERLAP=200
DEFAULT_K=4
AWS_ACCESS_KEY="..."
AWS_SECRET_ACCESS_KEY="..."
# Storage settings
AWS_ACCESS_KEY=your_aws_access_key
AWS_SECRET_KEY=your_aws_secret_key
AWS_REGION=us-east-2
S3_BUCKET=databridge-storage
# Auth settings
JWT_SECRET_KEY=your_jwt_secret_key
JWT_ALGORITHM=HS256
# Server settings
HOST=127.0.0.1
PORT=8000
RELOAD=true
JWT_SECRET_KEY="..."

104
README.md
View File

@ -2,66 +2,104 @@
DataBridge is an extensible, open-source document processing and retrieval system designed for building document-based applications. It provides a modular architecture for integrating document parsing, embedding generation, and vector search capabilities.
## Table of Contents
- [Features](#features)
- [Starting the Server](#starting-the-server)
- [Quick Start](#quick-start)
- [Architecture](#architecture)
- [Current Integrations](#current-integrations)
- [Adding New Components](#adding-new-components)
- [API Documentation](#api-documentation)
- [Key Endpoints](#key-endpoints)
- [License](#license)
- [Contributing](#contributing)
## Features
- 🔌 **Extensible Architecture**: Built with modularity in mind - easily extend or replace any component:
- Document Parsing: Currently integrated with Unstructured API
- Vector Store: Currently using MongoDB Atlas
- Embedding Model: Currently using OpenAI
- Storage: Currently using AWS S3
- 🔌 **Extensible Architecture**: Modular design for easy component extension or replacement
- 🔍 **Vector Search**: Semantic search capabilities
- 🔐 **Authentication**: JWT-based auth with developer and end-user access modes
- 📊 **Metadata**: Rich metadata filtering and organization
- 📊 **Components**: Document Parsing (Unstructured API), Vector Store (MongoDB Atlas), Embedding Model (OpenAI), Storage (AWS S3)
- 🚀 **Python SDK**: Simple client SDK for quick integration
## Starting the Server
1. Clone the repository:
```bash
git clone https://github.com/databridge-org/databridge-core.git
```
2. Set up your Python environment (Python 3.12 is supported, but other versions may work):
```bash
cd databridge-core
python -m venv .venv
source .venv/bin/activate
```
3. Install the required dependencies:
```bash
pip install -r requirements.txt
```
4. Set up your environment variables, using the `.env.example` file as a reference, and creating a `.env` file in the project directory:
```bash
cp .env.example .env
```
<!-- TODO: Add instructions for setting up the environment variables, like setting up a MongoDB account, an OpenAI account, etc. -->
5. Generate a local URI:
```bash
python generate_local_uri.py
```
Copy the output and save it for use with the client SDK.
6. Start the server:
```bash
python start_server.py
```
*Tip*: Visit `http://localhost:8000/docs` for the complete OpenAPI documentation.
## Quick Start
Ensure the server is running, then use the SDK to ingest and query documents.
1. Install the SDK:
```bash
pip install databridge-client
```
2. Set up your environment variables:
```env
MONGODB_URI=your_mongodb_connection_string
OPENAI_API_KEY=your_openai_api_key
UNSTRUCTURED_API_KEY=your_unstructured_api_key
JWT_SECRET_KEY=your_jwt_secret
AWS_ACCESS_KEY=your_aws_access_key
AWS_SECRET_ACCESS_KEY=your_aws_secret_key
```
3. Start the server:
```bash
python start_server.py
```
4. Use the SDK:
2. Use the SDK:
```python
import asyncio
from databridge import DataBridge
async def main():
# Initialize client
db = DataBridge("databridge://owner_id:auth_token@your-domain.com")
db = DataBridge("your_databridge_uri_here", is_local=True)
files = ["annual_report_2022.pdf", "marketing_strategy.docx", "product_launch_presentation.pptx", "company_logo.png"]
# Ingest a document
doc_id = await db.ingest_document(
content="Your document content",
metadata={"title": "My Document"}
)
for file in files:
await db.ingest_file(
file=file,
file_name=file,
metadata={"category": "Company Related"} # Optionally add any metadata
)
# Query documents
results = await db.query(
query="What is...",
k=4 # Number of results
query="What did our target market say about our product?",
return_type="chunks",
filters={"category": "Company Related"}
)
await db.close()
print(results)
asyncio.run(main())
```
For other examples <!-- -like how to make xyz in 10 lines of code- --> check out our [documentation](https://databridge.gitbook.io/databridge-docs)!
## Architecture
DataBridge uses a modular architecture with the following base components that can be extended or replaced:

View File

@ -46,18 +46,23 @@ settings = get_settings()
# Initialize components
database = MongoDatabase(
**settings.get_mongodb_settings()
uri=settings.MONGODB_URI,
db_name=settings.DATABRIDGE_DB,
collection_name=settings.DOCUMENTS_COLLECTION
)
vector_store = MongoDBAtlasVectorStore(
settings.MONGODB_URI,
settings.DATABRIDGE_DB,
settings.CHUNKS_COLLECTION,
settings.VECTOR_INDEX_NAME
uri=settings.MONGODB_URI,
database_name=settings.DATABRIDGE_DB,
collection_name=settings.CHUNKS_COLLECTION,
index_name=settings.VECTOR_INDEX_NAME
)
storage = S3Storage(
**settings.get_storage_settings()
aws_access_key=settings.AWS_ACCESS_KEY,
aws_secret_key=settings.AWS_SECRET_ACCESS_KEY,
region_name=settings.AWS_REGION,
default_bucket=settings.S3_BUCKET
)
parser = UnstructuredAPIParser(

View File

@ -53,62 +53,6 @@ class Settings(BaseSettings):
PORT: int = Field(8000, env="PORT")
RELOAD: bool = Field(False, env="RELOAD")
def get_mongodb_settings(self) -> Dict[str, Any]:
"""Get MongoDB related settings."""
return {
"uri": self.MONGODB_URI,
"db_name": self.DATABRIDGE_DB,
"collection_name": self.DOCUMENTS_COLLECTION
}
def get_vector_store_settings(self) -> Dict[str, Any]:
"""Get vector store related settings."""
return {
"uri": self.MONGODB_URI,
"database_name": self.DATABRIDGE_DB,
"collection_name": self.CHUNKS_COLLECTION,
"index_name": self.VECTOR_INDEX_NAME
}
def get_storage_settings(self) -> Dict[str, Any]:
"""Get storage related settings."""
return {
"aws_access_key": self.AWS_ACCESS_KEY,
"aws_secret_key": self.AWS_SECRET_ACCESS_KEY,
"region_name": self.AWS_REGION,
"default_bucket": self.S3_BUCKET
}
def get_parser_settings(self) -> Dict[str, Any]:
"""Get document parser settings."""
return {
"api_key": self.UNSTRUCTURED_API_KEY,
"chunk_size": self.CHUNK_SIZE,
"chunk_overlap": self.CHUNK_OVERLAP
}
def get_embedding_settings(self) -> Dict[str, Any]:
"""Get embedding model settings."""
return {
"api_key": self.OPENAI_API_KEY,
"model_name": self.EMBEDDING_MODEL
}
def get_server_settings(self) -> Dict[str, Any]:
"""Get server related settings."""
return {
"host": self.HOST,
"port": self.PORT,
"reload": self.RELOAD,
}
def get_auth_settings(self) -> Dict[str, Any]:
"""Get authentication related settings."""
return {
"secret_key": self.JWT_SECRET_KEY,
"algorithm": self.JWT_ALGORITHM
}
class Config:
env_file = ".env"
case_sensitive = True

View File

@ -33,7 +33,6 @@ class UnstructuredAPIParser(BaseParser):
# Parse with unstructured
loader = UnstructuredLoader(
file=io.BytesIO(file),
content_type=content_type,
partition_via_api=True,
api_key=self.api_key,
chunking_strategy="by_title"

View File

@ -225,6 +225,7 @@ class DocumentService:
raise Exception("Failed to store chunk embeddings")
logger.debug("Stored chunk embeddings in vector store")
doc.chunk_ids = result
# Store document metadata
if not await self.db.store_document(doc):
raise Exception("Failed to store document metadata")

View File

@ -2,7 +2,7 @@ from datetime import datetime, timedelta, UTC
import jwt
from dotenv import load_dotenv
import os
import random
load_dotenv()
# Get JWT secret from env
@ -13,7 +13,7 @@ if not jwt_secret:
# Create payload
payload = {
"type": "developer",
"entity_id": "test_dev",
"entity_id": f"test_dev_{random.randint(0, 1000000)}",
"permissions": ["read", "write", "admin"],
"exp": datetime.now(UTC) + timedelta(days=30)
}

15
sdks/python/PUBLISH.md Normal file
View File

@ -0,0 +1,15 @@
# Publish to PyPI
- `cd` into the `sdks/python` directory
- Update the package version in `pyproject.toml` and `databridge/__init__.py`.
- Ensure you have the correct PyPI API key/certificates/ssh keys installed
```bash
# ensure you've activated the correct python environment
pip install build twine
rm -rf dist
python -m build
twine check dist/*
twine upload dist/*
```

View File

@ -1,7 +1,7 @@
from .client import DataBridge
from .exceptions import DataBridgeError, AuthenticationError
__version__ = "0.1.2"
__version__ = "0.1.4"
__all__ = [
"DataBridge",

View File

@ -1,9 +1,9 @@
import json
from typing import Dict, Any, List, Optional, Union, BinaryIO
from typing import Dict, Any, List, Literal, Optional, Union, BinaryIO
import httpx
from urllib.parse import urlparse
import jwt
from pydantic import BaseModel
from pydantic import BaseModel, Field, field_validator
from pathlib import Path
from io import BytesIO
@ -38,12 +38,27 @@ class ChunkResult(BaseModel):
download_url: Optional[str] = None
class DocumentContent(BaseModel):
"""Represents either a URL or content string"""
type: Literal["url", "string"]
value: str
filename: Optional[str] = Field(None, description="Filename when type is url")
@field_validator('filename')
def filename_only_for_url(cls, v, values):
if values.data.get('type') == 'string' and v is not None:
raise ValueError('filename can only be set when type is url')
if values.data.get('type') == 'url' and v is None:
raise ValueError('filename is required when type is url')
return v
class DocumentResult(BaseModel):
"""Query result at document level"""
score: float
document_id: str
metadata: Dict[str, Any]
content: Dict[str, str]
content: DocumentContent
class DataBridge:
@ -68,9 +83,14 @@ class DataBridge:
```
"""
def __init__(self, uri: str, timeout: int = 30):
def __init__(self, uri: str, timeout: int = 30, is_local: bool = False):
self._timeout = timeout
self._client = httpx.AsyncClient(timeout=timeout)
self._client = httpx.AsyncClient(timeout=timeout) if not is_local else httpx.AsyncClient(
timeout=timeout,
verify=False, # Disable SSL for localhost
http2=False # Force HTTP/1.1
)
self._is_local = is_local
self._setup_auth(uri)
def _setup_auth(self, uri: str) -> None:
@ -84,7 +104,7 @@ class DataBridge:
self._owner_id, self._auth_token = auth.split(':')
# Set base URL
self._base_url = f"{'http' if 'localhost' in host else 'https'}://{host}"
self._base_url = f"{'http' if self._is_local else 'https'}://{host}"
# Basic token validation
jwt.decode(self._auth_token, options={"verify_signature": False})

View File

@ -4,7 +4,7 @@ build-backend = "hatchling.build"
[project]
name = "databridge-client"
version = "0.1.2"
version = "0.1.4"
authors = [
{ name = "DataBridge", email = "databridgesuperuser@gmail.com" },
]
@ -14,6 +14,7 @@ requires-python = ">=3.8"
dependencies = [
"httpx>=0.24.0",
"pyjwt>=2.0.0",
"pydantic==2.10.3",
]
[tool.hatch.build.targets.wheel]

View File

@ -1,12 +0,0 @@
from setuptools import setup, find_packages
setup(
name="databridge-client",
version="0.1.2",
packages=find_packages(),
install_requires=[
"httpx",
"pyjwt",
],
python_requires=">=3.7",
)

View File

@ -15,6 +15,7 @@ def main():
"core.api:app",
host=settings.HOST,
port=settings.PORT,
loop="asyncio",
# reload=settings.RELOAD
)