mirror of
https://github.com/james-m-jordan/morphik-core.git
synced 2025-05-09 19:32:38 +00:00
bug fixes and end-to-end testing
This commit is contained in:
parent
df8d7fcdd0
commit
4f2f221d40
37
.env.example
37
.env.example
@ -1,32 +1,13 @@
|
||||
# MongoDB settings
|
||||
MONGODB_URI=your_mongodb_connection_string
|
||||
DB_NAME=DataBridgeTest
|
||||
COLLECTION_NAME=test
|
||||
MONGODB_URI="mongodb+srv://..."
|
||||
DATABRIDGE_DB="DataBridgeDB"
|
||||
DOCUMENTS_COLLECTION='documents' # Optional, default is 'documents'
|
||||
CHUNKS_COLLECTION='document_chunks' # Optional, default is 'document_chunks'
|
||||
|
||||
# OpenAI settings
|
||||
OPENAI_API_KEY=your_openai_api_key
|
||||
EMBEDDING_MODEL=text-embedding-3-small
|
||||
OPENAI_API_KEY="sk-proj-..."
|
||||
|
||||
# Unstructured API settings
|
||||
UNSTRUCTURED_API_KEY=your_unstructured_api_key
|
||||
UNSTRUCTURED_API_URL=https://api.unstructured.io
|
||||
UNSTRUCTURED_API_KEY="..."
|
||||
|
||||
# Document processing settings
|
||||
CHUNK_SIZE=1000
|
||||
CHUNK_OVERLAP=200
|
||||
DEFAULT_K=4
|
||||
AWS_ACCESS_KEY="..."
|
||||
AWS_SECRET_ACCESS_KEY="..."
|
||||
|
||||
# Storage settings
|
||||
AWS_ACCESS_KEY=your_aws_access_key
|
||||
AWS_SECRET_KEY=your_aws_secret_key
|
||||
AWS_REGION=us-east-2
|
||||
S3_BUCKET=databridge-storage
|
||||
|
||||
# Auth settings
|
||||
JWT_SECRET_KEY=your_jwt_secret_key
|
||||
JWT_ALGORITHM=HS256
|
||||
|
||||
# Server settings
|
||||
HOST=127.0.0.1
|
||||
PORT=8000
|
||||
RELOAD=true
|
||||
JWT_SECRET_KEY="..."
|
||||
|
104
README.md
104
README.md
@ -2,66 +2,104 @@
|
||||
|
||||
DataBridge is an extensible, open-source document processing and retrieval system designed for building document-based applications. It provides a modular architecture for integrating document parsing, embedding generation, and vector search capabilities.
|
||||
|
||||
## Table of Contents
|
||||
- [Features](#features)
|
||||
- [Starting the Server](#starting-the-server)
|
||||
- [Quick Start](#quick-start)
|
||||
- [Architecture](#architecture)
|
||||
- [Current Integrations](#current-integrations)
|
||||
- [Adding New Components](#adding-new-components)
|
||||
- [API Documentation](#api-documentation)
|
||||
- [Key Endpoints](#key-endpoints)
|
||||
- [License](#license)
|
||||
- [Contributing](#contributing)
|
||||
|
||||
## Features
|
||||
|
||||
- 🔌 **Extensible Architecture**: Built with modularity in mind - easily extend or replace any component:
|
||||
- Document Parsing: Currently integrated with Unstructured API
|
||||
- Vector Store: Currently using MongoDB Atlas
|
||||
- Embedding Model: Currently using OpenAI
|
||||
- Storage: Currently using AWS S3
|
||||
- 🔌 **Extensible Architecture**: Modular design for easy component extension or replacement
|
||||
- 🔍 **Vector Search**: Semantic search capabilities
|
||||
- 🔐 **Authentication**: JWT-based auth with developer and end-user access modes
|
||||
- 📊 **Metadata**: Rich metadata filtering and organization
|
||||
- 📊 **Components**: Document Parsing (Unstructured API), Vector Store (MongoDB Atlas), Embedding Model (OpenAI), Storage (AWS S3)
|
||||
- 🚀 **Python SDK**: Simple client SDK for quick integration
|
||||
|
||||
## Starting the Server
|
||||
|
||||
1. Clone the repository:
|
||||
```bash
|
||||
git clone https://github.com/databridge-org/databridge-core.git
|
||||
```
|
||||
|
||||
2. Set up your Python environment (Python 3.12 is supported; other versions may work):
|
||||
```bash
|
||||
cd databridge-core
|
||||
python -m venv .venv
|
||||
source .venv/bin/activate
|
||||
```
|
||||
|
||||
3. Install the required dependencies:
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
4. Set up your environment variables by creating a `.env` file in the project directory, using the `.env.example` file as a reference:
|
||||
|
||||
```bash
|
||||
cp .env.example .env
|
||||
```
|
||||
|
||||
<!-- TODO: Add instructions for setting up the environment variables, such as creating a MongoDB account, an OpenAI account, etc. -->
|
||||
|
||||
5. Generate a local URI:
|
||||
```bash
|
||||
python generate_local_uri.py
|
||||
```
|
||||
Copy the output and save it for use with the client SDK.
|
||||
|
||||
6. Start the server:
|
||||
```bash
|
||||
python start_server.py
|
||||
```
|
||||
*Tip*: Visit `http://localhost:8000/docs` for the complete OpenAPI documentation.
|
||||
|
||||
## Quick Start
|
||||
|
||||
Ensure the server is running, then use the SDK to ingest and query documents.
|
||||
|
||||
1. Install the SDK:
|
||||
```bash
|
||||
pip install databridge-client
|
||||
```
|
||||
|
||||
2. Set up your environment variables:
|
||||
```env
|
||||
MONGODB_URI=your_mongodb_connection_string
|
||||
OPENAI_API_KEY=your_openai_api_key
|
||||
UNSTRUCTURED_API_KEY=your_unstructured_api_key
|
||||
JWT_SECRET_KEY=your_jwt_secret
|
||||
AWS_ACCESS_KEY=your_aws_access_key
|
||||
AWS_SECRET_ACCESS_KEY=your_aws_secret_key
|
||||
```
|
||||
|
||||
3. Start the server:
|
||||
```bash
|
||||
python start_server.py
|
||||
```
|
||||
|
||||
4. Use the SDK:
|
||||
2. Use the SDK:
|
||||
```python
|
||||
import asyncio
|
||||
from databridge import DataBridge
|
||||
|
||||
async def main():
|
||||
# Initialize client
|
||||
db = DataBridge("databridge://owner_id:auth_token@your-domain.com")
|
||||
db = DataBridge("your_databridge_uri_here", is_local=True)
|
||||
files = ["annual_report_2022.pdf", "marketing_strategy.docx" ,"product_launch_presentation.pptx", "company_logo.png"]
|
||||
|
||||
# Ingest a document
|
||||
doc_id = await db.ingest_document(
|
||||
content="Your document content",
|
||||
metadata={"title": "My Document"}
|
||||
)
|
||||
for file in files:
|
||||
await db.ingest_file(
|
||||
file=file,
|
||||
file_name=file,
|
||||
metadata={"category": "Company Related"} # Optionally add any metadata
|
||||
)
|
||||
|
||||
# Query documents
|
||||
results = await db.query(
|
||||
query="What is...",
|
||||
k=4 # Number of results
|
||||
query="What did our target market say about our product?",
|
||||
return_type="chunks",
|
||||
filters={"category": "Company Related"}
|
||||
)
|
||||
|
||||
await db.close()
|
||||
|
||||
print(results)
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
For other examples <!-- -like how to make xyz in 10 lines of code- --> check out our [documentation](https://databridge.gitbook.io/databridge-docs)!
|
||||
|
||||
## Architecture
|
||||
|
||||
DataBridge uses a modular architecture with the following base components that can be extended or replaced:
|
||||
|
17
core/api.py
17
core/api.py
@ -46,18 +46,23 @@ settings = get_settings()
|
||||
|
||||
# Initialize components
|
||||
database = MongoDatabase(
|
||||
**settings.get_mongodb_settings()
|
||||
uri=settings.MONGODB_URI,
|
||||
db_name=settings.DATABRIDGE_DB,
|
||||
collection_name=settings.DOCUMENTS_COLLECTION
|
||||
)
|
||||
|
||||
vector_store = MongoDBAtlasVectorStore(
|
||||
settings.MONGODB_URI,
|
||||
settings.DATABRIDGE_DB,
|
||||
settings.CHUNKS_COLLECTION,
|
||||
settings.VECTOR_INDEX_NAME
|
||||
uri=settings.MONGODB_URI,
|
||||
database_name=settings.DATABRIDGE_DB,
|
||||
collection_name=settings.CHUNKS_COLLECTION,
|
||||
index_name=settings.VECTOR_INDEX_NAME
|
||||
)
|
||||
|
||||
storage = S3Storage(
|
||||
**settings.get_storage_settings()
|
||||
aws_access_key=settings.AWS_ACCESS_KEY,
|
||||
aws_secret_key=settings.AWS_SECRET_ACCESS_KEY,
|
||||
region_name=settings.AWS_REGION,
|
||||
default_bucket=settings.S3_BUCKET
|
||||
)
|
||||
|
||||
parser = UnstructuredAPIParser(
|
||||
|
@ -53,62 +53,6 @@ class Settings(BaseSettings):
|
||||
PORT: int = Field(8000, env="PORT")
|
||||
RELOAD: bool = Field(False, env="RELOAD")
|
||||
|
||||
def get_mongodb_settings(self) -> Dict[str, Any]:
|
||||
"""Get MongoDB related settings."""
|
||||
return {
|
||||
"uri": self.MONGODB_URI,
|
||||
"db_name": self.DATABRIDGE_DB,
|
||||
"collection_name": self.DOCUMENTS_COLLECTION
|
||||
}
|
||||
|
||||
def get_vector_store_settings(self) -> Dict[str, Any]:
|
||||
"""Get vector store related settings."""
|
||||
return {
|
||||
"uri": self.MONGODB_URI,
|
||||
"database_name": self.DATABRIDGE_DB,
|
||||
"collection_name": self.CHUNKS_COLLECTION,
|
||||
"index_name": self.VECTOR_INDEX_NAME
|
||||
}
|
||||
|
||||
def get_storage_settings(self) -> Dict[str, Any]:
|
||||
"""Get storage related settings."""
|
||||
return {
|
||||
"aws_access_key": self.AWS_ACCESS_KEY,
|
||||
"aws_secret_key": self.AWS_SECRET_ACCESS_KEY,
|
||||
"region_name": self.AWS_REGION,
|
||||
"default_bucket": self.S3_BUCKET
|
||||
}
|
||||
|
||||
def get_parser_settings(self) -> Dict[str, Any]:
|
||||
"""Get document parser settings."""
|
||||
return {
|
||||
"api_key": self.UNSTRUCTURED_API_KEY,
|
||||
"chunk_size": self.CHUNK_SIZE,
|
||||
"chunk_overlap": self.CHUNK_OVERLAP
|
||||
}
|
||||
|
||||
def get_embedding_settings(self) -> Dict[str, Any]:
|
||||
"""Get embedding model settings."""
|
||||
return {
|
||||
"api_key": self.OPENAI_API_KEY,
|
||||
"model_name": self.EMBEDDING_MODEL
|
||||
}
|
||||
|
||||
def get_server_settings(self) -> Dict[str, Any]:
|
||||
"""Get server related settings."""
|
||||
return {
|
||||
"host": self.HOST,
|
||||
"port": self.PORT,
|
||||
"reload": self.RELOAD,
|
||||
}
|
||||
|
||||
def get_auth_settings(self) -> Dict[str, Any]:
|
||||
"""Get authentication related settings."""
|
||||
return {
|
||||
"secret_key": self.JWT_SECRET_KEY,
|
||||
"algorithm": self.JWT_ALGORITHM
|
||||
}
|
||||
|
||||
class Config:
|
||||
env_file = ".env"
|
||||
case_sensitive = True
|
||||
|
@ -33,7 +33,6 @@ class UnstructuredAPIParser(BaseParser):
|
||||
# Parse with unstructured
|
||||
loader = UnstructuredLoader(
|
||||
file=io.BytesIO(file),
|
||||
content_type=content_type,
|
||||
partition_via_api=True,
|
||||
api_key=self.api_key,
|
||||
chunking_strategy="by_title"
|
||||
|
@ -225,6 +225,7 @@ class DocumentService:
|
||||
raise Exception("Failed to store chunk embeddings")
|
||||
logger.debug("Stored chunk embeddings in vector store")
|
||||
|
||||
doc.chunk_ids = result
|
||||
# Store document metadata
|
||||
if not await self.db.store_document(doc):
|
||||
raise Exception("Failed to store document metadata")
|
||||
|
@ -2,7 +2,7 @@ from datetime import datetime, timedelta, UTC
|
||||
import jwt
|
||||
from dotenv import load_dotenv
|
||||
import os
|
||||
|
||||
import random
|
||||
load_dotenv()
|
||||
|
||||
# Get JWT secret from env
|
||||
@ -13,7 +13,7 @@ if not jwt_secret:
|
||||
# Create payload
|
||||
payload = {
|
||||
"type": "developer",
|
||||
"entity_id": "test_dev",
|
||||
"entity_id": f"test_dev_{random.randint(0, 1000000)}",
|
||||
"permissions": ["read", "write", "admin"],
|
||||
"exp": datetime.now(UTC) + timedelta(days=30)
|
||||
}
|
||||
|
15
sdks/python/PUBLISH.md
Normal file
15
sdks/python/PUBLISH.md
Normal file
@ -0,0 +1,15 @@
|
||||
# Publish to PyPI
|
||||
|
||||
- `cd` into the `sdks/python` directory
|
||||
- Update the package version in both `pyproject.toml` and `databridge/__init__.py`.
|
||||
- Ensure you have the correct PyPI API key/certificates/SSH keys installed.
|
||||
|
||||
```bash
|
||||
# ensure you've activated the correct python environment
|
||||
pip install build twine
|
||||
|
||||
rm -rf dist
|
||||
python -m build
|
||||
twine check dist/*
|
||||
twine upload dist/*
|
||||
```
|
@ -1,7 +1,7 @@
|
||||
from .client import DataBridge
|
||||
from .exceptions import DataBridgeError, AuthenticationError
|
||||
|
||||
__version__ = "0.1.2"
|
||||
__version__ = "0.1.4"
|
||||
|
||||
__all__ = [
|
||||
"DataBridge",
|
||||
|
@ -1,9 +1,9 @@
|
||||
import json
|
||||
from typing import Dict, Any, List, Optional, Union, BinaryIO
|
||||
from typing import Dict, Any, List, Literal, Optional, Union, BinaryIO
|
||||
import httpx
|
||||
from urllib.parse import urlparse
|
||||
import jwt
|
||||
from pydantic import BaseModel
|
||||
from pydantic import BaseModel, Field, field_validator
|
||||
from pathlib import Path
|
||||
from io import BytesIO
|
||||
|
||||
@ -38,12 +38,27 @@ class ChunkResult(BaseModel):
|
||||
download_url: Optional[str] = None
|
||||
|
||||
|
||||
class DocumentContent(BaseModel):
|
||||
"""Represents either a URL or content string"""
|
||||
type: Literal["url", "string"]
|
||||
value: str
|
||||
filename: Optional[str] = Field(None, description="Filename when type is url")
|
||||
|
||||
@field_validator('filename')
|
||||
def filename_only_for_url(cls, v, values):
|
||||
if values.data.get('type') == 'string' and v is not None:
|
||||
raise ValueError('filename can only be set when type is url')
|
||||
if values.data.get('type') == 'url' and v is None:
|
||||
raise ValueError('filename is required when type is url')
|
||||
return v
|
||||
|
||||
|
||||
class DocumentResult(BaseModel):
|
||||
"""Query result at document level"""
|
||||
score: float
|
||||
document_id: str
|
||||
metadata: Dict[str, Any]
|
||||
content: Dict[str, str]
|
||||
content: DocumentContent
|
||||
|
||||
|
||||
class DataBridge:
|
||||
@ -68,9 +83,14 @@ class DataBridge:
|
||||
```
|
||||
"""
|
||||
|
||||
def __init__(self, uri: str, timeout: int = 30):
|
||||
def __init__(self, uri: str, timeout: int = 30, is_local: bool = False):
|
||||
self._timeout = timeout
|
||||
self._client = httpx.AsyncClient(timeout=timeout)
|
||||
self._client = httpx.AsyncClient(timeout=timeout) if not is_local else httpx.AsyncClient(
|
||||
timeout=timeout,
|
||||
verify=False, # Disable SSL for localhost
|
||||
http2=False # Force HTTP/1.1
|
||||
)
|
||||
self._is_local = is_local
|
||||
self._setup_auth(uri)
|
||||
|
||||
def _setup_auth(self, uri: str) -> None:
|
||||
@ -84,7 +104,7 @@ class DataBridge:
|
||||
self._owner_id, self._auth_token = auth.split(':')
|
||||
|
||||
# Set base URL
|
||||
self._base_url = f"{'http' if 'localhost' in host else 'https'}://{host}"
|
||||
self._base_url = f"{'http' if self._is_local else 'https'}://{host}"
|
||||
|
||||
# Basic token validation
|
||||
jwt.decode(self._auth_token, options={"verify_signature": False})
|
||||
|
@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
||||
|
||||
[project]
|
||||
name = "databridge-client"
|
||||
version = "0.1.2"
|
||||
version = "0.1.4"
|
||||
authors = [
|
||||
{ name = "DataBridge", email = "databridgesuperuser@gmail.com" },
|
||||
]
|
||||
@ -14,6 +14,7 @@ requires-python = ">=3.8"
|
||||
dependencies = [
|
||||
"httpx>=0.24.0",
|
||||
"pyjwt>=2.0.0",
|
||||
"pydantic==2.10.3",
|
||||
]
|
||||
|
||||
[tool.hatch.build.targets.wheel]
|
||||
|
@ -1,12 +0,0 @@
|
||||
from setuptools import setup, find_packages
|
||||
|
||||
setup(
|
||||
name="databridge-client",
|
||||
version="0.1.2",
|
||||
packages=find_packages(),
|
||||
install_requires=[
|
||||
"httpx",
|
||||
"pyjwt",
|
||||
],
|
||||
python_requires=">=3.7",
|
||||
)
|
@ -15,6 +15,7 @@ def main():
|
||||
"core.api:app",
|
||||
host=settings.HOST,
|
||||
port=settings.PORT,
|
||||
loop="asyncio",
|
||||
# reload=settings.RELOAD
|
||||
)
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user