Mirror of https://github.com/james-m-jordan/morphik-core.git (synced 2025-05-09 19:32:38 +00:00)

basic ingestion using unstructured and k-nearest retrieval works

This commit is contained in: parent db013e4713, commit 30577dc0ff
23 .gitignore vendored Normal file
@@ -0,0 +1,23 @@
# Python-related files
*__pycache__/
*.pyc
*.pyo
*.pyd
.Python
env/
.env
venv/*
ENV/
dist/
build/
*.egg-info/
.eggs/
*.egg
*.pytest_cache/


# Virtual environment
.venv/
.vscode/

*.DS_Store
0 __init__.py Normal file
9 base_embedding_model.py Normal file
@@ -0,0 +1,9 @@
from abc import ABC, abstractmethod
from typing import List, Union


class BaseEmbeddingModel(ABC):
    @abstractmethod
    async def embed(self, text: Union[str, List[str]]) -> List[float]:
        """Generate embeddings for input text"""
        pass
9 base_parser.py Normal file
@@ -0,0 +1,9 @@
from abc import ABC, abstractmethod
from typing import Dict, Any, List


class BaseParser(ABC):
    @abstractmethod
    def parse(self, content: str, metadata: Dict[str, Any]) -> List[str]:
        """Parse content into chunks"""
        pass
9 base_planner.py Normal file
@@ -0,0 +1,9 @@
from abc import ABC, abstractmethod
from typing import Dict, Any


class BasePlanner(ABC):
    @abstractmethod
    def plan_retrieval(self, query: str, **kwargs) -> Dict[str, Any]:
        """Create execution plan for retrieval"""
        pass
15 base_vector_store.py Normal file
@@ -0,0 +1,15 @@
from abc import ABC, abstractmethod
from typing import List
from document import DocumentChunk


class BaseVectorStore(ABC):
    @abstractmethod
    def store_embeddings(self, chunks: List[DocumentChunk]) -> bool:
        """Store document chunks and their embeddings"""
        pass

    @abstractmethod
    def query_similar(self, query_embedding: List[float], k: int, owner_id: str) -> List[DocumentChunk]:
        """Find similar chunks"""
        pass
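For reference, a minimal in-memory implementation of this interface could look like the sketch below. It is not part of the commit; the class name and cosine-similarity helper are illustrative only, and it assumes chunks carry the metadata dict with owner_id that DataBridge attaches during ingestion.

import math
from typing import List

from base_vector_store import BaseVectorStore
from document import DocumentChunk


class InMemoryVectorStore(BaseVectorStore):
    """Illustrative store: keeps chunks in a list and ranks them by cosine similarity."""

    def __init__(self):
        self._chunks: List[DocumentChunk] = []

    def store_embeddings(self, chunks: List[DocumentChunk]) -> bool:
        self._chunks.extend(chunks)
        return True

    def query_similar(self, query_embedding: List[float], k: int, owner_id: str) -> List[DocumentChunk]:
        def cosine(a: List[float], b: List[float]) -> float:
            dot = sum(x * y for x, y in zip(a, b))
            norm = math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b))
            return dot / norm if norm else 0.0

        # Only consider chunks belonging to the requesting owner (assumes chunk.metadata is set)
        candidates = [c for c in self._chunks
                      if getattr(c, "metadata", {}).get("owner_id") == owner_id]
        for c in candidates:
            c.score = cosine(query_embedding, c.embedding)
        return sorted(candidates, key=lambda c: c.score, reverse=True)[:k]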
171 core.py
@@ -1,171 +0,0 @@
from typing import List, Dict, Any, Optional
from abc import ABC, abstractmethod
from datetime import datetime
import uuid

# Base Classes and Interfaces

class Document:
    def __init__(self, content: str, metadata: Dict[str, Any], owner_id: str):
        self.id = str(uuid.uuid4())
        self.content = content
        self.metadata = metadata
        self.owner_id = owner_id
        self.created_at = datetime.utcnow()
        self.chunks: List[DocumentChunk] = []

class DocumentChunk:
    def __init__(self, content: str, embedding: List[float], doc_id: str):
        self.id = str(uuid.uuid4())
        self.content = content
        self.embedding = embedding
        self.doc_id = doc_id

class BaseParser(ABC):
    @abstractmethod
    def parse(self, content: str, metadata: Dict[str, Any]) -> List[str]:
        """Parse content into chunks"""
        pass


class BasePlanner(ABC):
    @abstractmethod
    def plan_retrieval(self, query: str, **kwargs) -> Dict[str, Any]:
        """Create execution plan for retrieval"""
        pass


class BaseVectorStore(ABC):
    @abstractmethod
    def store_embeddings(self, chunks: List[DocumentChunk]) -> bool:
        """Store document chunks and their embeddings"""
        pass

    @abstractmethod
    def query_similar(self, query_embedding: List[float], k: int, owner_id: str) -> List[DocumentChunk]:
        """Find similar chunks"""
        pass

# Concrete Implementations

class SimpleParser(BaseParser):
    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def parse(self, content: str, metadata: Dict[str, Any]) -> List[str]:
        # Simple implementation - split by chunk_size
        chunks = []
        for i in range(0, len(content), self.chunk_size - self.chunk_overlap):
            chunk = content[i:i + self.chunk_size]
            if chunk:
                chunks.append(chunk)
        return chunks

class SimpleRAGPlanner(BasePlanner):
    def __init__(self, k: int = 3):
        self.k = k

    def plan_retrieval(self, query: str, **kwargs) -> Dict[str, Any]:
        return {
            "strategy": "simple_rag",
            "k": kwargs.get("k", self.k),
            "query": query
        }

# Main DataBridge Class

class DataBridge:
    def __init__(
        self,
        parser: BaseParser,
        planner: BasePlanner,
        vector_store: BaseVectorStore,
        embedding_model: Any  # This would be your chosen embedding model
    ):
        self.parser = parser
        self.planner = planner
        self.vector_store = vector_store
        self.embedding_model = embedding_model

    async def ingest_document(
        self,
        content: str,
        metadata: Dict[str, Any],
        owner_id: str
    ) -> Document:
        # Create document
        doc = Document(content, metadata, owner_id)

        # Parse into chunks
        chunk_texts = self.parser.parse(content, metadata)

        # Create embeddings and chunks
        for chunk_text in chunk_texts:
            embedding = await self.embedding_model.embed(chunk_text)
            chunk = DocumentChunk(chunk_text, embedding, doc.id)
            doc.chunks.append(chunk)

        # Store in vector store
        success = self.vector_store.store_embeddings(doc.chunks)
        if not success:
            raise Exception("Failed to store embeddings")

        return doc

    async def query(
        self,
        query: str,
        owner_id: str,
        **kwargs
    ) -> List[Dict[str, Any]]:
        # Create plan
        plan = self.planner.plan_retrieval(query, **kwargs)

        # Get query embedding
        query_embedding = await self.embedding_model.embed(query)

        # Execute plan
        chunks = self.vector_store.query_similar(
            query_embedding,
            k=plan["k"],
            owner_id=owner_id
        )

        # Format results
        results = []
        for chunk in chunks:
            results.append({
                "content": chunk.content,
                "doc_id": chunk.doc_id,
                "chunk_id": chunk.id,
                "score": chunk.score if hasattr(chunk, "score") else None
            })

        return results

# Example usage
"""
# Initialize components
parser = SimpleParser()
planner = SimpleRAGPlanner()
vector_store = YourVectorStore()  # Implement with chosen backend
embedding_model = YourEmbeddingModel()  # Implement with chosen model

# Create DataBridge instance
db = DataBridge(parser, planner, vector_store, embedding_model)

# Ingest a document
doc = await db.ingest_document(
    content="Your document content here",
    metadata={"source": "pdf", "title": "Example Doc"},
    owner_id="user123"
)

# Query the system
results = await db.query(
    query="Your query here",
    owner_id="user123",
    k=5  # optional override
)
"""
106 databridge.py Normal file
@@ -0,0 +1,106 @@
from typing import Dict, Any, List
from databridge_uri import DataBridgeURI
from document import Document, DocumentChunk
from mongo_vector_store import MongoDBAtlasVectorStore
from openai_embedding_model import OpenAIEmbeddingModel
from unstructured_parser import UnstructuredAPIParser
from simple_planner import SimpleRAGPlanner


class DataBridge:
    """
    DataBridge with owner authentication and authorization.
    Configured via URI containing owner credentials.
    """

    def __init__(self, uri: str):
        # Parse URI and initialize configuration
        self.config = DataBridgeURI(uri)

        # Initialize components
        self._init_components()

    def _init_components(self):
        """Initialize all required components using the URI configuration"""
        self.embedding_model = OpenAIEmbeddingModel(
            api_key=self.config.openai_api_key,
            model_name=self.config.embedding_model
        )

        self.parser = UnstructuredAPIParser(
            api_key=self.config.unstructured_api_key,
            chunk_size=1000,
            chunk_overlap=200
        )

        self.vector_store = MongoDBAtlasVectorStore(
            connection_string=self.config.mongo_uri,
            database_name=self.config.db_name,
            collection_name=self.config.collection_name
        )

        self.planner = SimpleRAGPlanner(default_k=4)

    async def ingest_document(
        self,
        content: str,
        metadata: Dict[str, Any]
    ) -> Document:
        """
        Ingest a document using the owner ID from the URI configuration.
        """
        # Add owner_id to metadata
        metadata['owner_id'] = self.config.owner_id

        # Create document
        doc = Document(content, metadata, self.config.owner_id)

        # Parse into chunks
        chunk_texts = self.parser.parse(content, metadata)

        # Create embeddings and chunks
        for chunk_text in chunk_texts:
            embedding = await self.embedding_model.embed(chunk_text)
            chunk = DocumentChunk(chunk_text, embedding, doc.id)
            chunk.metadata = {'owner_id': self.config.owner_id}
            doc.chunks.append(chunk)

        # Store in vector store
        success = self.vector_store.store_embeddings(doc.chunks)
        if not success:
            raise Exception("Failed to store embeddings")

        return doc

    async def query(
        self,
        query: str,
        **kwargs
    ) -> List[Dict[str, Any]]:
        """
        Query the document store using the owner ID from the URI configuration.
        """
        # Create plan
        plan = self.planner.plan_retrieval(query, **kwargs)

        # Get query embedding
        query_embedding = await self.embedding_model.embed(query)

        # Execute plan
        chunks = self.vector_store.query_similar(
            query_embedding,
            k=plan["k"],
            owner_id=self.config.owner_id
        )

        # Format results
        results = []
        for chunk in chunks:
            results.append({
                "content": chunk.content,
                "doc_id": chunk.doc_id,
                "chunk_id": chunk.id,
                "score": chunk.score if hasattr(chunk, "score") else None
            })

        return results
61 databridge_uri.py Normal file
@@ -0,0 +1,61 @@
from urllib.parse import urlparse, parse_qs
from typing import Optional, Dict, Any
import os
import jwt
from datetime import datetime, timedelta


class DataBridgeURI:
    """
    Handles parsing and validation of DataBridge URIs with owner authentication
    Format: databridge://<owner_id>:<auth_token>@host/path?params
    """
    def __init__(self, uri: str):
        self.uri = uri
        self._parse_uri()

    def _parse_uri(self):
        parsed = urlparse(self.uri)
        query_params = parse_qs(parsed.query)

        # Parse authentication info from netloc
        auth_parts = parsed.netloc.split('@')[0].split(':')
        if len(auth_parts) != 2:
            raise ValueError("URI must include owner_id and auth_token")

        self.owner_id = auth_parts[0]
        self.auth_token = auth_parts[1]

        # Validate and decode auth token
        try:
            self._validate_auth_token()
        except Exception as e:
            raise ValueError(f"Invalid auth token: {str(e)}")

        # Get the original MongoDB URI from environment - use it as is
        self.mongo_uri = os.getenv("MONGODB_URI")
        if not self.mongo_uri:
            raise ValueError("MONGODB_URI environment variable not set")

        # Get configuration from query parameters
        self.openai_api_key = query_params.get('openai_key', [os.getenv('OPENAI_API_KEY', '')])[0]
        self.unstructured_api_key = query_params.get('unstructured_key', [os.getenv('UNSTRUCTURED_API_KEY', '')])[0]
        self.db_name = query_params.get('db', ['brandsyncaidb'])[0]
        self.collection_name = query_params.get('collection', ['kb_chunked_embeddings'])[0]
        self.embedding_model = query_params.get('embedding_model', ['text-embedding-3-small'])[0]

        # Validate required fields
        if not all([self.mongo_uri, self.openai_api_key, self.unstructured_api_key]):
            raise ValueError("Missing required configuration in DataBridge URI")

    def _validate_auth_token(self):
        """Validate the auth token and extract any additional claims"""
        try:
            decoded = jwt.decode(self.auth_token, 'your-secret-key', algorithms=['HS256'])
            if decoded.get('owner_id') != self.owner_id:
                raise ValueError("Token owner_id mismatch")
            self.auth_claims = decoded
        except jwt.ExpiredSignatureError:
            raise ValueError("Auth token has expired")
        except jwt.InvalidTokenError:
            raise ValueError("Invalid auth token")
21 document.py Normal file
@@ -0,0 +1,21 @@
from typing import Dict, Any, List
import uuid
from datetime import datetime


class Document:
    def __init__(self, content: str, metadata: Dict[str, Any], owner_id: str):
        self.id = str(uuid.uuid4())
        self.content = content
        self.metadata = metadata
        self.owner_id = owner_id
        self.created_at = datetime.utcnow()
        self.chunks: List[DocumentChunk] = []


class DocumentChunk:
    def __init__(self, content: str, embedding: List[float], doc_id: str):
        self.id = str(uuid.uuid4())
        self.content = content
        self.embedding = embedding
        self.doc_id = doc_id
111 mongo_vector_store.py Normal file
@@ -0,0 +1,111 @@
from typing import List, Dict, Any
from pymongo import MongoClient
from base_vector_store import BaseVectorStore
from document import DocumentChunk


class MongoDBAtlasVectorStore(BaseVectorStore):
    def __init__(
        self,
        connection_string: str,
        database_name: str,
        collection_name: str = "kb_chunked_embeddings",
        index_name: str = "vector_index"
    ):
        self.client = MongoClient(connection_string)
        self.db = self.client[database_name]
        self.collection = self.db[collection_name]
        self.index_name = index_name

        # Ensure vector search index exists
        # self._ensure_index()

    def _ensure_index(self):
        """Ensure the vector search index exists"""
        try:
            # Check if index exists
            indexes = self.collection.list_indexes()
            index_exists = any(index.get('name') == self.index_name for index in indexes)

            if not index_exists:
                # Create the vector search index if it doesn't exist
                self.collection.create_index(
                    [("embedding", "vectorSearch")],
                    name=self.index_name,
                    vectorSearchOptions={
                        "dimensions": 1536,  # For OpenAI embeddings
                        "similarity": "cosine"
                    }
                )
        except Exception as e:
            print(f"Warning: Could not create vector index: {str(e)}")

    def store_embeddings(self, chunks: List[DocumentChunk]) -> bool:
        try:
            documents = []
            for chunk in chunks:
                doc = {
                    "_id": chunk.id,  # Use chunk.id as MongoDB _id
                    "text": chunk.content,
                    "embedding": chunk.embedding,
                    "doc_id": chunk.doc_id,
                    "owner_id": chunk.metadata.get("owner_id"),
                    "metadata": chunk.metadata
                }
                documents.append(doc)

            if documents:
                # Use ordered=False to continue even if some inserts fail
                result = self.collection.insert_many(documents, ordered=False)
                return len(result.inserted_ids) > 0
            return True

        except Exception as e:
            print(f"Error storing embeddings: {str(e)}")
            return False

    def query_similar(
        self,
        query_embedding: List[float],
        k: int,
        owner_id: str,
        filters: Dict[str, Any] = None
    ) -> List[DocumentChunk]:
        """Find similar chunks using MongoDB Atlas Vector Search."""
        base_filter = {"owner_id": owner_id}
        if filters:
            base_filter.update(filters)

        try:
            pipeline = [
                {
                    "$vectorSearch": {
                        "index": self.index_name,
                        "path": "embedding",
                        "queryVector": query_embedding,
                        "numCandidates": k * 10,
                        "limit": k,
                        "filter": base_filter
                    }
                }
            ]

            results = list(self.collection.aggregate(pipeline))
            chunks = []

            for result in results:
                chunk = DocumentChunk(
                    content=result["text"],
                    embedding=result["embedding"],
                    doc_id=result["doc_id"]
                )
                chunk.score = result.get("score", 0)
                # Add metadata back to chunk
                chunk.metadata = result.get("metadata", {})
                chunks.append(chunk)

            return chunks

        except Exception as e:
            print(f"Error querying similar documents: {str(e)}")
            return []
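Since _ensure_index is currently disabled, query_similar assumes an Atlas Search index named vector_index already exists on the collection. A definition along the following lines would match the pipeline above; this is a sketch of the standard Atlas vector-index format (created via the Atlas UI or admin API), not code from this commit.

# Assumed Atlas Vector Search index definition for the "vector_index" used above.
vector_index_definition = {
    "fields": [
        {
            "type": "vector",
            "path": "embedding",      # same field queried by $vectorSearch
            "numDimensions": 1536,    # output size of text-embedding-3-small
            "similarity": "cosine",
        },
        # owner_id must be indexed as a filter field for the pre-filter above to apply
        {"type": "filter", "path": "owner_id"},
    ]
}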
23 openai_embedding_model.py Normal file
@@ -0,0 +1,23 @@
from typing import List, Union
import openai
from base_embedding_model import BaseEmbeddingModel


class OpenAIEmbeddingModel(BaseEmbeddingModel):
    def __init__(self, api_key: str, model_name: str = "text-embedding-3-small"):
        self.client = openai.Client(api_key=api_key)
        self.model_name = model_name

    async def embed(self, text: Union[str, List[str]]) -> List[float]:
        if isinstance(text, str):
            text = [text]

        response = self.client.embeddings.create(
            model=self.model_name,
            input=text
        )

        if len(text) == 1:
            return response.data[0].embedding

        return [item.embedding for item in response.data]
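Note that embed is declared async but calls the synchronous client, so each request blocks the event loop. A non-blocking variant could use the async client that the installed openai package also provides; a sketch, not part of the commit:

import openai


class AsyncOpenAIEmbeddingModel:
    """Illustrative async variant: same interface, but awaits the HTTP call."""

    def __init__(self, api_key: str, model_name: str = "text-embedding-3-small"):
        self.client = openai.AsyncClient(api_key=api_key)
        self.model_name = model_name

    async def embed(self, text):
        inputs = [text] if isinstance(text, str) else text
        response = await self.client.embeddings.create(model=self.model_name, input=inputs)
        if len(inputs) == 1:
            return response.data[0].embedding
        return [item.embedding for item in response.data]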
141 requirements.txt Normal file
@@ -0,0 +1,141 @@
aiohappyeyeballs==2.4.3
aiohttp==3.11.2
aiosignal==1.3.1
annotated-types==0.7.0
antlr4-python3-runtime==4.9.3
anyio==4.6.2.post1
attrs==24.2.0
backoff==2.2.1
beautifulsoup4==4.12.3
cachetools==5.5.0
certifi==2024.8.30
cffi==1.17.1
chardet==5.2.0
charset-normalizer==3.4.0
click==8.1.7
coloredlogs==15.0.1
contourpy==1.3.1
cryptography==43.0.3
cycler==0.12.1
dataclasses-json==0.6.7
Deprecated==1.2.14
distro==1.9.0
dnspython==2.7.0
effdet==0.4.1
emoji==2.14.0
eval_type_backport==0.2.0
filelock==3.16.1
filetype==1.2.0
flatbuffers==24.3.25
fonttools==4.55.0
frozenlist==1.5.0
fsspec==2024.10.0
google-api-core==2.23.0
google-auth==2.36.0
google-cloud-vision==3.8.1
googleapis-common-protos==1.66.0
grpcio==1.67.1
grpcio-status==1.67.1
h11==0.14.0
html5lib==1.1
httpcore==1.0.6
httpx==0.27.2
huggingface-hub==0.26.2
humanfriendly==10.0
idna==3.10
iopath==0.1.10
Jinja2==3.1.4
jiter==0.7.1
joblib==1.4.2
jsonpatch==1.33
jsonpath-python==1.0.6
jsonpointer==3.0.0
jwt==1.3.1
kiwisolver==1.4.7
langchain==0.3.7
langchain-core==0.3.18
langchain-text-splitters==0.3.2
langdetect==1.0.9
langsmith==0.1.143
layoutparser==0.3.4
lxml==5.3.0
MarkupSafe==3.0.2
marshmallow==3.23.1
matplotlib==3.9.2
mpmath==1.3.0
multidict==6.1.0
mypy-extensions==1.0.0
nest-asyncio==1.6.0
networkx==3.4.2
nltk==3.9.1
numpy==1.26.4
olefile==0.47
omegaconf==2.3.0
onnx==1.17.0
onnxruntime==1.20.0
openai==1.54.4
opencv-python==4.10.0.84
orjson==3.10.11
packaging==24.2
pandas==2.2.3
pdf2image==1.17.0
pdfminer.six==20231228
pdfplumber==0.11.4
pi_heif==0.20.0
pikepdf==9.4.1
pillow==11.0.0
portalocker==2.10.1
propcache==0.2.0
proto-plus==1.25.0
protobuf==5.28.3
psutil==6.1.0
pyasn1==0.6.1
pyasn1_modules==0.4.1
pycocotools==2.0.8
pycparser==2.22
pydantic==2.9.2
pydantic_core==2.23.4
PyJWT==2.9.0
pymongo==4.10.1
pyparsing==3.2.0
pypdf==5.1.0
pypdfium2==4.30.0
python-dateutil==2.8.2
python-dotenv==1.0.1
python-iso639==2024.10.22
python-magic==0.4.27
python-multipart==0.0.17
python-oxmsg==0.0.1
pytz==2024.2
PyYAML==6.0.2
RapidFuzz==3.10.1
regex==2024.11.6
requests==2.32.3
requests-toolbelt==1.0.0
rsa==4.9
safetensors==0.4.5
scipy==1.14.1
setuptools==75.5.0
six==1.16.0
sniffio==1.3.1
soupsieve==2.6
SQLAlchemy==2.0.36
sympy==1.13.1
tenacity==9.0.0
timm==1.0.11
tokenizers==0.20.3
torch==2.5.1
torchvision==0.20.1
tqdm==4.67.0
transformers==4.46.2
typing-inspect==0.9.0
typing_extensions==4.12.2
tzdata==2024.2
unstructured==0.16.5
unstructured-client==0.27.0
unstructured-inference==0.8.1
unstructured.pytesseract==0.3.13
urllib3==2.2.3
webencodings==0.5.1
wrapt==1.16.0
yarl==1.17.1
BIN sample.pdf Normal file
Binary file not shown.
72 sanity_checks/mongo.py Normal file
@@ -0,0 +1,72 @@
from pymongo import MongoClient
from dotenv import load_dotenv
import os
import datetime


def test_mongo_operations():
    # Load environment variables
    load_dotenv()

    # Get MongoDB URI from environment variable
    mongo_uri = os.getenv("MONGODB_URI")
    if not mongo_uri:
        raise ValueError("MONGODB_URI environment variable not set")

    try:
        # Connect to MongoDB
        client = MongoClient(mongo_uri)

        # Test connection
        client.admin.command('ping')
        print("✅ Connected successfully to MongoDB")

        # Get database and collection
        db = client.brandsyncaidb  # Using a test database
        collection = db.kb_chunked_embeddings

        # Insert a single document
        test_doc = {
            "name": "Test Document",
            "timestamp": datetime.datetime.now(),
            "value": 42
        }

        result = collection.insert_one(test_doc)
        print(f"✅ Inserted document with ID: {result.inserted_id}")

        # Insert multiple documents
        test_docs = [
            {"name": "Doc 1", "value": 1},
            {"name": "Doc 2", "value": 2},
            {"name": "Doc 3", "value": 3}
        ]

        result = collection.insert_many(test_docs)
        print(f"✅ Inserted {len(result.inserted_ids)} documents")

        # Retrieve documents
        print("\nRetrieving documents:")
        for doc in collection.find():
            print(f"Found document: {doc}")

        # Find specific documents
        print("\nFinding documents with value >= 2:")
        query = {"value": {"$gte": 2}}
        for doc in collection.find(query):
            print(f"Found document: {doc}")

        # Clean up - delete all test documents
        # DON'T DELETE IF IT'S BRANDSYNCAI
        # result = collection.delete_many({})
        # print(f"\n✅ Cleaned up {result.deleted_count} test documents")  # re-enable together with delete_many above

    except Exception as e:
        print(f"❌ Error: {str(e)}")
    finally:
        client.close()
        print("\n✅ Connection closed")


if __name__ == "__main__":
    test_mongo_operations()
74 simple_example.py Normal file
@@ -0,0 +1,74 @@
from datetime import datetime, timedelta, UTC  # Note: using UTC for timezone awareness
import base64
from databridge import DataBridge
import jwt
import os
from dotenv import load_dotenv


def create_databridge_uri() -> str:
    """Create DataBridge URI from environment variables"""
    load_dotenv()

    # Get credentials from environment
    mongo_uri = os.getenv("MONGODB_URI")
    openai_key = os.getenv("OPENAI_API_KEY")
    unstructured_key = os.getenv("UNSTRUCTURED_API_KEY")
    owner_id = os.getenv("DATABRIDGE_OWNER", "admin")

    # Validate required credentials
    if not all([mongo_uri, openai_key, unstructured_key]):
        raise ValueError("Missing required environment variables")

    # Generate auth token
    auth_token = jwt.encode(
        {
            'owner_id': owner_id,
            'exp': datetime.now(UTC) + timedelta(days=30)
        },
        'your-secret-key',  # In production, use proper secret
        algorithm='HS256'
    )

    # For DataBridge URI, use any host identifier (it won't affect MongoDB connection)
    uri = (
        f"databridge://{owner_id}:{auth_token}@databridge.local"
        f"?openai_key={openai_key}"
        f"&unstructured_key={unstructured_key}"
        f"&db=brandsyncaidb"
        f"&collection=kb_chunked_embeddings"
    )

    return uri


async def main():
    # Initialize DataBridge
    bridge = DataBridge(create_databridge_uri())

    # Example: Ingest a PDF document
    with open("sample.pdf", "rb") as f:
        pdf_content = base64.b64encode(f.read()).decode()

    await bridge.ingest_document(
        content=pdf_content,
        metadata={
            "content_type": "application/pdf",
            "is_base64": True,
            "title": "Sample PDF"
        }
    )

    # Example: Query documents
    results = await bridge.query(
        query="What is machine learning?",
        k=4
    )

    for result in results:
        print(f"Content: {result['content'][:200]}...")
        print(f"Score: {result['score']}\n")

if __name__ == "__main__":
    import asyncio
    asyncio.run(main())
17 simple_planner.py Normal file
@@ -0,0 +1,17 @@
from typing import Dict, Any
from base_planner import BasePlanner


class SimpleRAGPlanner(BasePlanner):
    def __init__(self, default_k: int = 3):
        self.default_k = default_k

    def plan_retrieval(self, query: str, **kwargs) -> Dict[str, Any]:
        """Create a simple retrieval plan."""
        return {
            "strategy": "simple_retrieval",
            "k": kwargs.get("k", self.default_k),
            "query": query,
            "filters": kwargs.get("filters", {}),
            "min_score": kwargs.get("min_score", 0.0)
        }
72 unstructured_parser.py Normal file
@@ -0,0 +1,72 @@
from typing import Dict, Any, List
from base_parser import BaseParser
from unstructured.partition.auto import partition
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
import tempfile
import base64


class UnstructuredAPIParser(BaseParser):
    def __init__(
        self,
        api_key: str,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        api_url: str = "https://api.unstructuredapp.io"
    ):
        self.api_key = api_key
        self.api_url = api_url
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", ". ", " ", ""]
        )

    def parse(self, content: str, metadata: Dict[str, Any]) -> List[str]:
        """Parse content using Unstructured API and split into chunks."""
        try:
            # Create temporary file for content
            with tempfile.NamedTemporaryFile(delete=False, suffix=self._get_file_extension(metadata)) as temp_file:
                if metadata.get("is_base64", False):
                    temp_file.write(base64.b64decode(content))
                else:
                    temp_file.write(content.encode('utf-8'))
                temp_file_path = temp_file.name

            try:
                # Use Unstructured API for parsing
                elements = partition(
                    filename=temp_file_path,
                    api_key=self.api_key,
                    api_url=self.api_url,
                    partition_via_api=True
                )

                # Combine elements and split into chunks
                full_text = "\n\n".join(str(element) for element in elements)
                chunks = self.text_splitter.split_text(full_text)

                return chunks
            finally:
                # Clean up temporary file
                os.unlink(temp_file_path)

        except Exception as e:
            raise Exception(f"Error parsing document: {str(e)}")

    def _get_file_extension(self, metadata: Dict[str, Any]) -> str:
        """Get appropriate file extension based on content type."""
        content_type_mapping = {
            'application/pdf': '.pdf',
            'application/msword': '.doc',
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
            'image/jpeg': '.jpg',
            'image/png': '.png',
            'text/plain': '.txt',
            'text/html': '.html'
        }
        return content_type_mapping.get(metadata.get('content_type'), '.txt')