from typing import Any, Dict, List, Optional

from pymongo import MongoClient
from pymongo.operations import SearchIndexModel

from base_vector_store import BaseVectorStore
from document import DocumentChunk


class MongoDBAtlasVectorStore(BaseVectorStore):
    def __init__(
        self,
        connection_string: str,
        database_name: str,
        collection_name: str = "kb_chunked_embeddings",
        index_name: str = "vector_index",
    ):
        self.client = MongoClient(connection_string)
        self.db = self.client[database_name]
        self.collection = self.db[collection_name]
        self.index_name = index_name

        # Ensure vector search index exists
        # self._ensure_index()

    def _ensure_index(self):
        """Ensure the Atlas Vector Search index exists.

        Requires pymongo>=4.7 and a MongoDB Atlas cluster: vector search
        indexes are managed through the Atlas Search index API, not the
        regular create_index() call.
        """
        try:
            # Atlas Search indexes do not appear in list_indexes(); check the
            # dedicated search-index listing instead.
            existing = [idx["name"] for idx in self.collection.list_search_indexes()]

            if self.index_name not in existing:
                # Create the vector search index if it doesn't exist
                self.collection.create_search_index(
                    SearchIndexModel(
                        name=self.index_name,
                        type="vectorSearch",
                        definition={
                            "fields": [
                                {
                                    "type": "vector",
                                    "path": "embedding",
                                    "numDimensions": 1536,  # For OpenAI embeddings
                                    "similarity": "cosine",
                                },
                                # Fields used in $vectorSearch filters must be
                                # indexed as filter fields.
                                {"type": "filter", "path": "owner_id"},
                            ]
                        },
                    )
                )
        except Exception as e:
            print(f"Warning: Could not create vector index: {str(e)}")

    def store_embeddings(self, chunks: List[DocumentChunk]) -> bool:
        """Insert chunk embeddings into the collection; return True on success."""
        try:
            documents = []
            for chunk in chunks:
                doc = {
                    "_id": chunk.id,  # Use chunk.id as MongoDB _id
                    "text": chunk.content,
                    "embedding": chunk.embedding,
                    "doc_id": chunk.doc_id,
                    "owner_id": chunk.metadata.get("owner_id"),
                    "metadata": chunk.metadata,
                }
                documents.append(doc)

            if documents:
                # Use ordered=False to continue even if some inserts fail
                result = self.collection.insert_many(documents, ordered=False)
                return len(result.inserted_ids) > 0
            return True

        except Exception as e:
            print(f"Error storing embeddings: {str(e)}")
            return False

    def query_similar(
        self,
        query_embedding: List[float],
        k: int,
        owner_id: str,
        filters: Optional[Dict[str, Any]] = None,
    ) -> List[DocumentChunk]:
        """Find similar chunks using MongoDB Atlas Vector Search."""
        base_filter = {"owner_id": owner_id}
        if filters:
            base_filter.update(filters)

        try:
            pipeline = [
                {
                    "$vectorSearch": {
                        "index": self.index_name,
                        "path": "embedding",
                        "queryVector": query_embedding,
                        "numCandidates": k * 10,
                        "limit": k,
                        "filter": base_filter,
                    }
                },
                {
                    # $vectorSearch does not emit a score field by itself;
                    # project it explicitly from the search metadata.
                    "$project": {
                        "text": 1,
                        "embedding": 1,
                        "doc_id": 1,
                        "metadata": 1,
                        "score": {"$meta": "vectorSearchScore"},
                    }
                },
            ]

            results = list(self.collection.aggregate(pipeline))
            chunks = []

            for result in results:
                chunk = DocumentChunk(
                    content=result["text"],
                    embedding=result["embedding"],
                    doc_id=result["doc_id"],
                )
                chunk.score = result.get("score", 0)
                # Add metadata back to chunk
                chunk.metadata = result.get("metadata", {})
                chunks.append(chunk)

            return chunks

        except Exception as e:
            print(f"Error querying similar documents: {str(e)}")
            return []
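

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the upstream module).
# Assumptions: the Atlas connection string, database name, ids, and the
# 1536-dimensional dummy embeddings below are placeholders, and DocumentChunk
# is a plain mutable object whose id/metadata attributes can be assigned
# directly, just as the methods above assign metadata and score.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    store = MongoDBAtlasVectorStore(
        connection_string="mongodb+srv://user:pass@cluster.example.mongodb.net",
        database_name="knowledge_base",
    )

    chunk = DocumentChunk(
        content="Example passage of text.",
        embedding=[0.0] * 1536,
        doc_id="doc-1",
    )
    chunk.id = "doc-1-chunk-0"  # assumed assignable; see note above
    chunk.metadata = {"owner_id": "user-123"}

    if store.store_embeddings([chunk]):
        matches = store.query_similar(
            query_embedding=[0.0] * 1536,
            k=5,
            owner_id="user-123",
        )
        for match in matches:
            print(match.doc_id, match.score)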