# morphik-core/mongo_vector_store.py


from typing import Any, Dict, List, Optional

from pymongo import MongoClient
from pymongo.operations import SearchIndexModel

from base_vector_store import BaseVectorStore
from document import DocumentChunk


class MongoDBAtlasVectorStore(BaseVectorStore):
    """Vector store backed by MongoDB Atlas Vector Search."""

    def __init__(
        self,
        connection_string: str,
        database_name: str,
        collection_name: str = "kb_chunked_embeddings",
        index_name: str = "vector_index",
    ):
        self.client = MongoClient(connection_string)
        self.db = self.client[database_name]
        self.collection = self.db[collection_name]
        self.index_name = index_name
        # Ensure the vector search index exists (left opt-in: creating Atlas
        # search indexes requires a real Atlas cluster and suitable privileges)
        # self._ensure_index()
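
    # For reference, store_embeddings() writes documents of this shape (a
    # sketch of the schema this class assumes, not an enforced contract):
    #   {
    #       "_id": <chunk.id>,
    #       "text": <chunk.content>,
    #       "embedding": [<float>, ...],  # 1536 dims for OpenAI embeddings
    #       "doc_id": <parent document id>,
    #       "owner_id": <owner/tenant id, used as a $vectorSearch filter>,
    #       "metadata": {...},
    #   }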

    def _ensure_index(self):
        """Ensure the Atlas vector search index exists.

        Atlas search indexes are managed separately from regular MongoDB
        indexes, so this uses create_search_index() (PyMongo 4.7+,
        Atlas-only) rather than create_index().
        """
        try:
            existing = {idx["name"] for idx in self.collection.list_search_indexes()}
            if self.index_name not in existing:
                self.collection.create_search_index(
                    SearchIndexModel(
                        name=self.index_name,
                        type="vectorSearch",
                        definition={
                            "fields": [
                                {
                                    "type": "vector",
                                    "path": "embedding",
                                    "numDimensions": 1536,  # OpenAI embeddings
                                    "similarity": "cosine",
                                },
                                # owner_id must be indexed as a "filter" field
                                # so that $vectorSearch can filter on it
                                {"type": "filter", "path": "owner_id"},
                            ]
                        },
                    )
                )
        except Exception as e:
            print(f"Warning: Could not create vector index: {str(e)}")

    def store_embeddings(self, chunks: List[DocumentChunk]) -> bool:
        """Insert chunk documents and their embeddings into the collection."""
        try:
            documents = []
            for chunk in chunks:
                doc = {
                    "_id": chunk.id,  # Use chunk.id as the MongoDB _id
                    "text": chunk.content,
                    "embedding": chunk.embedding,
                    "doc_id": chunk.doc_id,
                    "owner_id": chunk.metadata.get("owner_id"),
                    "metadata": chunk.metadata,
                }
                documents.append(doc)
            if documents:
                # ordered=False lets the remaining inserts proceed if some
                # fail (e.g. duplicate _id); insert_many still raises
                # BulkWriteError afterwards, which lands in the except below.
                result = self.collection.insert_many(documents, ordered=False)
                return len(result.inserted_ids) > 0
            return True
        except Exception as e:
            print(f"Error storing embeddings: {str(e)}")
            return False

    def query_similar(
        self,
        query_embedding: List[float],
        k: int,
        owner_id: str,
        filters: Optional[Dict[str, Any]] = None,
    ) -> List[DocumentChunk]:
        """Find the k most similar chunks using MongoDB Atlas Vector Search."""
        base_filter = {"owner_id": owner_id}
        if filters:
            # Fields used here must be indexed as "filter" fields in the
            # vector search index definition.
            base_filter.update(filters)
        try:
            pipeline = [
                {
                    "$vectorSearch": {
                        "index": self.index_name,
                        "path": "embedding",
                        "queryVector": query_embedding,
                        "numCandidates": k * 10,  # oversample, then keep top k
                        "limit": k,
                        "filter": base_filter,
                    }
                },
                # $vectorSearch does not return the similarity score by
                # default; surface it explicitly from the search metadata.
                {"$addFields": {"score": {"$meta": "vectorSearchScore"}}},
            ]
            results = list(self.collection.aggregate(pipeline))
            chunks = []
            for result in results:
                chunk = DocumentChunk(
                    content=result["text"],
                    embedding=result["embedding"],
                    doc_id=result["doc_id"],
                )
                chunk.score = result.get("score", 0)
                # Reattach the stored metadata to the chunk
                chunk.metadata = result.get("metadata", {})
                chunks.append(chunk)
            return chunks
        except Exception as e:
            print(f"Error querying similar documents: {str(e)}")
            return []
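

if __name__ == "__main__":
    # Minimal usage sketch, not part of the library: the connection details
    # below are placeholders, and it assumes DocumentChunk allows id and
    # metadata to be assigned after construction, just as query_similar()
    # assigns score and metadata above.
    store = MongoDBAtlasVectorStore(
        connection_string="mongodb+srv://user:pass@cluster.example.mongodb.net",
        database_name="morphik",
    )

    chunk = DocumentChunk(
        content="hello world",
        embedding=[0.0] * 1536,  # must match the index's numDimensions
        doc_id="doc-1",
    )
    chunk.id = "doc-1-chunk-0"
    chunk.metadata = {"owner_id": "user-123"}

    if store.store_embeddings([chunk]):
        for c in store.query_similar([0.0] * 1536, k=5, owner_id="user-123"):
            print(c.doc_id, getattr(c, "score", None), c.content[:80])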