Mirror of https://github.com/james-m-jordan/morphik-core.git (synced 2025-05-09 19:32:38 +00:00)

refactor some stuff (#2)

* refactor some stuff, remove bare try catches

This commit is contained in:
parent 0f492ed7c0
commit df8d7fcdd0

core/api.py (32 changed lines)
@@ -1,6 +1,6 @@
 import json
 from datetime import datetime, UTC
-from typing import List, Union
+from typing import Any, Dict, List, Optional, Union
 from fastapi import (
     FastAPI,
     Form,
@@ -16,10 +16,9 @@ from core.models.request import IngestTextRequest, QueryRequest
 from core.models.documents import (
     Document,
     DocumentResult,
-    ChunkResult,
-    EntityType
+    ChunkResult
 )
-from core.models.auth import AuthContext
+from core.models.auth import AuthContext, EntityType
 from core.services.document_service import DocumentService
 from core.config import get_settings
 from core.database.mongo_database import MongoDatabase
@@ -127,8 +126,6 @@ async def ingest_text(
         return await document_service.ingest_text(request, auth)
     except PermissionError as e:
         raise HTTPException(status_code=403, detail=str(e))
-    except Exception as e:
-        raise HTTPException(status_code=400, detail=str(e))


 @app.post("/ingest/file", response_model=Document)
@@ -141,13 +138,11 @@ async def ingest_file(
     try:
         metadata_dict = json.loads(metadata)
         doc = await document_service.ingest_file(file, metadata_dict, auth)
-        return doc  # Should just send a success response, not sure why we're sending a document #TODO: discuss with bhau
+        return doc  # TODO: Might be lighter on network to just send the document ID.
     except PermissionError as e:
         raise HTTPException(status_code=403, detail=str(e))
     except json.JSONDecodeError:
         raise HTTPException(400, "Invalid metadata JSON")
-    except Exception as e:
-        raise HTTPException(status_code=400, detail=str(e))


 @app.post("/query", response_model=Union[List[ChunkResult], List[DocumentResult]])
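For orientation, a quick way to exercise the `/ingest/file` route above from the outside. This is a sketch, not part of the commit: the host/port come from the local example URI elsewhere in this diff, the bearer-token header is an assumption about how `verify_token` expects credentials, and the response field relies on the default `Document` serialization (which does expose `external_id`). The `metadata` form field must be a JSON string, since the handler parses it with `json.loads`.

```python
# Sketch only: calling POST /ingest/file as refactored above.
import json

import httpx

with open("sample.pdf", "rb") as f:
    response = httpx.post(
        "http://127.0.0.1:8000/ingest/file",
        headers={"Authorization": "Bearer <token>"},  # auth scheme assumed
        files={"file": ("sample.pdf", f, "application/pdf")},
        data={"metadata": json.dumps({"category": "technical"})},
    )

response.raise_for_status()
print(response.json()["external_id"])
```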
@@ -156,24 +151,18 @@ async def query_documents(
     auth: AuthContext = Depends(verify_token)
 ):
     """Query documents with specified return type."""
-    try:
-        return await document_service.query(request, auth)
-    except Exception as e:
-        logger.error(f"Query failed: {str(e)}")
-        raise HTTPException(status_code=400, detail=str(e))
+    return await document_service.query(request, auth)


 @app.get("/documents", response_model=List[Document])
 async def list_documents(
     auth: AuthContext = Depends(verify_token),
     skip: int = 0,
-    limit: int = 100
+    limit: int = 100,
+    filters: Optional[Dict[str, Any]] = None
 ):
     """List accessible documents."""
-    try:
-        return await document_service.db.get_documents(auth, skip, limit)
-    except Exception as e:
-        raise HTTPException(status_code=400, detail=str(e))
+    return await document_service.db.get_documents(auth, skip, limit, filters)


 @app.get("/documents/{document_id}", response_model=Document)
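With the per-route try/except gone from `query_documents` and `list_documents`, unexpected errors now propagate out of the handlers. A minimal sketch of how the old 403/400 behaviour could be recovered centrally with FastAPI exception handlers, if that is ever wanted; the handler names and messages are illustrative and not part of this commit:

```python
import logging

from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse

logger = logging.getLogger(__name__)
app = FastAPI()

# Translate the domain errors the routes used to handle by hand into HTTP codes.
@app.exception_handler(PermissionError)
async def permission_error_handler(request: Request, exc: PermissionError):
    return JSONResponse(status_code=403, content={"detail": str(exc)})

@app.exception_handler(ValueError)
async def value_error_handler(request: Request, exc: ValueError):
    # e.g. "No content chunks extracted from text" raised during ingestion
    logger.error("Request to %s failed: %s", request.url.path, exc)
    return JSONResponse(status_code=400, content={"detail": str(exc)})
```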
@@ -189,6 +178,5 @@ async def get_document(
             raise HTTPException(status_code=404, detail="Document not found")
         return doc
-    except HTTPException as e:
-        raise e  # Return the HTTPException as is
     except Exception as e:
-        raise HTTPException(status_code=400, detail=str(e))
+        logger.error(f"Error getting document: {e}")
+        raise e
@@ -62,17 +62,14 @@ class BaseDatabase(ABC):
         Returns: Success status
         """
         pass

     @abstractmethod
-    async def find_documents(
+    async def find_authorized_and_filtered_documents(
         self,
         auth: AuthContext,
         filters: Optional[Dict[str, Any]] = None
     ) -> List[str]:
-        """
-        Find document IDs matching filters that user has access to.
-        Returns: List of document IDs
-        """
+        """Find document IDs matching filters that user has access to."""
         pass

     @abstractmethod
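The renamed hook bundles two concerns, authorization and metadata filtering, into one call. A hypothetical in-memory subclass might satisfy it as sketched below; this is illustration only (the document-dict shape is assumed, the other abstract methods are omitted, and the real implementation is the MongoDatabase override in the next hunk):

```python
from typing import Any, Dict, List, Optional

class InMemoryDatabase(BaseDatabase):  # hypothetical, for illustration only
    def __init__(self) -> None:
        # doc_id -> {"metadata": {...}, "access_control": {"readers": [...]}}
        self._docs: Dict[str, Dict[str, Any]] = {}

    async def find_authorized_and_filtered_documents(
        self,
        auth: AuthContext,
        filters: Optional[Dict[str, Any]] = None,
    ) -> List[str]:
        """Find document IDs matching filters that user has access to."""
        filters = filters or {}
        return [
            doc_id
            for doc_id, doc in self._docs.items()
            if auth.entity_id in doc["access_control"]["readers"]
            and all(doc["metadata"].get(k) == v for k, v in filters.items())
        ]
```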
@@ -152,7 +152,7 @@ class MongoDatabase(BaseDatabase):
             logger.error(f"Error deleting document: {str(e)}")
             return False

-    async def find_documents(
+    async def find_authorized_and_filtered_documents(
         self,
         auth: AuthContext,
         filters: Optional[Dict[str, Any]] = None
@@ -13,4 +13,5 @@ class AuthContext(BaseModel):
     entity_type: EntityType
     entity_id: str  # uuid
     app_id: Optional[str] = None  # uuid, only for developers
+    # TODO: remove permissions, not required here.
     permissions: Set[str] = {"read"}
@@ -7,10 +7,6 @@ import logging

 logger = logging.getLogger(__name__)

-class EntityType(str, Enum):
-    USER = "user"
-    DEVELOPER = "developer"
-

 class QueryReturnType(str, Enum):
     CHUNKS = "chunks"
@@ -68,7 +64,7 @@ class ChunkResult(BaseModel):

 class DocumentContent(BaseModel):
     """Represents either a URL or content string"""
-    type: Literal["url", "string"]
+    type: Literal["url", "string"]
     value: str
     filename: Optional[str] = Field(None, description="Filename when type is url")

@@ -2,14 +2,15 @@ from abc import ABC, abstractmethod
 from typing import List, Union
 from fastapi import UploadFile


 class BaseParser(ABC):
     """Base class for document parsing"""

     @abstractmethod
     async def split_text(self, text: str) -> List[str]:
         """Split plain text into chunks"""
         pass

     @abstractmethod
     async def parse_file(self, file: Union[UploadFile, bytes], content_type: str) -> List[str]:
         """Parse file content into text chunks"""
@@ -1,7 +1,7 @@
 from typing import List
 import io
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from unstructured.partition.auto import partition
+from langchain_unstructured import UnstructuredLoader
 import logging

 from .base_parser import BaseParser
@@ -26,30 +26,17 @@ class UnstructuredAPIParser(BaseParser):

     async def split_text(self, text: str) -> List[str]:
         """Split plain text into chunks"""
-        try:
-            return self.text_splitter.split_text(text)
-        except Exception as e:
-            logger.error(f"Failed to split text: {str(e)}")
-            raise
+        return self.text_splitter.split_text(text)

     async def parse_file(self, file: bytes, content_type: str) -> List[str]:
         """Parse file content using unstructured"""
-        try:
-            # Parse with unstructured
-            elements = partition(
-                file=io.BytesIO(file),
-                content_type=content_type,
-                api_key=self.api_key
-            )
-
-            # Extract text from elements
-            chunks = []
-            for element in elements:
-                if hasattr(element, 'text') and element.text:
-                    chunks.append(element.text.strip())
-
-            return chunks
-
-        except Exception as e:
-            logger.error(f"Failed to parse file: {str(e)}")
-            raise e
+        # Parse with unstructured
+        loader = UnstructuredLoader(
+            file=io.BytesIO(file),
+            content_type=content_type,
+            partition_via_api=True,
+            api_key=self.api_key,
+            chunking_strategy="by_title"
+        )
+        elements = loader.load()
+        return [element.page_content for element in elements]
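A short usage sketch for the reworked parser. The module path and the constructor arguments of UnstructuredAPIParser are assumptions (the constructor is not shown in this diff); only the `parse_file(file: bytes, content_type: str)` signature comes from the hunk above.

```python
# Sketch only: exercising the loader-based parse_file.
import asyncio

from core.parser.unstructured_parser import UnstructuredAPIParser  # path assumed

async def main() -> None:
    parser = UnstructuredAPIParser(api_key="...")  # constructor args assumed
    with open("sample.pdf", "rb") as f:
        pdf_bytes = f.read()
    chunks = await parser.parse_file(pdf_bytes, "application/pdf")
    print(f"{len(chunks)} chunks, first chunk starts with: {chunks[0][:80]!r}")

asyncio.run(main())
```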
@@ -45,53 +45,49 @@ class DocumentService:
     ) -> Document:
         """Ingest a text document."""
         if "write" not in auth.permissions:
             logger.error(f"User {auth.entity_id} does not have write permission")
             raise PermissionError("User does not have write permission")
-        try:
-            # 1. Create document record
-            doc = Document(
-                content_type="text/plain",
-                metadata=request.metadata,
-                owner={
-                    "type": auth.entity_type,
-                    "id": auth.entity_id
-                },
-                access_control={
-                    "readers": [auth.entity_id],
-                    "writers": {auth.entity_id},
-                    "admins": {auth.entity_id}
-                }
-            )
-            logger.info(f"Created text document record with ID {doc.external_id}")
+        # 1. Create document record
+        doc = Document(
+            content_type="text/plain",
+            metadata=request.metadata,
+            owner={
+                "type": auth.entity_type,
+                "id": auth.entity_id
+            },
+            access_control={
+                "readers": [auth.entity_id],
+                "writers": [auth.entity_id],
+                "admins": [auth.entity_id]
+            }
+        )
+        logger.info(f"Created text document record with ID {doc.external_id}")

-            # 2. Parse content into chunks
-            chunks = await self.parser.split_text(request.content)
-            if not chunks:
-                raise ValueError("No content chunks extracted from text")
-            logger.info(f"Split text into {len(chunks)} chunks")
+        # 2. Parse content into chunks
+        chunks = await self.parser.split_text(request.content)
+        if not chunks:
+            raise ValueError("No content chunks extracted from text")
+        logger.info(f"Split text into {len(chunks)} chunks")

-            # 3. Generate embeddings for chunks
-            embeddings = await self.embedding_model.embed_for_ingestion(chunks)
-            logger.info(f"Generated {len(embeddings)} embeddings")
+        # 3. Generate embeddings for chunks
+        embeddings = await self.embedding_model.embed_for_ingestion(chunks)
+        logger.info(f"Generated {len(embeddings)} embeddings")

-            # 4. Create and store chunk objects
-            chunk_objects = self._create_chunk_objects(
-                doc.external_id,
-                chunks,
-                embeddings,
-                doc.metadata
-            )
-            logger.info(f"Created {len(chunk_objects)} chunk objects")
+        # 4. Create and store chunk objects
+        chunk_objects = self._create_chunk_objects(
+            doc.external_id,
+            chunks,
+            embeddings,
+            doc.metadata
+        )
+        logger.info(f"Created {len(chunk_objects)} chunk objects")

-            # 5. Store everything
-            await self._store_chunks_and_doc(chunk_objects, doc)
-            logger.info(f"Successfully stored text document {doc.external_id}")
+        # 5. Store everything
+        await self._store_chunks_and_doc(chunk_objects, doc)
+        logger.info(f"Successfully stored text document {doc.external_id}")

-            return doc
-
-        except Exception as e:
-            logger.error(f"Text document ingestion failed: {str(e)}")
-            # TODO: Clean up any stored data on failure
-            raise e
+        return doc

     async def ingest_file(
         self,
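One side effect of the hunk above worth noting: the `writers` and `admins` values in `access_control` switch from set literals to lists. Sets are not JSON-serializable (and cannot be BSON-encoded either), so the list form is presumably what lets the document record be dumped and stored cleanly. A quick standalone illustration, not part of the commit:

```python
import json

# Sets cannot be serialized by the stdlib json module...
try:
    json.dumps({"writers": {"user-123"}})
except TypeError as exc:
    print(exc)  # Object of type set is not JSON serializable

# ...while the list form used by the new code round-trips cleanly.
print(json.dumps({"writers": ["user-123"]}))  # {"writers": ["user-123"]}
```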
@@ -102,68 +98,63 @@ class DocumentService:
         """Ingest a file document."""
         if "write" not in auth.permissions:
             raise PermissionError("User does not have write permission")
-        try:
-            # 1. Create document record
-            doc = Document(
-                content_type=file.content_type,
-                filename=file.filename,
-                metadata=metadata,
-                owner={
-                    "type": auth.entity_type,
-                    "id": auth.entity_id
-                },
-                access_control={
-                    "readers": [auth.entity_id],
-                    "writers": {auth.entity_id},
-                    "admins": {auth.entity_id}
-                }
-            )
-            logger.info(f"Created file document record with ID {doc.external_id}")
+        # 1. Create document record
+        doc = Document(
+            content_type=file.content_type,
+            filename=file.filename,
+            metadata=metadata,
+            owner={
+                "type": auth.entity_type,
+                "id": auth.entity_id
+            },
+            access_control={
+                "readers": [auth.entity_id],
+                "writers": [auth.entity_id],
+                "admins": [auth.entity_id]
+            }
+        )
+        logger.info(f"Created file document record with ID {doc.external_id}")

-            # 2. Read and store file
-            file_content = await file.read()
-            storage_info = await self.storage.upload_from_base64(
-                base64.b64encode(file_content).decode(),
-                doc.external_id,
-                file.content_type
-            )
-            doc.storage_info = {
-                "bucket": storage_info[0],
-                "key": storage_info[1]
-            }
-            logger.info(
-                f"Stored file in bucket `{storage_info[0]}` with key `{storage_info[1]}`"
-            )
+        # 2. Read and store file
+        file_content = await file.read()
+        storage_info = await self.storage.upload_from_base64(
+            base64.b64encode(file_content).decode(),
+            doc.external_id,
+            file.content_type
+        )
+        doc.storage_info = {
+            "bucket": storage_info[0],
+            "key": storage_info[1]
+        }
+        logger.info(
+            f"Stored file in bucket `{storage_info[0]}` with key `{storage_info[1]}`"
+        )

-            # 3. Parse content into chunks
-            chunks = await self.parser.parse_file(file_content, file.content_type)
-            if not chunks:
-                raise ValueError("No content chunks extracted from file")
-            logger.info(f"Parsed file into {len(chunks)} chunks")
+        # 3. Parse content into chunks
+        chunks = await self.parser.parse_file(file_content, file.content_type)
+        if not chunks:
+            raise ValueError("No content chunks extracted from file")
+        logger.info(f"Parsed file into {len(chunks)} chunks")

-            # 4. Generate embeddings for chunks
-            embeddings = await self.embedding_model.embed_for_ingestion(chunks)
-            logger.info(f"Generated {len(embeddings)} embeddings")
+        # 4. Generate embeddings for chunks
+        embeddings = await self.embedding_model.embed_for_ingestion(chunks)
+        logger.info(f"Generated {len(embeddings)} embeddings")

-            # 5. Create and store chunk objects
-            chunk_objects = self._create_chunk_objects(
-                doc.external_id,
-                chunks,
-                embeddings,
-                doc.metadata
-            )
-            logger.info(f"Created {len(chunk_objects)} chunk objects")
+        # 5. Create and store chunk objects
+        chunk_objects = self._create_chunk_objects(
+            doc.external_id,
+            chunks,
+            embeddings,
+            doc.metadata
+        )
+        logger.info(f"Created {len(chunk_objects)} chunk objects")

-            # 6. Store everything
-            doc.chunk_ids = await self._store_chunks_and_doc(chunk_objects, doc)
-            logger.info(f"Successfully stored file document {doc.external_id}")
+        # 6. Store everything
+        doc.chunk_ids = await self._store_chunks_and_doc(chunk_objects, doc)
+        logger.info(f"Successfully stored file document {doc.external_id}")

-            return doc
-
-        except Exception as e:
-            logger.error(f"File document ingestion failed: {str(e)}")
-            # TODO: Clean up any stored data on failure
-            raise
+        return doc

     async def query(
         self,
@@ -171,39 +162,37 @@ class DocumentService:
         auth: AuthContext
     ) -> Union[List[ChunkResult], List[DocumentResult]]:
         """Query documents with specified return type."""
-        try:
-            # 1. Get embedding for query
-            query_embedding = await self.embedding_model.embed_for_query(request.query)
-            logger.info("Generated query embedding")
+        # TODO: k does not make sense for Documents, it's about chunks.
+        # We should also look into document ordering. Figure these out.

-            # 2. Find authorized documents
-            doc_ids = await self.db.find_documents(auth, request.filters)
-            if not doc_ids:
-                logger.info("No authorized documents found")
-                return []
-            logger.info(f"Found {len(doc_ids)} authorized documents")
+        # 1. Get embedding for query
+        query_embedding = await self.embedding_model.embed_for_query(request.query)
+        logger.info("Generated query embedding")

-            # 3. Search chunks with vector similarity
-            chunks = await self.vector_store.query_similar(
-                query_embedding,
-                k=request.k,
-                doc_ids=doc_ids,
-            )
-            logger.info(f"Found {len(chunks)} similar chunks")
+        # 2. Find authorized documents
+        doc_ids = await self.db.find_authorized_and_filtered_documents(auth, request.filters)
+        if not doc_ids:
+            logger.info("No authorized documents found")
+            return []
+        logger.info(f"Found {len(doc_ids)} authorized documents")

-            # 4. Return results in requested format
-            if request.return_type == QueryReturnType.CHUNKS:
-                results = await self._create_chunk_results(auth, chunks)
-                logger.info(f"Returning {len(results)} chunk results")
-                return results
-            else:
-                results = await self._create_document_results(auth, chunks)
-                logger.info(f"Returning {len(results)} document results")
-                return results
+        # 3. Search chunks with vector similarity
+        chunks = await self.vector_store.query_similar(
+            query_embedding,
+            k=request.k,
+            doc_ids=doc_ids,
+        )
+        logger.info(f"Found {len(chunks)} similar chunks")

-        except Exception as e:
-            logger.error(f"Query failed: {str(e)}")
-            raise e
+        # 4. Return results in requested format
+        if request.return_type == QueryReturnType.CHUNKS:
+            results = await self._create_chunk_results(auth, chunks)
+            logger.info(f"Returning {len(results)} chunk results")
+            return results
+        else:
+            results = await self._create_document_results(auth, chunks)
+            logger.info(f"Returning {len(results)} document results")
+            return results

     def _create_chunk_objects(
         self,
@@ -240,8 +229,8 @@ class DocumentService:
         if not await self.db.store_document(doc):
             raise Exception("Failed to store document metadata")
         logger.debug("Stored document metadata in database")

-        return [str(id) for id in result.inserted_ids]
+        logger.debug(f"Chunk IDs stored: {result}")
+        return result

     async def _create_chunk_results(
         self,
@@ -1,24 +1,23 @@
 from abc import ABC, abstractmethod
 from typing import Tuple, Optional, Union, BinaryIO
 from pathlib import Path


 class BaseStorage(ABC):
     """Base interface for storage providers."""

     @abstractmethod
-    async def upload_file(self,
-                          file: Union[str, bytes, BinaryIO],
-                          key: str,
-                          content_type: Optional[str] = None) -> Tuple[str, str]:
+    async def upload_file(self,
+                          file: Union[str, bytes, BinaryIO],
+                          key: str,
+                          content_type: Optional[str] = None) -> Tuple[str, str]:
         """
         Upload a file to storage.

         Args:
             file: File content as string, bytes or file object
             key: Storage key/path for the file
             content_type: Optional MIME type

         Returns:
             Tuple[str, str]: (bucket/container name, storage key)
         """
@@ -26,17 +25,17 @@ class BaseStorage(ABC):

     @abstractmethod
     async def upload_from_base64(self,
-                                 content: str,
-                                 key: str,
-                                 content_type: Optional[str] = None) -> Tuple[str, str]:
+                                 content: str,
+                                 key: str,
+                                 content_type: Optional[str] = None) -> Tuple[str, str]:
         """
         Upload base64 encoded content.

         Args:
             content: Base64 encoded content
             key: Storage key/path
             content_type: Optional MIME type

         Returns:
             Tuple[str, str]: (bucket/container name, storage key)
         """
@@ -46,11 +45,11 @@ class BaseStorage(ABC):
     async def download_file(self, bucket: str, key: str) -> bytes:
         """
         Download file from storage.

         Args:
             bucket: Bucket/container name
             key: Storage key/path

         Returns:
             bytes: File content
         """
@@ -60,12 +59,12 @@ class BaseStorage(ABC):
     async def get_download_url(self, bucket: str, key: str, expires_in: int = 3600) -> str:
         """
         Get temporary download URL.

         Args:
             bucket: Bucket/container name
             key: Storage key/path
             expires_in: URL expiration in seconds

         Returns:
             str: Presigned download URL
         """
@@ -75,11 +74,11 @@ class BaseStorage(ABC):
     async def delete_file(self, bucket: str, key: str) -> bool:
         """
         Delete file from storage.

         Args:
             bucket: Bucket/container name
             key: Storage key/path

         Returns:
             bool: True if successful
         """
@@ -17,6 +17,7 @@ logger = logging.getLogger(__name__)
 class S3Storage(BaseStorage):
     """AWS S3 storage implementation."""

+    # TODO: Remove hardcoded values.
     def __init__(
         self,
         aws_access_key: str,
@@ -40,4 +40,4 @@ def detect_file_type(content: str) -> str:
         'application/msword': '.doc',
         'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx'
     }
-    return extension_map.get(detected_type, '.bin')
+    return extension_map.get(detected_type, '.bin')
@@ -6,7 +6,7 @@ from core.models.documents import DocumentChunk

 class BaseVectorStore(ABC):
     @abstractmethod
-    def store_embeddings(self, chunks: List[DocumentChunk]) -> Tuple[bool, Optional[Any]]:
+    def store_embeddings(self, chunks: List[DocumentChunk]) -> Tuple[bool, List[str]]:
         """Store document chunks and their embeddings"""
         pass

@@ -1,4 +1,4 @@
-from typing import List, Optional
+from typing import List, Optional, Tuple
 import logging
 from motor.motor_asyncio import AsyncIOMotorClient
 from pymongo.errors import PyMongoError
@@ -41,11 +41,11 @@ class MongoDBAtlasVectorStore(BaseVectorStore):
             logger.error(f"Error initializing vector store indexes: {str(e)}")
             return False

-    async def store_embeddings(self, chunks: List[DocumentChunk]) -> bool:
+    async def store_embeddings(self, chunks: List[DocumentChunk]) -> Tuple[bool, List[str]]:
         """Store document chunks with their embeddings."""
         try:
             if not chunks:
-                return True
+                return True, []

             # Convert chunks to dicts
             documents = []
@@ -65,12 +65,12 @@ class MongoDBAtlasVectorStore(BaseVectorStore):
             result = await self.collection.insert_many(
                 documents, ordered=False
             )
-            return len(result.inserted_ids) > 0, result
-            return False, None
+            return len(result.inserted_ids) > 0, [str(id) for id in result.inserted_ids]
+            return False, []

         except PyMongoError as e:
             logger.error(f"Error storing embeddings: {str(e)}")
-            return False
+            return False, []

     async def query_similar(
         self,
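`store_embeddings` now returns a `(success, chunk_ids)` pair instead of a bare bool or a raw pymongo result, which is what lets `DocumentService.ingest_file` set `doc.chunk_ids` above. A minimal sketch of how a caller consumes the new contract; it assumes `vector_store`, `chunk_objects` and `doc` are in scope as they are inside `DocumentService`, and the helper body of `_store_chunks_and_doc` is not shown in this diff:

```python
# Sketch: threading the stored chunk IDs back onto the document record.
success, chunk_ids = await vector_store.store_embeddings(chunk_objects)
if not success:
    raise Exception("Failed to store chunk embeddings")
doc.chunk_ids = chunk_ids  # stringified ObjectIds in the MongoDB implementation
```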
@@ -23,4 +23,4 @@ token = jwt.encode(payload, jwt_secret, algorithm="HS256")

 # Create URI
 uri = f"databridge://test_dev:{token}@127.0.0.1:8000"
-print(uri)
+print(uri)
@@ -1,10 +1,11 @@
-import json
 from typing import Dict, Any, List, Optional, Union, BinaryIO
 import httpx
 from urllib.parse import urlparse
 import jwt
 from pydantic import BaseModel
+import json
 from pathlib import Path
+from io import BytesIO


 class IngestTextRequest(BaseModel):
@@ -189,9 +190,10 @@ class DataBridge:
             file_path = Path(file)
             if not file_path.exists():
                 raise ValueError(f"File not found: {file}")
-            file_obj = open(file_path, "rb")
+            with open(file_path, "rb") as f:
+                content = f.read()
+            file_obj = BytesIO(content)
         elif isinstance(file, bytes):
-            from io import BytesIO
             file_obj = BytesIO(file)
         else:
             file_obj = file
@@ -201,7 +203,7 @@ class DataBridge:
         files = {
             "file": (filename, file_obj, content_type or "application/octet-stream")
         }

         # Add metadata
         data = {"metadata": json.dumps(metadata or {})}

@@ -227,17 +229,17 @@ class DataBridge:
     ) -> Union[List[ChunkResult], List[DocumentResult]]:
         """
         Query documents in DataBridge.

         Args:
             query: Search query text
             return_type: Type of results ("chunks" or "documents")
             filters: Optional metadata filters
             k: Number of results (default: 4)
             min_score: Minimum similarity threshold (default: 0.0)

         Returns:
             List[ChunkResult] or List[DocumentResult] depending on return_type

         Example:
             ```python
             # Query for chunks
@@ -246,7 +248,7 @@ class DataBridge:
                 return_type="chunks",
                 filters={"department": "research"}
             )

             # Query for documents
             docs = await db.query(
                 "machine learning",
@@ -264,7 +266,7 @@ class DataBridge:
         }

         response = await self._request("POST", "query", request)

         if return_type == "chunks":
             return [ChunkResult(**r) for r in response]
         return [DocumentResult(**r) for r in response]
@@ -272,15 +274,17 @@ class DataBridge:
     async def list_documents(
         self,
         skip: int = 0,
-        limit: int = 100
+        limit: int = 100,
+        filters: Optional[Dict[str, Any]] = None
     ) -> List[Document]:
         """
-        List accessible documents with pagination.
+        List accessible documents.

         Args:
             skip: Number of documents to skip
             limit: Maximum number of documents to return
+            filters: Optional filters

         Returns:
             List[Document]: List of accessible documents

@@ -290,12 +294,12 @@ class DataBridge:
             docs = await db.list_documents(limit=10)

             # Get next page
-            next_page = await db.list_documents(skip=10, limit=10)
+            next_page = await db.list_documents(skip=10, limit=10, filters={"department": "research"})
             ```
         """
         response = await self._request(
             "GET",
-            f"documents?skip={skip}&limit={limit}"
+            f"documents?skip={skip}&limit={limit}&filters={filters}"
         )
         return [Document(**doc) for doc in response]

@@ -24,16 +24,14 @@ class LocalDataBridge(DataBridge):
         self._base_url = self._base_url.replace("https://", "http://")


 # Get DataBridge URI from environment
 # Format: databridge://owner_id:token@host
-DATABRIDGE_URI = "databridge://test_dev:eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ0eXBlIjoiZGV2ZWxvcGVyIiwiZW50aXR5X2lkIjoidGVzdF9kZXYiLCJwZXJtaXNzaW9ucyI6WyJyZWFkIiwid3JpdGUiLCJhZG1pbiJdLCJleHAiOjE3MzU3ODk3NTd9.lYL1czdIeclMIl8BohTd3C9lpnvMi1gdNGpiec9sXv4@127.0.0.1:8000"
-print(f"DATABRIDGE_URI: {DATABRIDGE_URI}")
+# Can be generated using generate_local_uri.py
+DATABRIDGE_URI = ""


 async def demonstrate_text_ingestion(db: LocalDataBridge):
     """Demonstrate text document ingestion"""
     print("\n=== Text Ingestion Example ===")

     # Ingest a text document with metadata
     doc = await db.ingest_text(
         content=(
@@ -49,25 +47,25 @@ async def demonstrate_text_ingestion(db: LocalDataBridge):
             "difficulty": "intermediate"
         }
     )

     print("✓ Text document ingested")
     print(f"  ID: {doc.external_id}")
     print(f"  Content Type: {doc.content_type}")
     print(f"  Tags: {doc.metadata.get('tags')}")

     return doc.external_id


 async def demonstrate_file_ingestion(db: LocalDataBridge):
     """Demonstrate file document ingestion"""
     print("\n=== File Ingestion Example ===")

     # Create a sample PDF file
     pdf_path = Path("sample.pdf")
     if not pdf_path.exists():
         print("× Sample PDF not found, skipping file ingestion")
         return

     # Method 1: Ingest using file path
     doc1 = await db.ingest_file(
         file=pdf_path,
@@ -82,7 +80,7 @@ async def demonstrate_file_ingestion(db: LocalDataBridge):
     print("✓ File ingested from path")
     print(f"  ID: {doc1.external_id}")
     print(f"  Storage Info: {doc1.storage_info}")

     # Method 2: Ingest using file object
     with open(pdf_path, "rb") as f:
         doc2 = await db.ingest_file(
@@ -92,14 +90,14 @@ async def demonstrate_file_ingestion(db: LocalDataBridge):
         )
     print("✓ File ingested from file object")
     print(f"  ID: {doc2.external_id}")

     return doc1.external_id


 async def demonstrate_querying(db: LocalDataBridge, doc_id: str):
     """Demonstrate document querying"""
     print("\n=== Querying Example ===")

     # Query 1: Search for chunks
     print("\nSearching for chunks about machine learning:")
     chunks = await db.query(
@@ -108,12 +106,12 @@ async def demonstrate_querying(db: LocalDataBridge, doc_id: str):
         k=2,
         filters={"category": "technical"}
     )

     for chunk in chunks:
         print(f"\nChunk from document {chunk.document_id}")
         print(f"Score: {chunk.score:.2f}")
         print(f"Content: {chunk.content[:200]}...")

     # Query 2: Search for documents
     print("\nSearching for documents about deep learning:")
     docs = await db.query(
@@ -121,7 +119,7 @@ async def demonstrate_querying(db: LocalDataBridge, doc_id: str):
         return_type="documents",
         filters={"tags": ["ML", "AI"]}
     )

     for doc in docs:
         print(f"\nDocument: {doc.document_id}")
         print(f"Score: {doc.score:.2f}")
@@ -131,13 +129,13 @@ async def demonstrate_querying(db: LocalDataBridge, doc_id: str):
 async def demonstrate_document_management(db: LocalDataBridge):
     """Demonstrate document management operations"""
     print("\n=== Document Management Example ===")

     # List documents with pagination
     print("\nListing first 5 documents:")
     docs = await db.list_documents(limit=5)
     for doc in docs:
         print(f"- {doc.external_id}: {doc.metadata.get('title', 'Untitled')}")

     if docs:
         # Get specific document details
         doc_id = docs[0].external_id