Mirror of https://github.com/james-m-jordan/morphik-core.git (synced 2025-05-09 19:32:38 +00:00)

basic ingestion using unstructured and k-nearest retrieval works

This commit is contained in: parent db013e4713, commit 30577dc0ff
23 .gitignore vendored Normal file
@@ -0,0 +1,23 @@
# Python-related files
*__pycache__/
*.pyc
*.pyo
*.pyd
.Python
env/
.env
venv/*
ENV/
dist/
build/
*.egg-info/
.eggs/
*.egg
*.pytest_cache/


# Virtual environment
.venv/
.vscode/

*.DS_Store
0 __init__.py Normal file
9 base_embedding_model.py Normal file
@@ -0,0 +1,9 @@
from abc import ABC, abstractmethod
from typing import List, Union


class BaseEmbeddingModel(ABC):
    @abstractmethod
    async def embed(self, text: Union[str, List[str]]) -> List[float]:
        """Generate embeddings for input text"""
        pass
9 base_parser.py Normal file
@@ -0,0 +1,9 @@
from abc import ABC, abstractmethod
from typing import Dict, Any, List


class BaseParser(ABC):
    @abstractmethod
    def parse(self, content: str, metadata: Dict[str, Any]) -> List[str]:
        """Parse content into chunks"""
        pass
9 base_planner.py Normal file
@@ -0,0 +1,9 @@
from abc import ABC, abstractmethod
from typing import Dict, Any


class BasePlanner(ABC):
    @abstractmethod
    def plan_retrieval(self, query: str, **kwargs) -> Dict[str, Any]:
        """Create execution plan for retrieval"""
        pass
15 base_vector_store.py Normal file
@@ -0,0 +1,15 @@
from abc import ABC, abstractmethod
from typing import List
from document import DocumentChunk


class BaseVectorStore(ABC):
    @abstractmethod
    def store_embeddings(self, chunks: List[DocumentChunk]) -> bool:
        """Store document chunks and their embeddings"""
        pass

    @abstractmethod
    def query_similar(self, query_embedding: List[float], k: int, owner_id: str) -> List[DocumentChunk]:
        """Find similar chunks"""
        pass
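For reference, a minimal in-memory implementation of this interface could look like the sketch below. It is not part of the commit; the class name and cosine-similarity helper are illustrative only, and it assumes chunks carry the metadata dict with owner_id that DataBridge attaches during ingestion.

import math
from typing import List

from base_vector_store import BaseVectorStore
from document import DocumentChunk


class InMemoryVectorStore(BaseVectorStore):
    """Illustrative store: keeps chunks in a list and ranks them by cosine similarity."""

    def __init__(self):
        self._chunks: List[DocumentChunk] = []

    def store_embeddings(self, chunks: List[DocumentChunk]) -> bool:
        self._chunks.extend(chunks)
        return True

    def query_similar(self, query_embedding: List[float], k: int, owner_id: str) -> List[DocumentChunk]:
        def cosine(a: List[float], b: List[float]) -> float:
            dot = sum(x * y for x, y in zip(a, b))
            norm = math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b))
            return dot / norm if norm else 0.0

        # Only consider chunks belonging to the requesting owner (assumes chunk.metadata is set)
        candidates = [c for c in self._chunks
                      if getattr(c, "metadata", {}).get("owner_id") == owner_id]
        for c in candidates:
            c.score = cosine(query_embedding, c.embedding)
        return sorted(candidates, key=lambda c: c.score, reverse=True)[:k]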
171 core.py
@@ -1,171 +0,0 @@
from typing import List, Dict, Any, Optional
from abc import ABC, abstractmethod
from datetime import datetime
import uuid

# Base Classes and Interfaces

class Document:
    def __init__(self, content: str, metadata: Dict[str, Any], owner_id: str):
        self.id = str(uuid.uuid4())
        self.content = content
        self.metadata = metadata
        self.owner_id = owner_id
        self.created_at = datetime.utcnow()
        self.chunks: List[DocumentChunk] = []

class DocumentChunk:
    def __init__(self, content: str, embedding: List[float], doc_id: str):
        self.id = str(uuid.uuid4())
        self.content = content
        self.embedding = embedding
        self.doc_id = doc_id

class BaseParser(ABC):
    @abstractmethod
    def parse(self, content: str, metadata: Dict[str, Any]) -> List[str]:
        """Parse content into chunks"""
        pass


class BasePlanner(ABC):
    @abstractmethod
    def plan_retrieval(self, query: str, **kwargs) -> Dict[str, Any]:
        """Create execution plan for retrieval"""
        pass


class BaseVectorStore(ABC):
    @abstractmethod
    def store_embeddings(self, chunks: List[DocumentChunk]) -> bool:
        """Store document chunks and their embeddings"""
        pass

    @abstractmethod
    def query_similar(self, query_embedding: List[float], k: int, owner_id: str) -> List[DocumentChunk]:
        """Find similar chunks"""
        pass

# Concrete Implementations

class SimpleParser(BaseParser):
    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def parse(self, content: str, metadata: Dict[str, Any]) -> List[str]:
        # Simple implementation - split by chunk_size
        chunks = []
        for i in range(0, len(content), self.chunk_size - self.chunk_overlap):
            chunk = content[i:i + self.chunk_size]
            if chunk:
                chunks.append(chunk)
        return chunks

class SimpleRAGPlanner(BasePlanner):
    def __init__(self, k: int = 3):
        self.k = k

    def plan_retrieval(self, query: str, **kwargs) -> Dict[str, Any]:
        return {
            "strategy": "simple_rag",
            "k": kwargs.get("k", self.k),
            "query": query
        }

# Main DataBridge Class

class DataBridge:
    def __init__(
        self,
        parser: BaseParser,
        planner: BasePlanner,
        vector_store: BaseVectorStore,
        embedding_model: Any  # This would be your chosen embedding model
    ):
        self.parser = parser
        self.planner = planner
        self.vector_store = vector_store
        self.embedding_model = embedding_model

    async def ingest_document(
        self,
        content: str,
        metadata: Dict[str, Any],
        owner_id: str
    ) -> Document:
        # Create document
        doc = Document(content, metadata, owner_id)

        # Parse into chunks
        chunk_texts = self.parser.parse(content, metadata)

        # Create embeddings and chunks
        for chunk_text in chunk_texts:
            embedding = await self.embedding_model.embed(chunk_text)
            chunk = DocumentChunk(chunk_text, embedding, doc.id)
            doc.chunks.append(chunk)

        # Store in vector store
        success = self.vector_store.store_embeddings(doc.chunks)
        if not success:
            raise Exception("Failed to store embeddings")

        return doc

    async def query(
        self,
        query: str,
        owner_id: str,
        **kwargs
    ) -> List[Dict[str, Any]]:
        # Create plan
        plan = self.planner.plan_retrieval(query, **kwargs)

        # Get query embedding
        query_embedding = await self.embedding_model.embed(query)

        # Execute plan
        chunks = self.vector_store.query_similar(
            query_embedding,
            k=plan["k"],
            owner_id=owner_id
        )

        # Format results
        results = []
        for chunk in chunks:
            results.append({
                "content": chunk.content,
                "doc_id": chunk.doc_id,
                "chunk_id": chunk.id,
                "score": chunk.score if hasattr(chunk, "score") else None
            })

        return results

# Example usage
"""
# Initialize components
parser = SimpleParser()
planner = SimpleRAGPlanner()
vector_store = YourVectorStore()  # Implement with chosen backend
embedding_model = YourEmbeddingModel()  # Implement with chosen model

# Create DataBridge instance
db = DataBridge(parser, planner, vector_store, embedding_model)

# Ingest a document
doc = await db.ingest_document(
    content="Your document content here",
    metadata={"source": "pdf", "title": "Example Doc"},
    owner_id="user123"
)

# Query the system
results = await db.query(
    query="Your query here",
    owner_id="user123",
    k=5  # optional override
)
"""
106 databridge.py Normal file
@@ -0,0 +1,106 @@
from typing import Dict, Any, List
from databridge_uri import DataBridgeURI
from document import Document, DocumentChunk
from mongo_vector_store import MongoDBAtlasVectorStore
from openai_embedding_model import OpenAIEmbeddingModel
from unstructured_parser import UnstructuredAPIParser
from simple_planner import SimpleRAGPlanner


class DataBridge:
    """
    DataBridge with owner authentication and authorization.
    Configured via URI containing owner credentials.
    """

    def __init__(self, uri: str):
        # Parse URI and initialize configuration
        self.config = DataBridgeURI(uri)

        # Initialize components
        self._init_components()

    def _init_components(self):
        """Initialize all required components using the URI configuration"""
        self.embedding_model = OpenAIEmbeddingModel(
            api_key=self.config.openai_api_key,
            model_name=self.config.embedding_model
        )

        self.parser = UnstructuredAPIParser(
            api_key=self.config.unstructured_api_key,
            chunk_size=1000,
            chunk_overlap=200
        )

        self.vector_store = MongoDBAtlasVectorStore(
            connection_string=self.config.mongo_uri,
            database_name=self.config.db_name,
            collection_name=self.config.collection_name
        )

        self.planner = SimpleRAGPlanner(default_k=4)

    async def ingest_document(
        self,
        content: str,
        metadata: Dict[str, Any]
    ) -> Document:
        """
        Ingest a document using the owner ID from the URI configuration.
        """
        # Add owner_id to metadata
        metadata['owner_id'] = self.config.owner_id

        # Create document
        doc = Document(content, metadata, self.config.owner_id)

        # Parse into chunks
        chunk_texts = self.parser.parse(content, metadata)

        # Create embeddings and chunks
        for chunk_text in chunk_texts:
            embedding = await self.embedding_model.embed(chunk_text)
            chunk = DocumentChunk(chunk_text, embedding, doc.id)
            chunk.metadata = {'owner_id': self.config.owner_id}
            doc.chunks.append(chunk)

        # Store in vector store
        success = self.vector_store.store_embeddings(doc.chunks)
        if not success:
            raise Exception("Failed to store embeddings")

        return doc

    async def query(
        self,
        query: str,
        **kwargs
    ) -> List[Dict[str, Any]]:
        """
        Query the document store using the owner ID from the URI configuration.
        """
        # Create plan
        plan = self.planner.plan_retrieval(query, **kwargs)

        # Get query embedding
        query_embedding = await self.embedding_model.embed(query)

        # Execute plan
        chunks = self.vector_store.query_similar(
            query_embedding,
            k=plan["k"],
            owner_id=self.config.owner_id
        )

        # Format results
        results = []
        for chunk in chunks:
            results.append({
                "content": chunk.content,
                "doc_id": chunk.doc_id,
                "chunk_id": chunk.id,
                "score": chunk.score if hasattr(chunk, "score") else None
            })

        return results
61 databridge_uri.py Normal file
@@ -0,0 +1,61 @@
from urllib.parse import urlparse, parse_qs
from typing import Optional, Dict, Any
import os
import jwt
from datetime import datetime, timedelta


class DataBridgeURI:
    """
    Handles parsing and validation of DataBridge URIs with owner authentication
    Format: databridge://<owner_id>:<auth_token>@host/path?params
    """
    def __init__(self, uri: str):
        self.uri = uri
        self._parse_uri()

    def _parse_uri(self):
        parsed = urlparse(self.uri)
        query_params = parse_qs(parsed.query)

        # Parse authentication info from netloc
        auth_parts = parsed.netloc.split('@')[0].split(':')
        if len(auth_parts) != 2:
            raise ValueError("URI must include owner_id and auth_token")

        self.owner_id = auth_parts[0]
        self.auth_token = auth_parts[1]

        # Validate and decode auth token
        try:
            self._validate_auth_token()
        except Exception as e:
            raise ValueError(f"Invalid auth token: {str(e)}")

        # Get the original MongoDB URI from environment - use it as is
        self.mongo_uri = os.getenv("MONGODB_URI")
        if not self.mongo_uri:
            raise ValueError("MONGODB_URI environment variable not set")

        # Get configuration from query parameters
        self.openai_api_key = query_params.get('openai_key', [os.getenv('OPENAI_API_KEY', '')])[0]
        self.unstructured_api_key = query_params.get('unstructured_key', [os.getenv('UNSTRUCTURED_API_KEY', '')])[0]
        self.db_name = query_params.get('db', ['brandsyncaidb'])[0]
        self.collection_name = query_params.get('collection', ['kb_chunked_embeddings'])[0]
        self.embedding_model = query_params.get('embedding_model', ['text-embedding-3-small'])[0]

        # Validate required fields
        if not all([self.mongo_uri, self.openai_api_key, self.unstructured_api_key]):
            raise ValueError("Missing required configuration in DataBridge URI")

    def _validate_auth_token(self):
        """Validate the auth token and extract any additional claims"""
        try:
            decoded = jwt.decode(self.auth_token, 'your-secret-key', algorithms=['HS256'])
            if decoded.get('owner_id') != self.owner_id:
                raise ValueError("Token owner_id mismatch")
            self.auth_claims = decoded
        except jwt.ExpiredSignatureError:
            raise ValueError("Auth token has expired")
        except jwt.InvalidTokenError:
            raise ValueError("Invalid auth token")
21 document.py Normal file
@@ -0,0 +1,21 @@
from typing import Dict, Any, List
import uuid
from datetime import datetime


class Document:
    def __init__(self, content: str, metadata: Dict[str, Any], owner_id: str):
        self.id = str(uuid.uuid4())
        self.content = content
        self.metadata = metadata
        self.owner_id = owner_id
        self.created_at = datetime.utcnow()
        self.chunks: List[DocumentChunk] = []


class DocumentChunk:
    def __init__(self, content: str, embedding: List[float], doc_id: str):
        self.id = str(uuid.uuid4())
        self.content = content
        self.embedding = embedding
        self.doc_id = doc_id
111 mongo_vector_store.py Normal file
@@ -0,0 +1,111 @@
from typing import List, Dict, Any
from pymongo import MongoClient
from base_vector_store import BaseVectorStore
from document import DocumentChunk


class MongoDBAtlasVectorStore(BaseVectorStore):
    def __init__(
        self,
        connection_string: str,
        database_name: str,
        collection_name: str = "kb_chunked_embeddings",
        index_name: str = "vector_index"
    ):
        self.client = MongoClient(connection_string)
        self.db = self.client[database_name]
        self.collection = self.db[collection_name]
        self.index_name = index_name

        # Ensure vector search index exists
        # self._ensure_index()

    def _ensure_index(self):
        """Ensure the vector search index exists"""
        try:
            # Check if index exists
            indexes = self.collection.list_indexes()
            index_exists = any(index.get('name') == self.index_name for index in indexes)

            if not index_exists:
                # Create the vector search index if it doesn't exist
                self.collection.create_index(
                    [("embedding", "vectorSearch")],
                    name=self.index_name,
                    vectorSearchOptions={
                        "dimensions": 1536,  # For OpenAI embeddings
                        "similarity": "cosine"
                    }
                )
        except Exception as e:
            print(f"Warning: Could not create vector index: {str(e)}")

    def store_embeddings(self, chunks: List[DocumentChunk]) -> bool:
        try:
            documents = []
            for chunk in chunks:
                doc = {
                    "_id": chunk.id,  # Use chunk.id as MongoDB _id
                    "text": chunk.content,
                    "embedding": chunk.embedding,
                    "doc_id": chunk.doc_id,
                    "owner_id": chunk.metadata.get("owner_id"),
                    "metadata": chunk.metadata
                }
                documents.append(doc)

            if documents:
                # Use ordered=False to continue even if some inserts fail
                result = self.collection.insert_many(documents, ordered=False)
                return len(result.inserted_ids) > 0
            return True

        except Exception as e:
            print(f"Error storing embeddings: {str(e)}")
            return False

    def query_similar(
        self,
        query_embedding: List[float],
        k: int,
        owner_id: str,
        filters: Dict[str, Any] = None
    ) -> List[DocumentChunk]:
        """Find similar chunks using MongoDB Atlas Vector Search."""
        base_filter = {"owner_id": owner_id}
        if filters:
            base_filter.update(filters)

        try:
            pipeline = [
                {
                    "$vectorSearch": {
                        "index": self.index_name,
                        "path": "embedding",
                        "queryVector": query_embedding,
                        "numCandidates": k * 10,
                        "limit": k,
                        "filter": base_filter
                    }
                }
            ]

            results = list(self.collection.aggregate(pipeline))
            chunks = []

            for result in results:
                chunk = DocumentChunk(
                    content=result["text"],
                    embedding=result["embedding"],
                    doc_id=result["doc_id"]
                )
                chunk.score = result.get("score", 0)
                # Add metadata back to chunk
                chunk.metadata = result.get("metadata", {})
                chunks.append(chunk)

            return chunks

        except Exception as e:
            print(f"Error querying similar documents: {str(e)}")
            return []
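Since _ensure_index is currently disabled, query_similar assumes an Atlas Search index named vector_index already exists on the collection. A definition along the following lines would match the pipeline above; this is a sketch of the standard Atlas vector-index format (created via the Atlas UI or admin API), not code from this commit.

# Assumed Atlas Vector Search index definition for the "vector_index" used above.
vector_index_definition = {
    "fields": [
        {
            "type": "vector",
            "path": "embedding",      # same field queried by $vectorSearch
            "numDimensions": 1536,    # output size of text-embedding-3-small
            "similarity": "cosine",
        },
        # owner_id must be indexed as a filter field for the pre-filter above to apply
        {"type": "filter", "path": "owner_id"},
    ]
}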
23 openai_embedding_model.py Normal file
@@ -0,0 +1,23 @@
from typing import List, Union
import openai
from base_embedding_model import BaseEmbeddingModel


class OpenAIEmbeddingModel(BaseEmbeddingModel):
    def __init__(self, api_key: str, model_name: str = "text-embedding-3-small"):
        self.client = openai.Client(api_key=api_key)
        self.model_name = model_name

    async def embed(self, text: Union[str, List[str]]) -> List[float]:
        if isinstance(text, str):
            text = [text]

        response = self.client.embeddings.create(
            model=self.model_name,
            input=text
        )

        if len(text) == 1:
            return response.data[0].embedding

        return [item.embedding for item in response.data]
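Note that embed is declared async but calls the synchronous client, so each request blocks the event loop. A non-blocking variant could use the async client that the installed openai package also provides; a sketch, not part of the commit:

import openai


class AsyncOpenAIEmbeddingModel:
    """Illustrative async variant: same interface, but awaits the HTTP call."""

    def __init__(self, api_key: str, model_name: str = "text-embedding-3-small"):
        self.client = openai.AsyncClient(api_key=api_key)
        self.model_name = model_name

    async def embed(self, text):
        inputs = [text] if isinstance(text, str) else text
        response = await self.client.embeddings.create(model=self.model_name, input=inputs)
        if len(inputs) == 1:
            return response.data[0].embedding
        return [item.embedding for item in response.data]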
141 requirements.txt Normal file
@@ -0,0 +1,141 @@
aiohappyeyeballs==2.4.3
aiohttp==3.11.2
aiosignal==1.3.1
annotated-types==0.7.0
antlr4-python3-runtime==4.9.3
anyio==4.6.2.post1
attrs==24.2.0
backoff==2.2.1
beautifulsoup4==4.12.3
cachetools==5.5.0
certifi==2024.8.30
cffi==1.17.1
chardet==5.2.0
charset-normalizer==3.4.0
click==8.1.7
coloredlogs==15.0.1
contourpy==1.3.1
cryptography==43.0.3
cycler==0.12.1
dataclasses-json==0.6.7
Deprecated==1.2.14
distro==1.9.0
dnspython==2.7.0
effdet==0.4.1
emoji==2.14.0
eval_type_backport==0.2.0
filelock==3.16.1
filetype==1.2.0
flatbuffers==24.3.25
fonttools==4.55.0
frozenlist==1.5.0
fsspec==2024.10.0
google-api-core==2.23.0
google-auth==2.36.0
google-cloud-vision==3.8.1
googleapis-common-protos==1.66.0
grpcio==1.67.1
grpcio-status==1.67.1
h11==0.14.0
html5lib==1.1
httpcore==1.0.6
httpx==0.27.2
huggingface-hub==0.26.2
humanfriendly==10.0
idna==3.10
iopath==0.1.10
Jinja2==3.1.4
jiter==0.7.1
joblib==1.4.2
jsonpatch==1.33
jsonpath-python==1.0.6
jsonpointer==3.0.0
jwt==1.3.1
kiwisolver==1.4.7
langchain==0.3.7
langchain-core==0.3.18
langchain-text-splitters==0.3.2
langdetect==1.0.9
langsmith==0.1.143
layoutparser==0.3.4
lxml==5.3.0
MarkupSafe==3.0.2
marshmallow==3.23.1
matplotlib==3.9.2
mpmath==1.3.0
multidict==6.1.0
mypy-extensions==1.0.0
nest-asyncio==1.6.0
networkx==3.4.2
nltk==3.9.1
numpy==1.26.4
olefile==0.47
omegaconf==2.3.0
onnx==1.17.0
onnxruntime==1.20.0
openai==1.54.4
opencv-python==4.10.0.84
orjson==3.10.11
packaging==24.2
pandas==2.2.3
pdf2image==1.17.0
pdfminer.six==20231228
pdfplumber==0.11.4
pi_heif==0.20.0
pikepdf==9.4.1
pillow==11.0.0
portalocker==2.10.1
propcache==0.2.0
proto-plus==1.25.0
protobuf==5.28.3
psutil==6.1.0
pyasn1==0.6.1
pyasn1_modules==0.4.1
pycocotools==2.0.8
pycparser==2.22
pydantic==2.9.2
pydantic_core==2.23.4
PyJWT==2.9.0
pymongo==4.10.1
pyparsing==3.2.0
pypdf==5.1.0
pypdfium2==4.30.0
python-dateutil==2.8.2
python-dotenv==1.0.1
python-iso639==2024.10.22
python-magic==0.4.27
python-multipart==0.0.17
python-oxmsg==0.0.1
pytz==2024.2
PyYAML==6.0.2
RapidFuzz==3.10.1
regex==2024.11.6
requests==2.32.3
requests-toolbelt==1.0.0
rsa==4.9
safetensors==0.4.5
scipy==1.14.1
setuptools==75.5.0
six==1.16.0
sniffio==1.3.1
soupsieve==2.6
SQLAlchemy==2.0.36
sympy==1.13.1
tenacity==9.0.0
timm==1.0.11
tokenizers==0.20.3
torch==2.5.1
torchvision==0.20.1
tqdm==4.67.0
transformers==4.46.2
typing-inspect==0.9.0
typing_extensions==4.12.2
tzdata==2024.2
unstructured==0.16.5
unstructured-client==0.27.0
unstructured-inference==0.8.1
unstructured.pytesseract==0.3.13
urllib3==2.2.3
webencodings==0.5.1
wrapt==1.16.0
yarl==1.17.1
BIN sample.pdf Normal file
Binary file not shown.
72 sanity_checks/mongo.py Normal file
@@ -0,0 +1,72 @@
from pymongo import MongoClient
from dotenv import load_dotenv
import os
import datetime


def test_mongo_operations():
    # Load environment variables
    load_dotenv()

    # Get MongoDB URI from environment variable
    mongo_uri = os.getenv("MONGODB_URI")
    if not mongo_uri:
        raise ValueError("MONGODB_URI environment variable not set")

    try:
        # Connect to MongoDB
        client = MongoClient(mongo_uri)

        # Test connection
        client.admin.command('ping')
        print("✅ Connected successfully to MongoDB")

        # Get database and collection
        db = client.brandsyncaidb  # Using a test database
        collection = db.kb_chunked_embeddings

        # Insert a single document
        test_doc = {
            "name": "Test Document",
            "timestamp": datetime.datetime.now(),
            "value": 42
        }

        result = collection.insert_one(test_doc)
        print(f"✅ Inserted document with ID: {result.inserted_id}")

        # Insert multiple documents
        test_docs = [
            {"name": "Doc 1", "value": 1},
            {"name": "Doc 2", "value": 2},
            {"name": "Doc 3", "value": 3}
        ]

        result = collection.insert_many(test_docs)
        print(f"✅ Inserted {len(result.inserted_ids)} documents")

        # Retrieve documents
        print("\nRetrieving documents:")
        for doc in collection.find():
            print(f"Found document: {doc}")

        # Find specific documents
        print("\nFinding documents with value >= 2:")
        query = {"value": {"$gte": 2}}
        for doc in collection.find(query):
            print(f"Found document: {doc}")

        # Clean up - delete all test documents
        # DON'T DELETE IF IT'S BRANDSYNCAI
        # result = collection.delete_many({})
        # print(f"\n✅ Cleaned up {result.deleted_count} test documents")  # re-enable together with delete_many above

    except Exception as e:
        print(f"❌ Error: {str(e)}")
    finally:
        client.close()
        print("\n✅ Connection closed")


if __name__ == "__main__":
    test_mongo_operations()
74 simple_example.py Normal file
@@ -0,0 +1,74 @@
from datetime import datetime, timedelta, UTC  # Note: using UTC for timezone awareness
import base64
from databridge import DataBridge
import jwt
import os
from dotenv import load_dotenv


def create_databridge_uri() -> str:
    """Create DataBridge URI from environment variables"""
    load_dotenv()

    # Get credentials from environment
    mongo_uri = os.getenv("MONGODB_URI")
    openai_key = os.getenv("OPENAI_API_KEY")
    unstructured_key = os.getenv("UNSTRUCTURED_API_KEY")
    owner_id = os.getenv("DATABRIDGE_OWNER", "admin")

    # Validate required credentials
    if not all([mongo_uri, openai_key, unstructured_key]):
        raise ValueError("Missing required environment variables")

    # Generate auth token
    auth_token = jwt.encode(
        {
            'owner_id': owner_id,
            'exp': datetime.now(UTC) + timedelta(days=30)
        },
        'your-secret-key',  # In production, use proper secret
        algorithm='HS256'
    )

    # For DataBridge URI, use any host identifier (it won't affect MongoDB connection)
    uri = (
        f"databridge://{owner_id}:{auth_token}@databridge.local"
        f"?openai_key={openai_key}"
        f"&unstructured_key={unstructured_key}"
        f"&db=brandsyncaidb"
        f"&collection=kb_chunked_embeddings"
    )

    return uri


async def main():
    # Initialize DataBridge
    bridge = DataBridge(create_databridge_uri())

    # Example: Ingest a PDF document
    with open("sample.pdf", "rb") as f:
        pdf_content = base64.b64encode(f.read()).decode()

    await bridge.ingest_document(
        content=pdf_content,
        metadata={
            "content_type": "application/pdf",
            "is_base64": True,
            "title": "Sample PDF"
        }
    )

    # Example: Query documents
    results = await bridge.query(
        query="What is machine learning?",
        k=4
    )

    for result in results:
        print(f"Content: {result['content'][:200]}...")
        print(f"Score: {result['score']}\n")

if __name__ == "__main__":
    import asyncio
    asyncio.run(main())
17 simple_planner.py Normal file
@@ -0,0 +1,17 @@
from typing import Dict, Any
from base_planner import BasePlanner


class SimpleRAGPlanner(BasePlanner):
    def __init__(self, default_k: int = 3):
        self.default_k = default_k

    def plan_retrieval(self, query: str, **kwargs) -> Dict[str, Any]:
        """Create a simple retrieval plan."""
        return {
            "strategy": "simple_retrieval",
            "k": kwargs.get("k", self.default_k),
            "query": query,
            "filters": kwargs.get("filters", {}),
            "min_score": kwargs.get("min_score", 0.0)
        }
72 unstructured_parser.py Normal file
@@ -0,0 +1,72 @@
from typing import Dict, Any, List
from base_parser import BaseParser
from unstructured.partition.auto import partition
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
import tempfile
import base64


class UnstructuredAPIParser(BaseParser):
    def __init__(
        self,
        api_key: str,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        api_url: str = "https://api.unstructuredapp.io"
    ):
        self.api_key = api_key
        self.api_url = api_url
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", ". ", " ", ""]
        )

    def parse(self, content: str, metadata: Dict[str, Any]) -> List[str]:
        """Parse content using Unstructured API and split into chunks."""
        try:
            # Create temporary file for content
            with tempfile.NamedTemporaryFile(delete=False, suffix=self._get_file_extension(metadata)) as temp_file:
                if metadata.get("is_base64", False):
                    temp_file.write(base64.b64decode(content))
                else:
                    temp_file.write(content.encode('utf-8'))
                temp_file_path = temp_file.name

            try:
                # Use Unstructured API for parsing
                elements = partition(
                    filename=temp_file_path,
                    api_key=self.api_key,
                    api_url=self.api_url,
                    partition_via_api=True
                )

                # Combine elements and split into chunks
                full_text = "\n\n".join(str(element) for element in elements)
                chunks = self.text_splitter.split_text(full_text)

                return chunks
            finally:
                # Clean up temporary file
                os.unlink(temp_file_path)

        except Exception as e:
            raise Exception(f"Error parsing document: {str(e)}")

    def _get_file_extension(self, metadata: Dict[str, Any]) -> str:
        """Get appropriate file extension based on content type."""
        content_type_mapping = {
            'application/pdf': '.pdf',
            'application/msword': '.doc',
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
            'image/jpeg': '.jpg',
            'image/png': '.png',
            'text/plain': '.txt',
            'text/html': '.html'
        }
        return content_type_mapping.get(metadata.get('content_type'), '.txt')