# morphik-core/shell.py

#!/usr/bin/env python3
"""
DataBridge interactive CLI.
Assumes a DataBridge server is running.

Usage:
    Without authentication (connects to localhost):
        python shell.py
    
    With authentication:
        python shell.py <uri>
        Example: python shell.py "databridge://user:token@localhost:8000"

This provides the exact same interface as the Python SDK:
    db.ingest_text("content", metadata={...})
    db.ingest_file("path/to/file")
    db.query("what are the key findings?")
    etc...
"""

import sys
from pathlib import Path
import time
from typing import Any, Dict, List, Optional
import requests

# Add local SDK to path before other imports
_SDK_PATH = str(Path(__file__).parent / "sdks" / "python")
if _SDK_PATH not in sys.path:
    sys.path.insert(0, _SDK_PATH)

from databridge import DataBridge  # noqa: E402


class DB:
    """
    Shell-friendly wrapper around the DataBridge SDK client.

    Each method forwards to the method of the same name on the underlying
    client and, where the client returns model objects, converts them with
    ``model_dump()`` so results display as plain data in the interactive shell.
    """

    # Address probed by the health check when no usable URI is supplied.
    _DEFAULT_BASE_URL = "http://localhost:8000"

    def __init__(self, uri: Optional[str] = None):
        """
        Initialize DataBridge with optional URI.

        Args:
            uri: Optional connection URI, e.g.
                 "databridge://user:token@localhost:8000". When omitted the
                 client is created with ``uri=None``.
        """
        self._client = DataBridge(uri, is_local=True, timeout=1000)
        # For health check only. Derived from the URI so that the probe hits
        # the same host/port the client was pointed at instead of always
        # probing localhost.
        self.base_url = self._health_base_url(uri)

    @classmethod
    def _health_base_url(cls, uri: Optional[str]) -> str:
        """
        Return the base URL used by check_health() for the given URI.

        Falls back to the localhost default when the URI is missing or has no
        host part.
        """
        if not uri:
            return cls._DEFAULT_BASE_URL

        from urllib.parse import urlparse

        # netloc looks like "user:token@host:port"; keep what follows the
        # last "@" (or the whole netloc when there is no auth section).
        host = urlparse(uri).netloc.rpartition("@")[2]
        if not host:
            return cls._DEFAULT_BASE_URL
        # NOTE(review): plain HTTP is assumed because the client is created
        # with is_local=True -- confirm this matches the SDK's own scheme choice.
        return f"http://{host}"

    def check_health(self, max_retries=30, retry_interval=1) -> bool:
        """
        Check if DataBridge server is healthy with retries.

        Args:
            max_retries: Maximum number of GET attempts against ``/health``
            retry_interval: Seconds to sleep between attempts

        Returns:
            True as soon as an attempt answers with HTTP 200, False once all
            attempts have been used up.
        """
        health_url = f"{self.base_url}/health"

        for attempt in range(max_retries):
            try:
                response = requests.get(health_url, timeout=5)
                if response.status_code == 200:
                    return True
            except requests.exceptions.RequestException:
                # Deliberate best-effort: a failed request just counts as an
                # unsuccessful attempt and we retry below.
                pass

            # No message or sleep after the final attempt.
            if attempt < max_retries - 1:
                print(
                    f"Waiting for DataBridge server to be ready... (attempt {attempt + 1}/{max_retries})"
                )
                time.sleep(retry_interval)

        return False

    def ingest_text(
        self,
        content: str,
        metadata: Optional[Dict[str, Any]] = None,
        rules: Optional[List[Dict[str, Any]]] = None,
        use_colpali: bool = True,
    ) -> dict:
        """
        Ingest text content into DataBridge.

        Args:
            content: Text content to ingest
            metadata: Optional metadata dictionary (an empty dict is sent when omitted)
            rules: Optional list of rule objects. Examples:
                  [{"type": "metadata_extraction", "schema": {"name": "string"}},
                   {"type": "natural_language", "prompt": "Remove PII"}]
            use_colpali: Whether to use ColPali-style embedding model to ingest the text

        Returns:
            The ingested document as returned by the client, dumped via model_dump()
        """
        doc = self._client.ingest_text(
            content, metadata=metadata or {}, rules=rules, use_colpali=use_colpali
        )
        return doc.model_dump()

    def ingest_file(
        self,
        file: str,
        filename: Optional[str] = None,
        metadata: Optional[dict] = None,
        rules: Optional[List[Dict[str, Any]]] = None,
        use_colpali: bool = True,
    ) -> dict:
        """
        Ingest a file into DataBridge.

        Args:
            file: Path to file to ingest
            filename: Optional filename (defaults to basename of file path)
            metadata: Optional metadata dictionary (an empty dict is sent when omitted)
            rules: Optional list of rule objects. Examples:
                  [{"type": "metadata_extraction", "schema": {"title": "string"}},
                   {"type": "natural_language", "prompt": "Summarize"}]
            use_colpali: Whether to use ColPali-style embedding model to ingest the file

        Returns:
            The ingested document as returned by the client, dumped via model_dump()
        """
        file_path = Path(file)
        filename = filename or file_path.name
        doc = self._client.ingest_file(
            file=file_path,
            filename=filename,
            metadata=metadata or {},
            rules=rules,
            use_colpali=use_colpali,
        )
        return doc.model_dump()

    def retrieve_chunks(
        self,
        query: str,
        filters: Optional[dict] = None,
        k: int = 4,
        min_score: float = 0.0,
        use_colpali: bool = True,
    ) -> list:
        """
        Search for relevant chunks

        Args:
            query: Search query text
            filters: Optional metadata filters
            k: Number of results (default: 4)
            min_score: Minimum similarity threshold (default: 0.0)
            use_colpali: Whether to use ColPali-style embedding model for retrieval

        Returns:
            One model_dump() result per chunk returned by the client
        """
        results = self._client.retrieve_chunks(
            query, filters=filters or {}, k=k, min_score=min_score, use_colpali=use_colpali
        )
        return [r.model_dump() for r in results]

    def retrieve_docs(
        self,
        query: str,
        filters: Optional[dict] = None,
        k: int = 4,
        min_score: float = 0.0,
        use_colpali: bool = True,
    ) -> list:
        """
        Retrieve relevant documents

        Args:
            query: Search query text
            filters: Optional metadata filters
            k: Number of results (default: 4)
            min_score: Minimum similarity threshold (default: 0.0)
            use_colpali: Whether to use ColPali-style embedding model for retrieval

        Returns:
            One model_dump() result per document returned by the client
        """
        results = self._client.retrieve_docs(
            query, filters=filters or {}, k=k, min_score=min_score, use_colpali=use_colpali
        )
        return [r.model_dump() for r in results]

    def query(
        self,
        query: str,
        filters: Optional[dict] = None,
        k: int = 4,
        min_score: float = 0.0,
        max_tokens: Optional[int] = None,
        temperature: Optional[float] = None,
        use_colpali: bool = True,
    ) -> dict:
        """
        Generate completion using relevant chunks as context

        Args:
            query: Query text
            filters: Optional metadata filters
            k: Number of chunks to use as context (default: 4)
            min_score: Minimum similarity threshold (default: 0.0)
            max_tokens: Maximum tokens in completion
            temperature: Model temperature
            use_colpali: Whether to use ColPali-style embedding model for retrieval

        Returns:
            The client's response, dumped via model_dump()
        """
        response = self._client.query(
            query,
            filters=filters or {},
            k=k,
            min_score=min_score,
            max_tokens=max_tokens,
            temperature=temperature,
            use_colpali=use_colpali,
        )
        return response.model_dump()

    def list_documents(
        self, skip: int = 0, limit: int = 100, filters: Optional[dict] = None
    ) -> list:
        """
        List accessible documents

        Args:
            skip: Forwarded to the client as ``skip`` (default: 0)
            limit: Forwarded to the client as ``limit`` (default: 100)
            filters: Optional filters (an empty dict is sent when omitted)

        Returns:
            One model_dump() result per document returned by the client
        """
        docs = self._client.list_documents(skip=skip, limit=limit, filters=filters or {})
        return [doc.model_dump() for doc in docs]

    def get_document(self, document_id: str) -> dict:
        """Get document metadata by ID"""
        doc = self._client.get_document(document_id)
        return doc.model_dump()

    def batch_get_documents(self, document_ids: List[str]) -> List[dict]:
        """
        Retrieve multiple documents by their IDs in a single batch operation.

        Args:
            document_ids: List of document IDs to retrieve

        Returns:
            List of document metadata
        """
        docs = self._client.batch_get_documents(document_ids)
        return [doc.model_dump() for doc in docs]

    def batch_get_chunks(self, sources: List[dict]) -> List[dict]:
        """
        Retrieve specific chunks by their document ID and chunk number in a single batch operation.

        Args:
            sources: List of dictionaries with document_id and chunk_number fields

        Returns:
            List of chunk results

        Example:
            sources = [
                {"document_id": "doc_123", "chunk_number": 0},
                {"document_id": "doc_456", "chunk_number": 2}
            ]
        """
        chunks = self._client.batch_get_chunks(sources)
        return [chunk.model_dump() for chunk in chunks]

    def create_cache(
        self,
        name: str,
        model: str,
        gguf_file: str,
        filters: Optional[dict] = None,
        docs: Optional[list] = None,
    ) -> dict:
        """
        Create a new cache with specified configuration

        Args:
            name: Forwarded to the client as ``name``
            model: Forwarded to the client as ``model``
            gguf_file: Forwarded to the client as ``gguf_file``
            filters: Optional filters (an empty dict is sent when omitted)
            docs: Optional list forwarded to the client as ``docs``

        Returns:
            The client's response, unchanged
        """
        response = self._client.create_cache(
            name=name,
            model=model,
            gguf_file=gguf_file,
            filters=filters or {},
            docs=docs,
        )
        return response

    def get_cache(self, name: str) -> "Cache":
        """
        Get a cache by name

        NOTE(review): this returns whatever the client's get_cache() returns,
        not an instance of the `Cache` wrapper defined in this module, even
        though the annotation names that wrapper -- confirm which is intended.
        """
        return self._client.get_cache(name)

    def close(self):
        """Close the client connection"""
        self._client.close()


class Cache:
    """
    Wrapper around the cache object obtained from a DB's underlying client.

    `update` and `add_docs` pass the client cache's result straight through;
    `query` converts the response with model_dump().
    """

    def __init__(self, db: DB, name: str):
        """
        Look up the cache called `name` through `db`'s client and wrap it.

        Args:
            db: DB instance whose client is used for the lookup
            name: Name of the cache to fetch
        """
        self._db, self._name = db, name
        sdk_client = db._client
        self._client_cache = sdk_client.get_cache(name)

    def update(self) -> bool:
        """Update the cache"""
        outcome = self._client_cache.update()
        return outcome

    def add_docs(self, docs: list) -> bool:
        """Add documents to the cache"""
        outcome = self._client_cache.add_docs(docs)
        return outcome

    def query(self, query: str, max_tokens: int = None, temperature: float = None) -> dict:
        """
        Query the cache

        Args:
            query: Query text
            max_tokens: Forwarded to the client cache as ``max_tokens``
            temperature: Forwarded to the client cache as ``temperature``

        Returns:
            The client cache's response, dumped via model_dump()
        """
        call_kwargs = {
            "query": query,
            "max_tokens": max_tokens,
            "temperature": temperature,
        }
        return self._client_cache.query(**call_kwargs).model_dump()


if __name__ == "__main__":
    # The optional first command-line argument is the connection URI.
    uri = sys.argv[1] if len(sys.argv) > 1 else None
    db = DB(uri)

    try:
        # Check server health
        if not db.check_health():
            print("Error: Could not connect to DataBridge server")
            sys.exit(1)

        print("\nConnected to DataBridge")

        # Start an interactive Python shell with 'db' already imported
        import code

        try:
            import readline  # Enable arrow key history
        except ImportError:
            # readline is not shipped on every platform (e.g. standard Windows
            # builds). The console still works without it, just without
            # history and tab completion, so do not abort.
            pass
        else:
            import rlcompleter  # noqa: F401 # Enable tab completion

            # libedit-backed readline does not understand the GNU binding
            # syntax, so pick the binding that matches the backend.
            if "libedit" in (readline.__doc__ or ""):
                readline.parse_and_bind("bind ^I rl_complete")
            else:
                readline.parse_and_bind("tab: complete")

        # Create the interactive shell. At module level locals() is the
        # module namespace, so 'db' (and DB/Cache) are visible inside it.
        shell = code.InteractiveConsole(locals())

        # Print welcome message
        print("\nDataBridge CLI ready to use. The 'db' object is available with all SDK methods.")
        print("Examples:")
        print("  db.ingest_text('hello world')")
        print("  db.query('what are the key findings?')")
        print("  db.batch_get_documents(['doc_id1', 'doc_id2'])")
        print("  db.batch_get_chunks([{'document_id': 'doc_123', 'chunk_number': 0}])")
        print("  result = db.query('how to use this API?'); print(result['sources'])")
        print("Type help(db) for documentation.")

        # Start the shell
        shell.interact(banner="")
    finally:
        # Release the client connection on every exit path: failed health
        # check, normal end of the session, or exit()/Ctrl-D inside the shell.
        db.close()
Add open telemetry and shell (#5) 2024-12-31 10:22:25 +05:30			`#!/usr/bin/env python3`
			`"""`
			`DataBridge interactive CLI.`
			`Assumes a DataBridge server is running.`

			`Usage:`
Streamline dev experience with optional auth and simplified config (#27) 2025-01-11 21:54:00 +05:30			`Without authentication (connects to localhost):`
			`python shell.py`

			`With authentication:`
			`python shell.py <uri>`
			`Example: python shell.py "databridge://user:token@localhost:8000"`
Add open telemetry and shell (#5) 2024-12-31 10:22:25 +05:30
			`This provides the exact same interface as the Python SDK:`
			`db.ingest_text("content", metadata={...})`
			`db.ingest_file("path/to/file")`
			`db.query("what are the key findings?")`
			`etc...`
			`"""`

			`import sys`
			`from pathlib import Path`
Add docker support (#24) 2025-01-09 15:47:25 +05:30			`import time`
Add natural language rules based ingestion (#34) 2025-02-07 21:08:40 -05:00			`from typing import Any, Dict, List, Optional`
Add docker support (#24) 2025-01-09 15:47:25 +05:30			`import requests`
Add open telemetry and shell (#5) 2024-12-31 10:22:25 +05:30
			`# Add local SDK to path before other imports`
			`_SDK_PATH = str(Path(__file__).parent / "sdks" / "python")`
			`if _SDK_PATH not in sys.path:`
			`sys.path.insert(0, _SDK_PATH)`

			`from databridge import DataBridge # noqa: E402`


			`class DB:`
Streamline dev experience with optional auth and simplified config (#27) 2025-01-11 21:54:00 +05:30			`def __init__(self, uri: str = None):`
			`"""Initialize DataBridge with optional URI"""`
			`self._client = DataBridge(uri, is_local=True, timeout=1000)`
			`self.base_url = "http://localhost:8000" # For health check only`
Add docker support (#24) 2025-01-09 15:47:25 +05:30
			`def check_health(self, max_retries=30, retry_interval=1) -> bool:`
			`"""Check if DataBridge server is healthy with retries"""`
			`health_url = f"{self.base_url}/health"`

			`for attempt in range(max_retries):`
			`try:`
			`response = requests.get(health_url, timeout=5)`
			`if response.status_code == 200:`
			`return True`
			`except requests.exceptions.RequestException:`
			`pass`

			`if attempt < max_retries - 1:`
			`print(`
			`f"Waiting for DataBridge server to be ready... (attempt {attempt + 1}/{max_retries})"`
			`)`
			`time.sleep(retry_interval)`

			`return False`

Add natural language rules based ingestion (#34) 2025-02-07 21:08:40 -05:00			`def ingest_text(`
			`self,`
			`content: str,`
			`metadata: Optional[Dict[str, Any]] = None,`
			`rules: Optional[List[Dict[str, Any]]] = None,`
add colpali and remove content_type from ingestion pathway (#45) 2025-02-28 14:37:46 -05:00			`use_colpali: bool = True,`
Add natural language rules based ingestion (#34) 2025-02-07 21:08:40 -05:00			`) -> dict:`
			`"""`
			`Ingest text content into DataBridge.`

			`Args:`
			`content: Text content to ingest`
			`metadata: Optional metadata dictionary`
			`rules: Optional list of rule objects. Examples:`
			`[{"type": "metadata_extraction", "schema": {"name": "string"}},`
			`{"type": "natural_language", "prompt": "Remove PII"}]`
add colpali and remove content_type from ingestion pathway (#45) 2025-02-28 14:37:46 -05:00			`use_colpali: Whether to use ColPali-style embedding model to ingest the text`
Add natural language rules based ingestion (#34) 2025-02-07 21:08:40 -05:00			`"""`
add colpali and remove content_type from ingestion pathway (#45) 2025-02-28 14:37:46 -05:00			`doc = self._client.ingest_text(`
			`content, metadata=metadata or {}, rules=rules, use_colpali=use_colpali`
			`)`
Add open telemetry and shell (#5) 2024-12-31 10:22:25 +05:30			`return doc.model_dump()`

Add local file system for storage (#10) 2024-12-31 16:55:51 +05:30			`def ingest_file(`
Add natural language rules based ingestion (#34) 2025-02-07 21:08:40 -05:00			`self,`
			`file: str,`
			`filename: str = None,`
			`metadata: dict = None,`
			`rules: Optional[List[Dict[str, Any]]] = None,`
add colpali and remove content_type from ingestion pathway (#45) 2025-02-28 14:37:46 -05:00			`use_colpali: bool = True,`
Add local file system for storage (#10) 2024-12-31 16:55:51 +05:30			`) -> dict:`
Add natural language rules based ingestion (#34) 2025-02-07 21:08:40 -05:00			`"""`
			`Ingest a file into DataBridge.`

			`Args:`
			`file: Path to file to ingest`
			`filename: Optional filename (defaults to basename of file path)`
			`metadata: Optional metadata dictionary`
			`rules: Optional list of rule objects. Examples:`
			`[{"type": "metadata_extraction", "schema": {"title": "string"}},`
			`{"type": "natural_language", "prompt": "Summarize"}]`
add colpali and remove content_type from ingestion pathway (#45) 2025-02-28 14:37:46 -05:00			`use_colpali: Whether to use ColPali-style embedding model to ingest the file`
Add natural language rules based ingestion (#34) 2025-02-07 21:08:40 -05:00			`"""`
Add local file system for storage (#10) 2024-12-31 16:55:51 +05:30			`file_path = Path(file)`
Add natural language rules based ingestion (#34) 2025-02-07 21:08:40 -05:00			`filename = filename or file_path.name`
Add open telemetry and shell (#5) 2024-12-31 10:22:25 +05:30			`doc = self._client.ingest_file(`
Add natural language rules based ingestion (#34) 2025-02-07 21:08:40 -05:00			`file=file_path,`
			`filename=filename,`
			`metadata=metadata or {},`
			`rules=rules,`
add colpali and remove content_type from ingestion pathway (#45) 2025-02-28 14:37:46 -05:00			`use_colpali=use_colpali,`
Add open telemetry and shell (#5) 2024-12-31 10:22:25 +05:30			`)`
			`return doc.model_dump()`

			`def retrieve_chunks(`
add colpali and remove content_type from ingestion pathway (#45) 2025-02-28 14:37:46 -05:00			`self, query: str, filters: dict = None, k: int = 4, min_score: float = 0.0, use_colpali: bool = True`
Add open telemetry and shell (#5) 2024-12-31 10:22:25 +05:30			`) -> list:`
add colpali and remove content_type from ingestion pathway (#45) 2025-02-28 14:37:46 -05:00			`"""`
			`Search for relevant chunks`

			`Args:`
			`query: Search query text`
			`filters: Optional metadata filters`
			`k: Number of results (default: 4)`
			`min_score: Minimum similarity threshold (default: 0.0)`
			`use_colpali: Whether to use ColPali-style embedding model for retrieval`
			`"""`
Add open telemetry and shell (#5) 2024-12-31 10:22:25 +05:30			`results = self._client.retrieve_chunks(`
add colpali and remove content_type from ingestion pathway (#45) 2025-02-28 14:37:46 -05:00			`query, filters=filters or {}, k=k, min_score=min_score, use_colpali=use_colpali`
Add open telemetry and shell (#5) 2024-12-31 10:22:25 +05:30			`)`
			`return [r.model_dump() for r in results]`

			`def retrieve_docs(`
add colpali and remove content_type from ingestion pathway (#45) 2025-02-28 14:37:46 -05:00			`self, query: str, filters: dict = None, k: int = 4, min_score: float = 0.0, use_colpali: bool = True`
Add open telemetry and shell (#5) 2024-12-31 10:22:25 +05:30			`) -> list:`
add colpali and remove content_type from ingestion pathway (#45) 2025-02-28 14:37:46 -05:00			`"""`
			`Retrieve relevant documents`

			`Args:`
			`query: Search query text`
			`filters: Optional metadata filters`
			`k: Number of results (default: 4)`
			`min_score: Minimum similarity threshold (default: 0.0)`
			`use_colpali: Whether to use ColPali-style embedding model for retrieval`
			`"""`
			`results = self._client.retrieve_docs(`
			`query, filters=filters or {}, k=k, min_score=min_score, use_colpali=use_colpali`
			`)`
Add open telemetry and shell (#5) 2024-12-31 10:22:25 +05:30			`return [r.model_dump() for r in results]`

			`def query(`
			`self,`
			`query: str,`
			`filters: dict = None,`
			`k: int = 4,`
			`min_score: float = 0.0,`
			`max_tokens: int = None,`
			`temperature: float = None,`
add colpali and remove content_type from ingestion pathway (#45) 2025-02-28 14:37:46 -05:00			`use_colpali: bool = True,`
Add open telemetry and shell (#5) 2024-12-31 10:22:25 +05:30			`) -> dict:`
add colpali and remove content_type from ingestion pathway (#45) 2025-02-28 14:37:46 -05:00			`"""`
			`Generate completion using relevant chunks as context`

			`Args:`
			`query: Query text`
			`filters: Optional metadata filters`
			`k: Number of chunks to use as context (default: 4)`
			`min_score: Minimum similarity threshold (default: 0.0)`
			`max_tokens: Maximum tokens in completion`
			`temperature: Model temperature`
			`use_colpali: Whether to use ColPali-style embedding model for retrieval`
			`"""`
Add open telemetry and shell (#5) 2024-12-31 10:22:25 +05:30			`response = self._client.query(`
			`query,`
			`filters=filters or {},`
			`k=k,`
			`min_score=min_score,`
			`max_tokens=max_tokens,`
			`temperature=temperature,`
add colpali and remove content_type from ingestion pathway (#45) 2025-02-28 14:37:46 -05:00			`use_colpali=use_colpali,`
Add open telemetry and shell (#5) 2024-12-31 10:22:25 +05:30			`)`
			`return response.model_dump()`

			`def list_documents(self, skip: int = 0, limit: int = 100, filters: dict = None) -> list:`
			`"""List accessible documents"""`
			`docs = self._client.list_documents(skip=skip, limit=limit, filters=filters or {})`
			`return [doc.model_dump() for doc in docs]`

			`def get_document(self, document_id: str) -> dict:`
			`"""Get document metadata by ID"""`
			`doc = self._client.get_document(document_id)`
			`return doc.model_dump()`
Add completion sources and batch retrieval for docs and chunks (#51) 2025-03-09 18:42:04 -04:00
			`def batch_get_documents(self, document_ids: List[str]) -> List[dict]:`
			`"""`
			`Retrieve multiple documents by their IDs in a single batch operation.`

			`Args:`
			`document_ids: List of document IDs to retrieve`

			`Returns:`
			`List of document metadata`
			`"""`
			`docs = self._client.batch_get_documents(document_ids)`
			`return [doc.model_dump() for doc in docs]`

			`def batch_get_chunks(self, sources: List[dict]) -> List[dict]:`
			`"""`
			`Retrieve specific chunks by their document ID and chunk number in a single batch operation.`

			`Args:`
			`sources: List of dictionaries with document_id and chunk_number fields`

			`Returns:`
			`List of chunk results`

			`Example:`
			`sources = [`
			`{"document_id": "doc_123", "chunk_number": 0},`
			`{"document_id": "doc_456", "chunk_number": 2}`
			`]`
			`"""`
			`chunks = self._client.batch_get_chunks(sources)`
			`return [chunk.model_dump() for chunk in chunks]`
Add open telemetry and shell (#5) 2024-12-31 10:22:25 +05:30
Add support for cache-augmented-generation (#30) 2025-01-29 10:19:28 +05:30			`def create_cache(`
			`self,`
			`name: str,`
			`model: str,`
			`gguf_file: str,`
			`filters: dict = None,`
			`docs: list = None,`
			`) -> dict:`
			`"""Create a new cache with specified configuration"""`
			`response = self._client.create_cache(`
			`name=name,`
			`model=model,`
			`gguf_file=gguf_file,`
			`filters=filters or {},`
			`docs=docs,`
			`)`
			`return response`

			`def get_cache(self, name: str) -> "Cache":`
			`"""Get a cache by name"""`
			`return self._client.get_cache(name)`

Add open telemetry and shell (#5) 2024-12-31 10:22:25 +05:30			`def close(self):`
			`"""Close the client connection"""`
			`self._client.close()`


Add support for cache-augmented-generation (#30) 2025-01-29 10:19:28 +05:30			`class Cache:`
			`def __init__(self, db: DB, name: str):`
			`self._db = db`
			`self._name = name`
			`self._client_cache = db._client.get_cache(name)`

			`def update(self) -> bool:`
			`"""Update the cache"""`
			`return self._client_cache.update()`

			`def add_docs(self, docs: list) -> bool:`
			`"""Add documents to the cache"""`
			`return self._client_cache.add_docs(docs)`

Fix video parsing bugs, improve server logging 2025-01-30 16:03:46 -05:00			`def query(self, query: str, max_tokens: int = None, temperature: float = None) -> dict:`
Add support for cache-augmented-generation (#30) 2025-01-29 10:19:28 +05:30			`"""Query the cache"""`
			`response = self._client_cache.query(`
			`query=query,`
			`max_tokens=max_tokens,`
			`temperature=temperature,`
			`)`
			`return response.model_dump()`


Add open telemetry and shell (#5) 2024-12-31 10:22:25 +05:30			`if __name__ == "__main__":`
Streamline dev experience with optional auth and simplified config (#27) 2025-01-11 21:54:00 +05:30			`uri = sys.argv[1] if len(sys.argv) > 1 else None`
			`db = DB(uri)`
Add open telemetry and shell (#5) 2024-12-31 10:22:25 +05:30
Streamline dev experience with optional auth and simplified config (#27) 2025-01-11 21:54:00 +05:30			`# Check server health`
Add docker support (#24) 2025-01-09 15:47:25 +05:30			`if not db.check_health():`
Streamline dev experience with optional auth and simplified config (#27) 2025-01-11 21:54:00 +05:30			`print("Error: Could not connect to DataBridge server")`
Add docker support (#24) 2025-01-09 15:47:25 +05:30			`sys.exit(1)`

Streamline dev experience with optional auth and simplified config (#27) 2025-01-11 21:54:00 +05:30			`print("\nConnected to DataBridge")`
Add docker support (#24) 2025-01-09 15:47:25 +05:30
Add open telemetry and shell (#5) 2024-12-31 10:22:25 +05:30			`# Start an interactive Python shell with 'db' already imported`
			`import code`
			`import readline # Enable arrow key history`
			`import rlcompleter # noqa: F401 # Enable tab completion`

			`readline.parse_and_bind("tab: complete")`

			`# Create the interactive shell`
			`shell = code.InteractiveConsole(locals())`

			`# Print welcome message`
			`print("\nDataBridge CLI ready to use. The 'db' object is available with all SDK methods.")`
Add completion sources and batch retrieval for docs and chunks (#51) 2025-03-09 18:42:04 -04:00			`print("Examples:")`
			`print(" db.ingest_text('hello world')")`
			`print(" db.query('what are the key findings?')")`
			`print(" db.batch_get_documents(['doc_id1', 'doc_id2'])")`
			`print(" db.batch_get_chunks([{'document_id': 'doc_123', 'chunk_number': 0}])")`
			`print(" result = db.query('how to use this API?'); print(result['sources'])")`
Add open telemetry and shell (#5) 2024-12-31 10:22:25 +05:30			`print("Type help(db) for documentation.")`

			`# Start the shell`
			`shell.interact(banner="")`