#!/usr/bin/env python3
"""
DataBridge interactive CLI.

Assumes a DataBridge server is running.

Usage:
    Without authentication (connects to localhost):
        python shell.py

    With authentication:
        python shell.py <uri>
        Example: python shell.py "databridge://user:token@localhost:8000"

This provides the exact same interface as the Python SDK:
    db.ingest_text("content", metadata={...})
    db.ingest_file("path/to/file")
    db.query("what are the key findings?")
    etc...
"""

import sys
import time
from pathlib import Path
from typing import Any, Dict, List, Optional

import requests

# Add local SDK to path before other imports
_SDK_PATH = str(Path(__file__).parent / "sdks" / "python")
if _SDK_PATH not in sys.path:
    sys.path.insert(0, _SDK_PATH)

from databridge import DataBridge  # noqa: E402


class DB:
    """Wrapper around the DataBridge SDK client used by the interactive shell."""

    def __init__(self, uri: Optional[str] = None):
        """Initialize DataBridge with an optional URI."""
        self._client = DataBridge(uri, is_local=True, timeout=1000)
        self.base_url = "http://localhost:8000"  # For health check only

    def check_health(self, max_retries: int = 30, retry_interval: float = 1.0) -> bool:
        """Check if the DataBridge server is healthy, retrying until it responds."""
        health_url = f"{self.base_url}/health"

        for attempt in range(max_retries):
            try:
                response = requests.get(health_url, timeout=5)
                if response.status_code == 200:
                    return True
            except requests.exceptions.RequestException:
                pass

            if attempt < max_retries - 1:
                print(
                    f"Waiting for DataBridge server to be ready... (attempt {attempt + 1}/{max_retries})"
                )
                time.sleep(retry_interval)

        return False

    def ingest_text(
        self,
        content: str,
        metadata: Optional[Dict[str, Any]] = None,
        rules: Optional[List[Dict[str, Any]]] = None,
        use_colpali: bool = True,
    ) -> dict:
        """
        Ingest text content into DataBridge.

        Args:
            content: Text content to ingest
            metadata: Optional metadata dictionary
            rules: Optional list of rule objects. Examples:
                [{"type": "metadata_extraction", "schema": {"name": "string"}},
                 {"type": "natural_language", "prompt": "Remove PII"}]
            use_colpali: Whether to use a ColPali-style embedding model to ingest the text
        """
        doc = self._client.ingest_text(
            content, metadata=metadata or {}, rules=rules, use_colpali=use_colpali
        )
        return doc.model_dump()

    def ingest_file(
        self,
        file: str,
        filename: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        rules: Optional[List[Dict[str, Any]]] = None,
        use_colpali: bool = True,
    ) -> dict:
        """
        Ingest a file into DataBridge.

        Args:
            file: Path to the file to ingest
            filename: Optional filename (defaults to the basename of the file path)
            metadata: Optional metadata dictionary
            rules: Optional list of rule objects. Examples:
                [{"type": "metadata_extraction", "schema": {"title": "string"}},
                 {"type": "natural_language", "prompt": "Summarize"}]
            use_colpali: Whether to use a ColPali-style embedding model to ingest the file
        """
        file_path = Path(file)
        filename = filename or file_path.name
        doc = self._client.ingest_file(
            file=file_path,
            filename=filename,
            metadata=metadata or {},
            rules=rules,
            use_colpali=use_colpali,
        )
        return doc.model_dump()

    def retrieve_chunks(
        self,
        query: str,
        filters: Optional[Dict[str, Any]] = None,
        k: int = 4,
        min_score: float = 0.0,
        use_colpali: bool = True,
    ) -> list:
        """
        Search for relevant chunks.

        Args:
            query: Search query text
            filters: Optional metadata filters
            k: Number of results (default: 4)
            min_score: Minimum similarity threshold (default: 0.0)
            use_colpali: Whether to use a ColPali-style embedding model for retrieval
        """
        results = self._client.retrieve_chunks(
            query, filters=filters or {}, k=k, min_score=min_score, use_colpali=use_colpali
        )
        return [r.model_dump() for r in results]

    def retrieve_docs(
        self,
        query: str,
        filters: Optional[Dict[str, Any]] = None,
        k: int = 4,
        min_score: float = 0.0,
        use_colpali: bool = True,
    ) -> list:
        """
        Retrieve relevant documents.

        Args:
            query: Search query text
            filters: Optional metadata filters
            k: Number of results (default: 4)
            min_score: Minimum similarity threshold (default: 0.0)
            use_colpali: Whether to use a ColPali-style embedding model for retrieval
        """
        results = self._client.retrieve_docs(
            query, filters=filters or {}, k=k, min_score=min_score, use_colpali=use_colpali
        )
        return [r.model_dump() for r in results]

    def query(
        self,
        query: str,
        filters: Optional[Dict[str, Any]] = None,
        k: int = 4,
        min_score: float = 0.0,
        max_tokens: Optional[int] = None,
        temperature: Optional[float] = None,
        use_colpali: bool = True,
    ) -> dict:
        """
        Generate a completion using relevant chunks as context.

        Args:
            query: Query text
            filters: Optional metadata filters
            k: Number of chunks to use as context (default: 4)
            min_score: Minimum similarity threshold (default: 0.0)
            max_tokens: Maximum tokens in the completion
            temperature: Model temperature
            use_colpali: Whether to use a ColPali-style embedding model for retrieval
        """
        response = self._client.query(
            query,
            filters=filters or {},
            k=k,
            min_score=min_score,
            max_tokens=max_tokens,
            temperature=temperature,
            use_colpali=use_colpali,
        )
        return response.model_dump()
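    # Note: retrieve_chunks() and retrieve_docs() return raw matches, while
    # query() also generates a completion over the retrieved chunks.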

    def list_documents(
        self, skip: int = 0, limit: int = 100, filters: Optional[Dict[str, Any]] = None
    ) -> list:
        """List accessible documents."""
        docs = self._client.list_documents(skip=skip, limit=limit, filters=filters or {})
        return [doc.model_dump() for doc in docs]

    def get_document(self, document_id: str) -> dict:
        """Get document metadata by ID."""
        doc = self._client.get_document(document_id)
        return doc.model_dump()

    def create_cache(
        self,
        name: str,
        model: str,
        gguf_file: str,
        filters: Optional[Dict[str, Any]] = None,
        docs: Optional[list] = None,
    ) -> dict:
        """Create a new cache with the specified configuration."""
        response = self._client.create_cache(
            name=name,
            model=model,
            gguf_file=gguf_file,
            filters=filters or {},
            docs=docs,
        )
        return response

    def get_cache(self, name: str) -> "Cache":
        """Get a cache by name."""
        # Wrap the SDK cache so query results come back as plain dicts (see Cache below).
        return Cache(self, name)

    def close(self):
        """Close the client connection."""
        self._client.close()


class Cache:
    """Wrapper around an SDK cache that returns query results as plain dicts."""

    def __init__(self, db: DB, name: str):
        self._db = db
        self._name = name
        self._client_cache = db._client.get_cache(name)

    def update(self) -> bool:
        """Update the cache."""
        return self._client_cache.update()

    def add_docs(self, docs: list) -> bool:
        """Add documents to the cache."""
        return self._client_cache.add_docs(docs)

    def query(self, query: str, max_tokens: Optional[int] = None, temperature: Optional[float] = None) -> dict:
        """Query the cache."""
        response = self._client_cache.query(
            query=query,
            max_tokens=max_tokens,
            temperature=temperature,
        )
        return response.model_dump()
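# Illustrative cache workflow; the model name, GGUF path, and document ID below
# are placeholders, not assets shipped with this script:
#
#   >>> db.create_cache("notes_cache", model="llama", gguf_file="models/llama.gguf",
#   ...                 filters={"category": "reports"})
#   >>> cache = db.get_cache("notes_cache")
#   >>> cache.add_docs(["doc_123"])
#   >>> cache.query("Summarize the cached documents.", max_tokens=256)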


if __name__ == "__main__":
    uri = sys.argv[1] if len(sys.argv) > 1 else None
    db = DB(uri)

    # Check server health
    if not db.check_health():
        print("Error: Could not connect to DataBridge server")
        sys.exit(1)

    print("\nConnected to DataBridge")

    # Start an interactive Python shell with 'db' already available
    import code
    import readline  # Enable arrow key history
    import rlcompleter  # noqa: F401  # Enable tab completion

    readline.parse_and_bind("tab: complete")

    # Create the interactive shell
    shell = code.InteractiveConsole(locals())

    # Print welcome message
    print("\nDataBridge CLI ready to use. The 'db' object is available with all SDK methods.")
    print("Example: db.ingest_text('hello world')")
    print("Type help(db) for documentation.")

    # Start the shell
    shell.interact(banner="")