# Mirror of https://github.com/james-m-jordan/morphik-core.git
# Synced 2025-05-09 19:32:38 +00:00 (920 lines, 32 KiB, Python)
#!/usr/bin/env python3
|
|
"""
|
|
Morphik interactive CLI.
|
|
Assumes a Morphik server is running.
|
|
|
|
Usage:
|
|
Without authentication (connects to localhost):
|
|
python shell.py
|
|
|
|
With authentication:
|
|
python shell.py <uri>
|
|
Example: python shell.py "morphik://user:token@localhost:8000"
|
|
|
|
This provides the exact same interface as the Python SDK:
|
|
db.ingest_text("content", metadata={...})
|
|
db.ingest_file("path/to/file")
|
|
db.query("what are the key findings?")
|
|
etc...
|
|
"""
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
import time
|
|
from typing import Any, Dict, List, Optional, Union
|
|
import requests
|
|
|
|
# Add local SDK to path before other imports
|
|
_SDK_PATH = str(Path(__file__).parent / "sdks" / "python")
|
|
if _SDK_PATH not in sys.path:
|
|
sys.path.insert(0, _SDK_PATH)
|
|
|
|
from morphik import Morphik # noqa: E402
|
|
from morphik.models import Document # noqa: E402
|
|
|
|
|
|
class DB:
    """Interactive-shell facade over the Morphik Python SDK.

    Wraps every SDK call so results come back as plain dicts (via
    ``model_dump()``) unless the caller asks for SDK objects.
    """

    def __init__(self, uri: Optional[str] = None):
        """Initialize Morphik with optional URI"""
        # is_local=True targets a local dev server; generous timeout for long ingests.
        self._client = Morphik(uri, is_local=True, timeout=1000)
        self.base_url = "http://localhost:8000"  # For health check only
|
|
def check_health(self, max_retries=30, retry_interval=1) -> bool:
|
|
"""Check if Morphik server is healthy with retries"""
|
|
health_url = f"{self.base_url}/health"
|
|
|
|
for attempt in range(max_retries):
|
|
try:
|
|
response = requests.get(health_url, timeout=5)
|
|
if response.status_code == 200:
|
|
return True
|
|
except requests.exceptions.RequestException:
|
|
pass
|
|
|
|
if attempt < max_retries - 1:
|
|
print(
|
|
f"Waiting for Morphik server to be ready... (attempt {attempt + 1}/{max_retries})"
|
|
)
|
|
time.sleep(retry_interval)
|
|
|
|
return False
|
|
|
|
def ingest_text(
|
|
self,
|
|
content: str,
|
|
metadata: Optional[Dict[str, Any]] = None,
|
|
rules: Optional[List[Dict[str, Any]]] = None,
|
|
use_colpali: bool = True,
|
|
as_object: bool = False,
|
|
) -> Union[dict, "Document"]:
|
|
"""
|
|
Ingest text content into Morphik.
|
|
|
|
Args:
|
|
content: Text content to ingest
|
|
metadata: Optional metadata dictionary
|
|
rules: Optional list of rule objects. Examples:
|
|
[{"type": "metadata_extraction", "schema": {"name": "string"}},
|
|
{"type": "natural_language", "prompt": "Remove PII"}]
|
|
use_colpali: Whether to use ColPali-style embedding model to ingest the text
|
|
as_object: If True, returns the Document object with update methods, otherwise returns a dict
|
|
|
|
Returns:
|
|
Document metadata (dict or Document object)
|
|
|
|
Example:
|
|
```python
|
|
# Create a document and immediately update it with new content
|
|
doc = db.ingest_text("Initial content", as_object=True)
|
|
doc.update_with_text("Additional content")
|
|
```
|
|
"""
|
|
doc = self._client.ingest_text(
|
|
content, metadata=metadata or {}, rules=rules, use_colpali=use_colpali
|
|
)
|
|
return doc if as_object else doc.model_dump()
|
|
|
|
def ingest_file(
|
|
self,
|
|
file: str,
|
|
filename: str = None,
|
|
metadata: dict = None,
|
|
rules: Optional[List[Dict[str, Any]]] = None,
|
|
use_colpali: bool = True,
|
|
as_object: bool = False,
|
|
) -> Union[dict, "Document"]:
|
|
"""
|
|
Ingest a file into Morphik.
|
|
|
|
Args:
|
|
file: Path to file to ingest
|
|
filename: Optional filename (defaults to basename of file path)
|
|
metadata: Optional metadata dictionary
|
|
rules: Optional list of rule objects. Examples:
|
|
[{"type": "metadata_extraction", "schema": {"title": "string"}},
|
|
{"type": "natural_language", "prompt": "Summarize"}]
|
|
use_colpali: Whether to use ColPali-style embedding model to ingest the file
|
|
as_object: If True, returns the Document object with update methods, otherwise returns a dict
|
|
|
|
Returns:
|
|
Document metadata (dict or Document object)
|
|
|
|
Example:
|
|
```python
|
|
# Create a document from a file and immediately update it with text
|
|
doc = db.ingest_file("document.pdf", as_object=True)
|
|
doc.update_with_text("Additional notes about this document")
|
|
```
|
|
"""
|
|
file_path = Path(file)
|
|
filename = filename or file_path.name
|
|
doc = self._client.ingest_file(
|
|
file=file_path,
|
|
filename=filename,
|
|
metadata=metadata or {},
|
|
rules=rules,
|
|
use_colpali=use_colpali,
|
|
)
|
|
return doc if as_object else doc.model_dump()
|
|
|
|
def ingest_files(
|
|
self,
|
|
files: List[str],
|
|
metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
|
|
rules: Optional[List[Dict[str, Any]]] = None,
|
|
use_colpali: bool = True,
|
|
parallel: bool = True,
|
|
as_objects: bool = False,
|
|
) -> List[Union[dict, "Document"]]:
|
|
"""
|
|
Batch ingest multiple files into Morphik.
|
|
|
|
Args:
|
|
files: List of file paths to ingest
|
|
metadata: Optional metadata (single dict for all files or list of dicts)
|
|
rules: Optional list of rules. Can be either:
|
|
- A single list of rules to apply to all files
|
|
- A list of rule lists, one per file
|
|
use_colpali: Whether to use ColPali-style embedding model
|
|
parallel: Whether to process files in parallel
|
|
as_objects: If True, returns Document objects with update methods, otherwise returns dicts
|
|
|
|
Returns:
|
|
List of document metadata (dicts or Document objects)
|
|
|
|
Example:
|
|
```python
|
|
# Ingest multiple files with shared metadata
|
|
docs = db.ingest_files(
|
|
["doc1.pdf", "doc2.pdf"],
|
|
metadata={"category": "research"},
|
|
parallel=True
|
|
)
|
|
|
|
# Ingest files with individual metadata
|
|
docs = db.ingest_files(
|
|
["doc1.pdf", "doc2.pdf"],
|
|
metadata=[
|
|
{"category": "research", "author": "Alice"},
|
|
{"category": "reports", "author": "Bob"}
|
|
]
|
|
)
|
|
```
|
|
"""
|
|
# Convert file paths to Path objects
|
|
file_paths = [Path(f) for f in files]
|
|
|
|
# Ingest files using the client
|
|
docs = self._client.ingest_files(
|
|
files=file_paths,
|
|
metadata=metadata,
|
|
rules=rules,
|
|
use_colpali=use_colpali,
|
|
parallel=parallel,
|
|
)
|
|
|
|
return docs if as_objects else [doc.model_dump() for doc in docs]
|
|
|
|
def ingest_directory(
|
|
self,
|
|
directory: str,
|
|
recursive: bool = False,
|
|
pattern: str = "*",
|
|
metadata: Optional[Dict[str, Any]] = None,
|
|
rules: Optional[List[Dict[str, Any]]] = None,
|
|
use_colpali: bool = True,
|
|
parallel: bool = True,
|
|
as_objects: bool = False,
|
|
) -> List[Union[dict, "Document"]]:
|
|
"""
|
|
Ingest all files in a directory into Morphik.
|
|
|
|
Args:
|
|
directory: Path to directory containing files to ingest
|
|
recursive: Whether to recursively process subdirectories
|
|
pattern: Optional glob pattern to filter files (e.g. "*.pdf")
|
|
metadata: Optional metadata dictionary to apply to all files
|
|
rules: Optional list of rules. Can be either:
|
|
- A single list of rules to apply to all files
|
|
- A list of rule lists, one per file
|
|
use_colpali: Whether to use ColPali-style embedding model
|
|
parallel: Whether to process files in parallel
|
|
as_objects: If True, returns Document objects with update methods, otherwise returns dicts
|
|
|
|
Returns:
|
|
List of document metadata (dicts or Document objects)
|
|
|
|
Example:
|
|
```python
|
|
# Ingest all PDFs in a directory and its subdirectories
|
|
docs = db.ingest_directory(
|
|
"data/documents",
|
|
recursive=True,
|
|
metadata={"category": "research"},
|
|
pattern="*.pdf"
|
|
)
|
|
```
|
|
"""
|
|
# Convert directory to Path
|
|
dir_path = Path(directory)
|
|
|
|
# Ingest directory using the client
|
|
docs = self._client.ingest_directory(
|
|
directory=dir_path,
|
|
recursive=recursive,
|
|
pattern=pattern,
|
|
metadata=metadata,
|
|
rules=rules,
|
|
use_colpali=use_colpali,
|
|
parallel=parallel,
|
|
)
|
|
|
|
return docs if as_objects else [doc.model_dump() for doc in docs]
|
|
|
|
def retrieve_chunks(
|
|
self,
|
|
query: str,
|
|
filters: dict = None,
|
|
k: int = 4,
|
|
min_score: float = 0.0,
|
|
use_colpali: bool = True,
|
|
) -> list:
|
|
"""
|
|
Search for relevant chunks
|
|
|
|
Args:
|
|
query: Search query text
|
|
filters: Optional metadata filters
|
|
k: Number of results (default: 4)
|
|
min_score: Minimum similarity threshold (default: 0.0)
|
|
use_colpali: Whether to use ColPali-style embedding model for retrieval
|
|
"""
|
|
results = self._client.retrieve_chunks(
|
|
query, filters=filters or {}, k=k, min_score=min_score, use_colpali=use_colpali
|
|
)
|
|
return [r.model_dump() for r in results]
|
|
|
|
def retrieve_docs(
|
|
self,
|
|
query: str,
|
|
filters: dict = None,
|
|
k: int = 4,
|
|
min_score: float = 0.0,
|
|
use_colpali: bool = True,
|
|
) -> list:
|
|
"""
|
|
Retrieve relevant documents
|
|
|
|
Args:
|
|
query: Search query text
|
|
filters: Optional metadata filters
|
|
k: Number of results (default: 4)
|
|
min_score: Minimum similarity threshold (default: 0.0)
|
|
use_colpali: Whether to use ColPali-style embedding model for retrieval
|
|
"""
|
|
results = self._client.retrieve_docs(
|
|
query, filters=filters or {}, k=k, min_score=min_score, use_colpali=use_colpali
|
|
)
|
|
return [r.model_dump() for r in results]
|
|
|
|
def query(
|
|
self,
|
|
query: str,
|
|
filters: dict = None,
|
|
k: int = 4,
|
|
min_score: float = 0.0,
|
|
max_tokens: int = None,
|
|
temperature: float = None,
|
|
use_colpali: bool = True,
|
|
graph_name: str = None,
|
|
hop_depth: int = 1,
|
|
include_paths: bool = False,
|
|
prompt_overrides: dict = None,
|
|
) -> dict:
|
|
"""
|
|
Generate completion using relevant chunks as context
|
|
|
|
Args:
|
|
query: Query text
|
|
filters: Optional metadata filters
|
|
k: Number of chunks to use as context (default: 4)
|
|
min_score: Minimum similarity threshold (default: 0.0)
|
|
max_tokens: Maximum tokens in completion
|
|
temperature: Model temperature
|
|
use_colpali: Whether to use ColPali-style embedding model for retrieval
|
|
graph_name: Optional name of the graph to use for knowledge graph-enhanced retrieval
|
|
hop_depth: Number of relationship hops to traverse in the graph (1-3)
|
|
include_paths: Whether to include relationship paths in the response
|
|
prompt_overrides: Optional customizations for entity extraction, resolution, and query prompts
|
|
|
|
Examples:
|
|
Standard query:
|
|
>>> db.query("What are the key findings?", filters={"category": "research"})
|
|
|
|
Knowledge graph enhanced query:
|
|
>>> db.query("How does product X relate to customer segment Y?",
|
|
graph_name="market_graph", hop_depth=2, include_paths=True)
|
|
|
|
With prompt customization:
|
|
>>> db.query("What are the key findings?",
|
|
prompt_overrides={
|
|
"query": {
|
|
"prompt_template": "Answer the question in a formal, academic tone: {question}"
|
|
}
|
|
})
|
|
|
|
# If include_paths=True, you can inspect the graph paths
|
|
>>> response = db.query("...", graph_name="sales_graph", include_paths=True)
|
|
>>> if "graph" in response.get("metadata", {}):
|
|
>>> for path in response["metadata"]["graph"]["paths"]:
|
|
>>> print(" -> ".join(path))
|
|
"""
|
|
response = self._client.query(
|
|
query,
|
|
filters=filters or {},
|
|
k=k,
|
|
min_score=min_score,
|
|
max_tokens=max_tokens,
|
|
temperature=temperature,
|
|
use_colpali=use_colpali,
|
|
graph_name=graph_name,
|
|
hop_depth=hop_depth,
|
|
include_paths=include_paths,
|
|
prompt_overrides=prompt_overrides,
|
|
)
|
|
return response.model_dump()
|
|
|
|
def list_documents(
|
|
self, skip: int = 0, limit: int = 100, filters: dict = None, as_objects: bool = False
|
|
) -> list:
|
|
"""
|
|
List accessible documents
|
|
|
|
Args:
|
|
skip: Number of documents to skip
|
|
limit: Maximum number of documents to return
|
|
filters: Optional metadata filters
|
|
as_objects: If True, returns Document objects with update methods, otherwise returns dicts
|
|
|
|
Returns:
|
|
List of documents (as dicts or Document objects)
|
|
|
|
Example:
|
|
```python
|
|
# Get a list of documents that can be updated
|
|
docs = db.list_documents(as_objects=True)
|
|
for doc in docs:
|
|
doc.update_metadata({"status": "reviewed"})
|
|
```
|
|
"""
|
|
docs = self._client.list_documents(skip=skip, limit=limit, filters=filters or {})
|
|
return docs if as_objects else [doc.model_dump() for doc in docs]
|
|
|
|
def get_document(self, document_id: str, as_object: bool = False) -> Union[dict, "Document"]:
|
|
"""
|
|
Get document metadata by ID
|
|
|
|
Args:
|
|
document_id: ID of the document
|
|
as_object: If True, returns the Document object with update methods, otherwise returns a dict
|
|
|
|
Returns:
|
|
Document metadata (dict or Document object)
|
|
"""
|
|
doc = self._client.get_document(document_id)
|
|
return doc if as_object else doc.model_dump()
|
|
|
|
def get_document_by_filename(
|
|
self, filename: str, as_object: bool = False
|
|
) -> Union[dict, "Document"]:
|
|
"""
|
|
Get document metadata by filename
|
|
|
|
Args:
|
|
filename: Filename of the document
|
|
as_object: If True, returns the Document object with update methods, otherwise returns a dict
|
|
|
|
Returns:
|
|
Document metadata (dict or Document object)
|
|
|
|
Example:
|
|
```python
|
|
# Get a document by its filename
|
|
doc = db.get_document_by_filename("report.pdf")
|
|
print(f"Document ID: {doc['external_id']}")
|
|
```
|
|
"""
|
|
doc = self._client.get_document_by_filename(filename)
|
|
return doc if as_object else doc.model_dump()
|
|
|
|
def update_document_with_text(
|
|
self,
|
|
document_id: str,
|
|
content: str,
|
|
metadata: Optional[Dict[str, Any]] = None,
|
|
rules: Optional[List] = None,
|
|
update_strategy: str = "add",
|
|
use_colpali: bool = None,
|
|
) -> dict:
|
|
"""
|
|
Update a document with new text content using the specified strategy.
|
|
|
|
Args:
|
|
document_id: ID of the document to update
|
|
content: The new content to add
|
|
metadata: Additional metadata to update (optional)
|
|
rules: Optional list of rules to apply to the content
|
|
update_strategy: Strategy for updating the document (currently only 'add' is supported)
|
|
use_colpali: Whether to use multi-vector embedding
|
|
|
|
Returns:
|
|
Updated document metadata
|
|
"""
|
|
doc = self._client.update_document_with_text(
|
|
document_id=document_id,
|
|
content=content,
|
|
metadata=metadata,
|
|
rules=rules,
|
|
update_strategy=update_strategy,
|
|
use_colpali=use_colpali,
|
|
)
|
|
return doc.model_dump()
|
|
|
|
def update_document_with_file(
|
|
self,
|
|
document_id: str,
|
|
file: str,
|
|
filename: Optional[str] = None,
|
|
metadata: Optional[Dict[str, Any]] = None,
|
|
rules: Optional[List] = None,
|
|
update_strategy: str = "add",
|
|
use_colpali: bool = None,
|
|
) -> dict:
|
|
"""
|
|
Update a document with content from a file using the specified strategy.
|
|
|
|
Args:
|
|
document_id: ID of the document to update
|
|
file: Path to file to add
|
|
filename: Name of the file (optional, defaults to basename of file path)
|
|
metadata: Additional metadata to update (optional)
|
|
rules: Optional list of rules to apply to the content
|
|
update_strategy: Strategy for updating the document (currently only 'add' is supported)
|
|
use_colpali: Whether to use multi-vector embedding
|
|
|
|
Returns:
|
|
Updated document metadata
|
|
"""
|
|
file_path = Path(file)
|
|
filename = filename or file_path.name
|
|
|
|
doc = self._client.update_document_with_file(
|
|
document_id=document_id,
|
|
file=file_path,
|
|
filename=filename,
|
|
metadata=metadata,
|
|
rules=rules,
|
|
update_strategy=update_strategy,
|
|
use_colpali=use_colpali,
|
|
)
|
|
return doc.model_dump()
|
|
|
|
def update_document_metadata(
|
|
self,
|
|
document_id: str,
|
|
metadata: Dict[str, Any],
|
|
) -> dict:
|
|
"""
|
|
Update only the metadata of a document.
|
|
|
|
Args:
|
|
document_id: ID of the document to update
|
|
metadata: New metadata to set
|
|
|
|
Returns:
|
|
Document: Updated document metadata
|
|
"""
|
|
doc = self._client.update_document_metadata(
|
|
document_id=document_id,
|
|
metadata=metadata,
|
|
)
|
|
return doc.model_dump()
|
|
|
|
def update_document_by_filename_with_text(
|
|
self,
|
|
filename: str,
|
|
content: str,
|
|
new_filename: Optional[str] = None,
|
|
metadata: Optional[Dict[str, Any]] = None,
|
|
rules: Optional[List] = None,
|
|
update_strategy: str = "add",
|
|
use_colpali: bool = None,
|
|
) -> dict:
|
|
"""
|
|
Update a document identified by filename with new text content.
|
|
|
|
Args:
|
|
filename: Filename of the document to update
|
|
content: The new content to add
|
|
new_filename: Optional new filename for the document
|
|
metadata: Additional metadata to update (optional)
|
|
rules: Optional list of rules to apply to the content
|
|
update_strategy: Strategy for updating the document (currently only 'add' is supported)
|
|
use_colpali: Whether to use multi-vector embedding
|
|
|
|
Returns:
|
|
Updated document metadata
|
|
"""
|
|
doc = self._client.update_document_by_filename_with_text(
|
|
filename=filename,
|
|
content=content,
|
|
new_filename=new_filename,
|
|
metadata=metadata,
|
|
rules=rules,
|
|
update_strategy=update_strategy,
|
|
use_colpali=use_colpali,
|
|
)
|
|
return doc.model_dump()
|
|
|
|
def update_document_by_filename_with_file(
|
|
self,
|
|
filename: str,
|
|
file: str,
|
|
new_filename: Optional[str] = None,
|
|
metadata: Optional[Dict[str, Any]] = None,
|
|
rules: Optional[List] = None,
|
|
update_strategy: str = "add",
|
|
use_colpali: bool = None,
|
|
) -> dict:
|
|
"""
|
|
Update a document identified by filename with content from a file.
|
|
|
|
Args:
|
|
filename: Filename of the document to update
|
|
file: Path to file to add
|
|
new_filename: Optional new filename for the document
|
|
metadata: Additional metadata to update (optional)
|
|
rules: Optional list of rules to apply to the content
|
|
update_strategy: Strategy for updating the document (currently only 'add' is supported)
|
|
use_colpali: Whether to use multi-vector embedding
|
|
|
|
Returns:
|
|
Updated document metadata
|
|
"""
|
|
file_path = Path(file)
|
|
new_filename = new_filename or file_path.name
|
|
|
|
doc = self._client.update_document_by_filename_with_file(
|
|
filename=filename,
|
|
file=file_path,
|
|
new_filename=new_filename,
|
|
metadata=metadata,
|
|
rules=rules,
|
|
update_strategy=update_strategy,
|
|
use_colpali=use_colpali,
|
|
)
|
|
return doc.model_dump()
|
|
|
|
def update_document_by_filename_metadata(
|
|
self,
|
|
filename: str,
|
|
metadata: Dict[str, Any],
|
|
new_filename: Optional[str] = None,
|
|
) -> dict:
|
|
"""
|
|
Update a document's metadata using filename to identify the document.
|
|
|
|
Args:
|
|
filename: Filename of the document to update
|
|
metadata: New metadata to set
|
|
new_filename: Optional new filename to assign to the document
|
|
|
|
Returns:
|
|
Document: Updated document metadata
|
|
"""
|
|
doc = self._client.update_document_by_filename_metadata(
|
|
filename=filename,
|
|
metadata=metadata,
|
|
new_filename=new_filename,
|
|
)
|
|
return doc.model_dump()
|
|
|
|
def batch_get_documents(
|
|
self, document_ids: List[str], as_objects: bool = False
|
|
) -> List[Union[dict, "Document"]]:
|
|
"""
|
|
Retrieve multiple documents by their IDs in a single batch operation.
|
|
|
|
Args:
|
|
document_ids: List of document IDs to retrieve
|
|
as_objects: If True, returns Document objects with update methods, otherwise returns dicts
|
|
|
|
Returns:
|
|
List of document metadata (as dicts or Document objects)
|
|
|
|
Example:
|
|
```python
|
|
# Get multiple documents that can be updated
|
|
docs = db.batch_get_documents(["doc_123", "doc_456"], as_objects=True)
|
|
for doc in docs:
|
|
doc.update_metadata({"batch_processed": True})
|
|
```
|
|
"""
|
|
docs = self._client.batch_get_documents(document_ids)
|
|
return docs if as_objects else [doc.model_dump() for doc in docs]
|
|
|
|
def batch_get_chunks(self, sources: List[dict]) -> List[dict]:
|
|
"""
|
|
Retrieve specific chunks by their document ID and chunk number in a single batch operation.
|
|
|
|
Args:
|
|
sources: List of dictionaries with document_id and chunk_number fields
|
|
|
|
Returns:
|
|
List of chunk results
|
|
|
|
Example:
|
|
sources = [
|
|
{"document_id": "doc_123", "chunk_number": 0},
|
|
{"document_id": "doc_456", "chunk_number": 2}
|
|
]
|
|
"""
|
|
chunks = self._client.batch_get_chunks(sources)
|
|
return [chunk.model_dump() for chunk in chunks]
|
|
|
|
def create_cache(
|
|
self,
|
|
name: str,
|
|
model: str,
|
|
gguf_file: str,
|
|
filters: dict = None,
|
|
docs: list = None,
|
|
) -> dict:
|
|
"""Create a new cache with specified configuration"""
|
|
response = self._client.create_cache(
|
|
name=name,
|
|
model=model,
|
|
gguf_file=gguf_file,
|
|
filters=filters or {},
|
|
docs=docs,
|
|
)
|
|
return response
|
|
|
|
def get_cache(self, name: str) -> "Cache":
|
|
"""Get a cache by name"""
|
|
return self._client.get_cache(name)
|
|
|
|
def create_graph(
|
|
self,
|
|
name: str,
|
|
filters: Dict[str, Any] = None,
|
|
documents: List[str] = None,
|
|
prompt_overrides: Dict[str, Any] = None,
|
|
) -> dict:
|
|
"""
|
|
Create a graph from documents.
|
|
|
|
This function processes documents matching filters or specific document IDs,
|
|
extracts entities and relationships, and saves them as a graph.
|
|
|
|
Args:
|
|
name: Name of the graph to create
|
|
filters: Optional metadata filters to determine which documents to include
|
|
documents: Optional list of specific document IDs to include
|
|
prompt_overrides: Optional customizations for entity extraction and resolution prompts
|
|
|
|
Returns:
|
|
dict: Information about the created graph
|
|
|
|
Examples:
|
|
Create a graph from documents with category="research":
|
|
>>> db.create_graph("research_graph", filters={"category": "research"})
|
|
|
|
Create a graph from specific documents:
|
|
>>> db.create_graph("custom_graph", documents=["doc1", "doc2", "doc3"])
|
|
|
|
With custom entity extraction examples:
|
|
>>> db.create_graph(
|
|
>>> "medical_graph",
|
|
>>> filters={"category": "medical"},
|
|
>>> prompt_overrides={
|
|
>>> "entity_extraction": {
|
|
>>> "examples": [
|
|
>>> {"label": "Insulin", "type": "MEDICATION"},
|
|
>>> {"label": "Diabetes", "type": "CONDITION"}
|
|
>>> ]
|
|
>>> }
|
|
>>> }
|
|
>>> )
|
|
"""
|
|
graph = self._client.create_graph(
|
|
name=name,
|
|
filters=filters,
|
|
documents=documents,
|
|
prompt_overrides=prompt_overrides,
|
|
)
|
|
return graph.model_dump()
|
|
|
|
def get_graph(self, name: str) -> dict:
|
|
"""
|
|
Get a graph by name.
|
|
|
|
Args:
|
|
name: Name of the graph to retrieve
|
|
|
|
Returns:
|
|
dict: The requested graph object containing entities and relationships
|
|
|
|
Examples:
|
|
Get a graph by name and inspect its contents:
|
|
>>> graph = db.get_graph("research_graph")
|
|
>>> print(f"Graph has {len(graph['entities'])} entities and {len(graph['relationships'])} relationships")
|
|
>>> print(f"Entities: {[entity['label'] for entity in graph['entities'][:5]]}")
|
|
"""
|
|
graph = self._client.get_graph(name)
|
|
return graph.model_dump() if graph else {}
|
|
|
|
def update_graph(
|
|
self,
|
|
name: str,
|
|
additional_filters: dict = None,
|
|
additional_documents: list = None,
|
|
prompt_overrides: dict = None,
|
|
) -> dict:
|
|
"""
|
|
Update an existing graph with new documents.
|
|
|
|
Args:
|
|
name: Name of the graph to update
|
|
additional_filters: Optional additional metadata filters to determine which new documents to include
|
|
additional_documents: Optional list of additional document IDs to include
|
|
prompt_overrides: Optional customizations for entity extraction and resolution prompts
|
|
|
|
Returns:
|
|
dict: The updated graph
|
|
|
|
Examples:
|
|
Update a graph with new documents:
|
|
>>> updated_graph = db.update_graph(
|
|
>>> "research_graph",
|
|
>>> additional_filters={"category": "new_research"},
|
|
>>> additional_documents=["doc4", "doc5"]
|
|
>>> )
|
|
>>> print(f"Graph now has {len(updated_graph['entities'])} entities")
|
|
|
|
With entity resolution examples:
|
|
>>> updated_graph = db.update_graph(
|
|
>>> "research_graph",
|
|
>>> additional_documents=["doc4"],
|
|
>>> prompt_overrides={
|
|
>>> "entity_resolution": {
|
|
>>> "examples": [{
|
|
>>> "canonical": "Machine Learning",
|
|
>>> "variants": ["ML", "machine learning", "AI/ML"]
|
|
>>> }]
|
|
>>> }
|
|
>>> }
|
|
>>> )
|
|
"""
|
|
graph = self._client.update_graph(
|
|
name=name,
|
|
additional_filters=additional_filters,
|
|
additional_documents=additional_documents,
|
|
prompt_overrides=prompt_overrides,
|
|
)
|
|
return graph.model_dump()
|
|
|
|
def list_graphs(self) -> list:
|
|
"""
|
|
List all graphs the user has access to.
|
|
|
|
Returns:
|
|
list: List of graph objects
|
|
|
|
Examples:
|
|
List all accessible graphs:
|
|
>>> graphs = db.list_graphs()
|
|
>>> for graph in graphs:
|
|
>>> print(f"Graph: {graph['name']}, Entities: {len(graph['entities'])}")
|
|
"""
|
|
graphs = self._client.list_graphs()
|
|
return [graph.model_dump() for graph in graphs] if graphs else []
|
|
|
|
    def close(self):
        """Close the client connection"""
        # Delegates shutdown/cleanup to the underlying SDK client.
        self._client.close()
|
|
|
|
|
|
class Cache:
    """Shell-side wrapper around an SDK cache object.

    Resolves the SDK cache once at construction and converts query
    responses to plain dicts for easy inspection in the REPL.
    """

    def __init__(self, db: DB, name: str):
        self._db = db
        self._name = name
        # Resolve the SDK-level cache once up front.
        self._client_cache = db._client.get_cache(name)

    def update(self) -> bool:
        """Update the cache"""
        return self._client_cache.update()

    def add_docs(self, docs: list) -> bool:
        """Add documents to the cache"""
        return self._client_cache.add_docs(docs)

    def query(
        self,
        query: str,
        max_tokens: Optional[int] = None,
        temperature: Optional[float] = None,
    ) -> dict:
        """Query the cache and return the completion serialized to a dict."""
        response = self._client_cache.query(
            query=query,
            max_tokens=max_tokens,
            temperature=temperature,
        )
        return response.model_dump()
|
|
|
|
|
|
if __name__ == "__main__":
    # Optional morphik:// URI as the sole CLI argument; None -> localhost.
    uri = sys.argv[1] if len(sys.argv) > 1 else None
    db = DB(uri)

    # Bail out early if the server is not reachable.
    if not db.check_health():
        print("Error: Could not connect to Morphik server")
        sys.exit(1)

    print("\nConnected to Morphik")

    # Start an interactive Python shell with 'db' already imported
    import code
    import readline  # Enable arrow key history
    import rlcompleter  # noqa: F401 # Enable tab completion

    readline.parse_and_bind("tab: complete")

    # Create the interactive shell
    shell = code.InteractiveConsole(locals())

    # Print welcome message
    print("\nMorphik CLI ready to use. The 'db' object is available with all SDK methods.")
    print("Examples:")
    print(" db.ingest_text('hello world')")
    print(" db.query('what are the key findings?')")
    print(" db.batch_get_documents(['doc_id1', 'doc_id2'])")
    print(" db.batch_get_chunks([{'document_id': 'doc_123', 'chunk_number': 0}])")
    print("\nUpdate by Document ID:")
    print(" db.get_document('doc_123')")
    # Fixed example: update_document_with_text() takes no 'filename' parameter;
    # the previous example would raise TypeError if copied verbatim.
    print(" db.update_document_with_text('doc_123', 'This is new content to append')")
    print(
        " db.update_document_with_file('doc_123', 'path/to/file.pdf', metadata={'status': 'updated'})"
    )
    print(" db.update_document_metadata('doc_123', {'reviewed': True, 'reviewer': 'John'})")
    print("\nUpdate by Filename:")
    print(" db.get_document_by_filename('report.pdf')")
    print(
        " db.update_document_by_filename_with_text('report.pdf', 'New content', new_filename='updated_report.pdf')"
    )
    print(" db.update_document_by_filename_with_file('report.pdf', 'path/to/new_data.pdf')")
    print(
        " db.update_document_by_filename_metadata('report.pdf', {'reviewed': True}, new_filename='reviewed_report.pdf')"
    )
    print("\nQuerying:")
    print(" result = db.query('how to use this API?'); print(result['sources'])")
    print("\nPrompt Overrides:")
    print(
        " db.query('explain this concept', prompt_overrides={'query': {'prompt_template': 'Answer as a professor: {question}'}})"
    )
    print(" db.create_graph('medical_graph', filters={'category': 'medical'}, prompt_overrides={")
    print(" 'entity_extraction': {'examples': [{'label': 'Insulin', 'type': 'MEDICATION'}]}")
    print(" })")
    print(" db.update_graph('research_graph', additional_documents=['doc123'], prompt_overrides={")
    print(
        " 'entity_resolution': {'examples': [{'canonical': 'Machine Learning', 'variants': ['ML', 'machine learning']}]}"
    )
    print(" })")
    print("\nExamples:")
    print(" db.ingest_text('hello world')")
    print(" db.create_graph('knowledge_graph', filters={'category': 'research'})")
    print(" db.query('How does X relate to Y?', graph_name='knowledge_graph', include_paths=True)")
    print("Type help(db) for documentation.")

    # Start the shell
    shell.interact(banner="")
|