morphik-core/sdks/python/examples/basic_usage.py
2024-11-16 01:48:15 -05:00

197 lines
5.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import asyncio
import os
import sys
from pathlib import Path
from datetime import UTC, datetime, timedelta
from dotenv import load_dotenv
import jwt
# we can't pip install, this basically acts like pip install.
sdk_root = Path(__file__).parent.parent
sys.path.append(str(sdk_root))
from databridge import DataBridge, ContentType, DataBridgeError
def create_test_uri():
"""Create a test URI with a valid JWT token"""
token = jwt.encode(
{
'owner_id': 'test_user_123',
'exp': datetime.now(UTC) + timedelta(days=30)
},
"your-secret-key-for-signing-tokens",
algorithm='HS256'
)
return f"databridge://test_user_123:{token}@localhost:8000"
async def example_text():
"""Example of ingesting and querying text documents"""
print("\n=== Text Document Example ===")
load_dotenv()
uri = os.getenv("DATABRIDGE_URI")
if not uri:
raise ValueError("Please set DATABRIDGE_URI environment variable")
db = DataBridge(create_test_uri())
try:
# Ingest a simple text document
content = """
Machine learning (ML) is a type of artificial intelligence (AI) that allows
software applications to become more accurate at predicting outcomes without
being explicitly programmed to do so. Machine learning algorithms use historical
data as input to predict new output values.
"""
doc_id = await db.ingest_document(
content=content,
metadata={
"title": "ML Introduction",
"category": "tech",
"tags": ["ML", "AI", "technology"]
}
)
print(f"✓ Document ingested successfully (ID: {doc_id})")
# Query the document
results = await db.query(
query="What is machine learning?",
k=1 # Get top result
)
print("\nQuery Results:")
for result in results:
print(f"Content: {result.content.strip()}")
print(f"Score: {result.score:.2f}")
print(f"Metadata: {result.metadata}")
except DataBridgeError as e:
print(f"× Error: {str(e)}")
finally:
await db.close()
async def example_pdf():
"""Example of ingesting and querying PDF documents"""
print("\n=== PDF Document Example ===")
uri = os.getenv("DATABRIDGE_URI")
if not uri:
raise ValueError("Please set DATABRIDGE_URI environment variable")
# Path to a sample PDF in the examples directory
pdf_path = Path(__file__).parent / "sample.pdf"
if not pdf_path.exists():
print("× sample.pdf not found in examples directory")
return
db = DataBridge(uri)
try:
# Read and ingest PDF
with open(pdf_path, "rb") as f:
pdf_content = f.read()
doc_id = await db.ingest_document(
content=pdf_content,
metadata={
"title": "Sample Document",
"source": "examples",
"file_type": "pdf"
},
content_type=ContentType.PDF
)
print(f"✓ PDF ingested successfully (ID: {doc_id})")
# Query the PDF content
results = await db.query(
query="What is the main topic of this document?",
k=2, # Get top 2 results
filters={"file_type": "pdf"} # Only search PDF documents
)
print("\nQuery Results:")
for i, result in enumerate(results, 1):
print(f"\nResult {i}:")
print(f"Content: {result.content[:200]}...")
print(f"Score: {result.score:.2f}")
print(f"Document ID: {result.doc_id}")
except DataBridgeError as e:
print(f"× Error: {str(e)}")
finally:
await db.close()
async def example_batch():
"""Example of batch operations"""
print("\n=== Batch Operations Example ===")
uri = os.getenv("DATABRIDGE_URI")
if not uri:
raise ValueError("Please set DATABRIDGE_URI environment variable")
db = DataBridge(uri)
try:
# Prepare multiple documents
documents = [
{
"content": "Python is a programming language.",
"metadata": {"category": "programming", "level": "basic"}
},
{
"content": "JavaScript runs in the browser.",
"metadata": {"category": "programming", "level": "basic"}
},
{
"content": "Docker containers package applications.",
"metadata": {"category": "devops", "level": "intermediate"}
}
]
# Ingest multiple documents
doc_ids = []
for doc in documents:
doc_id = await db.ingest_document(
content=doc["content"],
metadata=doc["metadata"]
)
doc_ids.append(doc_id)
print(f"✓ Ingested {len(doc_ids)} documents")
# Query with filters
results = await db.query(
query="What is Python?",
filters={"category": "programming"}
)
print("\nQuery Results (Programming category only):")
for result in results:
print(f"\nContent: {result.content}")
print(f"Category: {result.metadata['category']}")
print(f"Level: {result.metadata['level']}")
except DataBridgeError as e:
print(f"× Error: {str(e)}")
finally:
await db.close()
async def main():
"""Run all examples"""
try:
await example_text()
await example_pdf()
await example_batch()
except Exception as e:
print(f"× Main error: {str(e)}")
if __name__ == "__main__":
asyncio.run(main())