# morphik-core/examples/batch_operations.py
#
# Example: batch operations with Morphik — batch file ingestion, batch
# document retrieval, and directory ingestion, with temp-file cleanup.
import os
import shutil
import tempfile

from dotenv import load_dotenv
from morphik import Morphik
# Pull configuration (MORPHIK_URI, etc.) from a local .env file.
load_dotenv()

# Connect to the Morphik instance named by MORPHIK_URI.
# Generous timeout because batch uploads can take a while; is_local targets
# a locally-running server.
morphik_uri = os.getenv("MORPHIK_URI")
db = Morphik(morphik_uri, timeout=10000, is_local=True)
# Helper: stage a handful of demo text files for the batch-ingestion example.
def create_sample_files():
    """Write sample .txt files into a fresh temporary directory.

    Returns:
        A ``(temp_dir, file_paths)`` tuple — the directory that was created
        and the list of file paths written into it. The caller is
        responsible for removing ``temp_dir`` when done.
    """
    snippets = [
        "Artificial Intelligence is transforming various industries.",
        "Machine Learning models require significant amounts of data.",
        "Natural Language Processing enables computers to understand human language.",
        "Computer Vision systems can detect objects in images and videos.",
        "Reinforcement Learning is used in robotics and game-playing AI.",
    ]

    staging_dir = tempfile.mkdtemp()
    print(f"Created temporary directory: {staging_dir}")

    # One file per snippet, named sample_1.txt .. sample_N.txt.
    written = []
    for index, snippet in enumerate(snippets, start=1):
        target = os.path.join(staging_dir, f"sample_{index}.txt")
        with open(target, "w") as handle:
            handle.write(snippet)
        written.append(target)

    return staging_dir, written
# --- Demo: batch ingest, batch retrieve, directory ingest, cleanup ---------

staging_dir, sample_paths = create_sample_files()
print(f"Created {len(sample_paths)} sample files")

# Ingest every sample file in one call.
print("\nPerforming batch ingestion of files...")
ingested_docs = db.ingest_files(
    files=sample_paths,
    metadata={"category": "AI technology", "source": "batch example"},
    parallel=True,  # Process in parallel for faster ingestion
)
print(f"Ingested {len(ingested_docs)} documents")

# Collect the server-assigned IDs so we can fetch the documents back.
doc_ids = [doc.external_id for doc in ingested_docs]
print(f"Document IDs: {doc_ids}")

# Fetch all of the just-ingested documents in a single round trip.
print("\nPerforming batch retrieval of documents...")
fetched_docs = db.batch_get_documents(doc_ids)
print(f"Retrieved {len(fetched_docs)} documents")

# Alternatively, ingest a whole directory tree matching a glob pattern.
print("\nIngesting all files in a directory...")
directory_docs = db.ingest_directory(
    staging_dir,
    recursive=True,
    pattern="*.txt",
    metadata={"source": "directory ingestion"},
)
print(f"Ingested {len(directory_docs)} documents from directory")

# Remove the temporary files created by create_sample_files().
shutil.rmtree(staging_dir)
print(f"\nCleaned up temporary directory: {staging_dir}")