morphik-core/examples/image_rules_example.py
2025-04-25 20:43:04 -07:00

89 lines
3.0 KiB
Python

import time
from sdks.python.morphik.sync import Morphik
# sys.path.append(str(Path(__file__).parent.parent))
# Connect to Morphik. This module-level client is shared by the helper and the
# script steps below.
# NOTE(review): is_local=True presumably targets a locally running Morphik server and
# timeout=10000 is presumably expressed in seconds — confirm both against the SDK.
db = Morphik(timeout=10000, is_local=True)
# Helper function to wait for document ingestion to complete
def wait_for_ingestion_completion(document_id, max_wait_time=600, check_interval=30):
    """
    Poll the document status until ingestion finishes or the max wait time is reached.

    Args:
        document_id: The ID of the document to check
        max_wait_time: Maximum time to wait in seconds
        check_interval: Time between checks in seconds

    Returns:
        True if ingestion completed, False if it failed or timed out
    """
    # Use a monotonic clock so wall-clock adjustments cannot stretch or shrink the timeout.
    start_time = time.monotonic()

    while (time.monotonic() - start_time) < max_wait_time:
        # Get the document status info directly using the status API
        status_info = db.get_document_status(document_id)
        status = status_info.get("status")

        if status == "completed":
            print(f"Document ingestion completed for {document_id}")
            return True

        if status == "failed":
            # A failed ingestion never becomes "completed"; stop polling instead of
            # burning the remaining wait time.
            print(f"Document ingestion failed for {document_id}: {status_info.get('error', 'Unknown error')}")
            return False

        print(f"Document status: {status}. Waiting {check_interval} seconds...")
        time.sleep(check_interval)

    print(f"Warning: Maximum wait time reached for document {document_id}")
    return False
# Fields requested by the metadata_extraction rule defined below; each entry gives
# the field's type and a natural-language description of what to extract.
_image_metadata_schema = {
    "graph_details": {
        "type": "string",
        "description": (
            "Detailed description of any graphs, charts, or diagrams visible in the image, "
            "including axis labels, trends, and key data points"
        ),
    },
    "technical_elements": {
        "type": "array",
        "description": (
            "List of technical elements visible in the image such as formulas, equations, "
            "or technical diagrams"
        ),
    },
    "visual_content_summary": {
        "type": "string",
        "description": "Brief summary of the visual content in the technical document",
    },
}

# Define a single image-focused post_chunking rule
image_rules = [
    {
        "type": "metadata_extraction",
        "stage": "post_chunking",
        "use_images": True,
        "schema": _image_metadata_schema,
    }
]
# Ingest document with image-focused post_chunking rule
print("Ingesting document with image-focused post_chunking rule...")
doc = db.ingest_file(
    "examples/assets/colpali_example.pdf",
    rules=image_rules,
    metadata={"source": "example", "rules_stage": "image_analysis"},
    use_colpali=True,  # Enable colpali for image processing, critical for handling images
)

# Wait for ingestion to complete. The helper returns False when it stopped waiting
# without seeing a "completed" status, so surface that instead of ignoring it.
ingestion_completed = wait_for_ingestion_completion(doc.external_id)
if not ingestion_completed:
    # Best effort: still show whatever the server has, but flag that it may be partial.
    print("Warning: ingestion did not complete; the metadata below may be incomplete")

# Get updated document information with processed image metadata
updated_doc = db.get_document(doc.external_id)

print("\n" + "=" * 50)
print("DOCUMENT WITH IMAGE PROCESSING RULES")
print("=" * 50)
print(f"Document ID: {updated_doc.external_id}")
print(f"Document metadata: {updated_doc.metadata}")
print(f"Document system metadata: {updated_doc.system_metadata}")