morphik-core/examples/image_rules_example.py

import time

from sdks.python.morphik.sync import Morphik

# sys.path.append(str(Path(__file__).parent.parent))


# Connect to Morphik. This module-level client is shared by the helper and the script below.
# NOTE(review): is_local=True presumably targets a locally running Morphik server, and
# timeout=10000 looks like a deliberately generous request timeout (units not visible
# here) — confirm both against the SDK.
db = Morphik(timeout=10000, is_local=True)


# Helper function to wait for document ingestion to complete
def wait_for_ingestion_completion(document_id, max_wait_time=600, check_interval=30):
    """
    Poll the system until document ingestion finishes or max wait time is reached.

    Polling stops early when the reported status is "failed", so a broken
    ingestion does not block the caller for the whole max_wait_time.

    Args:
        document_id: The ID of the document to check
        max_wait_time: Maximum time to wait in seconds
        check_interval: Time between checks in seconds

    Returns:
        True if ingestion completed, False if it failed or timed out
    """
    # Monotonic clock: the timeout must not be affected by system clock adjustments
    deadline = time.monotonic() + max_wait_time
    while time.monotonic() < deadline:
        # Get the document status info directly using the status API
        status_info = db.get_document_status(document_id)
        status = status_info.get("status")

        # Check if ingestion is completed
        if status == "completed":
            print(f"Document ingestion completed for {document_id}")
            return True

        # A failed ingestion is terminal; waiting any longer cannot succeed.
        # NOTE(review): assumes the status payload may carry an "error" entry on
        # failure; a generic message is used when it is absent.
        if status == "failed":
            error_detail = status_info.get("error", "unknown error")
            print(f"Document ingestion failed for {document_id}: {error_detail}")
            return False

        print(f"Document status: {status}. Waiting {check_interval} seconds...")
        time.sleep(check_interval)

    print(f"Warning: Maximum wait time reached for document {document_id}")
    return False


# Schema of the fields the image-focused rule should extract from each chunk.
_image_metadata_schema = {}
_image_metadata_schema["graph_details"] = {
    "type": "string",
    "description": (
        "Detailed description of any graphs, charts, or diagrams visible "
        "in the image, including axis labels, trends, and key data points"
    ),
}
_image_metadata_schema["technical_elements"] = {
    "type": "array",
    "description": (
        "List of technical elements visible in the image such as formulas, "
        "equations, or technical diagrams"
    ),
}
_image_metadata_schema["visual_content_summary"] = {
    "type": "string",
    "description": "Brief summary of the visual content in the technical document",
}

# The rule list holds exactly one metadata-extraction rule, applied at the
# post_chunking stage with image input enabled.
_image_rule = {
    "type": "metadata_extraction",
    "stage": "post_chunking",
    "use_images": True,
    "schema": _image_metadata_schema,
}
image_rules = [_image_rule]

# Ingest document with image-focused post_chunking rule
print("Ingesting document with image-focused post_chunking rule...")
doc = db.ingest_file(
    "examples/assets/colpali_example.pdf",
    rules=image_rules,
    metadata={"source": "example", "rules_stage": "image_analysis"},
    use_colpali=True,  # Enable colpali for image processing, critical for handling images
)

# Wait for ingestion to complete. The helper returns False when it gives up, so
# tell the reader that the metadata printed below may not yet contain the
# extracted fields instead of silently presenting it as the final result.
ingestion_completed = wait_for_ingestion_completion(doc.external_id)
if not ingestion_completed:
    print("Warning: ingestion did not complete; the metadata shown below may be incomplete")

# Get updated document information with processed image metadata
# (still fetched on a best-effort basis even when the wait above gave up)
updated_doc = db.get_document(doc.external_id)

print("\n" + "=" * 50)
print("DOCUMENT WITH IMAGE PROCESSING RULES")
print("=" * 50)
print(f"Document ID: {updated_doc.external_id}")
print(f"Document metadata: {updated_doc.metadata}")
print(f"Document system metadata: {updated_doc.system_metadata}")