mirror of
https://github.com/james-m-jordan/morphik-core.git
synced 2025-05-09 19:32:38 +00:00
89 lines
3.0 KiB
Python
89 lines
3.0 KiB
Python
import time
|
|
|
|
from sdks.python.morphik.sync import Morphik
|
|
|
|
# sys.path.append(str(Path(__file__).parent.parent))
|
|
|
|
|
|
# Create the Morphik client used by the rest of this example.
# NOTE(review): is_local=True presumably targets a locally running server and
# timeout is presumably in seconds — confirm against the SDK.
db = Morphik(is_local=True, timeout=10000)
|
|
|
|
|
|
# Helper function to wait for document ingestion to complete
|
|
def wait_for_ingestion_completion(document_id, max_wait_time=600, check_interval=30):
    """
    Poll the status API until the document's ingestion finishes or time runs out.

    The status is checked at least once and then again after every pause. The
    pause between checks is ``check_interval`` seconds, shortened on the last
    round so the total wait does not exceed ``max_wait_time``.

    Args:
        document_id: The ID of the document to check
        max_wait_time: Maximum time to wait in seconds
        check_interval: Time between checks in seconds

    Returns:
        True if ingestion completed, False if it failed or the wait timed out
    """
    # Use the monotonic clock so wall-clock adjustments cannot stretch or
    # shrink the measured wait.
    deadline = time.monotonic() + max_wait_time

    while True:
        # Get the document status info directly using the status API
        status = db.get_document_status(document_id).get("status")

        # Check if ingestion is completed
        if status == "completed":
            print(f"Document ingestion completed for {document_id}")
            return True

        # NOTE(review): assumes the status API reports a terminal "failed"
        # state — confirm against the server. Waiting longer cannot help then.
        if status == "failed":
            print(f"Document ingestion failed for {document_id}")
            return False

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break

        # Never sleep past the deadline; the loop re-checks the status once
        # more right after the final (possibly shortened) pause.
        pause = min(check_interval, remaining)
        print(f"Document status: {status}. Waiting {pause:g} seconds...")
        time.sleep(pause)

    print(f"Warning: Maximum wait time reached for document {document_id}")
    return False
|
|
|
|
|
|
# Define a single image-focused post_chunking rule.
# The list holds one "metadata_extraction" rule with use_images enabled; its
# schema names three fields (graph_details, technical_elements,
# visual_content_summary), each with a type and a natural-language description.
image_rules = [
    {
        "type": "metadata_extraction",
        "stage": "post_chunking",
        "use_images": True,
        "schema": {
            "graph_details": {
                "type": "string",
                "description": "Detailed description of any graphs, charts, or diagrams visible "
                "in the image, including axis labels, trends, and key data points",
            },
            "technical_elements": {
                "type": "array",
                "description": "List of technical elements visible in the image such as formulas, "
                "equations, or technical diagrams",
            },
            "visual_content_summary": {
                "type": "string",
                "description": "Brief summary of the visual content in the technical document",
            },
        },
    }
]
|
|
|
|
# Ingest document with image-focused post_chunking rule
print("Ingesting document with image-focused post_chunking rule...")
doc = db.ingest_file(
    "examples/assets/colpali_example.pdf",
    rules=image_rules,
    metadata={"source": "example", "rules_stage": "image_analysis"},
    use_colpali=True,  # Enable colpali for image processing, critical for handling images
)

# Wait for ingestion to complete. The helper returns False when it gives up,
# so say so instead of presenting possibly unfinished metadata as final.
if not wait_for_ingestion_completion(doc.external_id):
    print("Warning: ingestion did not complete; the metadata below may be incomplete")

# Get updated document information with processed image metadata
updated_doc = db.get_document(doc.external_id)

print("\n" + "=" * 50)
print("DOCUMENT WITH IMAGE PROCESSING RULES")
print("=" * 50)
print(f"Document ID: {updated_doc.external_id}")
print(f"Document metadata: {updated_doc.metadata}")
print(f"Document system metadata: {updated_doc.system_metadata}")
|