morphik-core/examples/rules_engine.py
Adityavardhan Agrawal 1792275cb8
Format fix, UI package update (#100)
Co-authored-by: Arnav Agrawal <aa779@cornell.edu>
2025-04-20 16:34:29 -07:00

94 lines
2.7 KiB
Python

import os
from dotenv import load_dotenv
from morphik import Morphik
from morphik.rules import MetadataExtractionRule, NaturalLanguageRule
from pydantic import BaseModel
# Populate the process environment via python-dotenv before reading any settings.
load_dotenv()

# Build the Morphik client. The URI is taken from the MORPHIK_URI environment
# variable (None is passed through if it is unset).
db = Morphik(
    os.environ.get("MORPHIK_URI"),
    timeout=10000,
    is_local=True,
)
# Sample report used as the input to db.ingest_text() further down.
# It contains a title, date, author and department (the fields declared on
# DocumentInfo) as well as a name and an e-mail address, which the PII-removal
# prompt asks to have stripped.
sample_text = """
Report: Q2 Financial Analysis
Date: June 30, 2023
Author: John Smith, Chief Financial Analyst
Department: Finance
CONFIDENTIAL - INTERNAL USE ONLY
The second quarter showed a 15% increase in revenue compared to Q1.
Key metrics:
- Total Revenue: $5.2M
- Operating Expenses: $3.1M
- Net Profit: $2.1M
Contact john.smith@example.com for questions.
"""
# Pydantic model handed to MetadataExtractionRule as its schema.
# Each attribute is one string field to be extracted from the document.
# (Documented with comments rather than a docstring so the model itself is unchanged.)
class DocumentInfo(BaseModel):
    title: str
    date: str
    author: str
    department: str
# Assemble the rule list passed to ingest_text():
#   1. metadata extraction driven by the DocumentInfo schema,
#   2. a natural-language prompt asking for PII to be removed,
#   3. a natural-language prompt asking for a one-paragraph financial summary.
rules = [
    MetadataExtractionRule(schema=DocumentInfo),
    NaturalLanguageRule(
        prompt="Remove all personally identifiable information including names, email addresses, and phone numbers."
    ),
    NaturalLanguageRule(
        prompt="Summarize this document in one paragraph focusing on the financial results."
    ),
]

# Keep each rule reachable under its own name as well.
metadata_rule, pii_removal_rule, summary_rule = rules
# Ingest the sample text with the rule list and a caller-supplied metadata entry.
print("Ingesting document with rules...")
doc = db.ingest_text(
    sample_text,
    rules=rules,
    metadata={"category": "financial"},
)
print(f"Ingested document with ID: {doc.external_id}")

# Dump whatever metadata is attached to the returned document, one entry per line.
print("\nExtracted metadata:")
for key, value in doc.metadata.items():
    print(f" {key}: {value}")

# Ask for the single best-matching chunk (k=1), filtered to the document just ingested.
chunks = db.retrieve_chunks(
    query="Financial results",
    filters={"document_id": doc.external_id},
    k=1,
)
print("\nTransformed document (with PII removed):")
# Nothing is printed when no chunk came back.
if chunks:
    print(chunks[0].content)
# Second scenario: apply rules while ingesting a file.
print("\nDefining rules for file ingestion...")

# The same kinds of rules, written as plain dictionaries instead of rule objects.
file_rules = [
    {
        "type": "metadata_extraction",
        "schema": {
            "title": "string",
            "author": "string",
            "company": "string",
            "year": "number",
        },
    },
    {
        "type": "natural_language",
        "prompt": "Classify this document as either 'technical', 'financial', or 'legal'.",
    },
]

# The path is relative, so this presumably has to be run from the repository root.
file_doc = db.ingest_file(
    "examples/assets/colpali_example.pdf",
    rules=file_rules,
)
print(f"Ingested file with rules, ID: {file_doc.external_id}")

# NOTE(review): the extraction schema above has no "classification" field and the
# classification request is a natural_language rule, so this lookup may always
# fall back to 'Not classified' - confirm against the rules documentation.
print(f"Classification: {file_doc.metadata.get('classification', 'Not classified')}")