mirror of
https://github.com/james-m-jordan/morphik-core.git
synced 2025-05-09 19:32:38 +00:00
108 lines
2.8 KiB
Python
108 lines
2.8 KiB
Python
import os
|
|
from dotenv import load_dotenv
|
|
from morphik import Morphik
|
|
from morphik.rules import MetadataExtractionRule, NaturalLanguageRule
|
|
from pydantic import BaseModel
|
|
|
|
# Load environment variables (python-dotenv reads them from a local .env file
# when one is present), so MORPHIK_URI does not have to be exported by hand.
load_dotenv()

# Connect to Morphik.
# os.getenv returns None when MORPHIK_URI is unset; that value is passed to the
# client unchanged.
# NOTE(review): confirm how Morphik treats a missing URI, and the unit of
# `timeout`, against the SDK documentation.
db = Morphik(os.getenv("MORPHIK_URI"), timeout=10000, is_local=True)
|
|
|
|
# Define sample text with information we want to extract.
# The report deliberately contains a title, date, author and department (for
# the metadata schema), a name and an e-mail address (for the PII rule), and
# revenue figures (for the summary rule).
# The newline right after the opening quotes is part of the string.
sample_text = """
Report: Q2 Financial Analysis
Date: June 30, 2023
Author: John Smith, Chief Financial Analyst
Department: Finance

CONFIDENTIAL - INTERNAL USE ONLY

The second quarter showed a 15% increase in revenue compared to Q1.
Key metrics:
- Total Revenue: $5.2M
- Operating Expenses: $3.1M
- Net Profit: $2.1M

Contact john.smith@example.com for questions.
"""
|
|
|
|
# Define schema for metadata extraction using Pydantic.
# Each field is a required string; the class is handed to
# MetadataExtractionRule below as the shape of the metadata to extract.
# (Documented with comments rather than a docstring so the schema derived from
# the model is left exactly as the fields define it.)
class DocumentInfo(BaseModel):
    title: str
    date: str
    author: str
    department: str
|
|
|
|
# Define rules using the rules builder

# 1. Metadata extraction rule: extract the fields declared on DocumentInfo.
metadata_rule = MetadataExtractionRule(schema=DocumentInfo)

# 2. Natural language rule to remove PII
pii_removal_rule = NaturalLanguageRule(
    prompt="Remove all personally identifiable information including names, email addresses, and phone numbers."
)

# 3. Natural language rule to summarize
summary_rule = NaturalLanguageRule(
    prompt="Summarize this document in one paragraph focusing on the financial results."
)

# Combine rules.
# NOTE(review): presumably the rules are applied in list order, so metadata is
# extracted before PII is stripped -- confirm against the Morphik rules docs.
rules = [metadata_rule, pii_removal_rule, summary_rule]
|
|
|
|
# Ingest the sample text together with the rules defined above. The explicit
# metadata dict is supplied by the caller in addition to the rules.
print("Ingesting document with rules...")
doc = db.ingest_text(
    sample_text,
    rules=rules,
    metadata={"category": "financial"},
)

print(f"Ingested document with ID: {doc.external_id}")

# Check the extracted metadata: print every key/value pair found on the
# returned document.
print("\nExtracted metadata:")
for key, value in doc.metadata.items():
    print(f" {key}: {value}")
|
|
|
|
# Retrieve the transformed document: ask for the single best-matching chunk
# (k=1), restricted to the document ingested above.
# NOTE(review): assumes "document_id" is accepted as a filter key -- confirm
# against the Morphik retrieve_chunks documentation.
chunks = db.retrieve_chunks(
    query="Financial results",
    filters={"document_id": doc.external_id},
    k=1,
)

print("\nTransformed document (with PII removed):")
if chunks:
    print(chunks[0].content)
else:
    # Without this branch the header above would be followed by nothing,
    # which is indistinguishable from an empty chunk.
    print("(no chunks were returned for this document)")
|
|
|
|
# Use rules with file ingestion
print("\nDefining rules for file ingestion...")

# Rules can also be defined using dictionaries instead of rule objects.
# Each dict carries a "type" plus the settings for that rule type: a "schema"
# mapping field names to type names, or a natural-language "prompt".
file_rules = [
    {
        "type": "metadata_extraction",
        "schema": {
            "title": "string",
            "author": "string",
            "company": "string",
            "year": "number",
        },
    },
    {
        "type": "natural_language",
        "prompt": "Classify this document as either 'technical', 'financial', or 'legal'.",
    },
]
|
|
|
|
# Try to ingest a file with these rules.
# The path is relative, so it is resolved against the current working
# directory of the process running this script.
file_doc = db.ingest_file(
    "examples/assets/colpali_example.pdf",
    rules=file_rules,
)

print(f"Ingested file with rules, ID: {file_doc.external_id}")
# NOTE(review): nothing in this script shows where the classification answer is
# stored; the lookup falls back to 'Not classified' when the 'classification'
# key is absent from the document metadata.
print(f"Classification: {file_doc.metadata.get('classification', 'Not classified')}")