# morphik-core/sdks/python/morphik/rules.py

from abc import ABC, abstractmethod
from typing import Any, Dict, Literal, Type, Union

from pydantic import BaseModel


class Rule(ABC):
    """Abstract parent of every rule that can be applied during document ingestion.

    Concrete rules must implement :meth:`to_dict`; the class cannot be
    instantiated until they do.
    """

    @abstractmethod
    def to_dict(self) -> Dict[str, Any]:
        """Serialize the rule into the dictionary format used for API requests.

        Returns:
            A dictionary describing the rule. The keys are chosen by each
            concrete subclass.
        """
        ...


class MetadataExtractionRule(Rule):
    """Server-side rule that extracts metadata according to a schema."""

    def __init__(
        self,
        schema: Union[Type[BaseModel], Dict[str, Any]],
        stage: Literal["post_parsing", "post_chunking"] = "post_parsing",
    ):
        """Store the extraction schema and the stage the rule runs at.

        Args:
            schema: Either a Pydantic model class or an already-built dict
                schema naming the metadata fields to extract.
            stage: When to apply the rule - "post_parsing" (full document
                text) or "post_chunking" (individual chunks). Defaults to
                "post_parsing" for backward compatibility.
        """
        self.stage = stage
        self.schema = schema

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the rule into the dictionary format used for API requests.

        Returns:
            A dict with the keys "type" (always "metadata_extraction"),
            "schema" and "stage". A Pydantic model class is converted with
            ``model_json_schema()``; any other value is passed through
            unchanged on the assumption that it is already a dict schema.
        """
        schema = self.schema
        # Check `type` first so that issubclass() is never handed a non-class.
        is_model_class = isinstance(schema, type) and issubclass(schema, BaseModel)

        payload: Dict[str, Any] = {"type": "metadata_extraction"}
        payload["schema"] = schema.model_json_schema() if is_model_class else schema
        payload["stage"] = self.stage
        return payload


class NaturalLanguageRule(Rule):
    """Server-side rule that transforms content based on a natural language prompt."""

    def __init__(
        self,
        prompt: str,
        stage: Literal["post_parsing", "post_chunking"] = "post_parsing",
    ):
        """Store the transformation prompt and the stage the rule runs at.

        Args:
            prompt: Instruction describing how the content should be
                transformed, e.g. "Remove any personal information" or
                "Convert to bullet points".
            stage: When to apply the rule - "post_parsing" (full document
                text) or "post_chunking" (individual chunks). Defaults to
                "post_parsing" for backward compatibility.
        """
        self.stage = stage
        self.prompt = prompt

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the rule into the dictionary format used for API requests.

        Returns:
            A dict with the keys "type" (always "natural_language"),
            "prompt" and "stage".
        """
        payload: Dict[str, Any] = {"type": "natural_language"}
        payload["prompt"] = self.prompt
        payload["stage"] = self.stage
        return payload


# Public API of this module: the names exported by a wildcard import.
__all__ = ["Rule", "MetadataExtractionRule", "NaturalLanguageRule"]
Add natural language rules based ingestion (#34) 2025-02-07 21:08:40 -05:00			`from abc import ABC, abstractmethod`
Staged Rule Execution (#111) 2025-04-23 23:15:03 -07:00			`from typing import Any, Dict, Literal, Type, Union`
Format fix, UI package update (#100) Co-authored-by: Arnav Agrawal <aa779@cornell.edu> 2025-04-20 16:34:29 -07:00
Add natural language rules based ingestion (#34) 2025-02-07 21:08:40 -05:00			`from pydantic import BaseModel`


			`class Rule(ABC):`
			`"""Base class for all rules that can be applied during document ingestion"""`

			`@abstractmethod`
			`def to_dict(self) -> Dict[str, Any]:`
			`"""Convert the rule to a dictionary format for API requests"""`
			`pass`


			`class MetadataExtractionRule(Rule):`
			`"""Server-side rule for extracting metadata using a schema"""`

Staged Rule Execution (#111) 2025-04-23 23:15:03 -07:00			`def __init__(`
			`self,`
			`schema: Union[Type[BaseModel], Dict[str, Any]],`
			`stage: Literal["post_parsing", "post_chunking"] = "post_parsing",`
			`):`
			`"""`
			`Args:`
			`schema: Pydantic model or dict schema defining metadata fields to extract`
			`stage: When to apply the rule - either "post_parsing" (full document text) or`
			`"post_chunking" (individual chunks). Defaults to "post_parsing" for backward compatibility.`
			`"""`
Add natural language rules based ingestion (#34) 2025-02-07 21:08:40 -05:00			`self.schema = schema`
Staged Rule Execution (#111) 2025-04-23 23:15:03 -07:00			`self.stage = stage`
Add natural language rules based ingestion (#34) 2025-02-07 21:08:40 -05:00
			`def to_dict(self) -> Dict[str, Any]:`
			`if isinstance(self.schema, type) and issubclass(self.schema, BaseModel):`
			`# Convert Pydantic model to dict schema`
			`schema_dict = self.schema.model_json_schema()`
			`else:`
			`# Assume it's already a dict schema`
			`schema_dict = self.schema`

Staged Rule Execution (#111) 2025-04-23 23:15:03 -07:00			`return {"type": "metadata_extraction", "schema": schema_dict, "stage": self.stage}`
Add natural language rules based ingestion (#34) 2025-02-07 21:08:40 -05:00

			`class NaturalLanguageRule(Rule):`
			`"""Server-side rule for transforming content using natural language"""`

Staged Rule Execution (#111) 2025-04-23 23:15:03 -07:00			`def __init__(self, prompt: str, stage: Literal["post_parsing", "post_chunking"] = "post_parsing"):`
Add natural language rules based ingestion (#34) 2025-02-07 21:08:40 -05:00			`"""`
			`Args:`
			`prompt: Instruction for how to transform the content`
			`e.g. "Remove any personal information" or "Convert to bullet points"`
Staged Rule Execution (#111) 2025-04-23 23:15:03 -07:00			`stage: When to apply the rule - either "post_parsing" (full document text) or`
			`"post_chunking" (individual chunks). Defaults to "post_parsing" for backward compatibility.`
Add natural language rules based ingestion (#34) 2025-02-07 21:08:40 -05:00			`"""`
			`self.prompt = prompt`
Staged Rule Execution (#111) 2025-04-23 23:15:03 -07:00			`self.stage = stage`
Add natural language rules based ingestion (#34) 2025-02-07 21:08:40 -05:00
			`def to_dict(self) -> Dict[str, Any]:`
Staged Rule Execution (#111) 2025-04-23 23:15:03 -07:00			`return {"type": "natural_language", "prompt": self.prompt, "stage": self.stage}`
Add natural language rules based ingestion (#34) 2025-02-07 21:08:40 -05:00

			`__all__ = ["Rule", "MetadataExtractionRule", "NaturalLanguageRule"]`