2025-02-07 21:08:40 -05:00
|
|
|
from abc import ABC, abstractmethod
|
2025-04-23 23:15:03 -07:00
|
|
|
from typing import Any, Dict, Literal, Type, Union
|
2025-04-20 16:34:29 -07:00
|
|
|
|
2025-02-07 21:08:40 -05:00
|
|
|
from pydantic import BaseModel
|
|
|
|
|
|
|
|
|
|
|
|
class Rule(ABC):
    """Abstract parent of every rule that can be applied during document ingestion."""

    @abstractmethod
    def to_dict(self) -> Dict[str, Any]:
        """Serialize this rule into the dictionary format used for API requests.

        Concrete subclasses must override this method; the base
        implementation has no body of its own and returns ``None``.
        """
|
|
|
|
|
|
|
|
|
|
|
|
class MetadataExtractionRule(Rule):
    """Server-side rule that extracts metadata according to a schema."""

    def __init__(
        self,
        schema: Union[Type[BaseModel], Dict[str, Any]],
        stage: Literal["post_parsing", "post_chunking"] = "post_parsing",
    ):
        """
        Args:
            schema: Either a Pydantic model class or a plain dict schema that
                describes the metadata fields to extract.
            stage: Point at which the rule runs - "post_parsing" (full document
                text) or "post_chunking" (individual chunks). The default,
                "post_parsing", is kept for backward compatibility.
        """
        self.schema = schema
        self.stage = stage

    def to_dict(self) -> Dict[str, Any]:
        """Build the API request payload for this rule.

        Returns:
            A dict with the keys "type" (always "metadata_extraction"),
            "schema" and "stage".
        """
        schema = self.schema
        # A Pydantic model class is converted to its JSON schema; anything
        # else is assumed to already be a dict schema and is passed through.
        is_model_class = isinstance(schema, type) and issubclass(schema, BaseModel)
        resolved_schema = schema.model_json_schema() if is_model_class else schema
        return {
            "type": "metadata_extraction",
            "schema": resolved_schema,
            "stage": self.stage,
        }
|
2025-02-07 21:08:40 -05:00
|
|
|
|
|
|
|
|
|
|
|
class NaturalLanguageRule(Rule):
    """Server-side rule that transforms content from a natural language instruction."""

    def __init__(
        self,
        prompt: str,
        stage: Literal["post_parsing", "post_chunking"] = "post_parsing",
    ):
        """
        Args:
            prompt: Instruction describing how the content should be transformed,
                e.g. "Remove any personal information" or "Convert to bullet points".
            stage: Point at which the rule runs - "post_parsing" (full document
                text) or "post_chunking" (individual chunks). The default,
                "post_parsing", is kept for backward compatibility.
        """
        self.prompt = prompt
        self.stage = stage

    def to_dict(self) -> Dict[str, Any]:
        """Build the API request payload for this rule.

        Returns:
            A dict with the keys "type" (always "natural_language"),
            "prompt" and "stage".
        """
        payload: Dict[str, Any] = {
            "type": "natural_language",
            "prompt": self.prompt,
            "stage": self.stage,
        }
        return payload
|
2025-02-07 21:08:40 -05:00
|
|
|
|
|
|
|
|
|
|
|
# Names exported by `from <module> import *`.
__all__ = [
    "Rule",
    "MetadataExtractionRule",
    "NaturalLanguageRule",
]
|