"""Rule models for post-processing ingested content.

Rules either extract structured metadata from text or transform the text
itself, using an LLM configured via ``settings.RULES_MODEL``.
"""

import logging
from abc import ABC, abstractmethod
from typing import Any, Dict, Literal

import litellm
from pydantic import BaseModel

from core.config import get_settings

logger = logging.getLogger(__name__)

settings = get_settings()
class BaseRule(BaseModel, ABC):
    """Abstract base model for all rules.

    Concrete subclasses declare a discriminating ``type`` field and
    implement :meth:`apply`.
    """

    # Discriminator string identifying the concrete rule subclass
    # (e.g. "metadata_extraction", "natural_language").
    type: str

    @abstractmethod
    async def apply(self, content: str) -> tuple[Dict[str, Any], str]:
        """
        Apply the rule to the content.

        Args:
            content: The content to apply the rule to

        Returns:
            tuple[Dict[str, Any], str]: (metadata, modified_content)
        """
        pass
class MetadataOutput(BaseModel):
    """Model for metadata extraction results.

    Declares no static fields: a schema-specific subclass is created
    dynamically at runtime via ``pydantic.create_model``.
    """
class MetadataExtractionRule(BaseRule):
    """Rule for extracting metadata using a schema.

    ``schema`` maps field names to JSON-schema-style descriptors
    (``{"type": "string"}`` etc.); an LLM fills in the fields via
    instructor + litellm.
    """

    type: Literal["metadata_extraction"]
    # Field name -> descriptor dict. NOTE(review): shadows pydantic's
    # BaseModel.schema() helper; kept for compatibility with stored rules.
    schema: Dict[str, Any]

    async def apply(self, content: str) -> tuple[Dict[str, Any], str]:
        """Extract metadata from *content* according to ``self.schema``.

        Args:
            content: The text to extract metadata from.

        Returns:
            tuple[Dict[str, Any], str]: (extracted metadata, original
            content unchanged). Metadata is ``{}`` if extraction fails.

        Raises:
            ValueError: If ``settings.RULES_MODEL`` is not present in
                ``settings.REGISTERED_MODELS``.
        """
        import instructor
        from pydantic import create_model

        # Schema primitive name -> Python type for the dynamic model.
        # Unknown or missing types fall back to Any.
        type_map = {
            "string": str,
            "number": float,
            "integer": int,
            "boolean": bool,
            "array": list,
            "object": dict,
        }

        # Create a dynamic Pydantic model based on the schema.
        # This allows instructor to validate the output against our schema.
        field_definitions: Dict[str, Any] = {}
        for field_name, field_info in self.schema.items():
            if isinstance(field_info, dict) and "type" in field_info:
                python_type = type_map.get(field_info["type"], Any)
            else:
                python_type = Any
            # Every field is optional with a None default.
            field_definitions[field_name] = (python_type, None)

        DynamicMetadataModel = create_model("DynamicMetadataModel", **field_definitions)

        prompt = f"""
        Extract metadata from the following text according to this schema:
        {self.schema}

        Text to extract from:
        {content}

        Extract all relevant information that matches the schema.
        """

        # Get the model configuration from registered_models.
        model_config = settings.REGISTERED_MODELS.get(settings.RULES_MODEL, {})
        if not model_config:
            raise ValueError(
                f"Model '{settings.RULES_MODEL}' not found in registered_models configuration"
            )

        system_message = {
            "role": "system",
            "content": "You are a metadata extraction assistant. Extract structured metadata from text precisely following the provided schema.",
        }

        user_message = {"role": "user", "content": prompt}

        # Use instructor with litellm to get structured responses.
        client = instructor.from_litellm(litellm.acompletion, mode=instructor.Mode.JSON)

        try:
            # Extract the model name for the instructor call; every other
            # config key is forwarded to litellm as-is.
            model = model_config.get("model_name")
            model_kwargs = {k: v for k, v in model_config.items() if k != "model_name"}

            response = await client.chat.completions.create(
                model=model,
                messages=[system_message, user_message],
                response_model=DynamicMetadataModel,
                **model_kwargs,
            )

            # Convert the validated pydantic model to a plain dict.
            metadata = response.model_dump()
        except Exception:
            # Best-effort: log with traceback and return empty metadata
            # rather than failing the whole ingestion pipeline.
            logger.exception("Error in instructor metadata extraction")
            metadata = {}

        return metadata, content
class TransformationOutput(BaseModel):
    """Structured response model for text transformation calls.

    Attributes:
        transformed_text: The rewritten text produced by the model.
    """

    transformed_text: str
class NaturalLanguageRule(BaseRule):
    """Rule for transforming content using a natural-language instruction."""

    type: Literal["natural_language"]
    # Instruction describing how the content should be rewritten.
    prompt: str

    async def apply(self, content: str) -> tuple[Dict[str, Any], str]:
        """Transform *content* according to ``self.prompt``.

        Args:
            content: The text to transform.

        Returns:
            tuple[Dict[str, Any], str]: (empty metadata dict, transformed
            text). On failure the original content is returned unchanged.

        Raises:
            ValueError: If ``settings.RULES_MODEL`` is not present in
                ``settings.REGISTERED_MODELS``.
        """
        import instructor

        prompt = f"""
        Your task is to transform the following text according to this instruction:
        {self.prompt}

        Text to transform:
        {content}

        Perform the transformation and return only the transformed text.
        """

        # Get the model configuration from registered_models.
        model_config = settings.REGISTERED_MODELS.get(settings.RULES_MODEL, {})
        if not model_config:
            raise ValueError(
                f"Model '{settings.RULES_MODEL}' not found in registered_models configuration"
            )

        system_message = {
            "role": "system",
            "content": "You are a text transformation assistant. Transform text precisely following the provided instructions.",
        }

        user_message = {"role": "user", "content": prompt}

        # Use instructor with litellm to get structured responses.
        client = instructor.from_litellm(litellm.acompletion, mode=instructor.Mode.JSON)

        try:
            # Extract the model name for the instructor call; every other
            # config key is forwarded to litellm as-is.
            model = model_config.get("model_name")
            model_kwargs = {k: v for k, v in model_config.items() if k != "model_name"}

            response = await client.chat.completions.create(
                model=model,
                messages=[system_message, user_message],
                response_model=TransformationOutput,
                **model_kwargs,
            )

            transformed_text = response.transformed_text
        except Exception:
            # Best-effort: log with traceback and keep the original text.
            logger.exception("Error in instructor text transformation")
            transformed_text = content  # Return original content on error

        return {}, transformed_text