mirror of
https://github.com/james-m-jordan/morphik-core.git
synced 2025-05-09 19:32:38 +00:00
127 lines
4.0 KiB
Python
127 lines
4.0 KiB
Python
from typing import Dict, Any, Literal
|
|
from pydantic import BaseModel
|
|
from abc import ABC, abstractmethod
|
|
from core.config import get_settings
|
|
from openai import AsyncOpenAI
|
|
from ollama import AsyncClient
|
|
import json
|
|
import logging
|
|
|
|
logger = logging.getLogger(__name__)
settings = get_settings()

# Build the shared LLM client once, at import time, from the configured provider.
# Any provider value other than "openai" falls through to the Ollama client.
rules_client = (
    AsyncOpenAI()
    if settings.RULES_PROVIDER == "openai"
    else AsyncClient(host=settings.COMPLETION_OLLAMA_BASE_URL)
)
|
|
|
|
|
|
class BaseRule(BaseModel, ABC):
    """Abstract pydantic model from which every rule derives."""

    # Identifies the kind of rule; the concrete rules in this module narrow it to a Literal.
    type: str

    @abstractmethod
    async def apply(self, content: str) -> tuple[Dict[str, Any], str]:
        """
        Run this rule against a piece of content.

        Args:
            content: The content to apply the rule to

        Returns:
            tuple[Dict[str, Any], str]: (metadata, modified_content)
        """
        ...
|
|
|
|
|
|
class MetadataExtractionRule(BaseRule):
    """Rule for extracting metadata using a schema.

    The schema is interpolated into the prompt in its Python string form and
    the configured provider (OpenAI or Ollama) is asked to answer in JSON.
    """

    type: Literal["metadata_extraction"]
    # NOTE(review): pydantic's BaseModel also exposes a `schema` attribute, so this
    # field name presumably shadows it -- kept as-is because it is part of the public model.
    schema: Dict[str, Any]

    async def apply(self, content: str) -> tuple[Dict[str, Any], str]:
        """Extract metadata from ``content`` according to ``self.schema``.

        Args:
            content: The text to extract metadata from.

        Returns:
            tuple[Dict[str, Any], str]: (extracted metadata, ``content`` unchanged)

        Raises:
            ValueError: If the model returns no content, returns text that is
                not valid JSON (``json.JSONDecodeError`` is a ``ValueError``
                subclass), or returns JSON that is not an object.
        """
        prompt = f"""
        Extract metadata from the following text according to this schema:
        {self.schema}

        Text to extract from:
        {content}

        Return ONLY a JSON object with the extracted metadata.
        """

        # Both providers receive the same conversation; only the call shape differs.
        messages = [
            {
                "role": "system",
                "content": "You are a metadata extraction assistant. Always respond with valid JSON.",
            },
            {"role": "user", "content": prompt},
        ]

        if settings.RULES_PROVIDER == "openai":
            response = await rules_client.chat.completions.create(
                model=settings.RULES_MODEL,
                messages=messages,
                response_format={"type": "json_object"},
            )
            raw_reply = response.choices[0].message.content
        else:  # ollama
            response = await rules_client.chat(
                model=settings.RULES_MODEL,
                messages=messages,
                format="json",
            )
            raw_reply = response["message"]["content"]
            logger.debug("Ollama raw response: %s", raw_reply)

        # The provider may hand back no content at all (e.g. a refusal); fail with a
        # clear message instead of the TypeError json.loads(None) would raise.
        if raw_reply is None:
            raise ValueError("Metadata extraction returned no content to parse")

        metadata = json.loads(raw_reply)

        # Callers are promised a dict; reject JSON arrays/scalars up front.
        if not isinstance(metadata, dict):
            raise ValueError(
                f"Metadata extraction expected a JSON object, got {type(metadata).__name__}"
            )

        return metadata, content
|
|
|
|
|
|
class NaturalLanguageRule(BaseRule):
    """Rule for transforming content using a natural-language instruction.

    The instruction in ``prompt`` and the content are sent to the configured
    provider (OpenAI or Ollama); the model's reply becomes the new content.
    """

    type: Literal["natural_language"]
    prompt: str

    async def apply(self, content: str) -> tuple[Dict[str, Any], str]:
        """Transform ``content`` according to ``self.prompt``.

        Args:
            content: The text to transform.

        Returns:
            tuple[Dict[str, Any], str]: (empty metadata dict, transformed text)

        Raises:
            ValueError: If the model returns no content.
        """
        prompt = f"""
        Your task is to transform the following text according to this instruction:
        {self.prompt}

        Text to transform:
        {content}

        Return ONLY the transformed text.
        """

        # Both providers receive the same conversation; only the call shape differs.
        messages = [
            {"role": "system", "content": "You are a text transformation assistant."},
            {"role": "user", "content": prompt},
        ]

        if settings.RULES_PROVIDER == "openai":
            response = await rules_client.chat.completions.create(
                model=settings.RULES_MODEL,
                messages=messages,
            )
            transformed_text = response.choices[0].message.content
        else:  # ollama
            response = await rules_client.chat(
                model=settings.RULES_MODEL,
                messages=messages,
            )
            transformed_text = response["message"]["content"]

        # A missing reply must not leak out as None (the signature promises str), and
        # silently keeping the untransformed text could hide a failed transformation.
        if transformed_text is None:
            raise ValueError("Natural language rule returned no content")

        return {}, transformed_text
|