mirror of
https://github.com/james-m-jordan/morphik-core.git
synced 2025-05-09 19:32:38 +00:00
Image extraction for rules (#115)
This commit is contained in:
parent
d9e4aba9ea
commit
8079b6d51e
@ -43,6 +43,7 @@ class MetadataExtractionRule(BaseRule):
|
|||||||
|
|
||||||
type: Literal["metadata_extraction"]
|
type: Literal["metadata_extraction"]
|
||||||
schema: Dict[str, Any]
|
schema: Dict[str, Any]
|
||||||
|
use_images: bool = False
|
||||||
|
|
||||||
async def apply(self, content: str, metadata: Optional[Dict[str, Any]] = None) -> tuple[Dict[str, Any], str]:
|
async def apply(self, content: str, metadata: Optional[Dict[str, Any]] = None) -> tuple[Dict[str, Any], str]:
|
||||||
"""Extract metadata according to schema"""
|
"""Extract metadata according to schema"""
|
||||||
@ -89,7 +90,10 @@ class MetadataExtractionRule(BaseRule):
|
|||||||
|
|
||||||
schema_text = "\n".join(schema_descriptions)
|
schema_text = "\n".join(schema_descriptions)
|
||||||
|
|
||||||
# Adjust prompt based on whether it's a chunk or full document
|
# Adjust prompt based on whether it's a chunk or full document and whether it's an image
|
||||||
|
if self.use_images:
|
||||||
|
prompt_context = "image" if self.stage == "post_chunking" else "document with images"
|
||||||
|
else:
|
||||||
prompt_context = "chunk of text" if self.stage == "post_chunking" else "text"
|
prompt_context = "chunk of text" if self.stage == "post_chunking" else "text"
|
||||||
|
|
||||||
prompt = f"""
|
prompt = f"""
|
||||||
@ -97,7 +101,7 @@ class MetadataExtractionRule(BaseRule):
|
|||||||
|
|
||||||
{schema_text}
|
{schema_text}
|
||||||
|
|
||||||
Text to extract from:
|
{"Image to analyze:" if self.use_images else "Text to extract from:"}
|
||||||
{content}
|
{content}
|
||||||
|
|
||||||
Follow these guidelines:
|
Follow these guidelines:
|
||||||
@ -113,16 +117,47 @@ class MetadataExtractionRule(BaseRule):
|
|||||||
if not model_config:
|
if not model_config:
|
||||||
raise ValueError(f"Model '{settings.RULES_MODEL}' not found in registered_models configuration")
|
raise ValueError(f"Model '{settings.RULES_MODEL}' not found in registered_models configuration")
|
||||||
|
|
||||||
|
# Prepare base64 data for vision model if this is an image rule
|
||||||
|
vision_messages = []
|
||||||
|
if self.use_images:
|
||||||
|
try:
|
||||||
|
# For image content, check if it's a base64 string
|
||||||
|
# Handle data URI format "data:image/png;base64,..."
|
||||||
|
if content.startswith("data:"):
|
||||||
|
content_type, content = content.split(";base64,", 1)
|
||||||
|
|
||||||
|
# User message with image content
|
||||||
|
vision_messages = [
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": [
|
||||||
|
{"type": "text", "text": prompt},
|
||||||
|
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{content}"}},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
]
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error preparing image content for vision model: {str(e)}")
|
||||||
|
# Fall back to text-only if image processing fails
|
||||||
|
vision_messages = []
|
||||||
|
|
||||||
system_message = {
|
system_message = {
|
||||||
"role": "system",
|
"role": "system",
|
||||||
"content": (
|
"content": (
|
||||||
"You are a metadata extraction assistant. Extract structured metadata from text "
|
"You are a metadata extraction assistant. Extract structured metadata "
|
||||||
|
f"from {'images' if self.use_images else 'text'} "
|
||||||
"precisely following the provided schema. Always return the metadata as direct values "
|
"precisely following the provided schema. Always return the metadata as direct values "
|
||||||
"(strings, numbers, booleans), not as objects with additional properties."
|
"(strings, numbers, booleans), not as objects with additional properties."
|
||||||
),
|
),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# If we have vision messages, use those, otherwise use standard text message
|
||||||
|
messages = []
|
||||||
|
if vision_messages and self.use_images:
|
||||||
|
messages = [system_message] + vision_messages
|
||||||
|
else:
|
||||||
user_message = {"role": "user", "content": prompt}
|
user_message = {"role": "user", "content": prompt}
|
||||||
|
messages = [system_message, user_message]
|
||||||
|
|
||||||
# Use instructor with litellm to get structured responses
|
# Use instructor with litellm to get structured responses
|
||||||
client = instructor.from_litellm(litellm.acompletion, mode=instructor.Mode.JSON)
|
client = instructor.from_litellm(litellm.acompletion, mode=instructor.Mode.JSON)
|
||||||
@ -137,7 +172,7 @@ class MetadataExtractionRule(BaseRule):
|
|||||||
# Use instructor's client to create a structured response
|
# Use instructor's client to create a structured response
|
||||||
response = await client.chat.completions.create(
|
response = await client.chat.completions.create(
|
||||||
model=model,
|
model=model,
|
||||||
messages=[system_message, user_message],
|
messages=messages,
|
||||||
response_model=DynamicMetadataModel,
|
response_model=DynamicMetadataModel,
|
||||||
**model_kwargs,
|
**model_kwargs,
|
||||||
)
|
)
|
||||||
|
@ -60,6 +60,9 @@ class RulesProcessor:
|
|||||||
logger.debug(f"Parsing rule of type: {rule_type}, stage: {stage}")
|
logger.debug(f"Parsing rule of type: {rule_type}, stage: {stage}")
|
||||||
|
|
||||||
if rule_type == "metadata_extraction":
|
if rule_type == "metadata_extraction":
|
||||||
|
# Default use_images to False if not present
|
||||||
|
if "use_images" not in rule_dict:
|
||||||
|
rule_dict["use_images"] = False
|
||||||
return MetadataExtractionRule(**rule_dict)
|
return MetadataExtractionRule(**rule_dict)
|
||||||
elif rule_type == "natural_language":
|
elif rule_type == "natural_language":
|
||||||
return NaturalLanguageRule(**rule_dict)
|
return NaturalLanguageRule(**rule_dict)
|
||||||
@ -119,11 +122,19 @@ class RulesProcessor:
|
|||||||
Args:
|
Args:
|
||||||
chunk: The Chunk object to process
|
chunk: The Chunk object to process
|
||||||
rules: The original list of rule dictionaries
|
rules: The original list of rule dictionaries
|
||||||
|
(these should be already filtered by the caller based on stage and chunk type)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Tuple[Dict[str, Any], Chunk]: (extracted_metadata_for_doc, potentially_modified_chunk)
|
Tuple[Dict[str, Any], Chunk]: (extracted_metadata_for_doc, potentially_modified_chunk)
|
||||||
"""
|
"""
|
||||||
logger.debug(f"Processing chunk-level rules (post_chunking) on chunk content length {len(chunk.content)}")
|
is_image_chunk = chunk.metadata.get("is_image", False)
|
||||||
|
chunk_type = "image" if is_image_chunk else "text"
|
||||||
|
|
||||||
|
logger.debug(
|
||||||
|
f"Processing chunk-level rules (post_chunking) on {chunk_type} chunk "
|
||||||
|
f"with content length {len(chunk.content)}"
|
||||||
|
)
|
||||||
|
|
||||||
parsed_rules = []
|
parsed_rules = []
|
||||||
|
|
||||||
# Process rules for post_chunking stage
|
# Process rules for post_chunking stage
|
||||||
@ -134,6 +145,16 @@ class RulesProcessor:
|
|||||||
|
|
||||||
# Only include rules for post_chunking stage
|
# Only include rules for post_chunking stage
|
||||||
if rule.stage == "post_chunking":
|
if rule.stage == "post_chunking":
|
||||||
|
# Skip image rules for text chunks and vice versa
|
||||||
|
if isinstance(rule, MetadataExtractionRule):
|
||||||
|
# If it's an image rule but chunk is not an image, or vice versa, skip
|
||||||
|
if (rule.use_images and not is_image_chunk) or (not rule.use_images and is_image_chunk):
|
||||||
|
logger.debug(
|
||||||
|
f"Skipping rule with use_images={rule.use_images} "
|
||||||
|
f"for chunk with is_image={is_image_chunk}"
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
parsed_rules.append(rule)
|
parsed_rules.append(rule)
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
logger.warning(f"Skipping invalid chunk rule: {e}")
|
logger.warning(f"Skipping invalid chunk rule: {e}")
|
||||||
|
@ -14,6 +14,7 @@ from core.database.postgres_database import PostgresDatabase
|
|||||||
from core.embedding.colpali_embedding_model import ColpaliEmbeddingModel
|
from core.embedding.colpali_embedding_model import ColpaliEmbeddingModel
|
||||||
from core.embedding.litellm_embedding import LiteLLMEmbeddingModel
|
from core.embedding.litellm_embedding import LiteLLMEmbeddingModel
|
||||||
from core.models.auth import AuthContext, EntityType
|
from core.models.auth import AuthContext, EntityType
|
||||||
|
from core.models.rules import MetadataExtractionRule
|
||||||
from core.parser.morphik_parser import MorphikParser
|
from core.parser.morphik_parser import MorphikParser
|
||||||
from core.services.document_service import DocumentService
|
from core.services.document_service import DocumentService
|
||||||
from core.services.rules_processor import RulesProcessor
|
from core.services.rules_processor import RulesProcessor
|
||||||
@ -207,24 +208,87 @@ async def process_ingestion_job(
|
|||||||
raise ValueError("No content chunks extracted after rules processing")
|
raise ValueError("No content chunks extracted after rules processing")
|
||||||
logger.debug(f"Split processed text into {len(parsed_chunks)} chunks")
|
logger.debug(f"Split processed text into {len(parsed_chunks)} chunks")
|
||||||
|
|
||||||
# === Apply post_chunking rules and aggregate metadata ===
|
# 8. Handle ColPali embeddings if enabled - IMPORTANT: Do this BEFORE applying chunk rules
|
||||||
|
# so that image chunks can be processed by rules when use_images=True
|
||||||
|
using_colpali = (
|
||||||
|
use_colpali and document_service.colpali_embedding_model and document_service.colpali_vector_store
|
||||||
|
)
|
||||||
|
chunks_multivector = []
|
||||||
|
if using_colpali:
|
||||||
|
import base64
|
||||||
|
|
||||||
|
import filetype
|
||||||
|
|
||||||
|
file_type = filetype.guess(file_content)
|
||||||
|
file_content_base64 = base64.b64encode(file_content).decode()
|
||||||
|
|
||||||
|
# Use the parsed chunks for Colpali - this will create image chunks if appropriate
|
||||||
|
chunks_multivector = document_service._create_chunks_multivector(
|
||||||
|
file_type, file_content_base64, file_content, parsed_chunks
|
||||||
|
)
|
||||||
|
logger.debug(f"Created {len(chunks_multivector)} chunks for multivector embedding")
|
||||||
|
|
||||||
|
# 9. Apply post_chunking rules and aggregate metadata
|
||||||
processed_chunks = []
|
processed_chunks = []
|
||||||
|
processed_chunks_multivector = []
|
||||||
aggregated_chunk_metadata: Dict[str, Any] = {} # Initialize dict for aggregated metadata
|
aggregated_chunk_metadata: Dict[str, Any] = {} # Initialize dict for aggregated metadata
|
||||||
chunk_contents = [] # Initialize list to collect chunk contents as we process them
|
chunk_contents = [] # Initialize list to collect chunk contents as we process them
|
||||||
|
|
||||||
if rules_list:
|
if rules_list:
|
||||||
logger.info("Applying post-chunking rules...")
|
logger.info("Applying post-chunking rules...")
|
||||||
|
|
||||||
|
# Partition rules by type
|
||||||
|
text_rules = []
|
||||||
|
image_rules = []
|
||||||
|
|
||||||
|
for rule_dict in rules_list:
|
||||||
|
rule = document_service.rules_processor._parse_rule(rule_dict)
|
||||||
|
if rule.stage == "post_chunking":
|
||||||
|
if isinstance(rule, MetadataExtractionRule) and rule.use_images:
|
||||||
|
image_rules.append(rule_dict)
|
||||||
|
else:
|
||||||
|
text_rules.append(rule_dict)
|
||||||
|
|
||||||
|
logger.info(f"Partitioned rules: {len(text_rules)} text rules, {len(image_rules)} image rules")
|
||||||
|
|
||||||
|
# Process regular text chunks with text rules only
|
||||||
|
if text_rules:
|
||||||
|
logger.info(f"Applying {len(text_rules)} text rules to text chunks...")
|
||||||
for chunk_obj in parsed_chunks:
|
for chunk_obj in parsed_chunks:
|
||||||
# Get metadata *and* the potentially modified chunk
|
# Get metadata *and* the potentially modified chunk
|
||||||
chunk_rule_metadata, processed_chunk = await document_service.rules_processor.process_chunk_rules(
|
chunk_rule_metadata, processed_chunk = await document_service.rules_processor.process_chunk_rules(
|
||||||
chunk_obj, rules_list
|
chunk_obj, text_rules
|
||||||
)
|
)
|
||||||
processed_chunks.append(processed_chunk)
|
processed_chunks.append(processed_chunk)
|
||||||
chunk_contents.append(processed_chunk.content) # Collect content as we process
|
chunk_contents.append(processed_chunk.content) # Collect content as we process
|
||||||
# Aggregate the metadata extracted from this chunk
|
# Aggregate the metadata extracted from this chunk
|
||||||
aggregated_chunk_metadata.update(chunk_rule_metadata)
|
aggregated_chunk_metadata.update(chunk_rule_metadata)
|
||||||
logger.info(f"Finished applying post-chunking rules to {len(processed_chunks)} chunks.")
|
else:
|
||||||
|
processed_chunks = parsed_chunks # No text rules, use original chunks
|
||||||
|
|
||||||
|
# Process colpali image chunks with image rules if they exist
|
||||||
|
if chunks_multivector and image_rules:
|
||||||
|
logger.info(f"Applying {len(image_rules)} image rules to image chunks...")
|
||||||
|
for chunk_obj in chunks_multivector:
|
||||||
|
# Only process if it's an image chunk - pass the image content to the rule
|
||||||
|
if chunk_obj.metadata.get("is_image", False):
|
||||||
|
# Get metadata *and* the potentially modified chunk
|
||||||
|
chunk_rule_metadata, processed_chunk = (
|
||||||
|
await document_service.rules_processor.process_chunk_rules(chunk_obj, image_rules)
|
||||||
|
)
|
||||||
|
processed_chunks_multivector.append(processed_chunk)
|
||||||
|
# Aggregate the metadata extracted from this chunk
|
||||||
|
aggregated_chunk_metadata.update(chunk_rule_metadata)
|
||||||
|
else:
|
||||||
|
# Non-image chunks from multivector don't need further processing
|
||||||
|
processed_chunks_multivector.append(chunk_obj)
|
||||||
|
|
||||||
|
logger.info(f"Finished applying image rules to {len(processed_chunks_multivector)} image chunks.")
|
||||||
|
elif chunks_multivector:
|
||||||
|
# No image rules, use original multivector chunks
|
||||||
|
processed_chunks_multivector = chunks_multivector
|
||||||
|
|
||||||
|
logger.info(f"Finished applying post-chunking rules to {len(processed_chunks)} regular chunks.")
|
||||||
logger.info(f"Aggregated metadata from all chunks: {aggregated_chunk_metadata}")
|
logger.info(f"Aggregated metadata from all chunks: {aggregated_chunk_metadata}")
|
||||||
|
|
||||||
# Update the document content with the stitched content from processed chunks
|
# Update the document content with the stitched content from processed chunks
|
||||||
@ -235,41 +299,26 @@ async def process_ingestion_job(
|
|||||||
logger.info(f"Updated document content with stitched chunks (length: {len(stitched_content)})")
|
logger.info(f"Updated document content with stitched chunks (length: {len(stitched_content)})")
|
||||||
else:
|
else:
|
||||||
processed_chunks = parsed_chunks # No rules, use original chunks
|
processed_chunks = parsed_chunks # No rules, use original chunks
|
||||||
|
processed_chunks_multivector = chunks_multivector # No rules, use original multivector chunks
|
||||||
|
|
||||||
# 8. Generate embeddings for processed chunks
|
# 10. Generate embeddings for processed chunks
|
||||||
embeddings = await document_service.embedding_model.embed_for_ingestion(processed_chunks)
|
embeddings = await document_service.embedding_model.embed_for_ingestion(processed_chunks)
|
||||||
logger.debug(f"Generated {len(embeddings)} embeddings")
|
logger.debug(f"Generated {len(embeddings)} embeddings")
|
||||||
|
|
||||||
# 9. Create chunk objects with potentially modified chunk content and metadata
|
# 11. Create chunk objects with potentially modified chunk content and metadata
|
||||||
chunk_objects = document_service._create_chunk_objects(doc.external_id, processed_chunks, embeddings)
|
chunk_objects = document_service._create_chunk_objects(doc.external_id, processed_chunks, embeddings)
|
||||||
logger.debug(f"Created {len(chunk_objects)} chunk objects")
|
logger.debug(f"Created {len(chunk_objects)} chunk objects")
|
||||||
|
|
||||||
# 10. Handle ColPali embeddings if enabled
|
# 12. Handle ColPali embeddings
|
||||||
using_colpali = (
|
|
||||||
use_colpali and document_service.colpali_embedding_model and document_service.colpali_vector_store
|
|
||||||
)
|
|
||||||
chunk_objects_multivector = []
|
chunk_objects_multivector = []
|
||||||
if using_colpali:
|
if using_colpali:
|
||||||
import filetype
|
colpali_embeddings = await document_service.colpali_embedding_model.embed_for_ingestion(
|
||||||
|
processed_chunks_multivector
|
||||||
file_type = filetype.guess(file_content)
|
|
||||||
|
|
||||||
# For ColPali we need the base64 encoding of the file
|
|
||||||
import base64
|
|
||||||
|
|
||||||
file_content_base64 = base64.b64encode(file_content).decode()
|
|
||||||
|
|
||||||
# Use the processed chunks for Colpali
|
|
||||||
chunks_multivector = document_service._create_chunks_multivector(
|
|
||||||
file_type, file_content_base64, file_content, processed_chunks
|
|
||||||
)
|
)
|
||||||
logger.debug(f"Created {len(chunks_multivector)} chunks for multivector embedding")
|
|
||||||
|
|
||||||
colpali_embeddings = await document_service.colpali_embedding_model.embed_for_ingestion(chunks_multivector)
|
|
||||||
logger.debug(f"Generated {len(colpali_embeddings)} embeddings for multivector embedding")
|
logger.debug(f"Generated {len(colpali_embeddings)} embeddings for multivector embedding")
|
||||||
|
|
||||||
chunk_objects_multivector = document_service._create_chunk_objects(
|
chunk_objects_multivector = document_service._create_chunk_objects(
|
||||||
doc.external_id, chunks_multivector, colpali_embeddings
|
doc.external_id, processed_chunks_multivector, colpali_embeddings
|
||||||
)
|
)
|
||||||
|
|
||||||
# === Merge aggregated chunk metadata into document metadata ===
|
# === Merge aggregated chunk metadata into document metadata ===
|
||||||
|
88
examples/image_rules_example.py
Normal file
88
examples/image_rules_example.py
Normal file
@ -0,0 +1,88 @@
|
|||||||
|
import time
|
||||||
|
|
||||||
|
from sdks.python.morphik.sync import Morphik
|
||||||
|
|
||||||
|
# sys.path.append(str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
|
||||||
|
# Connect to Morphik
|
||||||
|
db = Morphik(timeout=10000, is_local=True)
|
||||||
|
|
||||||
|
|
||||||
|
# Helper function to wait for document ingestion to complete
|
||||||
|
def wait_for_ingestion_completion(document_id, max_wait_time=600, check_interval=30):
|
||||||
|
"""
|
||||||
|
Poll the system until document ingestion is completed or max wait time is reached.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
document_id: The ID of the document to check
|
||||||
|
max_wait_time: Maximum time to wait in seconds
|
||||||
|
check_interval: Time between checks in seconds
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True if ingestion completed, False if timed out
|
||||||
|
"""
|
||||||
|
start_time = time.time()
|
||||||
|
while (time.time() - start_time) < max_wait_time:
|
||||||
|
# Get the document status info directly using the status API
|
||||||
|
status_info = db.get_document_status(document_id)
|
||||||
|
|
||||||
|
# Check if ingestion is completed
|
||||||
|
if status_info.get("status") == "completed":
|
||||||
|
print(f"Document ingestion completed for {document_id}")
|
||||||
|
return True
|
||||||
|
|
||||||
|
print(f"Document status: {status_info.get('status')}. Waiting {check_interval} seconds...")
|
||||||
|
time.sleep(check_interval)
|
||||||
|
|
||||||
|
print(f"Warning: Maximum wait time reached for document {document_id}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
# Define a single image-focused post_chunking rule
|
||||||
|
image_rules = [
|
||||||
|
{
|
||||||
|
"type": "metadata_extraction",
|
||||||
|
"stage": "post_chunking",
|
||||||
|
"use_images": True,
|
||||||
|
"schema": {
|
||||||
|
"graph_details": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "Detailed description of any graphs, charts, or diagrams visible "
|
||||||
|
"in the image, including axis labels, trends, and key data points",
|
||||||
|
},
|
||||||
|
"technical_elements": {
|
||||||
|
"type": "array",
|
||||||
|
"description": "List of technical elements visible in the image such as formulas, "
|
||||||
|
"equations, or technical diagrams",
|
||||||
|
},
|
||||||
|
"visual_content_summary": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "Brief summary of the visual content in the technical document",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
# Ingest document with image-focused post_chunking rule
|
||||||
|
print("Ingesting document with image-focused post_chunking rule...")
|
||||||
|
doc = db.ingest_file(
|
||||||
|
"examples/assets/colpali_example.pdf",
|
||||||
|
rules=image_rules,
|
||||||
|
metadata={"source": "example", "rules_stage": "image_analysis"},
|
||||||
|
use_colpali=True, # Enable colpali for image processing, critical for handling images
|
||||||
|
)
|
||||||
|
|
||||||
|
# Wait for ingestion to complete
|
||||||
|
wait_for_ingestion_completion(doc.external_id)
|
||||||
|
|
||||||
|
# Get updated document information with processed image metadata
|
||||||
|
updated_doc = db.get_document(doc.external_id)
|
||||||
|
|
||||||
|
print("\n" + "=" * 50)
|
||||||
|
print("DOCUMENT WITH IMAGE PROCESSING RULES")
|
||||||
|
print("=" * 50)
|
||||||
|
print(f"Document ID: {updated_doc.external_id}")
|
||||||
|
# print(f"Document: {updated_doc}")
|
||||||
|
print(f"Document metadata: {updated_doc.metadata}")
|
||||||
|
print(f"Document system metadata: {updated_doc.system_metadata}")
|
@ -20,15 +20,18 @@ class MetadataExtractionRule(Rule):
|
|||||||
self,
|
self,
|
||||||
schema: Union[Type[BaseModel], Dict[str, Any]],
|
schema: Union[Type[BaseModel], Dict[str, Any]],
|
||||||
stage: Literal["post_parsing", "post_chunking"] = "post_parsing",
|
stage: Literal["post_parsing", "post_chunking"] = "post_parsing",
|
||||||
|
use_images: bool = False,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Args:
|
Args:
|
||||||
schema: Pydantic model or dict schema defining metadata fields to extract
|
schema: Pydantic model or dict schema defining metadata fields to extract
|
||||||
stage: When to apply the rule - either "post_parsing" (full document text) or
|
stage: When to apply the rule - either "post_parsing" (full document text) or
|
||||||
"post_chunking" (individual chunks). Defaults to "post_parsing" for backward compatibility.
|
"post_chunking" (individual chunks). Defaults to "post_parsing" for backward compatibility.
|
||||||
|
use_images: Whether to process image chunks instead of text chunks. Defaults to False.
|
||||||
"""
|
"""
|
||||||
self.schema = schema
|
self.schema = schema
|
||||||
self.stage = stage
|
self.stage = stage
|
||||||
|
self.use_images = use_images
|
||||||
|
|
||||||
def to_dict(self) -> Dict[str, Any]:
|
def to_dict(self) -> Dict[str, Any]:
|
||||||
if isinstance(self.schema, type) and issubclass(self.schema, BaseModel):
|
if isinstance(self.schema, type) and issubclass(self.schema, BaseModel):
|
||||||
@ -38,7 +41,12 @@ class MetadataExtractionRule(Rule):
|
|||||||
# Assume it's already a dict schema
|
# Assume it's already a dict schema
|
||||||
schema_dict = self.schema
|
schema_dict = self.schema
|
||||||
|
|
||||||
return {"type": "metadata_extraction", "schema": schema_dict, "stage": self.stage}
|
return {
|
||||||
|
"type": "metadata_extraction",
|
||||||
|
"schema": schema_dict,
|
||||||
|
"stage": self.stage,
|
||||||
|
"use_images": self.use_images,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
class NaturalLanguageRule(Rule):
|
class NaturalLanguageRule(Rule):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user