From 9453d7784880063f2eb23ee3ec0777e7e8d31ff8 Mon Sep 17 00:00:00 2001 From: Adityavardhan Agrawal Date: Sat, 3 May 2025 13:24:43 -0700 Subject: [PATCH] Fix image sizing for rules --- core/models/rules.py | 42 +++++++++++++++++++++---------- core/services/document_service.py | 10 ++++---- 2 files changed, 34 insertions(+), 18 deletions(-) diff --git a/core/models/rules.py b/core/models/rules.py index 726346d..4e5c8bf 100644 --- a/core/models/rules.py +++ b/core/models/rules.py @@ -92,25 +92,41 @@ class MetadataExtractionRule(BaseRule): # Adjust prompt based on whether it's a chunk or full document and whether it's an image if self.use_images: + # For image rules, we do NOT embed the base64 image data in the textual prompt. + # The image will be sent separately via the `image_url` entry in the vision message. + # Keeping the textual prompt concise avoids blowing up the context window. + prompt = f""" + Extract metadata from the following image according to this schema: + + {schema_text} + + The image is attached below. + + Follow these guidelines: + 1. Extract all requested information as simple strings, numbers, or booleans + (not as objects or nested structures) + 2. If information is not present, indicate this with null instead of making something up + 3. Answer directly with the requested information - don't include explanations or reasoning + 4. Be concise but accurate in your extractions + """ prompt_context = "image" if self.stage == "post_chunking" else "document with images" else: prompt_context = "chunk of text" if self.stage == "post_chunking" else "text" + prompt = f""" + Extract metadata from the following {prompt_context} according to this schema: - prompt = f""" - Extract metadata from the following {prompt_context} according to this schema: + {schema_text} - {schema_text} + Text to extract from: + {content} - {"Image to analyze:" if self.use_images else "Text to extract from:"} - {content} - - Follow these guidelines: - 1. Extract all requested information as simple strings, numbers, or booleans - (not as objects or nested structures) - 2. If information is not present, indicate this with null instead of making something up - 3. Answer directly with the requested information - don't include explanations or reasoning - 4. Be concise but accurate in your extractions - """ + Follow these guidelines: + 1. Extract all requested information as simple strings, numbers, or booleans + (not as objects or nested structures) + 2. If information is not present, indicate this with null instead of making something up + 3. Answer directly with the requested information - don't include explanations or reasoning + 4. Be concise but accurate in your extractions + """ # Get the model configuration from registered_models model_config = settings.REGISTERED_MODELS.get(settings.RULES_MODEL, {}) diff --git a/core/services/document_service.py b/core/services/document_service.py index 60df32d..6601dcc 100644 --- a/core/services/document_service.py +++ b/core/services/document_service.py @@ -744,17 +744,17 @@ class DocumentService: from PIL import Image as PILImage img = PILImage.open(BytesIO(file_content)) - # Resize the image to a max width of 512 while preserving aspect ratio to - # keep the base64 payload smaller (helps avoid context window errors). - max_width = 512 + # Resize and compress aggressively to minimize context window footprint + max_width = 256 # reduce width to shrink payload dramatically if img.width > max_width: ratio = max_width / float(img.width) new_height = int(float(img.height) * ratio) img = img.resize((max_width, new_height)) buffered = BytesIO() - img.save(buffered, format="PNG", optimize=True) - img_b64 = "data:image/png;base64," + base64.b64encode(buffered.getvalue()).decode() + # Save as JPEG with moderate quality instead of PNG to reduce size further + img.convert("RGB").save(buffered, format="JPEG", quality=70, optimize=True) + img_b64 = "data:image/jpeg;base64," + base64.b64encode(buffered.getvalue()).decode() return [Chunk(content=img_b64, metadata={"is_image": True})] except Exception as e: logger.error(f"Error resizing image for base64 encoding: {e}. Falling back to original size.")