From 9453d7784880063f2eb23ee3ec0777e7e8d31ff8 Mon Sep 17 00:00:00 2001
From: Adityavardhan Agrawal <aa729@cornell.edu>
Date: Sat, 3 May 2025 13:24:43 -0700
Subject: [PATCH] Fix image sizing for rules

---
 core/models/rules.py              | 42 +++++++++++++++++++++----------
 core/services/document_service.py | 10 ++++----
 2 files changed, 34 insertions(+), 18 deletions(-)

diff --git a/core/models/rules.py b/core/models/rules.py
index 726346d..4e5c8bf 100644
--- a/core/models/rules.py
+++ b/core/models/rules.py
@@ -92,25 +92,41 @@ class MetadataExtractionRule(BaseRule):
 
         # Adjust prompt based on whether it's a chunk or full document and whether it's an image
         if self.use_images:
+            # For image rules, we do NOT embed the base64 image data in the textual prompt.
+            # The image will be sent separately via the `image_url` entry in the vision message.
+            # Keeping the textual prompt concise avoids blowing up the context window.
+            prompt = f"""
+            Extract metadata from the following image according to this schema:
+
+            {schema_text}
+
+            The image is attached below.
+
+            Follow these guidelines:
+            1. Extract all requested information as simple strings, numbers, or booleans
+            (not as objects or nested structures)
+            2. If information is not present, indicate this with null instead of making something up
+            3. Answer directly with the requested information - don't include explanations or reasoning
+            4. Be concise but accurate in your extractions
+            """
             prompt_context = "image" if self.stage == "post_chunking" else "document with images"
         else:
             prompt_context = "chunk of text" if self.stage == "post_chunking" else "text"
+            prompt = f"""
+            Extract metadata from the following {prompt_context} according to this schema:
 
-        prompt = f"""
-        Extract metadata from the following {prompt_context} according to this schema:
+            {schema_text}
 
-        {schema_text}
+            Text to extract from:
+            {content}
 
-        {"Image to analyze:" if self.use_images else "Text to extract from:"}
-        {content}
-
-        Follow these guidelines:
-        1. Extract all requested information as simple strings, numbers, or booleans
-        (not as objects or nested structures)
-        2. If information is not present, indicate this with null instead of making something up
-        3. Answer directly with the requested information - don't include explanations or reasoning
-        4. Be concise but accurate in your extractions
-        """
+            Follow these guidelines:
+            1. Extract all requested information as simple strings, numbers, or booleans
+            (not as objects or nested structures)
+            2. If information is not present, indicate this with null instead of making something up
+            3. Answer directly with the requested information - don't include explanations or reasoning
+            4. Be concise but accurate in your extractions
+            """
 
         # Get the model configuration from registered_models
         model_config = settings.REGISTERED_MODELS.get(settings.RULES_MODEL, {})
diff --git a/core/services/document_service.py b/core/services/document_service.py
index 60df32d..6601dcc 100644
--- a/core/services/document_service.py
+++ b/core/services/document_service.py
@@ -744,17 +744,17 @@ class DocumentService:
                 from PIL import Image as PILImage
 
                 img = PILImage.open(BytesIO(file_content))
-                # Resize the image to a max width of 512 while preserving aspect ratio to
-                # keep the base64 payload smaller (helps avoid context window errors).
-                max_width = 512
+                # Resize and compress aggressively to minimize context window footprint
+                max_width = 256  # reduce width to shrink payload dramatically
                 if img.width > max_width:
                     ratio = max_width / float(img.width)
                     new_height = int(float(img.height) * ratio)
                     img = img.resize((max_width, new_height))
 
                 buffered = BytesIO()
-                img.save(buffered, format="PNG", optimize=True)
-                img_b64 = "data:image/png;base64," + base64.b64encode(buffered.getvalue()).decode()
+                # Save as JPEG (moderate quality) rather than PNG to cut size further; JPEG has no alpha, hence RGB
+                img.convert("RGB").save(buffered, format="JPEG", quality=70, optimize=True)
+                img_b64 = "data:image/jpeg;base64," + base64.b64encode(buffered.getvalue()).decode()
                 return [Chunk(content=img_b64, metadata={"is_image": True})]
             except Exception as e:
                 logger.error(f"Error resizing image for base64 encoding: {e}. Falling back to original size.")