Robustify redis connection

commit f161b7dd2a (parent 1ec1efe176)
Author: Adityavardhan Agrawal
Date:   2025-04-17 02:37:26 -07:00
2 changed files with 132 additions and 22 deletions

File 1 of 2

@@ -4,8 +4,11 @@ from typing import Dict, Any, List, Optional
 from datetime import datetime, UTC
 from pathlib import Path
 import asyncio
+import os
+import urllib.parse as up
 import arq
+from arq.connections import RedisSettings
 from core.models.auth import AuthContext, EntityType
 from core.models.documents import Document
 from core.database.postgres_database import PostgresDatabase
@@ -461,6 +464,26 @@ async def shutdown(ctx):
     # Close any other open connections or resources that need cleanup
     logger.info("Worker shutdown complete.")
+
+
+def redis_settings_from_env() -> RedisSettings:
+    """
+    Create RedisSettings from environment variables for the ARQ worker.
+    Returns:
+        RedisSettings configured for a stable, retry-friendly Redis connection.
+    """
+    url = up.urlparse(os.getenv("REDIS_URL", "redis://127.0.0.1:6379/0"))
+    # Use ARQ's supported connection parameters, tuned for stability:
+    # for high-volume ingestion (100+ documents) these help prevent timeouts.
+    return RedisSettings(
+        host=url.hostname or os.getenv("REDIS_HOST", "127.0.0.1"),
+        port=url.port or int(os.getenv("REDIS_PORT", "6379")),
+        database=int(url.path.lstrip("/") or 0),
+        conn_timeout=5,        # connection timeout (seconds)
+        conn_retries=15,       # retries for transient connection failures
+        conn_retry_delay=1,    # delay between retries (seconds)
+    )
+
 
 # ARQ Worker Settings
 class WorkerSettings:
     """
@@ -472,40 +495,92 @@ class WorkerSettings:
     functions = [process_ingestion_job]
     on_startup = startup
     on_shutdown = shutdown
-    # Redis settings will be loaded from environment variables by default
-    # Other optional settings:
-    # redis_settings = arq.connections.RedisSettings(host='localhost', port=6379)
+
+    # Use robust Redis settings that handle connection issues
+    redis_settings = redis_settings_from_env()
+
+    # Result storage settings
     keep_result_ms = 24 * 60 * 60 * 1000  # Keep results for 24 hours
-    max_jobs = 5  # Reduce concurrent jobs to prevent connection pool exhaustion
-    health_check_interval = 300  # Check worker health every 5 minutes instead of 30 seconds to reduce connection overhead
-    job_timeout = 3600  # 1 hour timeout for jobs
-    max_tries = 3  # Retry failed jobs up to 3 times
-    poll_delay = 0.5  # Poll delay to prevent excessive Redis queries
+
+    # Concurrency settings - optimized for high-volume ingestion
+    max_jobs = 3  # Reduced to prevent resource contention during batch processing
+
+    # Resource management
+    health_check_interval = 600  # Extended to 10 minutes to reduce Redis overhead
+    job_timeout = 7200  # Extended to 2 hours for large document processing
+    max_tries = 5  # Retry failed jobs up to 5 times
+    poll_delay = 2.0  # Increased poll delay to prevent Redis connection saturation
+
+    # High reliability settings
+    allow_abort_jobs = False  # Don't abort jobs on worker shutdown
+    retry_jobs = True  # Always retry failed jobs
+
+    # Prevent queue blocking on error
+    skip_queue_when_queues_read_fails = True  # Continue processing other queues if one fails
 
     # Log Redis and connection pool information for debugging
     @staticmethod
     async def health_check(ctx):
-        """Periodic health check to log connection status and job stats."""
+        """
+        Enhanced periodic health check to log connection status and job stats.
+        Monitors Redis memory, database connections, and job processing metrics.
+        """
         database = ctx.get('database')
         vector_store = ctx.get('vector_store')
         job_stats = ctx.get('job_stats', {})
-        redis_info = await ctx['redis'].info()
-        logger.info(f"Health check: Redis v{redis_info.get('redis_version', 'unknown')} "
-                    f"mem_usage={redis_info.get('used_memory_human', 'unknown')} "
-                    f"clients_connected={redis_info.get('connected_clients', 'unknown')} "
-                    f"db_keys={redis_info.get('db0', {}).get('keys', 0)}"
-        )
-        # Log job statistics
-        logger.info(f"Job stats: completed={job_stats.get('complete', 0)} "
-                    f"failed={job_stats.get('failed', 0)} "
-                    f"retried={job_stats.get('retried', 0)} "
-                    f"ongoing={job_stats.get('ongoing', 0)} "
-                    f"queued={job_stats.get('queued', 0)}"
-        )
-        # Test database connectivity
+
+        # Get detailed Redis info
+        try:
+            redis_info = await ctx['redis'].info(section=['Server', 'Memory', 'Clients', 'Stats'])
+
+            # Server and resource usage info
+            redis_version = redis_info.get('redis_version', 'unknown')
+            used_memory = redis_info.get('used_memory_human', 'unknown')
+            used_memory_peak = redis_info.get('used_memory_peak_human', 'unknown')
+            clients_connected = redis_info.get('connected_clients', 'unknown')
+            rejected_connections = redis_info.get('rejected_connections', 0)
+            total_commands = redis_info.get('total_commands_processed', 0)
+
+            # DB keys
+            db_info = redis_info.get('db0', {})
+            keys_count = db_info.get('keys', 0) if isinstance(db_info, dict) else 0
+
+            # Log comprehensive server status
+            logger.info(
+                f"Redis Status: v{redis_version} | "
+                f"Memory: {used_memory} (peak: {used_memory_peak}) | "
+                f"Clients: {clients_connected} (rejected: {rejected_connections}) | "
+                f"DB Keys: {keys_count} | Commands: {total_commands}"
+            )
+
+            # Check for memory warning thresholds
+            if isinstance(used_memory, str) and used_memory.endswith('G'):
+                memory_value = float(used_memory[:-1])
+                if memory_value > 1.0:  # More than 1GB used
+                    logger.warning(f"Redis memory usage is high: {used_memory}")
+
+            # Check for connection issues
+            if rejected_connections and int(rejected_connections) > 0:
+                logger.warning(f"Redis has rejected {rejected_connections} connections")
+        except Exception as e:
+            logger.error(f"Failed to get Redis info: {str(e)}")
+
+        # Log job statistics with detailed processing metrics
+        ongoing = job_stats.get('ongoing', 0)
+        queued = job_stats.get('queued', 0)
+        logger.info(
+            f"Job Stats: completed={job_stats.get('complete', 0)} | "
+            f"failed={job_stats.get('failed', 0)} | "
+            f"retried={job_stats.get('retried', 0)} | "
+            f"ongoing={ongoing} | queued={queued}"
+        )
+
+        # Warn if too many jobs are queued/backed up
+        if queued > 50:
+            logger.warning(f"Large job queue backlog: {queued} jobs waiting")
+
+        # Test database connectivity with extended timeout
         if database and hasattr(database, 'async_session'):
             try:
                 async with database.async_session() as session:
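
Note: a minimal sketch of enqueuing work against these settings. arq's create_pool/enqueue_job API is real, but the job argument and the RedisSettings values below are hypothetical; process_ingestion_job's actual signature is not visible in this diff.

    import asyncio
    from arq import create_pool
    from arq.connections import RedisSettings

    async def enqueue_example():
        # Point at the same Redis the worker polls.
        pool = await create_pool(RedisSettings(host="127.0.0.1", port=6379))
        job = await pool.enqueue_job("process_ingestion_job", "example-document-id")
        print(f"queued: {job.job_id}")

    if __name__ == "__main__":
        asyncio.run(enqueue_example())

The worker itself would be launched with arq's CLI (arq path.to.module.WorkerSettings); the module path depends on where this file lives in the repo.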

File 2 of 2

@@ -8,6 +8,8 @@ import subprocess
 import signal
 import os
 import atexit
+import socket
+import time
 from dotenv import load_dotenv
 from core.config import get_settings
 from core.logging_config import setup_logging
@@ -15,6 +17,32 @@ from core.logging_config import setup_logging
 # Global variable to store the worker process
 worker_process = None
 
+
+def wait_for_redis(host="localhost", port=6379, timeout=20):
+    """
+    Wait for Redis to become available.
+    Args:
+        host: Redis host address
+        port: Redis port number
+        timeout: Maximum time to wait in seconds
+    Returns:
+        True if Redis becomes available within the timeout, False otherwise
+    """
+    logging.info(f"Waiting for Redis to be available at {host}:{port}...")
+    t0 = time.monotonic()
+    while time.monotonic() - t0 < timeout:
+        try:
+            with socket.create_connection((host, port), timeout=1):
+                logging.info("Redis is accepting connections.")
+                return True
+        except OSError:  # socket.error is an alias of OSError in Python 3
+            logging.debug(f"Redis not available yet, retrying... ({int(time.monotonic() - t0)}s elapsed)")
+            time.sleep(0.3)
+    logging.error(f"Redis not reachable after {timeout}s")
+    return False
+
+
 def check_and_start_redis():
     """Check if the Redis container is running, start if necessary."""
     try:
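
Note: a TCP accept does not prove Redis is ready to serve; a server still loading its dataset accepts connections but answers -LOADING. A hardening sketch (not part of the commit) that requires an actual +PONG reply to an inline RESP PING:

    import socket

    def redis_ping(host="127.0.0.1", port=6379, timeout=1.0):
        """Return True only if Redis answers an inline PING with +PONG."""
        try:
            with socket.create_connection((host, port), timeout=timeout) as sock:
                sock.sendall(b"PING\r\n")                  # inline RESP command
                return sock.recv(16).startswith(b"+PONG")  # reply is "+PONG\r\n"
        except OSError:
            return False

    print(redis_ping())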
@@ -266,6 +294,13 @@ def main():
     # Load settings (this will validate all required env vars)
     settings = get_settings()
 
+    # Wait for Redis to be available (using environment variables or defaults)
+    redis_host = os.environ.get("REDIS_HOST", "127.0.0.1")
+    redis_port = int(os.environ.get("REDIS_PORT", "6379"))
+    if not wait_for_redis(host=redis_host, port=redis_port):
+        logging.error("Cannot start server without Redis. Please ensure Redis is running.")
+        sys.exit(1)
+
     # Start ARQ worker in the background
     start_arq_worker()
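
Note: main() gates startup on REDIS_HOST/REDIS_PORT, while the worker's redis_settings_from_env() prefers REDIS_URL; if only REDIS_URL is set, the two can point at different servers. A small sketch (an assumption, not part of the commit) that resolves the gate's target from the same URL so both stay aligned:

    import os
    import urllib.parse as up

    def redis_gate_target():
        """Resolve the host/port the worker will actually connect to."""
        url = up.urlparse(os.getenv("REDIS_URL", "redis://127.0.0.1:6379/0"))
        host = url.hostname or os.getenv("REDIS_HOST", "127.0.0.1")
        port = url.port or int(os.getenv("REDIS_PORT", "6379"))
        return host, port

    print(redis_gate_target())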