Robustify redis connection

commit f161b7dd2a (parent 1ec1efe176)
Author: Adityavardhan Agrawal
Date:   2025-04-17 02:37:26 -07:00
2 changed files with 132 additions and 22 deletions

File 1 of 2

@@ -4,8 +4,11 @@ from typing import Dict, Any, List, Optional
 from datetime import datetime, UTC
 from pathlib import Path
 import asyncio
+import os
+import urllib.parse as up
 import arq
+from arq.connections import RedisSettings
 from core.models.auth import AuthContext, EntityType
 from core.models.documents import Document
 from core.database.postgres_database import PostgresDatabase
@@ -461,6 +464,26 @@ async def shutdown(ctx):
     # Close any other open connections or resources that need cleanup
     logger.info("Worker shutdown complete.")
+
+
+def redis_settings_from_env() -> RedisSettings:
+    """
+    Create RedisSettings from environment variables for the ARQ worker.
+    Returns:
+        RedisSettings configured for a stable, retry-friendly Redis connection.
+    """
+    url = up.urlparse(os.getenv("REDIS_URL", "redis://127.0.0.1:6379/0"))
+    # Use ARQ's supported connection parameters, tuned for stability:
+    # for high-volume ingestion (100+ documents) these help prevent timeouts.
+    return RedisSettings(
+        host=url.hostname or os.getenv("REDIS_HOST", "127.0.0.1"),
+        port=url.port or int(os.getenv("REDIS_PORT", "6379")),
+        database=int(url.path.lstrip("/") or 0),
+        conn_timeout=5,        # connection timeout (seconds)
+        conn_retries=15,       # retries for transient connection failures
+        conn_retry_delay=1,    # delay between retries (seconds)
+    )
+
 
 # ARQ Worker Settings
 class WorkerSettings:
     """
@@ -472,40 +495,92 @@ class WorkerSettings:
     functions = [process_ingestion_job]
     on_startup = startup
     on_shutdown = shutdown
-    # Redis settings will be loaded from environment variables by default
-    # Other optional settings:
-    # redis_settings = arq.connections.RedisSettings(host='localhost', port=6379)
+
+    # Use robust Redis settings that handle connection issues
+    redis_settings = redis_settings_from_env()
+
+    # Result storage settings
     keep_result_ms = 24 * 60 * 60 * 1000  # Keep results for 24 hours
-    max_jobs = 5  # Reduce concurrent jobs to prevent connection pool exhaustion
-    health_check_interval = 300  # Check worker health every 5 minutes instead of 30 seconds to reduce connection overhead
-    job_timeout = 3600  # 1 hour timeout for jobs
-    max_tries = 3  # Retry failed jobs up to 3 times
-    poll_delay = 0.5  # Poll delay to prevent excessive Redis queries
+
+    # Concurrency settings - optimized for high-volume ingestion
+    max_jobs = 3  # Reduced to prevent resource contention during batch processing
+
+    # Resource management
+    health_check_interval = 600  # Extended to 10 minutes to reduce Redis overhead
+    job_timeout = 7200  # Extended to 2 hours for large document processing
+    max_tries = 5  # Retry failed jobs up to 5 times
+    poll_delay = 2.0  # Increased poll delay to prevent Redis connection saturation
+
+    # High reliability settings
+    allow_abort_jobs = False  # Don't abort jobs on worker shutdown
+    retry_jobs = True  # Always retry failed jobs
+
+    # Prevent queue blocking on error
+    skip_queue_when_queues_read_fails = True  # Continue processing other queues if one fails
 
     # Log Redis and connection pool information for debugging
     @staticmethod
     async def health_check(ctx):
-        """Periodic health check to log connection status and job stats."""
+        """
+        Enhanced periodic health check to log connection status and job stats.
+        Monitors Redis memory, database connections, and job processing metrics.
+        """
         database = ctx.get('database')
         vector_store = ctx.get('vector_store')
         job_stats = ctx.get('job_stats', {})
-        redis_info = await ctx['redis'].info()
-        logger.info(f"Health check: Redis v{redis_info.get('redis_version', 'unknown')} "
-                    f"mem_usage={redis_info.get('used_memory_human', 'unknown')} "
-                    f"clients_connected={redis_info.get('connected_clients', 'unknown')} "
-                    f"db_keys={redis_info.get('db0', {}).get('keys', 0)}"
-        )
-        # Log job statistics
-        logger.info(f"Job stats: completed={job_stats.get('complete', 0)} "
-                    f"failed={job_stats.get('failed', 0)} "
-                    f"retried={job_stats.get('retried', 0)} "
-                    f"ongoing={job_stats.get('ongoing', 0)} "
-                    f"queued={job_stats.get('queued', 0)}"
-        )
-        # Test database connectivity
+
+        # Get detailed Redis info
+        try:
+            redis_info = await ctx['redis'].info(section=['Server', 'Memory', 'Clients', 'Stats'])
+
+            # Server and resource usage info
+            redis_version = redis_info.get('redis_version', 'unknown')
+            used_memory = redis_info.get('used_memory_human', 'unknown')
+            used_memory_peak = redis_info.get('used_memory_peak_human', 'unknown')
+            clients_connected = redis_info.get('connected_clients', 'unknown')
+            rejected_connections = redis_info.get('rejected_connections', 0)
+            total_commands = redis_info.get('total_commands_processed', 0)
+
+            # DB keys
+            db_info = redis_info.get('db0', {})
+            keys_count = db_info.get('keys', 0) if isinstance(db_info, dict) else 0
+
+            # Log comprehensive server status
+            logger.info(
+                f"Redis Status: v{redis_version} | "
+                f"Memory: {used_memory} (peak: {used_memory_peak}) | "
+                f"Clients: {clients_connected} (rejected: {rejected_connections}) | "
+                f"DB Keys: {keys_count} | Commands: {total_commands}"
+            )
+
+            # Check for memory warning thresholds
+            if isinstance(used_memory, str) and used_memory.endswith('G'):
+                memory_value = float(used_memory[:-1])
+                if memory_value > 1.0:  # More than 1GB used
+                    logger.warning(f"Redis memory usage is high: {used_memory}")
+
+            # Check for connection issues
+            if rejected_connections and int(rejected_connections) > 0:
+                logger.warning(f"Redis has rejected {rejected_connections} connections")
+        except Exception as e:
+            logger.error(f"Failed to get Redis info: {str(e)}")
+
+        # Log job statistics with detailed processing metrics
+        ongoing = job_stats.get('ongoing', 0)
+        queued = job_stats.get('queued', 0)
+        logger.info(
+            f"Job Stats: completed={job_stats.get('complete', 0)} | "
+            f"failed={job_stats.get('failed', 0)} | "
+            f"retried={job_stats.get('retried', 0)} | "
+            f"ongoing={ongoing} | queued={queued}"
+        )
+
+        # Warn if too many jobs are queued/backed up
+        if queued > 50:
+            logger.warning(f"Large job queue backlog: {queued} jobs waiting")
+
+        # Test database connectivity with extended timeout
         if database and hasattr(database, 'async_session'):
             try:
                 async with database.async_session() as session:
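
Note: a minimal sketch of enqueuing work against these settings. arq's create_pool/enqueue_job API is real, but the job argument and the RedisSettings values below are hypothetical; process_ingestion_job's actual signature is not visible in this diff.

    import asyncio
    from arq import create_pool
    from arq.connections import RedisSettings

    async def enqueue_example():
        # Point at the same Redis the worker polls.
        pool = await create_pool(RedisSettings(host="127.0.0.1", port=6379))
        job = await pool.enqueue_job("process_ingestion_job", "example-document-id")
        print(f"queued: {job.job_id}")

    if __name__ == "__main__":
        asyncio.run(enqueue_example())

The worker itself would be launched with arq's CLI (arq path.to.module.WorkerSettings); the module path depends on where this file lives in the repo.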

File 2 of 2

@@ -8,6 +8,8 @@ import subprocess
 import signal
 import os
 import atexit
+import socket
+import time
 from dotenv import load_dotenv
 from core.config import get_settings
 from core.logging_config import setup_logging
@@ -15,6 +17,32 @@ from core.logging_config import setup_logging
 # Global variable to store the worker process
 worker_process = None
 
+
+def wait_for_redis(host="localhost", port=6379, timeout=20):
+    """
+    Wait for Redis to become available.
+    Args:
+        host: Redis host address
+        port: Redis port number
+        timeout: Maximum time to wait in seconds
+    Returns:
+        True if Redis becomes available within the timeout, False otherwise
+    """
+    logging.info(f"Waiting for Redis to be available at {host}:{port}...")
+    t0 = time.monotonic()
+    while time.monotonic() - t0 < timeout:
+        try:
+            with socket.create_connection((host, port), timeout=1):
+                logging.info("Redis is accepting connections.")
+                return True
+        except OSError:  # socket.error is an alias of OSError in Python 3
+            logging.debug(f"Redis not available yet, retrying... ({int(time.monotonic() - t0)}s elapsed)")
+            time.sleep(0.3)
+    logging.error(f"Redis not reachable after {timeout}s")
+    return False
+
+
 def check_and_start_redis():
     """Check if the Redis container is running, start if necessary."""
     try:
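
Note: a TCP accept does not prove Redis is ready to serve; a server still loading its dataset accepts connections but answers -LOADING. A hardening sketch (not part of the commit) that requires an actual +PONG reply to an inline RESP PING:

    import socket

    def redis_ping(host="127.0.0.1", port=6379, timeout=1.0):
        """Return True only if Redis answers an inline PING with +PONG."""
        try:
            with socket.create_connection((host, port), timeout=timeout) as sock:
                sock.sendall(b"PING\r\n")                  # inline RESP command
                return sock.recv(16).startswith(b"+PONG")  # reply is "+PONG\r\n"
        except OSError:
            return False

    print(redis_ping())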
@@ -266,6 +294,13 @@ def main():
     # Load settings (this will validate all required env vars)
     settings = get_settings()
 
+    # Wait for Redis to be available (using environment variables or defaults)
+    redis_host = os.environ.get("REDIS_HOST", "127.0.0.1")
+    redis_port = int(os.environ.get("REDIS_PORT", "6379"))
+    if not wait_for_redis(host=redis_host, port=redis_port):
+        logging.error("Cannot start server without Redis. Please ensure Redis is running.")
+        sys.exit(1)
+
     # Start ARQ worker in the background
     start_arq_worker()
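
Note: main() gates startup on REDIS_HOST/REDIS_PORT, while the worker's redis_settings_from_env() prefers REDIS_URL; if only REDIS_URL is set, the two can point at different servers. A small sketch (an assumption, not part of the commit) that resolves the gate's target from the same URL so both stay aligned:

    import os
    import urllib.parse as up

    def redis_gate_target():
        """Resolve the host/port the worker will actually connect to."""
        url = up.urlparse(os.getenv("REDIS_URL", "redis://127.0.0.1:6379/0"))
        host = url.hostname or os.getenv("REDIS_HOST", "127.0.0.1")
        port = url.port or int(os.getenv("REDIS_PORT", "6379"))
        return host, port

    print(redis_gate_target())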