From 2985493b66be7a55a9a7056f6c9af486acc411dc Mon Sep 17 00:00:00 2001 From: Adityavardhan Agrawal Date: Tue, 22 Apr 2025 11:39:30 -0700 Subject: [PATCH] Update requirements, fix some docker connection issues --- core/workers/ingestion_worker.py | 20 ++++++++------------ dockerfile | 10 ++++++++-- init.sql | 8 ++++++-- requirements.txt | 29 +++++++++++++++++++++++------ 4 files changed, 45 insertions(+), 22 deletions(-) diff --git a/core/workers/ingestion_worker.py b/core/workers/ingestion_worker.py index aaca196..718d716 100644 --- a/core/workers/ingestion_worker.py +++ b/core/workers/ingestion_worker.py @@ -4,7 +4,6 @@ import logging import os import urllib.parse as up from datetime import UTC, datetime -from pathlib import Path from typing import Any, Dict, List, Optional from arq.connections import RedisSettings @@ -59,7 +58,8 @@ async def get_document_with_retry(document_service, document_id, auth, max_retri attempt += 1 if attempt < max_retries: logger.warning( - f"Document {document_id} not found on attempt {attempt}/{max_retries}. Retrying in {retry_delay}s..." + f"Document {document_id} not found on attempt {attempt}/{max_retries}. " + f"Retrying in {retry_delay}s..." ) await asyncio.sleep(retry_delay) retry_delay *= 1.5 @@ -69,7 +69,8 @@ async def get_document_with_retry(document_service, document_id, auth, max_retri error_msg = str(e) if attempt < max_retries: logger.warning( - f"Error retrieving document on attempt {attempt}/{max_retries}: {error_msg}. Retrying in {retry_delay}s..." + f"Error retrieving document on attempt {attempt}/{max_retries}: {error_msg}. " + f"Retrying in {retry_delay}s..." ) await asyncio.sleep(retry_delay) retry_delay *= 1.5 @@ -392,12 +393,7 @@ async def startup(ctx): logger.error("ColPali vector store initialization failed") ctx["colpali_embedding_model"] = colpali_embedding_model ctx["colpali_vector_store"] = colpali_vector_store - - # Initialize cache factory for DocumentService (may not be used for ingestion) - from core.cache.llama_cache_factory import LlamaCacheFactory - - cache_factory = LlamaCacheFactory(Path(settings.STORAGE_PATH)) - ctx["cache_factory"] = cache_factory + ctx["cache_factory"] = None # Initialize rules processor rules_processor = RulesProcessor() @@ -414,7 +410,7 @@ async def startup(ctx): vector_store=vector_store, embedding_model=embedding_model, parser=parser, - cache_factory=cache_factory, + cache_factory=None, enable_colpali=settings.ENABLE_COLPALI, colpali_embedding_model=colpali_embedding_model, colpali_vector_store=colpali_vector_store, @@ -463,8 +459,8 @@ def redis_settings_from_env() -> RedisSettings: # Use ARQ's supported parameters with optimized values for stability # For high-volume ingestion (100+ documents), these settings help prevent timeouts return RedisSettings( - host=url.hostname or os.getenv("REDIS_HOST", "127.0.0.1"), - port=url.port or int(os.getenv("REDIS_PORT", "6379")), + host=get_settings().REDIS_HOST, + port=get_settings().REDIS_PORT, database=int(url.path.lstrip("/") or 0), conn_timeout=5, # Increased connection timeout (seconds) conn_retries=15, # More retries for transient connection issues diff --git a/dockerfile b/dockerfile index 1b38492..64563f0 100644 --- a/dockerfile +++ b/dockerfile @@ -139,8 +139,14 @@ check_postgres() {\n\ # Check PostgreSQL\n\ check_postgres\n\ \n\ -# Start the application with standard asyncio event loop\n\ -exec uvicorn core.api:app --host $HOST --port $PORT --loop asyncio --http auto --ws auto --lifespan auto\n\ +# Check if command arguments were passed ($# is the number of arguments)\n\ +if [ $# -gt 0 ]; then\n\ + # If arguments exist, execute them (e.g., execute "arq core.workers...")\n\ + exec "$@"\n\ +else\n\ + # Otherwise, execute the default command (Uvicorn for the API)\n\ + exec uvicorn core.api:app --host $HOST --port $PORT --loop asyncio --http auto --ws auto --lifespan auto\n\ +fi\n\ ' > /app/docker-entrypoint.sh && chmod +x /app/docker-entrypoint.sh # Copy application code diff --git a/init.sql b/init.sql index 51eb4f2..5d4f299 100644 --- a/init.sql +++ b/init.sql @@ -74,7 +74,7 @@ $$ LANGUAGE SQL; -- Create graphs table for knowledge graph functionality CREATE TABLE IF NOT EXISTS graphs ( id VARCHAR PRIMARY KEY, - name VARCHAR UNIQUE, + name VARCHAR NOT NULL, entities JSONB DEFAULT '[]', relationships JSONB DEFAULT '[]', graph_metadata JSONB DEFAULT '{}', @@ -86,5 +86,9 @@ CREATE TABLE IF NOT EXISTS graphs ( access_control JSONB DEFAULT '{"readers": [], "writers": [], "admins": []}' ); --- Create index for graph name for faster lookups +-- Create index for graph name and owner for faster lookups CREATE INDEX IF NOT EXISTS idx_graph_name ON graphs(name); +CREATE INDEX IF NOT EXISTS idx_graph_owner ON graphs USING gin(owner); + +-- Create unique constraint on name scoped by owner +CREATE UNIQUE INDEX IF NOT EXISTS idx_graph_owner_name ON graphs((owner->>'id'), name); diff --git a/requirements.txt b/requirements.txt index 1b367ee..a5f34db 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,6 +9,7 @@ anthropic==0.42.0 antlr4-python3-runtime==4.9.3 anyio==4.3.0 appnope==0.1.4 +arq==0.25.0 asgiref==3.8.1 assemblyai==0.36.0 asttokens==2.4.1 @@ -53,6 +54,7 @@ diskcache==5.6.3 distlib==0.3.9 distro==1.9.0 dnspython==2.6.1 +docstring_parser==0.16 docutils==0.21.2 ecdsa==0.19.0 effdet==0.4.1 @@ -67,6 +69,7 @@ filelock==3.15.4 filetype==1.2.0 fireworks-ai==0.15.12 FlagEmbedding==1.3.4 +flake8==7.0.0 flatbuffers==24.3.25 fonttools==4.53.1 frozenlist==1.4.1 @@ -88,6 +91,9 @@ greenlet==3.1.1 grpcio==1.65.4 grpcio-status==1.65.4 h11==0.14.0 +h2==4.2.0 +hiredis==3.1.0 +hpack==4.1.0 html2text==2024.2.26 htmldate==1.9.3 httpcore==1.0.5 @@ -98,22 +104,25 @@ httpx-sse==0.4.0 httpx-ws==0.7.1 huggingface-hub==0.27.0 humanfriendly==10.0 +hyperframe==6.1.0 identify==2.6.3 idna==3.7 ijson==3.3.0 importlib_metadata==8.5.0 iniconfig==2.0.0 inscriptis==2.5.0 +instructor==1.7.9 iopath==0.1.10 ipykernel==6.29.5 ipython==8.26.0 ir_datasets==0.5.9 +isort==6.0.1 jaraco.classes==3.4.0 jaraco.context==6.0.1 jaraco.functools==4.1.0 jedi==0.19.1 Jinja2==3.1.4 -# jiter==0.5.0 +jiter==0.8.2 jmespath==1.0.1 joblib==1.4.2 jsonpatch==1.33 @@ -133,6 +142,7 @@ langdetect==1.0.9 langsmith==0.3.8 lap==0.5.12 layoutparser==0.3.4 +litellm==1.65.4.post1 llama_cpp_python==0.3.5 llvmlite==0.43.0 lmnr==0.4.60 @@ -148,12 +158,15 @@ MarkupSafe==2.1.5 marshmallow==3.21.3 matplotlib==3.9.2 matplotlib-inline==0.1.7 +mccabe==0.7.0 mdurl==0.1.2 monotonic==1.6 more-itertools==10.5.0 +motor==3.4.0 mpmath==1.3.0 multidict==6.0.5 multiprocess==0.70.16 +mypy==1.15.0 mypy-boto3-s3==1.34.138 mypy-extensions==1.0.0 narwhals==1.26.0 @@ -162,7 +175,6 @@ networkx==3.3 nh3==0.2.20 nltk==3.8.1 nodeenv==1.9.1 -# numba==0.60.0 numpy==1.26.4 olefile==0.47 ollama==0.4.7 @@ -228,19 +240,24 @@ pyarrow-hotfix==0.6 pyasn1==0.6.0 pyasn1_modules==0.4.0 pycocotools==2.0.8 +pycodestyle==2.11.1 pycparser==2.22 pydantic==2.10.6 pydantic-settings==2.4.0 pydantic_core==2.27.2 pydeck==0.9.1 pyee==12.1.1 +pyflakes==3.2.0 Pygments==2.18.0 PyJWT==2.9.0 +pymongo==4.7.1 pypandoc==1.13 pyparsing==3.1.2 pypdf==4.3.1 pypdfium2==4.30.0 +pyproject-flake8==7.0.0 pyproject_hooks==1.2.0 +pyright==1.1.399 pytesseract==0.3.10 pytest==8.2.0 pytest-asyncio==0.24.0 @@ -249,6 +266,7 @@ python-docx==1.1.2 python-dotenv==1.0.1 python-iso639==2024.4.27 python-jose==3.3.0 +python-magic==0.4.27 python-multipart==0.0.9 python-oxmsg==0.0.1 python-pptx==0.6.23 @@ -259,6 +277,7 @@ pyzmq==26.2.0 rank-bm25==0.2.2 rapidfuzz==3.9.5 readme_renderer==44.0 +redis==5.2.1 referencing==0.36.2 regex==2024.7.24 requests==2.32.3 @@ -267,6 +286,7 @@ rfc3986==2.0.0 rich==13.7.1 rpds-py==0.22.3 rsa==4.9 +ruff==0.11.5 s3transfer==0.11.2 safetensors==0.4.4 scikit-learn==1.6.0 @@ -315,7 +335,7 @@ ujson==5.9.0 ultralytics==8.3.55 ultralytics-thop==2.0.13 unlzw3==0.2.3 -unstructured==0.16.0 +unstructured==0.15.0 unstructured-client==0.24.1 unstructured-inference==0.7.36 unstructured.pytesseract==0.3.12 @@ -338,6 +358,3 @@ yarl==1.9.4 zipp==3.21.0 zlib-state==0.1.9 zstandard==0.23.0 -litellm==1.65.4.post1 -instructor==1.7.9 -arq==0.25.0