Update requirements, fix some Docker connection issues

This commit is contained in:
Adityavardhan Agrawal 2025-04-22 11:39:30 -07:00
parent 3e5774b730
commit 2985493b66
4 changed files with 45 additions and 22 deletions

View File

@ -4,7 +4,6 @@ import logging
import os import os
import urllib.parse as up import urllib.parse as up
from datetime import UTC, datetime from datetime import UTC, datetime
from pathlib import Path
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
from arq.connections import RedisSettings from arq.connections import RedisSettings
@ -59,7 +58,8 @@ async def get_document_with_retry(document_service, document_id, auth, max_retri
attempt += 1 attempt += 1
if attempt < max_retries: if attempt < max_retries:
logger.warning( logger.warning(
f"Document {document_id} not found on attempt {attempt}/{max_retries}. Retrying in {retry_delay}s..." f"Document {document_id} not found on attempt {attempt}/{max_retries}. "
f"Retrying in {retry_delay}s..."
) )
await asyncio.sleep(retry_delay) await asyncio.sleep(retry_delay)
retry_delay *= 1.5 retry_delay *= 1.5
@ -69,7 +69,8 @@ async def get_document_with_retry(document_service, document_id, auth, max_retri
error_msg = str(e) error_msg = str(e)
if attempt < max_retries: if attempt < max_retries:
logger.warning( logger.warning(
f"Error retrieving document on attempt {attempt}/{max_retries}: {error_msg}. Retrying in {retry_delay}s..." f"Error retrieving document on attempt {attempt}/{max_retries}: {error_msg}. "
f"Retrying in {retry_delay}s..."
) )
await asyncio.sleep(retry_delay) await asyncio.sleep(retry_delay)
retry_delay *= 1.5 retry_delay *= 1.5
@ -392,12 +393,7 @@ async def startup(ctx):
logger.error("ColPali vector store initialization failed") logger.error("ColPali vector store initialization failed")
ctx["colpali_embedding_model"] = colpali_embedding_model ctx["colpali_embedding_model"] = colpali_embedding_model
ctx["colpali_vector_store"] = colpali_vector_store ctx["colpali_vector_store"] = colpali_vector_store
ctx["cache_factory"] = None
# Initialize cache factory for DocumentService (may not be used for ingestion)
from core.cache.llama_cache_factory import LlamaCacheFactory
cache_factory = LlamaCacheFactory(Path(settings.STORAGE_PATH))
ctx["cache_factory"] = cache_factory
# Initialize rules processor # Initialize rules processor
rules_processor = RulesProcessor() rules_processor = RulesProcessor()
@ -414,7 +410,7 @@ async def startup(ctx):
vector_store=vector_store, vector_store=vector_store,
embedding_model=embedding_model, embedding_model=embedding_model,
parser=parser, parser=parser,
cache_factory=cache_factory, cache_factory=None,
enable_colpali=settings.ENABLE_COLPALI, enable_colpali=settings.ENABLE_COLPALI,
colpali_embedding_model=colpali_embedding_model, colpali_embedding_model=colpali_embedding_model,
colpali_vector_store=colpali_vector_store, colpali_vector_store=colpali_vector_store,
@ -463,8 +459,8 @@ def redis_settings_from_env() -> RedisSettings:
# Use ARQ's supported parameters with optimized values for stability # Use ARQ's supported parameters with optimized values for stability
# For high-volume ingestion (100+ documents), these settings help prevent timeouts # For high-volume ingestion (100+ documents), these settings help prevent timeouts
return RedisSettings( return RedisSettings(
host=url.hostname or os.getenv("REDIS_HOST", "127.0.0.1"), host=get_settings().REDIS_HOST,
port=url.port or int(os.getenv("REDIS_PORT", "6379")), port=get_settings().REDIS_PORT,
database=int(url.path.lstrip("/") or 0), database=int(url.path.lstrip("/") or 0),
conn_timeout=5, # Increased connection timeout (seconds) conn_timeout=5, # Increased connection timeout (seconds)
conn_retries=15, # More retries for transient connection issues conn_retries=15, # More retries for transient connection issues

View File

@ -139,8 +139,14 @@ check_postgres() {\n\
# Check PostgreSQL\n\ # Check PostgreSQL\n\
check_postgres\n\ check_postgres\n\
\n\ \n\
# Start the application with standard asyncio event loop\n\ # Check if command arguments were passed ($# is the number of arguments)\n\
exec uvicorn core.api:app --host $HOST --port $PORT --loop asyncio --http auto --ws auto --lifespan auto\n\ if [ $# -gt 0 ]; then\n\
# If arguments exist, execute them (e.g., execute "arq core.workers...")\n\
exec "$@"\n\
else\n\
# Otherwise, execute the default command (Uvicorn for the API)\n\
exec uvicorn core.api:app --host $HOST --port $PORT --loop asyncio --http auto --ws auto --lifespan auto\n\
fi\n\
' > /app/docker-entrypoint.sh && chmod +x /app/docker-entrypoint.sh ' > /app/docker-entrypoint.sh && chmod +x /app/docker-entrypoint.sh
# Copy application code # Copy application code

View File

@ -74,7 +74,7 @@ $$ LANGUAGE SQL;
-- Create graphs table for knowledge graph functionality -- Create graphs table for knowledge graph functionality
CREATE TABLE IF NOT EXISTS graphs ( CREATE TABLE IF NOT EXISTS graphs (
id VARCHAR PRIMARY KEY, id VARCHAR PRIMARY KEY,
name VARCHAR UNIQUE, name VARCHAR NOT NULL,
entities JSONB DEFAULT '[]', entities JSONB DEFAULT '[]',
relationships JSONB DEFAULT '[]', relationships JSONB DEFAULT '[]',
graph_metadata JSONB DEFAULT '{}', graph_metadata JSONB DEFAULT '{}',
@ -86,5 +86,9 @@ CREATE TABLE IF NOT EXISTS graphs (
access_control JSONB DEFAULT '{"readers": [], "writers": [], "admins": []}' access_control JSONB DEFAULT '{"readers": [], "writers": [], "admins": []}'
); );
-- Create index for graph name for faster lookups -- Create index for graph name and owner for faster lookups
CREATE INDEX IF NOT EXISTS idx_graph_name ON graphs(name); CREATE INDEX IF NOT EXISTS idx_graph_name ON graphs(name);
CREATE INDEX IF NOT EXISTS idx_graph_owner ON graphs USING gin(owner);
-- Create unique constraint on name scoped by owner
CREATE UNIQUE INDEX IF NOT EXISTS idx_graph_owner_name ON graphs((owner->>'id'), name);

View File

@ -9,6 +9,7 @@ anthropic==0.42.0
antlr4-python3-runtime==4.9.3 antlr4-python3-runtime==4.9.3
anyio==4.3.0 anyio==4.3.0
appnope==0.1.4 appnope==0.1.4
arq==0.25.0
asgiref==3.8.1 asgiref==3.8.1
assemblyai==0.36.0 assemblyai==0.36.0
asttokens==2.4.1 asttokens==2.4.1
@ -53,6 +54,7 @@ diskcache==5.6.3
distlib==0.3.9 distlib==0.3.9
distro==1.9.0 distro==1.9.0
dnspython==2.6.1 dnspython==2.6.1
docstring_parser==0.16
docutils==0.21.2 docutils==0.21.2
ecdsa==0.19.0 ecdsa==0.19.0
effdet==0.4.1 effdet==0.4.1
@ -67,6 +69,7 @@ filelock==3.15.4
filetype==1.2.0 filetype==1.2.0
fireworks-ai==0.15.12 fireworks-ai==0.15.12
FlagEmbedding==1.3.4 FlagEmbedding==1.3.4
flake8==7.0.0
flatbuffers==24.3.25 flatbuffers==24.3.25
fonttools==4.53.1 fonttools==4.53.1
frozenlist==1.4.1 frozenlist==1.4.1
@ -88,6 +91,9 @@ greenlet==3.1.1
grpcio==1.65.4 grpcio==1.65.4
grpcio-status==1.65.4 grpcio-status==1.65.4
h11==0.14.0 h11==0.14.0
h2==4.2.0
hiredis==3.1.0
hpack==4.1.0
html2text==2024.2.26 html2text==2024.2.26
htmldate==1.9.3 htmldate==1.9.3
httpcore==1.0.5 httpcore==1.0.5
@ -98,22 +104,25 @@ httpx-sse==0.4.0
httpx-ws==0.7.1 httpx-ws==0.7.1
huggingface-hub==0.27.0 huggingface-hub==0.27.0
humanfriendly==10.0 humanfriendly==10.0
hyperframe==6.1.0
identify==2.6.3 identify==2.6.3
idna==3.7 idna==3.7
ijson==3.3.0 ijson==3.3.0
importlib_metadata==8.5.0 importlib_metadata==8.5.0
iniconfig==2.0.0 iniconfig==2.0.0
inscriptis==2.5.0 inscriptis==2.5.0
instructor==1.7.9
iopath==0.1.10 iopath==0.1.10
ipykernel==6.29.5 ipykernel==6.29.5
ipython==8.26.0 ipython==8.26.0
ir_datasets==0.5.9 ir_datasets==0.5.9
isort==6.0.1
jaraco.classes==3.4.0 jaraco.classes==3.4.0
jaraco.context==6.0.1 jaraco.context==6.0.1
jaraco.functools==4.1.0 jaraco.functools==4.1.0
jedi==0.19.1 jedi==0.19.1
Jinja2==3.1.4 Jinja2==3.1.4
# jiter==0.5.0 jiter==0.8.2
jmespath==1.0.1 jmespath==1.0.1
joblib==1.4.2 joblib==1.4.2
jsonpatch==1.33 jsonpatch==1.33
@ -133,6 +142,7 @@ langdetect==1.0.9
langsmith==0.3.8 langsmith==0.3.8
lap==0.5.12 lap==0.5.12
layoutparser==0.3.4 layoutparser==0.3.4
litellm==1.65.4.post1
llama_cpp_python==0.3.5 llama_cpp_python==0.3.5
llvmlite==0.43.0 llvmlite==0.43.0
lmnr==0.4.60 lmnr==0.4.60
@ -148,12 +158,15 @@ MarkupSafe==2.1.5
marshmallow==3.21.3 marshmallow==3.21.3
matplotlib==3.9.2 matplotlib==3.9.2
matplotlib-inline==0.1.7 matplotlib-inline==0.1.7
mccabe==0.7.0
mdurl==0.1.2 mdurl==0.1.2
monotonic==1.6 monotonic==1.6
more-itertools==10.5.0 more-itertools==10.5.0
motor==3.4.0
mpmath==1.3.0 mpmath==1.3.0
multidict==6.0.5 multidict==6.0.5
multiprocess==0.70.16 multiprocess==0.70.16
mypy==1.15.0
mypy-boto3-s3==1.34.138 mypy-boto3-s3==1.34.138
mypy-extensions==1.0.0 mypy-extensions==1.0.0
narwhals==1.26.0 narwhals==1.26.0
@ -162,7 +175,6 @@ networkx==3.3
nh3==0.2.20 nh3==0.2.20
nltk==3.8.1 nltk==3.8.1
nodeenv==1.9.1 nodeenv==1.9.1
# numba==0.60.0
numpy==1.26.4 numpy==1.26.4
olefile==0.47 olefile==0.47
ollama==0.4.7 ollama==0.4.7
@ -228,19 +240,24 @@ pyarrow-hotfix==0.6
pyasn1==0.6.0 pyasn1==0.6.0
pyasn1_modules==0.4.0 pyasn1_modules==0.4.0
pycocotools==2.0.8 pycocotools==2.0.8
pycodestyle==2.11.1
pycparser==2.22 pycparser==2.22
pydantic==2.10.6 pydantic==2.10.6
pydantic-settings==2.4.0 pydantic-settings==2.4.0
pydantic_core==2.27.2 pydantic_core==2.27.2
pydeck==0.9.1 pydeck==0.9.1
pyee==12.1.1 pyee==12.1.1
pyflakes==3.2.0
Pygments==2.18.0 Pygments==2.18.0
PyJWT==2.9.0 PyJWT==2.9.0
pymongo==4.7.1
pypandoc==1.13 pypandoc==1.13
pyparsing==3.1.2 pyparsing==3.1.2
pypdf==4.3.1 pypdf==4.3.1
pypdfium2==4.30.0 pypdfium2==4.30.0
pyproject-flake8==7.0.0
pyproject_hooks==1.2.0 pyproject_hooks==1.2.0
pyright==1.1.399
pytesseract==0.3.10 pytesseract==0.3.10
pytest==8.2.0 pytest==8.2.0
pytest-asyncio==0.24.0 pytest-asyncio==0.24.0
@ -249,6 +266,7 @@ python-docx==1.1.2
python-dotenv==1.0.1 python-dotenv==1.0.1
python-iso639==2024.4.27 python-iso639==2024.4.27
python-jose==3.3.0 python-jose==3.3.0
python-magic==0.4.27
python-multipart==0.0.9 python-multipart==0.0.9
python-oxmsg==0.0.1 python-oxmsg==0.0.1
python-pptx==0.6.23 python-pptx==0.6.23
@ -259,6 +277,7 @@ pyzmq==26.2.0
rank-bm25==0.2.2 rank-bm25==0.2.2
rapidfuzz==3.9.5 rapidfuzz==3.9.5
readme_renderer==44.0 readme_renderer==44.0
redis==5.2.1
referencing==0.36.2 referencing==0.36.2
regex==2024.7.24 regex==2024.7.24
requests==2.32.3 requests==2.32.3
@ -267,6 +286,7 @@ rfc3986==2.0.0
rich==13.7.1 rich==13.7.1
rpds-py==0.22.3 rpds-py==0.22.3
rsa==4.9 rsa==4.9
ruff==0.11.5
s3transfer==0.11.2 s3transfer==0.11.2
safetensors==0.4.4 safetensors==0.4.4
scikit-learn==1.6.0 scikit-learn==1.6.0
@ -315,7 +335,7 @@ ujson==5.9.0
ultralytics==8.3.55 ultralytics==8.3.55
ultralytics-thop==2.0.13 ultralytics-thop==2.0.13
unlzw3==0.2.3 unlzw3==0.2.3
unstructured==0.16.0 unstructured==0.15.0
unstructured-client==0.24.1 unstructured-client==0.24.1
unstructured-inference==0.7.36 unstructured-inference==0.7.36
unstructured.pytesseract==0.3.12 unstructured.pytesseract==0.3.12
@ -338,6 +358,3 @@ yarl==1.9.4
zipp==3.21.0 zipp==3.21.0
zlib-state==0.1.9 zlib-state==0.1.9
zstandard==0.23.0 zstandard==0.23.0
litellm==1.65.4.post1
instructor==1.7.9
arq==0.25.0