Mirror of https://github.com/james-m-jordan/morphik-core.git (synced 2025-05-09 19:32:38 +00:00)

add task queue (#87)

* add task queue
* ensure task queuing is working as expected
* add downstream sdk changes
* bugs and pr comments
* update docker arq running logic

parent 7fe2e19a81, commit 3c1195e001
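The changes below move file ingestion off the request path: `/ingest/file` and `/ingest/files` now store the upload, create a `Document` whose `system_metadata["status"]` is `"processing"`, and enqueue a `process_ingestion_job` on Redis via arq; a new arq worker in `core/workers/ingestion_worker.py` performs the parsing, chunking, and embedding, and a new `GET /documents/{document_id}/status` endpoint plus SDK helpers let callers poll for completion. A minimal sketch of the resulting client-side flow, based on the SDK examples added in this commit (the `morphik` import path and the connection URI are assumptions, not part of the diff):

```python
from morphik import Morphik  # assumed import path for the Python SDK

db = Morphik("morphik://localhost:8000")    # hypothetical connection URI
doc = db.ingest_file("large_document.pdf")  # returns immediately with status "processing"

# Poll until the background worker finishes (raises ValueError if ingestion failed).
completed = db.wait_for_document_completion(doc.external_id, timeout_seconds=300)
print(f"Ingested {completed.filename} as {completed.external_id}")
```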
core/api.py (337 changes)
@@ -1,5 +1,7 @@
import asyncio
import json
+import base64
+import uuid
from datetime import datetime, UTC, timedelta
from pathlib import Path
import sys
@@ -8,6 +10,7 @@ from fastapi import FastAPI, Form, HTTPException, Depends, Header, UploadFile, F
from fastapi.middleware.cors import CORSMiddleware
import jwt
import logging
+import arq
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from core.limits_utils import check_and_increment_limits
from core.models.request import GenerateUriRequest, RetrieveRequest, CompletionQueryRequest, IngestTextRequest, CreateGraphRequest, UpdateGraphRequest, BatchIngestResponse
@@ -79,6 +82,8 @@ if not settings.POSTGRES_URI:
    raise ValueError("PostgreSQL URI is required for PostgreSQL database")
database = PostgresDatabase(uri=settings.POSTGRES_URI)

+# Redis settings already imported at top of file
+

@app.on_event("startup")
async def initialize_database():
@@ -129,6 +134,37 @@ async def initialize_user_limits_database():
    user_limits_db = UserLimitsDatabase(uri=settings.POSTGRES_URI)
    await user_limits_db.initialize()

+@app.on_event("startup")
+async def initialize_redis_pool():
+    """Initialize the Redis connection pool for background tasks."""
+    global redis_pool
+    logger.info("Initializing Redis connection pool...")
+
+    # Get Redis settings from configuration
+    redis_host = settings.REDIS_HOST
+    redis_port = settings.REDIS_PORT
+
+    # Log the Redis connection details
+    logger.info(f"Connecting to Redis at {redis_host}:{redis_port}")
+
+    redis_settings = arq.connections.RedisSettings(
+        host=redis_host,
+        port=redis_port,
+    )
+
+    redis_pool = await arq.create_pool(redis_settings)
+    logger.info("Redis connection pool initialized successfully")
+
+@app.on_event("shutdown")
+async def close_redis_pool():
+    """Close the Redis connection pool on application shutdown."""
+    global redis_pool
+    if redis_pool:
+        logger.info("Closing Redis connection pool...")
+        redis_pool.close()
+        await redis_pool.wait_closed()
+        logger.info("Redis connection pool closed")
+
# Initialize vector store
if not settings.POSTGRES_URI:
    raise ValueError("PostgreSQL URI is required for pgvector store")
@@ -319,6 +355,13 @@ async def ingest_text(
        raise HTTPException(status_code=403, detail=str(e))


+# Redis pool for background tasks
+redis_pool = None
+
+def get_redis_pool():
+    """Get the global Redis connection pool for background tasks."""
+    return redis_pool
+
@app.post("/ingest/file", response_model=Document)
async def ingest_file(
    file: UploadFile,
@@ -328,9 +371,10 @@ async def ingest_file(
    use_colpali: Optional[bool] = None,
    folder_name: Optional[str] = Form(None),
    end_user_id: Optional[str] = Form(None),
+    redis: arq.ArqRedis = Depends(get_redis_pool),
) -> Document:
    """
-    Ingest a file document.
+    Ingest a file document asynchronously.

    Args:
        file: File to ingest
@@ -342,17 +386,23 @@ async def ingest_file(
        use_colpali: Whether to use ColPali embedding model
        folder_name: Optional folder to scope the document to
        end_user_id: Optional end-user ID to scope the document to
+        redis: Redis connection pool for background tasks

    Returns:
-        Document: Metadata of ingested document
+        Document with processing status that can be used to check progress
    """
    try:
+        # Parse metadata and rules
        metadata_dict = json.loads(metadata)
        rules_list = json.loads(rules)
        use_colpali = bool(use_colpali)

+        # Ensure user has write permission
+        if "write" not in auth.permissions:
+            raise PermissionError("User does not have write permission")
+
        async with telemetry.track_operation(
-            operation_type="ingest_file",
+            operation_type="queue_ingest_file",
            user_id=auth.entity_id,
            metadata={
                "filename": file.filename,
@@ -364,21 +414,97 @@ async def ingest_file(
                "end_user_id": end_user_id,
            },
        ):
-            logger.debug(f"API: Ingesting file with use_colpali: {use_colpali}")
-            return await document_service.ingest_file(
-                file=file,
-                metadata=metadata_dict,
-                auth=auth,
-                rules=rules_list,
-                use_colpali=use_colpali,
-                folder_name=folder_name,
-                end_user_id=end_user_id,
-            )
+            logger.debug(f"API: Queueing file ingestion with use_colpali: {use_colpali}")
+
+            # Create a document with processing status
+            doc = Document(
+                content_type=file.content_type,
+                filename=file.filename,
+                metadata=metadata_dict,
+                owner={"type": auth.entity_type, "id": auth.entity_id},
+                access_control={
+                    "readers": [auth.entity_id],
+                    "writers": [auth.entity_id],
+                    "admins": [auth.entity_id],
+                    "user_id": [auth.user_id] if auth.user_id else [],
+                },
+                system_metadata={"status": "processing"}
+            )
+
+            # Add folder_name and end_user_id to system_metadata if provided
+            if folder_name:
+                doc.system_metadata["folder_name"] = folder_name
+            if end_user_id:
+                doc.system_metadata["end_user_id"] = end_user_id
+
+            # Set processing status
+            doc.system_metadata["status"] = "processing"
+
+            # Store the document in the database
+            success = await database.store_document(doc)
+            if not success:
+                raise Exception("Failed to store document metadata")
+
+            # Read file content
+            file_content = await file.read()
+
+            # Generate a unique key for the file
+            file_key = f"ingest_uploads/{uuid.uuid4()}/{file.filename}"
+
+            # Store the file in the configured storage
+            file_content_base64 = base64.b64encode(file_content).decode()
+            bucket, stored_key = await storage.upload_from_base64(
+                file_content_base64,
+                file_key,
+                file.content_type
+            )
+            logger.debug(f"Stored file in bucket {bucket} with key {stored_key}")
+
+            # Update document with storage info
+            doc.storage_info = {"bucket": bucket, "key": stored_key}
+            await database.update_document(
+                document_id=doc.external_id,
+                updates={"storage_info": doc.storage_info},
+                auth=auth
+            )
+
+            # Convert auth context to a dictionary for serialization
+            auth_dict = {
+                "entity_type": auth.entity_type.value,
+                "entity_id": auth.entity_id,
+                "app_id": auth.app_id,
+                "permissions": list(auth.permissions),
+                "user_id": auth.user_id
+            }
+
+            # Enqueue the background job
+            job = await redis.enqueue_job(
+                'process_ingestion_job',
+                document_id=doc.external_id,
+                file_key=stored_key,
+                bucket=bucket,
+                original_filename=file.filename,
+                content_type=file.content_type,
+                metadata_json=metadata,
+                auth_dict=auth_dict,
+                rules_list=rules_list,
+                use_colpali=use_colpali,
+                folder_name=folder_name,
+                end_user_id=end_user_id
+            )
+
+            logger.info(f"File ingestion job queued with ID: {job.job_id} for document: {doc.external_id}")
+
+            # Return the document with processing status
+            return doc
+
    except json.JSONDecodeError as e:
        raise HTTPException(status_code=400, detail=f"Invalid JSON: {str(e)}")
    except PermissionError as e:
        raise HTTPException(status_code=403, detail=str(e))
+    except Exception as e:
+        logger.error(f"Error queueing file ingestion: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Error queueing file ingestion: {str(e)}")


@app.post("/ingest/files", response_model=BatchIngestResponse)
@@ -387,13 +513,14 @@ async def batch_ingest_files(
    metadata: str = Form("{}"),
    rules: str = Form("[]"),
    use_colpali: Optional[bool] = Form(None),
-    parallel: bool = Form(True),
+    parallel: Optional[bool] = Form(True),
    folder_name: Optional[str] = Form(None),
    end_user_id: Optional[str] = Form(None),
    auth: AuthContext = Depends(verify_token),
+    redis: arq.ArqRedis = Depends(get_redis_pool),
) -> BatchIngestResponse:
    """
-    Batch ingest multiple files.
+    Batch ingest multiple files using the task queue.

    Args:
        files: List of files to ingest
@@ -402,15 +529,15 @@ async def batch_ingest_files(
            - A single list of rules to apply to all files
            - A list of rule lists, one per file
        use_colpali: Whether to use ColPali-style embedding
-        parallel: Whether to process files in parallel
        folder_name: Optional folder to scope the documents to
        end_user_id: Optional end-user ID to scope the documents to
        auth: Authentication context
+        redis: Redis connection pool for background tasks

    Returns:
        BatchIngestResponse containing:
-            - documents: List of successfully ingested documents
-            - errors: List of errors encountered during ingestion
+            - documents: List of created documents with processing status
+            - errors: List of errors that occurred during the batch operation
    """
    if not files:
        raise HTTPException(
@@ -421,8 +548,15 @@ async def batch_ingest_files(
    try:
        metadata_value = json.loads(metadata)
        rules_list = json.loads(rules)
+        use_colpali = bool(use_colpali)
+
+        # Ensure user has write permission
+        if "write" not in auth.permissions:
+            raise PermissionError("User does not have write permission")
    except json.JSONDecodeError as e:
        raise HTTPException(status_code=400, detail=f"Invalid JSON: {str(e)}")
+    except PermissionError as e:
+        raise HTTPException(status_code=403, detail=str(e))

    # Validate metadata if it's a list
    if isinstance(metadata_value, list) and len(metadata_value) != len(files):
@@ -439,13 +573,19 @@ async def batch_ingest_files(
                detail=f"Number of rule lists ({len(rules_list)}) must match number of files ({len(files)})"
            )

-    documents = []
-    errors = []
+    # Convert auth context to a dictionary for serialization
+    auth_dict = {
+        "entity_type": auth.entity_type.value,
+        "entity_id": auth.entity_id,
+        "app_id": auth.app_id,
+        "permissions": list(auth.permissions),
+        "user_id": auth.user_id
+    }

-    # We'll pass folder_name and end_user_id directly to the ingest_file functions
+    created_documents = []

    async with telemetry.track_operation(
-        operation_type="batch_ingest",
+        operation_type="queue_batch_ingest",
        user_id=auth.entity_id,
        metadata={
            "file_count": len(files),
@@ -455,54 +595,98 @@ async def batch_ingest_files(
            "end_user_id": end_user_id,
        },
    ):
-        if parallel:
-            tasks = []
-            for i, file in enumerate(files):
-                metadata_item = metadata_value[i] if isinstance(metadata_value, list) else metadata_value
-                file_rules = rules_list[i] if isinstance(rules_list, list) and rules_list and isinstance(rules_list[0], list) else rules_list
-                task = document_service.ingest_file(
-                    file=file,
-                    metadata=metadata_item,
-                    auth=auth,
-                    rules=file_rules,
-                    use_colpali=use_colpali,
-                    folder_name=folder_name,
-                    end_user_id=end_user_id
-                )
-                tasks.append(task)
-
-            results = await asyncio.gather(*tasks, return_exceptions=True)
-
-            for i, result in enumerate(results):
-                if isinstance(result, Exception):
-                    errors.append({
-                        "filename": files[i].filename,
-                        "error": str(result)
-                    })
-                else:
-                    documents.append(result)
-        else:
-            for i, file in enumerate(files):
-                try:
-                    metadata_item = metadata_value[i] if isinstance(metadata_value, list) else metadata_value
-                    file_rules = rules_list[i] if isinstance(rules_list, list) and rules_list and isinstance(rules_list[0], list) else rules_list
-                    doc = await document_service.ingest_file(
-                        file=file,
-                        metadata=metadata_item,
-                        auth=auth,
-                        rules=file_rules,
-                        use_colpali=use_colpali,
-                        folder_name=folder_name,
-                        end_user_id=end_user_id
-                    )
-                    documents.append(doc)
-                except Exception as e:
-                    errors.append({
-                        "filename": file.filename,
-                        "error": str(e)
-                    })
-
-        return BatchIngestResponse(documents=documents, errors=errors)
+        try:
+            for i, file in enumerate(files):
+                # Get the metadata and rules for this file
+                metadata_item = metadata_value[i] if isinstance(metadata_value, list) else metadata_value
+                file_rules = rules_list[i] if isinstance(rules_list, list) and rules_list and isinstance(rules_list[0], list) else rules_list
+
+                # Create a document with processing status
+                doc = Document(
+                    content_type=file.content_type,
+                    filename=file.filename,
+                    metadata=metadata_item,
+                    owner={"type": auth.entity_type, "id": auth.entity_id},
+                    access_control={
+                        "readers": [auth.entity_id],
+                        "writers": [auth.entity_id],
+                        "admins": [auth.entity_id],
+                        "user_id": [auth.user_id] if auth.user_id else [],
+                    },
+                )
+
+                # Add folder_name and end_user_id to system_metadata if provided
+                if folder_name:
+                    doc.system_metadata["folder_name"] = folder_name
+                if end_user_id:
+                    doc.system_metadata["end_user_id"] = end_user_id
+
+                # Set processing status
+                doc.system_metadata["status"] = "processing"
+
+                # Store the document in the database
+                success = await database.store_document(doc)
+                if not success:
+                    raise Exception(f"Failed to store document metadata for {file.filename}")
+
+                # Read file content
+                file_content = await file.read()
+
+                # Generate a unique key for the file
+                file_key = f"ingest_uploads/{uuid.uuid4()}/{file.filename}"
+
+                # Store the file in the configured storage
+                file_content_base64 = base64.b64encode(file_content).decode()
+                bucket, stored_key = await storage.upload_from_base64(
+                    file_content_base64,
+                    file_key,
+                    file.content_type
+                )
+                logger.debug(f"Stored file in bucket {bucket} with key {stored_key}")
+
+                # Update document with storage info
+                doc.storage_info = {"bucket": bucket, "key": stored_key}
+                await database.update_document(
+                    document_id=doc.external_id,
+                    updates={"storage_info": doc.storage_info},
+                    auth=auth
+                )
+
+                # Convert metadata to JSON string for job
+                metadata_json = json.dumps(metadata_item)
+
+                # Enqueue the background job
+                job = await redis.enqueue_job(
+                    'process_ingestion_job',
+                    document_id=doc.external_id,
+                    file_key=stored_key,
+                    bucket=bucket,
+                    original_filename=file.filename,
+                    content_type=file.content_type,
+                    metadata_json=metadata_json,
+                    auth_dict=auth_dict,
+                    rules_list=file_rules,
+                    use_colpali=use_colpali,
+                    folder_name=folder_name,
+                    end_user_id=end_user_id
+                )
+
+                logger.info(f"File ingestion job queued with ID: {job.job_id} for document: {doc.external_id}")
+
+                # Add document to the list
+                created_documents.append(doc)
+
+            # Return information about created documents
+            return BatchIngestResponse(
+                documents=created_documents,
+                errors=[]
+            )
+
+        except Exception as e:
+            logger.error(f"Error queueing batch file ingestion: {str(e)}")
+            raise HTTPException(status_code=500, detail=f"Error queueing batch file ingestion: {str(e)}")


@app.post("/retrieve/chunks", response_model=List[ChunkResult])
@@ -818,6 +1002,45 @@ async def get_document(document_id: str, auth: AuthContext = Depends(verify_toke
        logger.error(f"Error getting document: {e}")
        raise e

+@app.get("/documents/{document_id}/status", response_model=Dict[str, Any])
+async def get_document_status(document_id: str, auth: AuthContext = Depends(verify_token)):
+    """
+    Get the processing status of a document.
+
+    Args:
+        document_id: ID of the document to check
+        auth: Authentication context
+
+    Returns:
+        Dict containing status information for the document
+    """
+    try:
+        doc = await document_service.db.get_document(document_id, auth)
+        if not doc:
+            raise HTTPException(status_code=404, detail="Document not found")
+
+        # Extract status information
+        status = doc.system_metadata.get("status", "unknown")
+
+        response = {
+            "document_id": doc.external_id,
+            "status": status,
+            "filename": doc.filename,
+            "created_at": doc.system_metadata.get("created_at"),
+            "updated_at": doc.system_metadata.get("updated_at"),
+        }
+
+        # Add error information if failed
+        if status == "failed":
+            response["error"] = doc.system_metadata.get("error", "Unknown error")
+
+        return response
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Error getting document status: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Error getting document status: {str(e)}")
+
+
@app.delete("/documents/{document_id}")
async def delete_document(document_id: str, auth: AuthContext = Depends(verify_token)):
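For reference, the handler above assembles its response directly from `system_metadata`, so a status check returns a small dictionary of the following shape (the values here are illustrative placeholders, not taken from the diff):

```python
# Illustrative shape of a GET /documents/{document_id}/status response
example_status = {
    "document_id": "doc_123",
    "status": "processing",  # "processing", "completed", or "failed"
    "filename": "report.pdf",
    "created_at": "2025-04-01T12:00:00+00:00",
    "updated_at": "2025-04-01T12:00:05+00:00",
    # "error": "<reason>",   # included only when status == "failed"
}
```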
@@ -95,6 +95,10 @@ class Settings(BaseSettings):
    # API configuration
    API_DOMAIN: str = "api.morphik.ai"

+    # Redis configuration
+    REDIS_HOST: str = "localhost"
+    REDIS_PORT: int = 6379
+
    # Telemetry configuration
    TELEMETRY_ENABLED: bool = True
    HONEYCOMB_ENABLED: bool = True
@@ -269,6 +273,14 @@ def get_settings() -> Settings:
        "API_DOMAIN": config["morphik"].get("api_domain", "api.morphik.ai"),  # Default API domain
    }

+    # load redis config
+    redis_config = {}
+    if "redis" in config:
+        redis_config = {
+            "REDIS_HOST": config["redis"].get("host", "localhost"),
+            "REDIS_PORT": int(config["redis"].get("port", 6379)),
+        }
+
    # load graph config
    graph_config = {
        "GRAPH_PROVIDER": "litellm",
@@ -309,6 +321,7 @@ def get_settings() -> Settings:
        vector_store_config,
        rules_config,
        morphik_config,
+        redis_config,
        graph_config,
        telemetry_config,
        openai_config,
@@ -46,6 +46,7 @@ class Document(BaseModel):
            "version": 1,
            "folder_name": None,
            "end_user_id": None,
+            "status": "processing",  # Status can be: processing, completed, failed
        }
    )
    """metadata such as creation date etc."""
@@ -96,6 +96,12 @@ class BatchIngestResponse(BaseModel):
    documents: List[Document]
    errors: List[Dict[str, str]]


+class BatchIngestJobResponse(BaseModel):
+    """Response model for batch ingestion jobs"""
+    status: str = Field(..., description="Status of the batch operation")
+    documents: List[Document] = Field(..., description="List of created documents with processing status")
+    timestamp: str = Field(..., description="ISO-formatted timestamp")
+
+
class GenerateUriRequest(BaseModel):
    """Request model for generating a cloud URI"""
@@ -8,7 +8,7 @@ class BaseParser(ABC):

    @abstractmethod
    async def parse_file_to_text(
-        self, file: bytes, content_type: str, filename: str
+        self, file: bytes, filename: str
    ) -> Tuple[Dict[str, Any], str]:
        """
        Parse file content into text.
@@ -486,6 +486,7 @@ class DocumentService:

        return doc

+    # TODO: check if it's unused. if so, remove it.
    async def ingest_file(
        self,
        file: UploadFile,
core/workers/__init__.py (new file, 1 line)

# Workers package for background job processing
core/workers/ingestion_worker.py (new file, 411 lines)

import json
import logging
from typing import Dict, Any, List, Optional
from datetime import datetime, UTC
from pathlib import Path

import arq
from core.models.auth import AuthContext, EntityType
from core.models.documents import Document
from core.database.postgres_database import PostgresDatabase
from core.vector_store.pgvector_store import PGVectorStore
from core.parser.morphik_parser import MorphikParser
from core.embedding.litellm_embedding import LiteLLMEmbeddingModel
from core.completion.litellm_completion import LiteLLMCompletionModel
from core.storage.local_storage import LocalStorage
from core.storage.s3_storage import S3Storage
from core.embedding.colpali_embedding_model import ColpaliEmbeddingModel
from core.vector_store.multi_vector_store import MultiVectorStore
from core.services.document_service import DocumentService
from core.services.telemetry import TelemetryService
from core.services.rules_processor import RulesProcessor
from core.config import get_settings

logger = logging.getLogger(__name__)

async def process_ingestion_job(
    ctx: Dict[str, Any],
    document_id: str,
    file_key: str,
    bucket: str,
    original_filename: str,
    content_type: str,
    metadata_json: str,
    auth_dict: Dict[str, Any],
    rules_list: List[Dict[str, Any]],
    use_colpali: bool,
    folder_name: Optional[str] = None,
    end_user_id: Optional[str] = None
) -> Dict[str, Any]:
    """
    Background worker task that processes file ingestion jobs.

    Args:
        ctx: The ARQ context dictionary
        file_key: The storage key where the file is stored
        bucket: The storage bucket name
        original_filename: The original file name
        content_type: The file's content type/MIME type
        metadata_json: JSON string of metadata
        auth_dict: Dict representation of AuthContext
        rules_list: List of rules to apply (already converted to dictionaries)
        use_colpali: Whether to use ColPali embedding model
        folder_name: Optional folder to scope the document to
        end_user_id: Optional end-user ID to scope the document to

    Returns:
        A dictionary with the document ID and processing status
    """
    try:
        # 1. Log the start of the job
        logger.info(f"Starting ingestion job for file: {original_filename}")

        # 2. Deserialize metadata and auth
        metadata = json.loads(metadata_json) if metadata_json else {}
        auth = AuthContext(
            entity_type=EntityType(auth_dict.get("entity_type", "unknown")),
            entity_id=auth_dict.get("entity_id", ""),
            app_id=auth_dict.get("app_id"),
            permissions=set(auth_dict.get("permissions", ["read"])),
            user_id=auth_dict.get("user_id", auth_dict.get("entity_id", ""))
        )

        # Get document service from the context
        document_service : DocumentService = ctx['document_service']

        # 3. Download the file from storage
        logger.info(f"Downloading file from {bucket}/{file_key}")
        file_content = await document_service.storage.download_file(bucket, file_key)

        # Ensure file_content is bytes
        if hasattr(file_content, 'read'):
            file_content = file_content.read()

        # 4. Parse file to text
        additional_metadata, text = await document_service.parser.parse_file_to_text(
            file_content, original_filename
        )
        logger.debug(f"Parsed file into text of length {len(text)}")

        # 5. Apply rules if provided
        if rules_list:
            rule_metadata, modified_text = await document_service.rules_processor.process_rules(text, rules_list)
            # Update document metadata with extracted metadata from rules
            metadata.update(rule_metadata)

            if modified_text:
                text = modified_text
                logger.info("Updated text with modified content from rules")

        # 6. Retrieve the existing document
        logger.debug(f"Retrieving document with ID: {document_id}")
        logger.debug(f"Auth context: entity_type={auth.entity_type}, entity_id={auth.entity_id}, permissions={auth.permissions}")
        doc = await document_service.db.get_document(document_id, auth)

        if not doc:
            logger.error(f"Document {document_id} not found in database")
            logger.error(f"Details - file: {original_filename}, content_type: {content_type}, bucket: {bucket}, key: {file_key}")
            logger.error(f"Auth: entity_type={auth.entity_type}, entity_id={auth.entity_id}, permissions={auth.permissions}")
            # Try to get all accessible documents to debug
            try:
                all_docs = await document_service.db.get_documents(auth, 0, 100)
                logger.debug(f"User has access to {len(all_docs)} documents: {[d.external_id for d in all_docs]}")
            except Exception as list_err:
                logger.error(f"Failed to list user documents: {str(list_err)}")

            raise ValueError(f"Document {document_id} not found in database")

        # Prepare updates for the document
        updates = {
            "metadata": metadata,
            "additional_metadata": additional_metadata,
            "system_metadata": {**doc.system_metadata, "content": text}
        }

        # Add folder_name and end_user_id to system_metadata if provided
        if folder_name:
            updates["system_metadata"]["folder_name"] = folder_name
        if end_user_id:
            updates["system_metadata"]["end_user_id"] = end_user_id

        # Update the document in the database
        success = await document_service.db.update_document(
            document_id=document_id,
            updates=updates,
            auth=auth
        )

        if not success:
            raise ValueError(f"Failed to update document {document_id}")

        # Refresh document object with updated data
        doc = await document_service.db.get_document(document_id, auth)
        logger.debug(f"Updated document in database with parsed content")

        # 7. Split text into chunks
        chunks = await document_service.parser.split_text(text)
        if not chunks:
            raise ValueError("No content chunks extracted")
        logger.debug(f"Split processed text into {len(chunks)} chunks")

        # 8. Generate embeddings for chunks
        embeddings = await document_service.embedding_model.embed_for_ingestion(chunks)
        logger.debug(f"Generated {len(embeddings)} embeddings")

        # 9. Create chunk objects
        chunk_objects = document_service._create_chunk_objects(doc.external_id, chunks, embeddings)
        logger.debug(f"Created {len(chunk_objects)} chunk objects")

        # 10. Handle ColPali embeddings if enabled
        chunk_objects_multivector = []
        if use_colpali and document_service.colpali_embedding_model and document_service.colpali_vector_store:
            import filetype
            file_type = filetype.guess(file_content)

            # For ColPali we need the base64 encoding of the file
            import base64
            file_content_base64 = base64.b64encode(file_content).decode()

            chunks_multivector = document_service._create_chunks_multivector(
                file_type, file_content_base64, file_content, chunks
            )
            logger.debug(f"Created {len(chunks_multivector)} chunks for multivector embedding")

            colpali_embeddings = await document_service.colpali_embedding_model.embed_for_ingestion(
                chunks_multivector
            )
            logger.debug(f"Generated {len(colpali_embeddings)} embeddings for multivector embedding")

            chunk_objects_multivector = document_service._create_chunk_objects(
                doc.external_id, chunks_multivector, colpali_embeddings
            )

        # Update document status to completed before storing
        doc.system_metadata["status"] = "completed"
        doc.system_metadata["updated_at"] = datetime.now(UTC)

        # 11. Store chunks and update document with is_update=True
        chunk_ids = await document_service._store_chunks_and_doc(
            chunk_objects, doc, use_colpali, chunk_objects_multivector,
            is_update=True, auth=auth
        )

        logger.debug(f"Successfully completed processing for document {doc.external_id}")

        # 13. Log successful completion
        logger.info(f"Successfully completed ingestion for {original_filename}, document ID: {doc.external_id}")

        # 14. Return document ID
        return {
            "document_id": doc.external_id,
            "status": "completed",
            "filename": original_filename,
            "content_type": content_type,
            "timestamp": datetime.now(UTC).isoformat()
        }

    except Exception as e:
        logger.error(f"Error processing ingestion job for file {original_filename}: {str(e)}")

        # Update document status to failed if the document exists
        try:
            # Create AuthContext for database operations
            auth_context = AuthContext(
                entity_type=EntityType(auth_dict.get("entity_type", "unknown")),
                entity_id=auth_dict.get("entity_id", ""),
                app_id=auth_dict.get("app_id"),
                permissions=set(auth_dict.get("permissions", ["read"])),
                user_id=auth_dict.get("user_id", auth_dict.get("entity_id", ""))
            )

            # Get database from context
            database = ctx.get('database')

            if database:
                # Try to get the document
                doc = await database.get_document(document_id, auth_context)

                if doc:
                    # Update the document status to failed
                    await database.update_document(
                        document_id=document_id,
                        updates={
                            "system_metadata": {
                                **doc.system_metadata,
                                "status": "failed",
                                "error": str(e),
                                "updated_at": datetime.now(UTC)
                            }
                        },
                        auth=auth_context
                    )
                    logger.info(f"Updated document {document_id} status to failed")
        except Exception as inner_e:
            logger.error(f"Failed to update document status: {str(inner_e)}")

        # Return error information
        return {
            "status": "failed",
            "filename": original_filename,
            "error": str(e),
            "timestamp": datetime.now(UTC).isoformat()
        }

async def startup(ctx):
    """
    Worker startup: Initialize all necessary services that will be reused across jobs.

    This initialization is similar to what happens in core/api.py during app startup,
    but adapted for the worker context.
    """
    logger.info("Worker starting up. Initializing services...")

    # Get settings
    settings = get_settings()

    # Initialize database
    logger.info("Initializing database...")
    database = PostgresDatabase(uri=settings.POSTGRES_URI)
    success = await database.initialize()
    if success:
        logger.info("Database initialization successful")
    else:
        logger.error("Database initialization failed")
    ctx['database'] = database

    # Initialize vector store
    logger.info("Initializing primary vector store...")
    vector_store = PGVectorStore(uri=settings.POSTGRES_URI)
    success = await vector_store.initialize()
    if success:
        logger.info("Primary vector store initialization successful")
    else:
        logger.error("Primary vector store initialization failed")
    ctx['vector_store'] = vector_store

    # Initialize storage
    if settings.STORAGE_PROVIDER == "local":
        storage = LocalStorage(storage_path=settings.STORAGE_PATH)
    elif settings.STORAGE_PROVIDER == "aws-s3":
        storage = S3Storage(
            aws_access_key=settings.AWS_ACCESS_KEY,
            aws_secret_key=settings.AWS_SECRET_ACCESS_KEY,
            region_name=settings.AWS_REGION,
            default_bucket=settings.S3_BUCKET,
        )
    else:
        raise ValueError(f"Unsupported storage provider: {settings.STORAGE_PROVIDER}")
    ctx['storage'] = storage

    # Initialize parser
    parser = MorphikParser(
        chunk_size=settings.CHUNK_SIZE,
        chunk_overlap=settings.CHUNK_OVERLAP,
        use_unstructured_api=settings.USE_UNSTRUCTURED_API,
        unstructured_api_key=settings.UNSTRUCTURED_API_KEY,
        assemblyai_api_key=settings.ASSEMBLYAI_API_KEY,
        anthropic_api_key=settings.ANTHROPIC_API_KEY,
        use_contextual_chunking=settings.USE_CONTEXTUAL_CHUNKING,
    )
    ctx['parser'] = parser

    # Initialize embedding model
    embedding_model = LiteLLMEmbeddingModel(model_key=settings.EMBEDDING_MODEL)
    logger.info(f"Initialized LiteLLM embedding model with model key: {settings.EMBEDDING_MODEL}")
    ctx['embedding_model'] = embedding_model

    # Initialize completion model
    completion_model = LiteLLMCompletionModel(model_key=settings.COMPLETION_MODEL)
    logger.info(f"Initialized LiteLLM completion model with model key: {settings.COMPLETION_MODEL}")
    ctx['completion_model'] = completion_model

    # Initialize reranker
    reranker = None
    if settings.USE_RERANKING:
        if settings.RERANKER_PROVIDER == "flag":
            from core.reranker.flag_reranker import FlagReranker
            reranker = FlagReranker(
                model_name=settings.RERANKER_MODEL,
                device=settings.RERANKER_DEVICE,
                use_fp16=settings.RERANKER_USE_FP16,
                query_max_length=settings.RERANKER_QUERY_MAX_LENGTH,
                passage_max_length=settings.RERANKER_PASSAGE_MAX_LENGTH,
            )
        else:
            logger.warning(f"Unsupported reranker provider: {settings.RERANKER_PROVIDER}")
    ctx['reranker'] = reranker

    # Initialize ColPali embedding model and vector store if enabled
    colpali_embedding_model = None
    colpali_vector_store = None
    if settings.ENABLE_COLPALI:
        logger.info("Initializing ColPali components...")
        colpali_embedding_model = ColpaliEmbeddingModel()
        colpali_vector_store = MultiVectorStore(uri=settings.POSTGRES_URI)
        _ = colpali_vector_store.initialize()
    ctx['colpali_embedding_model'] = colpali_embedding_model
    ctx['colpali_vector_store'] = colpali_vector_store

    # Initialize cache factory for DocumentService (may not be used for ingestion)
    from core.cache.llama_cache_factory import LlamaCacheFactory
    cache_factory = LlamaCacheFactory(Path(settings.STORAGE_PATH))
    ctx['cache_factory'] = cache_factory

    # Initialize rules processor
    rules_processor = RulesProcessor()
    ctx['rules_processor'] = rules_processor

    # Initialize telemetry service
    telemetry = TelemetryService()
    ctx['telemetry'] = telemetry

    # Create the document service using all initialized components
    document_service = DocumentService(
        storage=storage,
        database=database,
        vector_store=vector_store,
        embedding_model=embedding_model,
        completion_model=completion_model,
        parser=parser,
        reranker=reranker,
        cache_factory=cache_factory,
        enable_colpali=settings.ENABLE_COLPALI,
        colpali_embedding_model=colpali_embedding_model,
        colpali_vector_store=colpali_vector_store,
    )
    ctx['document_service'] = document_service

    logger.info("Worker startup complete. All services initialized.")

async def shutdown(ctx):
    """
    Worker shutdown: Clean up resources.

    Properly close connections and cleanup resources to prevent leaks.
    """
    logger.info("Worker shutting down. Cleaning up resources...")

    # Close database connections
    if 'database' in ctx and hasattr(ctx['database'], 'engine'):
        logger.info("Closing database connections...")
        await ctx['database'].engine.dispose()

    # Close any other open connections or resources that need cleanup
    logger.info("Worker shutdown complete.")

# ARQ Worker Settings
class WorkerSettings:
    """
    ARQ Worker settings for the ingestion worker.

    This defines the functions available to the worker, startup and shutdown handlers,
    and any specific Redis settings.
    """
    functions = [process_ingestion_job]
    on_startup = startup
    on_shutdown = shutdown
    # Redis settings will be loaded from environment variables by default
    # Other optional settings:
    # redis_settings = arq.connections.RedisSettings(host='localhost', port=6379)
    keep_result_ms = 24 * 60 * 60 * 1000  # Keep results for 24 hours (24 * 60 * 60 * 1000 ms)
    max_jobs = 10  # Maximum number of jobs to run concurrently
@@ -26,6 +26,8 @@ services:
      - HOST=0.0.0.0
      - PORT=8000
      - LOG_LEVEL=DEBUG
+      - REDIS_HOST=redis
+      - REDIS_PORT=6379
    volumes:
      - ./storage:/app/storage
      - ./logs:/app/logs
@@ -34,6 +36,8 @@ services:
    depends_on:
      postgres:
        condition: service_healthy
+      redis:
+        condition: service_healthy
      config-check:
        condition: service_completed_successfully
      ollama:
@@ -44,6 +48,51 @@ services:
    env_file:
      - .env

+  worker:
+    build: .
+    command: arq core.workers.ingestion_worker.WorkerSettings
+    environment:
+      - JWT_SECRET_KEY=${JWT_SECRET_KEY:-your-secret-key-here}
+      - POSTGRES_URI=postgresql+asyncpg://morphik:morphik@postgres:5432/morphik
+      - PGPASSWORD=morphik
+      - LOG_LEVEL=DEBUG
+      - REDIS_HOST=redis
+      - REDIS_PORT=6379
+    volumes:
+      - ./storage:/app/storage
+      - ./logs:/app/logs
+      - ./morphik.toml:/app/morphik.toml
+      - huggingface_cache:/root/.cache/huggingface
+    depends_on:
+      postgres:
+        condition: service_healthy
+      redis:
+        condition: service_healthy
+      config-check:
+        condition: service_completed_successfully
+      ollama:
+        condition: service_started
+        required: false
+    networks:
+      - morphik-network
+    env_file:
+      - .env
+
+  redis:
+    image: redis:7-alpine
+    ports:
+      - "6379:6379"
+    volumes:
+      - redis_data:/data
+    command: redis-server --appendonly yes
+    healthcheck:
+      test: ["CMD", "redis-cli", "ping"]
+      interval: 5s
+      timeout: 3s
+      retries: 5
+    networks:
+      - morphik-network
+
  postgres:
    build:
      context: .
@@ -88,3 +137,4 @@ volumes:
  postgres_data:
  ollama_data:
  huggingface_cache:
+  redis_data:
@@ -100,6 +100,10 @@ enable_colpali = true
mode = "self_hosted" # "cloud" or "self_hosted"
api_domain = "api.morphik.ai" # API domain for cloud URIs

+[redis]
+host = "localhost"
+port = 6379
+
[graph]
model = "ollama_llama"
enable_entity_resolution = true
@@ -340,3 +340,4 @@ zlib-state==0.1.9
zstandard==0.23.0
litellm==1.65.4.post1
instructor==1.7.9
+arq==0.25.0
@@ -1491,6 +1491,76 @@ class AsyncMorphik:
            doc._client = self
        return doc

+    async def get_document_status(self, document_id: str) -> Dict[str, Any]:
+        """
+        Get the current processing status of a document.
+
+        Args:
+            document_id: ID of the document to check
+
+        Returns:
+            Dict[str, Any]: Status information including current status, potential errors, and other metadata
+
+        Example:
+            ```python
+            status = await db.get_document_status("doc_123")
+            if status["status"] == "completed":
+                print("Document processing complete")
+            elif status["status"] == "failed":
+                print(f"Processing failed: {status['error']}")
+            else:
+                print("Document still processing...")
+            ```
+        """
+        response = await self._request("GET", f"documents/{document_id}/status")
+        return response
+
+    async def wait_for_document_completion(self, document_id: str, timeout_seconds=300, check_interval_seconds=2) -> Document:
+        """
+        Wait for a document's processing to complete.
+
+        Args:
+            document_id: ID of the document to wait for
+            timeout_seconds: Maximum time to wait for completion (default: 300 seconds)
+            check_interval_seconds: Time between status checks (default: 2 seconds)
+
+        Returns:
+            Document: Updated document with the latest status
+
+        Raises:
+            TimeoutError: If processing doesn't complete within the timeout period
+            ValueError: If processing fails with an error
+
+        Example:
+            ```python
+            # Upload a file and wait for processing to complete
+            doc = await db.ingest_file("large_document.pdf")
+            try:
+                completed_doc = await db.wait_for_document_completion(doc.external_id)
+                print(f"Processing complete! Document has {len(completed_doc.chunk_ids)} chunks")
+            except TimeoutError:
+                print("Processing is taking too long")
+            except ValueError as e:
+                print(f"Processing failed: {e}")
+            ```
+        """
+        import asyncio
+        start_time = asyncio.get_event_loop().time()
+
+        while (asyncio.get_event_loop().time() - start_time) < timeout_seconds:
+            status = await self.get_document_status(document_id)
+
+            if status["status"] == "completed":
+                # Get the full document now that it's complete
+                return await self.get_document(document_id)
+            elif status["status"] == "failed":
+                raise ValueError(f"Document processing failed: {status.get('error', 'Unknown error')}")
+
+            # Wait before checking again
+            await asyncio.sleep(check_interval_seconds)
+
+        raise TimeoutError(f"Document processing did not complete within {timeout_seconds} seconds")
+
    async def get_document_by_filename(self, filename: str) -> Document:
        """
        Get document metadata by filename.
@@ -25,6 +25,60 @@ class Document(BaseModel):
    # Client reference for update methods
    _client = None

+    @property
+    def status(self) -> Dict[str, Any]:
+        """Get the latest processing status of the document from the API.
+
+        Returns:
+            Dict[str, Any]: Status information including current status, potential errors, and other metadata
+        """
+        if self._client is None:
+            raise ValueError(
+                "Document instance not connected to a client. Use a document returned from a Morphik client method."
+            )
+        return self._client.get_document_status(self.external_id)
+
+    @property
+    def is_processing(self) -> bool:
+        """Check if the document is still being processed."""
+        return self.status.get("status") == "processing"
+
+    @property
+    def is_ingested(self) -> bool:
+        """Check if the document has completed processing."""
+        return self.status.get("status") == "completed"
+
+    @property
+    def is_failed(self) -> bool:
+        """Check if document processing has failed."""
+        return self.status.get("status") == "failed"
+
+    @property
+    def error(self) -> Optional[str]:
+        """Get the error message if processing failed."""
+        status_info = self.status
+        return status_info.get("error") if status_info.get("status") == "failed" else None
+
+    def wait_for_completion(self, timeout_seconds=300, check_interval_seconds=2):
+        """Wait for document processing to complete.
+
+        Args:
+            timeout_seconds: Maximum time to wait for completion (default: 300 seconds)
+            check_interval_seconds: Time between status checks (default: 2 seconds)
+
+        Returns:
+            Document: Updated document with the latest status
+
+        Raises:
+            TimeoutError: If processing doesn't complete within the timeout period
+            ValueError: If processing fails with an error
+        """
+        if self._client is None:
+            raise ValueError(
+                "Document instance not connected to a client. Use a document returned from a Morphik client method."
+            )
+        return self._client.wait_for_document_completion(self.external_id, timeout_seconds, check_interval_seconds)
+
    def update_with_text(
        self,
        content: str,
@ -1619,6 +1619,76 @@ class Morphik:
|
|||||||
doc._client = self
|
doc._client = self
|
||||||
return doc
|
return doc
|
    def get_document_status(self, document_id: str) -> Dict[str, Any]:
        """
        Get the current processing status of a document.

        Args:
            document_id: ID of the document to check

        Returns:
            Dict[str, Any]: Status information including current status, potential errors, and other metadata

        Example:
            ```python
            status = db.get_document_status("doc_123")
            if status["status"] == "completed":
                print("Document processing complete")
            elif status["status"] == "failed":
                print(f"Processing failed: {status['error']}")
            else:
                print("Document still processing...")
            ```
        """
        response = self._request("GET", f"documents/{document_id}/status")
        return response

    def wait_for_document_completion(self, document_id: str, timeout_seconds=300, check_interval_seconds=2) -> Document:
        """
        Wait for a document's processing to complete.

        Args:
            document_id: ID of the document to wait for
            timeout_seconds: Maximum time to wait for completion (default: 300 seconds)
            check_interval_seconds: Time between status checks (default: 2 seconds)

        Returns:
            Document: Updated document with the latest status

        Raises:
            TimeoutError: If processing doesn't complete within the timeout period
            ValueError: If processing fails with an error

        Example:
            ```python
            # Upload a file and wait for processing to complete
            doc = db.ingest_file("large_document.pdf")
            try:
                completed_doc = db.wait_for_document_completion(doc.external_id)
                print(f"Processing complete! Document has {len(completed_doc.chunk_ids)} chunks")
            except TimeoutError:
                print("Processing is taking too long")
            except ValueError as e:
                print(f"Processing failed: {e}")
            ```
        """
        import time

        start_time = time.time()

        while (time.time() - start_time) < timeout_seconds:
            status = self.get_document_status(document_id)

            if status["status"] == "completed":
                # Get the full document now that it's complete
                return self.get_document(document_id)
            elif status["status"] == "failed":
                raise ValueError(f"Document processing failed: {status.get('error', 'Unknown error')}")

            # Wait before checking again
            time.sleep(check_interval_seconds)

        raise TimeoutError(f"Document processing did not complete within {timeout_seconds} seconds")
    def get_document_by_filename(self, filename: str) -> Document:
        """
        Get document metadata by filename.
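Taken together, the helpers above give the synchronous SDK a small polling API around the new background ingestion flow. A brief usage sketch follows; the import, client construction, and URI are illustrative placeholders rather than part of this diff, while `ingest_file`, `get_document`, and the `Document` status helpers are the methods shown above.

```python
from morphik import Morphik

# Placeholder URI/host: adjust to your own deployment.
db = Morphik("morphik://owner_id:token@localhost:8000")

doc = db.ingest_file("large_document.pdf")
print(f"Initial status: {doc.status.get('status')}")  # typically still processing

try:
    # Poll until the background worker finishes this document.
    doc = doc.wait_for_completion(timeout_seconds=120, check_interval_seconds=2)
    print(f"Completed: {doc.is_completed}, chunks: {len(doc.chunk_ids)}")
except ValueError:
    # Re-fetch and inspect the failure via the convenience properties.
    doc = db.get_document(doc.external_id)
    if doc.is_failed:
        print(f"Ingestion failed: {doc.error}")
except TimeoutError:
    print("Document is still processing after 120 seconds")
```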
141 start_server.py

@@ -4,10 +4,140 @@ import sys
import tomli
import requests
import logging
import subprocess
import signal
import os
import atexit
from dotenv import load_dotenv
from core.config import get_settings
from core.logging_config import setup_logging
# Global variable to store the worker process
worker_process = None


def check_and_start_redis():
    """Check if the Redis container is running, start if necessary."""
    try:
        # Check if container exists and is running
        check_running_cmd = ["docker", "ps", "-q", "-f", "name=morphik-redis"]
        running_container = subprocess.check_output(check_running_cmd).strip()

        if running_container:
            logging.info("Redis container (morphik-redis) is already running.")
            return

        # Check if container exists but is stopped
        check_exists_cmd = ["docker", "ps", "-a", "-q", "-f", "name=morphik-redis"]
        existing_container = subprocess.check_output(check_exists_cmd).strip()

        if existing_container:
            logging.info("Starting existing Redis container (morphik-redis)...")
            subprocess.run(["docker", "start", "morphik-redis"], check=True, capture_output=True)
            logging.info("Redis container started.")
        else:
            logging.info("Creating and starting Redis container (morphik-redis)...")
            subprocess.run(
                ["docker", "run", "-d", "--name", "morphik-redis", "-p", "6379:6379", "redis"],
                check=True,
                capture_output=True,
            )
            logging.info("Redis container created and started.")

    except subprocess.CalledProcessError as e:
        logging.error(f"Failed to manage Redis container: {e}")
        logging.error(f"Stderr: {e.stderr.decode() if e.stderr else 'N/A'}")
        sys.exit(1)
    except FileNotFoundError:
        logging.error("Docker command not found. Please ensure Docker is installed and in PATH.")
        sys.exit(1)
def start_arq_worker():
    """Start the ARQ worker as a subprocess."""
    global worker_process
    try:
        logging.info("Starting ARQ worker...")

        # Ensure logs directory exists
        log_dir = os.path.join(os.getcwd(), "logs")
        os.makedirs(log_dir, exist_ok=True)

        # Worker log file path
        worker_log_path = os.path.join(log_dir, "worker.log")

        # Open log file
        worker_log = open(worker_log_path, "a")

        # Add timestamp to log
        timestamp = subprocess.check_output(["date"]).decode().strip()
        worker_log.write(f"\n\n--- Worker started at {timestamp} ---\n\n")
        worker_log.flush()

        # Use sys.executable to ensure the same Python environment is used
        worker_cmd = [sys.executable, "-m", "arq", "core.workers.ingestion_worker.WorkerSettings"]

        # Start the worker with output redirected to the log file
        worker_process = subprocess.Popen(
            worker_cmd,
            stdout=worker_log,
            stderr=worker_log,
            env=dict(os.environ, PYTHONUNBUFFERED="1"),  # Ensure unbuffered output
        )
        logging.info(f"ARQ worker started with PID: {worker_process.pid}")
        logging.info(f"Worker logs available at: {worker_log_path}")
    except Exception as e:
        logging.error(f"Failed to start ARQ worker: {e}")
        sys.exit(1)
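The worker command above points arq at `core.workers.ingestion_worker.WorkerSettings`, which is added elsewhere in this PR and not shown in this diff. For orientation, here is a minimal sketch of what an arq worker settings class generally looks like; the task name `process_ingestion_job` and its body are hypothetical placeholders, not the project's actual implementation.

```python
# Minimal arq worker sketch (illustrative only; the real settings live in
# core/workers/ingestion_worker.py and are not shown in this diff).
from arq.connections import RedisSettings


async def process_ingestion_job(ctx, document_id: str) -> None:
    # Hypothetical task body: fetch the document, parse it, store chunks,
    # then mark its status as "completed" or "failed".
    print(f"processing {document_id}")


async def startup(ctx):
    # Open shared resources (database pools, models, ...) once per worker.
    pass


async def shutdown(ctx):
    # Release anything opened in startup.
    pass


class WorkerSettings:
    functions = [process_ingestion_job]
    on_startup = startup
    on_shutdown = shutdown
    redis_settings = RedisSettings(host="localhost", port=6379)
```

The subprocess started above (`-m arq core.workers.ingestion_worker.WorkerSettings`) then consumes jobs enqueued on the same Redis instance that the container management code provisions.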
def cleanup_processes():
    """Stop the ARQ worker process on exit."""
    global worker_process
    if worker_process and worker_process.poll() is None:  # Check if process is still running
        logging.info(f"Stopping ARQ worker (PID: {worker_process.pid})...")

        # Log the worker termination
        try:
            log_dir = os.path.join(os.getcwd(), "logs")
            worker_log_path = os.path.join(log_dir, "worker.log")

            with open(worker_log_path, "a") as worker_log:
                timestamp = subprocess.check_output(["date"]).decode().strip()
                worker_log.write(f"\n\n--- Worker stopping at {timestamp} ---\n\n")
        except Exception as e:
            logging.warning(f"Could not write worker stop message to log: {e}")

        # Send SIGTERM first for graceful shutdown
        worker_process.terminate()
        try:
            # Wait a bit for graceful shutdown
            worker_process.wait(timeout=5)
            logging.info("ARQ worker stopped gracefully.")
        except subprocess.TimeoutExpired:
            logging.warning("ARQ worker did not terminate gracefully, sending SIGKILL.")
            worker_process.kill()  # Force kill if it doesn't stop
            logging.info("ARQ worker killed.")

        # Close any open file descriptors for the process
        if hasattr(worker_process, 'stdout') and worker_process.stdout:
            worker_process.stdout.close()
        if hasattr(worker_process, 'stderr') and worker_process.stderr:
            worker_process.stderr.close()

    # Optional: Add Redis container stop logic here if desired
    # try:
    #     logging.info("Stopping Redis container...")
    #     subprocess.run(["docker", "stop", "morphik-redis"], check=False, capture_output=True)
    # except Exception as e:
    #     logging.warning(f"Could not stop Redis container: {e}")


# Register the cleanup function to be called on script exit
atexit.register(cleanup_processes)
# Also register for SIGINT (Ctrl+C) and SIGTERM
signal.signal(signal.SIGINT, lambda sig, frame: sys.exit(0))
signal.signal(signal.SIGTERM, lambda sig, frame: sys.exit(0))
def check_ollama_running(base_url):
    """Check if Ollama is running and accessible at the given URL."""

@@ -98,6 +228,9 @@ def main():
    # Set up logging first with specified level
    setup_logging(log_level=args.log.upper())

    # Check and start Redis container
    check_and_start_redis()

    # Load environment variables from .env file
    load_dotenv()
@@ -133,14 +266,18 @@ def main():
    # Load settings (this will validate all required env vars)
    settings = get_settings()

    # Start ARQ worker in the background
    start_arq_worker()

    # Start server (this is blocking)
    logging.info("Starting Uvicorn server...")
    uvicorn.run(
        "core.api:app",
        host=settings.HOST,
        port=settings.PORT,
        loop="asyncio",
        log_level=args.log,
        # reload=settings.RELOAD  # Reload might interfere with subprocess management
    )
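On the API side, jobs are pushed onto this queue with arq's `enqueue_job` through the Redis pool initialized at application startup. A hedged sketch of what enqueueing looks like follows; the job name `process_ingestion_job` and its arguments are placeholders and not necessarily the names this PR registers.

```python
# Sketch: enqueue a background job from the FastAPI process (names are illustrative).
from typing import Optional

from arq.jobs import Job


async def queue_document_processing(redis_pool, document_id: str) -> Optional[str]:
    # The job name must match a function listed in WorkerSettings.functions.
    job: Optional[Job] = await redis_pool.enqueue_job("process_ingestion_job", document_id=document_id)
    return job.job_id if job else None
```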