import argparse
import atexit
import logging
import os
import signal
import socket
import subprocess
import sys
import time

import requests
import tomli
import uvicorn
from dotenv import load_dotenv

from core.config import get_settings
from core.logging_config import setup_logging

# Global variable to store the worker process
worker_process = None


def wait_for_redis(host="localhost", port=6379, timeout=20):
    """
    Wait for Redis to become available.

    Args:
        host: Redis host address
        port: Redis port number
        timeout: Maximum time to wait in seconds

    Returns:
        True if Redis becomes available within the timeout, False otherwise
    """
    logging.info(f"Waiting for Redis to be available at {host}:{port}...")
    t0 = time.monotonic()
    while time.monotonic() - t0 < timeout:
        try:
            with socket.create_connection((host, port), timeout=1):
                logging.info("Redis is accepting connections.")
                return True
        except OSError:
            # socket.error is just an alias of OSError on Python 3, so catching
            # OSError alone covers refused connections, timeouts, and DNS errors.
            logging.debug(f"Redis not available yet, retrying... ({int(time.monotonic() - t0)}s elapsed)")
            time.sleep(0.3)

    logging.error(f"Redis not reachable after {timeout}s")
    return False
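

# A TCP connect only proves the port is open, not that Redis can serve
# commands. The helper below is a stricter readiness probe, sketched here as
# an optional alternative: it speaks Redis's inline-command protocol (send
# "PING", expect "+PONG"). Nothing in this script calls it by default.
def ping_redis(host="localhost", port=6379):
    """Return True if the server at host:port answers a Redis PING."""
    try:
        with socket.create_connection((host, port), timeout=1) as sock:
            sock.sendall(b"PING\r\n")  # inline command, terminated by CRLF
            return sock.recv(16).startswith(b"+PONG")  # RESP simple-string reply
    except OSError:
        return False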


def check_and_start_redis():
    """Check if the Redis container is running, start if necessary."""
    try:
        # Check if container exists and is running. Note that docker's name
        # filter matches substrings, so this also matches containers whose
        # names merely contain "morphik-redis".
        check_running_cmd = ["docker", "ps", "-q", "-f", "name=morphik-redis"]
        running_container = subprocess.check_output(check_running_cmd).strip()

        if running_container:
            logging.info("Redis container (morphik-redis) is already running.")
            return

        # Check if container exists but is stopped
        check_exists_cmd = ["docker", "ps", "-a", "-q", "-f", "name=morphik-redis"]
        existing_container = subprocess.check_output(check_exists_cmd).strip()

        if existing_container:
            logging.info("Starting existing Redis container (morphik-redis)...")
            subprocess.run(["docker", "start", "morphik-redis"], check=True, capture_output=True)
            logging.info("Redis container started.")
        else:
            logging.info("Creating and starting Redis container (morphik-redis)...")
            subprocess.run(
                ["docker", "run", "-d", "--name", "morphik-redis", "-p", "6379:6379", "redis"],
                check=True,
                capture_output=True,
            )
            logging.info("Redis container created and started.")

    except subprocess.CalledProcessError as e:
        logging.error(f"Failed to manage Redis container: {e}")
        logging.error(f"Stderr: {e.stderr.decode() if e.stderr else 'N/A'}")
        sys.exit(1)
    except FileNotFoundError:
        logging.error("Docker command not found. Please ensure Docker is installed and in PATH.")
        sys.exit(1)


def start_arq_worker():
    """Start the ARQ worker as a subprocess."""
    global worker_process
    try:
        logging.info("Starting ARQ worker...")

        # Ensure logs directory exists
        log_dir = os.path.join(os.getcwd(), "logs")
        os.makedirs(log_dir, exist_ok=True)

        # Worker log file path
        worker_log_path = os.path.join(log_dir, "worker.log")

        # Open the log file in append mode so restarts keep earlier history
        worker_log = open(worker_log_path, "a")

        # Add a timestamp to the log; time.strftime avoids shelling out to
        # `date`, which is not available on Windows
        timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
        worker_log.write(f"\n\n--- Worker started at {timestamp} ---\n\n")
        worker_log.flush()

        # Use sys.executable to ensure the same Python environment is used
        worker_cmd = [sys.executable, "-m", "arq", "core.workers.ingestion_worker.WorkerSettings"]

        # Start the worker with output redirected to the log file
        worker_process = subprocess.Popen(
            worker_cmd,
            stdout=worker_log,
            stderr=worker_log,
            env=dict(os.environ, PYTHONUNBUFFERED="1"),  # Ensure unbuffered output
        )
        logging.info(f"ARQ worker started with PID: {worker_process.pid}")
        logging.info(f"Worker logs available at: {worker_log_path}")
    except Exception as e:
        logging.error(f"Failed to start ARQ worker: {e}")
        sys.exit(1)
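

# For context: `python -m arq <dotted.path.to.WorkerSettings>` is arq's
# command-line entry point; it imports the named settings class and runs the
# worker loop it defines. Because sys.executable is used above, the worker
# inherits this launcher's interpreter, virtualenv, and working directory.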


def cleanup_processes():
    """Stop the ARQ worker process on exit."""
    global worker_process
    if worker_process and worker_process.poll() is None:  # Check if process is still running
        logging.info(f"Stopping ARQ worker (PID: {worker_process.pid})...")

        # Log the worker termination
        try:
            log_dir = os.path.join(os.getcwd(), "logs")
            worker_log_path = os.path.join(log_dir, "worker.log")

            with open(worker_log_path, "a") as worker_log:
                timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
                worker_log.write(f"\n\n--- Worker stopping at {timestamp} ---\n\n")
        except Exception as e:
            logging.warning(f"Could not write worker stop message to log: {e}")

        # Send SIGTERM first for graceful shutdown
        worker_process.terminate()
        try:
            # Wait a bit for graceful shutdown
            worker_process.wait(timeout=5)
            logging.info("ARQ worker stopped gracefully.")
        except subprocess.TimeoutExpired:
            logging.warning("ARQ worker did not terminate gracefully, sending SIGKILL.")
            worker_process.kill()  # Force kill if it doesn't stop
            logging.info("ARQ worker killed.")

        # Close any open pipes for the process. Popen always defines these
        # attributes (they are None when output is redirected to a file, as
        # here), so a plain truthiness check suffices.
        if worker_process.stdout:
            worker_process.stdout.close()
        if worker_process.stderr:
            worker_process.stderr.close()

    # Optional: stop the Redis container here if desired
    # try:
    #     logging.info("Stopping Redis container...")
    #     subprocess.run(["docker", "stop", "morphik-redis"], check=False, capture_output=True)
    # except Exception as e:
    #     logging.warning(f"Could not stop Redis container: {e}")


# Register the cleanup function to be called on script exit
atexit.register(cleanup_processes)

# Also handle SIGINT (Ctrl+C) and SIGTERM; sys.exit raises SystemExit, which
# lets the atexit handler above run before the process dies
signal.signal(signal.SIGINT, lambda sig, frame: sys.exit(0))
signal.signal(signal.SIGTERM, lambda sig, frame: sys.exit(0))


def check_ollama_running(base_url):
    """Check if Ollama is running and accessible at the given URL."""
    try:
        api_url = f"{base_url}/api/tags"
        response = requests.get(api_url, timeout=2)
        return response.status_code == 200
    except requests.RequestException:
        return False


def get_ollama_usage_info():
    """Check if Ollama is required based on the configuration file and get base URLs."""
    try:
        with open("morphik.toml", "rb") as f:
            config = tomli.load(f)

        ollama_configs = []

        # Get registered Ollama models first
        ollama_models = {}
        if "registered_models" in config:
            for model_key, model_config in config["registered_models"].items():
                model_name = model_config.get("model_name", "")
                if "ollama" in model_name:
                    api_base = model_config.get("api_base")
                    if api_base:
                        ollama_models[model_key] = api_base

        # Check which components are using Ollama models
        components_to_check = ["embedding", "completion", "rules", "graph", "parser.vision"]

        for component in components_to_check:
            if component == "parser.vision":
                # Special handling for the nested parser.vision section
                if "parser" in config and "vision" in config["parser"]:
                    model_key = config["parser"]["vision"].get("model")
                    if model_key in ollama_models:
                        ollama_configs.append({"component": component, "base_url": ollama_models[model_key]})
            else:
                # Standard top-level component check
                if component in config:
                    model_key = config[component].get("model")
                    if model_key in ollama_models:
                        ollama_configs.append({"component": component, "base_url": ollama_models[model_key]})

        # Add contextual chunking model check
        if (
            "parser" in config
            and config["parser"].get("use_contextual_chunking")
            and "contextual_chunking_model" in config["parser"]
        ):
            model_key = config["parser"]["contextual_chunking_model"]
            if model_key in ollama_models:
                ollama_configs.append(
                    {
                        "component": "parser.contextual_chunking",
                        "base_url": ollama_models[model_key],
                    }
                )

        return ollama_configs
    except Exception as e:
        logging.error(f"Error checking Ollama configuration: {e}")
        return []
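

# For reference, the kind of morphik.toml fragment the parser above would pick
# up looks roughly like this (illustrative names and values, not taken from a
# real deployment):
#
#   [registered_models.ollama_llama]
#   model_name = "ollama_chat/llama3.2"
#   api_base = "http://localhost:11434"
#
#   [completion]
#   model = "ollama_llama"
#
# "ollama" appears in model_name, so ollama_llama lands in ollama_models, and
# the completion section that references it yields
# {"component": "completion", "base_url": "http://localhost:11434"}.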


def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description="Start the Morphik server")
    parser.add_argument(
        "--log",
        choices=["debug", "info", "warning", "error"],
        default="info",
        help="Set the logging level",
    )
    parser.add_argument(
        "--skip-ollama-check",
        action="store_true",
        help="Skip Ollama availability check",
    )
    args = parser.parse_args()

    # Set up logging first with the specified level
    setup_logging(log_level=args.log.upper())

    # Check and start the Redis container
    check_and_start_redis()

    # Load environment variables from .env file
    load_dotenv()

    # Check if Ollama is required and running
    if not args.skip_ollama_check:
        ollama_configs = get_ollama_usage_info()

        if ollama_configs:
            # Group configs by base_url to avoid redundant checks
            base_urls = {}
            for config in ollama_configs:
                base_urls.setdefault(config["base_url"], []).append(config["component"])

            all_running = True
            for base_url, components in base_urls.items():
                if not check_ollama_running(base_url):
                    print(f"ERROR: Ollama is not accessible at {base_url}")
                    print(f"This URL is used by these components: {', '.join(components)}")
                    all_running = False

            if not all_running:
                print("\nPlease ensure Ollama is running at the configured URLs before starting the server.")
                print("Run with --skip-ollama-check to bypass this check.")
                sys.exit(1)
            else:
                component_list = [config["component"] for config in ollama_configs]
                print(f"Ollama is running and will be used for: {', '.join(component_list)}")

    # Load settings (this will validate all required env vars)
    settings = get_settings()

    # Wait for Redis to be available (using environment variables or defaults)
    redis_host = os.environ.get("REDIS_HOST", "127.0.0.1")
    redis_port = int(os.environ.get("REDIS_PORT", "6379"))
    if not wait_for_redis(host=redis_host, port=redis_port):
        logging.error("Cannot start server without Redis. Please ensure Redis is running.")
        sys.exit(1)

    # Start the ARQ worker in the background
    start_arq_worker()

    # Start the server (this call blocks)
    logging.info("Starting Uvicorn server...")
    uvicorn.run(
        "core.api:app",
        host=settings.HOST,
        port=settings.PORT,
        loop="asyncio",
        log_level=args.log,
        # reload=settings.RELOAD  # Reload might interfere with subprocess management
    )
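

# Typical invocations, assuming this file is saved as start_server.py:
#   python start_server.py                      # info-level logs, all checks on
#   python start_server.py --log debug          # verbose logging
#   python start_server.py --skip-ollama-check  # bypass the Ollama probe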


if __name__ == "__main__":
    main()