mirror of
https://github.com/james-m-jordan/morphik-core.git
synced 2025-05-09 19:32:38 +00:00
318 lines
11 KiB
Python
318 lines
11 KiB
Python
import os
|
|
from typing import Literal, Optional, Dict, Any
|
|
from pydantic_settings import BaseSettings
|
|
from functools import lru_cache
|
|
import tomli
|
|
from dotenv import load_dotenv
|
|
from collections import ChainMap
|
|
|
|
|
|
class Settings(BaseSettings):
    """Typed container for every Morphik configuration value.

    Fields without a default are mandatory and must be supplied when the
    model is instantiated; fields with a default may be omitted.
    """

    # --- Secrets / credentials (sourced from environment variables) ---
    JWT_SECRET_KEY: str
    POSTGRES_URI: Optional[str] = None
    UNSTRUCTURED_API_KEY: Optional[str] = None
    AWS_ACCESS_KEY: Optional[str] = None
    AWS_SECRET_ACCESS_KEY: Optional[str] = None
    OPENAI_API_KEY: Optional[str] = None
    ANTHROPIC_API_KEY: Optional[str] = None
    ASSEMBLYAI_API_KEY: Optional[str] = None

    # --- API server ---
    HOST: str
    PORT: int
    RELOAD: bool

    # --- Authentication ---
    JWT_ALGORITHM: str
    dev_mode: bool = False
    dev_entity_type: str = "developer"
    dev_entity_id: str = "dev_user"
    dev_permissions: list = ["read", "write", "admin"]

    # --- Registered models (name -> model parameters) ---
    REGISTERED_MODELS: Dict[str, Dict[str, Any]] = {}

    # --- Completion ---
    COMPLETION_PROVIDER: Literal["litellm"] = "litellm"
    COMPLETION_MODEL: str

    # --- Database ---
    DATABASE_PROVIDER: Literal["postgres"]
    DATABASE_NAME: Optional[str] = None

    # --- Embedding ---
    EMBEDDING_PROVIDER: Literal["litellm"] = "litellm"
    EMBEDDING_MODEL: str
    VECTOR_DIMENSIONS: int
    EMBEDDING_SIMILARITY_METRIC: Literal["cosine", "dotProduct"]

    # --- Parser ---
    CHUNK_SIZE: int
    CHUNK_OVERLAP: int
    USE_UNSTRUCTURED_API: bool
    FRAME_SAMPLE_RATE: Optional[int] = None
    USE_CONTEXTUAL_CHUNKING: bool = False

    # --- Rules ---
    RULES_PROVIDER: Literal["litellm"] = "litellm"
    RULES_MODEL: str
    RULES_BATCH_SIZE: int = 4096

    # --- Graph ---
    GRAPH_PROVIDER: Literal["litellm"] = "litellm"
    GRAPH_MODEL: str
    ENABLE_ENTITY_RESOLUTION: bool = True

    # --- Reranker (the RERANKER_* values only matter when USE_RERANKING) ---
    USE_RERANKING: bool
    RERANKER_PROVIDER: Optional[Literal["flag"]] = None
    RERANKER_MODEL: Optional[str] = None
    RERANKER_QUERY_MAX_LENGTH: Optional[int] = None
    RERANKER_PASSAGE_MAX_LENGTH: Optional[int] = None
    RERANKER_USE_FP16: Optional[bool] = None
    RERANKER_DEVICE: Optional[str] = None

    # --- Storage ---
    STORAGE_PROVIDER: Literal["local", "aws-s3"]
    STORAGE_PATH: Optional[str] = None
    AWS_REGION: Optional[str] = None
    S3_BUCKET: Optional[str] = None

    # --- Vector store ---
    VECTOR_STORE_PROVIDER: Literal["pgvector"]
    VECTOR_STORE_DATABASE_NAME: Optional[str] = None

    # --- ColPali ---
    ENABLE_COLPALI: bool

    # --- Deployment mode ---
    MODE: Literal["cloud", "self_hosted"] = "cloud"

    # --- Public API domain ---
    API_DOMAIN: str = "api.morphik.ai"

    # --- Telemetry / OTLP exporter ---
    TELEMETRY_ENABLED: bool = True
    HONEYCOMB_ENABLED: bool = True
    HONEYCOMB_ENDPOINT: str = "https://api.honeycomb.io"
    HONEYCOMB_PROXY_ENDPOINT: str = "https://otel-proxy.onrender.com/"
    SERVICE_NAME: str = "morphik-core"
    OTLP_TIMEOUT: int = 10
    OTLP_MAX_RETRIES: int = 3
    OTLP_RETRY_DELAY: int = 1
    OTLP_MAX_EXPORT_BATCH_SIZE: int = 512
    OTLP_SCHEDULE_DELAY_MILLIS: int = 5000
    OTLP_MAX_QUEUE_SIZE: int = 2048
|
|
|
|
|
|
@lru_cache()
|
|
def get_settings() -> Settings:
|
|
"""Get cached settings instance."""
|
|
load_dotenv(override=True)
|
|
|
|
# Load config.toml
|
|
with open("morphik.toml", "rb") as f:
|
|
config = tomli.load(f)
|
|
|
|
em = "'{missing_value}' needed if '{field}' is set to '{value}'"
|
|
openai_config = {}
|
|
|
|
# load api config
|
|
api_config = {
|
|
"HOST": config["api"]["host"],
|
|
"PORT": int(config["api"]["port"]),
|
|
"RELOAD": bool(config["api"]["reload"]),
|
|
}
|
|
|
|
# load auth config
|
|
auth_config = {
|
|
"JWT_ALGORITHM": config["auth"]["jwt_algorithm"],
|
|
"JWT_SECRET_KEY": os.environ.get(
|
|
"JWT_SECRET_KEY", "dev-secret-key"
|
|
), # Default for dev mode
|
|
"dev_mode": config["auth"].get("dev_mode", False),
|
|
"dev_entity_type": config["auth"].get("dev_entity_type", "developer"),
|
|
"dev_entity_id": config["auth"].get("dev_entity_id", "dev_user"),
|
|
"dev_permissions": config["auth"].get("dev_permissions", ["read", "write", "admin"]),
|
|
}
|
|
|
|
# Only require JWT_SECRET_KEY in non-dev mode
|
|
if not auth_config["dev_mode"] and "JWT_SECRET_KEY" not in os.environ:
|
|
raise ValueError("JWT_SECRET_KEY is required when dev_mode is disabled")
|
|
|
|
# Load registered models if available
|
|
registered_models = {}
|
|
if "registered_models" in config:
|
|
registered_models = {"REGISTERED_MODELS": config["registered_models"]}
|
|
|
|
# load completion config
|
|
completion_config = {
|
|
"COMPLETION_PROVIDER": "litellm",
|
|
}
|
|
|
|
# Set the model key for LiteLLM
|
|
if "model" not in config["completion"]:
|
|
raise ValueError("'model' is required in the completion configuration")
|
|
completion_config["COMPLETION_MODEL"] = config["completion"]["model"]
|
|
|
|
# load database config
|
|
database_config = {"DATABASE_PROVIDER": config["database"]["provider"]}
|
|
if database_config["DATABASE_PROVIDER"] != "postgres":
|
|
prov = database_config["DATABASE_PROVIDER"]
|
|
raise ValueError(f"Unknown database provider selected: '{prov}'")
|
|
|
|
if "POSTGRES_URI" in os.environ:
|
|
database_config.update({"POSTGRES_URI": os.environ["POSTGRES_URI"]})
|
|
else:
|
|
msg = em.format(
|
|
missing_value="POSTGRES_URI", field="database.provider", value="postgres"
|
|
)
|
|
raise ValueError(msg)
|
|
|
|
# load embedding config
|
|
embedding_config = {
|
|
"EMBEDDING_PROVIDER": "litellm",
|
|
"VECTOR_DIMENSIONS": config["embedding"]["dimensions"],
|
|
"EMBEDDING_SIMILARITY_METRIC": config["embedding"]["similarity_metric"],
|
|
}
|
|
|
|
# Set the model key for LiteLLM
|
|
if "model" not in config["embedding"]:
|
|
raise ValueError("'model' is required in the embedding configuration")
|
|
embedding_config["EMBEDDING_MODEL"] = config["embedding"]["model"]
|
|
|
|
# load parser config
|
|
parser_config = {
|
|
"CHUNK_SIZE": config["parser"]["chunk_size"],
|
|
"CHUNK_OVERLAP": config["parser"]["chunk_overlap"],
|
|
"USE_UNSTRUCTURED_API": config["parser"]["use_unstructured_api"],
|
|
"USE_CONTEXTUAL_CHUNKING": config["parser"].get("use_contextual_chunking", False),
|
|
}
|
|
if parser_config["USE_UNSTRUCTURED_API"] and "UNSTRUCTURED_API_KEY" not in os.environ:
|
|
msg = em.format(
|
|
missing_value="UNSTRUCTURED_API_KEY", field="parser.use_unstructured_api", value="true"
|
|
)
|
|
raise ValueError(msg)
|
|
elif parser_config["USE_UNSTRUCTURED_API"]:
|
|
parser_config.update({"UNSTRUCTURED_API_KEY": os.environ["UNSTRUCTURED_API_KEY"]})
|
|
|
|
# load reranker config
|
|
reranker_config = {"USE_RERANKING": config["reranker"]["use_reranker"]}
|
|
if reranker_config["USE_RERANKING"]:
|
|
reranker_config.update(
|
|
{
|
|
"RERANKER_PROVIDER": config["reranker"]["provider"],
|
|
"RERANKER_MODEL": config["reranker"]["model_name"],
|
|
"RERANKER_QUERY_MAX_LENGTH": config["reranker"]["query_max_length"],
|
|
"RERANKER_PASSAGE_MAX_LENGTH": config["reranker"]["passage_max_length"],
|
|
"RERANKER_USE_FP16": config["reranker"]["use_fp16"],
|
|
"RERANKER_DEVICE": config["reranker"]["device"],
|
|
}
|
|
)
|
|
|
|
# load storage config
|
|
storage_config = {"STORAGE_PROVIDER": config["storage"]["provider"], "STORAGE_PATH": config["storage"]["storage_path"]}
|
|
match storage_config["STORAGE_PROVIDER"]:
|
|
case "local":
|
|
storage_config.update({"STORAGE_PATH": config["storage"]["storage_path"]})
|
|
case "aws-s3" if all(
|
|
key in os.environ for key in ["AWS_ACCESS_KEY", "AWS_SECRET_ACCESS_KEY"]
|
|
):
|
|
storage_config.update(
|
|
{
|
|
"AWS_REGION": config["storage"]["region"],
|
|
"S3_BUCKET": config["storage"]["bucket_name"],
|
|
"AWS_ACCESS_KEY": os.environ["AWS_ACCESS_KEY"],
|
|
"AWS_SECRET_ACCESS_KEY": os.environ["AWS_SECRET_ACCESS_KEY"],
|
|
}
|
|
)
|
|
case "aws-s3":
|
|
msg = em.format(
|
|
missing_value="AWS credentials", field="storage.provider", value="aws-s3"
|
|
)
|
|
raise ValueError(msg)
|
|
case _:
|
|
prov = storage_config["STORAGE_PROVIDER"]
|
|
raise ValueError(f"Unknown storage provider selected: '{prov}'")
|
|
|
|
# load vector store config
|
|
vector_store_config = {"VECTOR_STORE_PROVIDER": config["vector_store"]["provider"]}
|
|
if vector_store_config["VECTOR_STORE_PROVIDER"] != "pgvector":
|
|
prov = vector_store_config["VECTOR_STORE_PROVIDER"]
|
|
raise ValueError(f"Unknown vector store provider selected: '{prov}'")
|
|
|
|
if "POSTGRES_URI" not in os.environ:
|
|
msg = em.format(
|
|
missing_value="POSTGRES_URI", field="vector_store.provider", value="pgvector"
|
|
)
|
|
raise ValueError(msg)
|
|
|
|
# load rules config
|
|
rules_config = {
|
|
"RULES_PROVIDER": "litellm",
|
|
"RULES_BATCH_SIZE": config["rules"]["batch_size"],
|
|
}
|
|
|
|
# Set the model key for LiteLLM
|
|
if "model" not in config["rules"]:
|
|
raise ValueError("'model' is required in the rules configuration")
|
|
rules_config["RULES_MODEL"] = config["rules"]["model"]
|
|
|
|
# load morphik config
|
|
morphik_config = {
|
|
"ENABLE_COLPALI": config["morphik"]["enable_colpali"],
|
|
"MODE": config["morphik"].get("mode", "cloud"), # Default to "cloud" mode
|
|
"API_DOMAIN": config["morphik"].get("api_domain", "api.morphik.ai"), # Default API domain
|
|
}
|
|
|
|
# load graph config
|
|
graph_config = {
|
|
"GRAPH_PROVIDER": "litellm",
|
|
"ENABLE_ENTITY_RESOLUTION": config["graph"].get("enable_entity_resolution", True),
|
|
}
|
|
|
|
# Set the model key for LiteLLM
|
|
if "model" not in config["graph"]:
|
|
raise ValueError("'model' is required in the graph configuration")
|
|
graph_config["GRAPH_MODEL"] = config["graph"]["model"]
|
|
|
|
# load telemetry config
|
|
telemetry_config = {}
|
|
if "telemetry" in config:
|
|
telemetry_config = {
|
|
"TELEMETRY_ENABLED": config["telemetry"].get("enabled", True),
|
|
"HONEYCOMB_ENABLED": config["telemetry"].get("honeycomb_enabled", True),
|
|
"HONEYCOMB_ENDPOINT": config["telemetry"].get("honeycomb_endpoint", "https://api.honeycomb.io"),
|
|
"SERVICE_NAME": config["telemetry"].get("service_name", "morphik-core"),
|
|
"OTLP_TIMEOUT": config["telemetry"].get("otlp_timeout", 10),
|
|
"OTLP_MAX_RETRIES": config["telemetry"].get("otlp_max_retries", 3),
|
|
"OTLP_RETRY_DELAY": config["telemetry"].get("otlp_retry_delay", 1),
|
|
"OTLP_MAX_EXPORT_BATCH_SIZE": config["telemetry"].get("otlp_max_export_batch_size", 512),
|
|
"OTLP_SCHEDULE_DELAY_MILLIS": config["telemetry"].get("otlp_schedule_delay_millis", 5000),
|
|
"OTLP_MAX_QUEUE_SIZE": config["telemetry"].get("otlp_max_queue_size", 2048),
|
|
}
|
|
|
|
settings_dict = dict(ChainMap(
|
|
api_config,
|
|
auth_config,
|
|
registered_models,
|
|
completion_config,
|
|
database_config,
|
|
embedding_config,
|
|
parser_config,
|
|
reranker_config,
|
|
storage_config,
|
|
vector_store_config,
|
|
rules_config,
|
|
morphik_config,
|
|
graph_config,
|
|
telemetry_config,
|
|
openai_config,
|
|
))
|
|
|
|
return Settings(**settings_dict)
|