mirror of
https://github.com/james-m-jordan/morphik-core.git
synced 2025-05-09 19:32:38 +00:00
161 lines
6.6 KiB
Python
161 lines
6.6 KiB
Python
from typing import Optional
|
|
from pydantic import Field
|
|
from pydantic_settings import BaseSettings
|
|
from functools import lru_cache
|
|
import tomli
|
|
from dotenv import load_dotenv
|
|
|
|
|
|
class Settings(BaseSettings):
|
|
"""DataBridge configuration settings."""
|
|
|
|
# Required environment variables (referenced in config.toml)
|
|
JWT_SECRET_KEY: str = Field(..., env="JWT_SECRET_KEY")
|
|
MONGODB_URI: Optional[str] = Field(None, env="MONGODB_URI")
|
|
POSTGRES_URI: Optional[str] = Field(None, env="POSTGRES_URI")
|
|
|
|
UNSTRUCTURED_API_KEY: Optional[str] = Field(None, env="UNSTRUCTURED_API_KEY")
|
|
AWS_ACCESS_KEY: Optional[str] = Field(None, env="AWS_ACCESS_KEY")
|
|
AWS_SECRET_ACCESS_KEY: Optional[str] = Field(None, env="AWS_SECRET_ACCESS_KEY")
|
|
ASSEMBLYAI_API_KEY: Optional[str] = Field(None, env="ASSEMBLYAI_API_KEY")
|
|
OPENAI_API_KEY: Optional[str] = Field(None, env="OPENAI_API_KEY")
|
|
ANTHROPIC_API_KEY: Optional[str] = Field(None, env="ANTHROPIC_API_KEY")
|
|
|
|
# Service settings
|
|
HOST: str = "localhost"
|
|
PORT: int = 8000
|
|
RELOAD: bool = False
|
|
|
|
# Component selection
|
|
STORAGE_PROVIDER: str = "local"
|
|
DATABASE_PROVIDER: str = "mongodb"
|
|
VECTOR_STORE_PROVIDER: str = "mongodb"
|
|
EMBEDDING_PROVIDER: str = "openai"
|
|
COMPLETION_PROVIDER: str = "ollama"
|
|
PARSER_PROVIDER: str = "combined"
|
|
RERANKER_PROVIDER: str = "bge"
|
|
|
|
# Storage settings
|
|
STORAGE_PATH: str = "./storage"
|
|
AWS_REGION: str = "us-east-2"
|
|
S3_BUCKET: str = "databridge-s3-storage"
|
|
|
|
# Database settings
|
|
DATABRIDGE_DB: str = "DataBridgeTest"
|
|
DOCUMENTS_TABLE: str = "documents"
|
|
CHUNKS_TABLE: str = "document_chunks"
|
|
DOCUMENTS_COLLECTION: str = "documents"
|
|
CHUNKS_COLLECTION: str = "document_chunks"
|
|
|
|
# Vector store settings
|
|
VECTOR_INDEX_NAME: str = "vector_index"
|
|
VECTOR_DIMENSIONS: int = 1536
|
|
PGVECTOR_TABLE_NAME: str = "vector_embeddings"
|
|
PGVECTOR_INDEX_METHOD: str = "ivfflat"
|
|
PGVECTOR_INDEX_LISTS: int = 100
|
|
PGVECTOR_PROBES: int = 10
|
|
|
|
# Model settings
|
|
EMBEDDING_MODEL: str = "text-embedding-3-small"
|
|
COMPLETION_MODEL: str = "llama3.1"
|
|
COMPLETION_MAX_TOKENS: int = 1000
|
|
COMPLETION_TEMPERATURE: float = 0.7
|
|
OLLAMA_BASE_URL: str = "http://localhost:11434"
|
|
RERANKER_MODEL: str = "BAAI/bge-reranker-v2-gemma"
|
|
RERANKER_DEVICE: Optional[str] = None
|
|
RERANKER_USE_FP16: bool = True
|
|
RERANKER_QUERY_MAX_LENGTH: int = 256
|
|
RERANKER_PASSAGE_MAX_LENGTH: int = 512
|
|
|
|
# Processing settings
|
|
CHUNK_SIZE: int = 1000
|
|
CHUNK_OVERLAP: int = 200
|
|
DEFAULT_K: int = 4
|
|
FRAME_SAMPLE_RATE: int = 120
|
|
USE_UNSTRUCTURED_API: bool = False
|
|
USE_RERANKING: bool = True
|
|
|
|
# Auth settings
|
|
JWT_ALGORITHM: str = "HS256"
|
|
|
|
|
|
@lru_cache()
|
|
def get_settings() -> Settings:
|
|
"""Get cached settings instance."""
|
|
load_dotenv(override=True)
|
|
|
|
# Load config.toml
|
|
with open("config.toml", "rb") as f:
|
|
config = tomli.load(f)
|
|
|
|
# Map config.toml values to settings
|
|
settings_dict = {
|
|
# Service settings
|
|
"HOST": config["service"]["host"],
|
|
"PORT": config["service"]["port"],
|
|
"RELOAD": config["service"]["reload"],
|
|
# Component selection
|
|
"STORAGE_PROVIDER": config["service"]["components"]["storage"],
|
|
"DATABASE_PROVIDER": config["service"]["components"]["database"],
|
|
"VECTOR_STORE_PROVIDER": config["service"]["components"]["vector_store"],
|
|
"EMBEDDING_PROVIDER": config["service"]["components"]["embedding"],
|
|
"COMPLETION_PROVIDER": config["service"]["components"]["completion"],
|
|
"PARSER_PROVIDER": config["service"]["components"]["parser"],
|
|
"RERANKER_PROVIDER": config["service"]["components"]["reranker"],
|
|
# Storage settings
|
|
"STORAGE_PATH": config["storage"]["local"]["path"],
|
|
"AWS_REGION": config["storage"]["aws"]["region"],
|
|
"S3_BUCKET": config["storage"]["aws"]["bucket_name"],
|
|
# Database settings
|
|
"DATABRIDGE_DB": config["database"][config["service"]["components"]["database"]][
|
|
"database_name"
|
|
],
|
|
"DOCUMENTS_TABLE": config["database"]
|
|
.get("postgres", {})
|
|
.get("documents_table", "documents"),
|
|
"CHUNKS_TABLE": config["database"]
|
|
.get("postgres", {})
|
|
.get("chunks_table", "document_chunks"),
|
|
"DOCUMENTS_COLLECTION": config["database"]
|
|
.get("mongodb", {})
|
|
.get("documents_collection", "documents"),
|
|
"CHUNKS_COLLECTION": config["database"]
|
|
.get("mongodb", {})
|
|
.get("chunks_collection", "document_chunks"),
|
|
# Vector store settings
|
|
"VECTOR_INDEX_NAME": config["vector_store"]["mongodb"]["index_name"],
|
|
"VECTOR_DIMENSIONS": config["vector_store"][
|
|
config["service"]["components"]["vector_store"]
|
|
]["dimensions"],
|
|
"PGVECTOR_TABLE_NAME": config["vector_store"]
|
|
.get("pgvector", {})
|
|
.get("table_name", "vector_embeddings"),
|
|
"PGVECTOR_INDEX_METHOD": config["vector_store"]
|
|
.get("pgvector", {})
|
|
.get("index_method", "ivfflat"),
|
|
"PGVECTOR_INDEX_LISTS": config["vector_store"].get("pgvector", {}).get("index_lists", 100),
|
|
"PGVECTOR_PROBES": config["vector_store"].get("pgvector", {}).get("probes", 10),
|
|
# Model settings
|
|
"EMBEDDING_MODEL": config["models"]["embedding"]["model_name"],
|
|
"COMPLETION_MODEL": config["models"]["completion"]["model_name"],
|
|
"COMPLETION_MAX_TOKENS": config["models"]["completion"]["default_max_tokens"],
|
|
"COMPLETION_TEMPERATURE": config["models"]["completion"]["default_temperature"],
|
|
"OLLAMA_BASE_URL": config["models"]["ollama"]["base_url"],
|
|
"RERANKER_MODEL": config["models"]["reranker"]["model_name"],
|
|
"RERANKER_DEVICE": config["models"]["reranker"].get("device"),
|
|
"RERANKER_USE_FP16": config["models"]["reranker"].get("use_fp16", True),
|
|
"RERANKER_QUERY_MAX_LENGTH": config["models"]["reranker"].get("query_max_length", 256),
|
|
"RERANKER_PASSAGE_MAX_LENGTH": config["models"]["reranker"].get("passage_max_length", 512),
|
|
# Processing settings
|
|
"CHUNK_SIZE": config["processing"]["text"]["chunk_size"],
|
|
"CHUNK_OVERLAP": config["processing"]["text"]["chunk_overlap"],
|
|
"DEFAULT_K": config["processing"]["text"]["default_k"],
|
|
"USE_RERANKING": config["processing"]["text"]["use_reranking"],
|
|
"FRAME_SAMPLE_RATE": config["processing"]["video"]["frame_sample_rate"],
|
|
"USE_UNSTRUCTURED_API": config["processing"]["unstructured"]["use_api"],
|
|
# Auth settings
|
|
"JWT_ALGORITHM": config["auth"]["jwt_algorithm"],
|
|
}
|
|
|
|
return Settings(**settings_dict)
|