mirror of
https://github.com/james-m-jordan/morphik-core.git
synced 2025-05-09 19:32:38 +00:00
update configuration style to support easy model editing
This commit is contained in:
parent
36ccca0332
commit
418054e9a3
75
config.toml
75
config.toml
@ -1,32 +1,53 @@
|
||||
[aws]
|
||||
default_region = "us-east-2"
|
||||
default_bucket_name = "databridge-s3-storage"
|
||||
|
||||
[mongodb]
|
||||
database_name = "DataBridgeTest"
|
||||
documents_collection = "documents"
|
||||
chunks_collection = "document_chunks"
|
||||
|
||||
[mongodb.vector]
|
||||
dimensions = 1536
|
||||
index_name = "vector_index"
|
||||
|
||||
[model]
|
||||
embedding_model = "text-embedding-3-small"
|
||||
completion_model = "gpt-3.5-turbo"
|
||||
|
||||
[document_processing]
|
||||
chunk_size = 1000
|
||||
chunk_overlap = 200
|
||||
default_k = 4
|
||||
|
||||
[video_processing]
|
||||
frame_sample_rate = 120
|
||||
|
||||
[server]
|
||||
# Core Service Components
|
||||
[service]
|
||||
host = "localhost"
|
||||
port = 8000
|
||||
reload = false
|
||||
|
||||
[service.components]
|
||||
storage = "aws-s3"
|
||||
database = "mongodb"
|
||||
vector_store = "mongodb"
|
||||
embedding = "openai"
|
||||
completion = "openai"
|
||||
parser = "combined"
|
||||
|
||||
# Storage Configuration
|
||||
[storage.aws]
|
||||
region = "us-east-2"
|
||||
bucket_name = "databridge-s3-storage"
|
||||
|
||||
# Database Configuration
|
||||
[database.mongodb]
|
||||
database_name = "DataBridgeTest"
|
||||
documents_collection = "documents"
|
||||
chunks_collection = "document_chunks"
|
||||
|
||||
# Vector Store Configuration
|
||||
[vector_store.mongodb]
|
||||
dimensions = 1536
|
||||
index_name = "vector_index"
|
||||
|
||||
# Model Configurations
|
||||
[models]
|
||||
[models.embedding]
|
||||
model_name = "text-embedding-3-small"
|
||||
|
||||
[models.completion]
|
||||
model_name = "gpt-4o-mini"
|
||||
default_max_tokens = 1000
|
||||
default_temperature = 0.7
|
||||
|
||||
# Document Processing
|
||||
[processing]
|
||||
[processing.text]
|
||||
chunk_size = 1000
|
||||
chunk_overlap = 200
|
||||
default_k = 4
|
||||
|
||||
[processing.video]
|
||||
frame_sample_rate = 120
|
||||
|
||||
# Authentication
|
||||
[auth]
|
||||
jwt_algorithm = "HS256"
|
||||
jwt_algorithm = "HS256"
|
||||
|
115
core/api.py
115
core/api.py
@ -5,6 +5,7 @@ from fastapi import FastAPI, Form, HTTPException, Depends, Header, UploadFile
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
import jwt
|
||||
import logging
|
||||
from core.completion.openai_completion import OpenAICompletionModel
|
||||
from core.models.request import (
|
||||
IngestTextRequest,
|
||||
RetrieveRequest,
|
||||
@ -14,14 +15,14 @@ from core.models.documents import Document, DocumentResult, ChunkResult
|
||||
from core.models.auth import AuthContext, EntityType
|
||||
from core.parser.combined_parser import CombinedParser
|
||||
from core.completion.base_completion import CompletionResponse
|
||||
from core.parser.unstructured_parser import UnstructuredAPIParser
|
||||
from core.services.document_service import DocumentService
|
||||
from core.config import get_settings
|
||||
from core.database.mongo_database import MongoDatabase
|
||||
from core.vector_store.mongo_vector_store import MongoDBAtlasVectorStore
|
||||
from core.storage.s3_storage import S3Storage
|
||||
from core.parser.unstructured_parser import UnstructuredAPIParser
|
||||
from core.embedding_model.openai_embedding_model import OpenAIEmbeddingModel
|
||||
from core.completion.openai_completion import OpenAICompletionModel
|
||||
from core.embedding.openai_embedding_model import OpenAIEmbeddingModel
|
||||
from core.completion.ollama_completion import OllamaCompletionModel
|
||||
|
||||
# Initialize FastAPI app
|
||||
app = FastAPI(title="DataBridge API")
|
||||
@ -39,42 +40,90 @@ app.add_middleware(
|
||||
# Initialize service
|
||||
settings = get_settings()
|
||||
|
||||
# Initialize components
|
||||
database = MongoDatabase(
|
||||
uri=settings.MONGODB_URI,
|
||||
db_name=settings.DATABRIDGE_DB,
|
||||
collection_name=settings.DOCUMENTS_COLLECTION,
|
||||
)
|
||||
# Initialize database
|
||||
match settings.DATABASE_PROVIDER:
|
||||
case "mongodb":
|
||||
database = MongoDatabase(
|
||||
uri=settings.MONGODB_URI,
|
||||
db_name=settings.DATABRIDGE_DB,
|
||||
collection_name=settings.DOCUMENTS_COLLECTION,
|
||||
)
|
||||
case _:
|
||||
raise ValueError(f"Unsupported database provider: {settings.DATABASE_PROVIDER}")
|
||||
|
||||
vector_store = MongoDBAtlasVectorStore(
|
||||
uri=settings.MONGODB_URI,
|
||||
database_name=settings.DATABRIDGE_DB,
|
||||
collection_name=settings.CHUNKS_COLLECTION,
|
||||
index_name=settings.VECTOR_INDEX_NAME,
|
||||
)
|
||||
# Initialize vector store
|
||||
match settings.VECTOR_STORE_PROVIDER:
|
||||
case "mongodb":
|
||||
vector_store = MongoDBAtlasVectorStore(
|
||||
uri=settings.MONGODB_URI,
|
||||
database_name=settings.DATABRIDGE_DB,
|
||||
collection_name=settings.CHUNKS_COLLECTION,
|
||||
index_name=settings.VECTOR_INDEX_NAME,
|
||||
)
|
||||
case _:
|
||||
raise ValueError(
|
||||
f"Unsupported vector store provider: {settings.VECTOR_STORE_PROVIDER}"
|
||||
)
|
||||
|
||||
storage = S3Storage(
|
||||
aws_access_key=settings.AWS_ACCESS_KEY,
|
||||
aws_secret_key=settings.AWS_SECRET_ACCESS_KEY,
|
||||
region_name=settings.AWS_REGION,
|
||||
default_bucket=settings.S3_BUCKET,
|
||||
)
|
||||
# Initialize storage
|
||||
match settings.STORAGE_PROVIDER:
|
||||
case "aws-s3":
|
||||
storage = S3Storage(
|
||||
aws_access_key=settings.AWS_ACCESS_KEY,
|
||||
aws_secret_key=settings.AWS_SECRET_ACCESS_KEY,
|
||||
region_name=settings.AWS_REGION,
|
||||
default_bucket=settings.S3_BUCKET,
|
||||
)
|
||||
case _:
|
||||
raise ValueError(f"Unsupported storage provider: {settings.STORAGE_PROVIDER}")
|
||||
|
||||
parser = CombinedParser(
|
||||
unstructured_api_key=settings.UNSTRUCTURED_API_KEY,
|
||||
assemblyai_api_key=settings.ASSEMBLYAI_API_KEY,
|
||||
chunk_size=settings.CHUNK_SIZE,
|
||||
chunk_overlap=settings.CHUNK_OVERLAP,
|
||||
frame_sample_rate=settings.FRAME_SAMPLE_RATE,
|
||||
)
|
||||
# Initialize parser
|
||||
match settings.PARSER_PROVIDER:
|
||||
case "combined":
|
||||
parser = CombinedParser(
|
||||
unstructured_api_key=settings.UNSTRUCTURED_API_KEY,
|
||||
assemblyai_api_key=settings.ASSEMBLYAI_API_KEY,
|
||||
chunk_size=settings.CHUNK_SIZE,
|
||||
chunk_overlap=settings.CHUNK_OVERLAP,
|
||||
frame_sample_rate=settings.FRAME_SAMPLE_RATE,
|
||||
)
|
||||
case "unstructured":
|
||||
parser = UnstructuredAPIParser(
|
||||
unstructured_api_key=settings.UNSTRUCTURED_API_KEY,
|
||||
chunk_size=settings.CHUNK_SIZE,
|
||||
chunk_overlap=settings.CHUNK_OVERLAP,
|
||||
)
|
||||
case _:
|
||||
raise ValueError(f"Unsupported parser provider: {settings.PARSER_PROVIDER}")
|
||||
|
||||
embedding_model = OpenAIEmbeddingModel(
|
||||
api_key=settings.OPENAI_API_KEY, model_name=settings.EMBEDDING_MODEL
|
||||
)
|
||||
# Initialize embedding model
|
||||
match settings.EMBEDDING_PROVIDER:
|
||||
case "openai":
|
||||
embedding_model = OpenAIEmbeddingModel(
|
||||
api_key=settings.OPENAI_API_KEY,
|
||||
model_name=settings.EMBEDDING_MODEL,
|
||||
)
|
||||
case _:
|
||||
raise ValueError(
|
||||
f"Unsupported embedding provider: {settings.EMBEDDING_PROVIDER}"
|
||||
)
|
||||
|
||||
completion_model = OpenAICompletionModel(model_name=settings.COMPLETION_MODEL)
|
||||
# Initialize completion model
|
||||
match settings.COMPLETION_PROVIDER:
|
||||
case "ollama":
|
||||
completion_model = OllamaCompletionModel(
|
||||
model_name=settings.COMPLETION_MODEL,
|
||||
)
|
||||
case "openai":
|
||||
completion_model = OpenAICompletionModel(
|
||||
model_name=settings.COMPLETION_MODEL,
|
||||
)
|
||||
case _:
|
||||
raise ValueError(
|
||||
f"Unsupported completion provider: {settings.COMPLETION_PROVIDER}"
|
||||
)
|
||||
|
||||
# Initialize document service
|
||||
# Initialize document service with configured components
|
||||
document_service = DocumentService(
|
||||
database=database,
|
||||
vector_store=vector_store,
|
||||
|
100
core/config.py
100
core/config.py
@ -8,7 +8,7 @@ from dotenv import load_dotenv
|
||||
class Settings(BaseSettings):
|
||||
"""DataBridge configuration settings."""
|
||||
|
||||
# Required environment variables
|
||||
# Required environment variables (referenced in config.toml)
|
||||
MONGODB_URI: str = Field(..., env="MONGODB_URI")
|
||||
OPENAI_API_KEY: str = Field(..., env="OPENAI_API_KEY")
|
||||
UNSTRUCTURED_API_KEY: str = Field(..., env="UNSTRUCTURED_API_KEY")
|
||||
@ -17,25 +17,47 @@ class Settings(BaseSettings):
|
||||
AWS_SECRET_ACCESS_KEY: str = Field(..., env="AWS_SECRET_ACCESS_KEY")
|
||||
JWT_SECRET_KEY: str = Field(..., env="JWT_SECRET_KEY")
|
||||
|
||||
# Values from config.toml with defaults
|
||||
AWS_REGION: str = "us-east-2"
|
||||
S3_BUCKET: str = "databridge-s3-storage"
|
||||
DATABRIDGE_DB: str = "databridge"
|
||||
DOCUMENTS_COLLECTION: str = "documents"
|
||||
CHUNKS_COLLECTION: str = "document_chunks"
|
||||
VECTOR_INDEX_NAME: str = "vector_index"
|
||||
VECTOR_DIMENSIONS: int = 1536
|
||||
EMBEDDING_MODEL: str = "text-embedding-3-small"
|
||||
COMPLETION_MODEL: str = "gpt-3.5-turbo"
|
||||
CHUNK_SIZE: int = 1000
|
||||
CHUNK_OVERLAP: int = 200
|
||||
DEFAULT_K: int = 4
|
||||
# Service settings
|
||||
HOST: str = "localhost"
|
||||
PORT: int = 8000
|
||||
RELOAD: bool = False
|
||||
JWT_ALGORITHM: str = "HS256"
|
||||
|
||||
# Component selection
|
||||
STORAGE_PROVIDER: str = "aws-s3"
|
||||
DATABASE_PROVIDER: str = "mongodb"
|
||||
VECTOR_STORE_PROVIDER: str = "mongodb"
|
||||
EMBEDDING_PROVIDER: str = "openai"
|
||||
COMPLETION_PROVIDER: str = "ollama"
|
||||
PARSER_PROVIDER: str = "combined"
|
||||
|
||||
# Storage settings
|
||||
AWS_REGION: str = "us-east-2"
|
||||
S3_BUCKET: str = "databridge-s3-storage"
|
||||
|
||||
# Database settings
|
||||
DATABRIDGE_DB: str = "DataBridgeTest"
|
||||
DOCUMENTS_COLLECTION: str = "documents"
|
||||
CHUNKS_COLLECTION: str = "document_chunks"
|
||||
|
||||
# Vector store settings
|
||||
VECTOR_INDEX_NAME: str = "vector_index"
|
||||
VECTOR_DIMENSIONS: int = 1536
|
||||
|
||||
# Model settings
|
||||
EMBEDDING_MODEL: str = "text-embedding-3-small"
|
||||
COMPLETION_MODEL: str = "llama3.1"
|
||||
COMPLETION_MAX_TOKENS: int = 1000
|
||||
COMPLETION_TEMPERATURE: float = 0.7
|
||||
|
||||
# Processing settings
|
||||
CHUNK_SIZE: int = 1000
|
||||
CHUNK_OVERLAP: int = 200
|
||||
DEFAULT_K: int = 4
|
||||
FRAME_SAMPLE_RATE: int = 120
|
||||
|
||||
# Auth settings
|
||||
JWT_ALGORITHM: str = "HS256"
|
||||
|
||||
|
||||
@lru_cache()
|
||||
def get_settings() -> Settings:
|
||||
@ -48,23 +70,39 @@ def get_settings() -> Settings:
|
||||
|
||||
# Map config.toml values to settings
|
||||
settings_dict = {
|
||||
"AWS_REGION": config["aws"]["default_region"],
|
||||
"S3_BUCKET": config["aws"]["default_bucket_name"],
|
||||
"DATABRIDGE_DB": config["mongodb"]["database_name"],
|
||||
"DOCUMENTS_COLLECTION": config["mongodb"]["documents_collection"],
|
||||
"CHUNKS_COLLECTION": config["mongodb"]["chunks_collection"],
|
||||
"VECTOR_INDEX_NAME": config["mongodb"]["vector"]["index_name"],
|
||||
"VECTOR_DIMENSIONS": config["mongodb"]["vector"]["dimensions"],
|
||||
"EMBEDDING_MODEL": config["model"]["embedding_model"],
|
||||
"COMPLETION_MODEL": config["model"]["completion_model"],
|
||||
"CHUNK_SIZE": config["document_processing"]["chunk_size"],
|
||||
"CHUNK_OVERLAP": config["document_processing"]["chunk_overlap"],
|
||||
"DEFAULT_K": config["document_processing"]["default_k"],
|
||||
"HOST": config["server"]["host"],
|
||||
"PORT": config["server"]["port"],
|
||||
"RELOAD": config["server"]["reload"],
|
||||
# Service settings
|
||||
"HOST": config["service"]["host"],
|
||||
"PORT": config["service"]["port"],
|
||||
"RELOAD": config["service"]["reload"],
|
||||
# Component selection
|
||||
"STORAGE_PROVIDER": config["service"]["components"]["storage"],
|
||||
"DATABASE_PROVIDER": config["service"]["components"]["database"],
|
||||
"VECTOR_STORE_PROVIDER": config["service"]["components"]["vector_store"],
|
||||
"EMBEDDING_PROVIDER": config["service"]["components"]["embedding"],
|
||||
"COMPLETION_PROVIDER": config["service"]["components"]["completion"],
|
||||
"PARSER_PROVIDER": config["service"]["components"]["parser"],
|
||||
# Storage settings
|
||||
"AWS_REGION": config["storage"]["aws"]["region"],
|
||||
"S3_BUCKET": config["storage"]["aws"]["bucket_name"],
|
||||
# Database settings
|
||||
"DATABRIDGE_DB": config["database"]["mongodb"]["database_name"],
|
||||
"DOCUMENTS_COLLECTION": config["database"]["mongodb"]["documents_collection"],
|
||||
"CHUNKS_COLLECTION": config["database"]["mongodb"]["chunks_collection"],
|
||||
# Vector store settings
|
||||
"VECTOR_INDEX_NAME": config["vector_store"]["mongodb"]["index_name"],
|
||||
"VECTOR_DIMENSIONS": config["vector_store"]["mongodb"]["dimensions"],
|
||||
# Model settings
|
||||
"EMBEDDING_MODEL": config["models"]["embedding"]["model_name"],
|
||||
"COMPLETION_MODEL": config["models"]["completion"]["model_name"],
|
||||
"COMPLETION_MAX_TOKENS": config["models"]["completion"]["default_max_tokens"],
|
||||
"COMPLETION_TEMPERATURE": config["models"]["completion"]["default_temperature"],
|
||||
# Processing settings
|
||||
"CHUNK_SIZE": config["processing"]["text"]["chunk_size"],
|
||||
"CHUNK_OVERLAP": config["processing"]["text"]["chunk_overlap"],
|
||||
"DEFAULT_K": config["processing"]["text"]["default_k"],
|
||||
"FRAME_SAMPLE_RATE": config["processing"]["video"]["frame_sample_rate"],
|
||||
# Auth settings
|
||||
"JWT_ALGORITHM": config["auth"]["jwt_algorithm"],
|
||||
"FRAME_SAMPLE_RATE": config["video_processing"]["frame_sample_rate"],
|
||||
}
|
||||
|
||||
return Settings(**settings_dict)
|
||||
|
@ -14,7 +14,7 @@ from ..models.auth import AuthContext
|
||||
from core.database.base_database import BaseDatabase
|
||||
from core.storage.base_storage import BaseStorage
|
||||
from core.vector_store.base_vector_store import BaseVectorStore
|
||||
from core.embedding_model.base_embedding_model import BaseEmbeddingModel
|
||||
from core.embedding.base_embedding_model import BaseEmbeddingModel
|
||||
from core.parser.base_parser import BaseParser
|
||||
from core.completion.base_completion import BaseCompletionModel
|
||||
from core.completion.base_completion import CompletionRequest, CompletionResponse
|
||||
|
Loading…
x
Reference in New Issue
Block a user