diff --git a/config.toml b/config.toml index de8b5b9..b2e70ce 100644 --- a/config.toml +++ b/config.toml @@ -1,32 +1,53 @@ -[aws] -default_region = "us-east-2" -default_bucket_name = "databridge-s3-storage" - -[mongodb] -database_name = "DataBridgeTest" -documents_collection = "documents" -chunks_collection = "document_chunks" - -[mongodb.vector] -dimensions = 1536 -index_name = "vector_index" - -[model] -embedding_model = "text-embedding-3-small" -completion_model = "gpt-3.5-turbo" - -[document_processing] -chunk_size = 1000 -chunk_overlap = 200 -default_k = 4 - -[video_processing] -frame_sample_rate = 120 - -[server] +# Core Service Components +[service] host = "localhost" port = 8000 reload = false +[service.components] +storage = "aws-s3" +database = "mongodb" +vector_store = "mongodb" +embedding = "openai" +completion = "openai" +parser = "combined" + +# Storage Configuration +[storage.aws] +region = "us-east-2" +bucket_name = "databridge-s3-storage" + +# Database Configuration +[database.mongodb] +database_name = "DataBridgeTest" +documents_collection = "documents" +chunks_collection = "document_chunks" + +# Vector Store Configuration +[vector_store.mongodb] +dimensions = 1536 +index_name = "vector_index" + +# Model Configurations +[models] +[models.embedding] +model_name = "text-embedding-3-small" + +[models.completion] +model_name = "gpt-4o-mini" +default_max_tokens = 1000 +default_temperature = 0.7 + +# Document Processing +[processing] +[processing.text] +chunk_size = 1000 +chunk_overlap = 200 +default_k = 4 + +[processing.video] +frame_sample_rate = 120 + +# Authentication [auth] -jwt_algorithm = "HS256" \ No newline at end of file +jwt_algorithm = "HS256" diff --git a/core/api.py b/core/api.py index f514b03..bc07811 100644 --- a/core/api.py +++ b/core/api.py @@ -5,6 +5,7 @@ from fastapi import FastAPI, Form, HTTPException, Depends, Header, UploadFile from fastapi.middleware.cors import CORSMiddleware import jwt import logging +from 
core.completion.openai_completion import OpenAICompletionModel from core.models.request import ( IngestTextRequest, RetrieveRequest, @@ -14,14 +15,14 @@ from core.models.documents import Document, DocumentResult, ChunkResult from core.models.auth import AuthContext, EntityType from core.parser.combined_parser import CombinedParser from core.completion.base_completion import CompletionResponse +from core.parser.unstructured_parser import UnstructuredAPIParser from core.services.document_service import DocumentService from core.config import get_settings from core.database.mongo_database import MongoDatabase from core.vector_store.mongo_vector_store import MongoDBAtlasVectorStore from core.storage.s3_storage import S3Storage -from core.parser.unstructured_parser import UnstructuredAPIParser -from core.embedding_model.openai_embedding_model import OpenAIEmbeddingModel -from core.completion.openai_completion import OpenAICompletionModel +from core.embedding.openai_embedding_model import OpenAIEmbeddingModel +from core.completion.ollama_completion import OllamaCompletionModel # Initialize FastAPI app app = FastAPI(title="DataBridge API") @@ -39,42 +40,90 @@ app.add_middleware( # Initialize service settings = get_settings() -# Initialize components -database = MongoDatabase( - uri=settings.MONGODB_URI, - db_name=settings.DATABRIDGE_DB, - collection_name=settings.DOCUMENTS_COLLECTION, -) +# Initialize database +match settings.DATABASE_PROVIDER: + case "mongodb": + database = MongoDatabase( + uri=settings.MONGODB_URI, + db_name=settings.DATABRIDGE_DB, + collection_name=settings.DOCUMENTS_COLLECTION, + ) + case _: + raise ValueError(f"Unsupported database provider: {settings.DATABASE_PROVIDER}") -vector_store = MongoDBAtlasVectorStore( - uri=settings.MONGODB_URI, - database_name=settings.DATABRIDGE_DB, - collection_name=settings.CHUNKS_COLLECTION, - index_name=settings.VECTOR_INDEX_NAME, -) +# Initialize vector store +match settings.VECTOR_STORE_PROVIDER: + case "mongodb": 
+ vector_store = MongoDBAtlasVectorStore( + uri=settings.MONGODB_URI, + database_name=settings.DATABRIDGE_DB, + collection_name=settings.CHUNKS_COLLECTION, + index_name=settings.VECTOR_INDEX_NAME, + ) + case _: + raise ValueError( + f"Unsupported vector store provider: {settings.VECTOR_STORE_PROVIDER}" + ) -storage = S3Storage( - aws_access_key=settings.AWS_ACCESS_KEY, - aws_secret_key=settings.AWS_SECRET_ACCESS_KEY, - region_name=settings.AWS_REGION, - default_bucket=settings.S3_BUCKET, -) +# Initialize storage +match settings.STORAGE_PROVIDER: + case "aws-s3": + storage = S3Storage( + aws_access_key=settings.AWS_ACCESS_KEY, + aws_secret_key=settings.AWS_SECRET_ACCESS_KEY, + region_name=settings.AWS_REGION, + default_bucket=settings.S3_BUCKET, + ) + case _: + raise ValueError(f"Unsupported storage provider: {settings.STORAGE_PROVIDER}") -parser = CombinedParser( - unstructured_api_key=settings.UNSTRUCTURED_API_KEY, - assemblyai_api_key=settings.ASSEMBLYAI_API_KEY, - chunk_size=settings.CHUNK_SIZE, - chunk_overlap=settings.CHUNK_OVERLAP, - frame_sample_rate=settings.FRAME_SAMPLE_RATE, -) +# Initialize parser +match settings.PARSER_PROVIDER: + case "combined": + parser = CombinedParser( + unstructured_api_key=settings.UNSTRUCTURED_API_KEY, + assemblyai_api_key=settings.ASSEMBLYAI_API_KEY, + chunk_size=settings.CHUNK_SIZE, + chunk_overlap=settings.CHUNK_OVERLAP, + frame_sample_rate=settings.FRAME_SAMPLE_RATE, + ) + case "unstructured": + parser = UnstructuredAPIParser( + unstructured_api_key=settings.UNSTRUCTURED_API_KEY, + chunk_size=settings.CHUNK_SIZE, + chunk_overlap=settings.CHUNK_OVERLAP, + ) + case _: + raise ValueError(f"Unsupported parser provider: {settings.PARSER_PROVIDER}") -embedding_model = OpenAIEmbeddingModel( - api_key=settings.OPENAI_API_KEY, model_name=settings.EMBEDDING_MODEL -) +# Initialize embedding model +match settings.EMBEDDING_PROVIDER: + case "openai": + embedding_model = OpenAIEmbeddingModel( + api_key=settings.OPENAI_API_KEY, + 
model_name=settings.EMBEDDING_MODEL, + ) + case _: + raise ValueError( + f"Unsupported embedding provider: {settings.EMBEDDING_PROVIDER}" + ) -completion_model = OpenAICompletionModel(model_name=settings.COMPLETION_MODEL) +# Initialize completion model +match settings.COMPLETION_PROVIDER: + case "ollama": + completion_model = OllamaCompletionModel( + model_name=settings.COMPLETION_MODEL, + ) + case "openai": + completion_model = OpenAICompletionModel( + model_name=settings.COMPLETION_MODEL, + ) + case _: + raise ValueError( + f"Unsupported completion provider: {settings.COMPLETION_PROVIDER}" + ) -# Initialize document service +# Initialize document service with configured components document_service = DocumentService( database=database, vector_store=vector_store, diff --git a/core/config.py b/core/config.py index 08e0c8f..cfe0c95 100644 --- a/core/config.py +++ b/core/config.py @@ -8,7 +8,7 @@ from dotenv import load_dotenv class Settings(BaseSettings): """DataBridge configuration settings.""" - # Required environment variables + # Required environment variables (credentials for the components configured in config.toml) MONGODB_URI: str = Field(..., env="MONGODB_URI") OPENAI_API_KEY: str = Field(..., env="OPENAI_API_KEY") UNSTRUCTURED_API_KEY: str = Field(..., env="UNSTRUCTURED_API_KEY") @@ -17,25 +17,47 @@ class Settings(BaseSettings): AWS_SECRET_ACCESS_KEY: str = Field(..., env="AWS_SECRET_ACCESS_KEY") JWT_SECRET_KEY: str = Field(..., env="JWT_SECRET_KEY") - # Values from config.toml with defaults - AWS_REGION: str = "us-east-2" - S3_BUCKET: str = "databridge-s3-storage" - DATABRIDGE_DB: str = "databridge" - DOCUMENTS_COLLECTION: str = "documents" - CHUNKS_COLLECTION: str = "document_chunks" - VECTOR_INDEX_NAME: str = "vector_index" - VECTOR_DIMENSIONS: int = 1536 - EMBEDDING_MODEL: str = "text-embedding-3-small" - COMPLETION_MODEL: str = "gpt-3.5-turbo" - CHUNK_SIZE: int = 1000 - CHUNK_OVERLAP: int = 200 - DEFAULT_K: int = 4 + # Service settings HOST: str = "localhost" PORT: int = 8000 
RELOAD: bool = False - JWT_ALGORITHM: str = "HS256" + + # Component selection + STORAGE_PROVIDER: str = "aws-s3" + DATABASE_PROVIDER: str = "mongodb" + VECTOR_STORE_PROVIDER: str = "mongodb" + EMBEDDING_PROVIDER: str = "openai" + COMPLETION_PROVIDER: str = "ollama" + PARSER_PROVIDER: str = "combined" + + # Storage settings + AWS_REGION: str = "us-east-2" + S3_BUCKET: str = "databridge-s3-storage" + + # Database settings + DATABRIDGE_DB: str = "DataBridgeTest" + DOCUMENTS_COLLECTION: str = "documents" + CHUNKS_COLLECTION: str = "document_chunks" + + # Vector store settings + VECTOR_INDEX_NAME: str = "vector_index" + VECTOR_DIMENSIONS: int = 1536 + + # Model settings + EMBEDDING_MODEL: str = "text-embedding-3-small" + COMPLETION_MODEL: str = "llama3.1" + COMPLETION_MAX_TOKENS: int = 1000 + COMPLETION_TEMPERATURE: float = 0.7 + + # Processing settings + CHUNK_SIZE: int = 1000 + CHUNK_OVERLAP: int = 200 + DEFAULT_K: int = 4 FRAME_SAMPLE_RATE: int = 120 + # Auth settings + JWT_ALGORITHM: str = "HS256" + @lru_cache() def get_settings() -> Settings: @@ -48,23 +70,39 @@ def get_settings() -> Settings: # Map config.toml values to settings settings_dict = { - "AWS_REGION": config["aws"]["default_region"], - "S3_BUCKET": config["aws"]["default_bucket_name"], - "DATABRIDGE_DB": config["mongodb"]["database_name"], - "DOCUMENTS_COLLECTION": config["mongodb"]["documents_collection"], - "CHUNKS_COLLECTION": config["mongodb"]["chunks_collection"], - "VECTOR_INDEX_NAME": config["mongodb"]["vector"]["index_name"], - "VECTOR_DIMENSIONS": config["mongodb"]["vector"]["dimensions"], - "EMBEDDING_MODEL": config["model"]["embedding_model"], - "COMPLETION_MODEL": config["model"]["completion_model"], - "CHUNK_SIZE": config["document_processing"]["chunk_size"], - "CHUNK_OVERLAP": config["document_processing"]["chunk_overlap"], - "DEFAULT_K": config["document_processing"]["default_k"], - "HOST": config["server"]["host"], - "PORT": config["server"]["port"], - "RELOAD": 
config["server"]["reload"], + # Service settings + "HOST": config["service"]["host"], + "PORT": config["service"]["port"], + "RELOAD": config["service"]["reload"], + # Component selection + "STORAGE_PROVIDER": config["service"]["components"]["storage"], + "DATABASE_PROVIDER": config["service"]["components"]["database"], + "VECTOR_STORE_PROVIDER": config["service"]["components"]["vector_store"], + "EMBEDDING_PROVIDER": config["service"]["components"]["embedding"], + "COMPLETION_PROVIDER": config["service"]["components"]["completion"], + "PARSER_PROVIDER": config["service"]["components"]["parser"], + # Storage settings + "AWS_REGION": config["storage"]["aws"]["region"], + "S3_BUCKET": config["storage"]["aws"]["bucket_name"], + # Database settings + "DATABRIDGE_DB": config["database"]["mongodb"]["database_name"], + "DOCUMENTS_COLLECTION": config["database"]["mongodb"]["documents_collection"], + "CHUNKS_COLLECTION": config["database"]["mongodb"]["chunks_collection"], + # Vector store settings + "VECTOR_INDEX_NAME": config["vector_store"]["mongodb"]["index_name"], + "VECTOR_DIMENSIONS": config["vector_store"]["mongodb"]["dimensions"], + # Model settings + "EMBEDDING_MODEL": config["models"]["embedding"]["model_name"], + "COMPLETION_MODEL": config["models"]["completion"]["model_name"], + "COMPLETION_MAX_TOKENS": config["models"]["completion"]["default_max_tokens"], + "COMPLETION_TEMPERATURE": config["models"]["completion"]["default_temperature"], + # Processing settings + "CHUNK_SIZE": config["processing"]["text"]["chunk_size"], + "CHUNK_OVERLAP": config["processing"]["text"]["chunk_overlap"], + "DEFAULT_K": config["processing"]["text"]["default_k"], + "FRAME_SAMPLE_RATE": config["processing"]["video"]["frame_sample_rate"], + # Auth settings "JWT_ALGORITHM": config["auth"]["jwt_algorithm"], - "FRAME_SAMPLE_RATE": config["video_processing"]["frame_sample_rate"], } return Settings(**settings_dict) diff --git a/core/services/document_service.py 
b/core/services/document_service.py index e45ee0f..ba41146 100644 --- a/core/services/document_service.py +++ b/core/services/document_service.py @@ -14,7 +14,7 @@ from ..models.auth import AuthContext from core.database.base_database import BaseDatabase from core.storage.base_storage import BaseStorage from core.vector_store.base_vector_store import BaseVectorStore -from core.embedding_model.base_embedding_model import BaseEmbeddingModel +from core.embedding.base_embedding_model import BaseEmbeddingModel from core.parser.base_parser import BaseParser from core.completion.base_completion import BaseCompletionModel from core.completion.base_completion import CompletionRequest, CompletionResponse