# morphik-core/config.toml
# Arnav Agrawal c3726504f7
# add support for PostgreSQL and pgvector (#15)
# Co-authored-by: Adityavardhan Agrawal <aa729@cornell.edu>
# 2025-01-04 08:14:52 -05:00

# 85 lines
# 2.1 KiB
# TOML

# Core Service Components
# Address settings for the service itself.
[service]
host = "localhost"
port = 8000
reload = false

# One backend name per role. Each option list below shows the values the
# original annotations mention; presumably the chosen value selects the
# matching per-backend table elsewhere in this file (verify against the loader).
[service.components]
# Options: "local", "aws-s3".
# NOTE(review): the option is spelled "aws-s3" but its settings live under
# [storage.aws]; confirm the loader maps one to the other.
storage = "local"
# Options: "postgres", "mongodb".
database = "postgres"
# Options: "pgvector", "mongodb".
vector_store = "pgvector"
# Options: "openai", "ollama".
embedding = "ollama"
# Options: "openai", "ollama".
completion = "ollama"
# Options: "combined", "unstructured", "contextual".
parser = "combined"
# Options: "bge".
reranker = "bge"

# Storage Configuration
# Settings for the "local" storage backend.
[storage.local]
path = "./storage"

# Settings for the AWS S3 storage backend.
[storage.aws]
region = "us-east-2"
bucket_name = "databridge-s3-storage"

# Database Configuration
# PostgreSQL: database name plus the tables holding documents and chunks.
[database.postgres]
database_name = "databridge"
documents_table = "documents"
chunks_table = "document_chunks"

# MongoDB: database name plus the collections holding documents and chunks.
[database.mongodb]
database_name = "databridge"
documents_collection = "documents"
chunks_collection = "document_chunks"

# Vector Store Configuration
# `dimensions` depends on the embedding model in use:
# 768 for nomic-embed-text, 1536 for text-embedding-3-small.
[vector_store.mongodb]
dimensions = 768
index_name = "vector_index"
similarity_metric = "cosine"

[vector_store.pgvector]
# 768 for nomic-embed-text, 1536 for text-embedding-3-small.
dimensions = 768
table_name = "vector_embeddings"
# Index type: "ivfflat" or "hnsw".
index_method = "ivfflat"
# ivfflat tuning: number of lists for the index, and number of probes
# used at search time.
index_lists = 100
probes = 10

# Model Configurations
[models.embedding]
# Choices: "text-embedding-3-small", "nomic-embed-text".
model_name = "nomic-embed-text"

[models.completion]
# Examples: "gpt-4o-mini", "llama3.1", etc.
model_name = "llama3.2"
default_max_tokens = 1000
default_temperature = 0.7

[models.ollama]
base_url = "http://localhost:11434"

[models.reranker]
# Choices: "BAAI/bge-reranker-v2-gemma", "BAAI/bge-reranker-large".
model_name = "BAAI/bge-reranker-large"
# Optional compute device, e.g. "mps" or "cuda:0". TOML has no null value,
# so to run on CPU delete this key instead of assigning null.
device = "mps"
use_fp16 = true
query_max_length = 256
passage_max_length = 512

# Document Processing
[processing.text]
# NOTE(review): the unit of the chunk settings (characters vs. tokens) is not
# stated in this file; confirm against the parser.
chunk_size = 1000
chunk_overlap = 200
default_k = 4
# Whether to use reranking by default.
use_reranking = true

[processing.video]
frame_sample_rate = 120

[processing.unstructured]
use_api = false

# Authentication
# Algorithm name used for JWTs.
[auth]
jwt_algorithm = "HS256"