[api]
host = "0.0.0.0"
port = 8000
reload = true
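# Note (assumed semantics, following the usual dev-server convention):
# reload = true auto-restarts the server on code changes; set it to false in production.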

[auth]
jwt_algorithm = "HS256"
dev_mode = true # Enabled by default for easier local development
dev_entity_id = "dev_user" # Default dev user ID
dev_entity_type = "developer" # Default dev entity type
dev_permissions = ["read", "write", "admin"] # Default dev permissions
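# Sketch of the intended behavior (assumed, not the exact middleware logic):
# with dev_mode = true the server skips JWT verification and treats every
# request as coming from dev_entity_id with dev_permissions. Set
# dev_mode = false and use jwt_algorithm-signed tokens for anything beyond
# local development.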

#### Registered models
[registered_models]
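# Each key below is a local alias mapped to an inline table. As used in this
# file: model_name is required, while api_base, api_version, deployment_id,
# and vision = true are optional, provider-specific fields.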

# OpenAI models
openai_gpt4o = { model_name = "gpt-4o", vision = true }
openai_gpt4 = { model_name = "gpt-4" }

# Azure OpenAI models
azure_gpt4 = { model_name = "gpt-4", api_base = "YOUR_AZURE_URL_HERE", api_version = "2023-05-15", deployment_id = "gpt-4-deployment" }
azure_gpt35 = { model_name = "gpt-3.5-turbo", api_base = "YOUR_AZURE_URL_HERE", api_version = "2023-05-15", deployment_id = "gpt-35-turbo-deployment" }
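# Replace YOUR_AZURE_URL_HERE with your Azure OpenAI endpoint, which normally
# has the following shape (the resource name here is a hypothetical placeholder):
# api_base = "https://my-resource.openai.azure.com"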

# Anthropic models
claude_opus = { model_name = "claude-3-opus-20240229" }
claude_sonnet = { model_name = "claude-3-sonnet-20240229" }

# Ollama models
ollama_llama = { model_name = "ollama_chat/llama3.2", api_base = "http://localhost:11434" }
ollama_llama_vision = { model_name = "ollama_chat/llama3.2-vision", api_base = "http://localhost:11434", vision = true }

# If Morphik is running in Docker but Ollama is running locally on the host, use the following:
ollama_llama_docker = { model_name = "ollama_chat/llama3.2", api_base = "http://host.docker.internal:11434" }
ollama_llama_vision_docker = { model_name = "ollama_chat/llama3.2-vision", api_base = "http://host.docker.internal:11434", vision = true }

# If Morphik and Ollama are both running in Docker, use the following:
ollama_llama_docker_docker = { model_name = "ollama_chat/llama3.2", api_base = "http://ollama:11434" }
ollama_llama_vision_docker_docker = { model_name = "ollama_chat/llama3.2-vision", api_base = "http://ollama:11434", vision = true }
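# Why the variants differ: host.docker.internal lets a container reach
# services on the host machine (built into Docker Desktop; on Linux it may
# need an extra host-gateway mapping), while "ollama" assumes a container
# with that name or service alias on the same Docker network.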

# Embedding models
openai_embedding = { model_name = "text-embedding-3-small" }
openai_embedding_large = { model_name = "text-embedding-3-large" }
azure_embedding = { model_name = "text-embedding-ada-002", api_base = "YOUR_AZURE_URL_HERE", api_version = "2023-05-15", deployment_id = "embedding-ada-002" }
ollama_embedding = { model_name = "ollama/nomic-embed-text", api_base = "http://localhost:11434" }

# If Morphik is running in Docker but Ollama is running locally on the host, use the following:
ollama_embedding_docker = { model_name = "ollama/nomic-embed-text", api_base = "http://host.docker.internal:11434" }

# If Morphik and Ollama are both running in Docker, use the following:
ollama_embedding_docker_docker = { model_name = "ollama/nomic-embed-text", api_base = "http://ollama:11434" }

#### Component configurations ####

[completion]
model = "ollama_llama_vision" # Reference to a key in registered_models
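# Any alias from [registered_models] works here, e.g.:
# model = "openai_gpt4o"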
default_max_tokens = 1000
default_temperature = 0.5

[database]
provider = "postgres"

[embedding]
model = "ollama_embedding" # Reference to registered model
dimensions = 768
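# dimensions must match the embedding model's output size: nomic-embed-text
# produces 768-dim vectors; switch to 1536 for openai_embedding
# (text-embedding-3-small) or 3072 for openai_embedding_large
# (text-embedding-3-large).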
similarity_metric = "cosine"

[parser]
chunk_size = 1000
chunk_overlap = 200
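# With chunk_size = 1000 and chunk_overlap = 200, each chunk repeats the last
# 200 units of the previous one, so context is not lost at chunk boundaries.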
use_unstructured_api = false
use_contextual_chunking = false
contextual_chunking_model = "ollama_llama" # Reference to a key in registered_models

[parser.vision]
model = "ollama_llama_vision" # Reference to a key in registered_models
frame_sample_rate = -1 # Set to -1 to disable frame captioning

[reranker]
use_reranker = true
provider = "flag"
model_name = "BAAI/bge-reranker-large"
query_max_length = 256
passage_max_length = 512
use_fp16 = true
device = "mps" # use "cpu" when running in Docker on a Mac, "cuda" on a CUDA-enabled device
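# "mps" targets Apple-silicon GPUs via Metal and is unavailable inside Linux
# containers, which is why the "cpu" fallback is needed for Docker on a Mac.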

[storage]
provider = "local"
storage_path = "./storage"

# [storage]
# provider = "aws-s3"
# region = "us-east-2"
# bucket_name = "morphik-s3-storage"
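# To switch to S3, comment out the local [storage] block above and uncomment
# this one; AWS credentials are presumably picked up from the standard
# environment/credential chain rather than from this file.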

[vector_store]
provider = "pgvector"

[rules]
model = "ollama_llama"
batch_size = 4096

[morphik]
enable_colpali = true
mode = "self_hosted" # "cloud" or "self_hosted"
api_domain = "api.morphik.ai" # API domain for cloud URIs

[redis]
host = "localhost"
port = 6379

[graph]
model = "ollama_llama"
enable_entity_resolution = true
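# enable_entity_resolution presumably merges different surface forms of the
# same entity (e.g. "USA" and "United States") into a single graph node.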

[telemetry]
enabled = true
honeycomb_enabled = true
honeycomb_endpoint = "https://api.honeycomb.io"
honeycomb_proxy_endpoint = "https://otel-proxy.onrender.com"
service_name = "databridge-core"
otlp_timeout = 10
otlp_max_retries = 3
otlp_retry_delay = 1
otlp_max_export_batch_size = 512
otlp_schedule_delay_millis = 5000