From 5d7652105966517c0230ccf8a7368f1f746cb0bc Mon Sep 17 00:00:00 2001 From: LukeZekes <41444778+LukeZekes@users.noreply.github.com> Date: Tue, 18 Mar 2025 13:41:39 -0500 Subject: [PATCH] Change in max_sim function for newer postgresql versions (#54) * add filename option for text documents * fix: duplicated env variables * Include chunk score in query response * don't initialize colpali store if one is not provided * Add score to ChunkSource * Add huggingface cache volume to avoid redownloads * Replace <~> in maximum_similarity calculation * Add dev_mode setup to default configuration * Dump with databridge documents (no pdfs) * Add dev_mode setup to default configuration * Comment docker instructions for api host * Use empty dump file --- core/vector_store/multi_vector_store.py | 5 ++++- databridge.toml | 2 +- docker-compose.yml | 23 ++++++++++++++++++++++- dockerfile | 4 ++++ dump.sql | 0 init.sql | 4 +++- postgres.dockerfile | 3 +++ quick_setup.py | 5 ++++- 8 files changed, 41 insertions(+), 5 deletions(-) create mode 100644 dump.sql diff --git a/core/vector_store/multi_vector_store.py b/core/vector_store/multi_vector_store.py index 692830c..9eecf8d 100644 --- a/core/vector_store/multi_vector_store.py +++ b/core/vector_store/multi_vector_store.py @@ -132,7 +132,10 @@ class MultiVectorStore(BaseVectorStore): SELECT unnest(document) AS document ), similarities AS ( - SELECT query_number, 1 - ((document <~> query) / bit_length(query)) AS similarity FROM queries CROSS JOIN documents + SELECT + query_number, + 1.0 - (bit_count(document # query)::float / greatest(bit_length(query), 1)::float) AS similarity + FROM queries CROSS JOIN documents ), max_similarities AS ( SELECT MAX(similarity) AS max_similarity FROM similarities GROUP BY query_number diff --git a/databridge.toml b/databridge.toml index 1e91590..6e14c74 100644 --- a/databridge.toml +++ b/databridge.toml @@ -1,5 +1,5 @@ [api] -host = "localhost" +host = "localhost" # Needs to be "0.0.0.0" for docker port = 8000 reload = true diff --git a/docker-compose.yml b/docker-compose.yml index a95934e..89678c1 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,6 +1,20 @@ version: '3.8' +x-ollama-check: &ollama-check + # This command reads the toml file and checks if any provider is set to "ollama" + command: > + /bin/sh -c ' + grep -q "provider *= *\"ollama\"" databridge.toml && + echo "true" > /tmp/needs_ollama || + echo "false" > /tmp/needs_ollama' + services: + config-check: + image: alpine + volumes: + - ./databridge.toml:/databridge.toml + <<: *ollama-check + databridge: build: . ports: @@ -16,11 +30,15 @@ services: - ./storage:/app/storage - ./logs:/app/logs - ./databridge.toml:/app/databridge.toml + - huggingface_cache:/root/.cache/huggingface depends_on: postgres: condition: service_healthy + config-check: + condition: service_completed_successfully ollama: condition: service_started + required: false networks: - databridge-network env_file: @@ -51,6 +69,8 @@ services: ollama: image: ollama/ollama:latest + profiles: + - ollama ports: - "11434:11434" volumes: @@ -66,4 +86,5 @@ networks: volumes: postgres_data: - ollama_data: + ollama_data: + huggingface_cache: diff --git a/dockerfile b/dockerfile index 5856a70..d475474 100644 --- a/dockerfile +++ b/dockerfile @@ -65,6 +65,10 @@ reload = false\n\ \n\ [auth]\n\ jwt_algorithm = "HS256"\n\ +dev_mode = true\n\ +dev_entity_id = "dev_user"\n\ +dev_entity_type = "developer"\n\ +dev_permissions = ["read", "write", "admin"]\n\ \n\ [completion]\n\ provider = "ollama"\n\ diff --git a/dump.sql b/dump.sql new file mode 100644 index 0000000..e69de29 diff --git a/init.sql b/init.sql index 698a840..7792c76 100644 --- a/init.sql +++ b/init.sql @@ -60,7 +60,9 @@ CREATE OR REPLACE FUNCTION max_sim(document bit[], query bit[]) RETURNS double p SELECT unnest(document) AS document ), similarities AS ( - SELECT query_number, 1 - ((document <~> query) / bit_length(query)) AS similarity + SELECT + query_number, + 1.0 - (bit_count(document # query)::float / greatest(bit_length(query), 1)::float) AS similarity FROM queries CROSS JOIN documents ), max_similarities AS ( diff --git a/postgres.dockerfile b/postgres.dockerfile index ca92690..4b72bf3 100644 --- a/postgres.dockerfile +++ b/postgres.dockerfile @@ -20,3 +20,6 @@ RUN apk del git build-base clang llvm postgresql-dev \ # Copy initialization scripts COPY init.sql /docker-entrypoint-initdb.d/ + +# Copy data dump +COPY dump.sql /tmp/dump.sql \ No newline at end of file diff --git a/quick_setup.py b/quick_setup.py index 498757d..081025a 100644 --- a/quick_setup.py +++ b/quick_setup.py @@ -328,7 +328,10 @@ def setup_postgres(): SELECT unnest(document_bits) AS document ), similarities AS ( - SELECT query_number, 1 - ((document <~> query) / bit_length(query)) AS similarity FROM queries CROSS JOIN documents + SELECT + query_number, + 1.0 - (bit_count(document # query)::float / greatest(bit_length(query), 1)::float) AS similarity + FROM queries CROSS JOIN documents ), max_similarities AS ( SELECT MAX(similarity) AS max_similarity FROM similarities GROUP BY query_number