Change in max_sim function for newer PostgreSQL versions (#54)

* add filename option for text documents

* fix: duplicated env variables

* Include chunk score in query response

* Don't initialize ColPali store if one is not provided

* Add score to ChunkSource

* Add Hugging Face cache volume to avoid re-downloads

* Replace the <~> operator with bit_count(document # query) in the max_sim similarity calculation

* Add dev_mode setup to default configuration

* Dump with databridge documents (no pdfs)

* Add dev_mode setup to default configuration

* Add comment with Docker instructions for the API host

* Use empty dump file
This commit is contained in:
LukeZekes 2025-03-18 13:41:39 -05:00 committed by GitHub
parent 9df9196d51
commit 5d76521059
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 41 additions and 5 deletions

View File

@ -132,7 +132,10 @@ class MultiVectorStore(BaseVectorStore):
SELECT unnest(document) AS document
),
similarities AS (
SELECT query_number, 1 - ((document <~> query) / bit_length(query)) AS similarity FROM queries CROSS JOIN documents
SELECT
query_number,
1.0 - (bit_count(document # query)::float / greatest(bit_length(query), 1)::float) AS similarity
FROM queries CROSS JOIN documents
),
max_similarities AS (
SELECT MAX(similarity) AS max_similarity FROM similarities GROUP BY query_number

View File

@ -1,5 +1,5 @@
[api]
host = "localhost"
host = "localhost" # Needs to be "0.0.0.0" for docker
port = 8000
reload = true

View File

@ -1,6 +1,20 @@
version: '3.8'
x-ollama-check: &ollama-check
# This command reads the toml file and checks if any provider is set to "ollama"
command: >
/bin/sh -c '
grep -q "provider *= *\"ollama\"" databridge.toml &&
echo "true" > /tmp/needs_ollama ||
echo "false" > /tmp/needs_ollama'
services:
config-check:
image: alpine
volumes:
- ./databridge.toml:/databridge.toml
<<: *ollama-check
databridge:
build: .
ports:
@ -16,11 +30,15 @@ services:
- ./storage:/app/storage
- ./logs:/app/logs
- ./databridge.toml:/app/databridge.toml
- huggingface_cache:/root/.cache/huggingface
depends_on:
postgres:
condition: service_healthy
config-check:
condition: service_completed_successfully
ollama:
condition: service_started
required: false
networks:
- databridge-network
env_file:
@ -51,6 +69,8 @@ services:
ollama:
image: ollama/ollama:latest
profiles:
- ollama
ports:
- "11434:11434"
volumes:
@ -66,4 +86,5 @@ networks:
volumes:
postgres_data:
ollama_data:
ollama_data:
huggingface_cache:

View File

@ -65,6 +65,10 @@ reload = false\n\
\n\
[auth]\n\
jwt_algorithm = "HS256"\n\
dev_mode = true\n\
dev_entity_id = "dev_user"\n\
dev_entity_type = "developer"\n\
dev_permissions = ["read", "write", "admin"]\n\
\n\
[completion]\n\
provider = "ollama"\n\

0
dump.sql Normal file
View File

View File

@ -60,7 +60,9 @@ CREATE OR REPLACE FUNCTION max_sim(document bit[], query bit[]) RETURNS double p
SELECT unnest(document) AS document
),
similarities AS (
SELECT query_number, 1 - ((document <~> query) / bit_length(query)) AS similarity
SELECT
query_number,
1.0 - (bit_count(document # query)::float / greatest(bit_length(query), 1)::float) AS similarity
FROM queries CROSS JOIN documents
),
max_similarities AS (

View File

@ -20,3 +20,6 @@ RUN apk del git build-base clang llvm postgresql-dev \
# Copy initialization scripts
COPY init.sql /docker-entrypoint-initdb.d/
# Copy data dump
COPY dump.sql /tmp/dump.sql

View File

@ -328,7 +328,10 @@ def setup_postgres():
SELECT unnest(document_bits) AS document
),
similarities AS (
SELECT query_number, 1 - ((document <~> query) / bit_length(query)) AS similarity FROM queries CROSS JOIN documents
SELECT
query_number,
1.0 - (bit_count(document # query)::float / greatest(bit_length(query), 1)::float) AS similarity
FROM queries CROSS JOIN documents
),
max_similarities AS (
SELECT MAX(similarity) AS max_similarity FROM similarities GROUP BY query_number