Mirror of https://github.com/james-m-jordan/morphik-core.git, synced 2025-05-09 19:32:38 +00:00

commit f0c44cb8ea ("bug fixes")
parent 9dcb7b6b1d
@@ -237,7 +237,7 @@ def get_settings() -> Settings:
     )

     # load storage config
-    storage_config = {"STORAGE_PROVIDER": config["storage"]["provider"]}
+    storage_config = {"STORAGE_PROVIDER": config["storage"]["provider"], "STORAGE_PATH": config["storage"]["storage_path"]}
     match storage_config["STORAGE_PROVIDER"]:
         case "local":
             storage_config.update({"STORAGE_PATH": config["storage"]["storage_path"]})
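Note: this change reads STORAGE_PATH unconditionally, so the [storage] section of the config must define storage_path for every provider, not only "local" (the case "local" branch that set it before is now redundant). A minimal sketch of the lookup the new line assumes; the TOML values here are illustrative, not taken from the repo:

    # Illustrative [storage] table; both keys are now required up front.
    config = {"storage": {"provider": "local", "storage_path": "./storage"}}

    storage_config = {
        "STORAGE_PROVIDER": config["storage"]["provider"],
        "STORAGE_PATH": config["storage"]["storage_path"],  # KeyError if storage_path is absent
    }

Using config["storage"].get("storage_path") instead would tolerate configs that omit the key, at the cost of surfacing the problem later.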
@@ -40,7 +40,8 @@ HONEYCOMB_ENABLED = settings.HONEYCOMB_ENABLED

 # Honeycomb configuration - using proxy to avoid exposing API key in code
 # Default to localhost:8080 for the proxy, but allow override from settings
-HONEYCOMB_PROXY_ENDPOINT = getattr(settings, "HONEYCOMB_PROXY_ENDPOINT", "http://localhost:8080")
+HONEYCOMB_PROXY_ENDPOINT = getattr(settings, "HONEYCOMB_PROXY_ENDPOINT", "https://otel-proxy.onrender.com")
+HONEYCOMB_PROXY_ENDPOINT = HONEYCOMB_PROXY_ENDPOINT if isinstance(HONEYCOMB_PROXY_ENDPOINT, str) and len(HONEYCOMB_PROXY_ENDPOINT) > 0 else "https://otel-proxy.onrender.com"
 SERVICE_NAME = settings.SERVICE_NAME

 # Headers for OTLP - no API key needed as the proxy will add it
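Note: the second assignment guards against a HONEYCOMB_PROXY_ENDPOINT setting that exists but is empty or not a string, so every fallback path now lands on the hosted proxy; the "Default to localhost:8080" comment above is stale after this change. A sketch of the combined logic as one hypothetical helper (_proxy_endpoint is not in the codebase):

    DEFAULT_PROXY = "https://otel-proxy.onrender.com"  # new default from the hunk above

    def _proxy_endpoint(settings) -> str:
        # A missing, empty, or non-string setting all fall back to the default.
        value = getattr(settings, "HONEYCOMB_PROXY_ENDPOINT", DEFAULT_PROXY)
        return value if isinstance(value, str) and value else DEFAULT_PROXY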
@@ -236,18 +237,18 @@ class RetryingOTLPMetricExporter(MetricExporter):
                 if retries <= self.max_retries:
                     # Use exponential backoff
                     delay = self.retry_delay * (2 ** (retries - 1))
-                    self.logger.warning(
-                        f"Honeycomb export attempt {retries} failed: {str(e)}. "
-                        f"Retrying in {delay}s..."
-                    )
+                    # self.logger.warning(
+                    #     f"Honeycomb export attempt {retries} failed: {str(e)}. "
+                    #     f"Retrying in {delay}s..."
+                    # )
                     time.sleep(delay)
-                else:
-                    self.logger.error(
-                        f"Failed to export to Honeycomb after {retries} attempts: {str(e)}"
-                    )
+                # else:
+                #     self.logger.error(
+                #         f"Failed to export to Honeycomb after {retries} attempts: {str(e)}"
+                #     )
             except Exception as e:
                 # For non-connection errors, don't retry
-                self.logger.error(f"Unexpected error exporting to Honeycomb: {str(e)}")
+                # self.logger.error(f"Unexpected error exporting to Honeycomb: {str(e)}")
                 return False

             # If we get here, all retries failed
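Note: the retry loop itself is untouched; the backoff still doubles on each attempt, delay = retry_delay * 2 ** (retries - 1), while the commented-out logger calls silence both the per-attempt warning and the final failure error. A quick sketch of the resulting delay schedule, assuming retry_delay = 1 and max_retries = 3 (the real values live on the exporter):

    retry_delay, max_retries = 1, 3  # assumed values for illustration
    for retries in range(1, max_retries + 1):
        delay = retry_delay * (2 ** (retries - 1))
        print(f"attempt {retries}: sleep {delay}s")  # -> 1s, 2s, 4s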
quick_setup.py (182 lines changed)
@@ -74,7 +74,7 @@ def create_s3_bucket(bucket_name, region=DEFAULT_REGION):

     aws_access_key = os.getenv("AWS_ACCESS_KEY")
     aws_secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
-    region = os.getenv("AWS_REGION") if os.getenv("AWS_REGION") else region
+    region = os.getenv("AWS_REGION") or region

     if not aws_access_key or not aws_secret_key:
         LOGGER.error("AWS credentials not found in environment variables.")
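Note: os.getenv("AWS_REGION") or region is equivalent to the old conditional but reads the environment once; in both spellings an empty AWS_REGION="" is falsy and falls back to region. For example:

    import os

    os.environ["AWS_REGION"] = ""  # empty string counts as unset
    region = "us-east-1"           # illustrative default
    region = os.getenv("AWS_REGION") or region
    print(region)                  # -> "us-east-1", not ""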
@@ -112,9 +112,9 @@ def bucket_exists(s3_client, bucket_name):
         return True
     except botocore.exceptions.ClientError as e:
         error_code = int(e.response["Error"]["Code"])
-        if error_code == 404:
+        if error_code in [404, 403]:
             return False
-        raise
+        # raise e


 def setup_mongodb():
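Note: S3's HeadBucket returns 403 when a bucket exists but the caller lacks access, so treating 403 like 404 makes the caller conclude the bucket is missing and try to create it, which will fail against a bucket owned by another account. And with raise commented out, any other ClientError now falls off the end of the except block and the function returns None. A hedged sketch that keeps the three cases distinct (function shape inferred from the hunk above):

    import botocore.exceptions

    def bucket_exists(s3_client, bucket_name):
        try:
            s3_client.head_bucket(Bucket=bucket_name)
            return True
        except botocore.exceptions.ClientError as e:
            error_code = int(e.response["Error"]["Code"])
            if error_code == 404:
                return False  # definitely absent
            if error_code == 403:
                return True   # exists, but we lack access; don't try to create it
            raise             # anything else is a real error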
@@ -180,178 +180,6 @@ def setup_mongodb():
     LOGGER.info("MongoDB connection closed.")


-def setup_postgres():
-    """
-    Set up PostgreSQL database and tables with proper indexes, including initializing a
-    separate multi-vector embeddings table and its associated similarity function without
-    interfering with the existing vector embeddings table.
-    """
-    import asyncio
-
-    from sqlalchemy import text
-    from sqlalchemy.ext.asyncio import create_async_engine
-
-    # Load PostgreSQL URI from .env file
-    postgres_uri = os.getenv("POSTGRES_URI")
-    if not postgres_uri:
-        raise ValueError("POSTGRES_URI not found in .env file.")
-
-    # Check if pgvector is installed when on macOS
-    if platform.system() == "Darwin":
-        try:
-            # Check if postgresql is installed via homebrew
-            result = subprocess.run(
-                ["brew", "list", "postgresql@14"], capture_output=True, text=True
-            )
-            if result.returncode != 0:
-                LOGGER.error(
-                    "PostgreSQL not found. Please install it with: brew install postgresql@14"
-                )
-                raise RuntimeError("PostgreSQL not installed")
-
-            # Check if pgvector is installed
-            result = subprocess.run(["brew", "list", "pgvector"], capture_output=True, text=True)
-            if result.returncode != 0:
-                LOGGER.error(
-                    "\nError: pgvector extension not found. Please install it with:\n"
-                    "brew install pgvector\n"
-                    "brew services stop postgresql@14\n"
-                    "brew services start postgresql@14\n"
-                )
-                raise RuntimeError("pgvector not installed")
-        except FileNotFoundError:
-            LOGGER.error("Homebrew not found. Please install it from https://brew.sh")
-            raise
-
-    async def _setup_postgres():
-        try:
-            # Create async engine
-            engine = create_async_engine(postgres_uri)
-
-            async with engine.begin() as conn:
-                try:
-                    # Enable pgvector extension
-                    await conn.execute(text("CREATE EXTENSION IF NOT EXISTS vector"))
-                    LOGGER.info("Enabled pgvector extension")
-                except Exception as e:
-                    if "could not open extension control file" in str(e):
-                        LOGGER.error(
-                            "\nError: pgvector extension not found. Please install it:\n"
-                            "- On macOS: brew install pgvector\n"
-                            "- On Ubuntu: sudo apt install postgresql-14-pgvector\n"
-                            "- On other systems: check https://github.com/pgvector/pgvector#installation\n"
-                        )
-                    raise
-
-                # Import and create all base tables
-                from core.database.postgres_database import Base
-
-                # Create regular tables first
-                await conn.run_sync(Base.metadata.create_all)
-                LOGGER.info("Created base PostgreSQL tables")
-
-                # Create caches table
-                create_caches_table = """
-                CREATE TABLE IF NOT EXISTS caches (
-                    name TEXT PRIMARY KEY,
-                    metadata JSON NOT NULL,
-                    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
-                    updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
-                )
-                """
-                await conn.execute(text(create_caches_table))
-                LOGGER.info("Created caches table")
-
-                # Get vector dimensions from config
-                dimensions = CONFIG["embedding"]["dimensions"]
-
-                # Drop existing vector index if it exists
-                drop_index_sql = """
-                DROP INDEX IF EXISTS vector_idx;
-                """
-                await conn.execute(text(drop_index_sql))
-
-                # Drop existing vector embeddings table if it exists
-                drop_table_sql = """
-                DROP TABLE IF EXISTS vector_embeddings;
-                """
-                await conn.execute(text(drop_table_sql))
-
-                # Create vector embeddings table with proper vector column
-                create_table_sql = f"""
-                CREATE TABLE vector_embeddings (
-                    id SERIAL PRIMARY KEY,
-                    document_id VARCHAR(255),
-                    chunk_number INTEGER,
-                    content TEXT,
-                    chunk_metadata TEXT,
-                    embedding vector({dimensions}),
-                    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
-                );
-                """
-                await conn.execute(text(create_table_sql))
-                LOGGER.info("Created vector_embeddings table with vector column")
-
-                # Create the vector index
-                index_sql = """
-                CREATE INDEX vector_idx
-                ON vector_embeddings USING ivfflat (embedding vector_l2_ops)
-                WITH (lists = 100);
-                """
-                await conn.execute(text(index_sql))
-                LOGGER.info("Created IVFFlat index on vector_embeddings")
-
-                # Initialize multi-vector embeddings table and associated similarity function
-                drop_multi_vector_sql = """
-                DROP TABLE IF EXISTS multi_vector_embeddings;
-                """
-                await conn.execute(text(drop_multi_vector_sql))
-                LOGGER.info("Dropped existing multi_vector_embeddings table (if any)")
-
-                create_multi_vector_sql = """
-                CREATE TABLE multi_vector_embeddings (
-                    id BIGSERIAL PRIMARY KEY,
-                    embeddings BIT(128)[]
-                );
-                """
-                await conn.execute(text(create_multi_vector_sql))
-                LOGGER.info(
-                    "Created multi_vector_embeddings table with BIT(128)[] embeddings column"
-                )
-
-                create_function_sql = """
-                CREATE OR REPLACE FUNCTION max_sim(document_bits BIT[], query_bits BIT[]) RETURNS double precision AS $$
-                WITH queries AS (
-                    SELECT row_number() OVER () AS query_number, * FROM (SELECT unnest(query_bits) AS query) AS foo
-                ),
-                documents AS (
-                    SELECT unnest(document_bits) AS document
-                ),
-                similarities AS (
-                    SELECT
-                        query_number,
-                        1.0 - (bit_count(document # query)::float / greatest(bit_length(query), 1)::float) AS similarity
-                    FROM queries CROSS JOIN documents
-                ),
-                max_similarities AS (
-                    SELECT MAX(similarity) AS max_similarity FROM similarities GROUP BY query_number
-                )
-                SELECT SUM(max_similarity) FROM max_similarities
-                $$ LANGUAGE SQL;
-                """
-                await conn.execute(text(create_function_sql))
-                LOGGER.info("Created function max_sim for multi-vector similarity computation")
-
-            await engine.dispose()
-            LOGGER.info("PostgreSQL setup completed successfully")
-
-        except Exception as e:
-            LOGGER.error(f"Failed to setup PostgreSQL: {e}")
-            raise
-
-    asyncio.run(_setup_postgres())
-
-
 def setup():
     # Setup S3 if configured
     if STORAGE_PROVIDER == "aws-s3":
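Note: the deleted function carried the only definition of max_sim, which scores a multi-vector match ColBERT-style: for each query vector, take the best bitwise (Hamming) similarity against any document vector, then sum those per-query maxima. Per the log message in the next hunk, this schema is now expected to be created during database initialization instead. A plain-Python restatement of what the dropped SQL computed, with ints standing in for BIT(128) values:

    def max_sim(document_bits: list[int], query_bits: list[int], width: int = 128) -> float:
        """Late-interaction MaxSim over binary embeddings, as in the deleted SQL."""
        def sim(d: int, q: int) -> float:
            return 1.0 - bin(d ^ q).count("1") / width  # 1 - normalized Hamming distance
        return sum(max(sim(d, q) for d in document_bits) for q in query_bits)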
@@ -366,9 +194,7 @@ def setup():
             setup_mongodb()
             LOGGER.info("MongoDB setup completed.")
         case "postgres":
-            LOGGER.info("Setting up PostgreSQL...")
-            setup_postgres()
-            LOGGER.info("PostgreSQL setup completed.")
+            LOGGER.info("Postgres is setup on database intialization - nothing to do here!")
         case _:
             LOGGER.error(f"Unsupported database provider: {DATABASE_PROVIDER}")
             raise ValueError(f"Unsupported database provider: {DATABASE_PROVIDER}")
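Note: with setup_postgres() gone, the postgres case is a pure no-op, so the only way to confirm the schema really was created at database initialization is to probe the server. A hedged sketch using the same async-engine pattern the deleted code used; it assumes POSTGRES_URI points at an async driver (e.g. postgresql+asyncpg://):

    import asyncio, os
    from sqlalchemy import text
    from sqlalchemy.ext.asyncio import create_async_engine

    async def check_postgres(postgres_uri: str) -> None:
        engine = create_async_engine(postgres_uri)
        async with engine.begin() as conn:
            # Probe for the pgvector extension and the multi-vector table.
            ext = await conn.execute(text("SELECT 1 FROM pg_extension WHERE extname = 'vector'"))
            print("pgvector installed:", ext.scalar() is not None)
            tbl = await conn.execute(text("SELECT to_regclass('multi_vector_embeddings') IS NOT NULL"))
            print("multi_vector_embeddings exists:", tbl.scalar())
        await engine.dispose()

    # asyncio.run(check_postgres(os.getenv("POSTGRES_URI")))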