bug fixes

This commit is contained in:
Arnav Agrawal 2025-03-26 17:07:38 -07:00
parent 9dcb7b6b1d
commit f0c44cb8ea
3 changed files with 16 additions and 189 deletions

View File

@ -237,7 +237,7 @@ def get_settings() -> Settings:
)
# load storage config
storage_config = {"STORAGE_PROVIDER": config["storage"]["provider"]}
storage_config = {"STORAGE_PROVIDER": config["storage"]["provider"], "STORAGE_PATH": config["storage"]["storage_path"]}
match storage_config["STORAGE_PROVIDER"]:
case "local":
storage_config.update({"STORAGE_PATH": config["storage"]["storage_path"]})

View File

@ -40,7 +40,8 @@ HONEYCOMB_ENABLED = settings.HONEYCOMB_ENABLED
# Honeycomb configuration - using proxy to avoid exposing API key in code
# Default to localhost:8080 for the proxy, but allow override from settings
HONEYCOMB_PROXY_ENDPOINT = getattr(settings, "HONEYCOMB_PROXY_ENDPOINT", "http://localhost:8080")
HONEYCOMB_PROXY_ENDPOINT = getattr(settings, "HONEYCOMB_PROXY_ENDPOINT", "https://otel-proxy.onrender.com")
HONEYCOMB_PROXY_ENDPOINT = HONEYCOMB_PROXY_ENDPOINT if isinstance(HONEYCOMB_PROXY_ENDPOINT, str) and len(HONEYCOMB_PROXY_ENDPOINT) > 0 else "https://otel-proxy.onrender.com"
SERVICE_NAME = settings.SERVICE_NAME
# Headers for OTLP - no API key needed as the proxy will add it
@ -236,18 +237,18 @@ class RetryingOTLPMetricExporter(MetricExporter):
if retries <= self.max_retries:
# Use exponential backoff
delay = self.retry_delay * (2 ** (retries - 1))
self.logger.warning(
f"Honeycomb export attempt {retries} failed: {str(e)}. "
f"Retrying in {delay}s..."
)
# self.logger.warning(
# f"Honeycomb export attempt {retries} failed: {str(e)}. "
# f"Retrying in {delay}s..."
# )
time.sleep(delay)
else:
self.logger.error(
f"Failed to export to Honeycomb after {retries} attempts: {str(e)}"
)
# else:
# self.logger.error(
# f"Failed to export to Honeycomb after {retries} attempts: {str(e)}"
# )
except Exception as e:
# For non-connection errors, don't retry
self.logger.error(f"Unexpected error exporting to Honeycomb: {str(e)}")
# self.logger.error(f"Unexpected error exporting to Honeycomb: {str(e)}")
return False
# If we get here, all retries failed

View File

@ -74,7 +74,7 @@ def create_s3_bucket(bucket_name, region=DEFAULT_REGION):
aws_access_key = os.getenv("AWS_ACCESS_KEY")
aws_secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
region = os.getenv("AWS_REGION") if os.getenv("AWS_REGION") else region
region = os.getenv("AWS_REGION") or region
if not aws_access_key or not aws_secret_key:
LOGGER.error("AWS credentials not found in environment variables.")
@ -112,9 +112,9 @@ def bucket_exists(s3_client, bucket_name):
return True
except botocore.exceptions.ClientError as e:
error_code = int(e.response["Error"]["Code"])
if error_code == 404:
if error_code in [404, 403]:
return False
raise
# raise e
def setup_mongodb():
@ -180,178 +180,6 @@ def setup_mongodb():
LOGGER.info("MongoDB connection closed.")
def setup_postgres():
    """
    Set up PostgreSQL database and tables with proper indexes, including initializing a
    separate multi-vector embeddings table and its associated similarity function without
    interfering with the existing vector embeddings table.

    Reads POSTGRES_URI from the environment and raises ValueError if it is missing.
    On macOS, verifies (via Homebrew) that postgresql@14 and pgvector are installed
    before attempting any database work.

    NOTE(review): this is destructive for vector data — it DROPs and recreates both
    the vector_embeddings and multi_vector_embeddings tables on every run.
    """
    # Imports are local so the module can be imported without the async DB stack.
    import asyncio
    from sqlalchemy import text
    from sqlalchemy.ext.asyncio import create_async_engine

    # Load PostgreSQL URI from .env file
    postgres_uri = os.getenv("POSTGRES_URI")
    if not postgres_uri:
        raise ValueError("POSTGRES_URI not found in .env file.")

    # Check if pgvector is installed when on macOS
    # (Homebrew is the only install path checked; other macOS setups will fail here.)
    if platform.system() == "Darwin":
        try:
            # Check if postgresql is installed via homebrew
            result = subprocess.run(
                ["brew", "list", "postgresql@14"], capture_output=True, text=True
            )
            if result.returncode != 0:
                LOGGER.error(
                    "PostgreSQL not found. Please install it with: brew install postgresql@14"
                )
                raise RuntimeError("PostgreSQL not installed")

            # Check if pgvector is installed
            result = subprocess.run(["brew", "list", "pgvector"], capture_output=True, text=True)
            if result.returncode != 0:
                LOGGER.error(
                    "\nError: pgvector extension not found. Please install it with:\n"
                    "brew install pgvector\n"
                    "brew services stop postgresql@14\n"
                    "brew services start postgresql@14\n"
                )
                raise RuntimeError("pgvector not installed")
        except FileNotFoundError:
            # `brew` executable itself is missing.
            LOGGER.error("Homebrew not found. Please install it from https://brew.sh")
            raise

    async def _setup_postgres():
        # All DDL runs inside a single transaction via engine.begin().
        try:
            # Create async engine
            engine = create_async_engine(postgres_uri)
            async with engine.begin() as conn:
                try:
                    # Enable pgvector extension
                    await conn.execute(text("CREATE EXTENSION IF NOT EXISTS vector"))
                    LOGGER.info("Enabled pgvector extension")
                except Exception as e:
                    # Give an actionable install hint for the common failure mode,
                    # then re-raise regardless of the message match.
                    if "could not open extension control file" in str(e):
                        LOGGER.error(
                            "\nError: pgvector extension not found. Please install it:\n"
                            "- On macOS: brew install pgvector\n"
                            "- On Ubuntu: sudo apt install postgresql-14-pgvector\n"
                            "- On other systems: check https://github.com/pgvector/pgvector#installation\n"
                        )
                    raise

                # Import and create all base tables
                from core.database.postgres_database import Base

                # Create regular tables first
                await conn.run_sync(Base.metadata.create_all)
                LOGGER.info("Created base PostgreSQL tables")

                # Create caches table
                create_caches_table = """
                CREATE TABLE IF NOT EXISTS caches (
                    name TEXT PRIMARY KEY,
                    metadata JSON NOT NULL,
                    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
                    updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
                )
                """
                await conn.execute(text(create_caches_table))
                LOGGER.info("Created caches table")

                # Get vector dimensions from config
                dimensions = CONFIG["embedding"]["dimensions"]

                # Drop existing vector index if it exists
                drop_index_sql = """
                DROP INDEX IF EXISTS vector_idx;
                """
                await conn.execute(text(drop_index_sql))

                # Drop existing vector embeddings table if it exists
                # (destructive: any stored embeddings are lost)
                drop_table_sql = """
                DROP TABLE IF EXISTS vector_embeddings;
                """
                await conn.execute(text(drop_table_sql))

                # Create vector embeddings table with proper vector column
                create_table_sql = f"""
                CREATE TABLE vector_embeddings (
                    id SERIAL PRIMARY KEY,
                    document_id VARCHAR(255),
                    chunk_number INTEGER,
                    content TEXT,
                    chunk_metadata TEXT,
                    embedding vector({dimensions}),
                    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
                );
                """
                await conn.execute(text(create_table_sql))
                LOGGER.info("Created vector_embeddings table with vector column")

                # Create the vector index (IVFFlat, L2 distance, 100 lists)
                index_sql = """
                CREATE INDEX vector_idx
                ON vector_embeddings USING ivfflat (embedding vector_l2_ops)
                WITH (lists = 100);
                """
                await conn.execute(text(index_sql))
                LOGGER.info("Created IVFFlat index on vector_embeddings")

                # Initialize multi-vector embeddings table and associated similarity function
                drop_multi_vector_sql = """
                DROP TABLE IF EXISTS multi_vector_embeddings;
                """
                await conn.execute(text(drop_multi_vector_sql))
                LOGGER.info("Dropped existing multi_vector_embeddings table (if any)")

                # Embeddings are stored as arrays of 128-bit bit-strings.
                create_multi_vector_sql = """
                CREATE TABLE multi_vector_embeddings (
                    id BIGSERIAL PRIMARY KEY,
                    embeddings BIT(128)[]
                );
                """
                await conn.execute(text(create_multi_vector_sql))
                LOGGER.info(
                    "Created multi_vector_embeddings table with BIT(128)[] embeddings column"
                )

                # max_sim: for each query bit-vector, take the best Hamming-based
                # similarity against any document bit-vector, then sum over queries
                # (late-interaction / ColBERT-style MaxSim scoring).
                create_function_sql = """
                CREATE OR REPLACE FUNCTION max_sim(document_bits BIT[], query_bits BIT[]) RETURNS double precision AS $$
                WITH queries AS (
                    SELECT row_number() OVER () AS query_number, * FROM (SELECT unnest(query_bits) AS query) AS foo
                ),
                documents AS (
                    SELECT unnest(document_bits) AS document
                ),
                similarities AS (
                    SELECT
                        query_number,
                        1.0 - (bit_count(document # query)::float / greatest(bit_length(query), 1)::float) AS similarity
                    FROM queries CROSS JOIN documents
                ),
                max_similarities AS (
                    SELECT MAX(similarity) AS max_similarity FROM similarities GROUP BY query_number
                )
                SELECT SUM(max_similarity) FROM max_similarities
                $$ LANGUAGE SQL;
                """
                await conn.execute(text(create_function_sql))
                LOGGER.info("Created function max_sim for multi-vector similarity computation")

            await engine.dispose()
            LOGGER.info("PostgreSQL setup completed successfully")
        except Exception as e:
            LOGGER.error(f"Failed to setup PostgreSQL: {e}")
            raise

    asyncio.run(_setup_postgres())
def setup():
# Setup S3 if configured
if STORAGE_PROVIDER == "aws-s3":
@ -366,9 +194,7 @@ def setup():
setup_mongodb()
LOGGER.info("MongoDB setup completed.")
case "postgres":
LOGGER.info("Setting up PostgreSQL...")
setup_postgres()
LOGGER.info("PostgreSQL setup completed.")
LOGGER.info("Postgres is set up on database initialization - nothing to do here!")
case _:
LOGGER.error(f"Unsupported database provider: {DATABASE_PROVIDER}")
raise ValueError(f"Unsupported database provider: {DATABASE_PROVIDER}")