import argparse
import logging
import os
from pathlib import Path

import boto3
import botocore
import tomli  # for reading toml files
from dotenv import find_dotenv, load_dotenv
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure, OperationFailure
from pymongo.operations import SearchIndexModel
2024-12-23 13:22:21 -05:00
|
|
|
|
|
|
|
# Force reload of environment variables
|
|
|
|
load_dotenv(find_dotenv(), override=True)
|
|
|
|
|
|
|
|
# Set up argument parser
|
2024-12-26 11:34:24 -05:00
|
|
|
parser = argparse.ArgumentParser(description="Setup S3 bucket and MongoDB collections")
|
|
|
|
parser.add_argument("--debug", action="store_true", help="Enable debug logging")
|
2024-12-29 12:48:41 +05:30
|
|
|
parser.add_argument("--quiet", action="store_true", help="Only show warning and error logs")
|
2024-12-23 13:22:21 -05:00
|
|
|
args = parser.parse_args()
|
|
|
|
|
|
|
|
# Configure logging based on command line arguments
|
|
|
|
LOGGER = logging.getLogger(__name__)
|
2025-01-01 09:18:23 -05:00
|
|
|
match (args.debug, args.quiet):
|
|
|
|
case (True, _):
|
|
|
|
LOGGER.setLevel(logging.DEBUG)
|
|
|
|
case (_, True):
|
|
|
|
LOGGER.setLevel(logging.WARNING)
|
|
|
|
case _:
|
|
|
|
LOGGER.setLevel(logging.INFO)
|
2024-12-23 13:22:21 -05:00
|
|
|
|
|
|
|
# Add console handler with formatting
|
|
|
|
console_handler = logging.StreamHandler()
|
2024-12-26 11:34:24 -05:00
|
|
|
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
|
2024-12-23 13:22:21 -05:00
|
|
|
console_handler.setFormatter(formatter)
|
|
|
|
LOGGER.addHandler(console_handler)
|
|
|
|
|
|
|
|
# Load configuration from config.toml
|
|
|
|
config_path = Path("config.toml")
|
|
|
|
with open(config_path, "rb") as f:
|
|
|
|
CONFIG = tomli.load(f)
|
|
|
|
LOGGER.info("Loaded configuration from config.toml")
|
|
|
|
|
|
|
|
# Extract configuration values
|
2024-12-31 16:55:51 +05:30
|
|
|
STORAGE_PROVIDER = CONFIG["service"]["components"]["storage"]
|
2025-01-04 18:41:09 +05:30
|
|
|
DATABASE_PROVIDER = CONFIG["service"]["components"]["database"]
|
|
|
|
DATABASE_NAME = CONFIG["database"][DATABASE_PROVIDER]["database_name"]
|
|
|
|
|
|
|
|
# MongoDB specific config
|
2024-12-27 12:17:16 +05:30
|
|
|
DOCUMENTS_COLLECTION = CONFIG["database"]["mongodb"]["documents_collection"]
|
|
|
|
CHUNKS_COLLECTION = CONFIG["database"]["mongodb"]["chunks_collection"]
|
|
|
|
VECTOR_DIMENSIONS = CONFIG["vector_store"]["mongodb"]["dimensions"]
|
|
|
|
VECTOR_INDEX_NAME = CONFIG["vector_store"]["mongodb"]["index_name"]
|
|
|
|
SIMILARITY_METRIC = CONFIG["vector_store"]["mongodb"]["similarity_metric"]
|
2024-12-23 13:22:21 -05:00
|
|
|
|
2025-01-04 18:41:09 +05:30
|
|
|
# PostgreSQL specific config
|
|
|
|
DOCUMENTS_TABLE = CONFIG["database"]["postgres"]["documents_table"]
|
|
|
|
CHUNKS_TABLE = CONFIG["database"]["postgres"]["chunks_table"]
|
|
|
|
|
2024-12-31 16:55:51 +05:30
|
|
|
# Extract storage-specific configuration
|
|
|
|
DEFAULT_REGION = CONFIG["storage"]["aws"]["region"] if STORAGE_PROVIDER == "aws-s3" else None
|
|
|
|
DEFAULT_BUCKET_NAME = (
|
|
|
|
CONFIG["storage"]["aws"]["bucket_name"] if STORAGE_PROVIDER == "aws-s3" else None
|
|
|
|
)
|
|
|
|
|
2024-12-26 11:34:24 -05:00
|
|
|
|
2024-12-23 13:22:21 -05:00
|
|
|
def create_s3_bucket(bucket_name, region=DEFAULT_REGION):
    """Create the S3 bucket ``bucket_name`` if it does not already exist.

    Credentials are read from the AWS_ACCESS_KEY / AWS_SECRET_ACCESS_KEY
    environment variables; AWS_REGION (when set) overrides *region*.
    Logs an error and returns early when credentials are missing; logs
    info and returns early when the bucket already exists.
    """
    # NOTE(review): the previous version called
    # boto3.Session().resource("s3").meta.client.close() under a "clear
    # credentials" comment — that creates a brand-new session and closes
    # its client, which clears nothing. Removed; we build an explicit
    # session below instead.
    aws_access_key = os.getenv("AWS_ACCESS_KEY")
    aws_secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
    # Environment region takes precedence over the configured default.
    region = os.getenv("AWS_REGION") or region

    if not aws_access_key or not aws_secret_key:
        LOGGER.error("AWS credentials not found in environment variables.")
        return

    LOGGER.debug("Successfully retrieved AWS credentials and region.")

    # Create new session with explicit credentials so we never pick up
    # whatever default credential chain boto3 would otherwise resolve.
    session = boto3.Session(
        aws_access_key_id=aws_access_key,
        aws_secret_access_key=aws_secret_key,
        region_name=region,
    )
    s3_client = session.client("s3")
    LOGGER.debug("Successfully created S3 client.")

    if bucket_exists(s3_client, bucket_name):
        LOGGER.info(f"Bucket with name {bucket_name} already exists")
        return

    # us-east-1 is the API default and rejects an explicit LocationConstraint;
    # every other region requires one.
    if region == "us-east-1":
        s3_client.create_bucket(Bucket=bucket_name)
    else:
        s3_client.create_bucket(
            Bucket=bucket_name, CreateBucketConfiguration={"LocationConstraint": region}
        )

    LOGGER.debug(f"Bucket {bucket_name} created successfully in {region} region.")
def bucket_exists(s3_client, bucket_name):
    """Return True if ``bucket_name`` exists and is accessible.

    Returns False when head_bucket reports HTTP 404 (bucket does not
    exist); any other client error (e.g. 403 Forbidden) is re-raised.
    """
    try:
        s3_client.head_bucket(Bucket=bucket_name)
        return True
    except botocore.exceptions.ClientError as e:
        # Compare the error code as a string: AWS error codes are not
        # guaranteed to be numeric, so the previous int(...) cast could
        # raise ValueError and mask the real ClientError.
        if e.response["Error"]["Code"] == "404":
            return False
        raise
def setup_mongodb():
    """
    Set up the MongoDB database, the documents collection, and the vector
    search index on the chunks collection.

    The connection string comes from the MONGODB_URI environment variable;
    database, collection, and index names come from config.toml constants.

    Raises:
        ValueError: if MONGODB_URI is not set.
    """
    # Load MongoDB URI from .env file
    mongo_uri = os.getenv("MONGODB_URI")
    if not mongo_uri:
        raise ValueError("MONGODB_URI not found in .env file.")

    # Guard so the finally block never touches an unbound name if
    # MongoClient construction itself fails.
    client = None
    try:
        # Connect to MongoDB and verify the server is reachable.
        client = MongoClient(mongo_uri)
        client.admin.command("ping")  # Check connection
        LOGGER.info("Connected to MongoDB successfully.")

        # Create or access the database (MongoDB creates it lazily).
        db = client[DATABASE_NAME]
        LOGGER.info(f"Database '{DATABASE_NAME}' ready.")

        # Create the documents collection if it does not exist yet.
        if DOCUMENTS_COLLECTION not in db.list_collection_names():
            db.create_collection(DOCUMENTS_COLLECTION)
            LOGGER.info(f"Collection '{DOCUMENTS_COLLECTION}' created.")
        else:
            LOGGER.info(f"Collection '{DOCUMENTS_COLLECTION}' already exists.")

        # Create the chunks collection if it does not exist yet.
        if CHUNKS_COLLECTION not in db.list_collection_names():
            db.create_collection(CHUNKS_COLLECTION)
            LOGGER.info(f"Collection '{CHUNKS_COLLECTION}' created.")
        else:
            LOGGER.info(f"Collection '{CHUNKS_COLLECTION}' already exists.")

        # Vector-search index: one vector field plus a filter field on
        # document_id so searches can be scoped to a single document.
        vector_index_definition = {
            "fields": [
                {
                    "numDimensions": VECTOR_DIMENSIONS,
                    "path": "embedding",
                    "similarity": SIMILARITY_METRIC,
                    "type": "vector",
                },
                {"path": "document_id", "type": "filter"},
            ]
        }
        vector_index = SearchIndexModel(
            name=VECTOR_INDEX_NAME,
            definition=vector_index_definition,
            type="vectorSearch",
        )
        db[CHUNKS_COLLECTION].create_search_index(model=vector_index)
        # Report the configured names; the old message hardcoded
        # 'vector_index'/'documents_chunk' regardless of config.toml.
        LOGGER.info(
            f"Vector index '{VECTOR_INDEX_NAME}' created on '{CHUNKS_COLLECTION}' collection."
        )

    except ConnectionFailure:
        LOGGER.error("Failed to connect to MongoDB. Check your MongoDB URI and network connection.")
    except OperationFailure as e:
        LOGGER.error(f"MongoDB operation failed: {e}")
    except Exception as e:
        LOGGER.error(f"Unexpected error: {e}")
    finally:
        if client is not None:
            client.close()
            LOGGER.info("MongoDB connection closed.")
def setup_postgres():
    """
    Set up PostgreSQL database and tables with proper indexes.

    The connection string comes from the POSTGRES_URI environment variable;
    table definitions come from the project's SQLAlchemy ``Base`` metadata.

    Raises:
        ValueError: if POSTGRES_URI is not set.
        Exception: re-raises anything that fails during table creation.
    """
    # Imported lazily so MongoDB-only deployments don't require sqlalchemy.
    import asyncio

    from sqlalchemy.ext.asyncio import create_async_engine

    # Load PostgreSQL URI from .env file
    postgres_uri = os.getenv("POSTGRES_URI")
    if not postgres_uri:
        raise ValueError("POSTGRES_URI not found in .env file.")

    async def _setup_postgres():
        try:
            # Create async engine
            engine = create_async_engine(postgres_uri)

            async with engine.begin() as conn:
                # Import and create all tables; imported here to avoid a
                # hard dependency at module import time.
                from core.database.postgres_database import Base

                await conn.run_sync(Base.metadata.create_all)
                LOGGER.info("Created all PostgreSQL tables and indexes.")

            await engine.dispose()
            LOGGER.info("PostgreSQL setup completed successfully.")

        except Exception as e:
            LOGGER.error(f"Failed to setup PostgreSQL: {e}")
            raise

    asyncio.run(_setup_postgres())
def setup():
|
2024-12-31 16:55:51 +05:30
|
|
|
# Setup S3 if configured
|
|
|
|
if STORAGE_PROVIDER == "aws-s3":
|
|
|
|
LOGGER.info("Setting up S3 bucket...")
|
|
|
|
create_s3_bucket(DEFAULT_BUCKET_NAME, DEFAULT_REGION)
|
|
|
|
LOGGER.info("S3 bucket setup completed.")
|
2024-12-26 11:34:24 -05:00
|
|
|
|
2025-01-04 18:41:09 +05:30
|
|
|
# Setup database based on provider
|
|
|
|
match DATABASE_PROVIDER:
|
|
|
|
case "mongodb":
|
|
|
|
LOGGER.info("Setting up MongoDB...")
|
|
|
|
setup_mongodb()
|
|
|
|
LOGGER.info("MongoDB setup completed.")
|
|
|
|
case "postgres":
|
|
|
|
LOGGER.info("Setting up PostgreSQL...")
|
|
|
|
setup_postgres()
|
|
|
|
LOGGER.info("PostgreSQL setup completed.")
|
|
|
|
case _:
|
|
|
|
LOGGER.error(f"Unsupported database provider: {DATABASE_PROVIDER}")
|
|
|
|
raise ValueError(f"Unsupported database provider: {DATABASE_PROVIDER}")
|
2024-12-23 13:22:21 -05:00
|
|
|
|
|
|
|
LOGGER.info("Setup completed successfully. Feel free to start the server now!")
|
|
|
|
|
2024-12-26 11:34:24 -05:00
|
|
|
|
2024-12-23 13:22:21 -05:00
|
|
|
if __name__ == "__main__":
    setup()