Add local file system for storage (#10)

Adityavardhan Agrawal, 2024-12-31 16:55:51 +05:30, committed by GitHub
parent 3e4a9999ad
commit 367dc079e8
9 changed files with 89 additions and 40 deletions


@@ -1,6 +1,8 @@
-MONGODB_URI="mongodb+srv://..."
-OPENAI_API_KEY="sk-proj-..."
-UNSTRUCTURED_API_KEY="..."
+MONGODB_URI="..."
+OPENAI_API_KEY="..."
+# Optional: Only needed if using AWS S3 storage
+AWS_ACCESS_KEY="..."
+AWS_SECRET_ACCESS_KEY="..."
+UNSTRUCTURED_API_KEY="..."
 ASSEMBLYAI_API_KEY="..."
 JWT_SECRET_KEY="..."
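With local storage as the new default, the AWS keys become the only optional entries; a local-only environment file reduces to the following sketch (every value is a placeholder):

```
MONGODB_URI="..."
OPENAI_API_KEY="..."
UNSTRUCTURED_API_KEY="..."
ASSEMBLYAI_API_KEY="..."
JWT_SECRET_KEY="..."
```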

.gitignore

@@ -23,3 +23,6 @@ core/tests/assets
 .vscode/
 *.DS_Store
+
+storage/*
+logs/*

config.toml

@@ -5,7 +5,7 @@ port = 8000
 reload = false

 [service.components]
-storage = "aws-s3"
+storage = "local" # "aws-s3"
 database = "mongodb"
 vector_store = "mongodb"
 embedding = "openai" # "ollama"
@@ -13,6 +13,9 @@ completion = "openai" # "ollama"
 parser = "combined" # "unstructured"

 # Storage Configuration
+[storage.local]
+path = "./storage"
+
 [storage.aws]
 region = "us-east-2"
 bucket_name = "databridge-s3-storage"
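These are the same keys that core/config.py and the setup script read back further down in this commit. A minimal sketch of consuming the new [storage.local] table — tomllib (stdlib, Python 3.11+) is an assumption for illustration; it matches the binary-mode open the setup script already uses:

```python
import tomllib  # assumption: the repo may use a different TOML reader

with open("config.toml", "rb") as f:
    config = tomllib.load(f)

provider = config["service"]["components"]["storage"]  # "local" or "aws-s3"
if provider == "local":
    print("Files stored under:", config["storage"]["local"]["path"])
else:
    print("Using S3 bucket:", config["storage"]["aws"]["bucket_name"])
```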


@@ -24,6 +24,7 @@ from core.config import get_settings
 from core.database.mongo_database import MongoDatabase
 from core.vector_store.mongo_vector_store import MongoDBAtlasVectorStore
 from core.storage.s3_storage import S3Storage
+from core.storage.local_storage import LocalStorage
 from core.embedding.openai_embedding_model import OpenAIEmbeddingModel
 from core.completion.ollama_completion import OllamaCompletionModel
@@ -74,6 +75,8 @@ match settings.VECTOR_STORE_PROVIDER:

 # Initialize storage
 match settings.STORAGE_PROVIDER:
+    case "local":
+        storage = LocalStorage(storage_path=settings.STORAGE_PATH)
     case "aws-s3":
         storage = S3Storage(
             aws_access_key=settings.AWS_ACCESS_KEY,
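The diff truncates the aws-s3 branch after the first keyword argument. A sketch of the completed provider switch — the remaining S3Storage parameter names and the fallback case are assumptions, not shown in this commit:

```python
match settings.STORAGE_PROVIDER:
    case "local":
        storage = LocalStorage(storage_path=settings.STORAGE_PATH)
    case "aws-s3":
        storage = S3Storage(
            aws_access_key=settings.AWS_ACCESS_KEY,
            aws_secret_key=settings.AWS_SECRET_ACCESS_KEY,  # assumed parameter name
            region_name=settings.AWS_REGION,                # assumed parameter name
            default_bucket=settings.S3_BUCKET,              # assumed parameter name
        )
    case _:
        raise ValueError(f"Unsupported storage provider: {settings.STORAGE_PROVIDER}")
```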

core/config.py

@@ -13,8 +13,8 @@ class Settings(BaseSettings):
     OPENAI_API_KEY: str = Field(..., env="OPENAI_API_KEY")
     UNSTRUCTURED_API_KEY: str = Field(..., env="UNSTRUCTURED_API_KEY")
     ASSEMBLYAI_API_KEY: str = Field(..., env="ASSEMBLYAI_API_KEY")
-    AWS_ACCESS_KEY: str = Field(..., env="AWS_ACCESS_KEY")
-    AWS_SECRET_ACCESS_KEY: str = Field(..., env="AWS_SECRET_ACCESS_KEY")
+    AWS_ACCESS_KEY: str = Field(None, env="AWS_ACCESS_KEY")
+    AWS_SECRET_ACCESS_KEY: str = Field(None, env="AWS_SECRET_ACCESS_KEY")
     JWT_SECRET_KEY: str = Field(..., env="JWT_SECRET_KEY")

     # Service settings
@@ -23,7 +23,7 @@ class Settings(BaseSettings):
     RELOAD: bool = False

     # Component selection
-    STORAGE_PROVIDER: str = "aws-s3"
+    STORAGE_PROVIDER: str = "local"
     DATABASE_PROVIDER: str = "mongodb"
     VECTOR_STORE_PROVIDER: str = "mongodb"
     EMBEDDING_PROVIDER: str = "openai"
@@ -31,6 +31,7 @@ class Settings(BaseSettings):
     PARSER_PROVIDER: str = "combined"

     # Storage settings
+    STORAGE_PATH: str = "./storage"
     AWS_REGION: str = "us-east-2"
     S3_BUCKET: str = "databridge-s3-storage"
@@ -83,6 +84,7 @@ def get_settings() -> Settings:
         "COMPLETION_PROVIDER": config["service"]["components"]["completion"],
         "PARSER_PROVIDER": config["service"]["components"]["parser"],
         # Storage settings
         "STORAGE_PATH": config["storage"]["local"]["path"],
         "AWS_REGION": config["storage"]["aws"]["region"],
         "S3_BUCKET": config["storage"]["aws"]["bucket_name"],
         # Database settings
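With the AWS fields now defaulting to None, a deployment that selects aws-s3 without credentials only fails at client construction time. A hedged sketch of a fail-fast guard that is not part of this commit — assuming pydantic v1-style validators, matching the Field(..., env=...) usage shown above:

```python
from typing import Optional
from pydantic import BaseSettings, Field, root_validator


class Settings(BaseSettings):
    STORAGE_PROVIDER: str = "local"
    # Optional[str] is the more precise annotation once the default is None
    AWS_ACCESS_KEY: Optional[str] = Field(None, env="AWS_ACCESS_KEY")
    AWS_SECRET_ACCESS_KEY: Optional[str] = Field(None, env="AWS_SECRET_ACCESS_KEY")

    @root_validator
    def require_aws_credentials_for_s3(cls, values):
        # Hypothetical guard: reject aws-s3 when the now-optional creds are missing.
        if values.get("STORAGE_PROVIDER") == "aws-s3" and not (
            values.get("AWS_ACCESS_KEY") and values.get("AWS_SECRET_ACCESS_KEY")
        ):
            raise ValueError("aws-s3 storage requires AWS_ACCESS_KEY and AWS_SECRET_ACCESS_KEY")
        return values
```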

core/storage/base_storage.py

@@ -5,26 +5,6 @@ from typing import Tuple, Optional, Union, BinaryIO

 class BaseStorage(ABC):
     """Base interface for storage providers."""

-    @abstractmethod
-    async def upload_file(
-        self,
-        file: Union[str, bytes, BinaryIO],
-        key: str,
-        content_type: Optional[str] = None,
-    ) -> Tuple[str, str]:
-        """
-        Upload a file to storage.
-
-        Args:
-            file: File content as string, bytes or file object
-            key: Storage key/path for the file
-            content_type: Optional MIME type
-
-        Returns:
-            Tuple[str, str]: (bucket/container name, storage key)
-        """
-        pass
-
     @abstractmethod
     async def upload_from_base64(
         self, content: str, key: str, content_type: Optional[str] = None
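The hunk cuts off mid-signature, so the surviving interface is not fully shown. A sketch of what BaseStorage appears to require after the removal — the method set is inferred from the LocalStorage implementation below, not confirmed by the diff:

```python
from abc import ABC, abstractmethod
from typing import Tuple, Optional, BinaryIO


class BaseStorage(ABC):
    """Base interface for storage providers (methods inferred from LocalStorage)."""

    @abstractmethod
    async def upload_from_base64(
        self, content: str, key: str, content_type: Optional[str] = None
    ) -> Tuple[str, str]: ...

    @abstractmethod
    async def download_file(self, bucket: str, key: str) -> BinaryIO: ...

    @abstractmethod
    async def get_download_url(self, bucket: str, key: str) -> str: ...

    @abstractmethod
    async def delete_file(self, bucket: str, key: str) -> bool: ...
```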

core/storage/local_storage.py (new file)

@@ -0,0 +1,49 @@
+import base64
+from pathlib import Path
+from typing import Tuple, Optional, BinaryIO
+
+from .base_storage import BaseStorage
+
+
+class LocalStorage(BaseStorage):
+    def __init__(self, storage_path: str):
+        """Initialize local storage with a base path."""
+        self.storage_path = Path(storage_path)
+        # Create storage directory if it doesn't exist
+        self.storage_path.mkdir(parents=True, exist_ok=True)
+
+    async def download_file(self, bucket: str, key: str) -> BinaryIO:
+        """Download a file from local storage."""
+        file_path = self.storage_path / key
+        if not file_path.exists():
+            raise FileNotFoundError(f"File not found: {file_path}")
+        return open(file_path, "rb")
+
+    async def upload_from_base64(
+        self, base64_content: str, key: str, content_type: Optional[str] = None
+    ) -> Tuple[str, str]:
+        """Upload base64 encoded content to local storage."""
+        # Decode base64 content
+        file_content = base64.b64decode(base64_content)
+        # Create file path
+        file_path = self.storage_path / key
+        # Write content to file
+        with open(file_path, "wb") as f:
+            f.write(file_content)
+        return str(self.storage_path), key
+
+    async def get_download_url(self, bucket: str, key: str) -> str:
+        """Get local file path as URL."""
+        file_path = self.storage_path / key
+        if not file_path.exists():
+            raise FileNotFoundError(f"File not found: {file_path}")
+        return f"file://{file_path.absolute()}"
+
+    async def delete_file(self, bucket: str, key: str) -> bool:
+        """Delete a file from local storage."""
+        file_path = self.storage_path / key
+        if file_path.exists():
+            file_path.unlink()
+        return True
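A quick usage sketch of the new backend. The bucket argument is unused by the local implementation, so an empty string is passed; the main wrapper is illustrative:

```python
import asyncio
import base64

from core.storage.local_storage import LocalStorage


async def main():
    storage = LocalStorage(storage_path="./storage")

    # upload_from_base64 writes directly under storage_path; nested keys like
    # "docs/a.txt" would need their parent directories created first.
    content = base64.b64encode(b"hello databridge").decode()
    bucket, key = await storage.upload_from_base64(content, key="hello.txt")
    print(bucket, key)  # "./storage", "hello.txt"

    print(await storage.get_download_url("", key))  # file://…/storage/hello.txt

    with await storage.download_file("", key) as f:
        print(f.read())  # b"hello databridge"

    await storage.delete_file("", key)


asyncio.run(main())
```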


@@ -41,8 +41,7 @@ with open(config_path, "rb") as f:
 LOGGER.info("Loaded configuration from config.toml")

 # Extract configuration values
-DEFAULT_REGION = CONFIG["storage"]["aws"]["region"]
-DEFAULT_BUCKET_NAME = CONFIG["storage"]["aws"]["bucket_name"]
+STORAGE_PROVIDER = CONFIG["service"]["components"]["storage"]
 DATABASE_NAME = CONFIG["database"]["mongodb"]["database_name"]
 DOCUMENTS_COLLECTION = CONFIG["database"]["mongodb"]["documents_collection"]
 CHUNKS_COLLECTION = CONFIG["database"]["mongodb"]["chunks_collection"]
@@ -50,8 +49,15 @@ VECTOR_DIMENSIONS = CONFIG["vector_store"]["mongodb"]["dimensions"]
 VECTOR_INDEX_NAME = CONFIG["vector_store"]["mongodb"]["index_name"]
 SIMILARITY_METRIC = CONFIG["vector_store"]["mongodb"]["similarity_metric"]

+# Extract storage-specific configuration
+DEFAULT_REGION = CONFIG["storage"]["aws"]["region"] if STORAGE_PROVIDER == "aws-s3" else None
+DEFAULT_BUCKET_NAME = (
+    CONFIG["storage"]["aws"]["bucket_name"] if STORAGE_PROVIDER == "aws-s3" else None
+)
+

 def create_s3_bucket(bucket_name, region=DEFAULT_REGION):
     """Set up S3 bucket."""
     # Clear any existing AWS credentials from environment
     boto3.Session().resource("s3").meta.client.close()
@@ -74,11 +80,10 @@ def create_s3_bucket(bucket_name, region=DEFAULT_REGION):
     s3_client = session.client("s3")
     LOGGER.debug("Successfully created S3 client.")

-    # create_bucket = not
     if bucket_exists(s3_client, bucket_name):
         LOGGER.info(f"Bucket with name {bucket_name} already exists")
         return

+    # Create bucket with location constraint if region is not us-east-1
     if region == "us-east-1":
         s3_client.create_bucket(Bucket=bucket_name)
     else:
@@ -90,9 +95,7 @@ def create_s3_bucket(bucket_name, region=DEFAULT_REGION):
 def bucket_exists(s3_client, bucket_name):
-    """
-    Check if an S3 bucket exists.
-    """
+    """Check if an S3 bucket exists."""
     try:
         s3_client.head_bucket(Bucket=bucket_name)
         return True
@@ -167,9 +170,11 @@ def setup_mongodb():
 def setup():
-    LOGGER.info("Creating S3 bucket...")
-    create_s3_bucket(DEFAULT_BUCKET_NAME)
-    LOGGER.info("S3 bucket created successfully.")
+    # Setup S3 if configured
+    if STORAGE_PROVIDER == "aws-s3":
+        LOGGER.info("Setting up S3 bucket...")
+        create_s3_bucket(DEFAULT_BUCKET_NAME, DEFAULT_REGION)
+        LOGGER.info("S3 bucket setup completed.")

     LOGGER.info("Setting up MongoDB...")
     setup_mongodb()
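One subtlety in this file: the default `region=DEFAULT_REGION` in `create_s3_bucket` is captured when the module is imported, and with `storage = "local"` that value is now None. The updated `setup()` avoids relying on the default by passing `DEFAULT_REGION` explicitly, and only calls the function at all when the provider is `aws-s3`.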


@@ -39,11 +39,13 @@ class DB:
         doc = self._client.ingest_text(content, metadata=metadata or {})
         return doc.model_dump()

-    def ingest_file(self, file_path: str, metadata: dict = None, content_type: str = None) -> dict:
+    def ingest_file(
+        self, file: str, filename: str, metadata: dict = None, content_type: str = None
+    ) -> dict:
         """Ingest a file into DataBridge"""
-        file_path = Path(file_path)
+        file_path = Path(file)
         doc = self._client.ingest_file(
-            file_path, filename=file_path.name, content_type=content_type, metadata=metadata or {}
+            file=file_path, filename=filename, content_type=content_type, metadata=metadata or {}
         )
         return doc.model_dump()
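Call sites must now pass the stored filename explicitly instead of relying on `Path(file_path).name`. A hypothetical call — the DB constructor's arguments and the ingested path are not shown in this diff:

```python
db = DB()  # hypothetical construction
doc = db.ingest_file(
    file="notes/meeting.txt",   # hypothetical local path
    filename="meeting.txt",     # stored name, now passed explicitly
    content_type="text/plain",
)
print(doc)  # dict produced by model_dump()
```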