mirror of https://github.com/james-m-jordan/morphik-core.git
synced 2025-05-09 19:32:38 +00:00
Add local file system for storage (#10)
This commit is contained in:
  parent 3e4a9999ad
  commit 367dc079e8
@@ -1,6 +1,8 @@
-MONGODB_URI="mongodb+srv://..."
-OPENAI_API_KEY="sk-proj-..."
-UNSTRUCTURED_API_KEY="..."
-
+MONGODB_URI="..."
+OPENAI_API_KEY="..."
+# Optional: Only needed if using AWS S3 storage
+AWS_ACCESS_KEY="..."
+AWS_SECRET_ACCESS_KEY="..."
+UNSTRUCTURED_API_KEY="..."
 ASSEMBLYAI_API_KEY="..."
 JWT_SECRET_KEY="..."

.gitignore (vendored, 3 changes)
@@ -23,3 +23,6 @@ core/tests/assets
 .vscode/
 
 *.DS_Store
+
+storage/*
+logs/*

@@ -5,7 +5,7 @@ port = 8000
 reload = false
 
 [service.components]
-storage = "aws-s3"
+storage = "local" # "aws-s3"
 database = "mongodb"
 vector_store = "mongodb"
 embedding = "openai" # "ollama"
@@ -13,6 +13,9 @@ completion = "openai" # "ollama"
 parser = "combined" # "unstructured"
 
+# Storage Configuration
+[storage.local]
+path = "./storage"
 
 [storage.aws]
 region = "us-east-2"
 bucket_name = "databridge-s3-storage"
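
The [storage.local] table above is what get_settings() reads later in this commit. For a quick sanity check outside the app, the same section can be loaded with the standard library; a minimal sketch assuming Python 3.11+ (tomllib):

import tomllib  # Python 3.11+; older interpreters can use the tomli package

with open("config.toml", "rb") as f:
    config = tomllib.load(f)

provider = config["service"]["components"]["storage"]  # "local" or "aws-s3"
if provider == "local":
    print(config["storage"]["local"]["path"])       # "./storage"
else:
    print(config["storage"]["aws"]["bucket_name"])  # "databridge-s3-storage"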

@@ -24,6 +24,7 @@ from core.config import get_settings
 from core.database.mongo_database import MongoDatabase
 from core.vector_store.mongo_vector_store import MongoDBAtlasVectorStore
 from core.storage.s3_storage import S3Storage
+from core.storage.local_storage import LocalStorage
 from core.embedding.openai_embedding_model import OpenAIEmbeddingModel
 from core.completion.ollama_completion import OllamaCompletionModel
 
@@ -74,6 +75,8 @@ match settings.VECTOR_STORE_PROVIDER:
 
 # Initialize storage
 match settings.STORAGE_PROVIDER:
+    case "local":
+        storage = LocalStorage(storage_path=settings.STORAGE_PATH)
     case "aws-s3":
         storage = S3Storage(
             aws_access_key=settings.AWS_ACCESS_KEY,
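
Worth noting: a match statement with no matching arm simply falls through, so a typo in the configured storage value would leave `storage` unbound and surface later as a NameError. A defensive variant (not part of this commit; the S3 arguments are truncated in the diff above and abbreviated here):

match settings.STORAGE_PROVIDER:
    case "local":
        storage = LocalStorage(storage_path=settings.STORAGE_PATH)
    case "aws-s3":
        storage = S3Storage(
            aws_access_key=settings.AWS_ACCESS_KEY,
            # ...remaining S3Storage arguments as in the diff above
        )
    case _:
        # Fail fast instead of leaving `storage` undefined
        raise ValueError(f"Unsupported storage provider: {settings.STORAGE_PROVIDER}")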

@@ -13,8 +13,8 @@ class Settings(BaseSettings):
     OPENAI_API_KEY: str = Field(..., env="OPENAI_API_KEY")
     UNSTRUCTURED_API_KEY: str = Field(..., env="UNSTRUCTURED_API_KEY")
     ASSEMBLYAI_API_KEY: str = Field(..., env="ASSEMBLYAI_API_KEY")
-    AWS_ACCESS_KEY: str = Field(..., env="AWS_ACCESS_KEY")
-    AWS_SECRET_ACCESS_KEY: str = Field(..., env="AWS_SECRET_ACCESS_KEY")
+    AWS_ACCESS_KEY: str = Field(None, env="AWS_ACCESS_KEY")
+    AWS_SECRET_ACCESS_KEY: str = Field(None, env="AWS_SECRET_ACCESS_KEY")
     JWT_SECRET_KEY: str = Field(..., env="JWT_SECRET_KEY")
 
     # Service settings
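
The change from Field(..., ...) (required) to Field(None, ...) is what lets the app boot without AWS credentials when storage = "local". A small sketch of the effect, assuming the other required variables are present in the environment (values are placeholders):

import os

os.environ.update({
    "MONGODB_URI": "mongodb://localhost:27017",  # placeholder
    "OPENAI_API_KEY": "sk-...",                  # placeholder
    "UNSTRUCTURED_API_KEY": "...",
    "ASSEMBLYAI_API_KEY": "...",
    "JWT_SECRET_KEY": "...",
})

settings = Settings()                    # no AWS_* variables set
assert settings.AWS_ACCESS_KEY is None   # falls back to the new None default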
@@ -23,7 +23,7 @@ class Settings(BaseSettings):
     RELOAD: bool = False
 
     # Component selection
-    STORAGE_PROVIDER: str = "aws-s3"
+    STORAGE_PROVIDER: str = "local"
     DATABASE_PROVIDER: str = "mongodb"
     VECTOR_STORE_PROVIDER: str = "mongodb"
     EMBEDDING_PROVIDER: str = "openai"
@@ -31,6 +31,7 @@ class Settings(BaseSettings):
     PARSER_PROVIDER: str = "combined"
 
     # Storage settings
+    STORAGE_PATH: str = "./storage"
     AWS_REGION: str = "us-east-2"
     S3_BUCKET: str = "databridge-s3-storage"
 
@@ -83,6 +84,7 @@ def get_settings() -> Settings:
         "COMPLETION_PROVIDER": config["service"]["components"]["completion"],
         "PARSER_PROVIDER": config["service"]["components"]["parser"],
         # Storage settings
+        "STORAGE_PATH": config["storage"]["local"]["path"],
         "AWS_REGION": config["storage"]["aws"]["region"],
         "S3_BUCKET": config["storage"]["aws"]["bucket_name"],
         # Database settings

@@ -5,26 +5,6 @@ from typing import Tuple, Optional, Union, BinaryIO
 class BaseStorage(ABC):
     """Base interface for storage providers."""
 
-    @abstractmethod
-    async def upload_file(
-        self,
-        file: Union[str, bytes, BinaryIO],
-        key: str,
-        content_type: Optional[str] = None,
-    ) -> Tuple[str, str]:
-        """
-        Upload a file to storage.
-
-        Args:
-            file: File content as string, bytes or file object
-            key: Storage key/path for the file
-            content_type: Optional MIME type
-
-        Returns:
-            Tuple[str, str]: (bucket/container name, storage key)
-        """
-        pass
-
     @abstractmethod
     async def upload_from_base64(
         self, content: str, key: str, content_type: Optional[str] = None

core/storage/local_storage.py (new file, 49 lines)
@@ -0,0 +1,49 @@
+import base64
+from pathlib import Path
+from typing import Tuple, Optional, BinaryIO
+from .base_storage import BaseStorage
+
+
+class LocalStorage(BaseStorage):
+    def __init__(self, storage_path: str):
+        """Initialize local storage with a base path."""
+        self.storage_path = Path(storage_path)
+        # Create storage directory if it doesn't exist
+        self.storage_path.mkdir(parents=True, exist_ok=True)
+
+    async def download_file(self, bucket: str, key: str) -> BinaryIO:
+        """Download a file from local storage."""
+        file_path = self.storage_path / key
+        if not file_path.exists():
+            raise FileNotFoundError(f"File not found: {file_path}")
+        return open(file_path, "rb")
+
+    async def upload_from_base64(
+        self, base64_content: str, key: str, content_type: Optional[str] = None
+    ) -> Tuple[str, str]:
+        """Upload base64 encoded content to local storage."""
+        # Decode base64 content
+        file_content = base64.b64decode(base64_content)
+
+        # Create file path
+        file_path = self.storage_path / key
+
+        # Write content to file
+        with open(file_path, "wb") as f:
+            f.write(file_content)
+
+        return str(self.storage_path), key
+
+    async def get_download_url(self, bucket: str, key: str) -> str:
+        """Get local file path as URL."""
+        file_path = self.storage_path / key
+        if not file_path.exists():
+            raise FileNotFoundError(f"File not found: {file_path}")
+        return f"file://{file_path.absolute()}"
+
+    async def delete_file(self, bucket: str, key: str) -> bool:
+        """Delete a file from local storage."""
+        file_path = self.storage_path / key
+        if file_path.exists():
+            file_path.unlink()
+        return True
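
A quick round-trip through the new class (a usage sketch; the key and payload are made up). Note that upload_from_base64 writes to storage_path / key without creating parent directories, so a flat key is used here:

import asyncio
import base64

from core.storage.local_storage import LocalStorage


async def demo():
    storage = LocalStorage(storage_path="./storage")

    # Upload: content must be base64 text; returns (base path, key)
    payload = base64.b64encode(b"hello databridge").decode()
    bucket, key = await storage.upload_from_base64(payload, key="hello.txt")

    # Download: returns an open binary file handle; close it when done
    f = await storage.download_file(bucket, key)
    print(f.read())  # b'hello databridge'
    f.close()

    print(await storage.get_download_url(bucket, key))  # file:///abs/path/hello.txt
    await storage.delete_file(bucket, key)

asyncio.run(demo())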

@@ -41,8 +41,7 @@ with open(config_path, "rb") as f:
 LOGGER.info("Loaded configuration from config.toml")
 
 # Extract configuration values
-DEFAULT_REGION = CONFIG["storage"]["aws"]["region"]
-DEFAULT_BUCKET_NAME = CONFIG["storage"]["aws"]["bucket_name"]
+STORAGE_PROVIDER = CONFIG["service"]["components"]["storage"]
 DATABASE_NAME = CONFIG["database"]["mongodb"]["database_name"]
 DOCUMENTS_COLLECTION = CONFIG["database"]["mongodb"]["documents_collection"]
 CHUNKS_COLLECTION = CONFIG["database"]["mongodb"]["chunks_collection"]
@@ -50,8 +49,15 @@ VECTOR_DIMENSIONS = CONFIG["vector_store"]["mongodb"]["dimensions"]
 VECTOR_INDEX_NAME = CONFIG["vector_store"]["mongodb"]["index_name"]
 SIMILARITY_METRIC = CONFIG["vector_store"]["mongodb"]["similarity_metric"]
 
+# Extract storage-specific configuration
+DEFAULT_REGION = CONFIG["storage"]["aws"]["region"] if STORAGE_PROVIDER == "aws-s3" else None
+DEFAULT_BUCKET_NAME = (
+    CONFIG["storage"]["aws"]["bucket_name"] if STORAGE_PROVIDER == "aws-s3" else None
+)
+
 
 def create_s3_bucket(bucket_name, region=DEFAULT_REGION):
     """Set up S3 bucket."""
     # Clear any existing AWS credentials from environment
     boto3.Session().resource("s3").meta.client.close()
 
@@ -74,11 +80,10 @@ def create_s3_bucket(bucket_name, region=DEFAULT_REGION):
     s3_client = session.client("s3")
     LOGGER.debug("Successfully created S3 client.")
 
-    # create_bucket = not
     if bucket_exists(s3_client, bucket_name):
         LOGGER.info(f"Bucket with name {bucket_name} already exists")
         return
-    # Create bucket with location constraint if region is not us-east-1
 
+    # Create bucket with location constraint if region is not us-east-1
     if region == "us-east-1":
         s3_client.create_bucket(Bucket=bucket_name)
     else:
@@ -90,9 +95,7 @@ def create_s3_bucket(bucket_name, region=DEFAULT_REGION):
 
 
 def bucket_exists(s3_client, bucket_name):
-    """
-    Check if an S3 bucket exists.
-    """
+    """Check if an S3 bucket exists."""
     try:
         s3_client.head_bucket(Bucket=bucket_name)
         return True
@@ -167,9 +170,11 @@ def setup_mongodb():
 
 
 def setup():
-    LOGGER.info("Creating S3 bucket...")
-    create_s3_bucket(DEFAULT_BUCKET_NAME)
-    LOGGER.info("S3 bucket created successfully.")
+    # Setup S3 if configured
+    if STORAGE_PROVIDER == "aws-s3":
+        LOGGER.info("Setting up S3 bucket...")
+        create_s3_bucket(DEFAULT_BUCKET_NAME, DEFAULT_REGION)
+        LOGGER.info("S3 bucket setup completed.")
 
     LOGGER.info("Setting up MongoDB...")
     setup_mongodb()

shell.py (8 changes)
@@ -39,11 +39,13 @@ class DB:
         doc = self._client.ingest_text(content, metadata=metadata or {})
         return doc.model_dump()
 
-    def ingest_file(self, file_path: str, metadata: dict = None, content_type: str = None) -> dict:
+    def ingest_file(
+        self, file: str, filename: str, metadata: dict = None, content_type: str = None
+    ) -> dict:
         """Ingest a file into DataBridge"""
-        file_path = Path(file_path)
+        file_path = Path(file)
         doc = self._client.ingest_file(
-            file_path, filename=file_path.name, content_type=content_type, metadata=metadata or {}
+            file=file_path, filename=filename, content_type=content_type, metadata=metadata or {}
         )
         return doc.model_dump()
 
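
Call sites must now supply the filename explicitly instead of having it derived from the path. A hypothetical invocation against an existing db instance:

# db: an already-constructed DB instance in the shell
doc = db.ingest_file(
    "docs/report.pdf",        # file: path on disk (hypothetical)
    filename="report.pdf",    # previously inferred via Path(file_path).name
    content_type="application/pdf",
    metadata={"source": "shell"},
)
print(doc)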