mirror of https://github.com/james-m-jordan/morphik-core.git
synced 2025-05-09 19:32:38 +00:00

Add local file system for storage (#10)

This commit is contained in:
parent 3e4a9999ad
commit 367dc079e8
@@ -1,6 +1,8 @@
-MONGODB_URI="mongodb+srv://..."
-OPENAI_API_KEY="sk-proj-..."
-UNSTRUCTURED_API_KEY="..."
+MONGODB_URI="..."
+OPENAI_API_KEY="..."
+# Optional: Only needed if using AWS S3 storage
 AWS_ACCESS_KEY="..."
 AWS_SECRET_ACCESS_KEY="..."
+UNSTRUCTURED_API_KEY="..."
+ASSEMBLYAI_API_KEY="..."
 JWT_SECRET_KEY="..."
.gitignore (vendored, 3 changes)
@@ -23,3 +23,6 @@ core/tests/assets
 .vscode/
 
 *.DS_Store
+
+storage/*
+logs/*
@@ -5,7 +5,7 @@ port = 8000
 reload = false
 
 [service.components]
-storage = "aws-s3"
+storage = "local" # "aws-s3"
 database = "mongodb"
 vector_store = "mongodb"
 embedding = "openai" # "ollama"
@@ -13,6 +13,9 @@ completion = "openai" # "ollama"
 parser = "combined" # "unstructured"
 
 # Storage Configuration
+[storage.local]
+path = "./storage"
+
 [storage.aws]
 region = "us-east-2"
 bucket_name = "databridge-s3-storage"
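Only the [storage.local] table is new here. For context, a minimal sketch of how such a section can be read, assuming Python 3.11's stdlib tomllib and the keys shown above (the variable names are my own):

import tomllib  # stdlib TOML parser, Python 3.11+

with open("config.toml", "rb") as f:
    config = tomllib.load(f)

provider = config["service"]["components"]["storage"]  # "local" or "aws-s3"
if provider == "local":
    storage_path = config["storage"]["local"]["path"]  # "./storage"
else:
    region = config["storage"]["aws"]["region"]
    bucket = config["storage"]["aws"]["bucket_name"]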
@@ -24,6 +24,7 @@ from core.config import get_settings
 from core.database.mongo_database import MongoDatabase
 from core.vector_store.mongo_vector_store import MongoDBAtlasVectorStore
 from core.storage.s3_storage import S3Storage
+from core.storage.local_storage import LocalStorage
 from core.embedding.openai_embedding_model import OpenAIEmbeddingModel
 from core.completion.ollama_completion import OllamaCompletionModel
 
@@ -74,6 +75,8 @@ match settings.VECTOR_STORE_PROVIDER:
 
 # Initialize storage
 match settings.STORAGE_PROVIDER:
+    case "local":
+        storage = LocalStorage(storage_path=settings.STORAGE_PATH)
     case "aws-s3":
         storage = S3Storage(
             aws_access_key=settings.AWS_ACCESS_KEY,
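With the two added lines, the provider dispatch presumably reads roughly as below; this is a sketch reconstructed from the context lines, and any S3Storage arguments beyond the one visible above are assumptions:

# Sketch of the full storage dispatch; only the "local" case is new in this diff
match settings.STORAGE_PROVIDER:
    case "local":
        storage = LocalStorage(storage_path=settings.STORAGE_PATH)
    case "aws-s3":
        storage = S3Storage(
            aws_access_key=settings.AWS_ACCESS_KEY,
            aws_secret_key=settings.AWS_SECRET_ACCESS_KEY,  # assumed parameter name
        )
    case _:
        raise ValueError(f"Unsupported storage provider: {settings.STORAGE_PROVIDER}")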
@@ -13,8 +13,8 @@ class Settings(BaseSettings):
     OPENAI_API_KEY: str = Field(..., env="OPENAI_API_KEY")
     UNSTRUCTURED_API_KEY: str = Field(..., env="UNSTRUCTURED_API_KEY")
     ASSEMBLYAI_API_KEY: str = Field(..., env="ASSEMBLYAI_API_KEY")
-    AWS_ACCESS_KEY: str = Field(..., env="AWS_ACCESS_KEY")
-    AWS_SECRET_ACCESS_KEY: str = Field(..., env="AWS_SECRET_ACCESS_KEY")
+    AWS_ACCESS_KEY: str = Field(None, env="AWS_ACCESS_KEY")
+    AWS_SECRET_ACCESS_KEY: str = Field(None, env="AWS_SECRET_ACCESS_KEY")
     JWT_SECRET_KEY: str = Field(..., env="JWT_SECRET_KEY")
 
     # Service settings
@@ -23,7 +23,7 @@ class Settings(BaseSettings):
     RELOAD: bool = False
 
     # Component selection
-    STORAGE_PROVIDER: str = "aws-s3"
+    STORAGE_PROVIDER: str = "local"
     DATABASE_PROVIDER: str = "mongodb"
     VECTOR_STORE_PROVIDER: str = "mongodb"
     EMBEDDING_PROVIDER: str = "openai"
@@ -31,6 +31,7 @@ class Settings(BaseSettings):
     PARSER_PROVIDER: str = "combined"
 
     # Storage settings
+    STORAGE_PATH: str = "./storage"
     AWS_REGION: str = "us-east-2"
     S3_BUCKET: str = "databridge-s3-storage"
 
@@ -83,6 +84,7 @@ def get_settings() -> Settings:
         "COMPLETION_PROVIDER": config["service"]["components"]["completion"],
         "PARSER_PROVIDER": config["service"]["components"]["parser"],
         # Storage settings
+        "STORAGE_PATH": config["storage"]["local"]["path"],
         "AWS_REGION": config["storage"]["aws"]["region"],
         "S3_BUCKET": config["storage"]["aws"]["bucket_name"],
         # Database settings
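Passing None as the Field default is what lets a local-only deployment start without AWS credentials, but the annotations stay str. A stricter variant (my suggestion, not what the diff does) would mark the fields Optional so type checkers agree with the runtime behavior:

from typing import Optional

from pydantic import BaseSettings, Field  # v1-style API, as the env= kwargs suggest


class Settings(BaseSettings):
    # Only required when STORAGE_PROVIDER is "aws-s3"
    AWS_ACCESS_KEY: Optional[str] = Field(None, env="AWS_ACCESS_KEY")
    AWS_SECRET_ACCESS_KEY: Optional[str] = Field(None, env="AWS_SECRET_ACCESS_KEY")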
@@ -5,26 +5,6 @@ from typing import Tuple, Optional, Union, BinaryIO
 class BaseStorage(ABC):
     """Base interface for storage providers."""
 
-    @abstractmethod
-    async def upload_file(
-        self,
-        file: Union[str, bytes, BinaryIO],
-        key: str,
-        content_type: Optional[str] = None,
-    ) -> Tuple[str, str]:
-        """
-        Upload a file to storage.
-
-        Args:
-            file: File content as string, bytes or file object
-            key: Storage key/path for the file
-            content_type: Optional MIME type
-
-        Returns:
-            Tuple[str, str]: (bucket/container name, storage key)
-        """
-        pass
-
     @abstractmethod
     async def upload_from_base64(
         self, content: str, key: str, content_type: Optional[str] = None
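With upload_file gone from the base class, a backend seemingly only has to implement the base64 upload plus download, URL, and delete, judging by this hunk and the LocalStorage file below. A hypothetical subclass skeleton under that assumption:

from typing import BinaryIO, Optional, Tuple

from core.storage.base_storage import BaseStorage


class MyStorage(BaseStorage):  # hypothetical backend, for illustration only
    async def upload_from_base64(
        self, content: str, key: str, content_type: Optional[str] = None
    ) -> Tuple[str, str]:
        raise NotImplementedError

    async def download_file(self, bucket: str, key: str) -> BinaryIO:
        raise NotImplementedError

    async def get_download_url(self, bucket: str, key: str) -> str:
        raise NotImplementedError

    async def delete_file(self, bucket: str, key: str) -> bool:
        raise NotImplementedError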
core/storage/local_storage.py (new file, 49 lines)
@@ -0,0 +1,49 @@
+import base64
+from pathlib import Path
+from typing import Tuple, Optional, BinaryIO
+from .base_storage import BaseStorage
+
+
+class LocalStorage(BaseStorage):
+    def __init__(self, storage_path: str):
+        """Initialize local storage with a base path."""
+        self.storage_path = Path(storage_path)
+        # Create storage directory if it doesn't exist
+        self.storage_path.mkdir(parents=True, exist_ok=True)
+
+    async def download_file(self, bucket: str, key: str) -> BinaryIO:
+        """Download a file from local storage."""
+        file_path = self.storage_path / key
+        if not file_path.exists():
+            raise FileNotFoundError(f"File not found: {file_path}")
+        return open(file_path, "rb")
+
+    async def upload_from_base64(
+        self, base64_content: str, key: str, content_type: Optional[str] = None
+    ) -> Tuple[str, str]:
+        """Upload base64 encoded content to local storage."""
+        # Decode base64 content
+        file_content = base64.b64decode(base64_content)
+
+        # Create file path
+        file_path = self.storage_path / key
+
+        # Write content to file
+        with open(file_path, "wb") as f:
+            f.write(file_content)
+
+        return str(self.storage_path), key
+
+    async def get_download_url(self, bucket: str, key: str) -> str:
+        """Get local file path as URL."""
+        file_path = self.storage_path / key
+        if not file_path.exists():
+            raise FileNotFoundError(f"File not found: {file_path}")
+        return f"file://{file_path.absolute()}"
+
+    async def delete_file(self, bucket: str, key: str) -> bool:
+        """Delete a file from local storage."""
+        file_path = self.storage_path / key
+        if file_path.exists():
+            file_path.unlink()
+        return True
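A quick round-trip through the new backend; this should run as-is since LocalStorage only touches the local filesystem (the key and payload are invented for the demo):

import asyncio
import base64

from core.storage.local_storage import LocalStorage


async def main():
    storage = LocalStorage(storage_path="./storage")

    # Upload via the base64 path, then read the file back
    payload = base64.b64encode(b"hello, morphik").decode()
    bucket, key = await storage.upload_from_base64(payload, key="demo.txt")
    print(await storage.get_download_url(bucket, key))  # file:///.../demo.txt

    f = await storage.download_file(bucket, key)
    assert f.read() == b"hello, morphik"
    f.close()

    await storage.delete_file(bucket, key)


asyncio.run(main())

Note that upload_from_base64 writes directly to storage_path / key, so a key containing subdirectories (e.g. "docs/a.txt") would need the parent directory to exist first.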
@@ -41,8 +41,7 @@ with open(config_path, "rb") as f:
 LOGGER.info("Loaded configuration from config.toml")
 
 # Extract configuration values
-DEFAULT_REGION = CONFIG["storage"]["aws"]["region"]
-DEFAULT_BUCKET_NAME = CONFIG["storage"]["aws"]["bucket_name"]
+STORAGE_PROVIDER = CONFIG["service"]["components"]["storage"]
 DATABASE_NAME = CONFIG["database"]["mongodb"]["database_name"]
 DOCUMENTS_COLLECTION = CONFIG["database"]["mongodb"]["documents_collection"]
 CHUNKS_COLLECTION = CONFIG["database"]["mongodb"]["chunks_collection"]
@@ -50,8 +49,15 @@ VECTOR_DIMENSIONS = CONFIG["vector_store"]["mongodb"]["dimensions"]
 VECTOR_INDEX_NAME = CONFIG["vector_store"]["mongodb"]["index_name"]
 SIMILARITY_METRIC = CONFIG["vector_store"]["mongodb"]["similarity_metric"]
 
+# Extract storage-specific configuration
+DEFAULT_REGION = CONFIG["storage"]["aws"]["region"] if STORAGE_PROVIDER == "aws-s3" else None
+DEFAULT_BUCKET_NAME = (
+    CONFIG["storage"]["aws"]["bucket_name"] if STORAGE_PROVIDER == "aws-s3" else None
+)
+
 
 def create_s3_bucket(bucket_name, region=DEFAULT_REGION):
+    """Set up S3 bucket."""
     # Clear any existing AWS credentials from environment
     boto3.Session().resource("s3").meta.client.close()
 
@@ -74,11 +80,10 @@ def create_s3_bucket(bucket_name, region=DEFAULT_REGION):
     s3_client = session.client("s3")
     LOGGER.debug("Successfully created S3 client.")
 
-    # create_bucket = not
+
     if bucket_exists(s3_client, bucket_name):
         LOGGER.info(f"Bucket with name {bucket_name} already exists")
         return
-    # Create bucket with location constraint if region is not us-east-1
     if region == "us-east-1":
         s3_client.create_bucket(Bucket=bucket_name)
     else:
@@ -90,9 +95,7 @@ def create_s3_bucket(bucket_name, region=DEFAULT_REGION):
 
 
 def bucket_exists(s3_client, bucket_name):
-    """
-    Check if an S3 bucket exists.
-    """
+    """Check if an S3 bucket exists."""
     try:
         s3_client.head_bucket(Bucket=bucket_name)
         return True
@@ -167,9 +170,11 @@ def setup_mongodb():
 
 
 def setup():
-    LOGGER.info("Creating S3 bucket...")
-    create_s3_bucket(DEFAULT_BUCKET_NAME)
-    LOGGER.info("S3 bucket created successfully.")
+    # Setup S3 if configured
+    if STORAGE_PROVIDER == "aws-s3":
+        LOGGER.info("Setting up S3 bucket...")
+        create_s3_bucket(DEFAULT_BUCKET_NAME, DEFAULT_REGION)
+        LOGGER.info("S3 bucket setup completed.")
 
     LOGGER.info("Setting up MongoDB...")
     setup_mongodb()
shell.py (8 changes)
@@ -39,11 +39,13 @@ class DB:
         doc = self._client.ingest_text(content, metadata=metadata or {})
         return doc.model_dump()
 
-    def ingest_file(self, file_path: str, metadata: dict = None, content_type: str = None) -> dict:
+    def ingest_file(
+        self, file: str, filename: str, metadata: dict = None, content_type: str = None
+    ) -> dict:
         """Ingest a file into DataBridge"""
-        file_path = Path(file_path)
+        file_path = Path(file)
         doc = self._client.ingest_file(
-            file_path, filename=file_path.name, content_type=content_type, metadata=metadata or {}
+            file=file_path, filename=filename, content_type=content_type, metadata=metadata or {}
         )
         return doc.model_dump()
 
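The shell helper now takes the stored filename explicitly instead of deriving it from the path. Under the new signature a call looks like this (path, filename, and metadata are made up for the example):

# Before this commit the name was inferred: db.ingest_file("docs/report.pdf", ...)
doc = db.ingest_file(
    "docs/report.pdf",
    filename="report.pdf",
    metadata={"source": "shell-demo"},
    content_type="application/pdf",
)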