mirror of https://github.com/james-m-jordan/morphik-core.git (synced 2025-05-09 19:32:38 +00:00)
use local unstructured by default (#12)
commit 48e6aeb8b7 (parent abccf99974)

config.toml: 19 lines changed
@@ -8,9 +8,9 @@ reload = false
 storage = "local" # "aws-s3"
 database = "mongodb"
 vector_store = "mongodb"
-embedding = "openai" # "ollama"
-completion = "openai" # "ollama"
-parser = "combined" # "unstructured", "contextual"
+embedding = "ollama" # "openai", "ollama"
+completion = "ollama" # "openai", "ollama"
+parser = "combined" # "combined", "unstructured", "contextual"

 # Storage Configuration
 [storage.local]
@@ -22,23 +22,23 @@ bucket_name = "databridge-s3-storage"

 # Database Configuration
 [database.mongodb]
-database_name = "DataBridgeTest"
+database_name = "databridge"
 documents_collection = "documents"
 chunks_collection = "document_chunks"

 # Vector Store Configuration
 [vector_store.mongodb]
-dimensions = 1536 # 768 for nomic-embed-text
+dimensions = 768 # 768 for nomic-embed-text, 1536 for text-embedding-3-small
 index_name = "vector_index"
-similarity_metric = "dotProduct"
+similarity_metric = "cosine"

 # Model Configurations
 [models]
 [models.embedding]
-model_name = "text-embedding-3-small" # "nomic-embed-text"
+model_name = "nomic-embed-text" # "text-embedding-3-small", "nomic-embed-text"

 [models.completion]
-model_name = "gpt-4o-mini" # "llama3.1"
+model_name = "llama3.1" # "gpt-4o-mini", "llama3.1", etc.
 default_max_tokens = 1000
 default_temperature = 0.7

@@ -55,6 +55,9 @@ default_k = 4
 [processing.video]
 frame_sample_rate = 120

+[processing.unstructured]
+use_api = false
+
 # Authentication
 [auth]
 jwt_algorithm = "HS256"
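Taken together, the config.toml changes flip the defaults to a fully local stack: Ollama models, a 768-dimension vector index with cosine similarity, and local unstructured parsing via the new [processing.unstructured] table. A minimal sketch of reading the new flag with Python's stdlib tomllib (key names are the ones in the file above; the repo's real loader is get_settings() in core/config.py, shown further down):

    import tomllib  # Python 3.11+ stdlib TOML reader

    with open("config.toml", "rb") as f:
        config = tomllib.load(f)

    # New in this commit: parsing stays in-process unless this flag is set.
    use_api = config["processing"]["unstructured"]["use_api"]  # False by default

    # The vector index width must track the embedding model:
    # 768 for nomic-embed-text (Ollama), 1536 for text-embedding-3-small (OpenAI).
    dims = config["vector_store"]["mongodb"]["dimensions"]
    model = config["models"]["embedding"]["model_name"]
    assert (model, dims) in {("nomic-embed-text", 768), ("text-embedding-3-small", 1536)}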
@@ -17,7 +17,7 @@ from core.models.documents import Document, DocumentResult, ChunkResult
 from core.models.auth import AuthContext, EntityType
 from core.parser.combined_parser import CombinedParser
 from core.completion.base_completion import CompletionResponse
-from core.parser.unstructured_parser import UnstructuredAPIParser
+from core.parser.unstructured_parser import UnstructuredParser
 from core.services.document_service import DocumentService
 from core.services.telemetry import TelemetryService
 from core.config import get_settings
@@ -96,6 +96,7 @@ match settings.PARSER_PROVIDER:
         if not settings.ASSEMBLYAI_API_KEY:
             raise ValueError("AssemblyAI API key is required for combined parser")
         parser = CombinedParser(
+            use_unstructured_api=settings.USE_UNSTRUCTURED_API,
             unstructured_api_key=settings.UNSTRUCTURED_API_KEY,
             assemblyai_api_key=settings.ASSEMBLYAI_API_KEY,
             chunk_size=settings.CHUNK_SIZE,
@@ -103,7 +104,8 @@ match settings.PARSER_PROVIDER:
             frame_sample_rate=settings.FRAME_SAMPLE_RATE,
         )
     case "unstructured":
-        parser = UnstructuredAPIParser(
+        parser = UnstructuredParser(
+            use_api=settings.USE_UNSTRUCTURED_API,
             api_key=settings.UNSTRUCTURED_API_KEY,
             chunk_size=settings.CHUNK_SIZE,
             chunk_overlap=settings.CHUNK_OVERLAP,
@@ -112,6 +114,7 @@ match settings.PARSER_PROVIDER:
         if not settings.ANTHROPIC_API_KEY:
             raise ValueError("Anthropic API key is required for contextual parser")
         parser = ContextualParser(
+            use_unstructured_api=settings.USE_UNSTRUCTURED_API,
            unstructured_api_key=settings.UNSTRUCTURED_API_KEY,
             assemblyai_api_key=settings.ASSEMBLYAI_API_KEY,
             chunk_size=settings.CHUNK_SIZE,
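The three hunks above are from the module that builds the parser from settings (its filename is not preserved in this view; the imports point at the service's API entry module). Reassembled, the dispatch after this commit reads roughly as below. The case _ fallback and the chunk_overlap line in the combined branch are assumptions, since they sit outside the shown hunks:

    # Sketch of the dispatch; the settings fields are the ones visible above.
    match settings.PARSER_PROVIDER:
        case "combined":
            if not settings.ASSEMBLYAI_API_KEY:
                raise ValueError("AssemblyAI API key is required for combined parser")
            parser = CombinedParser(
                use_unstructured_api=settings.USE_UNSTRUCTURED_API,  # new flag
                unstructured_api_key=settings.UNSTRUCTURED_API_KEY,
                assemblyai_api_key=settings.ASSEMBLYAI_API_KEY,
                chunk_size=settings.CHUNK_SIZE,
                chunk_overlap=settings.CHUNK_OVERLAP,  # assumed, outside the hunk
                frame_sample_rate=settings.FRAME_SAMPLE_RATE,
            )
        case "unstructured":
            parser = UnstructuredParser(
                use_api=settings.USE_UNSTRUCTURED_API,  # new flag
                api_key=settings.UNSTRUCTURED_API_KEY,
                chunk_size=settings.CHUNK_SIZE,
                chunk_overlap=settings.CHUNK_OVERLAP,
            )
        case _:  # assumed fallback, not shown in the diff
            raise ValueError(f"Unknown parser provider: {settings.PARSER_PROVIDER}")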
@@ -12,8 +12,8 @@ class Settings(BaseSettings):
     # Required environment variables (referenced in config.toml)
     JWT_SECRET_KEY: str = Field(..., env="JWT_SECRET_KEY")
     MONGODB_URI: str = Field(..., env="MONGODB_URI")
-    UNSTRUCTURED_API_KEY: str = Field(..., env="UNSTRUCTURED_API_KEY")
-
+    UNSTRUCTURED_API_KEY: Optional[str] = Field(None, env="UNSTRUCTURED_API_KEY")
     AWS_ACCESS_KEY: Optional[str] = Field(None, env="AWS_ACCESS_KEY")
     AWS_SECRET_ACCESS_KEY: Optional[str] = Field(None, env="AWS_SECRET_ACCESS_KEY")
     ASSEMBLYAI_API_KEY: Optional[str] = Field(None, env="ASSEMBLYAI_API_KEY")
@@ -59,6 +59,7 @@ class Settings(BaseSettings):
     CHUNK_OVERLAP: int = 200
     DEFAULT_K: int = 4
     FRAME_SAMPLE_RATE: int = 120
+    USE_UNSTRUCTURED_API: bool = False

     # Auth settings
     JWT_ALGORITHM: str = "HS256"
@@ -108,6 +109,7 @@ def get_settings() -> Settings:
         "CHUNK_OVERLAP": config["processing"]["text"]["chunk_overlap"],
         "DEFAULT_K": config["processing"]["text"]["default_k"],
         "FRAME_SAMPLE_RATE": config["processing"]["video"]["frame_sample_rate"],
+        "USE_UNSTRUCTURED_API": config["processing"]["unstructured"]["use_api"],
         # Auth settings
         "JWT_ALGORITHM": config["auth"]["jwt_algorithm"],
     }
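These hunks are from core/config.py (the get_settings loader imported above). Making UNSTRUCTURED_API_KEY optional means a purely local deployment can boot with no third-party credentials; the invariant moves into the parser, which only demands the key when the API path is enabled. A compact sketch of that contract, using the same pydantic v1-style Field(env=...) as the hunk (class name and the final check are illustrative):

    from typing import Optional
    from pydantic import BaseSettings, Field  # pydantic v1 style, as in the hunk

    class DemoSettings(BaseSettings):
        UNSTRUCTURED_API_KEY: Optional[str] = Field(None, env="UNSTRUCTURED_API_KEY")
        USE_UNSTRUCTURED_API: bool = False

    settings = DemoSettings()
    # The key is only required once the API path is switched on:
    if settings.USE_UNSTRUCTURED_API and not settings.UNSTRUCTURED_API_KEY:
        raise ValueError("UNSTRUCTURED_API_KEY is required when use_api = true")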
@@ -13,7 +13,7 @@ class BaseParser(ABC):

     @abstractmethod
     async def parse_file(
-        self, file: bytes, content_type: str
+        self, file: bytes, content_type: str, filename: str
     ) -> Tuple[Dict[str, Any], List[Chunk]]:
         """Parse file content into text chunks"""
         pass
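This hunk is core/parser/base_parser.py. Because the abstract signature gains filename, every BaseParser implementation (including any out-of-tree subclass) must now accept it, even if unused; the parameter exists so local partitioning can fall back to the file extension when the content type is missing. A minimal conforming subclass, with Chunk stood in by a placeholder since only its name is visible here:

    from abc import ABC, abstractmethod
    from typing import Any, Dict, List, Tuple

    Chunk = Any  # placeholder; the real type is core.models.chunk.Chunk

    class BaseParser(ABC):  # sketch of the updated interface
        @abstractmethod
        async def parse_file(
            self, file: bytes, content_type: str, filename: str
        ) -> Tuple[Dict[str, Any], List[Chunk]]:
            """Parse file content into text chunks"""

    class EchoParser(BaseParser):  # hypothetical: shows the new contract
        async def parse_file(self, file, content_type, filename):
            return {"filename": filename, "content_type": content_type}, []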
@@ -6,7 +6,7 @@ import magic
 from core.models.chunk import Chunk

 from core.parser.base_parser import BaseParser
-from core.parser.unstructured_parser import UnstructuredAPIParser
+from core.parser.unstructured_parser import UnstructuredParser
 from core.parser.video.parse_video import VideoParser

 logger = logging.getLogger(__name__)
@@ -15,13 +15,15 @@ logger = logging.getLogger(__name__)
 class CombinedParser(BaseParser):
     def __init__(
         self,
+        use_unstructured_api: bool,
         unstructured_api_key: str,
         assemblyai_api_key: str,
         chunk_size: int,
         chunk_overlap: int,
         frame_sample_rate: int,
     ):
-        self.unstructured_parser = UnstructuredAPIParser(
+        self.unstructured_parser = UnstructuredParser(
+            use_api=use_unstructured_api,
             api_key=unstructured_api_key,
             chunk_size=chunk_size,
             chunk_overlap=chunk_overlap,
@@ -77,7 +79,7 @@ class CombinedParser(BaseParser):
         return await self.unstructured_parser.split_text(text)

     async def parse_file(
-        self, file: bytes, content_type: str
+        self, file: bytes, content_type: str, filename: str
     ) -> Tuple[Dict[str, Any], List[Chunk]]:
         """Parse file content into text chunks. Returns document metadata and a list of chunks"""
         is_video = self._is_video_file(file_bytes=file)
@@ -85,7 +87,7 @@ class CombinedParser(BaseParser):
         if is_video:
             return await self._parse_video(file)
         else:
-            return await self.unstructured_parser.parse_file(file, content_type)
+            return await self.unstructured_parser.parse_file(file, content_type, filename)

     async def _parse_video(self, file: bytes) -> Tuple[Dict[str, Any], List[Chunk]]:
         """Parse video file and combine transcript and frame descriptions into chunks"""
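These hunks are core/parser/combined_parser.py. With the flag threaded through, CombinedParser can be built for a fully local run; only the AssemblyAI key stays mandatory, for the video branch. A hypothetical construction and call (argument values and the file name are illustrative, the keywords are the ones in the hunks):

    import asyncio
    from core.parser.combined_parser import CombinedParser

    async def main() -> None:
        parser = CombinedParser(
            use_unstructured_api=False,    # new: partition in-process
            unstructured_api_key=None,     # unused on the local path
            assemblyai_api_key="aai-...",  # still required for video transcription
            chunk_size=1000,
            chunk_overlap=200,
            frame_sample_rate=120,
        )
        with open("report.pdf", "rb") as f:
            data = f.read()
        # filename is the new third argument, forwarded to the unstructured parser
        metadata, chunks = await parser.parse_file(data, "application/pdf", "report.pdf")
        print(len(chunks))

    asyncio.run(main())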
@@ -29,6 +29,7 @@ Answer only with the succinct context and nothing else.
 class ContextualParser(BaseParser):
     def __init__(
         self,
+        use_unstructured_api: bool,
         unstructured_api_key: str,
         assemblyai_api_key: str,
         chunk_size: int,
@@ -37,6 +38,7 @@ class ContextualParser(BaseParser):
         anthropic_api_key: str,
     ):
         self.combined_parser = CombinedParser(
+            use_unstructured_api=use_unstructured_api,
             unstructured_api_key=unstructured_api_key,
             assemblyai_api_key=assemblyai_api_key,
             chunk_size=chunk_size,
@@ -97,7 +99,7 @@ class ContextualParser(BaseParser):
         return new_chunks

     async def parse_file(
-        self, file: bytes, content_type: str
+        self, file: bytes, content_type: str, filename: str
     ) -> Tuple[Dict[str, Any], List[Chunk]]:
         document_metadata, chunks = await self.combined_parser.parse_file(file, content_type)
         document_text = "\n".join([chunk.content for chunk in chunks])
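These hunks are the ContextualParser module (its path is not shown in this view). One detail worth flagging: the context line above still calls self.combined_parser.parse_file(file, content_type) with two arguments, while CombinedParser.parse_file now requires filename; presumably the inner call needs the argument threaded through as well. A hedged sketch of how that delegation would look, not what the diff shows:

    # Sketch only: the inner call with filename forwarded.
    async def parse_file(self, file: bytes, content_type: str, filename: str):
        document_metadata, chunks = await self.combined_parser.parse_file(
            file, content_type, filename
        )
        document_text = "\n".join(chunk.content for chunk in chunks)
        return document_metadata, chunks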
@@ -10,13 +10,18 @@ from .base_parser import BaseParser
 logger = logging.getLogger(__name__)


-class UnstructuredAPIParser(BaseParser):
+class UnstructuredParser(BaseParser):
     def __init__(
         self,
+        use_api: bool,
         api_key: str,
         chunk_size: int,
         chunk_overlap: int,
     ):
+        if use_api and not api_key:
+            logger.error("API key is required if use_api is set to True")
+            raise ValueError("[UnstructuredParser] API key is required if use_api is True")
+        self.use_api = use_api
         self.api_key = api_key
         self.text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=chunk_size,
@@ -30,14 +35,15 @@ class UnstructuredAPIParser(BaseParser):
         return [Chunk(content=chunk, metadata={}) for chunk in self.text_splitter.split_text(text)]

     async def parse_file(
-        self, file: bytes, content_type: str
+        self, file: bytes, content_type: str, filename: str
     ) -> Tuple[Dict[str, Any], List[Chunk]]:
         """Parse file content using unstructured"""
         # Parse with unstructured
         loader = UnstructuredLoader(
             file=io.BytesIO(file),
-            partition_via_api=True,
-            api_key=self.api_key,
+            partition_via_api=self.use_api,
+            api_key=self.api_key if self.use_api else None,
+            metadata_filename=None if self.use_api else filename,
             chunking_strategy="by_title",
         )
         elements = loader.load()
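These hunks, from core/parser/unstructured_parser.py, are the core of the change: UnstructuredLoader now partitions in-process by default, and metadata_filename gives the local partitioner a file-type hint, since no API upload carries the name. A small factory capturing the same call shape; the import path is assumed (the actual import statement sits outside the hunk), and the function name and defaults are illustrative:

    import io
    from typing import Optional

    from langchain_unstructured import UnstructuredLoader  # assumed import path

    def make_loader(
        file: bytes, filename: str, use_api: bool = False, api_key: Optional[str] = None
    ) -> UnstructuredLoader:
        # Local partitioning by default; the Unstructured API only when enabled.
        return UnstructuredLoader(
            file=io.BytesIO(file),
            partition_via_api=use_api,
            api_key=api_key if use_api else None,
            metadata_filename=None if use_api else filename,
            chunking_strategy="by_title",
        )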
@@ -162,7 +162,7 @@ class DocumentService:

         file_content = await file.read()
         additional_metadata, chunks = await self.parser.parse_file(
-            file_content, file.content_type or ""
+            file_content, file.content_type or "", file.filename
         )

         doc = Document(
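This hunk is core/services/document_service.py. Upstream of the parser, the service now forwards the upload's filename, which matters because an upload's content type is often missing or a generic application/octet-stream, leaving the extension as the only type signal for local partitioning. A hypothetical helper making that normalization explicit, assuming the file object is a FastAPI/Starlette UploadFile as the attribute names suggest:

    from fastapi import UploadFile

    async def read_upload(file: UploadFile) -> tuple[bytes, str, str]:
        # Mirror the call site: empty content types become "", and the filename
        # travels alongside so the parser can fall back to the extension.
        content = await file.read()
        return content, file.content_type or "", file.filename or ""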
@@ -21,12 +21,13 @@ args = parser.parse_args()

 # Configure logging based on command line arguments
 LOGGER = logging.getLogger(__name__)
-if args.debug:
-    LOGGER.setLevel(logging.DEBUG)
-elif args.quiet:
-    LOGGER.setLevel(logging.WARNING)
-else:
-    LOGGER.setLevel(logging.INFO)
+match (args.debug, args.quiet):
+    case (True, _):
+        LOGGER.setLevel(logging.DEBUG)
+    case (_, True):
+        LOGGER.setLevel(logging.WARNING)
+    case _:
+        LOGGER.setLevel(logging.INFO)

 # Add console handler with formatting
 console_handler = logging.StreamHandler()
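This hunk is from the server start-up script (unnamed in this view). The refactor is behavior-preserving: --debug outranks --quiet in both versions, and the truth table is easier to audit as a match statement. A self-contained equivalent, with the function wrapper added for demonstration:

    import logging

    LOGGER = logging.getLogger(__name__)

    def set_level(debug: bool, quiet: bool) -> None:
        # Same precedence as the hunk: debug beats quiet, default is INFO.
        match (debug, quiet):
            case (True, _):
                LOGGER.setLevel(logging.DEBUG)
            case (_, True):
                LOGGER.setLevel(logging.WARNING)
            case _:
                LOGGER.setLevel(logging.INFO)

    set_level(debug=False, quiet=True)
    assert LOGGER.level == logging.WARNING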
shell.py: 3 lines changed
@@ -32,7 +32,8 @@ class DB:
         if "localhost" in uri or "127.0.0.1" in uri:
             uri = uri.replace("databridge://", "http://")
         self.uri = uri
-        self._client = DataBridge(self.uri, is_local="localhost" in uri or "127.0.0.1" in uri)
+        is_local = "localhost" in uri or "127.0.0.1" in uri
+        self._client = DataBridge(self.uri, is_local=is_local, timeout=1000)

     def ingest_text(self, content: str, metadata: dict = None) -> dict:
         """Ingest text content into DataBridge"""
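The shell.py change hoists the locality test into a variable and, more importantly, passes timeout=1000 to the DataBridge client so slow local parses of large files do not abort the request early (the diff does not state the unit the SDK expects). Hypothetical usage of the DB wrapper above, with an illustrative local URI:

    # Illustrative only: a local databridge:// URI is rewritten to http://
    # and the client is created with is_local=True and the long timeout.
    db = DB("databridge://localhost:8000")
    result = db.ingest_text("hello world", metadata={"source": "demo"})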