use local unstructured by default (#12)

Arnav Agrawal 2025-01-01 09:18:23 -05:00 committed by GitHub
parent abccf99974
commit 48e6aeb8b7
10 changed files with 49 additions and 29 deletions

View File

@@ -8,9 +8,9 @@ reload = false
 storage = "local" # "aws-s3"
 database = "mongodb"
 vector_store = "mongodb"
-embedding = "openai" # "ollama"
-completion = "openai" # "ollama"
-parser = "combined" # "unstructured", "contextual"
+embedding = "ollama" # "openai", "ollama"
+completion = "ollama" # "openai", "ollama"
+parser = "combined" # "combined", "unstructured", "contextual"
 
 # Storage Configuration
 [storage.local]
@@ -22,23 +22,23 @@ bucket_name = "databridge-s3-storage"
 
 # Database Configuration
 [database.mongodb]
-database_name = "DataBridgeTest"
+database_name = "databridge"
 documents_collection = "documents"
 chunks_collection = "document_chunks"
 
 # Vector Store Configuration
 [vector_store.mongodb]
-dimensions = 1536 # 768 for nomic-embed-text
+dimensions = 768 # 768 for nomic-embed-text, 1536 for text-embedding-3-small
 index_name = "vector_index"
-similarity_metric = "dotProduct"
+similarity_metric = "cosine"
 
 # Model Configurations
 [models]
 
 [models.embedding]
-model_name = "text-embedding-3-small" # "nomic-embed-text"
+model_name = "nomic-embed-text" # "text-embedding-3-small", "nomic-embed-text"
 
 [models.completion]
-model_name = "gpt-4o-mini" # "llama3.1"
+model_name = "llama3.1" # "gpt-4o-mini", "llama3.1", etc.
 default_max_tokens = 1000
 default_temperature = 0.7
@@ -55,6 +55,9 @@ default_k = 4
 [processing.video]
 frame_sample_rate = 120
 
+[processing.unstructured]
+use_api = false
+
 # Authentication
 [auth]
 jwt_algorithm = "HS256"
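
The config now defaults to a fully local stack: Ollama models, a 768-dimension cosine index to match nomic-embed-text, and the open-source unstructured library instead of the hosted API. A minimal sketch of reading the new [processing.unstructured] flag, assuming Python 3.11+ (where tomllib is stdlib) and a config.toml in the working directory:

import tomllib

with open("config.toml", "rb") as f:
    config = tomllib.load(f)

# False by default, so documents are partitioned by the local
# unstructured library rather than the hosted Unstructured API.
use_api = config["processing"]["unstructured"]["use_api"]
print(f"partition via hosted API: {use_api}")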

View File

@@ -17,7 +17,7 @@ from core.models.documents import Document, DocumentResult, ChunkResult
 from core.models.auth import AuthContext, EntityType
 from core.parser.combined_parser import CombinedParser
 from core.completion.base_completion import CompletionResponse
-from core.parser.unstructured_parser import UnstructuredAPIParser
+from core.parser.unstructured_parser import UnstructuredParser
 from core.services.document_service import DocumentService
 from core.services.telemetry import TelemetryService
 from core.config import get_settings
@@ -96,6 +96,7 @@ match settings.PARSER_PROVIDER:
         if not settings.ASSEMBLYAI_API_KEY:
             raise ValueError("AssemblyAI API key is required for combined parser")
         parser = CombinedParser(
+            use_unstructured_api=settings.USE_UNSTRUCTURED_API,
             unstructured_api_key=settings.UNSTRUCTURED_API_KEY,
             assemblyai_api_key=settings.ASSEMBLYAI_API_KEY,
             chunk_size=settings.CHUNK_SIZE,
@@ -103,7 +104,8 @@ match settings.PARSER_PROVIDER:
             frame_sample_rate=settings.FRAME_SAMPLE_RATE,
         )
     case "unstructured":
-        parser = UnstructuredAPIParser(
+        parser = UnstructuredParser(
+            use_api=settings.USE_UNSTRUCTURED_API,
             api_key=settings.UNSTRUCTURED_API_KEY,
             chunk_size=settings.CHUNK_SIZE,
             chunk_overlap=settings.CHUNK_OVERLAP,
@@ -112,6 +114,7 @@ match settings.PARSER_PROVIDER:
         if not settings.ANTHROPIC_API_KEY:
             raise ValueError("Anthropic API key is required for contextual parser")
         parser = ContextualParser(
+            use_unstructured_api=settings.USE_UNSTRUCTURED_API,
             unstructured_api_key=settings.UNSTRUCTURED_API_KEY,
             assemblyai_api_key=settings.ASSEMBLYAI_API_KEY,
             chunk_size=settings.CHUNK_SIZE,
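
Every branch of the provider match now threads the API flag through to the parser it builds. A sketch of the simplest construction, the "unstructured" provider, with illustrative values standing in for the real Settings (only chunk_overlap=200 is visible elsewhere in this diff; chunk_size here is an assumption):

from core.parser.unstructured_parser import UnstructuredParser

# Illustrative values; the real ones come from Settings / config.toml.
parser = UnstructuredParser(
    use_api=False,     # local partitioning, the new default
    api_key=None,      # only needed (and validated) when use_api=True
    chunk_size=1000,   # assumed; CHUNK_SIZE's default is not shown here
    chunk_overlap=200,
)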

View File

@@ -12,8 +12,8 @@ class Settings(BaseSettings):
 
     # Required environment variables (referenced in config.toml)
     JWT_SECRET_KEY: str = Field(..., env="JWT_SECRET_KEY")
     MONGODB_URI: str = Field(..., env="MONGODB_URI")
-    UNSTRUCTURED_API_KEY: str = Field(..., env="UNSTRUCTURED_API_KEY")
+    UNSTRUCTURED_API_KEY: Optional[str] = Field(None, env="UNSTRUCTURED_API_KEY")
     AWS_ACCESS_KEY: Optional[str] = Field(None, env="AWS_ACCESS_KEY")
     AWS_SECRET_ACCESS_KEY: Optional[str] = Field(None, env="AWS_SECRET_ACCESS_KEY")
     ASSEMBLYAI_API_KEY: Optional[str] = Field(None, env="ASSEMBLYAI_API_KEY")
@@ -59,6 +59,7 @@ class Settings(BaseSettings):
     CHUNK_OVERLAP: int = 200
     DEFAULT_K: int = 4
     FRAME_SAMPLE_RATE: int = 120
+    USE_UNSTRUCTURED_API: bool = False
 
     # Auth settings
     JWT_ALGORITHM: str = "HS256"
@@ -108,6 +109,7 @@ def get_settings() -> Settings:
         "CHUNK_OVERLAP": config["processing"]["text"]["chunk_overlap"],
         "DEFAULT_K": config["processing"]["text"]["default_k"],
         "FRAME_SAMPLE_RATE": config["processing"]["video"]["frame_sample_rate"],
+        "USE_UNSTRUCTURED_API": config["processing"]["unstructured"]["use_api"],
         # Auth settings
         "JWT_ALGORITHM": config["auth"]["jwt_algorithm"],
     }
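
UNSTRUCTURED_API_KEY is no longer required at startup; it only has to be present when the API path is actually enabled. A small sketch of the invariant the parsers enforce (hypothetical helper, not code from this repo):

from typing import Optional

def check_unstructured_settings(use_api: bool, api_key: Optional[str]) -> None:
    # Mirrors UnstructuredParser.__init__: a missing key is fine
    # as long as the hosted API stays disabled.
    if use_api and not api_key:
        raise ValueError("UNSTRUCTURED_API_KEY must be set when use_api is true")

check_unstructured_settings(False, None)       # ok: local parsing
check_unstructured_settings(True, "unst-key")  # ok: hosted API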

View File

@@ -13,7 +13,7 @@ class BaseParser(ABC):
 
     @abstractmethod
     async def parse_file(
-        self, file: bytes, content_type: str
+        self, file: bytes, content_type: str, filename: str
     ) -> Tuple[Dict[str, Any], List[Chunk]]:
         """Parse file content into text chunks"""
         pass
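
Every subclass must now accept the filename argument, even if it ignores it. A hypothetical minimal implementation of the updated interface (EchoParser is invented for illustration; Chunk and BaseParser are the repo's own classes, and split_text is included on the assumption that the ABC declares it, as the combined parser's usage elsewhere suggests):

from typing import Any, Dict, List, Tuple

from core.models.chunk import Chunk
from core.parser.base_parser import BaseParser

class EchoParser(BaseParser):
    async def split_text(self, text: str) -> List[Chunk]:
        # One chunk per call; real parsers delegate to a text splitter.
        return [Chunk(content=text, metadata={})]

    async def parse_file(
        self, file: bytes, content_type: str, filename: str
    ) -> Tuple[Dict[str, Any], List[Chunk]]:
        text = file.decode("utf-8", errors="replace")
        metadata = {"filename": filename, "content_type": content_type}
        return metadata, [Chunk(content=text, metadata={"filename": filename})]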

View File

@@ -6,7 +6,7 @@ import magic
 
 from core.models.chunk import Chunk
 from core.parser.base_parser import BaseParser
-from core.parser.unstructured_parser import UnstructuredAPIParser
+from core.parser.unstructured_parser import UnstructuredParser
 from core.parser.video.parse_video import VideoParser
 
 logger = logging.getLogger(__name__)
@@ -15,13 +15,15 @@ logger = logging.getLogger(__name__)
 class CombinedParser(BaseParser):
     def __init__(
         self,
+        use_unstructured_api: bool,
         unstructured_api_key: str,
         assemblyai_api_key: str,
         chunk_size: int,
         chunk_overlap: int,
         frame_sample_rate: int,
     ):
-        self.unstructured_parser = UnstructuredAPIParser(
+        self.unstructured_parser = UnstructuredParser(
+            use_api=use_unstructured_api,
             api_key=unstructured_api_key,
             chunk_size=chunk_size,
             chunk_overlap=chunk_overlap,
@@ -77,7 +79,7 @@ class CombinedParser(BaseParser):
         return await self.unstructured_parser.split_text(text)
 
     async def parse_file(
-        self, file: bytes, content_type: str
+        self, file: bytes, content_type: str, filename: str
     ) -> Tuple[Dict[str, Any], List[Chunk]]:
         """Parse file content into text chunks. Returns document metadata and a list of chunks"""
         is_video = self._is_video_file(file_bytes=file)
@@ -85,7 +87,7 @@ class CombinedParser(BaseParser):
         if is_video:
             return await self._parse_video(file)
         else:
-            return await self.unstructured_parser.parse_file(file, content_type)
+            return await self.unstructured_parser.parse_file(file, content_type, filename)
 
     async def _parse_video(self, file: bytes) -> Tuple[Dict[str, Any], List[Chunk]]:
         """Parse video file and combine transcript and frame descriptions into chunks"""

View File

@@ -29,6 +29,7 @@ Answer only with the succinct context and nothing else.
 class ContextualParser(BaseParser):
     def __init__(
         self,
+        use_unstructured_api: bool,
         unstructured_api_key: str,
         assemblyai_api_key: str,
         chunk_size: int,
@@ -37,6 +38,7 @@ class ContextualParser(BaseParser):
         anthropic_api_key: str,
     ):
         self.combined_parser = CombinedParser(
+            use_unstructured_api=use_unstructured_api,
             unstructured_api_key=unstructured_api_key,
             assemblyai_api_key=assemblyai_api_key,
             chunk_size=chunk_size,
@@ -97,7 +99,7 @@ class ContextualParser(BaseParser):
         return new_chunks
 
     async def parse_file(
-        self, file: bytes, content_type: str
+        self, file: bytes, content_type: str, filename: str
     ) -> Tuple[Dict[str, Any], List[Chunk]]:
         document_metadata, chunks = await self.combined_parser.parse_file(file, content_type)
         document_text = "\n".join([chunk.content for chunk in chunks])

View File

@@ -10,13 +10,18 @@ from .base_parser import BaseParser
 
 logger = logging.getLogger(__name__)
 
-class UnstructuredAPIParser(BaseParser):
+class UnstructuredParser(BaseParser):
     def __init__(
         self,
+        use_api: bool,
         api_key: str,
         chunk_size: int,
         chunk_overlap: int,
     ):
+        if use_api and not api_key:
+            logger.error("API key is required if use_api is set to True")
+            raise ValueError("[UnstructuredParser] API key is required if use_api is True")
+        self.use_api = use_api
         self.api_key = api_key
         self.text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=chunk_size,
@@ -30,14 +35,15 @@ class UnstructuredAPIParser(BaseParser):
         return [Chunk(content=chunk, metadata={}) for chunk in self.text_splitter.split_text(text)]
 
     async def parse_file(
-        self, file: bytes, content_type: str
+        self, file: bytes, content_type: str, filename: str
     ) -> Tuple[Dict[str, Any], List[Chunk]]:
         """Parse file content using unstructured"""
         # Parse with unstructured
         loader = UnstructuredLoader(
             file=io.BytesIO(file),
-            partition_via_api=True,
-            api_key=self.api_key,
+            partition_via_api=self.use_api,
+            api_key=self.api_key if self.use_api else None,
+            metadata_filename=None if self.use_api else filename,
             chunking_strategy="by_title",
         )
         elements = loader.load()
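
The loader toggle is the heart of the change: partition_via_api now follows use_api, the key is dropped in local mode, and the filename is passed through so unstructured can infer the file type when partitioning locally. The two modes side by side, assuming the langchain_unstructured package that provides the UnstructuredLoader used above (bytes and key are placeholders):

import io

from langchain_unstructured import UnstructuredLoader

data = b"%PDF-1.4 ..."  # placeholder document bytes

# Local mode, the new default: no API key, filename hints the file type.
local_loader = UnstructuredLoader(
    file=io.BytesIO(data),
    partition_via_api=False,
    api_key=None,
    metadata_filename="report.pdf",
    chunking_strategy="by_title",
)

# Hosted mode: opt in via use_api = true plus UNSTRUCTURED_API_KEY.
api_loader = UnstructuredLoader(
    file=io.BytesIO(data),
    partition_via_api=True,
    api_key="<unstructured-api-key>",
    chunking_strategy="by_title",
)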

View File

@@ -162,7 +162,7 @@ class DocumentService:
 
         file_content = await file.read()
         additional_metadata, chunks = await self.parser.parse_file(
-            file_content, file.content_type or ""
+            file_content, file.content_type or "", file.filename
         )
 
         doc = Document(

View File

@@ -21,12 +21,13 @@ args = parser.parse_args()
 
 # Configure logging based on command line arguments
 LOGGER = logging.getLogger(__name__)
-if args.debug:
-    LOGGER.setLevel(logging.DEBUG)
-elif args.quiet:
-    LOGGER.setLevel(logging.WARNING)
-else:
-    LOGGER.setLevel(logging.INFO)
+match (args.debug, args.quiet):
+    case (True, _):
+        LOGGER.setLevel(logging.DEBUG)
+    case (_, True):
+        LOGGER.setLevel(logging.WARNING)
+    case _:
+        LOGGER.setLevel(logging.INFO)
 
 # Add console handler with formatting
 console_handler = logging.StreamHandler()
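
The rewritten block matches on the (debug, quiet) tuple; cases are tried top to bottom, so --debug wins when both flags are given, exactly like the old if/elif chain. A self-contained check of that precedence:

import logging

def level_for(debug: bool, quiet: bool) -> int:
    # Same precedence as above: debug beats quiet, default is INFO.
    match (debug, quiet):
        case (True, _):
            return logging.DEBUG
        case (_, True):
            return logging.WARNING
        case _:
            return logging.INFO

assert level_for(True, True) == logging.DEBUG  # first matching case wins
assert level_for(False, True) == logging.WARNING
assert level_for(False, False) == logging.INFO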

View File

@@ -32,7 +32,8 @@ class DB:
         if "localhost" in uri or "127.0.0.1" in uri:
             uri = uri.replace("databridge://", "http://")
         self.uri = uri
-        self._client = DataBridge(self.uri, is_local="localhost" in uri or "127.0.0.1" in uri)
+        is_local = "localhost" in uri or "127.0.0.1" in uri
+        self._client = DataBridge(self.uri, is_local=is_local, timeout=1000)
 
     def ingest_text(self, content: str, metadata: dict = None) -> dict:
         """Ingest text content into DataBridge"""