use local unstructured by default (#12)

Arnav Agrawal 2025-01-01 09:18:23 -05:00 committed by GitHub
parent abccf99974
commit 48e6aeb8b7
10 changed files with 49 additions and 29 deletions

View File

@@ -8,9 +8,9 @@ reload = false
 storage = "local" # "aws-s3"
 database = "mongodb"
 vector_store = "mongodb"
-embedding = "openai" # "ollama"
-completion = "openai" # "ollama"
-parser = "combined" # "unstructured", "contextual"
+embedding = "ollama" # "openai", "ollama"
+completion = "ollama" # "openai", "ollama"
+parser = "combined" # "combined", "unstructured", "contextual"
 
 # Storage Configuration
 [storage.local]
@@ -22,23 +22,23 @@ bucket_name = "databridge-s3-storage"
 
 # Database Configuration
 [database.mongodb]
-database_name = "DataBridgeTest"
+database_name = "databridge"
 documents_collection = "documents"
 chunks_collection = "document_chunks"
 
 # Vector Store Configuration
 [vector_store.mongodb]
-dimensions = 1536 # 768 for nomic-embed-text
+dimensions = 768 # 768 for nomic-embed-text, 1536 for text-embedding-3-small
 index_name = "vector_index"
-similarity_metric = "dotProduct"
+similarity_metric = "cosine"
 
 # Model Configurations
 [models]
 
 [models.embedding]
-model_name = "text-embedding-3-small" # "nomic-embed-text"
+model_name = "nomic-embed-text" # "text-embedding-3-small", "nomic-embed-text"
 
 [models.completion]
-model_name = "gpt-4o-mini" # "llama3.1"
+model_name = "llama3.1" # "gpt-4o-mini", "llama3.1", etc.
 default_max_tokens = 1000
 default_temperature = 0.7
@@ -55,6 +55,9 @@ default_k = 4
 [processing.video]
 frame_sample_rate = 120
 
+[processing.unstructured]
+use_api = false
+
 # Authentication
 [auth]
 jwt_algorithm = "HS256"
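
The config now defaults to a fully local stack: Ollama models, a 768-dimension cosine index to match nomic-embed-text, and the open-source unstructured library instead of the hosted API. A minimal sketch of reading the new [processing.unstructured] flag, assuming Python 3.11+ (where tomllib is stdlib) and a config.toml in the working directory:

import tomllib

with open("config.toml", "rb") as f:
    config = tomllib.load(f)

# False by default, so documents are partitioned by the local
# unstructured library rather than the hosted Unstructured API.
use_api = config["processing"]["unstructured"]["use_api"]
print(f"partition via hosted API: {use_api}")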

View File

@@ -17,7 +17,7 @@ from core.models.documents import Document, DocumentResult, ChunkResult
 from core.models.auth import AuthContext, EntityType
 from core.parser.combined_parser import CombinedParser
 from core.completion.base_completion import CompletionResponse
-from core.parser.unstructured_parser import UnstructuredAPIParser
+from core.parser.unstructured_parser import UnstructuredParser
 from core.services.document_service import DocumentService
 from core.services.telemetry import TelemetryService
 from core.config import get_settings
@@ -96,6 +96,7 @@ match settings.PARSER_PROVIDER:
         if not settings.ASSEMBLYAI_API_KEY:
             raise ValueError("AssemblyAI API key is required for combined parser")
         parser = CombinedParser(
+            use_unstructured_api=settings.USE_UNSTRUCTURED_API,
             unstructured_api_key=settings.UNSTRUCTURED_API_KEY,
             assemblyai_api_key=settings.ASSEMBLYAI_API_KEY,
             chunk_size=settings.CHUNK_SIZE,
@@ -103,7 +104,8 @@ match settings.PARSER_PROVIDER:
             frame_sample_rate=settings.FRAME_SAMPLE_RATE,
         )
     case "unstructured":
-        parser = UnstructuredAPIParser(
+        parser = UnstructuredParser(
+            use_api=settings.USE_UNSTRUCTURED_API,
             api_key=settings.UNSTRUCTURED_API_KEY,
             chunk_size=settings.CHUNK_SIZE,
             chunk_overlap=settings.CHUNK_OVERLAP,
@@ -112,6 +114,7 @@ match settings.PARSER_PROVIDER:
         if not settings.ANTHROPIC_API_KEY:
             raise ValueError("Anthropic API key is required for contextual parser")
         parser = ContextualParser(
+            use_unstructured_api=settings.USE_UNSTRUCTURED_API,
             unstructured_api_key=settings.UNSTRUCTURED_API_KEY,
             assemblyai_api_key=settings.ASSEMBLYAI_API_KEY,
             chunk_size=settings.CHUNK_SIZE,
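
Every branch of the provider match now threads the API flag through to the parser it builds. A sketch of the simplest construction, the "unstructured" provider, with illustrative values standing in for the real Settings (only chunk_overlap=200 is visible elsewhere in this diff; chunk_size here is an assumption):

from core.parser.unstructured_parser import UnstructuredParser

# Illustrative values; the real ones come from Settings / config.toml.
parser = UnstructuredParser(
    use_api=False,     # local partitioning, the new default
    api_key=None,      # only needed (and validated) when use_api=True
    chunk_size=1000,   # assumed; CHUNK_SIZE's default is not shown here
    chunk_overlap=200,
)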

View File

@@ -12,8 +12,8 @@ class Settings(BaseSettings):
 
     # Required environment variables (referenced in config.toml)
     JWT_SECRET_KEY: str = Field(..., env="JWT_SECRET_KEY")
     MONGODB_URI: str = Field(..., env="MONGODB_URI")
-    UNSTRUCTURED_API_KEY: str = Field(..., env="UNSTRUCTURED_API_KEY")
+    UNSTRUCTURED_API_KEY: Optional[str] = Field(None, env="UNSTRUCTURED_API_KEY")
     AWS_ACCESS_KEY: Optional[str] = Field(None, env="AWS_ACCESS_KEY")
     AWS_SECRET_ACCESS_KEY: Optional[str] = Field(None, env="AWS_SECRET_ACCESS_KEY")
     ASSEMBLYAI_API_KEY: Optional[str] = Field(None, env="ASSEMBLYAI_API_KEY")
@@ -59,6 +59,7 @@ class Settings(BaseSettings):
     CHUNK_OVERLAP: int = 200
     DEFAULT_K: int = 4
     FRAME_SAMPLE_RATE: int = 120
+    USE_UNSTRUCTURED_API: bool = False
 
     # Auth settings
     JWT_ALGORITHM: str = "HS256"
@@ -108,6 +109,7 @@ def get_settings() -> Settings:
         "CHUNK_OVERLAP": config["processing"]["text"]["chunk_overlap"],
         "DEFAULT_K": config["processing"]["text"]["default_k"],
         "FRAME_SAMPLE_RATE": config["processing"]["video"]["frame_sample_rate"],
+        "USE_UNSTRUCTURED_API": config["processing"]["unstructured"]["use_api"],
         # Auth settings
         "JWT_ALGORITHM": config["auth"]["jwt_algorithm"],
     }
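
UNSTRUCTURED_API_KEY is no longer required at startup; it only has to be present when the API path is actually enabled. A small sketch of the invariant the parsers enforce (hypothetical helper, not code from this repo):

from typing import Optional

def check_unstructured_settings(use_api: bool, api_key: Optional[str]) -> None:
    # Mirrors UnstructuredParser.__init__: a missing key is fine
    # as long as the hosted API stays disabled.
    if use_api and not api_key:
        raise ValueError("UNSTRUCTURED_API_KEY must be set when use_api is true")

check_unstructured_settings(False, None)       # ok: local parsing
check_unstructured_settings(True, "unst-key")  # ok: hosted API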

View File

@@ -13,7 +13,7 @@ class BaseParser(ABC):
 
     @abstractmethod
     async def parse_file(
-        self, file: bytes, content_type: str
+        self, file: bytes, content_type: str, filename: str
     ) -> Tuple[Dict[str, Any], List[Chunk]]:
         """Parse file content into text chunks"""
         pass
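
Every subclass must now accept the filename argument, even if it ignores it. A hypothetical minimal implementation of the updated interface (EchoParser is invented for illustration; Chunk and BaseParser are the repo's own classes, and split_text is included on the assumption that the ABC declares it, as the combined parser's usage elsewhere suggests):

from typing import Any, Dict, List, Tuple

from core.models.chunk import Chunk
from core.parser.base_parser import BaseParser

class EchoParser(BaseParser):
    async def split_text(self, text: str) -> List[Chunk]:
        # One chunk per call; real parsers delegate to a text splitter.
        return [Chunk(content=text, metadata={})]

    async def parse_file(
        self, file: bytes, content_type: str, filename: str
    ) -> Tuple[Dict[str, Any], List[Chunk]]:
        text = file.decode("utf-8", errors="replace")
        metadata = {"filename": filename, "content_type": content_type}
        return metadata, [Chunk(content=text, metadata={"filename": filename})]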

View File

@@ -6,7 +6,7 @@ import magic
 
 from core.models.chunk import Chunk
 from core.parser.base_parser import BaseParser
-from core.parser.unstructured_parser import UnstructuredAPIParser
+from core.parser.unstructured_parser import UnstructuredParser
 from core.parser.video.parse_video import VideoParser
 
 logger = logging.getLogger(__name__)
@@ -15,13 +15,15 @@ logger = logging.getLogger(__name__)
 class CombinedParser(BaseParser):
     def __init__(
         self,
+        use_unstructured_api: bool,
         unstructured_api_key: str,
         assemblyai_api_key: str,
         chunk_size: int,
         chunk_overlap: int,
         frame_sample_rate: int,
     ):
-        self.unstructured_parser = UnstructuredAPIParser(
+        self.unstructured_parser = UnstructuredParser(
+            use_api=use_unstructured_api,
             api_key=unstructured_api_key,
             chunk_size=chunk_size,
             chunk_overlap=chunk_overlap,
@@ -77,7 +79,7 @@ class CombinedParser(BaseParser):
         return await self.unstructured_parser.split_text(text)
 
     async def parse_file(
-        self, file: bytes, content_type: str
+        self, file: bytes, content_type: str, filename: str
     ) -> Tuple[Dict[str, Any], List[Chunk]]:
         """Parse file content into text chunks. Returns document metadata and a list of chunks"""
         is_video = self._is_video_file(file_bytes=file)
@@ -85,7 +87,7 @@ class CombinedParser(BaseParser):
         if is_video:
             return await self._parse_video(file)
         else:
-            return await self.unstructured_parser.parse_file(file, content_type)
+            return await self.unstructured_parser.parse_file(file, content_type, filename)
 
     async def _parse_video(self, file: bytes) -> Tuple[Dict[str, Any], List[Chunk]]:
         """Parse video file and combine transcript and frame descriptions into chunks"""

View File

@@ -29,6 +29,7 @@ Answer only with the succinct context and nothing else.
 class ContextualParser(BaseParser):
     def __init__(
         self,
+        use_unstructured_api: bool,
         unstructured_api_key: str,
         assemblyai_api_key: str,
         chunk_size: int,
@@ -37,6 +38,7 @@ class ContextualParser(BaseParser):
         anthropic_api_key: str,
     ):
         self.combined_parser = CombinedParser(
+            use_unstructured_api=use_unstructured_api,
             unstructured_api_key=unstructured_api_key,
             assemblyai_api_key=assemblyai_api_key,
             chunk_size=chunk_size,
@@ -97,7 +99,7 @@ class ContextualParser(BaseParser):
         return new_chunks
 
     async def parse_file(
-        self, file: bytes, content_type: str
+        self, file: bytes, content_type: str, filename: str
     ) -> Tuple[Dict[str, Any], List[Chunk]]:
         document_metadata, chunks = await self.combined_parser.parse_file(file, content_type)
         document_text = "\n".join([chunk.content for chunk in chunks])

View File

@@ -10,13 +10,18 @@ from .base_parser import BaseParser
 
 logger = logging.getLogger(__name__)
 
-class UnstructuredAPIParser(BaseParser):
+class UnstructuredParser(BaseParser):
     def __init__(
         self,
+        use_api: bool,
         api_key: str,
         chunk_size: int,
         chunk_overlap: int,
     ):
+        if use_api and not api_key:
+            logger.error("API key is required if use_api is set to True")
+            raise ValueError("[UnstructuredParser] API key is required if use_api is True")
+        self.use_api = use_api
         self.api_key = api_key
         self.text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=chunk_size,
@@ -30,14 +35,15 @@ class UnstructuredAPIParser(BaseParser):
         return [Chunk(content=chunk, metadata={}) for chunk in self.text_splitter.split_text(text)]
 
     async def parse_file(
-        self, file: bytes, content_type: str
+        self, file: bytes, content_type: str, filename: str
     ) -> Tuple[Dict[str, Any], List[Chunk]]:
         """Parse file content using unstructured"""
         # Parse with unstructured
         loader = UnstructuredLoader(
             file=io.BytesIO(file),
-            partition_via_api=True,
-            api_key=self.api_key,
+            partition_via_api=self.use_api,
+            api_key=self.api_key if self.use_api else None,
+            metadata_filename=None if self.use_api else filename,
             chunking_strategy="by_title",
         )
         elements = loader.load()
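
The loader toggle is the heart of the change: partition_via_api now follows use_api, the key is dropped in local mode, and the filename is passed through so unstructured can infer the file type when partitioning locally. The two modes side by side, assuming the langchain_unstructured package that provides the UnstructuredLoader used above (bytes and key are placeholders):

import io

from langchain_unstructured import UnstructuredLoader

data = b"%PDF-1.4 ..."  # placeholder document bytes

# Local mode, the new default: no API key, filename hints the file type.
local_loader = UnstructuredLoader(
    file=io.BytesIO(data),
    partition_via_api=False,
    api_key=None,
    metadata_filename="report.pdf",
    chunking_strategy="by_title",
)

# Hosted mode: opt in via use_api = true plus UNSTRUCTURED_API_KEY.
api_loader = UnstructuredLoader(
    file=io.BytesIO(data),
    partition_via_api=True,
    api_key="<unstructured-api-key>",
    chunking_strategy="by_title",
)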

View File

@@ -162,7 +162,7 @@ class DocumentService:
 
         file_content = await file.read()
         additional_metadata, chunks = await self.parser.parse_file(
-            file_content, file.content_type or ""
+            file_content, file.content_type or "", file.filename
         )
 
         doc = Document(

View File

@@ -21,12 +21,13 @@ args = parser.parse_args()
 
 # Configure logging based on command line arguments
 LOGGER = logging.getLogger(__name__)
-if args.debug:
-    LOGGER.setLevel(logging.DEBUG)
-elif args.quiet:
-    LOGGER.setLevel(logging.WARNING)
-else:
-    LOGGER.setLevel(logging.INFO)
+match (args.debug, args.quiet):
+    case (True, _):
+        LOGGER.setLevel(logging.DEBUG)
+    case (_, True):
+        LOGGER.setLevel(logging.WARNING)
+    case _:
+        LOGGER.setLevel(logging.INFO)
 
 # Add console handler with formatting
 console_handler = logging.StreamHandler()
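
The rewritten block matches on the (debug, quiet) tuple; cases are tried top to bottom, so --debug wins when both flags are given, exactly like the old if/elif chain. A self-contained check of that precedence:

import logging

def level_for(debug: bool, quiet: bool) -> int:
    # Same precedence as above: debug beats quiet, default is INFO.
    match (debug, quiet):
        case (True, _):
            return logging.DEBUG
        case (_, True):
            return logging.WARNING
        case _:
            return logging.INFO

assert level_for(True, True) == logging.DEBUG  # first matching case wins
assert level_for(False, True) == logging.WARNING
assert level_for(False, False) == logging.INFO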

View File

@@ -32,7 +32,8 @@ class DB:
         if "localhost" in uri or "127.0.0.1" in uri:
             uri = uri.replace("databridge://", "http://")
         self.uri = uri
-        self._client = DataBridge(self.uri, is_local="localhost" in uri or "127.0.0.1" in uri)
+        is_local = "localhost" in uri or "127.0.0.1" in uri
+        self._client = DataBridge(self.uri, is_local=is_local, timeout=1000)
 
     def ingest_text(self, content: str, metadata: dict = None) -> dict:
         """Ingest text content into DataBridge"""