# morphik-core/core/parser/unstructured_parser.py

import asyncio
import io
import logging
from typing import Any, Dict, List, Tuple

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_unstructured import UnstructuredLoader

from core.models.documents import Chunk

from .base_parser import BaseParser

logger = logging.getLogger(__name__)


class UnstructuredAPIParser(BaseParser):
    """Parser that chunks plain text locally and files via ``UnstructuredLoader``.

    Plain text is split with a ``RecursiveCharacterTextSplitter``; files are
    handed to ``UnstructuredLoader`` with ``partition_via_api=True`` and the
    ``"by_title"`` chunking strategy.
    """

    def __init__(
        self,
        api_key: str,
        chunk_size: int,
        chunk_overlap: int,
    ):
        """Store the API key and build the text splitter.

        Args:
            api_key: Key forwarded to ``UnstructuredLoader`` on every
                ``parse_file`` call.
            chunk_size: Passed to the splitter as ``chunk_size``; measured
                with ``len`` (i.e. in characters).
            chunk_overlap: Passed to the splitter as ``chunk_overlap``.
        """
        self.api_key = api_key
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            # Separators are tried in order: paragraph, line, sentence,
            # word, and finally individual characters.
            separators=["\n\n", "\n", ". ", " ", ""],
        )

    async def split_text(self, text: str) -> List[Chunk]:
        """Split plain text into chunks.

        Args:
            text: The text to split.

        Returns:
            One ``Chunk`` per piece produced by the text splitter, each with
            empty metadata.
        """
        return [Chunk(content=chunk, metadata={}) for chunk in self.text_splitter.split_text(text)]

    async def parse_file(
        self, file: bytes, content_type: str
    ) -> Tuple[Dict[str, Any], List[Chunk]]:
        """Parse file content using unstructured.

        Args:
            file: Raw bytes of the file to parse.
            content_type: Accepted for interface compatibility; this
                implementation does not read it.

        Returns:
            A tuple ``(metadata, chunks)``. ``metadata`` is always an empty
            dict. ``chunks`` holds one ``Chunk`` per element returned by the
            loader, carrying the element's ``page_content`` and empty
            metadata (the loader's own per-element metadata is not kept).

        Raises:
            Whatever ``UnstructuredLoader.load`` raises; exceptions from the
            worker thread propagate through the awaited future.
        """
        loader = UnstructuredLoader(
            file=io.BytesIO(file),
            partition_via_api=True,
            api_key=self.api_key,
            chunking_strategy="by_title",
        )
        # ``load`` is a synchronous call; running it inline would stall the
        # event loop for its whole duration. Offload it to the loop's default
        # executor so other coroutines keep running while it completes.
        loop = asyncio.get_running_loop()
        elements = await loop.run_in_executor(None, loader.load)
        return {}, [Chunk(content=element.page_content, metadata={}) for element in elements]