morphik-core/core/parser/unstructured_parser.py
2024-12-29 12:48:41 +05:30

45 lines
1.4 KiB
Python

from typing import Any, Dict, List, Tuple
import io
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_unstructured import UnstructuredLoader
from core.models.documents import Chunk
import logging
from .base_parser import BaseParser
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
class UnstructuredAPIParser(BaseParser):
    """Parser that splits plain text locally and partitions files via the Unstructured API.

    Plain text goes through a ``RecursiveCharacterTextSplitter``; file bytes
    are handed to ``UnstructuredLoader`` with ``partition_via_api=True``.
    Every ``Chunk`` produced by this class carries empty metadata.
    """

    def __init__(
        self,
        api_key: str,
        chunk_size: int,
        chunk_overlap: int,
    ):
        """Store the API key and build the splitter used by ``split_text``.

        Args:
            api_key: Key passed to ``UnstructuredLoader`` on each ``parse_file`` call.
            chunk_size: Forwarded to the splitter as ``chunk_size``; lengths are
                measured with ``len``.
            chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
        """
        self.api_key = api_key
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            # Separators are listed from coarsest (blank line) to finest (empty string).
            separators=["\n\n", "\n", ". ", " ", ""],
        )

    async def split_text(self, text: str) -> List[Chunk]:
        """Split plain text into chunks.

        Args:
            text: The text to split.

        Returns:
            One ``Chunk`` with empty metadata per piece returned by the splitter,
            in the splitter's order.
        """
        return [Chunk(content=chunk, metadata={}) for chunk in self.text_splitter.split_text(text)]

    async def parse_file(
        self, file: bytes, content_type: str
    ) -> Tuple[Dict[str, Any], List[Chunk]]:
        """Parse file content using unstructured.

        Args:
            file: Raw file bytes; wrapped in a ``BytesIO`` for the loader.
            content_type: Currently unused by this parser.

        Returns:
            A tuple ``({}, chunks)``: an empty document-metadata dict and one
            ``Chunk`` (empty metadata) per element returned by the loader.
        """
        # Parse with unstructured
        loader = UnstructuredLoader(
            file=io.BytesIO(file),
            partition_via_api=True,
            api_key=self.api_key,
            chunking_strategy="by_title",
        )
        # The synchronous ``load()`` would block the event loop for the whole
        # remote partitioning request. ``aload()`` is the loader's async entry
        # point (inherited from langchain-core's ``BaseLoader``), so awaiting
        # it lets other coroutines run while the request is in flight.
        elements = await loader.aload()
        return {}, [Chunk(content=element.page_content, metadata={}) for element in elements]