mirror of
https://github.com/james-m-jordan/morphik-core.git
synced 2025-05-09 19:32:38 +00:00
73 lines
2.6 KiB
Python
73 lines
2.6 KiB
Python
from typing import Dict, Any, List
|
|
from base_parser import BaseParser
|
|
from unstructured.partition.auto import partition
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
import os
|
|
import tempfile
|
|
import base64
|
|
|
|
|
|
class UnstructuredAPIParser(BaseParser):
    """Parse documents with Unstructured's ``partition`` and split them into chunks.

    The incoming content is written to a temporary file (named with a suffix
    derived from ``metadata["content_type"]``), handed to ``partition``, and
    the string form of the returned elements is joined and split with a
    ``RecursiveCharacterTextSplitter``.
    """

    # MIME type -> suffix given to the temporary file passed to ``partition``.
    # Built once at class level instead of on every call.
    _CONTENT_TYPE_EXTENSIONS = {
        'application/pdf': '.pdf',
        'application/msword': '.doc',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
        'image/jpeg': '.jpg',
        'image/png': '.png',
        'text/plain': '.txt',
        'text/html': '.html',
    }

    def __init__(
        self,
        api_key: str,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        api_url: str = "https://api.unstructuredapp.io",
    ):
        """Store the API settings and build the text splitter.

        Args:
            api_key: Key forwarded to ``partition`` as ``api_key``.
            chunk_size: Maximum chunk length, measured with ``len``.
            chunk_overlap: Overlap between consecutive chunks.
            api_url: URL forwarded to ``partition`` as ``api_url``.
        """
        self.api_key = api_key
        self.api_url = api_url
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", ". ", " ", ""],
        )

    def parse(self, content: str, metadata: Dict[str, Any]) -> List[str]:
        """Parse content using Unstructured API and split into chunks.

        Args:
            content: Document payload. Treated as base64 text when
                ``metadata["is_base64"]`` is truthy, otherwise as plain text
                that is written out UTF-8 encoded.
            metadata: Reads ``is_base64`` (default ``False``) and
                ``content_type`` (used to pick the temp-file suffix).

        Returns:
            The chunks produced by the text splitter.

        Raises:
            Exception: Wraps any failure; the original error is chained as
                ``__cause__``.
        """
        try:
            temp_file_path = None
            try:
                # delete=False so the file outlives the ``with`` block and can
                # be reopened by ``partition`` via its path.
                with tempfile.NamedTemporaryFile(
                    delete=False, suffix=self._get_file_extension(metadata)
                ) as temp_file:
                    # Record the path before writing so the cleanup below
                    # still runs if decoding or writing fails.
                    temp_file_path = temp_file.name
                    if metadata.get("is_base64", False):
                        temp_file.write(base64.b64decode(content))
                    else:
                        temp_file.write(content.encode('utf-8'))

                # NOTE(review): confirm that ``partition_via_api=True`` really
                # routes this call to the hosted API for the installed
                # unstructured version; the library also ships a dedicated
                # ``unstructured.partition.api.partition_via_api`` function.
                elements = partition(
                    filename=temp_file_path,
                    api_key=self.api_key,
                    api_url=self.api_url,
                    partition_via_api=True,
                )

                # Combine elements and split into chunks
                full_text = "\n\n".join(str(element) for element in elements)
                return self.text_splitter.split_text(full_text)
            finally:
                # Clean up the temporary file on success and on every failure
                # after it was created.
                if temp_file_path is not None:
                    os.unlink(temp_file_path)
        except Exception as e:
            # Same exception type and message as before; ``from e`` keeps the
            # original traceback attached as the explicit cause.
            raise Exception(f"Error parsing document: {e}") from e

    def _get_file_extension(self, metadata: Dict[str, Any]) -> str:
        """Get appropriate file extension based on content type.

        The ``content_type`` value is normalised before lookup: parameters
        after ``;`` are dropped, surrounding whitespace is stripped, and the
        type is lower-cased, so ``"Text/HTML; charset=utf-8"`` maps to
        ``.html``. Missing, non-string, or unknown types fall back to ``.txt``.
        """
        content_type = metadata.get('content_type')
        if not isinstance(content_type, str):
            return '.txt'
        mime_type = content_type.split(';', 1)[0].strip().lower()
        return self._CONTENT_TYPE_EXTENSIONS.get(mime_type, '.txt')
|