morphik-core/core/parser/base_parser.py
Arnav Agrawal 3c1195e001
add task queue (#87)
* add task queue

* ensure task queuing is working as expected.

* add downstream sdk changes

* bugs and pr comments

* update docker arq running logic
2025-04-16 02:31:49 -04:00

40 lines
1.0 KiB
Python

from abc import ABC, abstractmethod
from typing import Any, Dict, List, Tuple
from core.models.chunk import Chunk
class BaseParser(ABC):
    """Abstract interface for document parsers.

    Declares the two asynchronous operations a parser must provide:
    extracting text (plus metadata) from raw file bytes, and splitting
    plain text into chunks. Both methods are abstract, so a subclass
    cannot be instantiated until it overrides each of them.
    """

    @abstractmethod
    async def parse_file_to_text(
        self, file: bytes, filename: str
    ) -> Tuple[Dict[str, Any], str]:
        """
        Parse file content into text.

        Args:
            file: Raw file bytes
            filename: Name of the file

        Returns:
            Tuple[Dict[str, Any], str]: (metadata, extracted_text)
                - metadata: Additional metadata extracted during parsing
                - extracted_text: Raw text extracted from the file
        """
        pass

    @abstractmethod
    async def split_text(self, text: str) -> List[Chunk]:
        """
        Split plain text into chunks.

        Args:
            text: Text to split into chunks

        Returns:
            List[Chunk]: List of text chunks with metadata
        """
        pass