2024-11-14 23:18:37 -05:00
|
|
|
from abc import ABC, abstractmethod
|
2024-12-29 11:10:51 +05:30
|
|
|
from typing import Any, Dict, List, Tuple
|
2025-04-20 16:34:29 -07:00
|
|
|
|
2024-12-30 11:58:53 -05:00
|
|
|
from core.models.chunk import Chunk
|
2024-11-14 23:18:37 -05:00
|
|
|
|
2024-12-15 14:31:25 -05:00
|
|
|
|
2024-11-14 23:18:37 -05:00
|
|
|
class BaseParser(ABC):
    """Base class for document parsing.

    Declares the two asynchronous hooks a concrete parser must implement:
    ``parse_file_to_text`` and ``split_text``. Both are abstract, so this
    class cannot be instantiated directly.
    """

    @abstractmethod
    async def parse_file_to_text(self, file: bytes, filename: str) -> Tuple[Dict[str, Any], str]:
        """
        Parse file content into text.

        Args:
            file: Raw file bytes
            filename: Name of the file

        Returns:
            Tuple[Dict[str, Any], str]: (metadata, extracted_text)
            - metadata: Additional metadata extracted during parsing
            - extracted_text: Raw text extracted from the file
        """

    @abstractmethod
    async def split_text(self, text: str) -> List[Chunk]:
        """
        Split plain text into chunks.

        Args:
            text: Text to split into chunks

        Returns:
            List[Chunk]: List of text chunks with metadata
        """
|