# morphik-core/core/models/video.py

import logging
from bisect import bisect_left
from collections import defaultdict
from typing import Dict, List, Optional, Tuple, Union

from pydantic import BaseModel, computed_field

from core.models.chunk import Chunk

logger = logging.getLogger(__name__)


class TimeSeriesData(BaseModel):
    """Timestamped text content with nearest-time and time-window lookups.

    ``time_to_content`` is the only stored field. Every other view is derived
    from it on access and is not cached, so later changes to the mapping are
    always reflected. Because each derived view re-sorts the mapping, the
    methods below read a sorted view once into a local instead of going back
    to the property for every use.
    """

    time_to_content: Dict[float, str]

    @computed_field
    @property
    def _sorted_items(self) -> List[Tuple[float, str]]:
        """(timestamp, content) pairs in ascending timestamp order."""
        return sorted(self.time_to_content.items(), key=lambda x: x[0])

    @computed_field
    @property
    def timestamps(self) -> List[float]:
        """All timestamps in ascending order."""
        return [t for t, _ in self._sorted_items]

    @computed_field
    @property
    def contents(self) -> List[str]:
        """Contents ordered by their timestamp, parallel to ``timestamps``."""
        return [c for _, c in self._sorted_items]

    @computed_field
    @property
    def content_to_times(self) -> Dict[str, List[float]]:
        """Map each distinct content string to the timestamps that carry it.

        Timestamps are listed in the iteration order of ``time_to_content``,
        which is not necessarily ascending.
        """
        result = defaultdict(list)
        for t, c in self.time_to_content.items():
            result[c].append(t)
        return dict(result)

    @staticmethod
    def _nearest_index_in(timestamps: List[float], time: float) -> int:
        """Return the index of the entry in sorted ``timestamps`` closest to ``time``.

        Takes the already-sorted list as an argument so callers can sort once
        and search several times.

        Returns:
            -1 if ``timestamps`` is empty, otherwise an index in
            ``[0, len(timestamps) - 1]``. When ``time`` is exactly halfway
            between two neighbours the earlier one is chosen.
        """
        if not timestamps:
            return -1

        idx = bisect_left(timestamps, time)
        if idx == 0:
            return 0
        if idx == len(timestamps):
            return len(timestamps) - 1
        before = timestamps[idx - 1]
        after = timestamps[idx]
        # Strict ">" keeps the earlier neighbour on a tie.
        return idx if (time - before) > (after - time) else idx - 1

    def _find_nearest_index(self, time: float) -> int:
        """Find index of nearest timestamp using binary search.

        Returns -1 when there are no timestamps.
        """
        # Read the property once: every access re-sorts the mapping.
        return self._nearest_index_in(self.timestamps, time)

    def at_time(self, time: float, padding: Optional[float] = None) -> Union[str, List[Tuple[float, str]]]:
        """
        Get content at or around specified time

        Args:
            time: Target timestamp
            padding: Optional time padding in seconds to get content before and after

        Returns:
            Without padding: the content whose timestamp is nearest to ``time``
            (``""`` if there is no data).
            With padding: the ``(timestamp, content)`` pairs, in ascending
            timestamp order, from the entry nearest the window start through
            the entry nearest the window end (``[]`` if there is no data).
            The window is ``[time - padding, time + padding]`` clamped to the
            first and last timestamps. Because the bounds snap to the nearest
            entry, the result can include an entry lying just outside the
            window.
        """
        items = self._sorted_items  # single sort for the whole call
        if not items:  # Handle empty data
            return [] if padding is not None else ""

        timestamps = [t for t, _ in items]

        if padding is None:
            return items[self._nearest_index_in(timestamps, time)][1]

        # Find all content within padding window
        start_time = max(time - padding, timestamps[0])  # Clamp to first timestamp
        end_time = min(time + padding, timestamps[-1])  # Clamp to last timestamp

        # For non-empty data the nearest index is always in range, so no
        # further clamping of the indices is needed.
        start_idx = self._nearest_index_in(timestamps, start_time)
        end_idx = self._nearest_index_in(timestamps, end_time)

        logger.debug(f"Retrieving content between {start_time:.2f}s and {end_time:.2f}s")
        # Slicing yields [] when start_idx > end_idx (e.g. negative padding).
        return items[start_idx : end_idx + 1]

    def times_for_content(self, content: str) -> List[float]:
        """Get all timestamps where this content appears (``[]`` if it never does)."""
        return self.content_to_times.get(content, [])

    def to_chunks(self) -> List[Chunk]:
        """Build one ``Chunk`` per entry, in ascending timestamp order.

        Each chunk carries the entry's content and stores its timestamp under
        the ``"timestamp"`` metadata key.
        """
        return [
            Chunk(content=content, metadata={"timestamp": timestamp})
            for timestamp, content in self._sorted_items
        ]


class ParseVideoResult(BaseModel):
    """Bundle of video metadata and two time series: frame descriptions and transcript."""

    # Numeric values keyed by name; the specific keys are not defined in this file.
    metadata: Dict[str, Union[float, int]]
    # Timestamp -> text series for frame descriptions.
    frame_descriptions: TimeSeriesData
    # Timestamp -> text series for the transcript.
    transcript: TimeSeriesData