Fix video parsing bugs, improve server logging

Adityavardhan Agrawal 2025-01-30 16:03:46 -05:00
parent d124e6aa0d
commit 20c3015038
7 changed files with 154 additions and 49 deletions

View File

@@ -18,6 +18,7 @@ class Settings(BaseSettings):
     AWS_SECRET_ACCESS_KEY: Optional[str] = None
     OPENAI_API_KEY: Optional[str] = None
     ANTHROPIC_API_KEY: Optional[str] = None
+    ASSEMBLYAI_API_KEY: Optional[str] = None
 
     # API configuration
     HOST: str
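For context, a minimal sketch of how the new key would be supplied (the value is a placeholder, and it assumes get_settings() builds the Settings object from the environment, as its use in start_server.py below suggests):

import os

from core.config import get_settings

os.environ["ASSEMBLYAI_API_KEY"] = "aai-placeholder-key"  # normally provided via a .env file
settings = get_settings()
print(settings.ASSEMBLYAI_API_KEY)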

core/logging_config.py (new file, 39 lines)
View File

@@ -0,0 +1,39 @@
import logging
import sys
from pathlib import Path


def setup_logging():
    # Create logs directory if it doesn't exist
    log_dir = Path("logs")
    log_dir.mkdir(exist_ok=True)

    # Configure root logger
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)

    # Create formatters
    console_formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
    )

    # Console handler
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setFormatter(console_formatter)
    console_handler.setLevel(logging.INFO)

    # File handler
    file_handler = logging.FileHandler(log_dir / "databridge.log")
    file_handler.setFormatter(console_formatter)
    file_handler.setLevel(logging.INFO)

    # Add handlers to root logger
    root_logger.addHandler(console_handler)
    root_logger.addHandler(file_handler)

    # Set levels for specific loggers
    logging.getLogger("uvicorn").setLevel(logging.INFO)
    logging.getLogger("fastapi").setLevel(logging.INFO)

    # Set debug level for our code
    logging.getLogger("core").setLevel(logging.DEBUG)
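For reference, a minimal usage sketch of the new module (the logger name core.example is a hypothetical module name; setup_logging() is the function above):

import logging

from core.logging_config import setup_logging

setup_logging()  # installs a stdout handler plus logs/databridge.log on the root logger
logger = logging.getLogger("core.example")  # hypothetical module under the "core" package
logger.info("This line goes both to the console and to logs/databridge.log")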

View File

@@ -104,7 +104,7 @@ class CombinedParser(BaseParser):
             assemblyai_api_key=self.assemblyai_api_key,
             frame_sample_rate=self.frame_sample_rate,
         )
-        results = parser.process_video()
+        results = await parser.process_video()
         # Get all frame descriptions
         frame_descriptions = results.frame_descriptions
         # Get all transcript text

View File

@@ -4,6 +4,10 @@ from openai import OpenAI
 import assemblyai as aai
 import logging
 from core.models.video import TimeSeriesData, ParseVideoResult
+import tomli
+import os
+from typing import Optional, Dict, Any
+from ollama import AsyncClient
 
 logger = logging.getLogger(__name__)
@@ -12,19 +16,71 @@ def debug_object(title, obj):
     logger.debug("\n".join(["-" * 100, title, "-" * 100, f"{obj}", "-" * 100]))
 
 
+def load_config() -> Dict[str, Any]:
+    config_path = os.path.join(os.path.dirname(__file__), "../../../databridge.toml")
+    with open(config_path, "rb") as f:
+        return tomli.load(f)
+
+
+class VisionModelClient:
+    def __init__(self, config: Dict[str, Any]):
+        self.config = config["parser"]["vision"]
+        self.provider = self.config.get("provider", "ollama")
+        self.model_name = self.config.get("model_name", "llama3.2-vision")
+
+        if self.provider == "openai":
+            self.client = OpenAI()
+        elif self.provider == "ollama":
+            base_url = self.config.get("base_url", "http://localhost:11434")
+            self.client = AsyncClient(host=base_url)
+        else:
+            raise ValueError(f"Unsupported vision model provider: {self.provider}")
+
+    async def get_frame_description(self, image_base64: str, context: str) -> str:
+        if self.provider == "openai":
+            response = self.client.chat.completions.create(
+                model=self.model_name,
+                messages=[
+                    {
+                        "role": "user",
+                        "content": [
+                            {"type": "text", "text": context},
+                            {
+                                "type": "image_url",
+                                "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
+                            },
+                        ],
+                    }
+                ],
+                max_tokens=300,
+            )
+            return response.choices[0].message.content
+        else:  # ollama
+            response = await self.client.chat(
+                model=self.model_name,
+                messages=[{"role": "user", "content": context, "images": [image_base64]}],
+            )
+            return response["message"]["content"]
+
+
 class VideoParser:
-    def __init__(self, video_path: str, assemblyai_api_key: str, frame_sample_rate: int = 120):
+    def __init__(
+        self, video_path: str, assemblyai_api_key: str, frame_sample_rate: Optional[int] = None
+    ):
         """
         Initialize the video parser
 
         Args:
             video_path: Path to the video file
             assemblyai_api_key: API key for AssemblyAI
-            frame_sample_rate: Sample every nth frame for description
+            frame_sample_rate: Sample every nth frame for description (optional, defaults to config value)
         """
         logger.info(f"Initializing VideoParser for {video_path}")
+        self.config = load_config()
         self.video_path = video_path
-        self.frame_sample_rate = frame_sample_rate
+        self.frame_sample_rate = frame_sample_rate or self.config["parser"]["vision"].get(
+            "frame_sample_rate", 120
+        )
 
         self.cap = cv2.VideoCapture(video_path)
         if not self.cap.isOpened():
@@ -34,15 +90,15 @@ class VideoParser:
         self.fps = self.cap.get(cv2.CAP_PROP_FPS)
         self.total_frames = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT))
         self.duration = self.total_frames / self.fps
 
+        # Initialize AssemblyAI
         aai.settings.api_key = assemblyai_api_key
-        aai_config = aai.TranscriptionConfig(
-            speaker_labels=True
-        )  # speech_model=aai.SpeechModel.nano
+        aai_config = aai.TranscriptionConfig(speaker_labels=True)
         self.transcriber = aai.Transcriber(config=aai_config)
 
-        self.transcript = TimeSeriesData(
-            time_to_content={}
-        )  # empty transcript initially - TODO: have this be a lateinit somehow
-        self.gpt = OpenAI()
+        self.transcript = TimeSeriesData(time_to_content={})
+
+        # Initialize vision model client
+        self.vision_client = VisionModelClient(self.config)
 
         logger.info(f"Video loaded: {self.duration:.2f}s duration, {self.fps:.2f} FPS")
@@ -83,70 +139,65 @@ class VideoParser:
             {u.start / 1000: u.text for u in transcript.utterances} if transcript.utterances else {}
         )
         debug_object("Time to text", time_to_text)
-        self.transcript = TimeSeriesData(time_to_text)
+        self.transcript = TimeSeriesData(time_to_content=time_to_text)
         return self.transcript
 
-    def get_frame_descriptions(self) -> TimeSeriesData:
+    async def get_frame_descriptions(self) -> TimeSeriesData:
         """
-        Get descriptions for sampled frames using GPT-4
+        Get descriptions for sampled frames using configured vision model
 
         Returns:
             TimeSeriesData object containing frame descriptions
         """
         logger.info("Starting frame description generation")
 
+        # Return empty TimeSeriesData if frame_sample_rate is -1 (captioning disabled)
+        if self.frame_sample_rate == -1:
+            logger.info("Frame captioning is disabled (frame_sample_rate = -1)")
+            return TimeSeriesData(time_to_content={})
+
         frame_count = 0
         time_to_description = {}
         last_description = None
-        logger.info("Starting main loop for frame description generation")
 
         while True:
-            logger.info(f"Frame count: {frame_count}")
             ret, frame = self.cap.read()
             if not ret:
-                logger.info("Reached end of video")
                 break
 
            if frame_count % self.frame_sample_rate == 0:
-                logger.info(f"Processing frame at {frame_count / self.fps:.2f}s")
                timestamp = frame_count / self.fps
                logger.debug(f"Processing frame at {timestamp:.2f}s")
                img_base64 = self.frame_to_base64(frame)
-                response = self.gpt.chat.completions.create(
-                    model="gpt-4o-mini",
-                    messages=[
-                        {
-                            "role": "user",
-                            "content": [
-                                {
-                                    "type": "text",
-                                    "text": f"""Describe this frame from a video. Focus on the main elements, actions, and any notable details. Here is the transcript around the time of the frame:
----
-{self.transcript.at_time(timestamp, padding=10)}
----
+                context = f"""Describe this frame from a video. Focus on the main elements, actions, and any notable details. Here is the transcript around the time of the frame:
+---
+{self.transcript.at_time(timestamp, padding=10)}
+---
 
 Here is a description of the previous frame:
 ---
 {last_description if last_description else 'No previous frame description available, this is the first frame'}
 ---
 
 In your response, only provide the description of the current frame, using the above information as context.
-""",
-                                },
-                                {
-                                    "type": "image_url",
-                                    "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
-                                },
-                            ],
-                        }
-                    ],
-                    max_tokens=300,
-                )
-                last_description = response.choices[0].message.content
+"""
+
+                last_description = await self.vision_client.get_frame_description(
+                    img_base64, context
+                )
                 time_to_description[timestamp] = last_description
 
             frame_count += 1
 
         logger.info(f"Generated descriptions for {len(time_to_description)} frames")
-        return TimeSeriesData(time_to_description)
+        return TimeSeriesData(time_to_content=time_to_description)
 
-    def process_video(self) -> ParseVideoResult:
+    async def process_video(self) -> ParseVideoResult:
         """
         Process the video to get both transcript and frame descriptions
@@ -163,7 +214,7 @@ class VideoParser:
         result = ParseVideoResult(
             metadata=metadata,
             transcript=self.get_transcript(),
-            frame_descriptions=self.get_frame_descriptions(),
+            frame_descriptions=await self.get_frame_descriptions(),
         )
         logger.info("Video processing completed successfully")
         return result
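Taken together, a minimal sketch of the new async flow (the import path and video file name are assumptions; the field names come from the diff above, and frame captioning only runs when frame_sample_rate is not -1):

import asyncio

from core.parser.video.parse_video import VideoParser  # assumed import path for the file above

async def main():
    # frame_sample_rate falls back to [parser.vision] in databridge.toml when omitted
    parser = VideoParser("sample.mp4", assemblyai_api_key="aai-placeholder-key")
    result = await parser.process_video()
    print(result.transcript.time_to_content)
    print(result.frame_descriptions.time_to_content)

asyncio.run(main())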

View File

@@ -58,7 +58,18 @@ use_unstructured_api = false
 # chunk_size = 1000
 # chunk_overlap = 200
 # use_unstructured_api = false
+# frame_sample_rate = 120
+
+[parser.vision]
+provider = "ollama"
+model_name = "llama3.2-vision"
+frame_sample_rate = -1  # Set to -1 to disable frame captioning
+# base_url = "http://localhost:11434"  # Only used for ollama
+base_url = "http://ollama:11434"  # Use if using via docker
+
+# [parser.vision]
+# provider = "openai"
+# model_name = "gpt-4o-mini"
+# frame_sample_rate = -1  # Set to -1 to disable frame captioning
 
 [reranker]
 use_reranker = false
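For orientation, a sketch of how this table is consumed by the new VisionModelClient (the import path, image file, and prompt are assumptions; a reachable vision backend is required):

import asyncio
import base64

from core.parser.video.parse_video import VisionModelClient, load_config  # assumed import path

async def describe_frame(jpeg_path: str) -> str:
    config = load_config()  # reads the [parser.vision] table from databridge.toml
    client = VisionModelClient(config)  # "provider" picks the ollama or openai branch
    with open(jpeg_path, "rb") as f:
        image_base64 = base64.b64encode(f.read()).decode()
    return await client.get_frame_description(image_base64, "Describe this frame.")

print(asyncio.run(describe_frame("frame.jpg")))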

View File

@@ -159,9 +159,7 @@ class Cache:
         """Add documents to the cache"""
         return self._client_cache.add_docs(docs)
 
-    def query(
-        self, query: str, max_tokens: int = None, temperature: float = None
-    ) -> dict:
+    def query(self, query: str, max_tokens: int = None, temperature: float = None) -> dict:
         """Query the cache"""
         response = self._client_cache.query(
             query=query,

View File

@@ -1,9 +1,13 @@
 import uvicorn
 from dotenv import load_dotenv
 from core.config import get_settings
+from core.logging_config import setup_logging
 
 
 def main():
+    # Set up logging first
+    setup_logging()
+
     # Load environment variables from .env file
     load_dotenv()
@@ -16,6 +20,7 @@ def main():
         host=settings.HOST,
         port=settings.PORT,
         loop="asyncio",
+        log_level="info",
         # reload=settings.RELOAD
     )