Fix video parsing bugs, improve server logging

Adityavardhan Agrawal 2025-01-30 16:03:46 -05:00
parent d124e6aa0d
commit 20c3015038
7 changed files with 154 additions and 49 deletions

View File

@@ -18,6 +18,7 @@ class Settings(BaseSettings):
    AWS_SECRET_ACCESS_KEY: Optional[str] = None
    OPENAI_API_KEY: Optional[str] = None
    ANTHROPIC_API_KEY: Optional[str] = None
    ASSEMBLYAI_API_KEY: Optional[str] = None

    # API configuration
    HOST: str
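Because Settings is a pydantic BaseSettings and the server loads a .env file on startup, the new key is picked up from the environment. A minimal sketch of how a caller might check it via the existing get_settings() helper (the error message is illustrative):

from core.config import get_settings

settings = get_settings()
# ASSEMBLYAI_API_KEY is Optional, so None means no transcription credentials were provided
if settings.ASSEMBLYAI_API_KEY is None:
    raise RuntimeError("Set ASSEMBLYAI_API_KEY before parsing video audio")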

core/logging_config.py Normal file
View File

@@ -0,0 +1,39 @@
import logging
import sys
from pathlib import Path


def setup_logging():
    # Create logs directory if it doesn't exist
    log_dir = Path("logs")
    log_dir.mkdir(exist_ok=True)

    # Configure root logger
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)

    # Create formatters
    console_formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
    )

    # Console handler
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setFormatter(console_formatter)
    console_handler.setLevel(logging.INFO)

    # File handler
    file_handler = logging.FileHandler(log_dir / "databridge.log")
    file_handler.setFormatter(console_formatter)
    file_handler.setLevel(logging.INFO)

    # Add handlers to root logger
    root_logger.addHandler(console_handler)
    root_logger.addHandler(file_handler)

    # Set levels for specific loggers
    logging.getLogger("uvicorn").setLevel(logging.INFO)
    logging.getLogger("fastapi").setLevel(logging.INFO)

    # Set debug level for our code
    logging.getLogger("core").setLevel(logging.DEBUG)

View File

@@ -104,7 +104,7 @@ class CombinedParser(BaseParser):
assemblyai_api_key=self.assemblyai_api_key,
frame_sample_rate=self.frame_sample_rate,
)
results = parser.process_video()
results = await parser.process_video()
# Get all frame descriptions
frame_descriptions = results.frame_descriptions
# Get all transcript text

View File

@@ -4,6 +4,10 @@ from openai import OpenAI
import assemblyai as aai
import logging
from core.models.video import TimeSeriesData, ParseVideoResult
import tomli
import os
from typing import Optional, Dict, Any
from ollama import AsyncClient
logger = logging.getLogger(__name__)
@@ -12,19 +16,71 @@ def debug_object(title, obj):
    logger.debug("\n".join(["-" * 100, title, "-" * 100, f"{obj}", "-" * 100]))


def load_config() -> Dict[str, Any]:
    config_path = os.path.join(os.path.dirname(__file__), "../../../databridge.toml")
    with open(config_path, "rb") as f:
        return tomli.load(f)
class VisionModelClient:
    def __init__(self, config: Dict[str, Any]):
        self.config = config["parser"]["vision"]
        self.provider = self.config.get("provider", "ollama")
        self.model_name = self.config.get("model_name", "llama3.2-vision")

        if self.provider == "openai":
            self.client = OpenAI()
        elif self.provider == "ollama":
            base_url = self.config.get("base_url", "http://localhost:11434")
            self.client = AsyncClient(host=base_url)
        else:
            raise ValueError(f"Unsupported vision model provider: {self.provider}")

    async def get_frame_description(self, image_base64: str, context: str) -> str:
        if self.provider == "openai":
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": context},
                            {
                                "type": "image_url",
                                "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
                            },
                        ],
                    }
                ],
                max_tokens=300,
            )
            return response.choices[0].message.content
        else:  # ollama
            response = await self.client.chat(
                model=self.model_name,
                messages=[{"role": "user", "content": context, "images": [image_base64]}],
            )
            return response["message"]["content"]
class VideoParser:
def __init__(self, video_path: str, assemblyai_api_key: str, frame_sample_rate: int = 120):
def __init__(
self, video_path: str, assemblyai_api_key: str, frame_sample_rate: Optional[int] = None
):
"""
Initialize the video parser
Args:
video_path: Path to the video file
assemblyai_api_key: API key for AssemblyAI
frame_sample_rate: Sample every nth frame for description
frame_sample_rate: Sample every nth frame for description (optional, defaults to config value)
"""
logger.info(f"Initializing VideoParser for {video_path}")
self.config = load_config()
self.video_path = video_path
self.frame_sample_rate = frame_sample_rate
self.frame_sample_rate = frame_sample_rate or self.config["parser"]["vision"].get(
"frame_sample_rate", 120
)
self.cap = cv2.VideoCapture(video_path)
if not self.cap.isOpened():
@@ -34,15 +90,15 @@ class VideoParser:
self.fps = self.cap.get(cv2.CAP_PROP_FPS)
self.total_frames = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT))
self.duration = self.total_frames / self.fps
# Initialize AssemblyAI
aai.settings.api_key = assemblyai_api_key
aai_config = aai.TranscriptionConfig(
speaker_labels=True
) # speech_model=aai.SpeechModel.nano
aai_config = aai.TranscriptionConfig(speaker_labels=True)
self.transcriber = aai.Transcriber(config=aai_config)
self.transcript = TimeSeriesData(
time_to_content={}
) # empty transcript initially - TODO: have this be a lateinit somehow
self.gpt = OpenAI()
self.transcript = TimeSeriesData(time_to_content={})
# Initialize vision model client
self.vision_client = VisionModelClient(self.config)
logger.info(f"Video loaded: {self.duration:.2f}s duration, {self.fps:.2f} FPS")
@@ -83,70 +139,65 @@ class VideoParser:
{u.start / 1000: u.text for u in transcript.utterances} if transcript.utterances else {}
)
debug_object("Time to text", time_to_text)
self.transcript = TimeSeriesData(time_to_text)
self.transcript = TimeSeriesData(time_to_content=time_to_text)
return self.transcript
def get_frame_descriptions(self) -> TimeSeriesData:
async def get_frame_descriptions(self) -> TimeSeriesData:
"""
Get descriptions for sampled frames using GPT-4
Get descriptions for sampled frames using configured vision model
Returns:
TimeSeriesData object containing frame descriptions
"""
logger.info("Starting frame description generation")
# Return empty TimeSeriesData if frame_sample_rate is -1 (captioning disabled)
if self.frame_sample_rate == -1:
logger.info("Frame captioning is disabled (frame_sample_rate = -1)")
return TimeSeriesData(time_to_content={})
frame_count = 0
time_to_description = {}
last_description = None
logger.info("Starting main loop for frame description generation")
while True:
logger.info(f"Frame count: {frame_count}")
ret, frame = self.cap.read()
if not ret:
logger.info("Reached end of video")
break
if frame_count % self.frame_sample_rate == 0:
logger.info(f"Processing frame at {frame_count / self.fps:.2f}s")
timestamp = frame_count / self.fps
logger.debug(f"Processing frame at {timestamp:.2f}s")
img_base64 = self.frame_to_base64(frame)
response = self.gpt.chat.completions.create(
model="gpt-4o-mini",
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": f"""Describe this frame from a video. Focus on the main elements, actions, and any notable details. Here is the transcript around the time of the frame:
---
{self.transcript.at_time(timestamp, padding=10)}
---
context = f"""Describe this frame from a video. Focus on the main elements, actions, and any notable details. Here is the transcript around the time of the frame:
---
{self.transcript.at_time(timestamp, padding=10)}
---
Here is a description of the previous frame:
---
{last_description if last_description else 'No previous frame description available, this is the first frame'}
---
Here is a description of the previous frame:
---
{last_description if last_description else 'No previous frame description available, this is the first frame'}
---
In your response, only provide the description of the current frame, using the above information as context.
""",
},
{
"type": "image_url",
"image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
},
],
}
],
max_tokens=300,
In your response, only provide the description of the current frame, using the above information as context.
"""
last_description = await self.vision_client.get_frame_description(
img_base64, context
)
last_description = response.choices[0].message.content
time_to_description[timestamp] = last_description
frame_count += 1
logger.info(f"Generated descriptions for {len(time_to_description)} frames")
return TimeSeriesData(time_to_description)
return TimeSeriesData(time_to_content=time_to_description)
def process_video(self) -> ParseVideoResult:
async def process_video(self) -> ParseVideoResult:
"""
Process the video to get both transcript and frame descriptions
@@ -163,7 +214,7 @@ class VideoParser:
result = ParseVideoResult(
metadata=metadata,
transcript=self.get_transcript(),
frame_descriptions=self.get_frame_descriptions(),
frame_descriptions=await self.get_frame_descriptions(),
)
logger.info("Video processing completed successfully")
return result
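Since process_video() is now a coroutine (CombinedParser already awaits it above), any standalone caller needs an event loop. A minimal sketch with a placeholder path, placeholder key, and an assumed import path:

import asyncio

from core.parser.video.parse_video import VideoParser  # module path assumed for illustration


async def run() -> None:
    # placeholder file and key; frame_sample_rate falls back to databridge.toml when omitted
    parser = VideoParser("sample.mp4", assemblyai_api_key="aai-key")
    result = await parser.process_video()
    print(result.metadata)


asyncio.run(run())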

View File

@@ -58,7 +58,18 @@ use_unstructured_api = false
# chunk_size = 1000
# chunk_overlap = 200
# use_unstructured_api = false
# frame_sample_rate = 120
[parser.vision]
provider = "ollama"
model_name = "llama3.2-vision"
frame_sample_rate = -1 # Set to -1 to disable frame captioning
# base_url = "http://localhost:11434" # Only used for ollama
base_url = "http://ollama:11434" # Use if using via docker
# [parser.vision]
# provider = "openai"
# model_name = "gpt-4o-mini"
# frame_sample_rate = -1 # Set to -1 to disable frame captioning
[reranker]
use_reranker = false
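get_frame_descriptions() describes a frame whenever frame_count % frame_sample_rate == 0, so the spacing between captions in seconds depends on the video's frame rate, and -1 skips captioning entirely. A quick illustration of that arithmetic (the FPS value is only an example):

fps = 30.0  # example frame rate reported by cv2.CAP_PROP_FPS
frame_sample_rate = 120  # describe every 120th frame
print(frame_sample_rate / fps)  # 4.0 seconds between frame descriptions at 30 FPS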

View File

@@ -159,9 +159,7 @@ class Cache:
"""Add documents to the cache"""
return self._client_cache.add_docs(docs)
def query(
self, query: str, max_tokens: int = None, temperature: float = None
) -> dict:
def query(self, query: str, max_tokens: int = None, temperature: float = None) -> dict:
"""Query the cache"""
response = self._client_cache.query(
query=query,

View File

@@ -1,9 +1,13 @@
import uvicorn
from dotenv import load_dotenv
from core.config import get_settings
from core.logging_config import setup_logging
def main():
    # Set up logging first
    setup_logging()

    # Load environment variables from .env file
    load_dotenv()
@@ -16,6 +20,7 @@ def main():
        host=settings.HOST,
        port=settings.PORT,
        loop="asyncio",
        log_level="info",
        # reload=settings.RELOAD
    )