Fix video parsing bugs, improve server logging

Adityavardhan Agrawal 2025-01-30 16:03:46 -05:00
parent d124e6aa0d
commit 20c3015038
7 changed files with 154 additions and 49 deletions

View File

@@ -18,6 +18,7 @@ class Settings(BaseSettings):
     AWS_SECRET_ACCESS_KEY: Optional[str] = None
     OPENAI_API_KEY: Optional[str] = None
     ANTHROPIC_API_KEY: Optional[str] = None
+    ASSEMBLYAI_API_KEY: Optional[str] = None
 
     # API configuration
     HOST: str
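For context, a minimal sketch of how the new key would be supplied (the value is a placeholder, and it assumes get_settings() builds the Settings object from the environment, as its use in start_server.py below suggests):

import os

from core.config import get_settings

os.environ["ASSEMBLYAI_API_KEY"] = "aai-placeholder-key"  # normally provided via a .env file
settings = get_settings()
print(settings.ASSEMBLYAI_API_KEY)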

core/logging_config.py (new file, 39 lines)
View File

@@ -0,0 +1,39 @@
import logging
import sys
from pathlib import Path


def setup_logging():
    # Create logs directory if it doesn't exist
    log_dir = Path("logs")
    log_dir.mkdir(exist_ok=True)

    # Configure root logger
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)

    # Create formatters
    console_formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
    )

    # Console handler
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setFormatter(console_formatter)
    console_handler.setLevel(logging.INFO)

    # File handler
    file_handler = logging.FileHandler(log_dir / "databridge.log")
    file_handler.setFormatter(console_formatter)
    file_handler.setLevel(logging.INFO)

    # Add handlers to root logger
    root_logger.addHandler(console_handler)
    root_logger.addHandler(file_handler)

    # Set levels for specific loggers
    logging.getLogger("uvicorn").setLevel(logging.INFO)
    logging.getLogger("fastapi").setLevel(logging.INFO)

    # Set debug level for our code
    logging.getLogger("core").setLevel(logging.DEBUG)
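For reference, a minimal usage sketch of the new module (the logger name core.example is a hypothetical module name; setup_logging() is the function above):

import logging

from core.logging_config import setup_logging

setup_logging()  # installs a stdout handler plus logs/databridge.log on the root logger
logger = logging.getLogger("core.example")  # hypothetical module under the "core" package
logger.info("This line goes both to the console and to logs/databridge.log")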

View File

@@ -104,7 +104,7 @@ class CombinedParser(BaseParser):
             assemblyai_api_key=self.assemblyai_api_key,
             frame_sample_rate=self.frame_sample_rate,
         )
-        results = parser.process_video()
+        results = await parser.process_video()
         # Get all frame descriptions
         frame_descriptions = results.frame_descriptions
         # Get all transcript text

View File

@@ -4,6 +4,10 @@ from openai import OpenAI
 import assemblyai as aai
 import logging
 from core.models.video import TimeSeriesData, ParseVideoResult
+import tomli
+import os
+from typing import Optional, Dict, Any
+from ollama import AsyncClient
 
 logger = logging.getLogger(__name__)
@@ -12,19 +16,71 @@ def debug_object(title, obj):
     logger.debug("\n".join(["-" * 100, title, "-" * 100, f"{obj}", "-" * 100]))
 
 
+def load_config() -> Dict[str, Any]:
+    config_path = os.path.join(os.path.dirname(__file__), "../../../databridge.toml")
+    with open(config_path, "rb") as f:
+        return tomli.load(f)
+
+
+class VisionModelClient:
+    def __init__(self, config: Dict[str, Any]):
+        self.config = config["parser"]["vision"]
+        self.provider = self.config.get("provider", "ollama")
+        self.model_name = self.config.get("model_name", "llama3.2-vision")
+
+        if self.provider == "openai":
+            self.client = OpenAI()
+        elif self.provider == "ollama":
+            base_url = self.config.get("base_url", "http://localhost:11434")
+            self.client = AsyncClient(host=base_url)
+        else:
+            raise ValueError(f"Unsupported vision model provider: {self.provider}")
+
+    async def get_frame_description(self, image_base64: str, context: str) -> str:
+        if self.provider == "openai":
+            response = self.client.chat.completions.create(
+                model=self.model_name,
+                messages=[
+                    {
+                        "role": "user",
+                        "content": [
+                            {"type": "text", "text": context},
+                            {
+                                "type": "image_url",
+                                "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
+                            },
+                        ],
+                    }
+                ],
+                max_tokens=300,
+            )
+            return response.choices[0].message.content
+        else:  # ollama
+            response = await self.client.chat(
+                model=self.model_name,
+                messages=[{"role": "user", "content": context, "images": [image_base64]}],
+            )
+            return response["message"]["content"]
+
+
 class VideoParser:
-    def __init__(self, video_path: str, assemblyai_api_key: str, frame_sample_rate: int = 120):
+    def __init__(
+        self, video_path: str, assemblyai_api_key: str, frame_sample_rate: Optional[int] = None
+    ):
         """
         Initialize the video parser
 
         Args:
             video_path: Path to the video file
             assemblyai_api_key: API key for AssemblyAI
-            frame_sample_rate: Sample every nth frame for description
+            frame_sample_rate: Sample every nth frame for description (optional, defaults to config value)
         """
         logger.info(f"Initializing VideoParser for {video_path}")
+        self.config = load_config()
         self.video_path = video_path
-        self.frame_sample_rate = frame_sample_rate
+        self.frame_sample_rate = frame_sample_rate or self.config["parser"]["vision"].get(
+            "frame_sample_rate", 120
+        )
 
         self.cap = cv2.VideoCapture(video_path)
         if not self.cap.isOpened():
@@ -34,15 +90,15 @@ class VideoParser:
         self.fps = self.cap.get(cv2.CAP_PROP_FPS)
         self.total_frames = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT))
         self.duration = self.total_frames / self.fps
 
+        # Initialize AssemblyAI
         aai.settings.api_key = assemblyai_api_key
-        aai_config = aai.TranscriptionConfig(
-            speaker_labels=True
-        )  # speech_model=aai.SpeechModel.nano
+        aai_config = aai.TranscriptionConfig(speaker_labels=True)
         self.transcriber = aai.Transcriber(config=aai_config)
 
-        self.transcript = TimeSeriesData(
-            time_to_content={}
-        )  # empty transcript initially - TODO: have this be a lateinit somehow
-        self.gpt = OpenAI()
+        self.transcript = TimeSeriesData(time_to_content={})
+
+        # Initialize vision model client
+        self.vision_client = VisionModelClient(self.config)
 
         logger.info(f"Video loaded: {self.duration:.2f}s duration, {self.fps:.2f} FPS")
@@ -83,70 +139,65 @@ class VideoParser:
             {u.start / 1000: u.text for u in transcript.utterances} if transcript.utterances else {}
         )
         debug_object("Time to text", time_to_text)
-        self.transcript = TimeSeriesData(time_to_text)
+        self.transcript = TimeSeriesData(time_to_content=time_to_text)
         return self.transcript
 
-    def get_frame_descriptions(self) -> TimeSeriesData:
+    async def get_frame_descriptions(self) -> TimeSeriesData:
         """
-        Get descriptions for sampled frames using GPT-4
+        Get descriptions for sampled frames using configured vision model
 
         Returns:
             TimeSeriesData object containing frame descriptions
         """
         logger.info("Starting frame description generation")
 
+        # Return empty TimeSeriesData if frame_sample_rate is -1 (captioning disabled)
+        if self.frame_sample_rate == -1:
+            logger.info("Frame captioning is disabled (frame_sample_rate = -1)")
+            return TimeSeriesData(time_to_content={})
+
         frame_count = 0
         time_to_description = {}
         last_description = None
-        logger.info("Starting main loop for frame description generation")
 
         while True:
-            logger.info(f"Frame count: {frame_count}")
             ret, frame = self.cap.read()
             if not ret:
-                logger.info("Reached end of video")
                 break
 
            if frame_count % self.frame_sample_rate == 0:
-                logger.info(f"Processing frame at {frame_count / self.fps:.2f}s")
                timestamp = frame_count / self.fps
                logger.debug(f"Processing frame at {timestamp:.2f}s")
                img_base64 = self.frame_to_base64(frame)
-                response = self.gpt.chat.completions.create(
-                    model="gpt-4o-mini",
-                    messages=[
-                        {
-                            "role": "user",
-                            "content": [
-                                {
-                                    "type": "text",
-                                    "text": f"""Describe this frame from a video. Focus on the main elements, actions, and any notable details. Here is the transcript around the time of the frame:
----
-{self.transcript.at_time(timestamp, padding=10)}
----
+                context = f"""Describe this frame from a video. Focus on the main elements, actions, and any notable details. Here is the transcript around the time of the frame:
+---
+{self.transcript.at_time(timestamp, padding=10)}
+---
 
 Here is a description of the previous frame:
 ---
 {last_description if last_description else 'No previous frame description available, this is the first frame'}
 ---
 
 In your response, only provide the description of the current frame, using the above information as context.
-""",
-                                },
-                                {
-                                    "type": "image_url",
-                                    "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
-                                },
-                            ],
-                        }
-                    ],
-                    max_tokens=300,
-                )
-                last_description = response.choices[0].message.content
+"""
+
+                last_description = await self.vision_client.get_frame_description(
+                    img_base64, context
+                )
                 time_to_description[timestamp] = last_description
 
             frame_count += 1
 
         logger.info(f"Generated descriptions for {len(time_to_description)} frames")
-        return TimeSeriesData(time_to_description)
+        return TimeSeriesData(time_to_content=time_to_description)
 
-    def process_video(self) -> ParseVideoResult:
+    async def process_video(self) -> ParseVideoResult:
         """
         Process the video to get both transcript and frame descriptions
@@ -163,7 +214,7 @@ class VideoParser:
         result = ParseVideoResult(
             metadata=metadata,
             transcript=self.get_transcript(),
-            frame_descriptions=self.get_frame_descriptions(),
+            frame_descriptions=await self.get_frame_descriptions(),
         )
         logger.info("Video processing completed successfully")
         return result
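Taken together, a minimal sketch of the new async flow (the import path and video file name are assumptions; the field names come from the diff above, and frame captioning only runs when frame_sample_rate is not -1):

import asyncio

from core.parser.video.parse_video import VideoParser  # assumed import path for the file above

async def main():
    # frame_sample_rate falls back to [parser.vision] in databridge.toml when omitted
    parser = VideoParser("sample.mp4", assemblyai_api_key="aai-placeholder-key")
    result = await parser.process_video()
    print(result.transcript.time_to_content)
    print(result.frame_descriptions.time_to_content)

asyncio.run(main())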

View File

@@ -58,7 +58,18 @@ use_unstructured_api = false
 # chunk_size = 1000
 # chunk_overlap = 200
 # use_unstructured_api = false
+# frame_sample_rate = 120
+
+[parser.vision]
+provider = "ollama"
+model_name = "llama3.2-vision"
+frame_sample_rate = -1  # Set to -1 to disable frame captioning
+# base_url = "http://localhost:11434"  # Only used for ollama
+base_url = "http://ollama:11434"  # Use if using via docker
+
+# [parser.vision]
+# provider = "openai"
+# model_name = "gpt-4o-mini"
+# frame_sample_rate = -1  # Set to -1 to disable frame captioning
 
 [reranker]
 use_reranker = false
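For orientation, a sketch of how this table is consumed by the new VisionModelClient (the import path, image file, and prompt are assumptions; a reachable vision backend is required):

import asyncio
import base64

from core.parser.video.parse_video import VisionModelClient, load_config  # assumed import path

async def describe_frame(jpeg_path: str) -> str:
    config = load_config()  # reads the [parser.vision] table from databridge.toml
    client = VisionModelClient(config)  # "provider" picks the ollama or openai branch
    with open(jpeg_path, "rb") as f:
        image_base64 = base64.b64encode(f.read()).decode()
    return await client.get_frame_description(image_base64, "Describe this frame.")

print(asyncio.run(describe_frame("frame.jpg")))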

View File

@@ -159,9 +159,7 @@ class Cache:
         """Add documents to the cache"""
         return self._client_cache.add_docs(docs)
 
-    def query(
-        self, query: str, max_tokens: int = None, temperature: float = None
-    ) -> dict:
+    def query(self, query: str, max_tokens: int = None, temperature: float = None) -> dict:
         """Query the cache"""
         response = self._client_cache.query(
             query=query,

View File

@@ -1,9 +1,13 @@
 import uvicorn
 from dotenv import load_dotenv
 from core.config import get_settings
+from core.logging_config import setup_logging
 
 
 def main():
+    # Set up logging first
+    setup_logging()
+
     # Load environment variables from .env file
     load_dotenv()
@@ -16,6 +20,7 @@ def main():
         host=settings.HOST,
         port=settings.PORT,
         loop="asyncio",
+        log_level="info",
         # reload=settings.RELOAD
     )