Fix video parsing bugs, improve server logging

Adityavardhan Agrawal 2025-01-30 16:03:46 -05:00
parent d124e6aa0d
commit 20c3015038
7 changed files with 154 additions and 49 deletions

View File

@@ -18,6 +18,7 @@ class Settings(BaseSettings):
    AWS_SECRET_ACCESS_KEY: Optional[str] = None
    OPENAI_API_KEY: Optional[str] = None
    ANTHROPIC_API_KEY: Optional[str] = None
    ASSEMBLYAI_API_KEY: Optional[str] = None

    # API configuration
    HOST: str
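Because Settings is a pydantic BaseSettings and the server loads a .env file on startup, the new key is picked up from the environment. A minimal sketch of how a caller might check it via the existing get_settings() helper (the error message is illustrative):

from core.config import get_settings

settings = get_settings()
# ASSEMBLYAI_API_KEY is Optional, so None means no transcription credentials were provided
if settings.ASSEMBLYAI_API_KEY is None:
    raise RuntimeError("Set ASSEMBLYAI_API_KEY before parsing video audio")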

core/logging_config.py Normal file
View File

@@ -0,0 +1,39 @@
import logging
import sys
from pathlib import Path


def setup_logging():
    # Create logs directory if it doesn't exist
    log_dir = Path("logs")
    log_dir.mkdir(exist_ok=True)

    # Configure root logger
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)

    # Create formatters
    console_formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
    )

    # Console handler
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setFormatter(console_formatter)
    console_handler.setLevel(logging.INFO)

    # File handler
    file_handler = logging.FileHandler(log_dir / "databridge.log")
    file_handler.setFormatter(console_formatter)
    file_handler.setLevel(logging.INFO)

    # Add handlers to root logger
    root_logger.addHandler(console_handler)
    root_logger.addHandler(file_handler)

    # Set levels for specific loggers
    logging.getLogger("uvicorn").setLevel(logging.INFO)
    logging.getLogger("fastapi").setLevel(logging.INFO)

    # Set debug level for our code
    logging.getLogger("core").setLevel(logging.DEBUG)

View File

@@ -104,7 +104,7 @@ class CombinedParser(BaseParser):
assemblyai_api_key=self.assemblyai_api_key,
frame_sample_rate=self.frame_sample_rate,
)
results = parser.process_video()
results = await parser.process_video()
# Get all frame descriptions
frame_descriptions = results.frame_descriptions
# Get all transcript text

View File

@@ -4,6 +4,10 @@ from openai import OpenAI
import assemblyai as aai
import logging
from core.models.video import TimeSeriesData, ParseVideoResult
import tomli
import os
from typing import Optional, Dict, Any
from ollama import AsyncClient
logger = logging.getLogger(__name__)
@@ -12,19 +16,71 @@ def debug_object(title, obj):
    logger.debug("\n".join(["-" * 100, title, "-" * 100, f"{obj}", "-" * 100]))


def load_config() -> Dict[str, Any]:
    config_path = os.path.join(os.path.dirname(__file__), "../../../databridge.toml")
    with open(config_path, "rb") as f:
        return tomli.load(f)
class VisionModelClient:
    def __init__(self, config: Dict[str, Any]):
        self.config = config["parser"]["vision"]
        self.provider = self.config.get("provider", "ollama")
        self.model_name = self.config.get("model_name", "llama3.2-vision")

        if self.provider == "openai":
            self.client = OpenAI()
        elif self.provider == "ollama":
            base_url = self.config.get("base_url", "http://localhost:11434")
            self.client = AsyncClient(host=base_url)
        else:
            raise ValueError(f"Unsupported vision model provider: {self.provider}")

    async def get_frame_description(self, image_base64: str, context: str) -> str:
        if self.provider == "openai":
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": context},
                            {
                                "type": "image_url",
                                "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
                            },
                        ],
                    }
                ],
                max_tokens=300,
            )
            return response.choices[0].message.content
        else:  # ollama
            response = await self.client.chat(
                model=self.model_name,
                messages=[{"role": "user", "content": context, "images": [image_base64]}],
            )
            return response["message"]["content"]
class VideoParser:
def __init__(self, video_path: str, assemblyai_api_key: str, frame_sample_rate: int = 120):
def __init__(
self, video_path: str, assemblyai_api_key: str, frame_sample_rate: Optional[int] = None
):
"""
Initialize the video parser
Args:
video_path: Path to the video file
assemblyai_api_key: API key for AssemblyAI
frame_sample_rate: Sample every nth frame for description
frame_sample_rate: Sample every nth frame for description (optional, defaults to config value)
"""
logger.info(f"Initializing VideoParser for {video_path}")
self.config = load_config()
self.video_path = video_path
self.frame_sample_rate = frame_sample_rate
self.frame_sample_rate = frame_sample_rate or self.config["parser"]["vision"].get(
"frame_sample_rate", 120
)
self.cap = cv2.VideoCapture(video_path)
if not self.cap.isOpened():
@@ -34,15 +90,15 @@ class VideoParser:
self.fps = self.cap.get(cv2.CAP_PROP_FPS)
self.total_frames = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT))
self.duration = self.total_frames / self.fps
# Initialize AssemblyAI
aai.settings.api_key = assemblyai_api_key
aai_config = aai.TranscriptionConfig(
speaker_labels=True
) # speech_model=aai.SpeechModel.nano
aai_config = aai.TranscriptionConfig(speaker_labels=True)
self.transcriber = aai.Transcriber(config=aai_config)
self.transcript = TimeSeriesData(
time_to_content={}
) # empty transcript initially - TODO: have this be a lateinit somehow
self.gpt = OpenAI()
self.transcript = TimeSeriesData(time_to_content={})
# Initialize vision model client
self.vision_client = VisionModelClient(self.config)
logger.info(f"Video loaded: {self.duration:.2f}s duration, {self.fps:.2f} FPS")
@@ -83,70 +139,65 @@ class VideoParser:
{u.start / 1000: u.text for u in transcript.utterances} if transcript.utterances else {}
)
debug_object("Time to text", time_to_text)
self.transcript = TimeSeriesData(time_to_text)
self.transcript = TimeSeriesData(time_to_content=time_to_text)
return self.transcript
def get_frame_descriptions(self) -> TimeSeriesData:
async def get_frame_descriptions(self) -> TimeSeriesData:
"""
Get descriptions for sampled frames using GPT-4
Get descriptions for sampled frames using configured vision model
Returns:
TimeSeriesData object containing frame descriptions
"""
logger.info("Starting frame description generation")
# Return empty TimeSeriesData if frame_sample_rate is -1 (captioning disabled)
if self.frame_sample_rate == -1:
logger.info("Frame captioning is disabled (frame_sample_rate = -1)")
return TimeSeriesData(time_to_content={})
frame_count = 0
time_to_description = {}
last_description = None
logger.info("Starting main loop for frame description generation")
while True:
logger.info(f"Frame count: {frame_count}")
ret, frame = self.cap.read()
if not ret:
logger.info("Reached end of video")
break
if frame_count % self.frame_sample_rate == 0:
logger.info(f"Processing frame at {frame_count / self.fps:.2f}s")
timestamp = frame_count / self.fps
logger.debug(f"Processing frame at {timestamp:.2f}s")
img_base64 = self.frame_to_base64(frame)
response = self.gpt.chat.completions.create(
model="gpt-4o-mini",
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": f"""Describe this frame from a video. Focus on the main elements, actions, and any notable details. Here is the transcript around the time of the frame:
---
{self.transcript.at_time(timestamp, padding=10)}
---
context = f"""Describe this frame from a video. Focus on the main elements, actions, and any notable details. Here is the transcript around the time of the frame:
---
{self.transcript.at_time(timestamp, padding=10)}
---
Here is a description of the previous frame:
---
{last_description if last_description else 'No previous frame description available, this is the first frame'}
---
Here is a description of the previous frame:
---
{last_description if last_description else 'No previous frame description available, this is the first frame'}
---
In your response, only provide the description of the current frame, using the above information as context.
""",
},
{
"type": "image_url",
"image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
},
],
}
],
max_tokens=300,
In your response, only provide the description of the current frame, using the above information as context.
"""
last_description = await self.vision_client.get_frame_description(
img_base64, context
)
last_description = response.choices[0].message.content
time_to_description[timestamp] = last_description
frame_count += 1
logger.info(f"Generated descriptions for {len(time_to_description)} frames")
return TimeSeriesData(time_to_description)
return TimeSeriesData(time_to_content=time_to_description)
def process_video(self) -> ParseVideoResult:
async def process_video(self) -> ParseVideoResult:
"""
Process the video to get both transcript and frame descriptions
@@ -163,7 +214,7 @@ class VideoParser:
result = ParseVideoResult(
metadata=metadata,
transcript=self.get_transcript(),
frame_descriptions=self.get_frame_descriptions(),
frame_descriptions=await self.get_frame_descriptions(),
)
logger.info("Video processing completed successfully")
return result
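Since process_video() is now a coroutine (CombinedParser already awaits it above), any standalone caller needs an event loop. A minimal sketch with a placeholder path, placeholder key, and an assumed import path:

import asyncio

from core.parser.video.parse_video import VideoParser  # module path assumed for illustration


async def run() -> None:
    # placeholder file and key; frame_sample_rate falls back to databridge.toml when omitted
    parser = VideoParser("sample.mp4", assemblyai_api_key="aai-key")
    result = await parser.process_video()
    print(result.metadata)


asyncio.run(run())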

View File

@@ -58,7 +58,18 @@ use_unstructured_api = false
# chunk_size = 1000
# chunk_overlap = 200
# use_unstructured_api = false
# frame_sample_rate = 120
[parser.vision]
provider = "ollama"
model_name = "llama3.2-vision"
frame_sample_rate = -1 # Set to -1 to disable frame captioning
# base_url = "http://localhost:11434" # Only used for ollama
base_url = "http://ollama:11434" # Use if using via docker
# [parser.vision]
# provider = "openai"
# model_name = "gpt-4o-mini"
# frame_sample_rate = -1 # Set to -1 to disable frame captioning
[reranker]
use_reranker = false
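get_frame_descriptions() describes a frame whenever frame_count % frame_sample_rate == 0, so the spacing between captions in seconds depends on the video's frame rate, and -1 skips captioning entirely. A quick illustration of that arithmetic (the FPS value is only an example):

fps = 30.0  # example frame rate reported by cv2.CAP_PROP_FPS
frame_sample_rate = 120  # describe every 120th frame
print(frame_sample_rate / fps)  # 4.0 seconds between frame descriptions at 30 FPS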

View File

@@ -159,9 +159,7 @@ class Cache:
"""Add documents to the cache"""
return self._client_cache.add_docs(docs)
def query(
self, query: str, max_tokens: int = None, temperature: float = None
) -> dict:
def query(self, query: str, max_tokens: int = None, temperature: float = None) -> dict:
"""Query the cache"""
response = self._client_cache.query(
query=query,

View File

@@ -1,9 +1,13 @@
import uvicorn
from dotenv import load_dotenv
from core.config import get_settings
from core.logging_config import setup_logging
def main():
    # Set up logging first
    setup_logging()

    # Load environment variables from .env file
    load_dotenv()
@@ -16,6 +20,7 @@ def main():
        host=settings.HOST,
        port=settings.PORT,
        loop="asyncio",
        log_level="info",
        # reload=settings.RELOAD
    )