Mirror of https://github.com/james-m-jordan/morphik-core.git (synced 2025-05-09 19:32:38 +00:00)
Commit 20c3015038 (parent d124e6aa0d): Fix video parsing bugs, improve server logging
@@ -18,6 +18,7 @@ class Settings(BaseSettings):
     AWS_SECRET_ACCESS_KEY: Optional[str] = None
     OPENAI_API_KEY: Optional[str] = None
     ANTHROPIC_API_KEY: Optional[str] = None
+    ASSEMBLYAI_API_KEY: Optional[str] = None

     # API configuration
     HOST: str
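
Note (not part of the commit): a minimal sketch of how the new ASSEMBLYAI_API_KEY field gets populated, assuming the pydantic-settings v2 import path; on pydantic v1 the import would be `from pydantic import BaseSettings` instead.

import os
from typing import Optional

from pydantic_settings import BaseSettings  # assumption: pydantic v2 layout


class Settings(BaseSettings):
    ASSEMBLYAI_API_KEY: Optional[str] = None


os.environ["ASSEMBLYAI_API_KEY"] = "aai-key"  # normally supplied via .env
print(Settings().ASSEMBLYAI_API_KEY)  # -> "aai-key"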
core/logging_config.py (new file, 39 lines)
@@ -0,0 +1,39 @@
+import logging
+import sys
+from pathlib import Path
+
+
+def setup_logging():
+    # Create logs directory if it doesn't exist
+    log_dir = Path("logs")
+    log_dir.mkdir(exist_ok=True)
+
+    # Configure root logger
+    root_logger = logging.getLogger()
+    root_logger.setLevel(logging.INFO)
+
+    # Create formatters
+    console_formatter = logging.Formatter(
+        "%(asctime)s - %(name)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
+    )
+
+    # Console handler
+    console_handler = logging.StreamHandler(sys.stdout)
+    console_handler.setFormatter(console_formatter)
+    console_handler.setLevel(logging.INFO)
+
+    # File handler
+    file_handler = logging.FileHandler(log_dir / "databridge.log")
+    file_handler.setFormatter(console_formatter)
+    file_handler.setLevel(logging.INFO)
+
+    # Add handlers to root logger
+    root_logger.addHandler(console_handler)
+    root_logger.addHandler(file_handler)
+
+    # Set levels for specific loggers
+    logging.getLogger("uvicorn").setLevel(logging.INFO)
+    logging.getLogger("fastapi").setLevel(logging.INFO)
+
+    # Set debug level for our code
+    logging.getLogger("core").setLevel(logging.DEBUG)
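
For context, a usage sketch (not from the commit) of how the new module is wired in; start_server.py below calls it once before anything logs.

import logging
from core.logging_config import setup_logging

setup_logging()
logging.getLogger("core.parser").info("goes to stdout and logs/databridge.log")
# Caveat: "core" loggers are set to DEBUG, but both handlers are set to INFO,
# so DEBUG records are still dropped at the handlers unless their levels are
# lowered as well.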
@@ -104,7 +104,7 @@ class CombinedParser(BaseParser):
             assemblyai_api_key=self.assemblyai_api_key,
             frame_sample_rate=self.frame_sample_rate,
         )
-        results = parser.process_video()
+        results = await parser.process_video()
         # Get all frame descriptions
         frame_descriptions = results.frame_descriptions
         # Get all transcript text
@@ -4,6 +4,10 @@ from openai import OpenAI
 import assemblyai as aai
 import logging
 from core.models.video import TimeSeriesData, ParseVideoResult
+import tomli
+import os
+from typing import Optional, Dict, Any
+from ollama import AsyncClient

 logger = logging.getLogger(__name__)
@@ -12,19 +16,71 @@ def debug_object(title, obj):
     logger.debug("\n".join(["-" * 100, title, "-" * 100, f"{obj}", "-" * 100]))


+def load_config() -> Dict[str, Any]:
+    config_path = os.path.join(os.path.dirname(__file__), "../../../databridge.toml")
+    with open(config_path, "rb") as f:
+        return tomli.load(f)
+
+
+class VisionModelClient:
+    def __init__(self, config: Dict[str, Any]):
+        self.config = config["parser"]["vision"]
+        self.provider = self.config.get("provider", "ollama")
+        self.model_name = self.config.get("model_name", "llama3.2-vision")
+
+        if self.provider == "openai":
+            self.client = OpenAI()
+        elif self.provider == "ollama":
+            base_url = self.config.get("base_url", "http://localhost:11434")
+            self.client = AsyncClient(host=base_url)
+        else:
+            raise ValueError(f"Unsupported vision model provider: {self.provider}")
+
+    async def get_frame_description(self, image_base64: str, context: str) -> str:
+        if self.provider == "openai":
+            response = self.client.chat.completions.create(
+                model=self.model_name,
+                messages=[
+                    {
+                        "role": "user",
+                        "content": [
+                            {"type": "text", "text": context},
+                            {
+                                "type": "image_url",
+                                "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
+                            },
+                        ],
+                    }
+                ],
+                max_tokens=300,
+            )
+            return response.choices[0].message.content
+        else:  # ollama
+            response = await self.client.chat(
+                model=self.model_name,
+                messages=[{"role": "user", "content": context, "images": [image_base64]}],
+            )
+            return response["message"]["content"]
+
+
 class VideoParser:
-    def __init__(self, video_path: str, assemblyai_api_key: str, frame_sample_rate: int = 120):
+    def __init__(
+        self, video_path: str, assemblyai_api_key: str, frame_sample_rate: Optional[int] = None
+    ):
         """
         Initialize the video parser

         Args:
             video_path: Path to the video file
             assemblyai_api_key: API key for AssemblyAI
-            frame_sample_rate: Sample every nth frame for description
+            frame_sample_rate: Sample every nth frame for description (optional, defaults to config value)
         """
         logger.info(f"Initializing VideoParser for {video_path}")
+        self.config = load_config()
         self.video_path = video_path
-        self.frame_sample_rate = frame_sample_rate
+        self.frame_sample_rate = frame_sample_rate or self.config["parser"]["vision"].get(
+            "frame_sample_rate", 120
+        )
         self.cap = cv2.VideoCapture(video_path)

         if not self.cap.isOpened():
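
A hedged sketch of how the new VisionModelClient is meant to be driven (load_config and VisionModelClient as defined above; the image payload and helper name are placeholders):

import asyncio

async def describe_one(image_base64: str) -> str:
    config = load_config()  # reads databridge.toml relative to this module
    client = VisionModelClient(config)
    # get_frame_description is async for both providers, so callers always
    # await it; note the OpenAI branch still makes a blocking HTTP call
    # inside the coroutine.
    return await client.get_frame_description(image_base64, "Describe this frame.")

# asyncio.run(describe_one(some_base64_jpeg))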
@@ -34,15 +90,15 @@ class VideoParser:
         self.fps = self.cap.get(cv2.CAP_PROP_FPS)
         self.total_frames = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT))
         self.duration = self.total_frames / self.fps

+        # Initialize AssemblyAI
         aai.settings.api_key = assemblyai_api_key
-        aai_config = aai.TranscriptionConfig(
-            speaker_labels=True
-        )  # speech_model=aai.SpeechModel.nano
+        aai_config = aai.TranscriptionConfig(speaker_labels=True)
         self.transcriber = aai.Transcriber(config=aai_config)
-        self.transcript = TimeSeriesData(
-            time_to_content={}
-        )  # empty transcript initially - TODO: have this be a lateinit somehow
-        self.gpt = OpenAI()
+        self.transcript = TimeSeriesData(time_to_content={})
+        # Initialize vision model client
+        self.vision_client = VisionModelClient(self.config)

         logger.info(f"Video loaded: {self.duration:.2f}s duration, {self.fps:.2f} FPS")
@@ -83,70 +139,65 @@ class VideoParser:
             {u.start / 1000: u.text for u in transcript.utterances} if transcript.utterances else {}
         )
         debug_object("Time to text", time_to_text)
-        self.transcript = TimeSeriesData(time_to_text)
+        self.transcript = TimeSeriesData(time_to_content=time_to_text)
         return self.transcript

-    def get_frame_descriptions(self) -> TimeSeriesData:
+    async def get_frame_descriptions(self) -> TimeSeriesData:
         """
-        Get descriptions for sampled frames using GPT-4
+        Get descriptions for sampled frames using configured vision model

         Returns:
             TimeSeriesData object containing frame descriptions
         """
         logger.info("Starting frame description generation")
+
+        # Return empty TimeSeriesData if frame_sample_rate is -1 (captioning disabled)
+        if self.frame_sample_rate == -1:
+            logger.info("Frame captioning is disabled (frame_sample_rate = -1)")
+            return TimeSeriesData(time_to_content={})
+
         frame_count = 0
         time_to_description = {}
         last_description = None
+        logger.info("Starting main loop for frame description generation")
         while True:
+            logger.info(f"Frame count: {frame_count}")
             ret, frame = self.cap.read()
             if not ret:
+                logger.info("Reached end of video")
                 break

             if frame_count % self.frame_sample_rate == 0:
+                logger.info(f"Processing frame at {frame_count / self.fps:.2f}s")
                 timestamp = frame_count / self.fps
                 logger.debug(f"Processing frame at {timestamp:.2f}s")

                 img_base64 = self.frame_to_base64(frame)

-                response = self.gpt.chat.completions.create(
-                    model="gpt-4o-mini",
-                    messages=[
-                        {
-                            "role": "user",
-                            "content": [
-                                {
-                                    "type": "text",
-                                    "text": f"""Describe this frame from a video. Focus on the main elements, actions, and any notable details. Here is the transcript around the time of the frame:
----
-{self.transcript.at_time(timestamp, padding=10)}
----
+                context = f"""Describe this frame from a video. Focus on the main elements, actions, and any notable details. Here is the transcript around the time of the frame:
+---
+{self.transcript.at_time(timestamp, padding=10)}
+---

 Here is a description of the previous frame:
 ---
 {last_description if last_description else 'No previous frame description available, this is the first frame'}
 ---

 In your response, only provide the description of the current frame, using the above information as context.
-""",
-                                },
-                                {
-                                    "type": "image_url",
-                                    "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
-                                },
-                            ],
-                        }
-                    ],
-                    max_tokens=300,
+"""
+
+                last_description = await self.vision_client.get_frame_description(
+                    img_base64, context
                 )
-                last_description = response.choices[0].message.content
                 time_to_description[timestamp] = last_description

             frame_count += 1

         logger.info(f"Generated descriptions for {len(time_to_description)} frames")
-        return TimeSeriesData(time_to_description)
+        return TimeSeriesData(time_to_content=time_to_description)

-    def process_video(self) -> ParseVideoResult:
+    async def process_video(self) -> ParseVideoResult:
         """
         Process the video to get both transcript and frame descriptions
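
For reference, a sketch of the TimeSeriesData surface the parser relies on; the exact return shape of at_time is defined in core.models.video, and the timestamps here are illustrative:

from core.models.video import TimeSeriesData

transcript = TimeSeriesData(time_to_content={0.0: "intro", 12.5: "demo starts"})
# get_frame_descriptions() pulls transcript text around each frame like this:
nearby = transcript.at_time(10.0, padding=10)  # content near t=10.0s, +/-10s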
@@ -163,7 +214,7 @@ class VideoParser:
         result = ParseVideoResult(
             metadata=metadata,
             transcript=self.get_transcript(),
-            frame_descriptions=self.get_frame_descriptions(),
+            frame_descriptions=await self.get_frame_descriptions(),
         )
         logger.info("Video processing completed successfully")
         return result
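
Since process_video() is now a coroutine, synchronous callers need an event loop; a sketch with hypothetical paths and keys:

import asyncio

async def run() -> None:
    # frame_sample_rate omitted -> falls back to the databridge.toml value
    parser = VideoParser("videos/input.mp4", assemblyai_api_key="aai-key")
    result = await parser.process_video()
    print(result.metadata)

asyncio.run(run())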
@@ -58,7 +58,18 @@ use_unstructured_api = false
 # chunk_size = 1000
 # chunk_overlap = 200
 # use_unstructured_api = false
-# frame_sample_rate = 120
+
+[parser.vision]
+provider = "ollama"
+model_name = "llama3.2-vision"
+frame_sample_rate = -1  # Set to -1 to disable frame captioning
+# base_url = "http://localhost:11434"  # Only used for ollama
+base_url = "http://ollama:11434"  # Use if using via docker
+
+# [parser.vision]
+# provider = "openai"
+# model_name = "gpt-4o-mini"
+# frame_sample_rate = -1  # Set to -1 to disable frame captioning

 [reranker]
 use_reranker = false
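
A sketch of reading the new [parser.vision] table the same way load_config() does (path shortened for illustration):

import tomli

with open("databridge.toml", "rb") as f:
    config = tomli.load(f)

vision = config["parser"]["vision"]
print(vision["provider"])               # "ollama"
print(vision.get("frame_sample_rate"))  # -1 -> frame captioning disabled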
shell.py (4 lines changed)
@@ -159,9 +159,7 @@ class Cache:
         """Add documents to the cache"""
         return self._client_cache.add_docs(docs)

-    def query(
-        self, query: str, max_tokens: int = None, temperature: float = None
-    ) -> dict:
+    def query(self, query: str, max_tokens: int = None, temperature: float = None) -> dict:
         """Query the cache"""
         response = self._client_cache.query(
             query=query,
@@ -1,9 +1,13 @@
 import uvicorn
 from dotenv import load_dotenv
 from core.config import get_settings
+from core.logging_config import setup_logging


 def main():
+    # Set up logging first
+    setup_logging()
+
     # Load environment variables from .env file
     load_dotenv()
@@ -16,6 +20,7 @@ def main():
         host=settings.HOST,
         port=settings.PORT,
         loop="asyncio",
+        log_level="info",
         # reload=settings.RELOAD
     )