diff --git a/python/GeminiVideoSummarization/Dockerfile b/python/GeminiVideoSummarization/Dockerfile new file mode 100644 index 00000000..626354ec --- /dev/null +++ b/python/GeminiVideoSummarization/Dockerfile @@ -0,0 +1,67 @@ +# syntax=docker/dockerfile:experimental + +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2024 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. 
# +############################################################################# + +ARG BUILD_REGISTRY +ARG BUILD_TAG=latest +FROM ${BUILD_REGISTRY}openmpf_python_executor_ssb:${BUILD_TAG} + +RUN --mount=type=tmpfs,target=/var/cache/apt \ + --mount=type=tmpfs,target=/var/lib/apt/lists \ + --mount=type=tmpfs,target=/tmp \ + apt-get update; \ + DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y wget \ + # For Google Gemini + # After installing the following /usr/bin will have: + # python3 -> python3.8 + # python3.8 + # python3.9 + python3.9 python3.9-venv libpython3.9 + +# Create separate venv for Python 3.9 subprocess +RUN mkdir -p /gemini-subprocess/venv; \ + python3.9 -m venv /gemini-subprocess/venv; \ + /gemini-subprocess/venv/bin/pip3 install google-genai google-cloud-storage pytictoc + +COPY gemini-process-video.py /gemini-subprocess + +RUN pip3 install --upgrade pip + +RUN pip3 install tenacity + +ARG RUN_TESTS=false + +RUN --mount=target=.,readwrite \ + install-component.sh; \ + if [ "${RUN_TESTS,,}" == true ]; then python tests/test_gemini.py; fi + +LABEL org.label-schema.license="Apache 2.0" \ + org.label-schema.name="OpenMPF Gemini Video Summarization" \ + org.label-schema.schema-version="1.0" \ + org.label-schema.url="https://openmpf.github.io" \ + org.label-schema.vcs-url="https://github.com/openmpf/openmpf-components" \ + org.label-schema.vendor="MITRE" \ No newline at end of file diff --git a/python/GeminiVideoSummarization/README.md b/python/GeminiVideoSummarization/README.md new file mode 100644 index 00000000..dde4a001 --- /dev/null +++ b/python/GeminiVideoSummarization/README.md @@ -0,0 +1,70 @@ +# Overview + +This repository contains source code for the OpenMPF Gemini Video Summarization Component. + +This component analyzes a video with a summary and event timeline using Vertex AI on the Google Cloud Platform(GCP) and their provided models. By default, this component does a general video summarization and event timeline. 
+ +# Job Properties + +The following are the properties that can be specified for the component. The first nine properties listed below have default values; the remaining properties must be specified for this component. + +- `MODEL_NAME`: The name of the Gemini model to use. +- `GENERATION_PROMPT_PATH`: Path to a txt file which contains the prompt. +- `GENERATION_MAX_ATTEMPTS`: The number of attempts to get a valid JSON response from the model. +- `TIMELINE_CHECK_TARGET_THRESHOLD`: Specifies the number of seconds that video events can occur before or after video bounds. If exceeded, another attempt will be made to generate the output. Set to -1 to disable the check. +- `MERGE_TRACKS`: In the context of videos, when set to true, attempt to merge tracks from the entire video. +- `TARGET_SEGMENT_LENGTH`: The length of segments the video is divided into. +- `VFR_TARGET_SEGMENT_LENGTH`: The length of segments the video is divided into, for variable frame rate (VFR) videos. +- `PROCESS_FPS`: The number of frames to process per second. The FPS processing limit for Gemini is (0.0, 24.0]. +- `ENABLE_TIMELINE`: When set to 1, this enables the creation of timelines. If there is no custom prompt file, it will use default_prompt.txt. When set to 0, this disables timelines. If there is no custom prompt file, it will use default_prompt_no_tl.txt. + +REQUIRED PROPERTIES: +- `PROJECT_ID`: The project ID for your GCP project. +- `BUCKET_NAME`: The GCP bucket that holds the data for processing. +- `LABEL_PREFIX`: The prefix of your labels, i.e., your project name. +- `LABEL_USER`: The user using the GCP resources. +- `LABEL_PURPOSE`: The reason for using the GCP resources. +- `GOOGLE_APPLICATION_CREDENTIALS`: Your gcloud CLI credentials in a JSON file. + +# Custom Prompts + +For the default prompt with timelines enabled refer to gemini_video_summarization_component/data/default_prompt.txt. + +For the default prompt WITHOUT timelines enabled refer to gemini_video_summarization_component/data/default_prompt_no_tl.txt. 
+ +Set GENERATION_PROMPT_PATH to specify the path to a txt file containing a generation prompt to provide the model. + +When making a custom prompt or altering the default prompt with timelines enabled, it is required that the parts about timestamp formatting, timestamp offsets and the JSON structured output are included in your prompt. + +# GCP Certificate + +Using this component requires access to Vertex AI, which means adding a GCP certificate is required. +Set GOOGLE_APPLICATION_CREDENTIALS to specify the path to the GCP certificate JSON file. + +# Timestamps + +When videos exceed 2 minutes in length, timestamps for events become more inaccurate. For accurate timestamps, it is recommended to keep TARGET_SEGMENT_LENGTH and VFR_TARGET_SEGMENT_LENGTH at 120. +If you'd prefer more cohesive summaries over timeline accuracy, you can pass the whole video as one segment by setting TARGET_SEGMENT_LENGTH and VFR_TARGET_SEGMENT_LENGTH to -1. +Keep in mind the maximum length of a video that can be processed is 45 minutes (2700s). +This means TARGET_SEGMENT_LENGTH and VFR_TARGET_SEGMENT_LENGTH both have a max of 2700 and are REQUIRED to be set for videos longer than 45 minutes. + +To prevent further inaccuracies, Gemini handles timestamps best when formatting them in MM:SS format. This means the component itself does the conversions between that format and seconds. +So, if altering the prompt, leave in the instructions about timestamp formatting. + +Another cause of timestamp inaccuracies is the model you are using. Not only do descriptions and summaries lower in quality with Gemini flash models, the timestamps also become more inaccurate. +For the most accurate timestamps, use segmentation and the latest Gemini pro model. 
+ +# Docker Container + gemini-video-summarization: + <<: *detection-component-base + image: + volumes: + - host_directory/cert_name.json:container_directory/cert_name.json:ro # Mount the GCP file from your localhost to the container + - host_directory/prompt_file.txt:container_directory/prompt_file.txt:ro # OPTIONAL: Mount the custom prompt file from your localhost to the container + - shared_data:/opt/mpf/share + environment: + - MPF_PROP_PROJECT_ID= + - MPF_PROP_BUCKET_NAME= + ... # Add more properties here + - MPF_PROP_GOOGLE_APPLICATION_CREDENTIALS=container_directory/cert_name.json + - MPF_PROP_GENERATION_PROMPT_PATH=container_directory/prompt_file.txt # OPTIONAL, but needed if mounted a custom prompt file diff --git a/python/GeminiVideoSummarization/gemini-process-video.py b/python/GeminiVideoSummarization/gemini-process-video.py new file mode 100644 index 00000000..5cbc8474 --- /dev/null +++ b/python/GeminiVideoSummarization/gemini-process-video.py @@ -0,0 +1,160 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2024 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. 
# +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +############################################################################# + +import argparse +import json +import os +import sys + +from google import genai +from google.genai import types +from google.genai.types import Part +from google.cloud import storage +from google.genai.errors import ClientError + + +def main(): + parser = argparse.ArgumentParser(description='Sends image and prompt to Gemini Client for processing.') + + parser.add_argument("--model", "-m", type=str, default="gemini-2.5-flash", help="The name of the Gemini model to use.") + parser.add_argument("--data_uri", "-d", type=str, required=True, help="Path to the media file to process with Gemini.") + parser.add_argument("--prompt", "-p", type=str, help="The prompt you want to use with the video.") + parser.add_argument("--google_application_credientials", "-c", type=str, required=True, help="The JSON file to your credentials to use Vertex AI.") + parser.add_argument("--project_id", "-i", type=str, required=True, help="Name of your GCP project.") + parser.add_argument("--bucket_name", "-b", type=str, required=True, help="Name of the GCP bucket.") + parser.add_argument("--label_prefix", "-l", type=str, required=True, help="Label prefix to use when uploading the video to GCP.") + parser.add_argument("--label_user", "-u", type=str, required=True, help="User of whom is accessing the GCP resources.") + parser.add_argument("--label_purpose", "-r", type=str, required=True, help="Purpose of accessing the GCP resources.") + parser.add_argument("--segment_start", "-s", 
type=str, required=True, help="Start time of the current segment.") + parser.add_argument("--segment_stop", "-e", type=str, required=True, help="End time of the current segment.") + parser.add_argument("--fps", "-f", type=str, default="1.0", help="Specifies the number of frames per second of video to process.") + + args = parser.parse_args() + + try: + os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = args.google_application_credientials + + # GCP resources + USER = args.label_user + PURPOSE = args.label_purpose + LABEL_PREFIX = args.label_prefix + PROJECT_ID = args.project_id + BUCKET_NAME = args.bucket_name + + PROMPT = args.prompt + MODEL = args.model + + # Video segment storage information + FILE_PATH = args.data_uri + FILE_NAME = os.path.basename(FILE_PATH) + STORAGE_PATH = USER + "/" + FILE_NAME + + SEGMENT_START = int(float(args.segment_start)) + SEGMENT_STOP = int(float(args.segment_stop)) + FPS = float(args.fps) + + # Automatically uses ADC to authenticate + client = storage.Client( + project=PROJECT_ID + ) + + # Uploads file to GCP bucket + bucket = client.bucket(BUCKET_NAME) + blob = bucket.blob(STORAGE_PATH) + + # There is no way to set a time-to-live (TTL) for a file in Google Storage. + # The file will be deleted manually at the end of this script. + # If you want to set a TTL, you can use the `lifecycle` configuration in the bucket settings. + # See: https://cloud.google.com/storage/docs/lifecycle + # For example, you can set a rule to delete files older than 30 days. 
+ blob.upload_from_filename(FILE_PATH) + + file_uri = f"gs://{BUCKET_NAME}/{STORAGE_PATH}" + + # Automatically uses ADC to authenticate + client = genai.Client( + project=PROJECT_ID, + location="global", + vertexai=True + ) + + content_config = None + if(USER != "" and LABEL_PREFIX != "" and PURPOSE != ""): + # Use labels to help track billing and usage + content_config = types.GenerateContentConfig( + labels={ + LABEL_PREFIX + "user": USER, + LABEL_PREFIX + "purpose": PURPOSE, + LABEL_PREFIX + "modality": "video" + } + ) + + response = client.models.generate_content( + model=MODEL, + contents=types.Content( + role='user', + parts=[ + Part( + file_data=types.FileData( + file_uri=file_uri, + mime_type='video/mp4' + ), + video_metadata=types.VideoMetadata( + start_offset=str(SEGMENT_START) + "s", + end_offset=str(SEGMENT_STOP) + "s", + fps=FPS + ) + ), + Part( + text=PROMPT + ) + ], + ), + config=content_config + ) + print(f"\n{response.text}") + + # Set a generation-match precondition to avoid potential race conditions + # and data corruptions. The request to delete is aborted if the object's + # generation number does not match your precondition. + blob.reload() # fetch blob metadata to use in generation_match_precondition. 
+ generation_match_precondition = blob.generation + + blob.delete(if_generation_match=generation_match_precondition) + + sys.exit(0) + except ClientError as e: + if hasattr(e, 'code') and e.code == 429: + print("Caught a ResourceExhausted error (429 Too Many Requests)", file=sys.stderr) + sys.exit(1) + raise + except Exception as e: + err_str = str(e) + print(err_str, file=sys.stderr) + sys.exit(1) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/python/GeminiVideoSummarization/gemini_video_summarization_component/__init__.py b/python/GeminiVideoSummarization/gemini_video_summarization_component/__init__.py new file mode 100644 index 00000000..5bb341e9 --- /dev/null +++ b/python/GeminiVideoSummarization/gemini_video_summarization_component/__init__.py @@ -0,0 +1,27 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2025 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2025 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. 
# +############################################################################# + +from .gemini_video_summarization_component import GeminiVideoSummarizationComponent \ No newline at end of file diff --git a/python/GeminiVideoSummarization/gemini_video_summarization_component/data/default_prompt.txt b/python/GeminiVideoSummarization/gemini_video_summarization_component/data/default_prompt.txt new file mode 100644 index 00000000..cbdba480 --- /dev/null +++ b/python/GeminiVideoSummarization/gemini_video_summarization_component/data/default_prompt.txt @@ -0,0 +1,35 @@ +You are a helpful assistant analyzing the provided video. + +First, write a brief summary of the video content. + +Next, create a timeline of key events that spans the full duration of the video. Include at least one event. +For each event, provide + - a short description + - the start time + - the end time. + +Use MM:SS format for all timestamps, even if the total time exceeds 60 minutes. DO NOT USE HH:MM:SS. For example, use 75:30 instead of 01:15:30. + +Assume the video starts at 00:00, even if it is a segment from a longer recording. + +Respond only with a JSON object in the following format. 
Do not include any extra text, commentary, or explanations: +{ + "video_summary": STRING, + "video_event_timeline": [ + { + "timestamp_start": STRING, + "timestamp_end": STRING, + "description": STRING + }, + { + "timestamp_start": STRING, + "timestamp_end": STRING, + "description": STRING + }, + { + "timestamp_start": STRING, + "timestamp_end": STRING, + "description": STRING + } + ] +} \ No newline at end of file diff --git a/python/GeminiVideoSummarization/gemini_video_summarization_component/data/default_prompt_no_tl.txt b/python/GeminiVideoSummarization/gemini_video_summarization_component/data/default_prompt_no_tl.txt new file mode 100644 index 00000000..b70eebc8 --- /dev/null +++ b/python/GeminiVideoSummarization/gemini_video_summarization_component/data/default_prompt_no_tl.txt @@ -0,0 +1,8 @@ +You are a helpful assistant analyzing the provided video. + +Write a brief summary of the video content. + +Respond only with a JSON object in the following format. Do not include any extra text, commentary, or explanations: +{ + "video_summary": STRING +} \ No newline at end of file diff --git a/python/GeminiVideoSummarization/gemini_video_summarization_component/gemini_video_summarization_component.py b/python/GeminiVideoSummarization/gemini_video_summarization_component/gemini_video_summarization_component.py new file mode 100644 index 00000000..e612a2ef --- /dev/null +++ b/python/GeminiVideoSummarization/gemini_video_summarization_component/gemini_video_summarization_component.py @@ -0,0 +1,465 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. 
# +############################################################################# + +############################################################################# +# Copyright 2024 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +############################################################################# + +import os +import json +import subprocess +import logging +from typing import Iterable, Mapping, Tuple, Union +from tenacity import retry, wait_random_exponential, stop_after_delay, retry_if_exception + +import mpf_component_api as mpf +import mpf_component_util as mpf_util + +logger = logging.getLogger('GeminiVideoSummarizationComponent') + +class GeminiVideoSummarizationComponent: + + def __init__(self): + + self.google_application_credentials = '' + self.project_id = '' + self.bucket_name = '' + self.label_prefix = '' + self.label_user = '' + self.label_purpose = '' + + def get_detections_from_video(self, job: mpf.VideoJob) -> Iterable[mpf.VideoTrack]: + logger.info('Received video job: %s', job.job_name) + + if job.feed_forward_track: + raise mpf.DetectionError.UNSUPPORTED_DATA_TYPE.exception( + 'Component cannot process feed forward jobs.') + + if job.stop_frame < 0: + raise mpf.DetectionError.UNSUPPORTED_DATA_TYPE.exception( + 'Job stop frame must be >= 0.') + tracks = [] + + config = JobConfig(job.job_properties, job.media_properties) + + self.google_application_credentials = 
config.google_application_credentials + self.project_id = config.project_id + self.bucket_name = config.bucket_name + self.label_prefix = config.label_prefix + self.label_user = config.label_user + self.label_purpose = config.label_purpose + fps = config.process_fps + enable_timeline=config.enable_timeline + + segment_start_time = job.start_frame / float(job.media_properties['FPS']) + segment_stop_time = (job.stop_frame + 1) / float(job.media_properties['FPS']) + + prompt = _read_file(config.generation_prompt_path) + + model_name = config.model_name + max_attempts = int(config.generation_max_attempts) + timeline_check_target_threshold = int(config.timeline_check_target_threshold) + + error = None + attempts = dict( + base=0, + timeline=0) + + while max(attempts.values()) < max_attempts: + error= None + response = self._get_gemini_response(model_name, job.data_uri, prompt, segment_start_time, segment_stop_time, fps) + if '```json\n' in response and '```' in response: + try: + response = response.split('```json\n')[1].split('```')[0] + except IndexError: + # Fallback if splitting fails unexpectedly + error = "Invalid response format" + continue + response_json, error = self._check_response(attempts, max_attempts, response) + if error is not None: + continue + + if enable_timeline == 1: + event_timeline = response_json['video_event_timeline'] + error = self._check_timeline( + timeline_check_target_threshold, attempts, max_attempts, segment_start_time, segment_stop_time, event_timeline) + if error is not None: + continue + + break + + if error: + raise mpf.DetectionError.DETECTION_FAILED.exception(f'Failed to produce valid JSON file: {error}') + + tracks = self._create_tracks(job, response_json, enable_timeline) + + logger.info(f"Job complete. 
Found {len(tracks)} tracks.") + return tracks + + def _is_rate_limit_error(self, stderr): + return "Caught a ResourceExhausted error (429 Too Many Requests)" in stderr + + @retry( + # Each wait is between 4 and multiplier * 2^n seconds, where n is the number of retries. The max wait capped at 32 seconds. + wait=wait_random_exponential(multiplier=2, max=32, min=4), + # Stops retrying after the total time waiting >=60s, checks after each attempt + stop=stop_after_delay(60), + retry=retry_if_exception(lambda e: isinstance(e, mpf.DetectionException) and getattr(e, 'rate_limit', False)) + ) + + def _create_tracks(self, job: mpf.VideoJob, response_json: dict, enable_timeline) -> Iterable[mpf.VideoTrack]: + logger.info('Creating tracks.') + tracks = [] + + segment_id = str(job.start_frame) + "-" + str(job.stop_frame) + video_fps = float(job.media_properties['FPS']) + segment_start_time = job.start_frame / float(job.media_properties['FPS']) + + frame_width = 0 + frame_height = 0 + if 'FRAME_WIDTH' in job.media_properties: + frame_width = int(job.media_properties['FRAME_WIDTH']) + if 'FRAME_HEIGHT' in job.media_properties: + frame_height = int(job.media_properties['FRAME_HEIGHT']) + + if enable_timeline == 1: + summary_track = self._create_segment_summary_track(job, response_json) + tracks.append(summary_track) + + for event in response_json['video_event_timeline']: + + # get offset start/stop times in milliseconds + event_start_time = self.convert_mm_ss_to_seconds(event["timestamp_start"], segment_start_time) * 1000 + event_stop_time = self.convert_mm_ss_to_seconds(event["timestamp_end"], segment_start_time) * 1000 + + offset_start_frame = int((event_start_time * video_fps) / 1000) + offset_stop_frame = int((event_stop_time * video_fps) / 1000) - 1 + + detection_properties={ + "SEGMENT ID": segment_id, + "TEXT": event['description'] + } + + # check offset_stop_frame + if offset_stop_frame > job.stop_frame: + logger.debug(f'offset_stop_frame outside of acceptable range ' + 
f'({offset_stop_frame} > {job.stop_frame}), setting offset_stop_frame to {job.stop_frame}') + offset_stop_frame = job.stop_frame + elif offset_stop_frame < job.start_frame: + logger.debug(f'offset_stop_frame outside of acceptable range ' + f'({offset_stop_frame} < {job.start_frame}), setting offset_stop_frame to {job.start_frame}') + offset_stop_frame = job.start_frame + + # check offset_start_frame + if offset_start_frame > job.stop_frame: + logger.debug(f'offset_start_frame outside of acceptable range ' + f'({offset_start_frame} > {job.stop_frame}), setting offset_start_frame to {job.stop_frame}') + offset_start_frame = job.stop_frame + elif offset_start_frame < job.start_frame: + logger.debug(f'offset_start_frame outside of acceptable range ' + f'({offset_start_frame} < {job.start_frame}), setting offset_start_frame to {job.start_frame}') + offset_start_frame = job.start_frame + + offset_middle_frame = int((offset_stop_frame - offset_start_frame) / 2) + offset_start_frame + + # check offset_middle_frame + if offset_middle_frame > job.stop_frame: + logger.debug(f'offset_middle_frame outside of acceptable range ' + f'({offset_middle_frame} > {job.stop_frame}), setting offset_middle_frame to {job.stop_frame}') + offset_middle_frame = job.stop_frame + elif offset_middle_frame < job.start_frame: + logger.debug(f'offset_middle_frame outside of acceptable range ' + f'({offset_middle_frame} < {job.start_frame}), setting offset_middle_frame to {job.start_frame}') + offset_middle_frame = job.start_frame + + track = mpf.VideoTrack( + offset_start_frame, + offset_stop_frame, + 1.0, + # Add start and top frame locations to prevent the Workflow Manager from dropping / truncating track. + # Add middle frame for artifact extraction. 
+ frame_locations = { + offset_start_frame: mpf.ImageLocation(0, 0, frame_width, frame_height, 1.0), + offset_middle_frame: mpf.ImageLocation(0, 0, frame_width, frame_height, 1.0), + offset_stop_frame: mpf.ImageLocation(0, 0, frame_width, frame_height, 1.0) + }, + detection_properties = detection_properties + ) + + track.frame_locations[offset_middle_frame].detection_properties["EXEMPLAR"] = "1" + + tracks.append(track) + + else: # no events timeline, create summary only + tracks.append(self._create_segment_summary_track(job, response_json)) + + logger.info('Processing complete. Video segment %s summarized in %d tracks.' % (segment_id, len(tracks))) + return tracks + + def _create_segment_summary_track(self, job: mpf.VideoJob, response_json: dict) -> mpf.VideoTrack: + start_frame = job.start_frame + stop_frame = job.stop_frame + + segment_id = str(job.start_frame) + "-" + str(job.stop_frame) + detection_properties={ + "SEGMENT ID": segment_id, + "SEGMENT SUMMARY": "TRUE", + "TEXT": response_json['video_summary'] + } + frame_width = 0 + frame_height = 0 + if 'FRAME_WIDTH' in job.media_properties: + frame_width = int(job.media_properties['FRAME_WIDTH']) + if 'FRAME_HEIGHT' in job.media_properties: + frame_height = int(job.media_properties['FRAME_HEIGHT']) + + middle_frame = int((stop_frame - start_frame) / 2) + start_frame + + track = mpf.VideoTrack( + start_frame, + stop_frame, + 1.0, + # Add start and top frame locations to prevent the Workflow Manager from dropping / truncating track. + # Add middle frame for artifact extraction. 
+ frame_locations = { + start_frame: mpf.ImageLocation(0, 0, frame_width, frame_height, 1.0), + middle_frame: mpf.ImageLocation(0, 0, frame_width, frame_height, 1.0), + stop_frame: mpf.ImageLocation(0, 0, frame_width, frame_height, 1.0) + }, + detection_properties = detection_properties + ) + + track.frame_locations[middle_frame].detection_properties["EXEMPLAR"] = "1" + + return track + + def _check_response(self, attempts: dict, max_attempts: int, response: str + ) -> Tuple[Union[dict, None], Union[str, None]]: + response_json = None + + if not response: + error = 'Empty response.' + logger.warning(error) + logger.warning(f'Failed {attempts["base"] + 1} of {max_attempts} base attempts.') + attempts['base'] += 1 + return None, error + + try: + response_json = json.loads(response) + except ValueError as ve: + error = 'Response is not valid JSON.' + logger.warning(error) + logger.warning(str(ve)) + logger.warning(f'Failed {attempts["base"] + 1} of {max_attempts} base attempts.') + attempts['base'] += 1 + return response_json, error + + return response_json, None + + + def _check_timeline(self, threshold: float, attempts: dict, max_attempts: int, + segment_start_time: float, segment_stop_time: float, event_timeline: list + ) -> Union[str, None]: + + error = None + + if not event_timeline: + error = 'No timeline events found in response.' 
+ logger.warning(error) + logger.warning(f'Failed {attempts["timeline"] + 1} of {max_attempts} timeline attempts.') + attempts['timeline'] += 1 + return error + + for event in event_timeline: + + try: + timestamp_start = self.convert_mm_ss_to_seconds(event["timestamp_start"], segment_start_time) + timestamp_end = self.convert_mm_ss_to_seconds(event["timestamp_end"], segment_start_time) + + if timestamp_start < 0: + error = (f'Timeline event start time of {timestamp_start} < 0.') + break + + if timestamp_end < 0: + error = (f'Timeline event end time of {timestamp_end} < 0.') + break + + if timestamp_end < timestamp_start: + error = (f'Timeline event end time is less than event start time. ' + f'{timestamp_end} < {timestamp_start}.') + break + + if threshold != -1: + + if (segment_start_time - timestamp_start) > threshold: + error = (f'Timeline event start time occurs too soon before segment start time. ' + f'({segment_start_time} - {timestamp_start}) > {threshold}.') + break + + if (timestamp_end - segment_stop_time) > threshold: + error = (f'Timeline event end time occurs too late after segment stop time. ' + f'({timestamp_end} - {segment_stop_time}) > {threshold}.') + break + + except Exception as e: + error = (f'Timestamps could not be converted: {e}') + break + + if threshold != -1: + if not error: + min_event_start = min(list(map(lambda d: float(self.convert_mm_ss_to_seconds(d.get('timestamp_start'), segment_start_time)), + filter(lambda d: 'timestamp_start' in d, event_timeline)))) + + if abs(segment_start_time - min_event_start) > threshold: + error = (f'Min timeline event start time not close enough to segment start time. 
' + f'abs({segment_start_time} - {min_event_start}) > {threshold}.') + + if not error: + max_event_end = max(list(map(lambda d: float(self.convert_mm_ss_to_seconds(d.get('timestamp_end'), segment_start_time)), + filter(lambda d: 'timestamp_end' in d, event_timeline)))) + + if abs(max_event_end - segment_stop_time) > threshold: + error = (f'Max timeline event end time not close enough to segment stop time. ' + f'abs({max_event_end} - {segment_stop_time}) > {threshold}.') + if error: + logger.warning(error) + logger.warning(f'Failed {attempts["timeline"] + 1} of {max_attempts} timeline attempts.') + attempts['timeline'] += 1 + return error + + return None + + def convert_mm_ss_to_seconds(self, timestamp_str, segment_start_time): + try: + minutes_str, seconds_str = timestamp_str.split(':') + minutes = int(minutes_str) + seconds = int(seconds_str) + + total_seconds = (minutes * 60) + seconds + segment_start_time + return total_seconds + except ValueError: + raise ValueError("Invalid timestamp format.") + except Exception as e: + raise Exception(f"An unexpected error occurred: {e}") + + def _get_gemini_response(self, model_name, data_uri, prompt, start, stop, fps): + process = None + try: + process = subprocess.Popen([ + "/gemini-subprocess/venv/bin/python3", + "/gemini-subprocess/gemini-process-video.py", + "-m", model_name, + "-d", data_uri, + "-p", prompt, + "-c", self.google_application_credentials, + "-i", self.project_id, + "-b", self.bucket_name, + "-l", self.label_prefix, + "-u", self.label_user, + "-r", self.label_purpose, + "-s", str(start), + "-e", str(stop), + "-f", str(fps) + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + stdout, stderr = process.communicate() + except Exception as e: + raise mpf.DetectionException( + f"Subprocess error: {e}", + mpf.DetectionError.DETECTION_FAILED) + + if process.returncode == 0: + response = stdout.decode() + logger.info(response) + return response + + stderr_decoded = stderr.decode() + if 
self._is_rate_limit_error(stderr_decoded): + logger.warning("Gemini rate limit hit (429). Retrying with backoff...") + ex = mpf.DetectionException( + f"Subprocess failed due to rate limiting: {stderr_decoded}", + mpf.DetectionError.DETECTION_FAILED + ) + ex.rate_limit = True + raise ex + raise mpf.DetectionException( + f"Subprocess failed: {stderr_decoded}", + mpf.DetectionError.DETECTION_FAILED + ) + +def _read_file(path: str) -> str: + try: + if not os.path.isabs(path): + base_dir = os.path.dirname(os.path.abspath(__file__)) + path = os.path.join(base_dir, path) + with open(path, 'r') as file: + return file.read() + except Exception as e: + raise mpf.DetectionError.COULD_NOT_READ_DATAFILE.exception( + f"Could not read \"{path}\": {e}" + ) from e + +class JobConfig: + def __init__(self, job_properties: Mapping[str, str], media_properties=None): + + self.generation_prompt_path = self._get_prop(job_properties, "GENERATION_PROMPT_PATH", "") + self.enable_timeline = int(self._get_prop(job_properties, "ENABLE_TIMELINE", "1")) + if self.generation_prompt_path == "" and self.enable_timeline == 1: + self.generation_prompt_path= os.path.join(os.path.dirname(__file__), 'data', 'default_prompt.txt') + else: + self.generation_prompt_path= os.path.join(os.path.dirname(__file__), 'data', 'default_prompt_no_tl.txt') + + if not os.path.exists(self.generation_prompt_path): + raise mpf.DetectionException( + "Invalid path provided for prompt file: ", + mpf.DetectionError.COULD_NOT_OPEN_DATAFILE + ) + + self.google_application_credentials = self._get_prop(job_properties, "GOOGLE_APPLICATION_CREDENTIALS", "") + if not os.path.exists(self.google_application_credentials): + raise mpf.DetectionException( + "Invalid path provided for GCP credential file: ", + mpf.DetectionError.COULD_NOT_OPEN_DATAFILE + ) + + self.model_name = self._get_prop(job_properties, "MODEL_NAME", "gemini-2.5-flash") + self.project_id = self._get_prop(job_properties, "PROJECT_ID", "") + self.bucket_name = 
self._get_prop(job_properties, "BUCKET_NAME", "") + self.label_prefix = self._get_prop(job_properties, "LABEL_PREFIX", "") + self.label_user = self._get_prop(job_properties, "LABEL_USER", "") + self.label_purpose = self._get_prop(job_properties, "LABEL_PURPOSE", "") + self.generation_max_attempts = self._get_prop(job_properties, "GENERATION_MAX_ATTEMPTS", "5") + self.timeline_check_target_threshold = self._get_prop(job_properties, "TIMELINE_CHECK_TARGET_THRESHOLD", "10") + self.process_fps = self._get_prop(job_properties, "PROCESS_FPS", 1.0) + + @staticmethod + def _get_prop(job_properties, key, default_value, accept_values=[]): + prop = mpf_util.get_property(job_properties, key, default_value) + if (accept_values != []) and (prop not in accept_values): + raise mpf.DetectionException( + f"Property {key} not in list of acceptable values: {accept_values}", + mpf.DetectionError.INVALID_PROPERTY + ) + return prop + +EXPORT_MPF_COMPONENT = GeminiVideoSummarizationComponent \ No newline at end of file diff --git a/python/GeminiVideoSummarization/plugin-files/descriptor/descriptor.json b/python/GeminiVideoSummarization/plugin-files/descriptor/descriptor.json new file mode 100644 index 00000000..409614b6 --- /dev/null +++ b/python/GeminiVideoSummarization/plugin-files/descriptor/descriptor.json @@ -0,0 +1,156 @@ +{ + "componentName": "GeminiVideoSummarization", + "componentVersion": "9.0", + "middlewareVersion": "9.0", + "sourceLanguage": "python", + "batchLibrary": "GeminiVideoSummarization", + "environmentVariables": [], + "algorithm": { + "name": "GeminiVideo", + "description": "Gemini prompt response generation.", + "actionType": "DETECTION", + "trackType": "TEXT", + "mediaTypes":"VIDEO", + "outputChangedCounter": 1, + "requiresCollection": { + "states": [] + }, + "providesCollection": { + "states": [ + "DETECTION", + "DETECTION_TEXT", + "DETECTION_TEXT_GEMINI_VIDEO" + ], + "properties": [ + { + "name": "GENERATION_PROMPT_PATH", + "description": "Path to a custom 
file which contains the prompt that Gemini will use on the video.", + "type": "STRING", + "defaultValue": "" + }, + { + "name": "MODEL_NAME", + "description": "The model to use for Gemini inference. Examples: 'gemini-1.5-flash', 'gemini-2.5-pro'.", + "type": "STRING", + "defaultValue": "gemini-2.5-flash" + }, + { + "name": "GOOGLE_APPLICATION_CREDENTIALS", + "description": "Path to a JSON file containing your GCP credentials for Vertex AI.", + "type": "STRING", + "defaultValue": "" + }, + { + "name": "PROJECT_ID", + "description": "The project identifier to access Vertex AI.", + "type": "STRING", + "defaultValue": "" + }, + { + "name": "BUCKET_NAME", + "description": "The name of the bucket to store data for the project.", + "type": "STRING", + "defaultValue": "" + }, + { + "name": "LABEL_PREFIX", + "description": "A prefix for labels.", + "type": "STRING", + "defaultValue": "" + }, + { + "name": "LABEL_USER", + "description": "The user accessing Vertex AI.", + "type": "STRING", + "defaultValue": "" + }, + { + "name": "LABEL_PURPOSE", + "description": "The purpose of using Vertex AI.", + "type": "STRING", + "defaultValue": "" + }, + { + "name": "GENERATION_MAX_ATTEMPTS", + "description": "Two kinds of checks are performed on the model response: 1) response is not empty and has valid JSON in the correct format, and 2) the response timeline falls within the video segment length determined by the model. Each check has a separate counter. This value specifies the maximum number of attempts for each check. Each failed check results in another attempt to regenerate the response.", + "type": "INT", + "defaultValue": "5" + }, + { + "name": "TIMELINE_CHECK_TARGET_THRESHOLD", + "description": "Specifies the number of seconds that video events can occur before or after a segment's bounds. If exceeded, another attempt will be made to generate the output. 
Set to -1 to disable check.", + "type": "INT", + "defaultValue": "10" + }, + { + "name": "TARGET_SEGMENT_LENGTH", + "description": "Default segment length is 120 seconds. Set to -1 to disable segmenting the video.", + "type": "INT", + "defaultValue": "120" + }, + { + "name": "VFR_TARGET_SEGMENT_LENGTH", + "description": "Default segment length is 120 seconds. Set to -1 to disable segmenting the video.", + "type": "INT", + "defaultValue": "120" + }, + { + "name": "SEGMENT_LENGTH_SPECIFICATION", + "description": "The value for determining how to interpret TARGET_SEGMENT_LENGTH, VFR_TARGET_SEGMENT_LENGTH, MIN_SEGMENT_LENGTH, and VFR_MIN_SEGMENT_LENGTH. The value has to be SECONDS for this component.", + "type": "STRING", + "defaultValue": "SECONDS" + }, + { + "name": "MERGE_TRACKS", + "description": "In the context of videos, when set to true, attempt to merge tracks from the entire video.", + "type": "BOOLEAN", + "defaultValue": "false" + }, + { + "name": "QUALITY_SELECTION_PROPERTY", + "description": "The detection property to be used to rank the quality of a track and the quality of the detections in a track. This property would be used, for example, to select the exemplar detection for a track.", + "type": "STRING", + "defaultValue": "EXEMPLAR" + }, + { + "name": "PROCESS_FPS", + "description": "Specifies the number of frames per second of video to process.", + "type": "FLOAT", + "defaultValue": "1.0" + }, + { + "name": "ENABLE_TIMELINE", + "description": "When set to 1, generate a timeline of events for the video. 
When set to 0, does not generate a timeline.", + "type": "INT", + "defaultValue": "1" + } + ] + } + }, + "actions": [ + { + "name": "GEMINI VIDEO SUMMARIZATION ACTION", + "description": "Runs Gemini with a prompt for the provided video.", + "algorithm": "GeminiVideo", + "properties": [] + } + ], + "tasks": [ + { + "name": "GEMINI VIDEO SUMMARIZATION TASK", + "description": "Runs Gemini video summarization.", + "actions": [ + "GEMINI VIDEO SUMMARIZATION ACTION" + ] + } + ], + "pipelines": [ + { + "name": "GEMINI VIDEO SUMMARIZATION PIPELINE", + "description": "Runs Gemini video summarization.", + "tasks": [ + "GEMINI VIDEO SUMMARIZATION TASK" + ] + } + ] +} \ No newline at end of file diff --git a/python/GeminiVideoSummarization/pyproject.toml b/python/GeminiVideoSummarization/pyproject.toml new file mode 100644 index 00000000..23127b40 --- /dev/null +++ b/python/GeminiVideoSummarization/pyproject.toml @@ -0,0 +1,29 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2024 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# +# See the License for the specific language governing permissions and # +# limitations under the License. # +############################################################################# + +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" \ No newline at end of file diff --git a/python/GeminiVideoSummarization/setup.cfg b/python/GeminiVideoSummarization/setup.cfg new file mode 100644 index 00000000..1276692c --- /dev/null +++ b/python/GeminiVideoSummarization/setup.cfg @@ -0,0 +1,46 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2024 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. 
# +############################################################################# + +[metadata] +name = GeminiVideoSummarization +version = 9.0 + +[options] +packages = gemini_video_summarization_component +install_requires = + mpf_component_api>=9.0 + mpf_component_util>=9.0 + tenacity + numpy + +[options.entry_points] +mpf.exported_component = + component = gemini_video_summarization_component.gemini_video_summarization_component:GeminiVideoSummarizationComponent + +[options.package_data] +gemini_video_summarization_component= + data/default_prompt.txt + data/default_prompt_no_tl.txt \ No newline at end of file diff --git a/python/GeminiVideoSummarization/tests/data/NOTICE b/python/GeminiVideoSummarization/tests/data/NOTICE new file mode 100644 index 00000000..1e4e61a8 --- /dev/null +++ b/python/GeminiVideoSummarization/tests/data/NOTICE @@ -0,0 +1,9 @@ +# cat.mp4 +# Video by Orhan Pergel +# Pexels License: https://www.pexels.com/license/ +# https://www.pexels.com/video/a-cat-walking-down-a-street-with-people-walking-by-20573000/ + +# dog.mp4 +# Video by Evgenia Kirpichnikova +# Pexels License: https://www.pexels.com/license/ +# https://www.pexels.com/video/close-up-of-a-brown-and-white-pet-dog-2795691/ \ No newline at end of file diff --git a/python/GeminiVideoSummarization/tests/data/cat.mp4 b/python/GeminiVideoSummarization/tests/data/cat.mp4 new file mode 100644 index 00000000..ff412dc7 Binary files /dev/null and b/python/GeminiVideoSummarization/tests/data/cat.mp4 differ diff --git a/python/GeminiVideoSummarization/tests/data/dog.mp4 b/python/GeminiVideoSummarization/tests/data/dog.mp4 new file mode 100644 index 00000000..b4459382 Binary files /dev/null and b/python/GeminiVideoSummarization/tests/data/dog.mp4 differ diff --git a/python/GeminiVideoSummarization/tests/test_gemini.py b/python/GeminiVideoSummarization/tests/test_gemini.py new file mode 100644 index 00000000..cb148b74 --- /dev/null +++ b/python/GeminiVideoSummarization/tests/test_gemini.py @@ -0,0 
+1,280 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2024 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +############################################################################# + +import sys +import os +import logging +import json + +import unittest +import unittest.mock + +# Add gemini_video_summarization_component to path. 
+sys.path.append(os.path.join(os.path.dirname(__file__), '..')) +from gemini_video_summarization_component.gemini_video_summarization_component import GeminiVideoSummarizationComponent + +import unittest +import mpf_component_api as mpf + +logging.basicConfig(level=logging.DEBUG) +USE_MOCKS = True +TEST_DATA = "data" + +# Replace with your own desired model name +MODEL_NAME = "gemini-2.5-flash" + +# Replace with your own path to the Google Application Credentials JSON file +GOOGLE_APPLICATION_CREDENTIALS="../application_default_credentials.json" + +job_properties=dict( + GOOGLE_APPLICATION_CREDENTIALS=GOOGLE_APPLICATION_CREDENTIALS, + GENERATION_PROMPT_PATH="../gemini_video_summarization_component/data/default_prompt.txt" +) + +CAT_TIMELINE = { + "video_summary": "A cat is sitting on a cobblestone street, looking around as people walk by.", + "video_event_timeline": [ + { + "timestamp_start": "0:00", + "timestamp_end": "0:04", + "description": "The cat is sitting on the cobblestone street, looking around." + }, + { + "timestamp_start": "0:04", + "timestamp_end": "0:06", + "description": "The cat looks back at the camera and then walks away." + } + ] +} + +INVALID_CAT_TIMELINE = { + "video_summary": "A cat is sitting on a cobblestone street, looking around as people walk by.", + "video_event_timeline": [ + { + "timestamp_start": "0:00", + "timestamp_end": "0:04", + "description": "The cat is sitting on the cobblestone street, looking around." + }, + { + "timestamp_start": "0:04", + "timestamp_end": "0:17", + "description": "The cat looks back at the camera and then walks away." 
+ } + ] +} + +CAT_VIDEO_PROPERTIES = { + 'DURATION': '6890', + 'FPS': '25', + 'FRAME_COUNT': '172', + 'FRAME_HEIGHT': '360', + 'FRAME_WIDTH': '640', + 'HAS_CONSTANT_FRAME_RATE': 'true', + 'MIME_TYPE': 'video/mp4', + 'ROTATION': '0.0' +} + +DOG_TIMELINE = { + "video_summary": "A dog sitting by a window and looking around.", + "video_event_timeline": [ + { + "timestamp_start": "0:00", + "timestamp_end": "0:06", + "description": "Dog sitting by the window." + } + ] +} + +DOG_VIDEO_PROPERTIES = { + 'DURATION': '6170', + 'FPS': '25', + 'FRAME_COUNT': '154', + 'FRAME_HEIGHT': '240', + 'FRAME_WIDTH': '426', + 'HAS_CONSTANT_FRAME_RATE': 'true', + 'MIME_TYPE': 'video/mp4', + 'ROTATION': '0.0' +} + +# events span after video segment +DOG_TIMELINE_SEGMENT_2 = { + "video_summary": "A dog sitting by a window and looking around.", + "video_event_timeline": [ + { + "timestamp_start": "0:00", + "timestamp_end": "0:06", + "description": "Dog sitting by the window." + }, + { + "timestamp_start": "0:06", + "timestamp_end": "0:08", + "description": "Dog looks towards the camera and tilts it's head." 
+ } + ] +} + +class TestGemini(unittest.TestCase): + + def run_patched_job(self, component, job, response): + if not USE_MOCKS: + return + + if USE_MOCKS: + with unittest.mock.patch("gemini_video_summarization_component.gemini_video_summarization_component.GeminiVideoSummarizationComponent._get_gemini_response", return_value=response): + return component.get_detections_from_video(job) + + def assert_detection_region(self, detection, frame_width, frame_height): + self.assertEqual(0, detection.x_left_upper) + self.assertEqual(0, detection.y_left_upper) + self.assertEqual(frame_width, detection.width) + self.assertEqual(frame_height, detection.height) + + + def assert_first_middle_last_detections(self, track, frame_width, frame_height): + self.assertIn(track.start_frame, track.frame_locations) + self.assert_detection_region(track.frame_locations[track.start_frame], frame_width, frame_height) + + self.assertIn(track.stop_frame, track.frame_locations) + self.assert_detection_region(track.frame_locations[track.stop_frame], frame_width, frame_height) + + middle_frame = int((track.stop_frame - track.start_frame) / 2) + track.start_frame + self.assertIn(middle_frame, track.frame_locations) + self.assert_detection_region(track.frame_locations[middle_frame], frame_width, frame_height) + + def test_multiple_videos(self): + component = GeminiVideoSummarizationComponent() + + job = mpf.VideoJob('valid cat job', str(TEST_DATA + "/" + "cat.mp4"), 0, 171, job_properties, CAT_VIDEO_PROPERTIES, {}) + frame_width = int(job.media_properties['FRAME_WIDTH']) + frame_height = int(job.media_properties['FRAME_HEIGHT']) + + + results = self.run_patched_job(component, job, json.dumps(CAT_TIMELINE)) + self.assertEqual(3, len(results)) + self.assertEqual('TRUE', results[0].detection_properties['SEGMENT SUMMARY']) + self.assertIn("looking around as people walk by.", results[0].detection_properties["TEXT"]) + self.assertEqual(0, results[0].start_frame) + self.assertEqual(171, 
results[0].stop_frame) + self.assert_first_middle_last_detections(results[0], frame_width, frame_height) + + self.assertIn("looking around.", results[1].detection_properties["TEXT"]) + self.assertEqual(0, results[1].start_frame) # 0 * 25 + self.assertEqual(99, results[1].stop_frame) # (4 * 25) - 1 + self.assert_first_middle_last_detections(results[1], frame_width, frame_height) + + self.assertIn("looks back at the camera", results[2].detection_properties["TEXT"]) + self.assertEqual(100, results[2].start_frame) # 4 * 25 + self.assertEqual(149, results[2].stop_frame) # (6 * 25) - 1 + self.assert_first_middle_last_detections(results[2], frame_width, frame_height) + + + job = mpf.VideoJob('valid dog job', str(TEST_DATA + "/" + "dog.mp4"), 0, 153, job_properties, DOG_VIDEO_PROPERTIES, {}) + frame_width = int(job.media_properties['FRAME_WIDTH']) + frame_height = int(job.media_properties['FRAME_HEIGHT']) + + results = self.run_patched_job(component, job, json.dumps(DOG_TIMELINE)) + self.assertEqual(2, len(results)) + + self.assertEqual('TRUE', results[0].detection_properties['SEGMENT SUMMARY']) + self.assertIn("sitting by a window and looking around", results[0].detection_properties["TEXT"]) + self.assertEqual(0, results[0].start_frame) + self.assertEqual(153, results[0].stop_frame) + self.assert_first_middle_last_detections(results[0], frame_width, frame_height) + + self.assertIn("sitting by the window.", results[1].detection_properties["TEXT"]) + self.assertEqual(0, results[1].start_frame) # 0 * 25 + self.assertEqual(149, results[1].stop_frame) # (6 * 25) - 1 + self.assert_first_middle_last_detections(results[1], frame_width, frame_height) + + def test_invalid_timeline(self): + component = GeminiVideoSummarizationComponent() + + job = mpf.VideoJob('invalid cat job', str(TEST_DATA + "/" + "cat.mp4"), 0, 15000, + { + "GOOGLE_APPLICATION_CREDENTIALS": GOOGLE_APPLICATION_CREDENTIALS, + 
"GENERATION_PROMPT_PATH":"../gemini_video_summarization_component/data/default_prompt.txt", + "GENERATION_MAX_ATTEMPTS" : "1", + }, + CAT_VIDEO_PROPERTIES, {}) + + with self.assertRaises(mpf.DetectionException) as cm: + self.run_patched_job(component, job, json.dumps(INVALID_CAT_TIMELINE)) # don't care about results + + self.assertEqual(mpf.DetectionError.DETECTION_FAILED, cm.exception.error_code) + self.assertIn("Max timeline event end time not close enough to segment stop time.", str(cm.exception)) + + # test disabling time check + job = mpf.VideoJob('invalid cat job', str(TEST_DATA + "/" + "cat.mp4"), 0, 15000, + { + "GOOGLE_APPLICATION_CREDENTIALS": GOOGLE_APPLICATION_CREDENTIALS, + "GENERATION_PROMPT_PATH":"../gemini_video_summarization_component/data/default_prompt.txt", + "GENERATION_MAX_ATTEMPTS" : "1", + "TIMELINE_CHECK_TARGET_THRESHOLD" : "-1" + }, + CAT_VIDEO_PROPERTIES, {}) + + results = self.run_patched_job(component, job, json.dumps(INVALID_CAT_TIMELINE)) + + self.assertIn("cat", results[0].detection_properties["TEXT"]) + + def test_invalid_json_response(self): + component = GeminiVideoSummarizationComponent() + + job = mpf.VideoJob('invalid cat job JSON', str(TEST_DATA + "/" + "cat.mp4"), 0, 100, + { + "GOOGLE_APPLICATION_CREDENTIALS": GOOGLE_APPLICATION_CREDENTIALS, + "GENERATION_PROMPT_PATH":"../gemini_video_summarization_component/data/default_prompt.txt", + "GENERATION_MAX_ATTEMPTS" : "1", + }, + CAT_VIDEO_PROPERTIES, {}) + + with self.assertRaises(mpf.DetectionException) as cm: + self.run_patched_job(component, job, "garbage xyz") # don't care about results + + self.assertEqual(mpf.DetectionError.DETECTION_FAILED, cm.exception.error_code) + self.assertIn("not valid JSON", str(cm.exception)) + + def test_empty_response(self): + component = GeminiVideoSummarizationComponent() + + job = mpf.VideoJob('empty cat job', str(TEST_DATA + "/" + "cat.mp4"), 0, 171, + { + "GOOGLE_APPLICATION_CREDENTIALS": GOOGLE_APPLICATION_CREDENTIALS, + 
"GENERATION_PROMPT_PATH":"../gemini_video_summarization_component/data/default_prompt.txt", + "GENERATION_MAX_ATTEMPTS" : "1", + }, + CAT_VIDEO_PROPERTIES, {}) + + with self.assertRaises(mpf.DetectionException) as cm: + self.run_patched_job(component, job, "") # don't care about results + + self.assertEqual(mpf.DetectionError.DETECTION_FAILED, cm.exception.error_code) + self.assertIn("Empty response", str(cm.exception)) + + + +if __name__ == "__main__": + unittest.main(verbosity=2) \ No newline at end of file