diff --git a/python/GeminiDetection/Dockerfile b/python/GeminiDetection/Dockerfile new file mode 100644 index 00000000..55bd67be --- /dev/null +++ b/python/GeminiDetection/Dockerfile @@ -0,0 +1,67 @@ +# syntax=docker/dockerfile:experimental + +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2024 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. 
# +############################################################################# + +ARG BUILD_REGISTRY +ARG BUILD_TAG=latest +FROM ${BUILD_REGISTRY}openmpf_python_executor_ssb:${BUILD_TAG} + +RUN --mount=type=tmpfs,target=/var/cache/apt \ + --mount=type=tmpfs,target=/var/lib/apt/lists \ + --mount=type=tmpfs,target=/tmp \ + apt-get update; \ + DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y wget \ + # For Google Gemini + # After installing the following /usr/bin will have: + # python3 -> python3.8 + # python3.8 + # python3.9 + python3.9 python3.9-venv libpython3.9 + +# Create separate venv for Python 3.9 subprocess +RUN mkdir -p /gemini-subprocess/venv; \ + python3.9 -m venv /gemini-subprocess/venv; \ + /gemini-subprocess/venv/bin/pip3 install google-genai pillow numpy + +COPY gemini-process-image.py gemini_component/resource_tracker_monkeypatch.py /gemini-subprocess + +RUN pip3 install --upgrade pip + +RUN pip3 install tenacity opencv-python + +ARG RUN_TESTS=false + +RUN --mount=target=.,readwrite \ + install-component.sh; \ + if [ "${RUN_TESTS,,}" == true ]; then python tests/test_gemini.py; fi + +LABEL org.label-schema.license="Apache 2.0" \ + org.label-schema.name="OpenMPF Gemini Detection" \ + org.label-schema.schema-version="1.0" \ + org.label-schema.url="https://openmpf.github.io" \ + org.label-schema.vcs-url="https://github.com/openmpf/openmpf-components" \ + org.label-schema.vendor="MITRE" \ No newline at end of file diff --git a/python/GeminiDetection/README.md b/python/GeminiDetection/README.md new file mode 100644 index 00000000..6c0539ac --- /dev/null +++ b/python/GeminiDetection/README.md @@ -0,0 +1,63 @@ +# Overview + +This repository contains source code for the OpenMPF Gemini Detection Component. + +This component utilizes a config file that contains any number of prompts for any number of object classes. These prompts and the images/video frames are passed to the Google Gemini server to generate responses. 
+ +# Job Properties + +The following are the properties that can be specified for the component. All properties except for GEMINI_API_KEY and CLASSIFICATION have default values, making them optional to set. + +- `GEMINI_API_KEY`: Your API key to send requests to Google Gemini +- `CLASSIFICATION`: The class of the object(s) in the media. Used to determine the prompt(s). Examples: PERSON and VEHICLE. +- `PROMPT_CONFIGURATION_PATH`: The path to a JSON file which contains prompts for specified classifications. +- `JSON_PROMPT_CONFIGURATION_PATH`: The path to a JSON file which contains classes and prompts that instruct Gemini to return a JSON object. +- `ENABLE_JSON_PROMPT_FORMAT`: Enables returning a JSON formatted response from Gemini, with the prompt specified at JSON_PROMPT_CONFIGURATION_PATH job property. By default set to false. +- `GENERATE_FRAME_RATE_CAP`: The threshold on the maximum number of frames to process in the video segment within one second of the native video time. +- `MODEL_NAME`: The model to use for Gemini inference. By default it is set to `"gemma-3-27b-it"`. +- `GENERATION_MAX_ATTEMPTS`: The maximum number of times the component will attempt to generate valid JSON output. + +# Config File + +The config file is a JSON formatted file that is used by the component to know which prompts to ask Gemini depending on the class of the object. The user can write their own config file, which can be used by setting the `PROMPT_CONFIGURATION_PATH` property. The following is an example of the proper syntax to follow: + +```json +[ + { + "classes": [ + "DOG", + "CAT", + "HORSE" + ], + "prompts": [ + { + "detectionProperty": "DESCRIPTION", + "prompt": "Describe the animal's color and appearance." + } + ] + }, + { + "classes": [ + "DOG" + ], + "prompts": [ + { + "detectionProperty": "DOG BREED", + "prompt": "Describe the potential breeds that this dog could contain." 
+ } + ] + } +] +``` + +Note that a class can appear in multiple entries in the JSON, such as `"DOG"` in the example. If you have multiple classes that share a prompt, you can list them together like above and then add more questions for each individual class if you wish to get more specific. + +Also be sure to make each `"detectionProperty"` distinct for a given class so that none of your prompts are overwritten. + +# Outputs + +Once the responses are generated, they are added onto the `detection_properties` dictionary of the associated `ImageLocation` object. for each prompt, the key is specified by the `"detectionProperty"` field of the config JSON and the value will be the Gemini-generated response. + +# TODO + +- Add functionality for generic class property detection diff --git a/python/GeminiDetection/gemini-process-image.py b/python/GeminiDetection/gemini-process-image.py new file mode 100755 index 00000000..4cb86497 --- /dev/null +++ b/python/GeminiDetection/gemini-process-image.py @@ -0,0 +1,83 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2024 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. 
# +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +############################################################################# + +import argparse +import json +import sys +import numpy as np + +from google import genai +from multiprocessing.shared_memory import SharedMemory +from google.genai.errors import ClientError +from PIL import Image + +from resource_tracker_monkeypatch import remove_shm_from_resource_tracker + +def main(): + parser = argparse.ArgumentParser(description='Sends image and prompt to Gemini Client for processing.') + + parser.add_argument("--model", "-m", type=str, default="gemma-3-27b-it", help="The name of the Gemini model to use.") + parser.add_argument("--shm-name", type=str, required=True, help="Shared memory name for image data.") + parser.add_argument("--shm-shape", type=str, required=True, help="Shape of the image in shared memory (JSON list).") + parser.add_argument("--shm-dtype", type=str, required=True, help="Numpy dtype of the image in shared memory.") + parser.add_argument("--prompt", "-p", type=str, required=True, help="The prompt you want to use with the image.") + parser.add_argument("--api_key", "-a", type=str, required=True, help="Your API key for Gemini.") + args = parser.parse_args() + + remove_shm_from_resource_tracker() + + shm = None + + try: + shape = tuple(json.loads(args.shm_shape)) + dtype = np.dtype(args.shm_dtype) + shm = SharedMemory(name=args.shm_name) + + np_img = np.ndarray(shape, dtype=dtype, buffer=shm.buf) + image = Image.fromarray(np_img) + + client = genai.Client(api_key=args.api_key) + content = 
client.models.generate_content(model=args.model, contents=[args.prompt, image]) + print(content.text) + sys.exit(0) + + except ClientError as e: + if hasattr(e, 'code') and e.code == 429: + print("Caught a ResourceExhausted error (429 Too Many Requests)", file=sys.stderr) + else: + print(e, file=sys.stderr) + sys.exit(1) + + except Exception as e: + print(e, file=sys.stderr) + sys.exit(1) + + finally: + if shm: + shm.close() + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/python/GeminiDetection/gemini_component/__init__.py b/python/GeminiDetection/gemini_component/__init__.py new file mode 100644 index 00000000..dc3be351 --- /dev/null +++ b/python/GeminiDetection/gemini_component/__init__.py @@ -0,0 +1,27 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2025 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2025 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. 
# +############################################################################# + +from .gemini_component import GeminiComponent \ No newline at end of file diff --git a/python/GeminiDetection/gemini_component/data/json_prompts.json b/python/GeminiDetection/gemini_component/data/json_prompts.json new file mode 100644 index 00000000..987bee2c --- /dev/null +++ b/python/GeminiDetection/gemini_component/data/json_prompts.json @@ -0,0 +1,24 @@ +{ + "classPrompts": [ + { + "classes": [ + "PERSON" + ], + "prompts": [ + "If there is no person visible in the image, produce JSON matching this specification: \"person\": { \"visible_person\": false } Return person. If a person is visible, extract their features and include answers only if 100% confident, if not provide \"unsure\". If an attribute is not visible, set the value to \"not visible\". For clarification, facial features are permanent or consistent traits of the face that do not change with expressions or emotions. Examples include face shape, nose structure, lip shape, eye shape/spacing, jawline, cheekbones, and consistent marks like scars or moles. They do not include temporary expressions (e.g., smiling), emotions (e.g., sadness), or conditions like makeup or lighting. Produce JSON matching this specification: \"person\": { \"visible_person\": true, \"type\": (\"civilian\", \"guard\", \"public figure\"), \"clothing\": array<{\"type\": (ie. 
\"shirt\", \"pants\", \"dress\", \"t-shirt\", \"shorts\", \"skirt\", etc.), \"color\": string, \"describe\": string}>, \"age_range\": (\"minor/child\", \"adult\", \"elderly\"), \"gender\": string, \"skin_color\": (\"very fair\", \"fair\", \"medium\", \"olive\", \"brown\", \"black\"), \"race\": (\"american indian/alaska native\", \"asian\", \"black/african american\", \"hispanic/latino\", \"native hawaiian/pacific islander\", \"white\"), \"accessories\": array< \"type\": string, \"color\": string, \"describe\": string}>, \"glasses\": {\"type\": string, \"color\": string, \"describe\": string}, \"object_in_hand\": array< \"type\": clothing_enum>, \"color\": string, \"describe\": string}>, \"shoes\": {\"type\": string, \"color\": string, \"describe\": string}, \"head_features\": {\"hair_color\": string, \"bald\": boolean, \"head_cover\": {\"type\": string, \"color\": string, \"describe\": string}}, \"tattoo_features\": {\"location\": string, \"color\": string, \"describe\": string}, \"face_features\": {\"eye_color\": (ie. \"brown\", \"blue\", \"green\", \"hazel\", \"gray\", \"amber\", \"violet\", etc.), \"facial_hair_color\": string, \"facial_features\": string}, \"action_performed\": string, \"background\": {\"type\": string, \"color\": string, \"describe\": string}, \"other_notable_characteristics\": string } Return: person" + ] + }, + { + "classes": [ + "VEHICLE", + "CAR", + "TRUCK", + "BUS", + "MOTORBIKE" + ], + "prompts": [ + "If there is no vehicle visible in the image, produce JSON matching this specification: \"vehicle\": { \"visible_vehicle\": false } Return vehicle. If a vehicle is visible, extract its features and include answers only if 100% confident, if not provide \"unsure\". If an attribute is not visible, set the value to \"not visible\". 
Produce JSON matching this specification: \"vehicle\": { \"visible_vehicle\": true, \"make\": string, \"type\": string, \"color\": string, \"license_plate_state\": string, \"license_plate_number\": string, \"other_notable_characteristics\": string} Return: vehicle" + ] + } + ] +} \ No newline at end of file diff --git a/python/GeminiDetection/gemini_component/data/prompts.json b/python/GeminiDetection/gemini_component/data/prompts.json new file mode 100644 index 00000000..83c7b144 --- /dev/null +++ b/python/GeminiDetection/gemini_component/data/prompts.json @@ -0,0 +1,39 @@ +{ + "classPrompts": [ + { + "classes": [ + "PERSON" + ], + "prompts": [ + { + "detectionProperty": "CLOTHING", + "prompt": "Describe what this person is wearing" + }, + { + "detectionProperty": "ACTIVITY", + "prompt": "Describe what this person is doing" + } + ] + }, + { + "classes": [ + "VEHICLE", + "CAR", + "TRUCK", + "BUS" + ], + "prompts": [ + { + "detectionProperty": "DESCRIPTION", + "prompt": "Describe this vehicle" + } + ] + } + ], + "framePrompts": [ + { + "detectionProperty": "LOCATION", + "prompt": "Describe the location in this scene" + } + ] +} \ No newline at end of file diff --git a/python/GeminiDetection/gemini_component/gemini_component.py b/python/GeminiDetection/gemini_component/gemini_component.py new file mode 100644 index 00000000..21a928d9 --- /dev/null +++ b/python/GeminiDetection/gemini_component/gemini_component.py @@ -0,0 +1,677 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. 
# +############################################################################# + +############################################################################# +# Copyright 2024 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +############################################################################# + +import time +import os +import json +import math +import subprocess +import logging +import json +import re +import cv2 + +from typing import Mapping, Iterable +from multiprocessing.shared_memory import SharedMemory + +import numpy as np +from tenacity import retry, wait_random_exponential, stop_after_delay, retry_if_exception, before_sleep_log + +import mpf_component_api as mpf +import mpf_component_util as mpf_util + +from .resource_tracker_monkeypatch import remove_shm_from_resource_tracker + +logger = logging.getLogger('GeminiComponent') + +IGNORE_WORDS = ['unsure', 'none', 'false', 'no', 'unclear', 'n/a', 'unspecified', 'unknown', 'unreadable', 'not visible', 'none visible'] +IGNORE_PREFIXES = tuple([s + ' ' for s in IGNORE_WORDS]) + +class GeminiComponent: + detection_type = 'CLASS' + + def __init__(self): + self.gemini_api_key = '' + self.class_prompts = dict() + self.json_class_prompts = dict() + self.frame_prompts = dict() + + remove_shm_from_resource_tracker() + + def get_detections_from_image(self, image_job: mpf.ImageJob) -> Iterable[mpf.ImageLocation]: + logger.info('Received image job: %s', 
image_job.job_name) + + self.video_process_timer = Timer() + self.video_decode_timer = Timer() + self.frame_count = 0 + + config = JobConfig(image_job.job_properties) + image_reader = mpf_util.ImageReader(image_job) + + if image_job.feed_forward_location is None: + if config.enable_json_prompt_format: + detections = self._get_frame_detections_json(image_job, [image_reader.get_image()], config) + else: + detections = self._get_frame_detections(image_job, [image_reader.get_image()], config) + else: + if config.enable_json_prompt_format: + detections = self._get_feed_forward_detections_json(image_job.feed_forward_location, image_reader, config) + else: + detections = self._get_feed_forward_detections(image_job.feed_forward_location, image_reader, config) + + logger.info(f"Job complete. Found {len(detections)} detections.") + return detections + + def get_detections_from_video(self, video_job: mpf.VideoJob) -> Iterable[mpf.VideoTrack]: + logger.info('Received video job: %s', video_job.job_name) + + self.video_process_timer = Timer() + self.video_decode_timer = Timer() + self.frame_count = 0 + + config = JobConfig(video_job.job_properties, video_job.media_properties) + video_capture = mpf_util.VideoCapture(video_job) + + if video_job.feed_forward_track is None: + if config.enable_json_prompt_format: + tracks = self._get_frame_detections_json(video_job, video_capture, config, is_video_job=True) + else: + tracks = self._get_frame_detections(video_job, video_capture, config, is_video_job=True) + else: + if config.enable_json_prompt_format: + tracks = self._get_feed_forward_detections_json(video_job.feed_forward_track, video_capture, config, is_video_job=True) + else: + tracks = self._get_feed_forward_detections(video_job.feed_forward_track, video_capture, config, is_video_job=True) + + decode_time = self.video_decode_timer.get_seconds_elapsed_from_last_pause() + if decode_time > 0.0: + logger.info("Total frame load time: " + f"{decode_time:0.3f} seconds ({self.frame_count 
/ decode_time:0.3f} frames/second)") + + process_time = self.video_process_timer.get_seconds_elapsed_from_last_pause() + if process_time > 0.0: + logger.info("Total detection and tracking time: " + f"{process_time:0.3f} seconds ({self.frame_count / process_time:0.3f} frames/second)") + + logger.info(f"Job complete. Found {len(tracks)} tracks.") + return tracks + + def _get_frame_detections(self, job, reader, config, is_video_job=False): + # Check if both frame_rate_cap and generate_frame_rate_cap are set > 0. If so, throw exception + if (mpf_util.get_property(job.job_properties, 'FRAME_RATE_CAP', -1) > 0) and (config.frames_per_second_to_skip > 0): + raise mpf.DetectionException( + "Cannot have FRAME_RATE_CAP and GENERATE_FRAME_RATE_CAP both set to values greater than zero on jobs without feed forward detections:", + mpf.DetectionError.INVALID_PROPERTY + ) + + self._update_prompts(config.prompt_config_path, config.json_prompt_config_path) + self.gemini_api_key = config.gemini_api_key + + tracks = [] + self.frame_count = 0 + self.video_decode_timer = Timer() + self.video_process_timer = Timer() + + self.video_decode_timer.start() + + for idx, frame in enumerate(reader): + if (config.frames_per_second_to_skip <= 0) or (idx % config.frames_per_second_to_skip == 0): + + self.video_decode_timer.pause() + self.frame_count += 1 + height, width, _ = frame.shape + detection_properties = dict() + self.video_process_timer.start() + + for tag, prompt in self.frame_prompts.items(): + response = self._get_gemini_response(config.model_name, frame, prompt) + detection_properties[tag] = response + + detection_properties['ANNOTATED BY GEMINI'] = True + self.video_process_timer.pause() + img_location = mpf.ImageLocation(0, 0, width, height, -1, detection_properties) + + if is_video_job: + tracks.append(mpf.VideoTrack(idx, idx, -1, {idx: img_location}, detection_properties)) + else: + tracks.append(img_location) + + self.video_decode_timer.start() + + if is_video_job: + for track in 
tracks: + reader.reverse_transform(track) + + return tracks + + def _get_frame_detections_json(self, job, reader, config, is_video_job=False): + # Check if both frame_rate_cap and generate_frame_rate_cap are set > 0. If so, throw exception + if (mpf_util.get_property(job.job_properties, 'FRAME_RATE_CAP', -1) > 0) and (config.frames_per_second_to_skip > 0): + raise mpf.DetectionException( + "Cannot have FRAME_RATE_CAP and GENERATE_FRAME_RATE_CAP both set to values greater than zero on jobs without feed forward detections:", + mpf.DetectionError.INVALID_PROPERTY + ) + + self._update_prompts(config.prompt_config_path, config.json_prompt_config_path) + self.gemini_api_key = config.gemini_api_key + + classification = config.classification.strip().lower() + + tracks = [] + self.frame_count = 0 + self.video_decode_timer = Timer() + self.video_process_timer = Timer() + self.video_decode_timer.start() + + for idx, frame in enumerate(reader): + if (config.frames_per_second_to_skip <= 0) or (idx % config.frames_per_second_to_skip == 0): + self.video_decode_timer.pause() + self.frame_count += 1 + height, width, _ = frame.shape + detection_properties = dict() + self.video_process_timer.start() + json_limit = config.generation_max_attempts + + if classification in self.json_class_prompts: + for tag, prompt in self.json_class_prompts[classification].items(): + json_attempts, json_failed = 0, True + while (json_attempts < json_limit) and (json_failed): + json_attempts += 1 + response = self._get_gemini_response(config.model_name, frame, prompt) + try: + response = response.split('```json\n')[1].split('```')[0] + response_json = json.loads(response) + self._update_detection_properties(detection_properties, response_json, classification) + json_failed = False + except Exception as e: + logger.warning(f"Gemini failed to produce valid JSON output: {e}") + logger.warning(f"Failed {json_attempts} of {json_limit} attempts.") + continue + if json_failed: + logger.warning(f"Using last full 
Gemini response instead of parsed JSON output.") + detection_properties['FAILED TO PROCESS GEMINI RESPONSE'] = True + detection_properties['FULL GEMINI RESPONSE'] = response + + self.video_process_timer.pause() + img_location = mpf.ImageLocation(0, 0, width, height, -1, detection_properties) + + if is_video_job: + tracks.append(mpf.VideoTrack(idx, idx, -1, { idx:img_location }, detection_properties)) + else: + tracks.append(img_location) + + self.video_decode_timer.start() + + if is_video_job: + for track in tracks: + reader.reverse_transform(track) + + return tracks + + def _get_feed_forward_detections(self, job_feed_forward, reader, config, is_video_job=False): + self._update_prompts(config.prompt_config_path, config.json_prompt_config_path) + self.gemini_api_key = config.gemini_api_key + + classification = job_feed_forward.detection_properties["CLASSIFICATION"].lower() + + frame_count = 0 + self.video_decode_timer = Timer() + self.video_process_timer = Timer() + + if is_video_job: + self.video_decode_timer.start() + frame_indices = {i: frame for i, frame in zip(job_feed_forward.frame_locations.keys(), reader)} + frames_to_process = self._get_frames_to_process(list(frame_indices.keys()), config.frames_per_second_to_skip) + for idx in frames_to_process: + self.video_decode_timer.pause() + frame = frame_indices[idx] + ff_location = job_feed_forward.frame_locations[idx] + frame_count += 1 + + if classification in self.class_prompts: + detection_properties = ff_location.detection_properties + + for tag, prompt in self.class_prompts[classification].items(): + response = self._get_gemini_response(config.model_name, frame, prompt) + detection_properties[tag] = response + detection_properties['CLASSIFICATION'] = classification.upper() + detection_properties['ANNOTATED BY GEMINI'] = True + + self.video_decode_timer.start() + return [job_feed_forward] + else: + if classification in self.class_prompts: + detection_properties = job_feed_forward.detection_properties + if 
hasattr(job_feed_forward, 'data_uri'): + image = job_feed_forward.data_uri + else: + image = reader.get_image() + + for tag, prompt in self.class_prompts[classification].items(): + response = self._get_gemini_response(config.model_name, image, prompt) + detection_properties[tag] = response + detection_properties['CLASSIFICATION'] = classification.upper() + detection_properties['ANNOTATED BY GEMINI'] = True + return [job_feed_forward] + + def _get_feed_forward_detections_json(self, job_feed_forward, reader, config, is_video_job=False): + self._update_prompts(config.prompt_config_path, config.json_prompt_config_path) + self.gemini_api_key = config.gemini_api_key + json_limit = config.generation_max_attempts + + classification = job_feed_forward.detection_properties["CLASSIFICATION"].lower() + self.frame_count = 0 + self.video_decode_timer = Timer() + self.video_process_timer = Timer() + prompts_to_use = self.json_class_prompts if config.enable_json_prompt_format else self.class_prompts + + if is_video_job: + self.video_decode_timer.start() + frame_indices = {i: frame for i, frame in zip(job_feed_forward.frame_locations.keys(), reader)} + for idx in self._get_frames_to_process(list(frame_indices.keys()), config.frames_per_second_to_skip): + self.video_decode_timer.pause() + frame = frame_indices[idx] + ff_location = job_feed_forward.frame_locations[idx] + self.frame_count += 1 + + if classification in prompts_to_use: + for tag, prompt in prompts_to_use[classification].items(): + json_attempts, json_failed = 0, True + + while (json_attempts < json_limit) and (json_failed): + json_attempts += 1 + response = self._get_gemini_response(config.model_name, frame, prompt) + try: + response = response.split('```json\n')[1].split('```')[0] + response_json = json.loads(response) + self._update_detection_properties(ff_location.detection_properties, response_json, classification) + json_failed = False + except Exception as e: + logger.warning(f"Gemini failed to produce valid JSON 
output: {e}") + logger.warning(f"Failed {json_attempts} of {json_limit} attempts.") + continue + if json_failed: + logger.warning(f"Using last full Gemini response instead of parsed JSON output.") + ff_location.detection_properties['FAILED TO PROCESS GEMINI RESPONSE'] = True + ff_location.detection_properties['FULL GEMINI RESPONSE'] = response + + self.video_decode_timer.start() + + return [job_feed_forward] + else: + image = reader.get_image() + if classification in prompts_to_use: + for tag, prompt in prompts_to_use[classification].items(): + json_attempts, json_failed = 0, True + while (json_attempts < json_limit) and (json_failed): + json_attempts += 1 + response = self._get_gemini_response(config.model_name, image, prompt) + try: + response = response.split('```json\n')[1].split('```')[0] + response_json = json.loads(response) + self._update_detection_properties(job_feed_forward.detection_properties, response_json, classification) + json_failed = False + except Exception as e: + logger.warning(f"Gemini failed to produce valid JSON output: {e}") + logger.warning(f"Failed {json_attempts} of {json_limit} attempts.") + continue + if json_failed: + logger.warning(f"Using last full Gemini response instead of parsed JSON output.") + job_feed_forward.detection_properties['FAILED TO PROCESS GEMINI RESPONSE'] = True + job_feed_forward.detection_properties['FULL GEMINI RESPONSE'] = response + return [job_feed_forward] + + def _resize_frame(self, frame, max_dim=4500): + h, w = frame.shape[:2] + scale = min(max_dim / w, max_dim / h) + if scale < 1.0: + new_w = int(w * scale) + new_h = int(h * scale) + resized_frame = cv2.resize(frame, (new_w, new_h), interpolation=cv2.INTER_AREA) + return resized_frame + else: + return frame + + def _update_detection_properties(self, detection_properties, response_json, classification): + + is_person = (('CLASSIFICATION' in detection_properties) and (detection_properties['CLASSIFICATION'].lower() == 'person')) \ + or (classification == 
'person') + + vehicle_classes = ['vehicle', 'car', 'truck', 'bus', 'motorbike'] + is_vehicle = (('CLASSIFICATION' in detection_properties) and (detection_properties['CLASSIFICATION'].lower() in vehicle_classes)) \ + or (classification in vehicle_classes) + + key_list = self._get_keys(response_json, True) # TODO: flatten should be an algorithm property or specified in the prompts file + key_vals = dict() + keywords = [] + for key_str in key_list: + split_key = [' '.join(x.split('_')) for x in ('gemini' + key_str).split('||')] + key, val = " ".join([s.upper() for s in split_key[:-1]]), split_key[-1] + key_vals[key] = val + + # TODO: Implement this generically to work with any class. Specify rollup class in prompt JSON file. + ignore_person = is_person and ('GEMINI VISIBLE PERSON' in key_vals) and (self._ignore(key_vals['GEMINI VISIBLE PERSON'])) + ignore_vehicle = is_vehicle and ('GEMINI VISIBLE VEHICLE' in key_vals) and (self._ignore(key_vals['GEMINI VISIBLE VEHICLE'])) + + if not ignore_person and not ignore_vehicle: + tmp_key_vals = dict(key_vals) + for key, val in key_vals.items(): + if 'VISIBLE' in key: + tmp_key_vals.pop(key) + if self._ignore(val): + keywords.append(key.split(' VISIBLE ')[1]) + key_vals = tmp_key_vals + + tmp_key_vals = dict(key_vals) + for keyword in keywords: + pattern = re.compile(fr'\b{keyword}\b') + for key_to_remove in filter(pattern.search, key_vals): + tmp_key_vals.pop(key_to_remove, None) + key_vals = tmp_key_vals + + tmp_key_vals = dict(key_vals) + for key, val in key_vals.items(): + if self._ignore(val): + tmp_key_vals.pop(key) + key_vals = tmp_key_vals + + detection_properties.update(key_vals) + + detection_properties['CLASSIFICATION'] = classification.upper() + detection_properties['ANNOTATED BY GEMINI'] = True + logger.debug(f"{detection_properties=}") + + def _get_keys(self, response_json, flatten): + if not response_json: + yield f'||none' + + elif isinstance(response_json, (str, bool)): + yield f'||{response_json}' + + elif 
def _get_keys(self, response_json, flatten):
    '''
    Recursively yield '||'-delimited key paths from a parsed Gemini JSON
    response. Each yielded string ends with a value segment; empty or
    fully-ignored content yields '||none'. When ``flatten`` is True,
    lowest-level dicts are expanded into one path per key/value pair;
    otherwise they are emitted as a single JSON-dumped value.
    '''
    if not response_json:
        yield '||none'

    elif isinstance(response_json, (str, bool)):
        yield f'||{response_json}'

    elif isinstance(response_json, list):
        yield f'||{json.dumps(response_json)}'

    elif isinstance(response_json, dict):
        if self._is_lowest_level(response_json):
            # Drop pairs whose values are ignorable (e.g. "unsure", "n/a").
            filtered = dict(response_json)
            for key, val in response_json.items():
                if self._ignore(val):
                    filtered.pop(key)
            response_json = filtered

            if not response_json:
                yield '||none'
            elif flatten:
                yield from (f'||{key}||{val}' for key, val in response_json.items())
            else:
                yield f'||{json.dumps(response_json)}'

        else:
            for key, value in response_json.items():
                if self._ignore(key):
                    yield '||none'
                else:
                    yield from (f'||{key}{p}' for p in self._get_keys(value, flatten))

@staticmethod
def _is_lowest_level(response_json):
    # A dict is "lowest level" when every value is a plain string.
    return all(isinstance(val, str) for val in response_json.values())

@staticmethod
def _ignore(input):
    # True when the value is empty, a known ignore word, or starts with a
    # known ignore prefix (IGNORE_WORDS / IGNORE_PREFIXES are module-level).
    return not input or \
        input.strip().lower() in IGNORE_WORDS or \
        input.strip().lower().startswith(IGNORE_PREFIXES)

def _update_prompts(self, prompt_config_path, json_prompt_config_path):
    '''
    Updates self.class_prompts dictionary to have the following format

    {
        CLASS1: {TAG1: PROMPT1},
        CLASS2: {TAG2: PROMPT2, TAG3: PROMPT3},
        ...
    }

    and self.frame_prompts to be a dict of key, prompt string pairs.
    Also populates self.json_class_prompts as {CLASS: {JSON_0: PROMPT, ...}}.

    Raises mpf.DetectionException (COULD_NOT_READ_DATAFILE) on any parse or
    schema error.
    '''
    try:
        with open(prompt_config_path, 'r') as f:
            data = json.load(f)
        class_dicts, frame_dicts = data['classPrompts'], data['framePrompts']
        for class_dict in class_dicts:
            classes = [cls.lower() for cls in class_dict['classes']]
            prompts = class_dict['prompts']
            for cls in classes:
                self.class_prompts.setdefault(cls, dict()).update(
                    {dct['detectionProperty']: dct['prompt'] for dct in prompts})

        for frame_dict in frame_dicts:
            self.frame_prompts[frame_dict['detectionProperty']] = frame_dict['prompt']

        with open(json_prompt_config_path, 'r') as f:
            data = json.load(f)
        for class_dict in data['classPrompts']:
            classes = [cls.lower() for cls in class_dict['classes']]
            prompts = class_dict['prompts']
            for cls in classes:
                # BUG FIX: previously the per-class dict was reassigned on every
                # enumerate() iteration, so only the LAST prompt survived.
                # Accumulate instead, mirroring how class_prompts is built.
                json_prompts = self.json_class_prompts.setdefault(cls, dict())
                for idx, prompt in enumerate(prompts):
                    json_prompts[f'JSON_{idx}'] = prompt

    except Exception as e:
        raise mpf.DetectionException(
            f"Invalid JSON structure for component: {e}",
            mpf.DetectionError.COULD_NOT_READ_DATAFILE
        )

def _is_rate_limit_error(self, stderr):
    # Matches the sentinel line printed by gemini-process-image.py on HTTP 429.
    return "Caught a ResourceExhausted error (429 Too Many Requests)" in stderr
@retry(
    # Each wait is between 4 and multiplier * 2^n seconds, where n is the number of retries. The max wait capped at 32 seconds.
    wait=wait_random_exponential(multiplier=2, max=32, min=4),
    # Stops retrying after the total time waiting >=60s, checks after each attempt
    stop=stop_after_delay(60),
    # Only retry exceptions explicitly flagged with `rate_limit` in the body below.
    retry=retry_if_exception(lambda e: isinstance(e, mpf.DetectionException) and getattr(e, 'rate_limit', False)),
    before_sleep=before_sleep_log(logger, logging.WARNING)
)
def _get_gemini_response(self, model_name, frame, prompt):
    """Send one frame plus a prompt to Gemini via the Python 3.9 subprocess.

    The frame is converted to RGB, passed through self._resize_frame, and
    handed to the subprocess through a POSIX shared-memory segment. Returns
    the subprocess's decoded stdout on success. Raises mpf.DetectionException
    on any failure; rate-limit failures carry `rate_limit=True` so the @retry
    decorator above retries them.
    """
    # OpenCV decodes frames as BGR; convert before handing off.
    mod_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    mod_frame = self._resize_frame(mod_frame)

    # Shape/dtype go on the command line so the subprocess can reconstruct
    # the ndarray from the raw shared-memory buffer.
    shape = mod_frame.shape
    dtype = mod_frame.dtype

    shm = None
    stderr_decoded = None

    try:
        shm = SharedMemory(create=True, size=mod_frame.nbytes)
        np_shm = np.ndarray(shape, dtype=dtype, buffer=shm.buf)
        np_shm[:] = mod_frame[:]  # copy pixels into the shared buffer

        logger.debug(f"Shared memory created: {shm.name}")

        try:
            process = subprocess.Popen([
                "/gemini-subprocess/venv/bin/python3",
                "/gemini-subprocess/gemini-process-image.py",
                '-m', model_name,
                "--shm-name", shm.name,
                "--shm-shape", json.dumps(shape),
                "--shm-dtype", str(dtype),
                "-p", prompt,
                "-a", self.gemini_api_key],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE)

            stdout, stderr = process.communicate()

            if process.returncode == 0:
                response = stdout.decode()
                logger.info(response)
                return response

            # Non-zero exit: keep stderr so the failure can be classified below.
            stderr_decoded = stderr.decode()

        except Exception as e:
            raise mpf.DetectionException(
                f"Subprocess error: {e}",
                mpf.DetectionError.DETECTION_FAILED
            )

        if self._is_rate_limit_error(stderr_decoded):
            # Flag the exception so the @retry predicate above will retry it.
            ex = mpf.DetectionException(
                f"Subprocess failed due to rate limiting: {stderr_decoded}",
                mpf.DetectionError.DETECTION_FAILED
            )
            ex.rate_limit = True
            raise ex

        raise mpf.DetectionException(
            f"Subprocess failed: {stderr_decoded}",
            mpf.DetectionError.DETECTION_FAILED
        )

    finally:
        # Always release the shared-memory segment, success or failure.
        if shm:
            shm.close()
            try:
                shm.unlink()
            except FileNotFoundError:
                logger.info(f"Shared memory '{shm.name}' already unlinked or does not exist.")
            except Exception as e:
                raise mpf.DetectionException(
                    f"Shared memory '{shm.name}' error: {e}",
                    mpf.DetectionError.OTHER_DETECTION_ERROR_TYPE
                )
def _get_frames_to_process(self, frame_locations: list, skip: int) -> list:
    '''
    Select a subset of the (sorted, ascending) frame numbers spaced roughly
    ``skip`` frames apart.

    The first frame is always kept. At each step the next desired frame
    number is ``last_selected + skip``; the closer of the current/upcoming
    candidates is chosen, except that a candidate within ``skip / 3`` of the
    previously selected frame is considered too close and skipped.

    :param frame_locations: ascending frame numbers available for processing
    :param skip: desired spacing between selected frames
    :return: list of selected frame numbers (empty for empty input)
    '''
    if not frame_locations:
        return []

    retval = [frame_locations[0]]
    want = frame_locations[0] + skip

    for i in range(1, len(frame_locations)):

        # Renamed from `next` to avoid shadowing the builtin.
        upcoming = math.inf
        if i + 1 < len(frame_locations):
            upcoming = frame_locations[i + 1]

        if upcoming < want:
            continue

        curr = frame_locations[i]

        curr_delta = abs(want - curr)
        upcoming_delta = abs(upcoming - want)

        # Avoid selecting a frame too close to the previously selected one.
        too_close_to_last = (curr - retval[-1]) <= (skip / 3)

        if curr_delta <= upcoming_delta and not too_close_to_last:
            retval.append(curr)
            want = curr + skip
            continue

        if upcoming != math.inf:
            retval.append(upcoming)
            want = upcoming + skip

    return retval
class Timer:
    """Accumulates elapsed wall-clock seconds across start/pause cycles."""

    def __init__(self):
        self._seconds_elapsed = 0.0
        self._last_start_time = None

    def start(self):
        # Starting an already-running timer is a no-op.
        if self._last_start_time is not None:
            return
        self._last_start_time = time.perf_counter()

    def pause(self):
        # Pausing a stopped timer is a no-op.
        if self._last_start_time is None:
            return
        self._seconds_elapsed += time.perf_counter() - self._last_start_time
        self._last_start_time = None

    def get_seconds_elapsed_from_last_pause(self) -> float:
        # Total accumulated time; excludes any currently-running interval.
        return self._seconds_elapsed
# Taken from: https://bugs.python.org/file49859/mprt_monkeypatch.py
# Refer to: https://stackoverflow.com/questions/62748654/python-3-8-shared-memory-resource-tracker-producing-unexpected-warnings-at-appli

from multiprocessing import resource_tracker

def remove_shm_from_resource_tracker():
    """Monkey-patch multiprocessing.resource_tracker so SharedMemory won't be tracked

    More details at: https://bugs.python.org/issue38119
    """

    def fix_register(name, rtype):
        if rtype == "shared_memory":
            return
        # BUG FIX: the upstream snippet forwarded a stray `self` here, which
        # raised NameError for every non-shared_memory registration once the
        # patch was installed. Forward only (name, rtype).
        return resource_tracker._resource_tracker.register(name, rtype)
    resource_tracker.register = fix_register

    def fix_unregister(name, rtype):
        if rtype == "shared_memory":
            return
        # BUG FIX: stray `self` removed (see fix_register).
        return resource_tracker._resource_tracker.unregister(name, rtype)
    resource_tracker.unregister = fix_unregister

    # Drop the cleanup hook so the tracker never unlinks our segments.
    if "shared_memory" in resource_tracker._CLEANUP_FUNCS:
        del resource_tracker._CLEANUP_FUNCS["shared_memory"]
"description": "Path to a custom JSON file which contains classes and prompts that specify Gemini to return a JSON object.", + "type": "STRING", + "defaultValue": "" + }, + { + "name": "ENABLE_JSON_PROMPT_FORMAT", + "description": "Enables returning a JSON formatted response from Gemini, with the prompt specified at PROMPT_JSON_CONFIGURATION_PATH job property.", + "type": "BOOLEAN", + "defaultValue": "false" + }, + { + "name": "GEMINI_API_KEY", + "description": "Gemini API key.", + "type": "STRING", + "defaultValue": "" + }, + { + "name": "GENERATE_FRAME_RATE_CAP", + "description": "The threshold on the maximum number of frames to process in the video segment within one second of the native video time. If set to a value > 0, then an internal frame interval is calculated as max(1, floor(mediaNativeFPS / GENERATE_FRAME_RATE_CAP)). If set <= 0, property is disabled and every frame is used. Throws exception if FRAME_RATE_CAP and GENERATE_FRAME_RATE_CAP both set > 0.", + "type": "DOUBLE", + "defaultValue": "1.0" + }, + { + "name": "CLASSIFICATION", + "description": "The class of the object(s) in the media. Used to determine the prompt(s). Examples: PERSON, VEHICLE.", + "type": "STRING", + "defaultValue": "" + }, + { + "name": "MODEL_NAME", + "description": "The model to use for Gemini inference. 
Examples: 'gemini-1.5-flash', 'gemini-1.5-pro'.", + "type": "STRING", + "defaultValue": "gemma-3-27b-it" + }, + { + "name": "GENERATION_MAX_ATTEMPTS", + "description": "The maximum number of times the component will attempt to generate valid JSON output.", + "type": "INT", + "defaultValue": "5" + } + ] + } + }, + "actions": [ + { + "name": "GEMINI DETECTION ACTION", + "description": "Runs Gemini with prompts for each video frame passed in.", + "algorithm": "Gemini", + "properties": [ + { + "name": "ARTIFACT_EXTRACTION_POLICY_BEST_DETECTION_PROP_NAMES_LIST", + "value": "ANNOTATED BY GEMINI" + } + ] + }, + { + "name": "GEMINI PERSON DETECTION (WITH JSON PROMPT) ACTION", + "description": "Runs Gemini person prompt that specifies JSON object outputs on images and videos at the specified frame rate.", + "algorithm": "Gemini", + "properties": [ + { + "name": "CLASSIFICATION", + "value": "PERSON" + }, + { + "name": "ENABLE_JSON_PROMPT_FORMAT", + "value": "true" + }, + { + "name": "ARTIFACT_EXTRACTION_POLICY_BEST_DETECTION_PROP_NAMES_LIST", + "value": "ANNOTATED BY GEMINI" + } + ] + }, + { + "name": "GEMINI VEHICLE DETECTION (WITH JSON PROMPT) ACTION", + "description": "Runs Gemini vehicle prompt that specifies JSON object outputs on images and videos at the specified frame rate.", + "algorithm": "Gemini", + "properties": [ + { + "name": "CLASSIFICATION", + "value": "VEHICLE" + }, + { + "name": "ENABLE_JSON_PROMPT_FORMAT", + "value": "true" + }, + { + "name": "ARTIFACT_EXTRACTION_POLICY_BEST_DETECTION_PROP_NAMES_LIST", + "value": "ANNOTATED BY GEMINI" + } + ] + }, + { + "name": "GEMINI DETECTION (WITH FF REGION AND JSON PROMPT) ACTION", + "description": "Runs Gemini with prompts that specify JSON object outputs for the class of the feed forward detection passed in.", + "algorithm": "Gemini", + "properties": [ + { + "name": "FEED_FORWARD_TYPE", + "value": "REGION" + }, + { + "name": "ENABLE_JSON_PROMPT_FORMAT", + "value": "true" + }, + { + "name": 
"ARTIFACT_EXTRACTION_POLICY_BEST_DETECTION_PROP_NAMES_LIST", + "value": "ANNOTATED BY GEMINI" + } + ] + }, + { + "name": "GEMINI DETECTION (EXEMPLAR ONLY WITH FF REGION) ACTION", + "description": "Runs Gemini with prompts for the class of the feed forward detection passed in, only processing for the exemplar of each track.", + "algorithm": "Gemini", + "properties": [ + { + "name": "FEED_FORWARD_TYPE", + "value": "REGION" + }, + { + "name": "FEED_FORWARD_TOP_QUALITY_COUNT", + "value": "1" + }, + { + "name": "GENERATE_FRAME_RATE_CAP", + "value": "-1" + } + ] + }, + { + "name": "GEMINI DETECTION (EXEMPLAR ONLY WITH FF REGION AND JSON PROMPT) ACTION", + "description": "Runs Gemini with prompts that specify JSON object outputs for the class of the feed forward detection passed in, only processing for the exemplar of each track.", + "algorithm": "Gemini", + "properties": [ + { + "name": "FEED_FORWARD_TYPE", + "value": "REGION" + }, + { + "name": "FEED_FORWARD_TOP_QUALITY_COUNT", + "value": "1" + }, + { + "name": "ENABLE_JSON_PROMPT_FORMAT", + "value": "true" + }, + { + "name": "GENERATE_FRAME_RATE_CAP", + "value": "-1" + } + ] + } + ], + "tasks": [ + { + "name": "GEMINI DETECTION TASK", + "description": "Runs Gemini with prompts for each video frame passed in.", + "actions": [ + "GEMINI DETECTION ACTION" + ] + }, + { + "name": "GEMINI PERSON DETECTION (WITH JSON PROMPT) TASK", + "description": "Runs Gemini person prompt that specifies JSON object outputs on images and videos at the specified frame rate.", + "actions": [ + "GEMINI PERSON DETECTION (WITH JSON PROMPT) ACTION" + ] + }, + { + "name": "GEMINI VEHICLE DETECTION (WITH JSON PROMPT) TASK", + "description": "Runs Gemini vehicle prompt that specifies JSON object outputs on images and videos at the specified frame rate.", + "actions": [ + "GEMINI VEHICLE DETECTION (WITH JSON PROMPT) ACTION" + ] + }, + { + "name": "GEMINI DETECTION (WITH FF REGION AND JSON PROMPT) TASK", + "description": "Runs Gemini with prompts that 
specify JSON object outputs for the class of the feed forward detection passed in.", + "actions": [ + "GEMINI DETECTION (WITH FF REGION AND JSON PROMPT) ACTION" + ] + }, + { + "name": "GEMINI DETECTION (EXEMPLAR ONLY WITH FF REGION) TASK", + "description": "Runs Gemini with prompts for the class of the feed forward detection passed in, only processing for the exemplar of each track.", + "actions": [ + "GEMINI DETECTION (EXEMPLAR ONLY WITH FF REGION) ACTION" + ] + }, + { + "name": "GEMINI DETECTION (EXEMPLAR ONLY WITH FF REGION AND JSON PROMPT) TASK", + "description": "Runs Gemini with prompts that specify JSON object outputs for the class of the feed forward detection passed in, only processing for the exemplar of each track.", + "actions": [ + "GEMINI DETECTION (EXEMPLAR ONLY WITH FF REGION AND JSON PROMPT) ACTION" + ] + } + ], + "pipelines": [ + { + "name": "GEMINI DETECTION PIPELINE", + "description":"Runs Gemini with prompts on images and videos at the specified frame rate.", + "tasks": [ + "GEMINI DETECTION TASK" + ] + }, + { + "name": "GEMINI PERSON DETECTION (WITH JSON PROMPT) PIPELINE", + "description":"Runs Gemini person prompt that specifies JSON object outputs on images and videos at the specified frame rate.", + "tasks": [ + "GEMINI PERSON DETECTION (WITH JSON PROMPT) TASK" + ] + }, + { + "name": "GEMINI VEHICLE DETECTION (WITH JSON PROMPT) PIPELINE", + "description":"Runs Gemini vehicle prompt that specifies JSON object outputs on images and videos at the specified frame rate.", + "tasks": [ + "GEMINI VEHICLE DETECTION (WITH JSON PROMPT) TASK" + ] + }, + { + "name": "GEMINI DETECTION (WITH FF REGION FROM TRITON YOLO AND JSON PROMPT) PIPELINE", + "description":"Runs Gemini with prompts that specify JSON object outputs on images and videos at the specified frame rate.", + "tasks": [ + "OCV TRITON YOLO OBJECT DETECTION TASK", + "GEMINI DETECTION (WITH FF REGION AND JSON PROMPT) TASK" + ] + }, + { + "name": "GEMINI DETECTION (WITH FF REGION FROM TRITON 
YOLO AND JSON PROMPT AND MARKUP) PIPELINE", + "description":"Runs Gemini with prompts that specify JSON object outputs on images and videos at the specified frame rate. Performs markup.", + "tasks": [ + "OCV TRITON YOLO OBJECT DETECTION TASK", + "GEMINI DETECTION (WITH FF REGION AND JSON PROMPT) TASK", + "OCV GENERIC MARKUP TASK" + ] + }, + { + "name": "GEMINI DETECTION (EXEMPLAR ONLY WITH FF REGION FROM TRITON YOLO) PIPELINE", + "description": "Runs Gemini with prompts for the class of the detection passed in, only processing the exemplar from each track.", + "tasks": [ + "OCV TRITON YOLO OBJECT DETECTION TASK", + "GEMINI DETECTION (EXEMPLAR ONLY WITH FF REGION) TASK" + ] + }, + { + "name": "GEMINI DETECTION (EXEMPLAR ONLY WITH FF REGION FROM TRITON YOLO AND JSON PROMPT) PIPELINE", + "description": "Runs Gemini with prompts that specify JSON object outputs for the class of the detection passed in, only processing the exemplar from each track.", + "tasks": [ + "OCV TRITON YOLO OBJECT DETECTION TASK", + "GEMINI DETECTION (EXEMPLAR ONLY WITH FF REGION AND JSON PROMPT) TASK" + ] + } + ] +} \ No newline at end of file diff --git a/python/GeminiDetection/pyproject.toml b/python/GeminiDetection/pyproject.toml new file mode 100644 index 00000000..23127b40 --- /dev/null +++ b/python/GeminiDetection/pyproject.toml @@ -0,0 +1,29 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. 
# +############################################################################# + +############################################################################# +# Copyright 2024 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +############################################################################# + +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" \ No newline at end of file diff --git a/python/GeminiDetection/setup.cfg b/python/GeminiDetection/setup.cfg new file mode 100644 index 00000000..86797ce7 --- /dev/null +++ b/python/GeminiDetection/setup.cfg @@ -0,0 +1,44 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2024 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. 
# +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +############################################################################# + +[metadata] +name = GeminiDetection +version = 9.0 + +[options] +packages = gemini_component +install_requires = + mpf_component_api>=9.0 + mpf_component_util>=9.0 + tenacity + numpy + +[options.entry_points] +mpf.exported_component = + component = gemini_component.gemini_component:GeminiComponent + +[options.package_data] +gemini_component=data/prompts.json, data/json_prompts.json diff --git a/python/GeminiDetection/tests/data/NOTICE b/python/GeminiDetection/tests/data/NOTICE new file mode 100644 index 00000000..cd215b7f --- /dev/null +++ b/python/GeminiDetection/tests/data/NOTICE @@ -0,0 +1,13 @@ +# dog.jpg +# Public domain + +# person.jpg +# Photo crop from 2017 COCO Validation Set + +# test_video.mp4 +# Created from public domain images + +# car.jpg +# Photo by Sven D on Unsplash +# Unsplash License: https://unsplash.com/license +# https://unsplash.com/photos/parked-white-ford-explorer-suv-a4S6KUuLeoM \ No newline at end of file diff --git a/python/GeminiDetection/tests/data/car.jpg b/python/GeminiDetection/tests/data/car.jpg new file mode 100644 index 00000000..2ddd23f1 Binary files /dev/null and b/python/GeminiDetection/tests/data/car.jpg differ diff --git a/python/GeminiDetection/tests/data/custom_json_prompts.json b/python/GeminiDetection/tests/data/custom_json_prompts.json new file mode 100644 index 00000000..ae86e97d --- /dev/null +++ b/python/GeminiDetection/tests/data/custom_json_prompts.json @@ -0,0 +1,20 @@ +{ + "classPrompts": [ + { + 
"classes": [ + "DOG" + ], + "prompts": [ + "Describe the dog in JSON. The JSON should have the following keys: breed, color, size." + ] + }, + { + "classes": [ + "PERSON" + ], + "prompts": [ + "Describe the person in JSON. The JSON should have the following keys: hair_color (if unsure, respond with unsure), clothes, activity." + ] + } + ] +} \ No newline at end of file diff --git a/python/GeminiDetection/tests/data/custom_prompts.json b/python/GeminiDetection/tests/data/custom_prompts.json new file mode 100644 index 00000000..2dcdef47 --- /dev/null +++ b/python/GeminiDetection/tests/data/custom_prompts.json @@ -0,0 +1,25 @@ +{ + "classPrompts": [ + { + "classes": [ + "DOG" + ], + "prompts": [ + { + "detectionProperty": "DESCRIPTION", + "prompt": "Describe the color and breed of the dog." + } + ] + } + ], + "framePrompts": [ + { + "detectionProperty": "DESCRIPTION", + "prompt": "Describe this image" + }, + { + "detectionProperty": "LOCATION", + "prompt": "Describe the location in this scene" + } + ] +} \ No newline at end of file diff --git a/python/GeminiDetection/tests/data/dog.jpg b/python/GeminiDetection/tests/data/dog.jpg new file mode 100644 index 00000000..ed24dac9 Binary files /dev/null and b/python/GeminiDetection/tests/data/dog.jpg differ diff --git a/python/GeminiDetection/tests/data/outputs/test-ignore-person-output.txt b/python/GeminiDetection/tests/data/outputs/test-ignore-person-output.txt new file mode 100644 index 00000000..5201089f --- /dev/null +++ b/python/GeminiDetection/tests/data/outputs/test-ignore-person-output.txt @@ -0,0 +1,72 @@ +```json +{ + "visible_person": false, + "person": { + "Type": "unsure", + "clothing": { + "headwear": [ + { + "type": "hoodie", + "color": "black", + "location": "head", + "description": "A black hood that is pulled up." + } + ], + "top_layer": { + "type": "jacket", + "color": "black", + "location": "torso", + "description": "A dark colored jacket covering the torso." 
+ }, + "lower_layer": { + "type": "unsure", + "color": "dark", + "location": "pants", + "description": "Dark-colored pants that are only partially visible." + } + }, + "estimated_age_range": "adult", + "estimated_gender": "male", + "estimated_race": "unsure", + "accessories": [], + "visible_glasses": false, + "glasses": {}, + "visible_object_in_hand": false, + "object_in_hand": {}, + "person_wearing_shoe": true, + "shoe": { + "type": "sneaker", + "color": "black", + "description": "Black sneaker on the foot." + }, + "visible_head_hair": false, + "head_features": { + "head_hair_color": "dark", + "bald": true, + "visible_head_cover": true, + "head_cover_type": "hoodie", + "visible_tattoo": false, + "tattoo_features": {} + }, + "visible_face": false, + "visible_eye": false, + "face_features": { + "visible_eye": false, + "eye_color": "unsure", + "visible_facial_hair": false, + "facial_hair_color": "unsure", + "facial_features": {}, + "emotion_of_person": "unsure" + }, + "action_performed": "walking", + "background": { + "describe": "The person is in a building with an indoor surface visible behind them.", + "color": "unspecified", + "type": "indoors" + }, + "other_notable_characteristics": { + "unsure": true + } + } +} +``` \ No newline at end of file diff --git a/python/GeminiDetection/tests/data/outputs/test-json-response-image-output.txt b/python/GeminiDetection/tests/data/outputs/test-json-response-image-output.txt new file mode 100644 index 00000000..dc3a0977 --- /dev/null +++ b/python/GeminiDetection/tests/data/outputs/test-json-response-image-output.txt @@ -0,0 +1,60 @@ +```json +{ + "visible_person": true, + "person": { + "Type": "Public figure", + "clothing": { + "jacket": "gray", + "shirt": "white", + "tie": "gold" + }, + "estimated_age_range": "adult", + "estimated_gender": "male", + "estimated_race": "Caucasian", + "accessories": [ + { + "type": "glasses", + "color": "black" + } + ], + "visible_glasses": true, + "glasses": { + "type": "spectacles", + 
"color": "black", + "describe": "A pair of black rimmed glasses." + }, + "visible_object_in_hand": false, + "object_in_hand": null, + "person_wearing_shoe": true, + "shoe": { + "type": "business dress shoes", + "color": "black", + "describe": "A pair of black dress shoes." + }, + "visible_head_hair": true, + "head_features": { + "head_hair_color": "graying brown", + "bald": false, + "visible_head_cover": false, + "head_cover_type": null, + "visible_tattoo": false, + "tattoo_features": null + }, + "visible_face": true, + "visible_eye": true, + "face_features": { + "visible_eye": true, + "eye_color": "blue", + "visible_facial_hair": false, + "facial_hair_color": null, + "facial_features": null, + "emotion_of_person": "neutral" + }, + "action_performed": "Shaking hands with another person", + "background": { + "describe": "The background shows an event venue with tables and chairs. The setting suggests a formal or semi-formal gathering." + }, + "other_notable_characteristics": null + } +} +``` \ No newline at end of file diff --git a/python/GeminiDetection/tests/data/outputs/test-json-response-video-output.txt b/python/GeminiDetection/tests/data/outputs/test-json-response-video-output.txt new file mode 100644 index 00000000..a172d9aa --- /dev/null +++ b/python/GeminiDetection/tests/data/outputs/test-json-response-video-output.txt @@ -0,0 +1,7 @@ +```json +{ + "breed": "Collie", + "color": "black and white", + "size": "medium to large" +} +``` \ No newline at end of file diff --git a/python/GeminiDetection/tests/data/outputs/test-unsure-output.txt b/python/GeminiDetection/tests/data/outputs/test-unsure-output.txt new file mode 100644 index 00000000..fd55215b --- /dev/null +++ b/python/GeminiDetection/tests/data/outputs/test-unsure-output.txt @@ -0,0 +1,62 @@ +```json +{ + "person": { + "accessories": [], + "action performed": "Walking", + "background": { + "describe" : "The woman is walking indoors, possibly in a corridor or hall. 
The background is out of focus and does not provide any specific information." + }, + "clothing": { + "upper body clothing": "black top", + "lower body clothing": "black pants" + }, + "estimated age range": "n/a", + "estimated gender": "female", + "estimated race": "unsure", + "face features": { + "emotion of person": "unsure", + "eye color": "unsure", + "facial hair color": "unsure" + }, + "glasses": { + "type": "none", + "color": "none", + "describe": "No glasses visible" + }, + "head features": { + "tattoo features": {} + }, + "object in hand": { + "type": "bag", + "color": "dark", + "describe": "Person is carrying a large dark bag." + }, + "shoe": { + "type": "sneakers", + "color": "white", + "describe": "Woman is wearing white sneakers" + }, + "type": "civilian", + "skin color": "unsure – not enough detail to confidently describe an object.", + "nest level 1": { + "level 1": "valid", + "ignore me": "unsure", + "nest level 2": { + "level 2": "valid", + "ignore me": "unsure", + "nest level 3": { + "level 3": "valid", + "ignore me": "unsure" + } + } + }, + "other_notable_characteristics": { + "unsure": true, + "normal" : "behavior", + "behavior" : "normal", + "no" : "unusual", + "unusual" : "no" + } + } +} +``` \ No newline at end of file diff --git a/python/GeminiDetection/tests/data/outputs/test-video-nth-frame-json-output.txt b/python/GeminiDetection/tests/data/outputs/test-video-nth-frame-json-output.txt new file mode 100644 index 00000000..4b3555f5 --- /dev/null +++ b/python/GeminiDetection/tests/data/outputs/test-video-nth-frame-json-output.txt @@ -0,0 +1,7 @@ +```json +{ + "breed": "Collie", + "color": "Black and white", + "size": "Large" +} +``` \ No newline at end of file diff --git a/python/GeminiDetection/tests/data/outputs/test-visible-output.txt b/python/GeminiDetection/tests/data/outputs/test-visible-output.txt new file mode 100644 index 00000000..6a08fa03 --- /dev/null +++ b/python/GeminiDetection/tests/data/outputs/test-visible-output.txt @@ -0,0 
+1,69 @@ +```json +{ + "visible_person": true, + "person": { + "Type": "unsure", + "clothing": { + "headwear": [ + { + "type": "hoodie", + "color": "black", + "location": "head", + "description": "A black hood that is pulled up." + } + ], + "top_layer": { + "type": "jacket", + "color": "black", + "location": "torso", + "description": "A dark colored jacket covering the torso." + }, + "lower_layer": { + "type": "unsure", + "color": "dark", + "location": "pants", + "description": "Dark-colored pants that are only partially visible." + } + }, + "estimated_age_range": "adult", + "estimated_gender": "male", + "estimated_race": "unsure", + "accessories": [], + "visible_glasses": false, + "glasses": {}, + "visible_object_in_hand": false, + "object_in_hand": {}, + "person_wearing_shoe": true, + "shoe": { + "type": "sneaker", + "color": "black", + "description": "Black sneaker on the foot." + }, + "visible_head_hair": false, + "head_features": { + "head_hair_color": "dark", + "bald": true, + "visible_head_cover": true, + "head_cover_type": "hoodie", + "visible_tattoo": false, + "tattoo_features": {} + }, + "visible_face": false, + "visible_eye": false, + "face_features": { + "visible_eye": false, + "eye_color": "unsure", + "visible_facial_hair": false, + "facial_hair_color": "unsure", + "facial_features": {}, + "emotion_of_person": "unsure" + }, + "action_performed": "walking", + "background": { + "describe": "The person is in a building with an indoor surface visible behind them.", + "color": "unspecified", + "type": "indoors" + } + } +} +``` \ No newline at end of file diff --git a/python/GeminiDetection/tests/data/person.jpg b/python/GeminiDetection/tests/data/person.jpg new file mode 100644 index 00000000..a6e548e8 Binary files /dev/null and b/python/GeminiDetection/tests/data/person.jpg differ diff --git a/python/GeminiDetection/tests/data/test_video.mp4 b/python/GeminiDetection/tests/data/test_video.mp4 new file mode 100644 index 00000000..1303a1ea Binary files 
#############################################################################
# NOTICE                                                                    #
#                                                                           #
# This software (or technical data) was produced for the U.S. Government   #
# under contract, and is subject to the Rights in Data-General Clause       #
# 52.227-14, Alt. IV (DEC 2007).                                            #
#                                                                           #
# Copyright 2024 The MITRE Corporation. All Rights Reserved.                #
#############################################################################

#############################################################################
# Copyright 2024 The MITRE Corporation                                      #
#                                                                           #
# Licensed under the Apache License, Version 2.0 (the "License");           #
# you may not use this file except in compliance with the License.          #
# You may obtain a copy of the License at                                   #
#                                                                           #
#    http://www.apache.org/licenses/LICENSE-2.0                             #
#                                                                           #
# Unless required by applicable law or agreed to in writing, software       #
# distributed under the License is distributed on an "AS IS" BASIS,         #
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  #
# See the License for the specific language governing permissions and       #
# limitations under the License.                                            #
#############################################################################

"""Unit tests for the OpenMPF Gemini Detection component.

By default (``USE_MOCKS = True``) the Gemini API is never contacted:
``GeminiComponent._get_gemini_response`` is patched with per-test side-effect
functions that return canned responses. Set ``USE_MOCKS = False`` and supply a
real ``GEMINI_API_KEY`` to exercise the live API instead.
"""

import sys
import os
import logging
import warnings
import numpy as np

# Add gemini_component to path.
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from gemini_component.gemini_component import GeminiComponent

import unittest
import unittest.mock
from unittest.mock import MagicMock, Mock
import mpf_component_api as mpf

logging.basicConfig(level=logging.DEBUG)

# When True, all Gemini calls are mocked with canned responses.
USE_MOCKS = True

# Replace with your own API key (only needed when USE_MOCKS is False).
GEMINI_API_KEY = ''

# Replace with your own desired model name.
# NOTE: fixed typo "gemma-3-27-it" -> "gemma-3-27b-it" (the published 27B
# Gemma 3 model id); only reachable when USE_MOCKS is False.
MODEL_NAME = "gemma-3-27b-it"


class TestGemini(unittest.TestCase):

    def run_patched_job(self, component, job, side_effect_function):
        """Run an image or video job, patching the Gemini call when mocking.

        Returns a list of detections/tracks in both mock and live modes so
        callers can index the result directly.
        """
        if isinstance(job, mpf.ImageJob):
            detection_func = component.get_detections_from_image
        elif isinstance(job, mpf.VideoJob):
            detection_func = component.get_detections_from_video
        else:
            raise Exception("Must be image or video job.")

        if not USE_MOCKS:
            # Live mode: materialize the generator so callers can index it.
            return list(detection_func(job))

        with unittest.mock.patch(
                "gemini_component.gemini_component.GeminiComponent._get_gemini_response",
                side_effect=side_effect_function):
            return list(detection_func(job))

    @staticmethod
    def _canned_response(job_name):
        """Read the saved Gemini output for *job_name* from tests/data/outputs."""
        path = os.path.join(
            os.path.dirname(__file__), 'data', 'outputs', f"{job_name}-output.txt")
        with open(path) as f:
            return f.read()

    def test_image_file(self):
        """Feed-forward image job: CLOTHING and ACTIVITY properties are populated."""
        ff_loc = mpf.ImageLocation(0, 0, 347, 374, -1, dict(CLASSIFICATION="PERSON"))
        job = mpf.ImageJob(
            job_name='test-image',
            data_uri=self._get_test_file('person.jpg'),
            job_properties=dict(
                GEMINI_API_KEY=GEMINI_API_KEY,
                MODEL_NAME=MODEL_NAME
            ),
            media_properties={},
            feed_forward_location=ff_loc
        )
        component = GeminiComponent()

        def side_effect_function(model_name, data_uri, prompt):
            if prompt == "Describe what this person is wearing":
                response = "The person in the image is wearing a dark suit with a matching tie. The shirt underneath appears to be light-colored, possibly white or off-white. He has glasses on his face and is smiling as he shakes hands with someone who isn't fully visible in the frame. His attire suggests a formal setting, possibly for business or an event that requires professional dress code."
            elif prompt == "Describe what this person is doing":
                response = "The person in the image appears to be shaking someone's hand. They are wearing a suit and tie, which suggests they may be in a professional or formal setting. The context of the photo is not clear from this angle, but it looks like they could be at an event or gathering where such interactions are common."
            else:
                # Previously an unexpected prompt raised a confusing NameError.
                raise AssertionError(f"Unexpected prompt: {prompt!r}")

            close_unlink_shm(data_uri)
            return {"response": response}

        result = self.run_patched_job(component, job, side_effect_function)[0]

        self.assertTrue("CLOTHING" in result.detection_properties and "ACTIVITY" in result.detection_properties)
        self.assertTrue(len(result.detection_properties["CLOTHING"]) > 0 and len(result.detection_properties["ACTIVITY"]) > 0)

    def test_image_file_no_prompts(self):
        """When the config has no prompts for the class, only CLASSIFICATION remains."""
        ff_loc = mpf.ImageLocation(0, 0, 347, 374, -1, dict(CLASSIFICATION="PERSON"))
        job = mpf.ImageJob(
            job_name='test-image-no-prompts',
            data_uri=self._get_test_file('person.jpg'),
            job_properties=dict(
                PROMPT_CONFIGURATION_PATH=self._get_test_file('custom_prompts.json'),
                GEMINI_API_KEY=GEMINI_API_KEY,
                MODEL_NAME=MODEL_NAME
            ),
            media_properties={},
            feed_forward_location=ff_loc
        )
        component = GeminiComponent()

        def side_effect_function(model_name, data_uri, prompt):
            close_unlink_shm(data_uri)
            return {"response": ""}

        result = self.run_patched_job(component, job, side_effect_function)[0]
        self.assertTrue(len(result.detection_properties) == 1 and result.detection_properties['CLASSIFICATION'] == 'PERSON')

    def test_custom_config(self):
        """A custom prompt config maps prompts to custom output properties."""
        ff_loc = mpf.ImageLocation(0, 0, 900, 1600, -1, dict(CLASSIFICATION="DOG"))
        job = mpf.ImageJob(
            job_name='test-custom',
            data_uri=self._get_test_file('dog.jpg'),
            job_properties=dict(
                PROMPT_CONFIGURATION_PATH=self._get_test_file('custom_prompts.json'),
                GEMINI_API_KEY=GEMINI_API_KEY,
                MODEL_NAME=MODEL_NAME
            ),
            media_properties={},
            feed_forward_location=ff_loc
        )
        component = GeminiComponent()

        def side_effect_function(model_name, data_uri, prompt):
            if prompt == "Describe the color and breed of the dog.":
                response = "The dog in the image appears to be a Golden Retriever. The breed is known for its golden-colored fur, which can range from pale blonde to deeper golden shades, often with some darker feathering around the ears and along the tail. This specific dog has a beautiful golden coat that suggests it may be younger or well-groomed. The facial features of Golden Retriever dogs are also quite distinctive, such as their expressive eyes and long, floppy ears. They are medium to large-sized breed with a friendly and intelligent disposition."
            else:
                raise AssertionError(f"Unexpected prompt: {prompt!r}")

            close_unlink_shm(data_uri)
            return {"response": response}

        result = self.run_patched_job(component, job, side_effect_function)[0]

        self.assertTrue("DESCRIPTION" in result.detection_properties)
        self.assertTrue(len(result.detection_properties["DESCRIPTION"]) > 0)

    def test_video_file(self):
        """Feed-forward video job: every frame location gets a DESCRIPTION."""
        warnings.filterwarnings(action="ignore", message="unclosed", category=ResourceWarning)

        ff_track = mpf.VideoTrack(0, 0, -1, {}, {'CLASSIFICATION': 'DOG'})
        ff_track.frame_locations[0] = mpf.ImageLocation(0, 0, 3456, 5184, -1, {'CLASSIFICATION': 'DOG', 'CLASSIFICATION CONFIDENCE LIST': '-1', 'CLASSIFICATION LIST': 'DOG'})

        job = mpf.VideoJob(
            job_name='test-video',
            data_uri=self._get_test_file('test_video.mp4'),
            start_frame=0,
            stop_frame=0,
            job_properties=dict(
                PROMPT_CONFIGURATION_PATH=self._get_test_file('custom_prompts.json'),
                GEMINI_API_KEY=GEMINI_API_KEY,
                MODEL_NAME=MODEL_NAME,
                GENERATE_FRAME_RATE_CAP='-1'
            ),
            media_properties={},
            feed_forward_track=ff_track
        )
        component = GeminiComponent()

        def side_effect_function(model_name, data_uri, prompt):
            if prompt == "Describe the color and breed of the dog.":
                response = "The dog in the image appears to be a Border Collie. The breed is characterized by its black and white color pattern, which you can see here with distinct patches of black fur against a mostly white background. Border Collies are known for their intelligent eyes and expressive faces, which they use to work livestock. They also have a double coat that is thick and wavy in texture. In this photo, the dog looks well-groomed and healthy."
            else:
                raise AssertionError(f"Unexpected prompt: {prompt!r}")

            close_unlink_shm(data_uri)
            return {"response": response}

        result = list(self.run_patched_job(component, job, side_effect_function))[0]
        for ff_location in result.frame_locations.values():
            self.assertTrue("DESCRIPTION" in ff_location.detection_properties)
            self.assertTrue(len(ff_location.detection_properties['DESCRIPTION']) > 0)

    def test_full_frame_image(self):
        """No feed-forward location: the whole frame is described."""
        job = mpf.ImageJob(
            job_name='test-full-frame-image',
            data_uri=self._get_test_file('dog.jpg'),
            job_properties=dict(
                PROMPT_CONFIGURATION_PATH=self._get_test_file('custom_prompts.json'),
                GEMINI_API_KEY=GEMINI_API_KEY,
                MODEL_NAME=MODEL_NAME
            ),
            media_properties={},
            feed_forward_location=None
        )
        component = GeminiComponent()

        def side_effect_function(model_name, data_uri, prompt):
            if prompt == "Describe this image":
                response = "The image shows a medium-sized dog sitting on a couch. The dog appears to be a breed with tan and white fur, likely a mix given the irregular patterns of its coat. It has a scrunched up expression on its face, possibly indicating curiosity or attentiveness towards something off-camera. There is a small animal, potentially another pet such as a cat, in front of the dog's paws. The background is blurred but seems to be an indoor setting with natural light filtering through."
            elif prompt == "Describe the location in this scene":
                response = "The image shows a dog sitting on a couch indoors. The room has a light wood floor and there's a glimpse of what appears to be an artwork or picture frame hanging on the wall in the background. The focus is on the dog, which suggests that it's either the main subject of the photograph or someone wanted to capture a candid moment with their pet."
            else:
                raise AssertionError(f"Unexpected prompt: {prompt!r}")

            close_unlink_shm(data_uri)
            return {"response": response}

        results = self.run_patched_job(component, job, side_effect_function)
        for result in results:
            self.assertTrue("LOCATION" in result.detection_properties and "DESCRIPTION" in result.detection_properties)
            self.assertTrue(len(result.detection_properties["LOCATION"]) > 0 and len(result.detection_properties["DESCRIPTION"]) > 0)

    def test_full_frame_video(self):
        """No feed-forward track: every processed frame is described."""
        job = mpf.VideoJob(
            job_name='test-full-frame-video',
            data_uri=self._get_test_file('test_video.mp4'),
            start_frame=0,
            stop_frame=14,
            job_properties=dict(
                PROMPT_CONFIGURATION_PATH=self._get_test_file('custom_prompts.json'),
                GEMINI_API_KEY=GEMINI_API_KEY,
                MODEL_NAME=MODEL_NAME,
                GENERATE_FRAME_RATE_CAP='-1'
            ),
            media_properties={},
            feed_forward_track=None
        )
        component = GeminiComponent()

        def side_effect_function(model_name, data_uri, prompt):
            if prompt == "Describe this image":
                response = "This is a photo of a dog with a black, white, and grey coat. The dog appears to be a Border Collie or similar breed known for its distinctive coloring. It's sitting on what looks like a concrete surface outdoors, possibly in a yard or on a patio. The dog has a focused gaze towards the camera, and its mouth is slightly open, suggesting it might be panting or perhaps reacting to the person taking the photo. In the background, there are elements of a fence and vegetation, indicating that this setting could be near a garden or a fenced area. The lighting suggests it's daytime."
            elif prompt == "Describe the location in this scene":
                response = "The image shows a dog sitting on what appears to be a stone or concrete floor. The dog is facing the camera with its mouth open, revealing its teeth and tongue, which could suggest it's panting or smiling. There is a fence in the background, indicating that this might be an outdoor area such as a garden, patio, or a residential backyard. Beyond the fence, there are some plants and trees, suggesting a natural environment. The lighting appears to be diffused, possibly from cloudy weather or shaded by nearby structures or foliage. There's no text visible in the image."
            else:
                raise AssertionError(f"Unexpected prompt: {prompt!r}")

            close_unlink_shm(data_uri)
            return {"response": response}

        results = self.run_patched_job(component, job, side_effect_function)
        for result in results:
            self.assertTrue("LOCATION" in result.detection_properties and "DESCRIPTION" in result.detection_properties)
            self.assertTrue(len(result.detection_properties["LOCATION"]) > 0 and len(result.detection_properties["DESCRIPTION"]) > 0)

    def test_json_response_image(self):
        """JSON prompt format on an image: canned JSON output expands into many properties."""
        ff_loc = mpf.ImageLocation(0, 0, 347, 374, -1, dict(CLASSIFICATION="PERSON"))
        job = mpf.ImageJob(
            job_name='test-json-response-image',
            data_uri=self._get_test_file('person.jpg'),
            job_properties=dict(
                GEMINI_API_KEY=GEMINI_API_KEY,
                MODEL_NAME=MODEL_NAME,
                ENABLE_JSON_PROMPT_FORMAT='True'
            ),
            media_properties={},
            feed_forward_location=ff_loc
        )
        component = GeminiComponent()

        def side_effect_function(model_name, data_uri, prompt):
            response = self._canned_response(job.job_name)
            close_unlink_shm(data_uri)
            return response

        result = self.run_patched_job(component, job, side_effect_function)[0]
        self.assertTrue(len(result.detection_properties) > 1)

    def test_json_response_video(self):
        """JSON prompt format on a video: frame properties expand from canned JSON."""
        warnings.filterwarnings(action="ignore", message="unclosed", category=ResourceWarning)

        ff_track = mpf.VideoTrack(0, 0, -1, {}, {'CLASSIFICATION': 'DOG'})
        ff_track.frame_locations[0] = mpf.ImageLocation(0, 0, 3456, 5184, -1, {'CLASSIFICATION': 'DOG', 'CLASSIFICATION CONFIDENCE LIST': '-1', 'CLASSIFICATION LIST': 'DOG'})

        job = mpf.VideoJob(
            job_name='test-json-response-video',
            data_uri=self._get_test_file('test_video.mp4'),
            start_frame=0,
            stop_frame=0,
            job_properties=dict(
                GEMINI_API_KEY=GEMINI_API_KEY,
                MODEL_NAME=MODEL_NAME,
                ENABLE_JSON_PROMPT_FORMAT='True',
                JSON_PROMPT_CONFIGURATION_PATH=self._get_test_file('custom_json_prompts.json'),
                GENERATE_FRAME_RATE_CAP='-1'
            ),
            media_properties={},
            feed_forward_track=ff_track
        )
        component = GeminiComponent()

        def side_effect_function(model_name, data_uri, prompt):
            response = self._canned_response(job.job_name)
            close_unlink_shm(data_uri)
            return response

        result = list(self.run_patched_job(component, job, side_effect_function))[0]
        self.assertTrue(len(result.frame_locations[0].detection_properties) > 3)

    def test_video_file_nth_frame(self):
        """GENERATE_FRAME_RATE_CAP=1.0 at FPS=2 annotates every other frame."""
        warnings.filterwarnings(action="ignore", message="unclosed", category=ResourceWarning)

        ff_track = mpf.VideoTrack(0, 0, -1, {}, {'CLASSIFICATION': 'DOG'})
        for i in range(5):
            ff_track.frame_locations[i] = mpf.ImageLocation(0, 0, 3456, 5184, -1, {'CLASSIFICATION': 'DOG', 'CLASSIFICATION CONFIDENCE LIST': '-1', 'CLASSIFICATION LIST': 'DOG'})

        job = mpf.VideoJob(
            job_name='test-video-nth-frame',
            data_uri=self._get_test_file('test_video.mp4'),
            start_frame=0,
            stop_frame=4,
            job_properties=dict(
                PROMPT_CONFIGURATION_PATH=self._get_test_file('custom_prompts.json'),
                GEMINI_API_KEY=GEMINI_API_KEY,
                MODEL_NAME=MODEL_NAME,
                GENERATE_FRAME_RATE_CAP='1.0'
            ),
            media_properties={
                'FPS': '2'
            },
            feed_forward_track=ff_track
        )
        component = GeminiComponent()

        def side_effect_function(model_name, data_uri, prompt):
            if prompt == "Describe the color and breed of the dog.":
                response = "The dog in the image appears to be a Border Collie. The breed is characterized by its black and white color pattern, which you can see here with distinct patches of black fur against a mostly white background. Border Collies are known for their intelligent eyes and expressive faces, which they use to work livestock. They also have a double coat that is thick and wavy in texture. In this photo, the dog looks well-groomed and healthy."
            else:
                raise AssertionError(f"Unexpected prompt: {prompt!r}")

            close_unlink_shm(data_uri)
            return {"response": response}

        result = list(self.run_patched_job(component, job, side_effect_function))[0]

        for i, ff_location in result.frame_locations.items():
            if i % 2 == 0:
                self.assertTrue("DESCRIPTION" in ff_location.detection_properties)
                self.assertTrue(len(ff_location.detection_properties['DESCRIPTION']) > 0)
            else:
                self.assertTrue("DESCRIPTION" not in ff_location.detection_properties)

    def test_video_file_nth_frame_json(self):
        """Frame-rate capping with the JSON prompt format: alternate frames annotated."""
        warnings.filterwarnings(action="ignore", message="unclosed", category=ResourceWarning)

        ff_track = mpf.VideoTrack(0, 0, -1, {}, {'CLASSIFICATION': 'DOG'})
        for i in range(5):
            ff_track.frame_locations[i] = mpf.ImageLocation(0, 0, 3456, 5184, -1, {'CLASSIFICATION': 'DOG', 'CLASSIFICATION CONFIDENCE LIST': '-1', 'CLASSIFICATION LIST': 'DOG'})

        job = mpf.VideoJob(
            job_name='test-video-nth-frame-json',
            data_uri=self._get_test_file('test_video.mp4'),
            start_frame=0,
            stop_frame=4,
            job_properties=dict(
                JSON_PROMPT_CONFIGURATION_PATH=self._get_test_file('custom_json_prompts.json'),
                ENABLE_JSON_PROMPT_FORMAT='True',
                GEMINI_API_KEY=GEMINI_API_KEY,
                MODEL_NAME=MODEL_NAME
            ),
            media_properties={
                'FPS': '2'
            },
            feed_forward_track=ff_track
        )
        component = GeminiComponent()

        def side_effect_function(model_name, data_uri, prompt):
            response = self._canned_response(job.job_name)
            close_unlink_shm(data_uri)
            return response

        result = list(self.run_patched_job(component, job, side_effect_function))[0]

        for i, ff_location in result.frame_locations.items():
            if i % 2 == 0:
                self.assertTrue("ANNOTATED BY GEMINI" in ff_location.detection_properties)
            else:
                self.assertTrue("ANNOTATED BY GEMINI" not in ff_location.detection_properties)

    def test_unsure_results(self):
        """'unsure' values in JSON responses are dropped from detection properties."""
        ff_loc = mpf.ImageLocation(0, 0, 347, 374, -1, dict(CLASSIFICATION="PERSON"))
        job = mpf.ImageJob(
            job_name='test-unsure',
            data_uri=self._get_test_file('person.jpg'),
            job_properties=dict(
                GEMINI_API_KEY=GEMINI_API_KEY,
                MODEL_NAME=MODEL_NAME,
                JSON_PROMPT_CONFIGURATION_PATH=self._get_test_file('custom_json_prompts.json'),
                ENABLE_JSON_PROMPT_FORMAT='True'
            ),
            media_properties={},
            feed_forward_location=ff_loc
        )
        component = GeminiComponent()

        expected_detection_properties = {
            'CLASSIFICATION': 'PERSON',
            'GEMINI PERSON ACTION PERFORMED': 'Walking',
            'GEMINI PERSON BACKGROUND DESCRIBE': 'The woman is walking indoors, possibly in a corridor or hall. The background is out of focus and does not provide any specific information.',
            'GEMINI PERSON CLOTHING UPPER BODY CLOTHING': 'black top', 'GEMINI PERSON CLOTHING LOWER BODY CLOTHING': 'black pants',
            'GEMINI PERSON ESTIMATED GENDER': 'female', 'GEMINI PERSON OBJECT IN HAND TYPE': 'bag', 'GEMINI PERSON OBJECT IN HAND COLOR': 'dark',
            'GEMINI PERSON OBJECT IN HAND DESCRIBE': 'Person is carrying a large dark bag.',
            'GEMINI PERSON SHOE TYPE': 'sneakers', 'GEMINI PERSON SHOE COLOR': 'white',
            'GEMINI PERSON SHOE DESCRIBE': 'Woman is wearing white sneakers',
            'GEMINI PERSON TYPE': 'civilian',
            'GEMINI PERSON NEST LEVEL 1 LEVEL 1': 'valid',
            'GEMINI PERSON NEST LEVEL 1 NEST LEVEL 2 LEVEL 2': 'valid',
            'GEMINI PERSON NEST LEVEL 1 NEST LEVEL 2 NEST LEVEL 3 LEVEL 3': 'valid',
            'GEMINI PERSON OTHER NOTABLE CHARACTERISTICS NORMAL': 'behavior',
            'GEMINI PERSON OTHER NOTABLE CHARACTERISTICS BEHAVIOR': 'normal',
            'ANNOTATED BY GEMINI': True
        }

        def side_effect_function(model_name, data_uri, prompt):
            response = self._canned_response(job.job_name)
            close_unlink_shm(data_uri)
            return response

        result = self.run_patched_job(component, job, side_effect_function)[0]

        self.assertEqual(result.detection_properties, expected_detection_properties)

    def test_visible_results(self):
        """'visible_*: false' sections suppress the corresponding nested properties."""
        ff_loc = mpf.ImageLocation(0, 0, 347, 374, -1, dict(CLASSIFICATION="PERSON"))
        job = mpf.ImageJob(
            job_name='test-visible',
            data_uri=self._get_test_file('person.jpg'),
            job_properties=dict(
                GEMINI_API_KEY=GEMINI_API_KEY,
                MODEL_NAME=MODEL_NAME,
                JSON_PROMPT_CONFIGURATION_PATH=self._get_test_file('custom_json_prompts.json'),
                ENABLE_JSON_PROMPT_FORMAT='True'
            ),
            media_properties={},
            feed_forward_location=ff_loc
        )
        component = GeminiComponent()

        expected_detection_properties = {
            'CLASSIFICATION': 'PERSON',
            'GEMINI PERSON CLOTHING HEADWEAR': '[{"type": "hoodie", "color": "black", "location": "head", "description": "A black hood that is pulled up."}]',
            'GEMINI PERSON CLOTHING TOP LAYER TYPE': 'jacket',
            'GEMINI PERSON CLOTHING TOP LAYER COLOR': 'black',
            'GEMINI PERSON CLOTHING TOP LAYER LOCATION': 'torso',
            'GEMINI PERSON CLOTHING TOP LAYER DESCRIPTION': 'A dark colored jacket covering the torso.',
            'GEMINI PERSON CLOTHING LOWER LAYER COLOR': 'dark',
            'GEMINI PERSON CLOTHING LOWER LAYER LOCATION': 'pants',
            'GEMINI PERSON CLOTHING LOWER LAYER DESCRIPTION': 'Dark-colored pants that are only partially visible.',
            'GEMINI PERSON ESTIMATED AGE RANGE': 'adult',
            'GEMINI PERSON ESTIMATED GENDER': 'male',
            'GEMINI PERSON PERSON WEARING SHOE': 'True',
            'GEMINI PERSON SHOE TYPE': 'sneaker',
            'GEMINI PERSON SHOE COLOR': 'black',
            'GEMINI PERSON SHOE DESCRIPTION': 'Black sneaker on the foot.',
            'GEMINI PERSON HEAD FEATURES BALD': 'True',
            'GEMINI PERSON HEAD FEATURES HEAD COVER TYPE': 'hoodie',
            'GEMINI PERSON ACTION PERFORMED': 'walking',
            'GEMINI PERSON BACKGROUND DESCRIBE': 'The person is in a building with an indoor surface visible behind them.',
            'GEMINI PERSON BACKGROUND TYPE': 'indoors',
            'ANNOTATED BY GEMINI': True
        }

        def side_effect_function(model_name, data_uri, prompt):
            response = self._canned_response(job.job_name)
            close_unlink_shm(data_uri)
            return response

        result = self.run_patched_job(component, job, side_effect_function)[0]

        self.assertEqual(result.detection_properties, expected_detection_properties)

    def test_ignore_person_results(self):
        """A response flagged to ignore the person yields only the original properties."""
        ff_loc = mpf.ImageLocation(0, 0, 347, 374, -1, dict(CLASSIFICATION="PERSON"))
        job = mpf.ImageJob(
            job_name='test-ignore-person',
            data_uri=self._get_test_file('person.jpg'),
            job_properties=dict(
                GEMINI_API_KEY=GEMINI_API_KEY,
                MODEL_NAME=MODEL_NAME,
                JSON_PROMPT_CONFIGURATION_PATH=self._get_test_file('custom_json_prompts.json'),
                ENABLE_JSON_PROMPT_FORMAT='True'
            ),
            media_properties={},
            feed_forward_location=ff_loc
        )
        component = GeminiComponent()

        expected_detection_properties = {
            'CLASSIFICATION': 'PERSON',
            'ANNOTATED BY GEMINI': True
        }

        def side_effect_function(model, data_uri, prompt):
            response = self._canned_response(job.job_name)
            close_unlink_shm(data_uri)
            return response

        result = self.run_patched_job(component, job, side_effect_function)[0]

        self.assertEqual(result.detection_properties, expected_detection_properties)

    def test_get_frames(self):
        """Frame-subsampling helper: first frame of each interval plus the last frame."""
        component = GeminiComponent()
        self.assertEqual(component._get_frames_to_process([], 1), [])
        self.assertEqual(component._get_frames_to_process([1], 2), [1])
        self.assertEqual(component._get_frames_to_process([503], 2), [503])
        self.assertEqual(component._get_frames_to_process([503, 1_000], 5_000), [503])
        self.assertEqual(component._get_frames_to_process([0, 1, 2, 3, 4, 5], 1), [0, 1, 2, 3, 4, 5])
        self.assertEqual(component._get_frames_to_process([0, 1, 2, 3, 4, 5], 2), [0, 2, 4, 5])
        self.assertEqual(component._get_frames_to_process([0, 1, 2, 3, 4, 5], 3), [0, 3, 5])
        self.assertEqual(component._get_frames_to_process([0, 1, 2, 3, 4, 5, 900], 3), [0, 3, 5, 900])
        self.assertEqual(component._get_frames_to_process([4, 900, 902, 905, 906, 907, 908, 909, 910, 911, 912, 913], 5), [4, 900, 905, 910, 913])
        self.assertEqual(component._get_frames_to_process([910, 911, 912, 913, 914, 915, 916, 917, 918], 6), [910, 916])
        self.assertEqual(component._get_frames_to_process([910, 911, 912, 913, 914, 915, 916, 917, 918, 919], 6), [910, 916, 919])
        self.assertEqual(component._get_frames_to_process([910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920], 6), [910, 916, 920])
        self.assertEqual(component._get_frames_to_process([910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921], 6), [910, 916, 921])
        self.assertEqual(component._get_frames_to_process([910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922], 6), [910, 916, 922])
        self.assertEqual(component._get_frames_to_process([910, 911, 912, 913, 914, 915, 916, 917, 918, 5_000, 5_001, 10_000], 6), [910, 916, 5_000, 10_000])

    @staticmethod
    def _get_test_file(filename):
        """Return the absolute path of a file under tests/data."""
        return os.path.join(os.path.dirname(__file__), 'data', filename)


def close_unlink_shm(data_uri):
    """Close and unlink the shared-memory segment a mocked call received.

    The component passes (name, SharedMemory) tuples to the subprocess; since
    the mock short-circuits the real consumer, the segment must be released
    here to avoid leaking shared memory between tests.
    """
    if isinstance(data_uri, tuple):
        _, shm = data_uri
        shm.close()
        try:
            shm.unlink()
        except FileNotFoundError:
            pass


if __name__ == '__main__':
    unittest.main(verbosity=2)