From 8b7b5add6685e979fd7815c4ea7c540d039c5c36 Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Tue, 23 Sep 2025 15:05:18 -0400 Subject: [PATCH 1/5] Updating WtP models. Adding sentence splitting option. --- python/AzureTranslation/README.md | 25 ++++--- .../plugin-files/descriptor/descriptor.json | 4 +- .../tests/test_acs_translation.py | 75 +++++++++++++++++++ 3 files changed, 91 insertions(+), 13 deletions(-) diff --git a/python/AzureTranslation/README.md b/python/AzureTranslation/README.md index d12a81f8..5cadb230 100644 --- a/python/AzureTranslation/README.md +++ b/python/AzureTranslation/README.md @@ -87,25 +87,28 @@ must be provided. Neither has a default value. The following settings control the behavior of dividing input text into acceptable chunks for processing. -Through preliminary investigation, we identified the [WtP library ("Where's the +Through preliminary investigation, we identified the [SaT/WtP library ("Segment any Text" / "Where's the Point")](https://github.com/bminixhofer/wtpsplit) and [spaCy's multilingual sentence detection model](https://spacy.io/models) for identifying sentence breaks in a large section of text. -WtP models are trained to split up multilingual text by sentence without the need of an +SaT/WtP models are trained to split up multilingual text by sentence without the need of an input language tag. The disadvantage is that the most accurate WtP models will need ~3.5 -GB of GPU memory. On the other hand, spaCy has a single multilingual sentence detection +GB of GPU memory. SaT models are a more recent addition and considered to be a more accurate +set of sentence segmentation models; their resource costs are similar to WtP. + +On the other hand, spaCy has a single multilingual sentence detection that appears to work better for splitting up English text in certain cases, unfortunately this model lacks support handling for Chinese punctuation. -- `SENTENCE_MODEL`: Specifies the desired WtP or spaCy sentence detection model. 
For CPU - and runtime considerations, the author of WtP recommends using `wtp-bert-mini`. More - advanced WtP models that use GPU resources (up to ~8 GB) are also available. See list of - WtP model names +- `SENTENCE_MODEL`: Specifies the desired SaT/WtP or spaCy sentence detection model. For CPU + and runtime considerations, the authors of SaT/WtP recommend using `sat-3l-sm` or `wtp-bert-mini`. + More advanced SaT/WtP models that use GPU resources (up to ~8 GB for WtP) are also available. See list of + model names [here](https://github.com/bminixhofer/wtpsplit?tab=readme-ov-file#available-models). The only available spaCy model (for text with unknown language) is `xx_sent_ud_sm`. - Review list of languages supported by WtP + Review list of languages supported by SaT/WtP [here](https://github.com/bminixhofer/wtpsplit?tab=readme-ov-file#supported-languages). Review models and languages supported by spaCy [here](https://spacy.io/models). @@ -116,15 +119,15 @@ this model lacks support handling for Chinese punctuation. [here](https://discourse.mozilla.org/t/proposal-sentences-lenght-limit-from-14-words-to-100-characters). - `SENTENCE_SPLITTER_INCLUDE_INPUT_LANG`: Specifies whether to pass input language to - sentence splitter algorithm. Currently, only WtP supports model threshold adjustments by + sentence splitter algorithm. Currently, only SaT/WtP supports model threshold adjustments by input language. - `SENTENCE_MODEL_CPU_ONLY`: If set to TRUE, only use CPU resources for the sentence detection model. If set to FALSE, allow sentence model to also use GPU resources. - For most runs using spaCy `xx_sent_ud_sm` or `wtp-bert-mini` models, GPU resources + For most runs using spaCy `xx_sent_ud_sm`, `sat-3l-sm`, or `wtp-bert-mini` models, GPU resources are not required. If using more advanced WtP models like `wtp-canine-s-12l`, it is recommended to set `SENTENCE_MODEL_CPU_ONLY=FALSE` to improve performance. - That model can use up to ~3.5 GB of GPU memory. 
+ That WtP model can use up to ~3.5 GB of GPU memory. Please note, to fully enable this option, you must also rebuild the Docker container with the following change: Within the Dockerfile, set `ARG BUILD_TYPE=gpu`. diff --git a/python/AzureTranslation/plugin-files/descriptor/descriptor.json b/python/AzureTranslation/plugin-files/descriptor/descriptor.json index a04762bb..f66891f6 100644 --- a/python/AzureTranslation/plugin-files/descriptor/descriptor.json +++ b/python/AzureTranslation/plugin-files/descriptor/descriptor.json @@ -95,7 +95,7 @@ }, { "name": "SENTENCE_MODEL", - "description": "Name of sentence segmentation model. Supported options are spaCy's multilingual `xx_sent_ud_sm` model and the Where's the Point (WtP) `wtp-bert-mini` model.", + "description": "Name of sentence segmentation model. Supported options are spaCy's multilingual `xx_sent_ud_sm` model, Segment any Text (SaT) `sat-3l-sm` model, and Where's the Point (WtP) `wtp-bert-mini` model.", "type": "STRING", "defaultValue": "wtp-bert-mini" }, @@ -107,7 +107,7 @@ }, { "name": "SENTENCE_MODEL_WTP_DEFAULT_ADAPTOR_LANGUAGE", - "description": "More advanced WTP models will require a target language. This property sets the default language to use for sentence splitting, unless `FROM_LANGUAGE`, `SUGGESTED_FROM_LANGUAGE`, or Azure language detection return a different, WtP-supported language option.", + "description": "More advanced WtP/SaT models will require a target language. 
This property sets the default language to use for sentence splitting, unless `FROM_LANGUAGE`, `SUGGESTED_FROM_LANGUAGE`, or Azure language detection return a different, WtP-supported language option.", "type": "STRING", "defaultValue": "en" }, diff --git a/python/AzureTranslation/tests/test_acs_translation.py b/python/AzureTranslation/tests/test_acs_translation.py index d2297f71..90206f8e 100644 --- a/python/AzureTranslation/tests/test_acs_translation.py +++ b/python/AzureTranslation/tests/test_acs_translation.py @@ -65,12 +65,14 @@ class TestAcsTranslation(unittest.TestCase): mock_server: ClassVar['MockServer'] wtp_model: ClassVar['TextSplitterModel'] + sat_model: ClassVar['TextSplitterModel'] spacy_model: ClassVar['TextSplitterModel'] @classmethod def setUpClass(cls): cls.mock_server = MockServer() cls.wtp_model = TextSplitterModel("wtp-bert-mini", "cpu", "en") + cls.sat_model = TextSplitterModel("sat-3l-sm", "cpu", "en") cls.spacy_model = TextSplitterModel("xx_sent_ud_sm", "cpu", "en") @@ -669,6 +671,79 @@ def test_split_wtp_unknown_lang(self, _): 'Spaces should be kept due to incorrect language detection.') + @mock.patch.object(TranslationClient, 'DETECT_MAX_CHARS', new_callable=lambda: 150) + def test_split_sat_unknown_lang(self, _): + # Check that the text splitter does not have an issue + # processing an unknown detected language. 
+ self.set_results_file('invalid-lang-detect-result.json') + self.set_results_file('split-sentence/art-of-war-translation-1.json') + self.set_results_file('split-sentence/art-of-war-translation-2.json') + self.set_results_file('split-sentence/art-of-war-translation-3.json') + self.set_results_file('split-sentence/art-of-war-translation-4.json') + + text = (TEST_DATA / 'split-sentence/art-of-war.txt').read_text() + detection_props = dict(TEXT=text) + TranslationClient(get_test_properties(), self.sat_model).add_translations(detection_props) + + self.assertEqual(5, len(detection_props)) + self.assertEqual(text, detection_props['TEXT']) + + expected_translation = (TEST_DATA / 'split-sentence/art-war-translation.txt') \ + .read_text().strip() + self.assertEqual(expected_translation, detection_props['TRANSLATION']) + self.assertEqual('EN', detection_props['TRANSLATION TO LANGUAGE']) + + self.assertEqual('fake-lang', detection_props['TRANSLATION SOURCE LANGUAGE']) + self.assertAlmostEqual(1.0, + float(detection_props['TRANSLATION SOURCE LANGUAGE CONFIDENCE'])) + + detect_request_text = self.get_request_body()[0]['Text'] + self.assertEqual(text[0:TranslationClient.DETECT_MAX_CHARS], detect_request_text) + + expected_chunk_lengths = [88, 118, 116, 106] + self.assertEqual(sum(expected_chunk_lengths), len(text)) + + # Due to an incorrect language detection, newlines are + # not properly replaced for Chinese text, and + # additional whitespace is present in the text. + # This alters the behavior of WtP sentence splitting. 
+ translation_request1 = self.get_request_body()[0]['Text'] + self.assertEqual(expected_chunk_lengths[0], len(translation_request1)) + self.assertTrue(translation_request1.startswith('兵者,')) + self.assertTrue(translation_request1.endswith('而不危也;')) + self.assertNotIn('\n', translation_request1, + 'Newlines were not properly removed') + self.assertIn(' ', translation_request1, + 'Spaces should be kept due to incorrect language detection.') + + translation_request2 = self.get_request_body()[0]['Text'] + self.assertEqual(expected_chunk_lengths[1], len(translation_request2)) + self.assertTrue(translation_request2.startswith('天者,陰陽')) + self.assertTrue(translation_request2.endswith('兵眾孰強?')) + self.assertNotIn('\n', translation_request2, + 'Newlines were not properly removed') + self.assertIn(' ', translation_request2, + 'Spaces should be kept due to incorrect language detection.') + + translation_request3 = self.get_request_body()[0]['Text'] + self.assertEqual(expected_chunk_lengths[2], len(translation_request3)) + self.assertTrue(translation_request3.startswith('士卒孰練?')) + self.assertTrue(translation_request3.endswith('亂而取之, ')) + self.assertNotIn('\n', translation_request3, + 'Newlines were not properly removed') + self.assertIn(' ', translation_request3, + 'Spaces should be kept due to incorrect language detection.') + + translation_request4 = self.get_request_body()[0]['Text'] + self.assertEqual(expected_chunk_lengths[3], len(translation_request4)) + self.assertTrue(translation_request4.startswith('實而備之,')) + self.assertTrue(translation_request4.endswith('勝負見矣。 ')) + self.assertNotIn('\n', translation_request4, + 'Newlines were not properly removed') + self.assertIn(' ', translation_request4, + 'Spaces should be kept due to incorrect language detection.') + + def test_newline_removal(self): def replace(text): From 0138e9b9dfafd55a91967f1e3e14461008f3ebe4 Mon Sep 17 00:00:00 2001 From: regexer Date: Wed, 1 Oct 2025 17:34:01 -0400 Subject: [PATCH 2/5] Update 
LlamaVideoSummarization to use TIMELINE_CHECK_ACCEPTABLE_THRESHOLD (#409) * Validate timestamps. --------- Co-authored-by: jrobble --- .../__init__.py | 177 ++++++++++------ .../plugin-files/descriptor/descriptor.json | 8 +- .../tests/test_llama_video_summarization.py | 199 ++++++++++++++---- 3 files changed, 275 insertions(+), 109 deletions(-) diff --git a/python/LlamaVideoSummarization/llama_video_summarization_component/__init__.py b/python/LlamaVideoSummarization/llama_video_summarization_component/__init__.py index 49544129..2b749651 100644 --- a/python/LlamaVideoSummarization/llama_video_summarization_component/__init__.py +++ b/python/LlamaVideoSummarization/llama_video_summarization_component/__init__.py @@ -31,9 +31,10 @@ import pickle import socket import subprocess +import re from jsonschema import validate, ValidationError -from typing import Any, Iterable, List, Mapping, Tuple, Union +from typing import Any, cast, Iterable, List, Mapping, Tuple, Union import mpf_component_api as mpf import mpf_component_util as mpf_util @@ -45,7 +46,6 @@ class LlamaVideoSummarizationComponent: def __init__(self): self.child_process = ChildProcess(['/llama/venv/bin/python3', '/llama/summarize_video.py', str(log.getEffectiveLevel())]) - def get_detections_from_video(self, job: mpf.VideoJob) -> Iterable[mpf.VideoTrack]: try: log.info('Received video job.') @@ -62,6 +62,15 @@ def get_detections_from_video(self, job: mpf.VideoJob) -> Iterable[mpf.VideoTrac segment_stop_time = (job.stop_frame + 1) / float(job.media_properties['FPS']) job_config = _parse_properties(job.job_properties, segment_start_time) + + if job_config['timeline_check_target_threshold'] < 0 and \ + job_config['timeline_check_acceptable_threshold'] >= 0: + log.warning('TIMELINE_CHECK_ACCEPTABLE_THRESHOLD will be ignored since TIMELINE_CHECK_TARGET_THRESHOLD < 0.') + + if job_config['timeline_check_acceptable_threshold'] < job_config['timeline_check_target_threshold']: + raise 
mpf.DetectionError.INVALID_PROPERTY.exception( + 'TIMELINE_CHECK_ACCEPTABLE_THRESHOLD must be >= TIMELINE_CHECK_TARGET_THRESHOLD.') + job_config['video_path'] = job.data_uri job_config['segment_start_time'] = segment_start_time job_config['segment_stop_time'] = segment_stop_time @@ -89,10 +98,12 @@ def _get_response_from_subprocess(self, job_config: dict) -> dict: max_attempts = job_config['generation_max_attempts'] timeline_check_target_threshold = job_config['timeline_check_target_threshold'] + timeline_check_acceptable_threshold = job_config['timeline_check_acceptable_threshold'] segment_start_time = job_config['segment_start_time'] segment_stop_time = job_config['segment_stop_time'] - response_json = {} + response_json = None + acceptable_json = None error = None while max(attempts.values()) < max_attempts: response = self.child_process.send_job_get_response(job_config) @@ -100,113 +111,133 @@ def _get_response_from_subprocess(self, job_config: dict) -> dict: if error is not None: continue - # if no error, then response_json should be valid - event_timeline = response_json['video_event_timeline'] # type: ignore - - if timeline_check_target_threshold != -1: - error = self._check_timeline( - timeline_check_target_threshold, attempts, max_attempts, segment_start_time, segment_stop_time, event_timeline) + if timeline_check_target_threshold >= 0: + acceptable, error = self._check_timeline( + timeline_check_target_threshold, timeline_check_acceptable_threshold, + attempts, max_attempts, segment_start_time, segment_stop_time, cast(dict, response_json)) + if acceptable: + acceptable_json = response_json if error is not None: continue break if error: - raise mpf.DetectionError.DETECTION_FAILED.exception(f'Subprocess failed: {error}') + if acceptable_json is not None: + log.info('Couldn\'t satisfy target threshold. 
Falling back to response that satisfies acceptable threshold.') + return acceptable_json + else: + raise mpf.DetectionError.DETECTION_FAILED.exception(f'Subprocess failed: {error}') - # if no error, then response_json should be valid + # if no error, then response_json should be valid and meet target criteria return response_json # type: ignore def _check_response(self, attempts: dict, max_attempts: int, schema_json: dict, response: str ) -> Tuple[Union[dict, None], Union[str, None]]: + error = None response_json = None if not response: error = 'Empty response.' - log.warning(error) - log.warning(f'Failed {attempts["base"] + 1} of {max_attempts} base attempts.') - attempts['base'] += 1 - return None, error - try: - response_json = json.loads(response) - except ValueError as ve: - error = 'Response is not valid JSON.' + if not error: + try: + response_json = json.loads(response) + except ValueError as ve: + error = f'Response is not valid JSON. {str(ve)}' + + if not error and response_json: + try: + validate(response_json, schema_json) + except ValidationError as ve: + error = f'Response JSON is not in the desired format. {str(ve)}' + + if not error and response_json: + try: + event_timeline = response_json['video_event_timeline'] + for event in event_timeline: + # update values for later use + event["timestamp_start"] = _get_timestamp_value(event["timestamp_start"]) + event["timestamp_end"] = _get_timestamp_value(event["timestamp_end"]) + except ValueError as ve: + error = f'Response JSON is not in the desired format. {str(ve)}' + + if error: log.warning(error) - log.warning(str(ve)) log.warning(f'Failed {attempts["base"] + 1} of {max_attempts} base attempts.') attempts['base'] += 1 - return response_json, error - try: - validate(response_json, schema_json) - except ValidationError as ve: - error = 'Response JSON is not in the desired format.' 
- log.warning(error) - log.warning(str(ve)) - log.warning(f'Failed {attempts["base"] + 1} of {max_attempts} base attempts.') - attempts['base'] += 1 - return response_json, error - - return response_json, None + return response_json, error - def _check_timeline(self, threshold: float, attempts: dict, max_attempts: int, - segment_start_time: float, segment_stop_time: float, event_timeline: list - ) -> Union[str, None]: + def _check_timeline(self, target_threshold: float, accept_threshold: float, attempts: dict, max_attempts: int, + segment_start_time: float, segment_stop_time: float, response_json: dict + ) -> Tuple[bool, Union[str, None]]: - error = None + event_timeline = response_json['video_event_timeline'] # type: ignore + + acceptable_checks = dict( + near_seg_start = False, + near_seg_stop = False) + + hard_error = None + soft_error = None for event in event_timeline: - timestamp_start = _get_timestamp_value(event["timestamp_start"]) - timestamp_end = _get_timestamp_value(event["timestamp_end"]) + timestamp_start = event["timestamp_start"] + timestamp_end = event["timestamp_end"] if timestamp_start < 0: - error = (f'Timeline event start time of {timestamp_start} < 0.') + hard_error = (f'Timeline event start time of {timestamp_start} < 0.') break if timestamp_end < 0: - error = (f'Timeline event end time of {timestamp_end} < 0.') + hard_error = (f'Timeline event end time of {timestamp_end} < 0.') break if timestamp_end < timestamp_start: - error = (f'Timeline event end time is less than event start time. ' + hard_error = (f'Timeline event end time is less than event start time. ' f'{timestamp_end} < {timestamp_start}.') break - - if (segment_start_time - timestamp_start) > threshold: - error = (f'Timeline event start time occurs too soon before segment start time. 
' - f'({segment_start_time} - {timestamp_start}) > {threshold}.') - break - if (timestamp_end - segment_stop_time) > threshold: - error = (f'Timeline event end time occurs too late after segment stop time. ' - f'({timestamp_end} - {segment_stop_time}) > {threshold}.') - break - - if not error: + minmax_errors = [] + if not hard_error: min_event_start = min(list(map(lambda d: _get_timestamp_value(d.get('timestamp_start')), filter(lambda d: 'timestamp_start' in d, event_timeline)))) - - if abs(segment_start_time - min_event_start) > threshold: - error = (f'Min timeline event start time not close enough to segment start time. ' - f'abs({segment_start_time} - {min_event_start}) > {threshold}.') - - if not error: + max_event_end = max(list(map(lambda d: _get_timestamp_value(d.get('timestamp_end')), filter(lambda d: 'timestamp_end' in d, event_timeline)))) - if abs(max_event_end - segment_stop_time) > threshold: - error = (f'Max timeline event end time not close enough to segment stop time. ' - f'abs({max_event_end} - {segment_stop_time}) > {threshold}.') + if abs(segment_start_time - min_event_start) > target_threshold: + minmax_errors.append((f'Min timeline event start time not close enough to segment start time. ' + f'abs({segment_start_time} - {min_event_start}) > {target_threshold}.')) + + if abs(max_event_end - segment_stop_time) > target_threshold: + minmax_errors.append((f'Max timeline event end time not close enough to segment stop time. 
' + f'abs({max_event_end} - {segment_stop_time}) > {target_threshold}.')) + + if accept_threshold >= 0: + acceptable_checks['near_seg_start'] = abs(segment_start_time - min_event_start) <= accept_threshold + + acceptable_checks['near_seg_stop'] = abs(max_event_end - segment_stop_time) <= accept_threshold + + acceptable = not hard_error and all(acceptable_checks.values()) + + if len(minmax_errors) > 0: + soft_error = minmax_errors.pop() + + error = None + if hard_error: + error = hard_error + elif soft_error: + error = soft_error if error: log.warning(error) log.warning(f'Failed {attempts["timeline"] + 1} of {max_attempts} timeline attempts.') attempts['timeline'] += 1 - return error - - return None + + return acceptable, error def _create_segment_summary_track(self, job: mpf.VideoJob, response_json: dict) -> mpf.VideoTrack: @@ -263,8 +294,8 @@ def _create_tracks(self, job: mpf.VideoJob, response_json: dict) -> Iterable[mpf for event in response_json['video_event_timeline']: # get offset start/stop times in milliseconds - event_start_time = int(_get_timestamp_value(event['timestamp_start']) * 1000) - event_stop_time = int(_get_timestamp_value(event['timestamp_end']) * 1000) + event_start_time = int(event['timestamp_start'] * 1000) + event_stop_time = int(event['timestamp_end'] * 1000) offset_start_frame = int((event_start_time * video_fps) / 1000) offset_stop_frame = int((event_stop_time * video_fps) / 1000) - 1 @@ -331,13 +362,18 @@ def _create_tracks(self, job: mpf.VideoJob, response_json: dict) -> Iterable[mpf log.info('Processing complete. Video segment %s summarized in %d tracks.' 
% (segment_id, len(tracks))) return tracks + def _get_timestamp_value(seconds: Any) -> float: if isinstance(seconds, str): - secval = float(seconds.replace('s', '')) + if re.match(r"^\s*\d+(\.\d*)?\s*[Ss]?$", seconds): + secval = float(re.sub('s', '', seconds, flags=re.IGNORECASE)) + else: + raise mpf.DetectionError.DETECTION_FAILED.exception(f'Invalid timestamp: {seconds}') else: secval = float(seconds) return secval + def _parse_properties(props: Mapping[str, str], segment_start_time: float) -> dict: process_fps = mpf_util.get_property( props, 'PROCESS_FPS', 1) @@ -356,6 +392,8 @@ def _parse_properties(props: Mapping[str, str], segment_start_time: float) -> di props, 'GENERATION_MAX_ATTEMPTS', 5) timeline_check_target_threshold = mpf_util.get_property( props, 'TIMELINE_CHECK_TARGET_THRESHOLD', 10) + timeline_check_acceptable_threshold = mpf_util.get_property( + props, 'TIMELINE_CHECK_ACCEPTABLE_THRESHOLD', 30) generation_prompt = _read_file(generation_prompt_path) % (segment_start_time) @@ -373,7 +411,8 @@ def _parse_properties(props: Mapping[str, str], segment_start_time: float) -> di generation_json_schema = generation_json_schema, system_prompt = system_prompt, generation_max_attempts = generation_max_attempts, - timeline_check_target_threshold = timeline_check_target_threshold + timeline_check_target_threshold = timeline_check_target_threshold, + timeline_check_acceptable_threshold = timeline_check_acceptable_threshold ) @@ -400,6 +439,7 @@ def __init__(self, start_cmd: List[str]): env=env) self._socket = parent_socket.makefile('rwb') + def __del__(self): print("Terminating subprocess...") self._socket.close() @@ -407,6 +447,7 @@ def __del__(self): self._proc.wait() print("Subprocess terminated") + def send_job_get_response(self, config: dict): job_bytes = pickle.dumps(config) self._socket.write(len(job_bytes).to_bytes(4, 'little')) diff --git a/python/LlamaVideoSummarization/plugin-files/descriptor/descriptor.json 
b/python/LlamaVideoSummarization/plugin-files/descriptor/descriptor.json index 45e15329..55f7943a 100644 --- a/python/LlamaVideoSummarization/plugin-files/descriptor/descriptor.json +++ b/python/LlamaVideoSummarization/plugin-files/descriptor/descriptor.json @@ -65,10 +65,16 @@ }, { "name": "TIMELINE_CHECK_TARGET_THRESHOLD", - "description": "Specifies the number of seconds that video events can occur before or after video segment bounds. If exceeded, another attempt will be made to generate the output. Set to -1 to disable check.", + "description": "Specifies the number of seconds that video events can occur before or after video segment bounds. If exceeded, another attempt will be made to generate the output. See also the TIMELINE_CHECK_ACCEPTABLE_THRESHOLD property. Set to < 0 to disable check (e.g. -1).", "type": "INT", "defaultValue": "10" }, + { + "name": "TIMELINE_CHECK_ACCEPTABLE_THRESHOLD", + "description": "A secondary timeline validation threshold that specifies the number of seconds video events can occur before or after video segment bounds, which will result in an \"acceptable\" timeline. Additional attempts will be made to generate a timeline within the \"desired\" range of TIMELINE_CHECK_TARGET_THRESHOLD, until GENERATION_MAX_ATTEMPTS is reached, after which the \"acceptable\" timeline is returned, or the component responds with an error. Set to < 0 to disable check (e.g. -1).", + "type": "INT", + "defaultValue": "30" + }, { "name": "TARGET_SEGMENT_LENGTH", "description": "Default segment length is 180 seconds. 
Set to -1 to disable segmenting the video.", diff --git a/python/LlamaVideoSummarization/tests/test_llama_video_summarization.py b/python/LlamaVideoSummarization/tests/test_llama_video_summarization.py index 0b2de0ca..2f3c96c8 100644 --- a/python/LlamaVideoSummarization/tests/test_llama_video_summarization.py +++ b/python/LlamaVideoSummarization/tests/test_llama_video_summarization.py @@ -26,6 +26,7 @@ from __future__ import annotations +import copy import json import logging import os @@ -69,7 +70,7 @@ }, { "timestamp_start": "5.0", - "timestamp_end": "6.8", + "timestamp_end": "6.8s", "description": "The cat looks back at the camera and then walks away." } ] @@ -203,9 +204,9 @@ def run_patched_job(self, component, job, response): self.mock_child_process_send_job.return_value = response return component.get_detections_from_video(job) + - - def assert_detection_region(self, detection, frame_width, frame_height): + def assert_detection_region(self, detection, frame_width, frame_height): self.assertEqual(0, detection.x_left_upper) self.assertEqual(0, detection.y_left_upper) self.assertEqual(frame_width, detection.width) @@ -294,10 +295,11 @@ def test_invalid_timeline(self): job = mpf.VideoJob('cat job', str(TEST_DATA / 'cat.mp4'), 0, 15000, { - "GENERATION_MAX_ATTEMPTS" : "1" + "GENERATION_MAX_ATTEMPTS" : "1", + "TIMELINE_CHECK_TARGET_THRESHOLD" : "10" }, CAT_VIDEO_PROPERTIES, {}) - + with self.assertRaises(mpf.DetectionException) as cm: self.run_patched_job(component, job, json.dumps( { @@ -340,6 +342,59 @@ def test_invalid_json_response(self): self.assertEqual(mpf.DetectionError.DETECTION_FAILED, cm.exception.error_code) self.assertIn("not valid JSON", str(cm.exception)) + + def test_schema_check(self): + component = LlamaVideoSummarizationComponent() + + job = mpf.VideoJob('cat job', str(TEST_DATA / 'cat.mp4'), 0, 171, + { + "GENERATION_MAX_ATTEMPTS" : "1" + }, + CAT_VIDEO_PROPERTIES, {}) + + with self.assertRaises(mpf.DetectionException) as cm: + 
self.run_patched_job(component, job, json.dumps( + { + "video_summary": "This is a video of a cat.", + "video_event_timeline": [ + { + "timestamp_start": "0.00", + "bad": "8.04", + "description": "The cat is sitting on the cobblestone street, looking around." + } + ] + })) # don't care about results + + self.assertEqual(mpf.DetectionError.DETECTION_FAILED, cm.exception.error_code) + self.assertIn("'timestamp_end' is a required property", str(cm.exception)) + + + def test_invalid_timestamp(self): + component = LlamaVideoSummarizationComponent() + + job = mpf.VideoJob('cat job', str(TEST_DATA / 'cat.mp4'), 0, 171, + { + "GENERATION_MAX_ATTEMPTS" : "1" + }, + CAT_VIDEO_PROPERTIES, {}) + + with self.assertRaises(mpf.DetectionException) as cm: + self.run_patched_job(component, job, json.dumps( + { + "video_summary": "This is a video of a cat.", + "video_event_timeline": [ + { + "timestamp_start": "7:12", + "timestamp_end": "8:04", + "description": "The cat is sitting on the cobblestone street, looking around." + } + ] + })) # don't care about results + + self.assertEqual(mpf.DetectionError.DETECTION_FAILED, cm.exception.error_code) + self.assertIn("Invalid timestamp: ", str(cm.exception)) + + def test_empty_response(self): component = LlamaVideoSummarizationComponent() @@ -355,17 +410,21 @@ def test_empty_response(self): self.assertEqual(mpf.DetectionError.DETECTION_FAILED, cm.exception.error_code) self.assertIn("Empty response", str(cm.exception)) + def test_timeline_integrity(self): component = LlamaVideoSummarizationComponent() - DRONE_TIMELINE_SEGMENT_1['video_event_timeline'].append({ + drone_timeline_segment_1 = copy.deepcopy(DRONE_TIMELINE_SEGMENT_1) + drone_timeline_segment_2 = copy.deepcopy(DRONE_TIMELINE_SEGMENT_2) + + drone_timeline_segment_1['video_event_timeline'].append({ "timestamp_start": 185.81, "timestamp_end": 235.77, "description": "The camera zooms in on the protesters, showing their faces and the details of their signs." 
}) # test min/max track frame overrides (with TIMELINE_CHECK_TARGET_THRESHOLD=-1) - DRONE_TIMELINE_SEGMENT_1["video_event_timeline"].append({ + drone_timeline_segment_1["video_event_timeline"].append({ "timestamp_start": 236.77, "timestamp_end": 179.96, "description": "The camera pans out to show the entire scene, including the fountain and the surrounding buildings." @@ -387,8 +446,8 @@ def test_timeline_integrity(self): feed_forward_track=None) # event that starts within range but ends outside of valid frames - DRONE_TIMELINE_SEGMENT_1["video_event_timeline"][2]["timestamp_end"] = 185.0 - job1_results = self.run_patched_job(component, job1, json.dumps(DRONE_TIMELINE_SEGMENT_1)) + drone_timeline_segment_1["video_event_timeline"][2]["timestamp_end"] = 185.0 + job1_results = self.run_patched_job(component, job1, json.dumps(drone_timeline_segment_1)) self.assertEqual(6, len(job1_results)) self.assertIn('SEGMENT SUMMARY', job1_results[0].detection_properties) @@ -421,69 +480,56 @@ def test_timeline_integrity(self): PROCESS_FPS=1, MAX_FRAMES=180, MAX_NEW_TOKENS=4096, - TIMELINE_CHECK_TARGET_THRESHOLD=20 + TIMELINE_CHECK_TARGET_THRESHOLD=20, + TIMELINE_CHECK_ACCEPTABLE_THRESHOLD=20 ), media_properties=DRONE_VIDEO_PROPERTIES, feed_forward_track=None) - + with self.assertRaises(mpf.DetectionException) as cm: - self.run_patched_job(component, job2, json.dumps(DRONE_TIMELINE_SEGMENT_2)) + self.run_patched_job(component, job2, json.dumps(drone_timeline_segment_2)) self.assertEqual(mpf.DetectionError.DETECTION_FAILED, cm.exception.error_code) self.assertIn("Timeline event start time of -45.2 < 0.", str(cm.exception)) - DRONE_TIMELINE_SEGMENT_2['video_event_timeline'].pop(0) - - with self.assertRaises(mpf.DetectionException) as cm: - self.run_patched_job(component, job2, json.dumps(DRONE_TIMELINE_SEGMENT_2)) - - self.assertEqual(mpf.DetectionError.DETECTION_FAILED, cm.exception.error_code) - self.assertIn("Timeline event start time occurs too soon before segment start time. 
(179.9798 - 0.0) > 20.", str(cm.exception)) - - DRONE_TIMELINE_SEGMENT_2['video_event_timeline'].pop(0) + drone_timeline_segment_2['video_event_timeline'].pop(0) + drone_timeline_segment_2['video_event_timeline'].pop(0) + drone_timeline_segment_2['video_event_timeline'][-1]["timestamp_end"] = 295.0 with self.assertRaises(mpf.DetectionException) as cm: - self.run_patched_job(component, job2, json.dumps(DRONE_TIMELINE_SEGMENT_2)) - - self.assertEqual(mpf.DetectionError.DETECTION_FAILED, cm.exception.error_code) - self.assertIn("Timeline event end time occurs too late after segment stop time. (381.17 - 299.96633333333335) > 20.", str(cm.exception)) - - DRONE_TIMELINE_SEGMENT_2['video_event_timeline'][-1]["timestamp_end"] = 295.0 - - with self.assertRaises(mpf.DetectionException) as cm: - self.run_patched_job(component, job2, json.dumps(DRONE_TIMELINE_SEGMENT_2)) + self.run_patched_job(component, job2, json.dumps(drone_timeline_segment_2)) self.assertEqual(mpf.DetectionError.DETECTION_FAILED, cm.exception.error_code) self.assertIn("Timeline event end time is less than event start time. 
295.0 < 299.42.", str(cm.exception)) - DRONE_TIMELINE_SEGMENT_2['video_event_timeline'].pop() - event1 = DRONE_TIMELINE_SEGMENT_2['video_event_timeline'].pop(0) + drone_timeline_segment_2['video_event_timeline'].pop() + event1 = drone_timeline_segment_2['video_event_timeline'].pop(0) with self.assertRaises(mpf.DetectionException) as cm: - self.run_patched_job(component, job2, json.dumps(DRONE_TIMELINE_SEGMENT_2)) + self.run_patched_job(component, job2, json.dumps(drone_timeline_segment_2)) self.assertEqual(mpf.DetectionError.DETECTION_FAILED, cm.exception.error_code) self.assertIn("Min timeline event start time not close enough to segment start time.", str(cm.exception)) - DRONE_TIMELINE_SEGMENT_2['video_event_timeline'].insert(0, event1) - DRONE_TIMELINE_SEGMENT_2['video_event_timeline'][1]["timestamp_end"] = -5.0 # 298.46 + drone_timeline_segment_2['video_event_timeline'].insert(0, event1) + drone_timeline_segment_2['video_event_timeline'][1]["timestamp_end"] = -5.0 # 298.46 with self.assertRaises(mpf.DetectionException) as cm: - self.run_patched_job(component, job2, json.dumps(DRONE_TIMELINE_SEGMENT_2)) + self.run_patched_job(component, job2, json.dumps(drone_timeline_segment_2)) self.assertEqual(mpf.DetectionError.DETECTION_FAILED, cm.exception.error_code) self.assertIn("Timeline event end time of -5.0 < 0.", str(cm.exception)) - DRONE_TIMELINE_SEGMENT_2['video_event_timeline'][1]["timestamp_end"] = 250.0 + drone_timeline_segment_2['video_event_timeline'][1]["timestamp_end"] = 250.0 with self.assertRaises(mpf.DetectionException) as cm: - self.run_patched_job(component, job2, json.dumps(DRONE_TIMELINE_SEGMENT_2)) + self.run_patched_job(component, job2, json.dumps(drone_timeline_segment_2)) self.assertEqual(mpf.DetectionError.DETECTION_FAILED, cm.exception.error_code) self.assertIn("Max timeline event end time not close enough to segment stop time.", str(cm.exception)) - DRONE_TIMELINE_SEGMENT_2['video_event_timeline'][1]["timestamp_end"] = 298.46 - job2_results 
= self.run_patched_job(component, job2, json.dumps(DRONE_TIMELINE_SEGMENT_2)) + drone_timeline_segment_2['video_event_timeline'][1]["timestamp_end"] = 298.46 + job2_results = self.run_patched_job(component, job2, json.dumps(drone_timeline_segment_2)) self.assertEqual(3, len(job2_results)) self.assertIn('SEGMENT SUMMARY', job2_results[0].detection_properties) @@ -505,5 +551,78 @@ def test_timeline_integrity(self): self.assertIsNotNone(job2_results[2].frame_locations[8943]) + def test_timeline_acceptable_threshold(self): + component = LlamaVideoSummarizationComponent() + drone_timeline_segment_1 = copy.deepcopy(DRONE_TIMELINE_SEGMENT_1) + drone_timeline_segment_2 = copy.deepcopy(DRONE_TIMELINE_SEGMENT_2) + + job = mpf.VideoJob( + job_name='drone.mp4-segment-1', + data_uri=str( TEST_DATA / 'drone.mp4'), + start_frame=0, + stop_frame=5393, # 5393 + 1 = 5394 --> 179.9798 secs + job_properties=dict( + GENERATION_MAX_ATTEMPTS=2, + PROCESS_FPS=1, + MAX_FRAMES=180, + MAX_NEW_TOKENS=4096, + TIMELINE_CHECK_TARGET_THRESHOLD=10, + TIMELINE_CHECK_ACCEPTABLE_THRESHOLD=5 # must be higher than 10 + ), + media_properties=DRONE_VIDEO_PROPERTIES, + feed_forward_track=None) + + with self.assertRaises(mpf.DetectionException) as cm: + self.run_patched_job(component, job, json.dumps(drone_timeline_segment_1)) + + self.assertEqual(mpf.DetectionError.INVALID_PROPERTY, cm.exception.error_code) + self.assertIn("TIMELINE_CHECK_ACCEPTABLE_THRESHOLD must be >= TIMELINE_CHECK_TARGET_THRESHOLD.", str(cm.exception)) + + job1 = mpf.VideoJob( + job_name='drone.mp4-segment-1', + data_uri=str( TEST_DATA / 'drone.mp4'), + start_frame=0, + stop_frame=5393, # 5393 + 1 = 5394 --> 179.9798 secs + job_properties=dict( + GENERATION_MAX_ATTEMPTS=2, + PROCESS_FPS=1, + MAX_FRAMES=180, + MAX_NEW_TOKENS=4096, + TIMELINE_CHECK_TARGET_THRESHOLD=10, + TIMELINE_CHECK_ACCEPTABLE_THRESHOLD=30 + ), + media_properties=DRONE_VIDEO_PROPERTIES, + feed_forward_track=None) + + 
drone_timeline_segment_1["video_event_timeline"][0]["timestamp_start"] += 11.0 + drone_timeline_segment_1["video_event_timeline"][2]["timestamp_end"] += 20.0 + job1_results = self.run_patched_job(component, job1, json.dumps(drone_timeline_segment_1)) + self.assertEqual(4, len(job1_results)) + + + job2 = mpf.VideoJob( + job_name='drone.mp4-segment-2', + data_uri=str( TEST_DATA / 'drone.mp4'), + start_frame=5394, + stop_frame=8989, # 8989 - 5394 + 1 = 3596 --> 119.9865 secs + job_properties=dict( + GENERATION_MAX_ATTEMPTS=2, + PROCESS_FPS=1, + MAX_FRAMES=180, + MAX_NEW_TOKENS=4096, + TIMELINE_CHECK_TARGET_THRESHOLD=10, + TIMELINE_CHECK_ACCEPTABLE_THRESHOLD=30 + ), + media_properties=DRONE_VIDEO_PROPERTIES, + feed_forward_track=None) + + drone_timeline_segment_2["video_event_timeline"].pop(0) + drone_timeline_segment_2["video_event_timeline"][0]["timestamp_start"] = 179.98 - 20 + drone_timeline_segment_2["video_event_timeline"][0]["timestamp_end"] = 178.0 + drone_timeline_segment_2["video_event_timeline"][-1]["timestamp_end"] = 325.0 + job2_results = self.run_patched_job(component, job2, json.dumps(drone_timeline_segment_2)) + self.assertEqual(5, len(job2_results)) + + if __name__ == "__main__": unittest.main(verbosity=2) From b40f3e8115e0d1d7de4b45dd4d6ee03b5c787225 Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Tue, 14 Oct 2025 03:17:47 -0400 Subject: [PATCH 3/5] Updating documentation. Adding license file for NLLB component. 
--- python/AzureTranslation/LICENSE | 9 +- python/NllbTranslation/LICENSE | 84 ++++ python/NllbTranslation/README.md | 441 +++++++++--------- .../plugin-files/descriptor/descriptor.json | 2 +- 4 files changed, 315 insertions(+), 221 deletions(-) create mode 100644 python/NllbTranslation/LICENSE diff --git a/python/AzureTranslation/LICENSE b/python/AzureTranslation/LICENSE index 2344b622..847284f6 100644 --- a/python/AzureTranslation/LICENSE +++ b/python/AzureTranslation/LICENSE @@ -19,15 +19,18 @@ is used in a deployment or embedded within another project, it is requested that you send an email to opensource@mitre.org in order to let us know where this software is being used. +The nlp_text_splitter utlity uses the following sentence detection libraries: + ***************************************************************************** -The WtP, "Where the Point", sentence segmentation library falls under the MIT License: +The WtP, "Where the Point", and SaT, "Segment any Text" sentence segmentation +library falls under the MIT License: -https://github.com/bminixhofer/wtpsplit/blob/main/LICENSE +https://github.com/segment-any-text/wtpsplit/blob/main/LICENSE MIT License -Copyright (c) 2024 Benjamin Minixhofer +Copyright (c) 2024 Benjamin Minixhofer, Markus Frohmann, Igor Sterner Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/python/NllbTranslation/LICENSE b/python/NllbTranslation/LICENSE new file mode 100644 index 00000000..ef7840e2 --- /dev/null +++ b/python/NllbTranslation/LICENSE @@ -0,0 +1,84 @@ +/***************************************************************************** +* Copyright 2024 The MITRE Corporation * +* * +* Licensed under the Apache License, Version 2.0 (the "License"); * +* you may not use this file except in compliance with the License. 
* +* You may obtain a copy of the License at * +* * +* http://www.apache.org/licenses/LICENSE-2.0 * +* * +* Unless required by applicable law or agreed to in writing, software * +* distributed under the License is distributed on an "AS IS" BASIS, * +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * +* See the License for the specific language governing permissions and * +* limitations under the License. * +******************************************************************************/ + +This project contains content developed by The MITRE Corporation. If this code +is used in a deployment or embedded within another project, it is requested +that you send an email to opensource@mitre.org in order to let us know where +this software is being used. + + +The "No Language Left Behind" (NLLB) models on Hugging Face are distributed +under the CC-BY-NC-4.0 license (Creative Commons Attribution-NonCommercial 4.0), +hence they must be downloaded and run separately under non-commercial restrictions. + +The code within this repository falls under Apache 2.0 License. 
+
+The nlp_text_splitter utility uses the following sentence detection libraries:
+
+*****************************************************************************
+
+The WtP, "Where the Point", and SaT, "Segment any Text" sentence segmentation
+library falls under the MIT License:
+
+https://github.com/segment-any-text/wtpsplit/blob/main/LICENSE
+
+MIT License
+
+Copyright (c) 2024 Benjamin Minixhofer, Markus Frohmann, Igor Sterner
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+ +***************************************************************************** + +The spaCy Natural Language Processing library falls under the MIT License: + +The MIT License (MIT) + +Copyright (C) 2016-2024 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
\ No newline at end of file diff --git a/python/NllbTranslation/README.md b/python/NllbTranslation/README.md index ad0b1590..00291d0c 100644 --- a/python/NllbTranslation/README.md +++ b/python/NllbTranslation/README.md @@ -8,12 +8,12 @@ To accommodate smaller deployment enviroments, this component can use smaller NL # Recommended System Requirements -- **GPU (recommended for default 3.3B model)** - - NVIDIA GPU with CUDA support - - At least **24 GB of GPU VRAM** +- **GPU (recommended for default 3.3B model)** + - NVIDIA GPU with CUDA support + - At least **24 GB of GPU VRAM** -- **CPU-only (not recommended for 3.3B model unless sufficient memory is available)** - - At least **32 GB of system RAM** +- **CPU-only (not recommended for 3.3B model unless sufficient memory is available)** + - At least **32 GB of system RAM** ### Example Model Requirements @@ -47,15 +47,22 @@ The below properties can be optionally provided to alter the behavior of the com - `NLLB_MODEL`: Specifies which No Language Left Behind (NLLB) model to use. The default model is `facebook/nllb-200-3.3B` and is included in the pre-built NLLB Translation docker image. If this property is configured with a different model, the component will attempt to download the specified model from Hugging Face. See [Recommended System Requirements](#recommended-system-requirements) for additional information. -- `SENTENCE_MODEL`: Specifies the desired WtP or spaCy sentence detection model. For CPU - and runtime considerations, the author of WtP recommends using `wtp-bert-mini`. More - advanced WtP models that use GPU resources (up to ~8 GB) are also available. See list of - WtP model names - [here](https://github.com/bminixhofer/wtpsplit?tab=readme-ov-file#available-models). The - only available spaCy model (for text with unknown language) is `xx_sent_ud_sm`. +- `SENTENCE_MODEL`: Specifies the desired SaT/WtP or spaCy sentence detection model. 
For CPU + and runtime considerations, the authors of SaT/WtP recommends using `sat-3l-sm` or `wtp-bert-mini`. + More advanced SaT/WtP models that use GPU resources (up to ~8 GB for WtP) are also available. + + See list of model names below: + + - [WtP Models](https://github.com/segment-any-text/wtpsplit/tree/1.3.0?tab=readme-ov-file#available-models) + - [SaT Models](https://github.com/bminixhofer/wtpsplit?tab=readme-ov-file#available-models). + + Please note, the only available spaCy model (for text with unknown language) is `xx_sent_ud_sm`. + + Review list of languages supported by SaT/WtP below: + + - [WtP Models](https://github.com/segment-any-text/wtpsplit/tree/1.3.0?tab=readme-ov-file#supported-languages) + - [SaT Models](https://github.com/bminixhofer/wtpsplit?tab=readme-ov-file#supported-languages) - Review list of languages supported by WtP - [here](https://github.com/bminixhofer/wtpsplit?tab=readme-ov-file#supported-languages). Review models and languages supported by spaCy [here](https://spacy.io/models). - `SENTENCE_SPLITTER_CHAR_COUNT`: Specifies maximum number of characters to process @@ -87,209 +94,209 @@ The below properties can be optionally provided to alter the behavior of the com # Language Identifiers The following are the ISO 639-3 and ISO 15924 codes, and their corresponding languages which Nllb can translate. 
-| ISO-639-3 | ISO-15924 | Language +| ISO-639-3 | ISO-15924 | Language | --------- | ---------- | ---------------------------------- -| ace | Arab | Acehnese Arabic -| ace | Latn | Acehnese Latin -| acm | Arab | Mesopotamian Arabic -| acq | Arab | Ta’izzi-Adeni Arabic -| aeb | Arab | Tunisian Arabic -| afr | Latn | Afrikaans -| ajp | Arab | South Levantine Arabic -| aka | Latn | Akan -| amh | Ethi | Amharic -| apc | Arab | North Levantine Arabic -| arb | Arab | Modern Standard Arabic +| ace | Arab | Acehnese Arabic +| ace | Latn | Acehnese Latin +| acm | Arab | Mesopotamian Arabic +| acq | Arab | Ta’izzi-Adeni Arabic +| aeb | Arab | Tunisian Arabic +| afr | Latn | Afrikaans +| ajp | Arab | South Levantine Arabic +| aka | Latn | Akan +| amh | Ethi | Amharic +| apc | Arab | North Levantine Arabic +| arb | Arab | Modern Standard Arabic | arb | Latn | Modern Standard Arabic (Romanized) -| ars | Arab | Najdi Arabic -| ary | Arab | Moroccan Arabic -| arz | Arab | Egyptian Arabic -| asm | Beng | Assamese -| ast | Latn | Asturian -| awa | Deva | Awadhi -| ayr | Latn | Central Aymara -| azb | Arab | South Azerbaijani -| azj | Latn | North Azerbaijani -| bak | Cyrl | Bashkir -| bam | Latn | Bambara -| ban | Latn | Balinese -| bel | Cyrl | Belarusian -| bem | Latn | Bemba -| ben | Beng | Bengali -| bho | Deva | Bhojpuri -| bjn | Arab | Banjar (Arabic script) -| bjn | Latn | Banjar (Latin script) -| bod | Tibt | Standard Tibetan -| bos | Latn | Bosnian -| bug | Latn | Buginese -| bul | Cyrl | Bulgarian -| cat | Latn | Catalan -| ceb | Latn | Cebuano -| ces | Latn | Czech -| cjk | Latn | Chokwe -| ckb | Arab | Central Kurdish -| crh | Latn | Crimean Tatar -| cym | Latn | Welsh -| dan | Latn | Danish -| deu | Latn | German -| dik | Latn | Southwestern Dinka -| dyu | Latn | Dyula -| dzo | Tibt | Dzongkha -| ell | Grek | Greek -| eng | Latn | English -| epo | Latn | Esperanto -| est | Latn | Estonian -| eus | Latn | Basque -| ewe | Latn | Ewe -| fao | Latn | Faroese -| fij | Latn 
| Fijian -| fin | Latn | Finnish -| fon | Latn | Fon -| fra | Latn | French -| fur | Latn | Friulian -| fuv | Latn | Nigerian Fulfulde -| gla | Latn | Scottish Gaelic -| gle | Latn | Irish -| glg | Latn | Galician -| grn | Latn | Guarani -| guj | Gujr | Gujarati -| hat | Latn | Haitian Creole -| hau | Latn | Hausa -| heb | Hebr | Hebrew -| hin | Deva | Hindi -| hne | Deva | Chhattisgarhi -| hrv | Latn | Croatian -| hun | Latn | Hungarian -| hye | Armn | Armenian -| ibo | Latn | Igbo -| ilo | Latn | Ilocano -| ind | Latn | Indonesian -| isl | Latn | Icelandic -| ita | Latn | Italian -| jav | Latn | Javanese -| jpn | Jpan | Japanese -| kab | Latn | Kabyle -| kac | Latn | Jingpho -| kam | Latn | Kamba -| kan | Knda | Kannada -| kas | Arab | Kashmiri (Arabic script) -| kas | Deva | Kashmiri (Devanagari script) -| kat | Geor | Georgian -| knc | Arab | Central Kanuri (Arabic script) -| knc | Latn | Central Kanuri (Latin script) -| kaz | Cyrl | Kazakh -| kbp | Latn | Kabiyè -| kea | Latn | Kabuverdianu -| khm | Khmr | Khmer -| kik | Latn | Kikuyu -| kin | Latn | Kinyarwanda -| kir | Cyrl | Kyrgyz -| kmb | Latn | Kimbundu -| kmr | Latn | Northern Kurdish -| kon | Latn | Kikongo -| kor | Hang | Korean -| lao | Laoo | Lao -| lij | Latn | Ligurian -| lim | Latn | Limburgish -| lin | Latn | Lingala -| lit | Latn | Lithuanian -| lmo | Latn | Lombard -| ltg | Latn | Latgalian -| ltz | Latn | Luxembourgish -| lua | Latn | Luba-Kasai -| lug | Latn | Ganda -| luo | Latn | Luo -| lus | Latn | Mizo -| lvs | Latn | Standard Latvian -| mag | Deva | Magahi -| mai | Deva | Maithili -| mal | Mlym | Malayalam -| mar | Deva | Marathi -| min | Arab | Minangkabau (Arabic script) -| min | Latn | Minangkabau (Latin script) -| mkd | Cyrl | Macedonian -| plt | Latn | Plateau Malagasy -| mlt | Latn | Maltese -| mni | Beng | Meitei (Bengali script) -| khk | Cyrl | Halh Mongolian -| mos | Latn | Mossi -| mri | Latn | Maori -| mya | Mymr | Burmese -| nld | Latn | Dutch -| nno | Latn | Norwegian 
Nynorsk -| nob | Latn | Norwegian Bokmål -| npi | Deva | Nepali -| nso | Latn | Northern Sotho -| nus | Latn | Nuer -| nya | Latn | Nyanja -| oci | Latn | Occitan -| gaz | Latn | West Central Oromo -| ory | Orya | Odia -| pag | Latn | Pangasinan -| pan | Guru | Eastern Panjabi -| pap | Latn | Papiamento -| pes | Arab | Western Persian -| pol | Latn | Polish -| por | Latn | Portuguese -| prs | Arab | Dari -| pbt | Arab | Southern Pashto -| quy | Latn | Ayacucho Quechua -| ron | Latn | Romanian -| run | Latn | Rundi -| rus | Cyrl | Russian -| sag | Latn | Sango -| san | Deva | Sanskrit -| sat | Olck | Santali -| scn | Latn | Sicilian -| shn | Mymr | Shan -| sin | Sinh | Sinhala -| slk | Latn | Slovak -| slv | Latn | Slovenian -| smo | Latn | Samoan -| sna | Latn | Shona -| snd | Arab | Sindhi -| som | Latn | Somali -| sot | Latn | Southern Sotho -| spa | Latn | Spanish -| als | Latn | Tosk Albanian -| srd | Latn | Sardinian -| srp | Cyrl | Serbian -| ssw | Latn | Swati -| sun | Latn | Sundanese -| swe | Latn | Swedish -| swh | Latn | Swahili -| szl | Latn | Silesian -| tam | Taml | Tamil -| tat | Cyrl | Tatar -| tel | Telu | Telugu -| tgk | Cyrl | Tajik -| tgl | Latn | Tagalog -| tha | Thai | Thai -| tir | Ethi | Tigrinya -| taq | Latn | Tamasheq (Latin script) -| taq | Tfng | Tamasheq (Tifinagh script) -| tpi | Latn | Tok Pisin -| tsn | Latn | Tswana -| tso | Latn | Tsonga -| tuk | Latn | Turkmen -| tum | Latn | Tumbuka -| tur | Latn | Turkish -| twi | Latn | Twi -| tzm | Tfng | Central Atlas Tamazight -| uig | Arab | Uyghur -| ukr | Cyrl | Ukrainian -| umb | Latn | Umbundu -| urd | Arab | Urdu -| uzn | Latn | Northern Uzbek -| vec | Latn | Venetian -| vie | Latn | Vietnamese -| war | Latn | Waray -| wol | Latn | Wolof -| xho | Latn | Xhosa -| ydd | Hebr | Eastern Yiddish -| yor | Latn | Yoruba -| yue | Hant | Yue Chinese -| zho | Hans | Chinese (Simplified) -| zho | Hant | Chinese (Traditional) -| zsm | Latn | Standard Malay -| zul | Latn | Zulu +| ars | Arab | 
Najdi Arabic +| ary | Arab | Moroccan Arabic +| arz | Arab | Egyptian Arabic +| asm | Beng | Assamese +| ast | Latn | Asturian +| awa | Deva | Awadhi +| ayr | Latn | Central Aymara +| azb | Arab | South Azerbaijani +| azj | Latn | North Azerbaijani +| bak | Cyrl | Bashkir +| bam | Latn | Bambara +| ban | Latn | Balinese +| bel | Cyrl | Belarusian +| bem | Latn | Bemba +| ben | Beng | Bengali +| bho | Deva | Bhojpuri +| bjn | Arab | Banjar (Arabic script) +| bjn | Latn | Banjar (Latin script) +| bod | Tibt | Standard Tibetan +| bos | Latn | Bosnian +| bug | Latn | Buginese +| bul | Cyrl | Bulgarian +| cat | Latn | Catalan +| ceb | Latn | Cebuano +| ces | Latn | Czech +| cjk | Latn | Chokwe +| ckb | Arab | Central Kurdish +| crh | Latn | Crimean Tatar +| cym | Latn | Welsh +| dan | Latn | Danish +| deu | Latn | German +| dik | Latn | Southwestern Dinka +| dyu | Latn | Dyula +| dzo | Tibt | Dzongkha +| ell | Grek | Greek +| eng | Latn | English +| epo | Latn | Esperanto +| est | Latn | Estonian +| eus | Latn | Basque +| ewe | Latn | Ewe +| fao | Latn | Faroese +| fij | Latn | Fijian +| fin | Latn | Finnish +| fon | Latn | Fon +| fra | Latn | French +| fur | Latn | Friulian +| fuv | Latn | Nigerian Fulfulde +| gla | Latn | Scottish Gaelic +| gle | Latn | Irish +| glg | Latn | Galician +| grn | Latn | Guarani +| guj | Gujr | Gujarati +| hat | Latn | Haitian Creole +| hau | Latn | Hausa +| heb | Hebr | Hebrew +| hin | Deva | Hindi +| hne | Deva | Chhattisgarhi +| hrv | Latn | Croatian +| hun | Latn | Hungarian +| hye | Armn | Armenian +| ibo | Latn | Igbo +| ilo | Latn | Ilocano +| ind | Latn | Indonesian +| isl | Latn | Icelandic +| ita | Latn | Italian +| jav | Latn | Javanese +| jpn | Jpan | Japanese +| kab | Latn | Kabyle +| kac | Latn | Jingpho +| kam | Latn | Kamba +| kan | Knda | Kannada +| kas | Arab | Kashmiri (Arabic script) +| kas | Deva | Kashmiri (Devanagari script) +| kat | Geor | Georgian +| knc | Arab | Central Kanuri (Arabic script) +| knc | Latn | 
Central Kanuri (Latin script) +| kaz | Cyrl | Kazakh +| kbp | Latn | Kabiyè +| kea | Latn | Kabuverdianu +| khm | Khmr | Khmer +| kik | Latn | Kikuyu +| kin | Latn | Kinyarwanda +| kir | Cyrl | Kyrgyz +| kmb | Latn | Kimbundu +| kmr | Latn | Northern Kurdish +| kon | Latn | Kikongo +| kor | Hang | Korean +| lao | Laoo | Lao +| lij | Latn | Ligurian +| lim | Latn | Limburgish +| lin | Latn | Lingala +| lit | Latn | Lithuanian +| lmo | Latn | Lombard +| ltg | Latn | Latgalian +| ltz | Latn | Luxembourgish +| lua | Latn | Luba-Kasai +| lug | Latn | Ganda +| luo | Latn | Luo +| lus | Latn | Mizo +| lvs | Latn | Standard Latvian +| mag | Deva | Magahi +| mai | Deva | Maithili +| mal | Mlym | Malayalam +| mar | Deva | Marathi +| min | Arab | Minangkabau (Arabic script) +| min | Latn | Minangkabau (Latin script) +| mkd | Cyrl | Macedonian +| plt | Latn | Plateau Malagasy +| mlt | Latn | Maltese +| mni | Beng | Meitei (Bengali script) +| khk | Cyrl | Halh Mongolian +| mos | Latn | Mossi +| mri | Latn | Maori +| mya | Mymr | Burmese +| nld | Latn | Dutch +| nno | Latn | Norwegian Nynorsk +| nob | Latn | Norwegian Bokmål +| npi | Deva | Nepali +| nso | Latn | Northern Sotho +| nus | Latn | Nuer +| nya | Latn | Nyanja +| oci | Latn | Occitan +| gaz | Latn | West Central Oromo +| ory | Orya | Odia +| pag | Latn | Pangasinan +| pan | Guru | Eastern Panjabi +| pap | Latn | Papiamento +| pes | Arab | Western Persian +| pol | Latn | Polish +| por | Latn | Portuguese +| prs | Arab | Dari +| pbt | Arab | Southern Pashto +| quy | Latn | Ayacucho Quechua +| ron | Latn | Romanian +| run | Latn | Rundi +| rus | Cyrl | Russian +| sag | Latn | Sango +| san | Deva | Sanskrit +| sat | Olck | Santali +| scn | Latn | Sicilian +| shn | Mymr | Shan +| sin | Sinh | Sinhala +| slk | Latn | Slovak +| slv | Latn | Slovenian +| smo | Latn | Samoan +| sna | Latn | Shona +| snd | Arab | Sindhi +| som | Latn | Somali +| sot | Latn | Southern Sotho +| spa | Latn | Spanish +| als | Latn | Tosk Albanian 
+| srd | Latn | Sardinian +| srp | Cyrl | Serbian +| ssw | Latn | Swati +| sun | Latn | Sundanese +| swe | Latn | Swedish +| swh | Latn | Swahili +| szl | Latn | Silesian +| tam | Taml | Tamil +| tat | Cyrl | Tatar +| tel | Telu | Telugu +| tgk | Cyrl | Tajik +| tgl | Latn | Tagalog +| tha | Thai | Thai +| tir | Ethi | Tigrinya +| taq | Latn | Tamasheq (Latin script) +| taq | Tfng | Tamasheq (Tifinagh script) +| tpi | Latn | Tok Pisin +| tsn | Latn | Tswana +| tso | Latn | Tsonga +| tuk | Latn | Turkmen +| tum | Latn | Tumbuka +| tur | Latn | Turkish +| twi | Latn | Twi +| tzm | Tfng | Central Atlas Tamazight +| uig | Arab | Uyghur +| ukr | Cyrl | Ukrainian +| umb | Latn | Umbundu +| urd | Arab | Urdu +| uzn | Latn | Northern Uzbek +| vec | Latn | Venetian +| vie | Latn | Vietnamese +| war | Latn | Waray +| wol | Latn | Wolof +| xho | Latn | Xhosa +| ydd | Hebr | Eastern Yiddish +| yor | Latn | Yoruba +| yue | Hant | Yue Chinese +| zho | Hans | Chinese (Simplified) +| zho | Hant | Chinese (Traditional) +| zsm | Latn | Standard Malay +| zul | Latn | Zulu diff --git a/python/NllbTranslation/plugin-files/descriptor/descriptor.json b/python/NllbTranslation/plugin-files/descriptor/descriptor.json index 8420e2c1..95635c5d 100644 --- a/python/NllbTranslation/plugin-files/descriptor/descriptor.json +++ b/python/NllbTranslation/plugin-files/descriptor/descriptor.json @@ -58,7 +58,7 @@ }, { "name": "SENTENCE_MODEL", - "description": "Name of sentence segmentation model. Supported options are spaCy's multilingual `xx_sent_ud_sm` model and the Where's the Point (WtP) `wtp-bert-mini` model.", + "description": "Name of sentence segmentation model. 
Supported options are spaCy's multilingual `xx_sent_ud_sm` model, Segment any Text (SaT) `sat-3l-sm` model, and Where's the Point (WtP) `wtp-bert-mini` model.", "type": "STRING", "defaultValue": "wtp-bert-mini" }, From 315bf6d73ba15d69204dc88cfd4613fac4bdb1af Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Wed, 15 Oct 2025 23:45:54 -0400 Subject: [PATCH 4/5] Adding support for new text splitter. Merging develop changes. --- python/AzureTranslation/README.md | 4 ++ .../acs_translation_component.py | 12 ++++- .../plugin-files/descriptor/descriptor.json | 8 ++- python/NllbTranslation/README.md | 11 ++++ .../nllb_translation_component.py | 26 +++++++--- .../plugin-files/descriptor/descriptor.json | 12 +++++ .../tests/test_nllb_translation.py | 52 ++++++++++++++----- 7 files changed, 101 insertions(+), 24 deletions(-) diff --git a/python/AzureTranslation/README.md b/python/AzureTranslation/README.md index 5cadb230..87f4ed6f 100644 --- a/python/AzureTranslation/README.md +++ b/python/AzureTranslation/README.md @@ -118,6 +118,10 @@ this model lacks support handling for Chinese punctuation. lengths [here](https://discourse.mozilla.org/t/proposal-sentences-lenght-limit-from-14-words-to-100-characters). +- `SENTENCE_SPLITTER_MODE`: Specifies text splitting behavior, options include: + - `DEFAULT` : Splits text into chunks based on the `SENTENCE_SPLITTER_CHAR_COUNT` limit. + - `SENTENCE`: Splits text at detected sentence boundaries. This mode creates more sentence breaks than `DEFAULT`, which is more focused on avoiding text splits unless the chunk size is reached. + - `SENTENCE_SPLITTER_INCLUDE_INPUT_LANG`: Specifies whether to pass input language to sentence splitter algorithm. Currently, only SaT/WtP supports model threshold adjustments by input language. 
diff --git a/python/AzureTranslation/acs_translation_component/acs_translation_component.py b/python/AzureTranslation/acs_translation_component/acs_translation_component.py index 6f89c050..f14fc5a5 100644 --- a/python/AzureTranslation/acs_translation_component/acs_translation_component.py +++ b/python/AzureTranslation/acs_translation_component/acs_translation_component.py @@ -471,6 +471,10 @@ def __init__(self, job_properties: Mapping[str, str], "en") nlp_model_setting = mpf_util.get_property(job_properties, "SENTENCE_MODEL_CPU_ONLY", True) + self._sentence_splitter_mode = mpf_util.get_property(job_properties, + "SENTENCE_SPLITTER_MODE", + "DEFAULT") + if not nlp_model_setting: nlp_model_setting = "cuda" else: @@ -500,14 +504,18 @@ def split_input_text(self, text: str, from_lang: Optional[str], self._num_boundary_chars, get_azure_char_count, self._sentence_model, - from_lang) + from_lang, + split_mode=self._sentence_splitter_mode, + newline_behavior='NONE') # This component already uses a newline filtering step. else: divided_text_list = TextSplitter.split( text, TranslationClient.DETECT_MAX_CHARS, self._num_boundary_chars, get_azure_char_count, - self._sentence_model) + self._sentence_model, + split_mode=self._sentence_splitter_mode, + newline_behavior='NONE') # This component already uses a newline filtering step. chunks = list(divided_text_list) diff --git a/python/AzureTranslation/plugin-files/descriptor/descriptor.json b/python/AzureTranslation/plugin-files/descriptor/descriptor.json index f66891f6..64072f6f 100644 --- a/python/AzureTranslation/plugin-files/descriptor/descriptor.json +++ b/python/AzureTranslation/plugin-files/descriptor/descriptor.json @@ -71,10 +71,16 @@ }, { "name": "STRIP_NEW_LINE_BEHAVIOR", - "description": "The translation endpoint treats newline characters as sentence boundaries. To prevent this newlines can be removed from the input text. 
Valid values are SPACE (replace with space character), REMOVE (remove newlines), NONE (leave newlines as they are), and GUESS (If source language is Chinese or Japanese use REMOVE, else use SPACE).", + "description": "The translation endpoint and text splitter treat newline characters as sentence boundaries. To prevent this newlines can be removed from the input text. Valid values are SPACE (replace with space character), REMOVE (remove newlines), NONE (leave newlines as they are), and GUESS (If source language is Chinese or Japanese use REMOVE, else use SPACE).", "type": "STRING", "defaultValue": "GUESS" }, + { + "name": "SENTENCE_SPLITTER_MODE", + "description": "Determines how text is split: `DEFAULT` mode splits text into chunks based on the character limit, while `SENTENCE` mode splits text strictly at sentence boundaries (may yield smaller segments), unless the character limit is reached.", + "type": "STRING", + "defaultValue": "DEFAULT" + }, { "name": "DETECT_BEFORE_TRANSLATE", "description": "Use the /detect endpoint to check if translation can be skipped because the text is already in TO_LANGUAGE.", diff --git a/python/NllbTranslation/README.md b/python/NllbTranslation/README.md index 00291d0c..d5ba93eb 100644 --- a/python/NllbTranslation/README.md +++ b/python/NllbTranslation/README.md @@ -75,6 +75,17 @@ The below properties can be optionally provided to alter the behavior of the com sentence splitter algorithm. Currently, only WtP supports model threshold adjustments by input language. +- `SENTENCE_SPLITTER_MODE`: Specifies text splitting behavior, options include: + - `DEFAULT` : Splits text into chunks based on the `SENTENCE_SPLITTER_CHAR_COUNT` limit. + - `SENTENCE`: Splits text at detected sentence boundaries. This mode creates more sentence breaks than `DEFAULT`, which is more focused on avoiding text splits unless the chunk size is reached. 
+ +- `SENTENCE_SPLITTER_NEWLINE_BEHAVIOR`: Specifies how individual newlines between characters should be handled when splitting text. Options include: + - `GUESS` (default): Automatically replace newlines with either spaces or remove them, depending on the detected script between newlines. + - `SPACE`: Always replaces newlines with a space, regardless of script. + - `REMOVE`: Always removes newlines entirely, joining the adjacent characters directly. + - `NONE`: Leaves newlines as-is in the input text. + Please note that multiple adjacent newlines are treated as a manual text divide, across all settings. This is to ensure subtitles and other singular text examples are properly separated from other text during translation. + - `SENTENCE_MODEL_CPU_ONLY`: If set to TRUE, only use CPU resources for the sentence detection model. If set to FALSE, allow sentence model to also use GPU resources. For most runs using spaCy `xx_sent_ud_sm` or `wtp-bert-mini` models, GPU resources diff --git a/python/NllbTranslation/nllb_component/nllb_translation_component.py b/python/NllbTranslation/nllb_component/nllb_translation_component.py index bd658110..7613ad00 100644 --- a/python/NllbTranslation/nllb_component/nllb_translation_component.py +++ b/python/NllbTranslation/nllb_component/nllb_translation_component.py @@ -61,7 +61,7 @@ def get_detections_from_image(self, job: mpf.ImageJob) -> Sequence[mpf.ImageLoca def get_detections_from_audio(self, job: mpf.AudioJob) -> Sequence[mpf.AudioTrack]: logger.info(f'Received audio job.') return self._get_feed_forward_detections(job.job_properties, job.feed_forward_track, video_job=False) - + def get_detections_from_video(self, job: mpf.VideoJob) -> Sequence[mpf.VideoTrack]: logger.info(f'Received video job.') return self._get_feed_forward_detections(job.job_properties, job.feed_forward_track, video_job=True) @@ -127,7 +127,7 @@ def _load_tokenizer(self, config: Dict[str, str]) -> None: src_lang=config.translate_from_language, 
device_map=self._model.device) elapsed = time.time() - start logger.debug(f"Successfully loaded tokenizer in {elapsed} seconds.") - + def _load_model(self, model_name: str = None, config: Dict[str, str] = None) -> None: try: if model_name is None: @@ -135,10 +135,10 @@ def _load_model(self, model_name: str = None, config: Dict[str, str] = None) -> model_name = DEFAULT_NLLB_MODEL else: model_name = config.nllb_model - + model_path = '/models/' + model_name offload_folder = model_path + '/.weights' - + if os.path.isdir(model_path) and os.path.isfile(os.path.join(model_path, "config.json")): # model is stored locally; we do not need to load the tokenizer here logger.info(f"Loading model from local directory: {model_path}") @@ -154,7 +154,7 @@ def _load_model(self, model_name: str = None, config: Dict[str, str] = None) -> logger.debug(f"Saving model in {model_path}") self._model.save_pretrained(model_path) self._tokenizer.save_pretrained(model_path) - + except Exception: logger.exception( f'Failed to complete job due to the following exception:') @@ -207,14 +207,18 @@ def _get_translation(self, config: Dict[str, str], text_to_translate: str) -> st 0, len, text_splitter_model, - wtp_lang) + wtp_lang, + split_mode=config._sentence_split_mode, + newline_behavior=config._newline_behavior) else: input_text_sentences = TextSplitter.split( text, config.nllb_character_limit, 0, len, - text_splitter_model) + text_splitter_model, + split_mode=config._sentence_split_mode, + newline_behavior=config._newline_behavior) text_list = list(input_text_sentences) logger.info(f'Input text split into {len(text_list)} sentences.') @@ -264,6 +268,12 @@ def __init__(self, props: Mapping[str, str], ff_props: Dict[str, str]) -> None: ).split(',') ] + self._sentence_split_mode = mpf_util.get_property( + props, 'SENTENCE_SPLITTER_MODE', 'DEFAULT') + + self._newline_behavior = mpf_util.get_property( + props, 'SENTENCE_SPLITTER_NEWLINE_BEHAVIOR', 'GUESS') + # default model, cached self.nllb_model = 
mpf_util.get_property(props, "NLLB_MODEL", DEFAULT_NLLB_MODEL) @@ -344,7 +354,7 @@ def __init__(self, props: Mapping[str, str], ff_props: Dict[str, str]) -> None: f'Failed to complete job due to the following exception:') raise - + if not self.translate_from_language: logger.exception('Unsupported or no source language provided') raise mpf.DetectionException( diff --git a/python/NllbTranslation/plugin-files/descriptor/descriptor.json b/python/NllbTranslation/plugin-files/descriptor/descriptor.json index 95635c5d..f4688110 100644 --- a/python/NllbTranslation/plugin-files/descriptor/descriptor.json +++ b/python/NllbTranslation/plugin-files/descriptor/descriptor.json @@ -103,6 +103,18 @@ "description": "The ISO-15924 language code for language and script that the input text should be translated from.", "type": "STRING", "defaultValue": "" + }, + { + "name": "SENTENCE_SPLITTER_MODE", + "description": "Determines how text is split: `DEFAULT` mode splits text into chunks based on the character limit, while `SENTENCE` mode splits text strictly at sentence boundaries (may yield smaller segments), unless the character limit is reached.", + "type": "STRING", + "defaultValue": "DEFAULT" + }, + { + "name": "SENTENCE_SPLITTER_NEWLINE_BEHAVIOR", + "description": "The text splitter treats newline characters as sentence boundaries. To prevent this newlines can be removed from the input text during splitting. 
Valid values are SPACE (replace with space character), REMOVE (remove newlines), NONE (leave newlines as they are), and GUESS (If source language is Chinese or Japanese use REMOVE, else use SPACE).", + "type": "STRING", + "defaultValue": "GUESS" } ] } diff --git a/python/NllbTranslation/tests/test_nllb_translation.py b/python/NllbTranslation/tests/test_nllb_translation.py index e9c66e45..754c70f3 100644 --- a/python/NllbTranslation/tests/test_nllb_translation.py +++ b/python/NllbTranslation/tests/test_nllb_translation.py @@ -112,7 +112,7 @@ def test_audio_job(self): self.assertEqual(self.OUTPUT_0, props["TRANSLATION"]) def test_video_job(self): - + ff_track = mpf.VideoTrack( 0, 1, -1, { @@ -120,7 +120,7 @@ def test_video_job(self): 1: mpf.ImageLocation(0, 10, 10, 10, -1, dict(TRANSCRIPT=self.SAMPLE_2)) }, dict(TEXT=self.SAMPLE_0)) - + #set default props test_generic_job_props: dict[str, str] = dict(self.defaultProps) #load source language @@ -161,8 +161,8 @@ def test_plaintext_job(self): test_generic_job_props['DEFAULT_SOURCE_LANGUAGE'] = 'deu' test_generic_job_props['DEFAULT_SOURCE_SCRIPT'] = 'Latn' - job = mpf.GenericJob('Test Plaintext', - str(Path(__file__).parent / 'data' / 'translation.txt'), + job = mpf.GenericJob('Test Plaintext', + str(Path(__file__).parent / 'data' / 'translation.txt'), test_generic_job_props, {}) result_track: Sequence[mpf.GenericTrack] = self.component.get_detections_from_generic(job) @@ -185,7 +185,7 @@ def test_translate_first_ff_property(self): 1: mpf.ImageLocation(0, 10, 10, 10, -1, dict(TEXT=self.SAMPLE_0,TRANSCRIPT=self.SAMPLE_2)) }, dict(TRANSCRIPT=self.SAMPLE_0)) - + job = mpf.VideoJob('Test Video', 'test.mp4', 0, 1, test_generic_job_props, @@ -247,7 +247,7 @@ def test_translate_all_ff_properties(self): frame_2_props = result[0].frame_locations[2].detection_properties self.assertNotIn("OTHER TRANSLATION", frame_2_props) self.assertIn("OTHER", frame_2_props) - + def test_translate_first_frame_location_property(self): # set 
default props test_generic_job_props: dict[str, str] = dict(self.defaultProps) @@ -264,7 +264,7 @@ def test_translate_first_frame_location_property(self): 0: mpf.ImageLocation(0, 0, 10, 10, -1, dict(OTHER_PROPERTY="Other prop text", TEXT=self.SAMPLE_1)), 1: mpf.ImageLocation(0, 10, 10, 10, -1, dict(TRANSCRIPT=self.SAMPLE_2)) }) - + job = mpf.VideoJob('Test Video', 'test.mp4', 0, 1, test_generic_job_props, @@ -388,7 +388,7 @@ def test_feed_forward_language(self): #set default props test_generic_job_props: dict[str, str] = dict(self.defaultProps) - ff_track = mpf.GenericTrack(-1, dict(TEXT=self.SAMPLE_0, + ff_track = mpf.GenericTrack(-1, dict(TEXT=self.SAMPLE_0, LANGUAGE='deu', ISO_SCRIPT='Latn')) job = mpf.GenericJob('Test Generic', 'test.pdf', test_generic_job_props, {}, ff_track) @@ -401,7 +401,7 @@ def test_eng_to_eng_translation(self): #set default props test_generic_job_props: dict[str, str] = dict(self.defaultProps) - ff_track = mpf.GenericTrack(-1, dict(TEXT='This is English text that should not be translated.', + ff_track = mpf.GenericTrack(-1, dict(TEXT='This is English text that should not be translated.', LANGUAGE='eng', ISO_SCRIPT='Latn')) job = mpf.GenericJob('Test Generic', 'test.pdf', test_generic_job_props, {}, ff_track) @@ -476,6 +476,8 @@ def test_paragraph_split_job(self): #load source language test_generic_job_props['DEFAULT_SOURCE_LANGUAGE'] = 'por' test_generic_job_props['DEFAULT_SOURCE_SCRIPT'] = 'Latn' + test_generic_job_props['SENTENCE_SPLITTER_MODE'] = 'DEFAULT' + test_generic_job_props['SENTENCE_SPLITTER_NEWLINE_BEHAVIOR'] = 'GUESS' # excerpt from https://www.gutenberg.org/ebooks/16443 pt_text="""Teimam de facto estes em que são indispensaveis os vividos raios do @@ -496,7 +498,7 @@ def test_paragraph_split_job(self): satisfeitos do mundo, satisfeitos dos homens e, muito especialmente, satisfeitos de si. 
""" - pt_text_translation = "They fear, indeed, those in whom the vivid rays of our unblinking sun, or the unclouded face of the moon in the peninsular firmament, where it has not, like that of London--to break at the cost of a plumbeo heaven--are indispensable, to pour joy into the soul and send to the semblances the reflection of them; they imagine fatally pursued from _spleen_, hopelessly gloomy and sullen, as if at every moment they were emerging from the subterranean galleries of a pit-coal mine, our British allies. How they deceive themselves or how they intend to deceive us! This is an illusion or bad faith, against which much is vainly complained the unlevel and accentuated expression of bliss, which shines through on the face. The European Parliament has been a great help to the people of Europe in the past, and it is a great help to us in the present." + pt_text_translation = "They fear, indeed, those in whom the vivid rays of our unblinking sun, or the unclouded face of the moon in the peninsular firmament, where it has not, like that of London--to break at the cost of a plumbeo heaven--are indispensable, to pour joy into the soul and send to the semblances the reflection of them; they imagine fatally pursued from _spleen_, hopelessly gloomy and dreary, as if every moment they came out of the underground galleries of a pit-coal mine, How they deceive or how they intend to deceive us! is this an illusion or bad faith, against which there is much claim in vain the indelevel and accented expression of beatitude, which shines on the illuminated face of the men from beyond the Manch, who seem to walk among us, wrapped in dense atmosphere of perennial contentment, satisfied The European Union is a global community of nations, which is not only a community of nations, but also a community of nations." 
ff_track = mpf.GenericTrack(-1, dict(TEXT=pt_text)) job = mpf.GenericJob('Test Generic', 'test.pdf', test_generic_job_props, {}, ff_track) @@ -505,6 +507,30 @@ def test_paragraph_split_job(self): result_props: dict[str, str] = result_track[0].detection_properties self.assertEqual(pt_text_translation, result_props["TRANSLATION"]) + + test_generic_job_props['SENTENCE_SPLITTER_MODE'] = 'SENTENCE' + test_generic_job_props['SENTENCE_SPLITTER_NEWLINE_BEHAVIOR'] = 'GUESS' + + pt_text_translation = "They fear, indeed, those in whom the vivid rays of our unblinking sun, or the unclouded face of the moon in the peninsular firmament, where it has not, like that of London--to break at the cost of a plumbeo heaven--are indispensable to pour joy into the soul and send to the countenances the reflection of them; They imagine themselves fatally haunted by spleen, hopelessly gloomy and sullen, as if at every moment they were emerging from the underground galleries of a pit-coal mine, Our British allies. How they deceive themselves or how they intend to deceive us! Is this an illusion or bad faith, against which there is much to be lamented in vain the indelevel and accentuated expression of beatitude, which shines through the illuminated faces of the men from beyond the Channel, who seem to walk among us, wrapped in a dense atmosphere of perenne contentment, satisfied with the world, satisfied with men and, very especially, satisfied with themselves? i. 
the" + ff_track = mpf.GenericTrack(-1, dict(TEXT=pt_text)) + job = mpf.GenericJob('Test Generic', 'test.pdf', test_generic_job_props, {}, ff_track) + result_track: Sequence[mpf.GenericTrack] = self.component.get_detections_from_generic(job) + + result_props: dict[str, str] = result_track[0].detection_properties + self.assertEqual(pt_text_translation, result_props["TRANSLATION"]) + + + test_generic_job_props['SENTENCE_SPLITTER_MODE'] = 'DEFAULT' + test_generic_job_props['SENTENCE_SPLITTER_NEWLINE_BEHAVIOR'] = 'NONE' + + pt_text_translation = "They fear, indeed, those in whom the vivid rays of our unblinking sun, or the unclouded face of the moon in the peninsular firmament, where it has not, like that of London--to break at the cost of a plumbeo heaven--are indispensable, to pour joy into the soul and send to the semblances the reflection of them; they imagine fatally pursued from _spleen_, hopelessly gloomy and sullen, as if at every moment they were emerging from the subterranean galleries of a pit-coal mine, our British allies. How they deceive themselves or how they intend to deceive us! This is an illusion or bad faith, against which much is vainly complained the unlevel and accentuated expression of bliss, which shines through on the face. The European Parliament has been a great help to the people of Europe in the past, and it is a great help to us in the present." 
+ ff_track = mpf.GenericTrack(-1, dict(TEXT=pt_text)) + job = mpf.GenericJob('Test Generic', 'test.pdf', test_generic_job_props, {}, ff_track) + result_track: Sequence[mpf.GenericTrack] = self.component.get_detections_from_generic(job) + + result_props: dict[str, str] = result_track[0].detection_properties + self.assertEqual(pt_text_translation, result_props["TRANSLATION"]) + def test_wtp_with_flores_iso_lookup(self): #set default props test_generic_job_props: dict[str, str] = dict(self.defaultProps) @@ -612,11 +638,11 @@ def test_should_translate(self): self.assertFalse(should_translate("꩐꩑꩒꩓꩔꩕꩖꩗꩘꩙")) # Cham digits (\uAA50-\uAA59) self.assertFalse(should_translate("꯰꯱꯲꯳꯴꯵꯶꯷꯸꯹")) # Meetei Mayek digits (\uABF0-\uABF9) self.assertFalse(should_translate("0123456789")) # Full width digits (\uFF10-\uFF19) - + with self.subTest('Letter_Number: a letterlike numeric character'): letter_numbers = "ᛮᛯᛰⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫⅬⅭⅮⅯⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹⅺⅻⅼⅽⅾⅿↀↁↂↅↆↇↈ〇〡〢〣〤〥〦〧〨〩〸〹〺ꛦꛧꛨꛩꛪꛫꛬꛭꛮꛯ" self.assertFalse(should_translate(letter_numbers)) - + with self.subTest('Other_Number: a numeric character of other type'): other_numbers1 = "²³¹¼½¾৴৵৶৷৸৹୲୳୴୵୶୷௰௱௲౸౹౺౻౼౽౾൘൙൚൛൜൝൞൰൱൲൳൴൵൶൷൸༪༫༬༭༮༯༰༱༲༳፩፪፫፬፭፮፯፰፱፲፳፴፵፶፷፸፹፺፻፼" other_numbers2 = "៰៱៲៳៴៵៶៷៸៹᧚⁰⁴⁵⁶⁷⁸⁹₀₁₂₃₄₅₆₇₈₉⅐⅑⅒⅓⅔⅕⅖⅗⅘⅙⅚⅛⅜⅝⅞⅟↉①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳" @@ -854,7 +880,7 @@ def test_wtp_iso_conversion(self): self.assertEqual(WtpLanguageSettings.convert_to_iso(NllbLanguageMapper.get_normalized_iso('zul_Latn')), 'zu') # languages supported by NLLB but not supported by WTP Splitter - self.assertIsNone(WtpLanguageSettings.convert_to_iso(NllbLanguageMapper.get_normalized_iso('aka_Latn'))) # 'ak' Akan + self.assertIsNone(WtpLanguageSettings.convert_to_iso(NllbLanguageMapper.get_normalized_iso('aka_Latn'))) # 'ak' Akan self.assertIsNone(WtpLanguageSettings.convert_to_iso(NllbLanguageMapper.get_normalized_iso('bem_Latn'))) # 'sw' Bemba self.assertIsNone(WtpLanguageSettings.convert_to_iso(NllbLanguageMapper.get_normalized_iso('bod_Tibt'))) # 'bo' Tibetan 
self.assertIsNone(WtpLanguageSettings.convert_to_iso(NllbLanguageMapper.get_normalized_iso('bos_Latn'))) # 'bs' Bosnian From ae281c3ad02059bd9dea6689777d050b028996bd Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Thu, 16 Oct 2025 10:49:20 -0400 Subject: [PATCH 5/5] Adding support for new text splitter. Merging develop changes. --- python/AzureTranslation/README.md | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/python/AzureTranslation/README.md b/python/AzureTranslation/README.md index 87f4ed6f..09294a0c 100644 --- a/python/AzureTranslation/README.md +++ b/python/AzureTranslation/README.md @@ -103,13 +103,20 @@ this model lacks support handling for Chinese punctuation. - `SENTENCE_MODEL`: Specifies the desired SaT/WtP or spaCy sentence detection model. For CPU and runtime considerations, the authors of SaT/WtP recommends using `sat-3l-sm` or `wtp-bert-mini`. - More advanced SaT/WtP models that use GPU resources (up to ~8 GB for WtP) are also available. See list of - model names - [here](https://github.com/bminixhofer/wtpsplit?tab=readme-ov-file#available-models). The - only available spaCy model (for text with unknown language) is `xx_sent_ud_sm`. + More advanced SaT/WtP models that use GPU resources (up to ~8 GB for WtP) are also available. + + See list of model names below: + + - [WtP Models](https://github.com/segment-any-text/wtpsplit/tree/1.3.0?tab=readme-ov-file#available-models) + - [SaT Models](https://github.com/bminixhofer/wtpsplit?tab=readme-ov-file#available-models). + + Please note, the only available spaCy model (for text with unknown language) is `xx_sent_ud_sm`. 
+
+  Review the list of languages supported by SaT/WtP below:
+
+  - [WtP Models](https://github.com/segment-any-text/wtpsplit/tree/1.3.0?tab=readme-ov-file#supported-languages)
+  - [SaT Models](https://github.com/bminixhofer/wtpsplit?tab=readme-ov-file#supported-languages)
 
-  Review list of languages supported by SaT/WtP
-  [here](https://github.com/bminixhofer/wtpsplit?tab=readme-ov-file#supported-languages).
   Review models and languages supported by spaCy
   [here](https://spacy.io/models).
 
 - `SENTENCE_SPLITTER_CHAR_COUNT`: Specifies maximum number of characters to process