diff --git a/python/ArgosTranslation/argos_translation_component/argos_language_mapper.py b/python/ArgosTranslation/argos_translation_component/argos_language_mapper.py index 83f320f2..dfa1bc2a 100644 --- a/python/ArgosTranslation/argos_translation_component/argos_language_mapper.py +++ b/python/ArgosTranslation/argos_translation_component/argos_language_mapper.py @@ -80,7 +80,9 @@ class ArgosLanguageMapper: "ukr": "uk", "zho": "zh", "zho-hans": "zh", - "zho-hant": "zt" + "zho-hant": "zt", + "hans": "zh", + "hant": "zt" } # Chinese has two scripts, traditional and simplified. diff --git a/python/ArgosTranslation/argos_translation_component/argos_translation_component.py b/python/ArgosTranslation/argos_translation_component/argos_translation_component.py index 8bd1969e..8f088fb7 100644 --- a/python/ArgosTranslation/argos_translation_component/argos_translation_component.py +++ b/python/ArgosTranslation/argos_translation_component/argos_translation_component.py @@ -25,7 +25,7 @@ ############################################################################# -from typing import Sequence, Dict, Tuple +from typing import Optional, Sequence, Dict, NamedTuple import pathlib import logging @@ -34,26 +34,35 @@ from argostranslate import package, translate +from argos_translation_component.multi_language_processor import DetectedLangInfo, TranslationMetrics, MultiLanguageProcessor from .argos_language_mapper import ArgosLanguageMapper logger = logging.getLogger('ArgosTranslationComponent') +MULTI_LANG_REPORT = "MULTI_LANGUAGE_REPORT" +UNIDENTIFIED_LANGUAGE = 'UNKNOWN' + +class ArgosTranslationCache(NamedTuple): + source_lang: str + target_lang: str + text: str +class ArgosTranslationModel(NamedTuple): + source_lang: str + target_lang: str + model: Optional[translate.ITranslation] class ArgosTranslationComponent: def get_detections_from_video(self, job: mpf.VideoJob) -> Sequence[mpf.VideoTrack]: logger.info(f'Received video job.') - return self.get_feed_forward_detections(job, 
job.feed_forward_track, video_job=True) def get_detections_from_image(self, job: mpf.ImageJob) -> Sequence[mpf.ImageLocation]: logger.info(f'Received image job.') - return self.get_feed_forward_detections(job, job.feed_forward_location) def get_detections_from_audio(self, job: mpf.AudioJob) -> Sequence[mpf.AudioTrack]: logger.info(f'Received audio job.') - return self.get_feed_forward_detections(job, job.feed_forward_track) @staticmethod @@ -107,10 +116,10 @@ def get_feed_forward_detections(job, job_feed_forward, video_job=False): class TranslationWrapper: def __init__(self, job_props): self.supported_languages = self.get_supported_languages_codes() - self._installed_languages = translate.get_installed_languages() - self.installed_lang_codes = [lang.code for lang in self._installed_languages] + # (Input Lang, Output Lang, Translation Model) + self._cached_translation_model = ArgosTranslationModel("", "", None) self._props_to_translate = [ prop.strip() for prop in @@ -141,7 +150,7 @@ def __init__(self, job_props): prop_type=str ).split(',') ] - + # Set a default language to translate from. self._from_lang = mpf_util.get_property( properties=job_props, key='DEFAULT_SOURCE_LANGUAGE', @@ -156,11 +165,19 @@ def __init__(self, job_props): prop_type=str ).lower().strip() + if self._from_lang == "": + logger.info("No default source language selected!") + elif self._from_lang in ArgosLanguageMapper.iso_map: + self._from_lang = ArgosLanguageMapper.get_code(self._from_lang, self._from_script) + elif self._from_lang != "" and self._from_lang not in self.supported_languages: + raise mpf.DetectionError.DETECTION_FAILED.exception( + f"Default source language, {self._from_lang}, is not supported." + ) # TODO: Add support for non-English translations in the future. 
+ self._to_lang_word_separator = " " self._to_lang = "en" - - self._translation_cache: Dict[str, Tuple[str, str]] = {} + self._translation_cache: Dict[ArgosTranslationCache, str] = {} @staticmethod def get_supported_languages_codes(): @@ -171,116 +188,146 @@ def get_supported_languages_codes(): package.update_package_index() available_packages = package.get_available_packages() - # TODO: Update if we want to support translations to non-English languages. - available_packages = [y.from_code for y in list( - filter( - lambda x: x.to_code == "en", available_packages - ) - )] + available_packages = [y.from_code for y in available_packages] return available_packages - def add_translations(self, ff_props: Dict[str, str]): - for prop_to_translate in self._props_to_translate: - input_text = ff_props.get(prop_to_translate, None) - if input_text: - break - else: - logger.warning("No text to translate found in track.") - return - - if cached_translation := self._translation_cache.get(input_text): - ff_props['TRANSLATION'] = cached_translation[0] - ff_props['TRANSLATION_SOURCE_LANGUAGE'] = cached_translation[1] - return - for script_prop_name in self._script_prop_names: - if script_prop_name in ff_props: - self._from_script = ff_props.get(script_prop_name).lower().strip() - break - - for lang_prop_name in self._lang_prop_names: - if lang_prop_name in ff_props: - lang = ff_props.get(lang_prop_name).lower().strip() - if lang in self.supported_languages: - self._from_lang = lang - break - # TODO: Change if supporting non-English translations. 
- elif lang in ('en','eng'): - ff_props['SKIPPED_TRANSLATION'] = 'TRUE' - logger.info(f'Skipped translation of the "{prop_to_translate}" ' - f'property because it was already in the target language.') - return - elif lang in ArgosLanguageMapper.iso_map: - # Convert supported languages to ISO-639-1 - self._from_lang = ArgosLanguageMapper.get_code(lang, self._from_script) - break - else: - raise mpf.DetectionError.DETECTION_FAILED.exception( - f"Source language, {lang}, is not supported." - ) - else: - # Before converting to IS0-639-1, keep name of original default setting. - source_lang_name = self._from_lang - if self._from_lang in ArgosLanguageMapper.iso_map: - self._from_lang = ArgosLanguageMapper.get_code(self._from_lang, self._from_script) - if self._from_lang == 'en': - ff_props['SKIPPED_TRANSLATION'] = 'TRUE' - logger.info(f'Skipped translation of the "{prop_to_translate}" ' - f'property because it was already in the target language.') - return - - if self._from_lang == "": - raise mpf.DetectionError.MISSING_PROPERTY.exception( - 'None of the properties from "LANGUAGE_FEED_FORWARD_PROP" ' - f'({self._lang_prop_names}) were found in the feed forward track and no ' - '"DEFAULT_SOURCE_LANGUAGE" was provided.') - - if self._from_lang != "" and self._from_lang not in self.supported_languages: + def _check_lang_script_codes(self, lang:str, script:str): + if lang: + if lang in self.supported_languages: + return lang + elif lang in ArgosLanguageMapper.iso_map: + lang = ArgosLanguageMapper.get_code(lang, script) + return lang + else: raise mpf.DetectionError.DETECTION_FAILED.exception( - f"Default source language, {source_lang_name}, is not supported." + f"Source language, {lang}, is not supported." 
)              + else: + raise mpf.DetectionError.MISSING_PROPERTY.exception( + 'None of the properties from "LANGUAGE_FEED_FORWARD_PROP" ' + f'({self._lang_prop_names}) were found in the feed forward track and no ' + '"DEFAULT_SOURCE_LANGUAGE" was provided.') - if self._from_lang not in self.installed_lang_codes: - logger.info(f"Language {self._from_lang} is not installed. Installing package.") + def _check_installed_lang(self, source_lang:str): + if source_lang not in getattr(self, 'installed_lang_codes', []): + logger.info(f"Language {source_lang} is not installed. Installing package.") # From Argos Translate for downloading language models. available_packages = package.get_available_packages() available_package = list( filter( - lambda x: x.from_code == self._from_lang and x.to_code == self._to_lang, available_packages + lambda x: x.from_code == source_lang and x.to_code == self._to_lang, available_packages ) )[0] download_path = available_package.download() package.install_from_path(download_path) - logger.info(f"Successfully installed {self._from_lang}.") + logger.info(f"Successfully installed {source_lang}.") self.installed_lang_codes = [lang.code for lang in translate.get_installed_languages()] self._installed_languages = translate.get_installed_languages() + # (Input Lang, Output Lang, Translation Model) + if self._cached_translation_model.source_lang == source_lang and \ + self._cached_translation_model.target_lang == self._to_lang: + return + from_lang = list(filter( - lambda x: x.code == self._from_lang, + lambda x: x.code == source_lang, self._installed_languages))[0] to_lang = list(filter( lambda x: x.code == self._to_lang, self._installed_languages))[0] - translation = from_lang.get_translation(to_lang) + translation_model = from_lang.get_translation(to_lang) + + self._cached_translation_model = ArgosTranslationModel(source_lang, self._to_lang, translation_model) + + def _run_translation(self, source_lang: str, prop_to_translate: str, input_text:str) -> str: + if source_lang ==
self._to_lang: + logger.info(f'Skipped translation of the "{prop_to_translate}" ' + f'property because it was already in the target language.') + return input_text + + if cached_translation := self._translation_cache.get(ArgosTranslationCache(source_lang, self._to_lang, input_text)): + return cached_translation - if translation is None: + self._check_installed_lang(source_lang) + + if self._cached_translation_model.model is None: raise mpf.DetectionError.DETECTION_FAILED.exception( - f"No valid translation model from {self._from_lang} to {self._to_lang}, " + f"No valid translation model from {source_lang} to {self._to_lang}, " f"check if any packages are missing." ) - logger.info(f"Translating the {prop_to_translate} property.") + translated_text = self._cached_translation_model.model.translate(input_text) + trans_key = ArgosTranslationCache(source_lang, self._to_lang, input_text) + self._translation_cache[trans_key] = translated_text + logger.info("Translation complete.") + return translated_text + + def _run_reports_through_translation(self, + metrics: TranslationMetrics, + prop_to_translate: str, + input_text: str): + for report in metrics.language_reports: + sub_text = input_text[report.start_idx:report.end_idx] + sub_lang = report.language + if sub_lang == UNIDENTIFIED_LANGUAGE: + logger.info(f'Text to translate contains an UNIDENTIFIED language ' + f'for this section [{report.start_idx}-{report.end_idx}].') + metrics.translations.append(sub_text) + metrics.unknown_lang = True + continue + sub_script = report.script + sub_source_lang = self._check_lang_script_codes(sub_lang, sub_script) + translated_text = self._run_translation(sub_source_lang, + prop_to_translate, + sub_text) + if translated_text == sub_text: + logger.info(f'Text to translate unchanged ' + f'for this section [{report.start_idx}-{report.end_idx}].') + metrics.translations.append(sub_text) + else: + metrics.skipped_translation = False + metrics.translations.append(translated_text) + 
metrics.lang_text_count[sub_source_lang] += len(sub_text) + metrics.lang_conf[sub_source_lang].append(report.conf) - translated_text = translation.translate(input_text) + def add_translations(self, ff_props: Dict[str, str]): + for prop_to_translate in self._props_to_translate: + input_text = ff_props.get(prop_to_translate, None) + if input_text: + break + else: + logger.warning("No text to translate found in track.") + return - self._translation_cache[input_text] = (translated_text, self._from_lang) + source_script = self._from_script + for script_prop_name in self._script_prop_names: + if script_prop_name in ff_props: + source_script = ff_props.get(script_prop_name).lower().strip() + break - logger.info("Translation complete.") + source_lang = self._from_lang + for lang_prop_name in self._lang_prop_names: + if lang_prop_name in ff_props: + source_lang = ff_props.get(lang_prop_name).lower().strip() + break - ff_props['TRANSLATION_SOURCE_LANGUAGE'] = self._from_lang - ff_props['TRANSLATION'] = translated_text + metrics = TranslationMetrics() + if text_language_report := ff_props.get(MULTI_LANG_REPORT): + for report in text_language_report.split(";"): + metrics.language_reports.append(MultiLanguageProcessor.extract_lang_report(report)) + else: + metrics.language_reports.append(DetectedLangInfo(source_lang, source_script, 0, len(input_text), 1)) + + self._run_reports_through_translation(metrics, prop_to_translate, input_text) + MultiLanguageProcessor.aggregate_translation_results(metrics, + prop_to_translate, + self._to_lang, + self._to_lang_word_separator, + ff_props, + logger) \ No newline at end of file diff --git a/python/ArgosTranslation/argos_translation_component/multi_language_processor.py b/python/ArgosTranslation/argos_translation_component/multi_language_processor.py new file mode 100644 index 00000000..5cfe98a1 --- /dev/null +++ b/python/ArgosTranslation/argos_translation_component/multi_language_processor.py @@ -0,0 +1,105 @@ 
+############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2024 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. 
# +############################################################################# + +import collections +from typing import Dict, NamedTuple +import logging + +class DetectedLangInfo(NamedTuple): + language: str + script: str + start_idx: int + end_idx: int + conf: float + +class TranslationMetrics: + def __init__(self): + self.language_reports = [] + self.translations = [] + self.lang_conf = collections.defaultdict(lambda: []) + self.lang_text_count = collections.defaultdict(lambda: 0) + self.skipped_translation = True + self.unknown_lang = False + +class MultiLanguageProcessor: + @staticmethod + def extract_lang_report(report: str) -> DetectedLangInfo: + """This helper function extracts the language, script, and start-end indexes + of TEXT submitted with an associated Multiple Language Detection report + + The function is called whenever the MULTI_LANGUAGE_REPORT is + present as a feed-forward property. + + Args: + report (str): A single language-script report. + Each report is formatted as follows: + `language-script, start-end indexes, prediction confidence` + + Ex. `eng-latin, 10-120, 0.998` corresponds to an english section of + text starting at char index 10 and ending at char index 120. + + Returns: + DetectedLangInfo: The language, script, start index, end index, + and confidence of prediction respectively. + """ + + # Split `language-script, start-end indexes, prediction confidence` report into + # respective parts for translation. 
+ report_list = report.split(",") + language_info = report_list[0].split()[1] + if '-' in language_info: + (lang, script) = language_info.split("-") + else: + lang = language_info + script = "" + + (start, end) = report_list[1].split()[1].split("-") + conf = report_list[2].split()[1] + + return DetectedLangInfo(lang, script, int(start), int(end), float(conf)) + + @staticmethod + def aggregate_translation_results(metrics: TranslationMetrics, + prop_to_translate:str, + to_lang: str, + to_lang_word_separator: str, + detections: Dict[str, str], + logger: logging.Logger): + if metrics.skipped_translation: + if metrics.unknown_lang: + logger.info(f'Skipped translation of the "{prop_to_translate}" ' + f'property.') + else: + logger.info(f'Skipped translation of the "{prop_to_translate}" ' + f'property because it was already in the target language.') + detections['SKIPPED_TRANSLATION'] = 'TRUE' + else: + main_source_lang = max(metrics.lang_text_count.items(), key=lambda x: x[1])[0] + detections['TRANSLATION SOURCE LANGUAGE'] = main_source_lang + detections['TRANSLATION TO LANGUAGE'] = to_lang + detections['TRANSLATION'] = to_lang_word_separator.join(metrics.translations) + detections['TRANSLATION SOURCE LANGUAGE CONFIDENCE'] = str(sum(metrics.lang_conf[main_source_lang])/\ + len(metrics.lang_conf[main_source_lang])) \ No newline at end of file diff --git a/python/ArgosTranslation/plugin-files/descriptor/descriptor.json b/python/ArgosTranslation/plugin-files/descriptor/descriptor.json index 216e8307..046022c1 100644 --- a/python/ArgosTranslation/plugin-files/descriptor/descriptor.json +++ b/python/ArgosTranslation/plugin-files/descriptor/descriptor.json @@ -73,8 +73,7 @@ "name": "ARGOS TRANSLATION TEXT FILE ACTION", "description": "Uses Argos Translation to perform translation on a plain text file.", "algorithm": "ARGOSTRANSLATION", - "properties": [ - ] + "properties": [] } ], "tasks": [ @@ -102,4 +101,4 @@ ] } ] -} +} \ No newline at end of file diff --git 
a/python/ArgosTranslation/tests/test_argos_translation.py b/python/ArgosTranslation/tests/test_argos_translation.py index ae6cbcd5..21822a83 100644 --- a/python/ArgosTranslation/tests/test_argos_translation.py +++ b/python/ArgosTranslation/tests/test_argos_translation.py @@ -41,6 +41,13 @@ CHINESE_SHORT_SAMPLE = "你好,你叫什么名字?" SHORT_OUTPUT = "Where's the library?" +SPANISH_ENGLISH_INPUT = SPANISH_SHORT_SAMPLE + SHORT_OUTPUT + +MULTI_LANGUAGE_REPORT_SPA_ENG = ( + "lang: spa-latn, section: 0-26, conf: 1; " + "lang: eng-latn, section: 26-46, conf: 1" +) + # Note: Argos-Chinese translations have improved over time. SHORT_OUTPUT_CHINESE = "Hello. What's your name?" @@ -71,7 +78,7 @@ def test_generic_job(self): result = comp.get_detections_from_generic(job) self.assertEqual(1, len(result)) - self.assertEqual('es', result[0].detection_properties['TRANSLATION_SOURCE_LANGUAGE']) + self.assertEqual('es', result[0].detection_properties['TRANSLATION SOURCE LANGUAGE']) self.assertEqual(SHORT_OUTPUT, result[0].detection_properties['TRANSLATION']) def test_plaintext_job(self): @@ -81,7 +88,7 @@ def test_plaintext_job(self): result = comp.get_detections_from_generic(job) self.assertEqual(1, len(result)) - self.assertEqual('es', result[0].detection_properties['TRANSLATION_SOURCE_LANGUAGE']) + self.assertEqual('es', result[0].detection_properties['TRANSLATION SOURCE LANGUAGE']) self.assertEqual(SHORT_OUTPUT, result[0].detection_properties['TRANSLATION']) def test_audio_job(self): @@ -91,7 +98,7 @@ def test_audio_job(self): result = comp.get_detections_from_audio(job) self.assertEqual(1, len(result)) - self.assertEqual('es', result[0].detection_properties['TRANSLATION_SOURCE_LANGUAGE']) + self.assertEqual('es', result[0].detection_properties['TRANSLATION SOURCE LANGUAGE']) self.assertEqual(SHORT_OUTPUT, result[0].detection_properties['TRANSLATION']) def test_image_job(self): @@ -101,9 +108,20 @@ def test_image_job(self): result = comp.get_detections_from_image(job) self.assertEqual(1, 
len(result)) - self.assertEqual('es', result[0].detection_properties['TRANSLATION_SOURCE_LANGUAGE']) + self.assertEqual('es', result[0].detection_properties['TRANSLATION SOURCE LANGUAGE']) self.assertEqual(SHORT_OUTPUT, result[0].detection_properties['TRANSLATION']) + def test_language_report(self): + ff_loc = mpf.ImageLocation(0, 0, 10, 10, -1, dict(TEXT=SPANISH_SHORT_SAMPLE+SHORT_OUTPUT, LANGUAGE='ES', + MULTI_LANGUAGE_REPORT=MULTI_LANGUAGE_REPORT_SPA_ENG)) + job = mpf.ImageJob('Test Image', 'test.jpg', dict(DEFAULT_SOURCE_LANGUAGE='ZH'), {}, ff_loc) + comp = ArgosTranslationComponent() + result = comp.get_detections_from_image(job) + + self.assertEqual(1, len(result)) + self.assertEqual('es', result[0].detection_properties['TRANSLATION SOURCE LANGUAGE']) + self.assertEqual(SHORT_OUTPUT + " " + SHORT_OUTPUT, result[0].detection_properties['TRANSLATION']) + def test_video_job(self): ff_track = mpf.VideoTrack( 0, 1, -1, @@ -117,9 +135,9 @@ def test_video_job(self): result = comp.get_detections_from_video(job) self.assertEqual(1, len(result)) - self.assertEqual('es', result[0].detection_properties['TRANSLATION_SOURCE_LANGUAGE']) - self.assertEqual('es', result[0].frame_locations[0].detection_properties['TRANSLATION_SOURCE_LANGUAGE']) - self.assertEqual('ru', result[0].frame_locations[1].detection_properties['TRANSLATION_SOURCE_LANGUAGE']) + self.assertEqual('es', result[0].detection_properties['TRANSLATION SOURCE LANGUAGE']) + self.assertEqual('es', result[0].frame_locations[0].detection_properties['TRANSLATION SOURCE LANGUAGE']) + self.assertEqual('ru', result[0].frame_locations[1].detection_properties['TRANSLATION SOURCE LANGUAGE']) self.assertEqual(SHORT_OUTPUT, result[0].detection_properties['TRANSLATION']) self.assertEqual(SHORT_OUTPUT, result[0].frame_locations[0].detection_properties['TRANSLATION']) self.assertEqual(SHORT_OUTPUT, result[0].frame_locations[1].detection_properties['TRANSLATION']) @@ -143,9 +161,9 @@ def test_language_behavior(self): # Should 
skip English tracks self.assertEqual('TRUE', result[0].frame_locations[3].detection_properties['SKIPPED_TRANSLATION']) - self.assertEqual('ru', result[0].frame_locations[0].detection_properties['TRANSLATION_SOURCE_LANGUAGE']) - self.assertEqual('es', result[0].frame_locations[1].detection_properties['TRANSLATION_SOURCE_LANGUAGE']) - self.assertEqual('zh', result[0].frame_locations[2].detection_properties['TRANSLATION_SOURCE_LANGUAGE']) + self.assertEqual('ru', result[0].frame_locations[0].detection_properties['TRANSLATION SOURCE LANGUAGE']) + self.assertEqual('es', result[0].frame_locations[1].detection_properties['TRANSLATION SOURCE LANGUAGE']) + self.assertEqual('zh', result[0].frame_locations[2].detection_properties['TRANSLATION SOURCE LANGUAGE']) self.assertEqual(SHORT_OUTPUT, result[0].frame_locations[0].detection_properties['TRANSLATION']) self.assertEqual(SHORT_OUTPUT, result[0].frame_locations[1].detection_properties['TRANSLATION']) self.assertEqual(SHORT_OUTPUT_CHINESE, result[0].frame_locations[2].detection_properties['TRANSLATION']) @@ -163,7 +181,7 @@ def test_large_text(self): result = comp.get_detections_from_generic(job) self.assertEqual(1, len(result)) - self.assertEqual('es', result[0].detection_properties['TRANSLATION_SOURCE_LANGUAGE']) + self.assertEqual('es', result[0].detection_properties['TRANSLATION SOURCE LANGUAGE']) trans_result = result[0].detection_properties['TRANSLATION'].replace("nullify","nurture") trans_result = trans_result.replace("founded on these principles","founded on those principles") @@ -185,7 +203,7 @@ def test_medium_text(self): result = comp.get_detections_from_generic(job) self.assertEqual(1, len(result)) - self.assertEqual('ru', result[0].detection_properties['TRANSLATION_SOURCE_LANGUAGE']) + self.assertEqual('ru', result[0].detection_properties['TRANSLATION SOURCE LANGUAGE']) self.assertEqual(MED_OUTPUT, result[0].detection_properties['TRANSLATION']) def test_no_feed_forward_location(self): @@ -243,7 +261,7 @@ def 
test_iso_map(self): result = comp.get_detections_from_image(job) self.assertEqual(1, len(result)) - self.assertEqual('es', result[0].detection_properties['TRANSLATION_SOURCE_LANGUAGE']) + self.assertEqual('es', result[0].detection_properties['TRANSLATION SOURCE LANGUAGE']) self.assertEqual(SHORT_OUTPUT, result[0].detection_properties['TRANSLATION']) def test_translation_cache(self): @@ -265,7 +283,7 @@ def test_translation_cache(self): self.assertEqual(SPANISH_SHORT_SAMPLE, result.detection_properties['TEXT']) self.assertEqual(SHORT_OUTPUT, result.detection_properties['TRANSLATION']) - self.assertEqual('es', result.detection_properties['TRANSLATION_SOURCE_LANGUAGE']) + self.assertEqual('es', result.detection_properties['TRANSLATION SOURCE LANGUAGE']) detection1 = result.frame_locations[0] self.assertEqual(SPANISH_SHORT_SAMPLE, detection1.detection_properties['TEXT']) diff --git a/python/AzureTranslation/acs_translation_component/acs_translation_component.py b/python/AzureTranslation/acs_translation_component/acs_translation_component.py index 0ecf5195..9bd6e4f8 100644 --- a/python/AzureTranslation/acs_translation_component/acs_translation_component.py +++ b/python/AzureTranslation/acs_translation_component/acs_translation_component.py @@ -37,18 +37,20 @@ import urllib.request import uuid from typing import Callable, Dict, List, Literal, Mapping, Match, NamedTuple, \ - Optional, Sequence, TypedDict, TypeVar, Union + Optional, Sequence, Tuple, TypedDict, TypeVar, Union import mpf_component_api as mpf import mpf_component_util as mpf_util +from acs_translation_component.multi_language_processor import DetectedLangInfo, TranslationMetrics, MultiLanguageProcessor from nlp_text_splitter import TextSplitterModel, TextSplitter from . 
import convert_language_code log = logging.getLogger('AcsTranslationComponent') - +MULTI_LANG_REPORT = "MULTI_LANGUAGE_REPORT" +UNIDENTIFIED_LANGUAGE = 'UNKNOWN' class AcsTranslationComponent: @@ -199,6 +201,75 @@ def __init__(self, job_properties: Mapping[str, str], sentence_model: TextSplitt self.translation_count = 0 + def _select_source_lang(self, translation_result: TranslationResult) -> Tuple[str, str, float]: + if detect_result := translation_result.detect_result: + source_lang = detect_result.primary_language + source_lang_confidence = str(detect_result.primary_language_confidence) + + # This is only used for MULTI_LANGUAGE_REPORT. + # When MULTI_LANGUAGE_REPORT is present, + # detections will not generate an alternative prediction or score. + source_lang_conf_score = detect_result.primary_language_confidence + + if detect_result.alternative_language: + source_lang += f'; {detect_result.alternative_language}' + source_lang_confidence += f'; {detect_result.alternative_language_confidence}' + return (source_lang, source_lang_confidence, source_lang_conf_score) + return ("", "", -1) + + def _run_reports_through_translation(self, + text_to_translate: str, + prop_to_translate: str, + metrics: TranslationMetrics, + detection_properties: Dict[str,str]): + + if metrics.singleton: + translation_result = self._translate_text(text_to_translate, detection_properties) + source_lang, source_lang_confidence, _ = self._select_source_lang(translation_result) + if source_lang: + detection_properties['TRANSLATION SOURCE LANGUAGE'] = source_lang + detection_properties['TRANSLATION SOURCE LANGUAGE CONFIDENCE'] \ + = source_lang_confidence + if translation_result.skipped: + detection_properties['SKIPPED TRANSLATION'] = 'TRUE' + if translation_result.language_not_supported: + detection_properties['MISSING_LANGUAGE_MODELS'] \ + = translation_result.detect_result.primary_language + else: + log.info(f'Skipped translation of the "{prop_to_translate}" property because it was ' + 
'already in the target language.') + else: + detection_properties['TRANSLATION'] = translation_result.translated_text + log.info(f'Successfully translated the "{prop_to_translate}" property.') + elif metrics.language_reports: + for language_report in metrics.language_reports: + sub_text = text_to_translate[language_report.start_idx:language_report.end_idx] + if language_report.language.upper() == UNIDENTIFIED_LANGUAGE: + metrics.unknown_lang.add(language_report.language) + metrics.translations.append(sub_text) + continue + + translation_result = self._translate_text(sub_text, + detection_properties, + language_report) + source_lang, _, source_lang_conf_score \ + = self._select_source_lang(translation_result) + + if translation_result.language_not_supported: + metrics.unknown_lang.add(translation_result.detect_result.primary_language) + + if not translation_result.skipped: + metrics.skipped_translation = False + metrics.lang_text_count[source_lang]+=len(sub_text) + metrics.lang_conf[source_lang].append(source_lang_conf_score) + metrics.translations.append(translation_result.translated_text) + else: + metrics.translations.append(sub_text) + + if metrics.unknown_lang: + detection_properties['MISSING_LANGUAGE_MODELS'] = ','.join(x for x in metrics.unknown_lang) + + def add_translations(self, detection_properties: Dict[str, str]) -> None: """ Adds translations to detection_properties. detection_properties is modified in place. 
@@ -211,38 +282,34 @@ def add_translations(self, detection_properties: Dict[str, str]) -> None: continue log.info(f'Attempting to translate the "{prop_name}" property...') - translation_result = self._translate_text(text_to_translate, detection_properties) detection_properties['TRANSLATION TO LANGUAGE'] = self._to_language - if detect_result := translation_result.detect_result: - source_lang = detect_result.primary_language - source_lang_confidence = str(detect_result.primary_language_confidence) - if detect_result.alternative_language: - source_lang += f'; {detect_result.alternative_language}' - source_lang_confidence += f'; {detect_result.alternative_language_confidence}' - - detection_properties['TRANSLATION SOURCE LANGUAGE'] = source_lang - - detection_properties['TRANSLATION SOURCE LANGUAGE CONFIDENCE'] \ - = source_lang_confidence - - if translation_result.skipped: - detection_properties['SKIPPED TRANSLATION'] = 'TRUE' - if translation_result.language_not_supported: - detection_properties['MISSING_LANGUAGE_MODELS'] \ - = translation_result.detect_result.primary_language - else: - log.info(f'Skipped translation of the "{prop_name}" property because it was ' - 'already in the target language.') - else: - detection_properties['TRANSLATION'] = translation_result.translated_text - log.info(f'Successfully translated the "{prop_name}" property.') + metrics = TranslationMetrics() + if text_language_report := detection_properties.get(MULTI_LANG_REPORT): + metrics.singleton = False + for report in text_language_report.split(";"): + metrics.language_reports.append(MultiLanguageProcessor.extract_lang_report(report)) + + self._run_reports_through_translation( + text_to_translate, + prop_name, + metrics, + detection_properties) + + MultiLanguageProcessor.aggregate_translation_results(metrics, + prop_name, + self._to_language, + self._to_lang_word_separator, + detection_properties, + log, + metrics.singleton) self.translation_count += 1 return # Only process first matched 
property. - def _translate_text(self, text: str, detection_properties: Dict[str, str]) -> TranslationResult: + def _translate_text(self, text: str, detection_properties: Dict[str, str], + lang_report: Optional[DetectedLangInfo] = None) -> TranslationResult: """ Translates the given text. If the text is longer than ACS allows, we will split up the text and translate each part separately. If, during the current job, we have seen the @@ -251,7 +318,12 @@ def _translate_text(self, text: str, detection_properties: Dict[str, str]) -> Tr if cached_translation := self._translation_cache.get(text): return cached_translation - if self._provided_from_language: + if lang_report: + source_lang_script = f"{lang_report.language}-{lang_report.script}" + source_lang = convert_language_code.iso_to_bcp(source_lang_script) + detect_result = DetectResult(source_lang, lang_report.conf) + from_lang, from_lang_confidence, *_ = detect_result + elif self._provided_from_language: detect_result = DetectResult(self._provided_from_language, 1) from_lang, from_lang_confidence, *_ = detect_result elif upstream_lang := self._get_upstream_language(detection_properties): @@ -553,9 +625,6 @@ def set_query_params(url: str, query_params: Mapping[str, str]) -> str: return urllib.parse.urlunparse(replaced_parts) - - - def get_acs_headers(subscription_key: str) -> Dict[str, str]: return {'Ocp-Apim-Subscription-Key': subscription_key, 'Content-type': 'application/json; charset=UTF-8', diff --git a/python/AzureTranslation/acs_translation_component/multi_language_processor.py b/python/AzureTranslation/acs_translation_component/multi_language_processor.py new file mode 100644 index 00000000..e027a9db --- /dev/null +++ b/python/AzureTranslation/acs_translation_component/multi_language_processor.py @@ -0,0 +1,112 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. 
Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2024 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +############################################################################# + +import collections +from typing import Dict, NamedTuple +import logging + +class DetectedLangInfo(NamedTuple): + language: str + script: str + start_idx: int + end_idx: int + conf: float + +class TranslationMetrics: + def __init__(self): + self.language_reports = [] + self.translations = [] + self.lang_conf = collections.defaultdict(lambda: []) + self.lang_text_count = collections.defaultdict(lambda: 0) + self.skipped_translation = True + self.unknown_lang = set() + self.singleton = True + +# TODO: Transfer improvements to other components. +class MultiLanguageProcessor: + @staticmethod + def extract_lang_report(report: str) -> DetectedLangInfo: + """This helper function extracts the language, script, and start-end indexes + of TEXT submitted with an associated Multiple Language Detection report + + The function is called whenever the MULTI_LANGUAGE_REPORT is + present as a feed-forward property. 
+ + Args: + report (str): A single language-script report. + Each report is formatted as follows: + `lang: <language>-<script>, section: <start>-<end>, conf: <confidence>` + + Ex. `lang: eng-latn, section: 10-120, conf: 0.998` corresponds to an English section of + text starting at char index 10 and ending at char index 120. + + Returns: + DetectedLangInfo: The language, script, start index, end index, + and confidence of prediction respectively. + """ + + # Split `lang: <language>-<script>, section: <start>-<end>, conf: <confidence>` report into + # respective parts for translation. + report_list = report.split(",") + language_info = report_list[0].split()[1] + if '-' in language_info: + (lang, script) = language_info.split("-") + else: + lang = language_info + script = "" + + (start, end) = report_list[1].split()[1].split("-") + conf = report_list[2].split()[1] + + return DetectedLangInfo(lang, script, int(start), int(end), float(conf)) + + @staticmethod + def aggregate_translation_results(metrics: TranslationMetrics, + prop_to_translate:str, + to_lang: str, + to_lang_word_separator: str, + detections: Dict[str, str], + logger: logging.Logger, + skip_aggregation: bool = False): + + if not skip_aggregation: + if metrics.skipped_translation: + if metrics.unknown_lang: + logger.info(f'Skipped translation of the "{prop_to_translate}" ' + f'property.') + else: + logger.info(f'Skipped translation of the "{prop_to_translate}" ' + f'property because it was already in the target language.') + detections['SKIPPED TRANSLATION'] = 'TRUE' + else: + main_source_lang = max(metrics.lang_text_count.items(), key=lambda x: x[1])[0] + detections['TRANSLATION SOURCE LANGUAGE'] = main_source_lang + detections['TRANSLATION TO LANGUAGE'] = to_lang + detections['TRANSLATION'] = to_lang_word_separator.join(metrics.translations) + detections['TRANSLATION SOURCE LANGUAGE CONFIDENCE'] = str(sum(metrics.lang_conf[main_source_lang])/\ + len(metrics.lang_conf[main_source_lang])) + + logger.info(f'Successfully translated the 
"{prop_to_translate}" property.') diff --git a/python/AzureTranslation/tests/data/NOTICE b/python/AzureTranslation/tests/data/NOTICE index 9fc41f18..ff2d2a7f 100644 --- a/python/AzureTranslation/tests/data/NOTICE +++ b/python/AzureTranslation/tests/data/NOTICE @@ -2,3 +2,7 @@ Contains the beginning of "The Art of War" by Sunzi in Traditional Chinese. Public Domain https://www.gutenberg.org/ebooks/12407 + +# results-chinese-long.json +Contains translated text from the Universal Declaration of Human Rights. +https://www.un.org/en/about-us/universal-declaration-of-human-rights \ No newline at end of file diff --git a/python/AzureTranslation/tests/data/results-chinese-long.json b/python/AzureTranslation/tests/data/results-chinese-long.json new file mode 100644 index 00000000..3924fb13 --- /dev/null +++ b/python/AzureTranslation/tests/data/results-chinese-long.json @@ -0,0 +1,10 @@ +[ + { + "translations": [ + { + "text": "Article 1. All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should be treated in a spirit of brotherhood.", + "to": "en" + } + ] + } +] \ No newline at end of file diff --git a/python/AzureTranslation/tests/test_acs_translation.py b/python/AzureTranslation/tests/test_acs_translation.py index d2297f71..19ce9298 100644 --- a/python/AzureTranslation/tests/test_acs_translation.py +++ b/python/AzureTranslation/tests/test_acs_translation.py @@ -57,6 +57,31 @@ SPANISH_SAMPLE_TEXT = '¿Dónde está la biblioteca?' SPANISH_SAMPLE_TEXT_ENG_TRANSLATE = 'Where\'s the library?' +CHINESE_SAMPLE_TEXT_REPORT = "第一 条人人生而自由,在尊严和权利上一律平等。他们赋有理性和良心,并应以兄 弟关系的精神相对待。" +CHINESE_SAMPLE_TEXT_REPORT_ENG_TRANSLATE = ( + "Article 1. All human beings are born free and equal in dignity and rights. " + "They are endowed with reason and conscience and should be treated in a spirit of brotherhood." 
+) + +MULTI_LANGUAGE_REPORT = ( + "lang: zho-hans, section: 0-48, conf: 0.6386098072481651; " + "lang: eng-latn, section: 48-216, conf: 0.4348198323425756" +) + +UNKNOWN_MULTI_LANGUAGE_REPORT = ( + "lang: UNKNOWN, section: 0-48, conf: 0.6386098072481651; " + "lang: eng-latn, section: 48-216, conf: 0.4348198323425756" +) + +MULTI_LANGUAGE_REPORT_ENG_ONLY = ( + "lang: eng-latn, section: 48-216, conf: 0.4348198323425756" +) + +MULTI_LANGUAGE_REPORT_INCOMPLETE = ( + "lang: zho, section: 0-48, conf: 0.6386098072481651; " + "lang: eng-latn, section: 48-216, conf: 0.4348198323425756" +) + TEST_DATA = pathlib.Path(__file__).parent / 'data' logging.basicConfig(level=logging.DEBUG) @@ -908,7 +933,6 @@ def test_language_ff_prop_different(self): self.assertEqual(1, len(results)) result = results[0] - print(result.detection_properties) self.assertEqual(SPANISH_SAMPLE_TEXT, result.detection_properties['TEXT']) self.assertEqual(SPANISH_SAMPLE_TEXT_ENG_TRANSLATE, @@ -928,7 +952,6 @@ def test_unsupported_ff_prop(self): self.assertEqual(1, len(results)) result_props = results[0].detection_properties - print(result_props) self.assertNotIn('TRANSLATION', result_props) self.assertEqual('EN', result_props['TRANSLATION TO LANGUAGE']) self.assertEqual('si', result_props['TRANSLATION SOURCE LANGUAGE']) @@ -1040,6 +1063,67 @@ def test_azure_char_count(self): self.assertEqual(20, get_azure_char_count('😀' * 5 + '👍' * 5)) + def test_multi_lang_report(self): + self.set_results_file('results-chinese-long.json') + self.set_results_file('results-chinese-long.json') + text = CHINESE_SAMPLE_TEXT_REPORT + CHINESE_SAMPLE_TEXT_REPORT_ENG_TRANSLATE + detection_props = dict(TEXT=text, MULTI_LANGUAGE_REPORT=MULTI_LANGUAGE_REPORT) + + ff_track = mpf.GenericTrack(-1, detection_props) + job = mpf.GenericJob('Test', 'test.pdf', get_test_properties(), {}, ff_track) + results = list(AcsTranslationComponent().get_detections_from_generic(job)) + + self.assertEqual(1, len(results)) + self.assertEqual(text, 
results[0].detection_properties['TEXT']) + + expected_translation = (CHINESE_SAMPLE_TEXT_REPORT_ENG_TRANSLATE + ' ' + CHINESE_SAMPLE_TEXT_REPORT_ENG_TRANSLATE) + self.assertEqual(expected_translation, detection_props['TRANSLATION']) + self.assertEqual('EN', detection_props['TRANSLATION TO LANGUAGE']) + + self.assertEqual('zh-hans', detection_props['TRANSLATION SOURCE LANGUAGE']) + self.assertAlmostEqual(0.6386098072481651, + float(detection_props['TRANSLATION SOURCE LANGUAGE CONFIDENCE'])) + + detect_request_text = self.get_request_body()[0]['Text'] + self.assertEqual(text[0:48], detect_request_text) + + # Translation should still work with only lang info. + detection_props = dict(TEXT=text, MULTI_LANGUAGE_REPORT=MULTI_LANGUAGE_REPORT_INCOMPLETE) + ff_track = mpf.GenericTrack(-1, detection_props) + job = mpf.GenericJob('Test', 'test.pdf', get_test_properties(), {}, ff_track) + results = list(AcsTranslationComponent().get_detections_from_generic(job)) + + self.assertEqual(1, len(results)) + self.assertEqual(text, results[0].detection_properties['TEXT']) + self.assertEqual(expected_translation, detection_props['TRANSLATION']) + self.assertAlmostEqual(0.6386098072481651, + float(detection_props['TRANSLATION SOURCE LANGUAGE CONFIDENCE'])) + detect_request_text = self.get_request_body()[0]['Text'] + self.assertEqual(text[0:48], detect_request_text) + + + # Translation should be skipped for this case + detection_props = dict(TEXT=text, MULTI_LANGUAGE_REPORT=UNKNOWN_MULTI_LANGUAGE_REPORT) + ff_track = mpf.GenericTrack(-1, detection_props) + job = mpf.GenericJob('Test', 'test.pdf', get_test_properties(), {}, ff_track) + results = list(AcsTranslationComponent().get_detections_from_generic(job)) + + self.assertEqual(1, len(results)) + self.assertEqual(text, results[0].detection_properties['TEXT']) + self.assertEqual(results[0].detection_properties['SKIPPED TRANSLATION'], "TRUE") + + # Translation should be skipped for this case + detection_props = 
dict(TEXT=CHINESE_SAMPLE_TEXT_REPORT_ENG_TRANSLATE, + MULTI_LANGUAGE_REPORT=MULTI_LANGUAGE_REPORT_ENG_ONLY) + ff_track = mpf.GenericTrack(-1, detection_props) + job = mpf.GenericJob('Test', 'test.pdf', get_test_properties(), {}, ff_track) + results = list(AcsTranslationComponent().get_detections_from_generic(job)) + + self.assertEqual(1, len(results)) + self.assertEqual(CHINESE_SAMPLE_TEXT_REPORT_ENG_TRANSLATE, + results[0].detection_properties['TEXT']) + self.assertEqual(results[0].detection_properties['SKIPPED TRANSLATION'], "TRUE") + def get_test_properties(**extra_properties): return { 'ACS_URL': os.getenv('ACS_URL', 'http://localhost:10670/translator'),