diff --git a/python/AzureTranslation/LICENSE b/python/AzureTranslation/LICENSE
index 2344b622..847284f6 100644
--- a/python/AzureTranslation/LICENSE
+++ b/python/AzureTranslation/LICENSE
@@ -19,15 +19,18 @@ is used in a deployment or embedded within another project, it is requested
 that you send an email to opensource@mitre.org in order to let us know where
 this software is being used.
 
+The nlp_text_splitter utility uses the following sentence detection libraries:
+
 *****************************************************************************
-The WtP, "Where the Point", sentence segmentation library falls under the MIT License:
+The WtP ("Where's the Point") and SaT ("Segment any Text") sentence
+segmentation libraries fall under the MIT License:
 
-https://github.com/bminixhofer/wtpsplit/blob/main/LICENSE
+https://github.com/segment-any-text/wtpsplit/blob/main/LICENSE
 
 MIT License
 
-Copyright (c) 2024 Benjamin Minixhofer
+Copyright (c) 2024 Benjamin Minixhofer, Markus Frohmann, Igor Sterner
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/python/AzureTranslation/README.md b/python/AzureTranslation/README.md
index d12a81f8..09294a0c 100644
--- a/python/AzureTranslation/README.md
+++ b/python/AzureTranslation/README.md
@@ -87,26 +87,36 @@ must be provided. Neither has a default value.
 The following settings control the behavior of dividing input text into
 acceptable chunks for processing.
 
-Through preliminary investigation, we identified the [WtP library ("Where's the
+Through preliminary investigation, we identified the [SaT/WtP library ("Segment any Text" / "Where's the
 Point")](https://github.com/bminixhofer/wtpsplit) and [spaCy's multilingual
 sentence detection model](https://spacy.io/models) for identifying sentence breaks in a
 large section of text.
 
-WtP models are trained to split up multilingual text by sentence without the need of an
+SaT/WtP models are trained to split up multilingual text by sentence without the need for an
 input language tag. The disadvantage is that the most accurate WtP models will need ~3.5
-GB of GPU memory. On the other hand, spaCy has a single multilingual sentence detection
+GB of GPU memory. SaT models are a more recent addition and are considered a more accurate
+set of sentence segmentation models; their resource costs are similar to WtP's.
+
+On the other hand, spaCy has a single multilingual sentence detection model
 that appears to work better for splitting up English text in certain cases, unfortunately
 this model lacks support handling for Chinese punctuation.
 
-- `SENTENCE_MODEL`: Specifies the desired WtP or spaCy sentence detection model. For CPU
-  and runtime considerations, the author of WtP recommends using `wtp-bert-mini`. More
-  advanced WtP models that use GPU resources (up to ~8 GB) are also available. See list of
-  WtP model names
-  [here](https://github.com/bminixhofer/wtpsplit?tab=readme-ov-file#available-models). The
-  only available spaCy model (for text with unknown language) is `xx_sent_ud_sm`.
+- `SENTENCE_MODEL`: Specifies the desired SaT/WtP or spaCy sentence detection model. For CPU
+  and runtime considerations, the authors of SaT/WtP recommend using `sat-3l-sm` or `wtp-bert-mini`.
+  More advanced SaT/WtP models that use GPU resources (up to ~8 GB for WtP) are also available.
+
+  See the list of model names below:
+
+  - [WtP Models](https://github.com/segment-any-text/wtpsplit/tree/1.3.0?tab=readme-ov-file#available-models)
+  - [SaT Models](https://github.com/segment-any-text/wtpsplit?tab=readme-ov-file#available-models)
+
+  Please note, the only available spaCy model (for text with unknown language) is `xx_sent_ud_sm`.
+
+  Review the list of languages supported by SaT/WtP below:
+
+  - [WtP Models](https://github.com/segment-any-text/wtpsplit/tree/1.3.0?tab=readme-ov-file#supported-languages)
+  - [SaT Models](https://github.com/segment-any-text/wtpsplit?tab=readme-ov-file#supported-languages)
-  Review list of languages supported by WtP
-  [here](https://github.com/bminixhofer/wtpsplit?tab=readme-ov-file#supported-languages).
   Review models and languages supported by spaCy [here](https://spacy.io/models).
 
 - `SENTENCE_SPLITTER_CHAR_COUNT`: Specifies maximum number of characters to process
@@ -115,16 +125,20 @@ this model lacks support handling for Chinese punctuation.
   lengths
   [here](https://discourse.mozilla.org/t/proposal-sentences-lenght-limit-from-14-words-to-100-characters).
 
+- `SENTENCE_SPLITTER_MODE`: Specifies text splitting behavior; options include:
+  - `DEFAULT`: Splits text into chunks based on the `SENTENCE_SPLITTER_CHAR_COUNT` limit.
+  - `SENTENCE`: Splits text at detected sentence boundaries. This mode creates more sentence breaks than `DEFAULT`, which focuses on avoiding splits until the chunk size limit is reached.
+
 - `SENTENCE_SPLITTER_INCLUDE_INPUT_LANG`: Specifies whether to pass input language to
-  sentence splitter algorithm. Currently, only WtP supports model threshold adjustments by
+  sentence splitter algorithm. Currently, only SaT/WtP supports model threshold adjustments by
   input language.
 
 - `SENTENCE_MODEL_CPU_ONLY`: If set to TRUE, only use CPU resources for the sentence
   detection model. If set to FALSE, allow sentence model to also use GPU resources.
-  For most runs using spaCy `xx_sent_ud_sm` or `wtp-bert-mini` models, GPU resources
+  For most runs using spaCy `xx_sent_ud_sm`, `sat-3l-sm`, or `wtp-bert-mini` models, GPU resources
   are not required. If using more advanced WtP models like `wtp-canine-s-12l`, it is
   recommended to set `SENTENCE_MODEL_CPU_ONLY=FALSE` to improve performance.
-  That model can use up to ~3.5 GB of GPU memory.
+  That WtP model can use up to ~3.5 GB of GPU memory.
   Please note, to fully enable this option, you must also rebuild the Docker container
   with the following change: Within the Dockerfile, set `ARG BUILD_TYPE=gpu`.
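For context on the two model families referenced in the README changes above, here is a minimal, standalone sketch of how the wtpsplit library segments text with the small SaT and WtP models named in this change. It uses wtpsplit's public `SaT` and `WtP` classes directly rather than this component's `TextSplitter` wrapper, and assumes `pip install wtpsplit` with model weights fetched from Hugging Face on first use; the sample strings are illustrative only:

```python
# Minimal sketch of the wtpsplit API for the models referenced above.
# Illustrative only; the component goes through its TextSplitterModel /
# TextSplitter wrapper instead of calling wtpsplit directly.
from wtpsplit import SaT, WtP

# SaT: the newer "Segment any Text" family; needs no language hint.
sat = SaT("sat-3l-sm")
print(sat.split("This is a test This is another test."))

# WtP: the older "Where's the Point" family; accepts an optional language
# hint, which SENTENCE_SPLITTER_INCLUDE_INPUT_LANG exposes at the job level.
wtp = WtP("wtp-bert-mini")
print(wtp.split("This is a test This is another test.", lang_code="en"))
```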
diff --git a/python/AzureTranslation/acs_translation_component/acs_translation_component.py b/python/AzureTranslation/acs_translation_component/acs_translation_component.py
index 6f89c050..f14fc5a5 100644
--- a/python/AzureTranslation/acs_translation_component/acs_translation_component.py
+++ b/python/AzureTranslation/acs_translation_component/acs_translation_component.py
@@ -471,6 +471,10 @@ def __init__(self, job_properties: Mapping[str, str],
                                                  "en")
         nlp_model_setting = mpf_util.get_property(job_properties, "SENTENCE_MODEL_CPU_ONLY", True)
 
+        self._sentence_splitter_mode = mpf_util.get_property(job_properties,
+                                                             "SENTENCE_SPLITTER_MODE",
+                                                             "DEFAULT")
+
         if not nlp_model_setting:
             nlp_model_setting = "cuda"
         else:
@@ -500,14 +504,18 @@ def split_input_text(self, text: str, from_lang: Optional[str],
                 self._num_boundary_chars,
                 get_azure_char_count,
                 self._sentence_model,
-                from_lang)
+                from_lang,
+                split_mode=self._sentence_splitter_mode,
+                newline_behavior='NONE')  # This component already uses a newline filtering step.
         else:
             divided_text_list = TextSplitter.split(
                 text,
                 TranslationClient.DETECT_MAX_CHARS,
                 self._num_boundary_chars,
                 get_azure_char_count,
-                self._sentence_model)
+                self._sentence_model,
+                split_mode=self._sentence_splitter_mode,
+                newline_behavior='NONE')  # This component already uses a newline filtering step.
 
         chunks = list(divided_text_list)
 
diff --git a/python/AzureTranslation/plugin-files/descriptor/descriptor.json b/python/AzureTranslation/plugin-files/descriptor/descriptor.json
index a04762bb..64072f6f 100644
--- a/python/AzureTranslation/plugin-files/descriptor/descriptor.json
+++ b/python/AzureTranslation/plugin-files/descriptor/descriptor.json
@@ -71,10 +71,16 @@
     },
     {
       "name": "STRIP_NEW_LINE_BEHAVIOR",
-      "description": "The translation endpoint treats newline characters as sentence boundaries. To prevent this newlines can be removed from the input text. Valid values are SPACE (replace with space character), REMOVE (remove newlines), NONE (leave newlines as they are), and GUESS (If source language is Chinese or Japanese use REMOVE, else use SPACE).",
+      "description": "The translation endpoint and text splitter treat newline characters as sentence boundaries. To prevent this, newlines can be removed from the input text. Valid values are SPACE (replace with space character), REMOVE (remove newlines), NONE (leave newlines as they are), and GUESS (If source language is Chinese or Japanese use REMOVE, else use SPACE).",
       "type": "STRING",
       "defaultValue": "GUESS"
     },
+    {
+      "name": "SENTENCE_SPLITTER_MODE",
+      "description": "Determines how text is split: `DEFAULT` mode splits text into chunks based on the character limit, while `SENTENCE` mode splits text at detected sentence boundaries (which may yield smaller segments) unless the character limit is reached.",
+      "type": "STRING",
+      "defaultValue": "DEFAULT"
+    },
     {
       "name": "DETECT_BEFORE_TRANSLATE",
       "description": "Use the /detect endpoint to check if translation can be skipped because the text is already in TO_LANGUAGE.",
@@ -95,7 +101,7 @@
     },
     {
       "name": "SENTENCE_MODEL",
-      "description": "Name of sentence segmentation model. Supported options are spaCy's multilingual `xx_sent_ud_sm` model and the Where's the Point (WtP) `wtp-bert-mini` model.",
+      "description": "Name of sentence segmentation model. Supported options are spaCy's multilingual `xx_sent_ud_sm` model, Segment any Text (SaT) `sat-3l-sm` model, and Where's the Point (WtP) `wtp-bert-mini` model.",
       "type": "STRING",
       "defaultValue": "wtp-bert-mini"
     },
@@ -107,7 +113,7 @@
     },
     {
       "name": "SENTENCE_MODEL_WTP_DEFAULT_ADAPTOR_LANGUAGE",
-      "description": "More advanced WTP models will require a target language. This property sets the default language to use for sentence splitting, unless `FROM_LANGUAGE`, `SUGGESTED_FROM_LANGUAGE`, or Azure language detection return a different, WtP-supported language option.",
+      "description": "More advanced WtP/SaT models will require a target language. This property sets the default language to use for sentence splitting, unless `FROM_LANGUAGE`, `SUGGESTED_FROM_LANGUAGE`, or Azure language detection return a different, WtP-supported language option.",
       "type": "STRING",
       "defaultValue": "en"
     },
diff --git a/python/AzureTranslation/tests/test_acs_translation.py b/python/AzureTranslation/tests/test_acs_translation.py
index d2297f71..90206f8e 100644
--- a/python/AzureTranslation/tests/test_acs_translation.py
+++ b/python/AzureTranslation/tests/test_acs_translation.py
@@ -65,12 +65,14 @@ class TestAcsTranslation(unittest.TestCase):
 
     mock_server: ClassVar['MockServer']
     wtp_model: ClassVar['TextSplitterModel']
+    sat_model: ClassVar['TextSplitterModel']
     spacy_model: ClassVar['TextSplitterModel']
 
     @classmethod
     def setUpClass(cls):
         cls.mock_server = MockServer()
         cls.wtp_model = TextSplitterModel("wtp-bert-mini", "cpu", "en")
+        cls.sat_model = TextSplitterModel("sat-3l-sm", "cpu", "en")
         cls.spacy_model = TextSplitterModel("xx_sent_ud_sm", "cpu", "en")
 
 
@@ -669,6 +671,79 @@ def test_split_wtp_unknown_lang(self, _):
                          'Spaces should be kept due to incorrect language detection.')
 
 
+    @mock.patch.object(TranslationClient, 'DETECT_MAX_CHARS', new_callable=lambda: 150)
+    def test_split_sat_unknown_lang(self, _):
+        # Check that the text splitter does not have an issue
+        # processing an unknown detected language.
+        self.set_results_file('invalid-lang-detect-result.json')
+        self.set_results_file('split-sentence/art-of-war-translation-1.json')
+        self.set_results_file('split-sentence/art-of-war-translation-2.json')
+        self.set_results_file('split-sentence/art-of-war-translation-3.json')
+        self.set_results_file('split-sentence/art-of-war-translation-4.json')
+
+        text = (TEST_DATA / 'split-sentence/art-of-war.txt').read_text()
+        detection_props = dict(TEXT=text)
+        TranslationClient(get_test_properties(), self.sat_model).add_translations(detection_props)
+
+        self.assertEqual(5, len(detection_props))
+        self.assertEqual(text, detection_props['TEXT'])
+
+        expected_translation = (TEST_DATA / 'split-sentence/art-war-translation.txt') \
+            .read_text().strip()
+        self.assertEqual(expected_translation, detection_props['TRANSLATION'])
+        self.assertEqual('EN', detection_props['TRANSLATION TO LANGUAGE'])
+
+        self.assertEqual('fake-lang', detection_props['TRANSLATION SOURCE LANGUAGE'])
+        self.assertAlmostEqual(1.0,
+                               float(detection_props['TRANSLATION SOURCE LANGUAGE CONFIDENCE']))
+
+        detect_request_text = self.get_request_body()[0]['Text']
+        self.assertEqual(text[0:TranslationClient.DETECT_MAX_CHARS], detect_request_text)
+
+        expected_chunk_lengths = [88, 118, 116, 106]
+        self.assertEqual(sum(expected_chunk_lengths), len(text))
+
+        # Due to an incorrect language detection, newlines are
+        # not properly replaced for Chinese text, and
+        # additional whitespace is present in the text.
+        # This alters the behavior of SaT sentence splitting.
+ translation_request1 = self.get_request_body()[0]['Text'] + self.assertEqual(expected_chunk_lengths[0], len(translation_request1)) + self.assertTrue(translation_request1.startswith('兵者,')) + self.assertTrue(translation_request1.endswith('而不危也;')) + self.assertNotIn('\n', translation_request1, + 'Newlines were not properly removed') + self.assertIn(' ', translation_request1, + 'Spaces should be kept due to incorrect language detection.') + + translation_request2 = self.get_request_body()[0]['Text'] + self.assertEqual(expected_chunk_lengths[1], len(translation_request2)) + self.assertTrue(translation_request2.startswith('天者,陰陽')) + self.assertTrue(translation_request2.endswith('兵眾孰強?')) + self.assertNotIn('\n', translation_request2, + 'Newlines were not properly removed') + self.assertIn(' ', translation_request2, + 'Spaces should be kept due to incorrect language detection.') + + translation_request3 = self.get_request_body()[0]['Text'] + self.assertEqual(expected_chunk_lengths[2], len(translation_request3)) + self.assertTrue(translation_request3.startswith('士卒孰練?')) + self.assertTrue(translation_request3.endswith('亂而取之, ')) + self.assertNotIn('\n', translation_request3, + 'Newlines were not properly removed') + self.assertIn(' ', translation_request3, + 'Spaces should be kept due to incorrect language detection.') + + translation_request4 = self.get_request_body()[0]['Text'] + self.assertEqual(expected_chunk_lengths[3], len(translation_request4)) + self.assertTrue(translation_request4.startswith('實而備之,')) + self.assertTrue(translation_request4.endswith('勝負見矣。 ')) + self.assertNotIn('\n', translation_request4, + 'Newlines were not properly removed') + self.assertIn(' ', translation_request4, + 'Spaces should be kept due to incorrect language detection.') + + def test_newline_removal(self): def replace(text): diff --git a/python/NllbTranslation/LICENSE b/python/NllbTranslation/LICENSE new file mode 100644 index 00000000..ef7840e2 --- /dev/null +++ b/python/NllbTranslation/LICENSE @@ -0,0 +1,84 @@ +/***************************************************************************** +* Copyright 2024 The MITRE Corporation * +* * +* Licensed under the Apache License, Version 2.0 (the "License"); * +* you may not use this file except in compliance with the License. * +* You may obtain a copy of the License at * +* * +* http://www.apache.org/licenses/LICENSE-2.0 * +* * +* Unless required by applicable law or agreed to in writing, software * +* distributed under the License is distributed on an "AS IS" BASIS, * +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * +* See the License for the specific language governing permissions and * +* limitations under the License. * +******************************************************************************/ + +This project contains content developed by The MITRE Corporation. If this code +is used in a deployment or embedded within another project, it is requested +that you send an email to opensource@mitre.org in order to let us know where +this software is being used. + + +The "No Language Left Behind" (NLLB) models on Hugging Face are distributed +under the CC-BY-NC-4.0 license (Creative Commons Attribution-NonCommercial 4.0), +hence they must be downloaded and run separately under non-commercial restrictions. + +The code within this repository falls under Apache 2.0 License. 
+
+The nlp_text_splitter utility uses the following sentence detection libraries:
+
+*****************************************************************************
+
+The WtP ("Where's the Point") and SaT ("Segment any Text") sentence
+segmentation libraries fall under the MIT License:
+
+https://github.com/segment-any-text/wtpsplit/blob/main/LICENSE
+
+MIT License
+
+Copyright (c) 2024 Benjamin Minixhofer, Markus Frohmann, Igor Sterner
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*****************************************************************************
+
+The spaCy Natural Language Processing library falls under the MIT License:
+
+The MIT License (MIT)
+
+Copyright (C) 2016-2024 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
\ No newline at end of file
diff --git a/python/NllbTranslation/README.md b/python/NllbTranslation/README.md
index ad0b1590..d5ba93eb 100644
--- a/python/NllbTranslation/README.md
+++ b/python/NllbTranslation/README.md
@@ -8,12 +8,12 @@ To accommodate smaller deployment enviroments, this component can use smaller NL
 
 # Recommended System Requirements
 
-- **GPU (recommended for default 3.3B model)** 
-  - NVIDIA GPU with CUDA support 
-  - At least **24 GB of GPU VRAM** 
+- **GPU (recommended for default 3.3B model)**
+  - NVIDIA GPU with CUDA support
+  - At least **24 GB of GPU VRAM**
 
-- **CPU-only (not recommended for 3.3B model unless sufficient memory is available)** 
-  - At least **32 GB of system RAM** 
+- **CPU-only (not recommended for 3.3B model unless sufficient memory is available)**
+  - At least **32 GB of system RAM**
 
 ### Example Model Requirements
 
@@ -47,15 +47,22 @@ The below properties can be optionally provided to alter the behavior of the com
 - `NLLB_MODEL`: Specifies which No Language Left Behind (NLLB) model to use. The default model is `facebook/nllb-200-3.3B` and is included in the pre-built NLLB Translation docker image. If this property is configured with a different model, the component will attempt to download the specified model from Hugging Face. See [Recommended System Requirements](#recommended-system-requirements) for additional information.
 
-- `SENTENCE_MODEL`: Specifies the desired WtP or spaCy sentence detection model. For CPU
-  and runtime considerations, the author of WtP recommends using `wtp-bert-mini`. More
-  advanced WtP models that use GPU resources (up to ~8 GB) are also available. See list of
-  WtP model names
-  [here](https://github.com/bminixhofer/wtpsplit?tab=readme-ov-file#available-models). The
-  only available spaCy model (for text with unknown language) is `xx_sent_ud_sm`.
+- `SENTENCE_MODEL`: Specifies the desired SaT/WtP or spaCy sentence detection model. For CPU
+  and runtime considerations, the authors of SaT/WtP recommend using `sat-3l-sm` or `wtp-bert-mini`.
+  More advanced SaT/WtP models that use GPU resources (up to ~8 GB for WtP) are also available.
+
+  See the list of model names below:
+
+  - [WtP Models](https://github.com/segment-any-text/wtpsplit/tree/1.3.0?tab=readme-ov-file#available-models)
+  - [SaT Models](https://github.com/segment-any-text/wtpsplit?tab=readme-ov-file#available-models)
+
+  Please note, the only available spaCy model (for text with unknown language) is `xx_sent_ud_sm`.
+
+  Review the list of languages supported by SaT/WtP below:
+
+  - [WtP Models](https://github.com/segment-any-text/wtpsplit/tree/1.3.0?tab=readme-ov-file#supported-languages)
+  - [SaT Models](https://github.com/segment-any-text/wtpsplit?tab=readme-ov-file#supported-languages)
 
-  Review list of languages supported by WtP
-  [here](https://github.com/bminixhofer/wtpsplit?tab=readme-ov-file#supported-languages).
   Review models and languages supported by spaCy [here](https://spacy.io/models).
 
 - `SENTENCE_SPLITTER_CHAR_COUNT`: Specifies maximum number of characters to process
@@ -68,6 +75,17 @@ The below properties can be optionally provided to alter the behavior of the com
   sentence splitter algorithm. Currently, only WtP supports model threshold adjustments by
   input language.
 
+- `SENTENCE_SPLITTER_MODE`: Specifies text splitting behavior; options include:
+  - `DEFAULT`: Splits text into chunks based on the `SENTENCE_SPLITTER_CHAR_COUNT` limit.
+  - `SENTENCE`: Splits text at detected sentence boundaries. This mode creates more sentence breaks than `DEFAULT`, which focuses on avoiding splits until the chunk size limit is reached.
+
+- `SENTENCE_SPLITTER_NEWLINE_BEHAVIOR`: Specifies how individual newlines within the text should be handled when splitting. Options include:
+  - `GUESS` (default): Automatically replaces newlines with spaces or removes them, depending on the script detected around each newline.
+  - `SPACE`: Always replaces newlines with a space, regardless of script.
+  - `REMOVE`: Always removes newlines entirely, joining the adjacent characters directly.
+  - `NONE`: Leaves newlines as-is in the input text.
+  Please note that multiple adjacent newlines are treated as a manual text divide across all settings. This ensures subtitles and other short standalone text segments are properly separated from other text during translation.
+
 - `SENTENCE_MODEL_CPU_ONLY`: If set to TRUE, only use CPU resources for the sentence
   detection model. If set to FALSE, allow sentence model to also use GPU resources.
   For most runs using spaCy `xx_sent_ud_sm` or `wtp-bert-mini` models, GPU resources
@@ -87,209 +105,209 @@ The below properties can be optionally provided to alter the behavior of the com
 # Language Identifiers
 The following are the ISO 639-3 and ISO 15924 codes, and their corresponding languages which Nllb can translate.
-| ISO-639-3 | ISO-15924 | Language
+| ISO-639-3 | ISO-15924 | Language
 | --------- | ---------- | ----------------------------------
-| ace | Arab | Acehnese Arabic
-| ace | Latn | Acehnese Latin
-| acm | Arab | Mesopotamian Arabic
-| acq | Arab | Ta’izzi-Adeni Arabic
-| aeb | Arab | Tunisian Arabic
-| afr | Latn | Afrikaans
-| ajp | Arab | South Levantine Arabic
-| aka | Latn | Akan
-| amh | Ethi | Amharic
-| apc | Arab | North Levantine Arabic
-| arb | Arab | Modern Standard Arabic
+| ace | Arab | Acehnese Arabic
+| ace | Latn | Acehnese Latin
+| acm | Arab | Mesopotamian Arabic
+| acq | Arab | Ta’izzi-Adeni Arabic
+| aeb | Arab | Tunisian Arabic
+| afr | Latn | Afrikaans
+| ajp | Arab | South Levantine Arabic
+| aka | Latn | Akan
+| amh | Ethi | Amharic
+| apc | Arab | North Levantine Arabic
+| arb | Arab | Modern Standard Arabic
 | arb | Latn | Modern Standard Arabic (Romanized)
-| ars | Arab | Najdi Arabic
-| ary | Arab | Moroccan Arabic
-| arz | Arab | Egyptian Arabic
-| asm | Beng | Assamese
-| ast | Latn | Asturian
-| awa | Deva | Awadhi
-| ayr | Latn | Central Aymara
-| azb | Arab | South Azerbaijani
-| azj | Latn | North Azerbaijani
-| bak | Cyrl | Bashkir
-| bam | Latn | Bambara
-| ban | Latn | Balinese
-| bel | Cyrl | Belarusian
-| bem | Latn | Bemba
-| ben | Beng | Bengali
-| bho | Deva | Bhojpuri
-| bjn | Arab | Banjar (Arabic script)
-| bjn | Latn | Banjar (Latin script)
-| bod | Tibt | Standard Tibetan
-| bos | Latn | Bosnian
-| bug | Latn | Buginese
-| bul | Cyrl | Bulgarian
-| cat | Latn | Catalan
-| ceb | Latn | Cebuano
-| ces | Latn | Czech
-| cjk | Latn | Chokwe
-| ckb | Arab | Central Kurdish
-| crh | Latn | Crimean Tatar
-| cym | Latn | Welsh
-| dan | Latn | Danish
-| deu | Latn | German
-| dik | Latn | Southwestern Dinka
-| dyu | Latn | Dyula
-| dzo | Tibt | Dzongkha
-| ell | Grek | Greek
-| eng | Latn | English
-| epo | Latn | Esperanto
-| est | Latn | Estonian
-| eus | Latn | Basque
-| ewe | Latn | Ewe
-| fao | Latn | Faroese
-| fij | Latn | Fijian
-| fin | Latn | Finnish
-| fon | Latn | Fon
-| fra | Latn | French
-| fur | Latn | Friulian
-| fuv | Latn | Nigerian Fulfulde
-| gla | Latn | Scottish Gaelic
-| gle | Latn | Irish
-| 
glg | Latn | Galician -| grn | Latn | Guarani -| guj | Gujr | Gujarati -| hat | Latn | Haitian Creole -| hau | Latn | Hausa -| heb | Hebr | Hebrew -| hin | Deva | Hindi -| hne | Deva | Chhattisgarhi -| hrv | Latn | Croatian -| hun | Latn | Hungarian -| hye | Armn | Armenian -| ibo | Latn | Igbo -| ilo | Latn | Ilocano -| ind | Latn | Indonesian -| isl | Latn | Icelandic -| ita | Latn | Italian -| jav | Latn | Javanese -| jpn | Jpan | Japanese -| kab | Latn | Kabyle -| kac | Latn | Jingpho -| kam | Latn | Kamba -| kan | Knda | Kannada -| kas | Arab | Kashmiri (Arabic script) -| kas | Deva | Kashmiri (Devanagari script) -| kat | Geor | Georgian -| knc | Arab | Central Kanuri (Arabic script) -| knc | Latn | Central Kanuri (Latin script) -| kaz | Cyrl | Kazakh -| kbp | Latn | Kabiyè -| kea | Latn | Kabuverdianu -| khm | Khmr | Khmer -| kik | Latn | Kikuyu -| kin | Latn | Kinyarwanda -| kir | Cyrl | Kyrgyz -| kmb | Latn | Kimbundu -| kmr | Latn | Northern Kurdish -| kon | Latn | Kikongo -| kor | Hang | Korean -| lao | Laoo | Lao -| lij | Latn | Ligurian -| lim | Latn | Limburgish -| lin | Latn | Lingala -| lit | Latn | Lithuanian -| lmo | Latn | Lombard -| ltg | Latn | Latgalian -| ltz | Latn | Luxembourgish -| lua | Latn | Luba-Kasai -| lug | Latn | Ganda -| luo | Latn | Luo -| lus | Latn | Mizo -| lvs | Latn | Standard Latvian -| mag | Deva | Magahi -| mai | Deva | Maithili -| mal | Mlym | Malayalam -| mar | Deva | Marathi -| min | Arab | Minangkabau (Arabic script) -| min | Latn | Minangkabau (Latin script) -| mkd | Cyrl | Macedonian -| plt | Latn | Plateau Malagasy -| mlt | Latn | Maltese -| mni | Beng | Meitei (Bengali script) -| khk | Cyrl | Halh Mongolian -| mos | Latn | Mossi -| mri | Latn | Maori -| mya | Mymr | Burmese -| nld | Latn | Dutch -| nno | Latn | Norwegian Nynorsk -| nob | Latn | Norwegian Bokmål -| npi | Deva | Nepali -| nso | Latn | Northern Sotho -| nus | Latn | Nuer -| nya | Latn | Nyanja -| oci | Latn | Occitan -| gaz | Latn | West Central Oromo -| ory | Orya | Odia -| pag | Latn | Pangasinan -| pan | Guru | Eastern Panjabi -| pap | Latn | Papiamento -| pes | Arab | Western Persian -| pol | Latn | Polish -| por | Latn | Portuguese -| prs | Arab | Dari -| pbt | Arab | Southern Pashto -| quy | Latn | Ayacucho Quechua -| ron | Latn | Romanian -| run | Latn | Rundi -| rus | Cyrl | Russian -| sag | Latn | Sango -| san | Deva | Sanskrit -| sat | Olck | Santali -| scn | Latn | Sicilian -| shn | Mymr | Shan -| sin | Sinh | Sinhala -| slk | Latn | Slovak -| slv | Latn | Slovenian -| smo | Latn | Samoan -| sna | Latn | Shona -| snd | Arab | Sindhi -| som | Latn | Somali -| sot | Latn | Southern Sotho -| spa | Latn | Spanish -| als | Latn | Tosk Albanian -| srd | Latn | Sardinian -| srp | Cyrl | Serbian -| ssw | Latn | Swati -| sun | Latn | Sundanese -| swe | Latn | Swedish -| swh | Latn | Swahili -| szl | Latn | Silesian -| tam | Taml | Tamil -| tat | Cyrl | Tatar -| tel | Telu | Telugu -| tgk | Cyrl | Tajik -| tgl | Latn | Tagalog -| tha | Thai | Thai -| tir | Ethi | Tigrinya -| taq | Latn | Tamasheq (Latin script) -| taq | Tfng | Tamasheq (Tifinagh script) -| tpi | Latn | Tok Pisin -| tsn | Latn | Tswana -| tso | Latn | Tsonga -| tuk | Latn | Turkmen -| tum | Latn | Tumbuka -| tur | Latn | Turkish -| twi | Latn | Twi -| tzm | Tfng | Central Atlas Tamazight -| uig | Arab | Uyghur -| ukr | Cyrl | Ukrainian -| umb | Latn | Umbundu -| urd | Arab | Urdu -| uzn | Latn | Northern Uzbek -| vec | Latn | Venetian -| vie | Latn | Vietnamese -| war | Latn | Waray -| wol | Latn | Wolof -| 
xho | Latn | Xhosa -| ydd | Hebr | Eastern Yiddish -| yor | Latn | Yoruba -| yue | Hant | Yue Chinese -| zho | Hans | Chinese (Simplified) -| zho | Hant | Chinese (Traditional) -| zsm | Latn | Standard Malay -| zul | Latn | Zulu +| ars | Arab | Najdi Arabic +| ary | Arab | Moroccan Arabic +| arz | Arab | Egyptian Arabic +| asm | Beng | Assamese +| ast | Latn | Asturian +| awa | Deva | Awadhi +| ayr | Latn | Central Aymara +| azb | Arab | South Azerbaijani +| azj | Latn | North Azerbaijani +| bak | Cyrl | Bashkir +| bam | Latn | Bambara +| ban | Latn | Balinese +| bel | Cyrl | Belarusian +| bem | Latn | Bemba +| ben | Beng | Bengali +| bho | Deva | Bhojpuri +| bjn | Arab | Banjar (Arabic script) +| bjn | Latn | Banjar (Latin script) +| bod | Tibt | Standard Tibetan +| bos | Latn | Bosnian +| bug | Latn | Buginese +| bul | Cyrl | Bulgarian +| cat | Latn | Catalan +| ceb | Latn | Cebuano +| ces | Latn | Czech +| cjk | Latn | Chokwe +| ckb | Arab | Central Kurdish +| crh | Latn | Crimean Tatar +| cym | Latn | Welsh +| dan | Latn | Danish +| deu | Latn | German +| dik | Latn | Southwestern Dinka +| dyu | Latn | Dyula +| dzo | Tibt | Dzongkha +| ell | Grek | Greek +| eng | Latn | English +| epo | Latn | Esperanto +| est | Latn | Estonian +| eus | Latn | Basque +| ewe | Latn | Ewe +| fao | Latn | Faroese +| fij | Latn | Fijian +| fin | Latn | Finnish +| fon | Latn | Fon +| fra | Latn | French +| fur | Latn | Friulian +| fuv | Latn | Nigerian Fulfulde +| gla | Latn | Scottish Gaelic +| gle | Latn | Irish +| glg | Latn | Galician +| grn | Latn | Guarani +| guj | Gujr | Gujarati +| hat | Latn | Haitian Creole +| hau | Latn | Hausa +| heb | Hebr | Hebrew +| hin | Deva | Hindi +| hne | Deva | Chhattisgarhi +| hrv | Latn | Croatian +| hun | Latn | Hungarian +| hye | Armn | Armenian +| ibo | Latn | Igbo +| ilo | Latn | Ilocano +| ind | Latn | Indonesian +| isl | Latn | Icelandic +| ita | Latn | Italian +| jav | Latn | Javanese +| jpn | Jpan | Japanese +| kab | Latn | Kabyle +| kac | Latn | Jingpho +| kam | Latn | Kamba +| kan | Knda | Kannada +| kas | Arab | Kashmiri (Arabic script) +| kas | Deva | Kashmiri (Devanagari script) +| kat | Geor | Georgian +| knc | Arab | Central Kanuri (Arabic script) +| knc | Latn | Central Kanuri (Latin script) +| kaz | Cyrl | Kazakh +| kbp | Latn | Kabiyè +| kea | Latn | Kabuverdianu +| khm | Khmr | Khmer +| kik | Latn | Kikuyu +| kin | Latn | Kinyarwanda +| kir | Cyrl | Kyrgyz +| kmb | Latn | Kimbundu +| kmr | Latn | Northern Kurdish +| kon | Latn | Kikongo +| kor | Hang | Korean +| lao | Laoo | Lao +| lij | Latn | Ligurian +| lim | Latn | Limburgish +| lin | Latn | Lingala +| lit | Latn | Lithuanian +| lmo | Latn | Lombard +| ltg | Latn | Latgalian +| ltz | Latn | Luxembourgish +| lua | Latn | Luba-Kasai +| lug | Latn | Ganda +| luo | Latn | Luo +| lus | Latn | Mizo +| lvs | Latn | Standard Latvian +| mag | Deva | Magahi +| mai | Deva | Maithili +| mal | Mlym | Malayalam +| mar | Deva | Marathi +| min | Arab | Minangkabau (Arabic script) +| min | Latn | Minangkabau (Latin script) +| mkd | Cyrl | Macedonian +| plt | Latn | Plateau Malagasy +| mlt | Latn | Maltese +| mni | Beng | Meitei (Bengali script) +| khk | Cyrl | Halh Mongolian +| mos | Latn | Mossi +| mri | Latn | Maori +| mya | Mymr | Burmese +| nld | Latn | Dutch +| nno | Latn | Norwegian Nynorsk +| nob | Latn | Norwegian Bokmål +| npi | Deva | Nepali +| nso | Latn | Northern Sotho +| nus | Latn | Nuer +| nya | Latn | Nyanja +| oci | Latn | Occitan +| gaz | Latn | West Central Oromo +| ory | Orya | Odia +| pag | 
Latn | Pangasinan +| pan | Guru | Eastern Panjabi +| pap | Latn | Papiamento +| pes | Arab | Western Persian +| pol | Latn | Polish +| por | Latn | Portuguese +| prs | Arab | Dari +| pbt | Arab | Southern Pashto +| quy | Latn | Ayacucho Quechua +| ron | Latn | Romanian +| run | Latn | Rundi +| rus | Cyrl | Russian +| sag | Latn | Sango +| san | Deva | Sanskrit +| sat | Olck | Santali +| scn | Latn | Sicilian +| shn | Mymr | Shan +| sin | Sinh | Sinhala +| slk | Latn | Slovak +| slv | Latn | Slovenian +| smo | Latn | Samoan +| sna | Latn | Shona +| snd | Arab | Sindhi +| som | Latn | Somali +| sot | Latn | Southern Sotho +| spa | Latn | Spanish +| als | Latn | Tosk Albanian +| srd | Latn | Sardinian +| srp | Cyrl | Serbian +| ssw | Latn | Swati +| sun | Latn | Sundanese +| swe | Latn | Swedish +| swh | Latn | Swahili +| szl | Latn | Silesian +| tam | Taml | Tamil +| tat | Cyrl | Tatar +| tel | Telu | Telugu +| tgk | Cyrl | Tajik +| tgl | Latn | Tagalog +| tha | Thai | Thai +| tir | Ethi | Tigrinya +| taq | Latn | Tamasheq (Latin script) +| taq | Tfng | Tamasheq (Tifinagh script) +| tpi | Latn | Tok Pisin +| tsn | Latn | Tswana +| tso | Latn | Tsonga +| tuk | Latn | Turkmen +| tum | Latn | Tumbuka +| tur | Latn | Turkish +| twi | Latn | Twi +| tzm | Tfng | Central Atlas Tamazight +| uig | Arab | Uyghur +| ukr | Cyrl | Ukrainian +| umb | Latn | Umbundu +| urd | Arab | Urdu +| uzn | Latn | Northern Uzbek +| vec | Latn | Venetian +| vie | Latn | Vietnamese +| war | Latn | Waray +| wol | Latn | Wolof +| xho | Latn | Xhosa +| ydd | Hebr | Eastern Yiddish +| yor | Latn | Yoruba +| yue | Hant | Yue Chinese +| zho | Hans | Chinese (Simplified) +| zho | Hant | Chinese (Traditional) +| zsm | Latn | Standard Malay +| zul | Latn | Zulu diff --git a/python/NllbTranslation/nllb_component/nllb_translation_component.py b/python/NllbTranslation/nllb_component/nllb_translation_component.py index bd658110..7613ad00 100644 --- a/python/NllbTranslation/nllb_component/nllb_translation_component.py +++ b/python/NllbTranslation/nllb_component/nllb_translation_component.py @@ -61,7 +61,7 @@ def get_detections_from_image(self, job: mpf.ImageJob) -> Sequence[mpf.ImageLoca def get_detections_from_audio(self, job: mpf.AudioJob) -> Sequence[mpf.AudioTrack]: logger.info(f'Received audio job.') return self._get_feed_forward_detections(job.job_properties, job.feed_forward_track, video_job=False) - + def get_detections_from_video(self, job: mpf.VideoJob) -> Sequence[mpf.VideoTrack]: logger.info(f'Received video job.') return self._get_feed_forward_detections(job.job_properties, job.feed_forward_track, video_job=True) @@ -127,7 +127,7 @@ def _load_tokenizer(self, config: Dict[str, str]) -> None: src_lang=config.translate_from_language, device_map=self._model.device) elapsed = time.time() - start logger.debug(f"Successfully loaded tokenizer in {elapsed} seconds.") - + def _load_model(self, model_name: str = None, config: Dict[str, str] = None) -> None: try: if model_name is None: @@ -135,10 +135,10 @@ def _load_model(self, model_name: str = None, config: Dict[str, str] = None) -> model_name = DEFAULT_NLLB_MODEL else: model_name = config.nllb_model - + model_path = '/models/' + model_name offload_folder = model_path + '/.weights' - + if os.path.isdir(model_path) and os.path.isfile(os.path.join(model_path, "config.json")): # model is stored locally; we do not need to load the tokenizer here logger.info(f"Loading model from local directory: {model_path}") @@ -154,7 +154,7 @@ def _load_model(self, model_name: str = None, config: 
Dict[str, str] = None) ->
             logger.debug(f"Saving model in {model_path}")
             self._model.save_pretrained(model_path)
             self._tokenizer.save_pretrained(model_path)
-    
+
         except Exception:
             logger.exception(
                 f'Failed to complete job due to the following exception:')
@@ -207,14 +207,18 @@ def _get_translation(self, config: Dict[str, str], text_to_translate: str) -> st
                 0,
                 len,
                 text_splitter_model,
-                wtp_lang)
+                wtp_lang,
+                split_mode=config._sentence_split_mode,
+                newline_behavior=config._newline_behavior)
         else:
             input_text_sentences = TextSplitter.split(
                 text,
                 config.nllb_character_limit,
                 0,
                 len,
-                text_splitter_model)
+                text_splitter_model,
+                split_mode=config._sentence_split_mode,
+                newline_behavior=config._newline_behavior)
 
         text_list = list(input_text_sentences)
         logger.info(f'Input text split into {len(text_list)} sentences.')
@@ -264,6 +268,12 @@ def __init__(self, props: Mapping[str, str], ff_props: Dict[str, str]) -> None:
             ).split(',')
         ]
 
+        self._sentence_split_mode = mpf_util.get_property(
+            props, 'SENTENCE_SPLITTER_MODE', 'DEFAULT')
+
+        self._newline_behavior = mpf_util.get_property(
+            props, 'SENTENCE_SPLITTER_NEWLINE_BEHAVIOR', 'GUESS')
+
         # default model, cached
         self.nllb_model = mpf_util.get_property(props, "NLLB_MODEL", DEFAULT_NLLB_MODEL)
 
@@ -344,7 +354,7 @@ def __init__(self, props: Mapping[str, str], ff_props: Dict[str, str]) -> None:
                 f'Failed to complete job due to the following exception:')
             raise
 
-    
+
         if not self.translate_from_language:
             logger.exception('Unsupported or no source language provided')
             raise mpf.DetectionException(
diff --git a/python/NllbTranslation/plugin-files/descriptor/descriptor.json b/python/NllbTranslation/plugin-files/descriptor/descriptor.json
index 8420e2c1..f4688110 100644
--- a/python/NllbTranslation/plugin-files/descriptor/descriptor.json
+++ b/python/NllbTranslation/plugin-files/descriptor/descriptor.json
@@ -58,7 +58,7 @@
     },
     {
       "name": "SENTENCE_MODEL",
-      "description": "Name of sentence segmentation model. Supported options are spaCy's multilingual `xx_sent_ud_sm` model and the Where's the Point (WtP) `wtp-bert-mini` model.",
+      "description": "Name of sentence segmentation model. Supported options are spaCy's multilingual `xx_sent_ud_sm` model, Segment any Text (SaT) `sat-3l-sm` model, and Where's the Point (WtP) `wtp-bert-mini` model.",
       "type": "STRING",
       "defaultValue": "wtp-bert-mini"
     },
@@ -103,6 +103,18 @@
       "description": "The ISO-15924 language code for language and script that the input text should be translated from.",
       "type": "STRING",
       "defaultValue": ""
+    },
+    {
+      "name": "SENTENCE_SPLITTER_MODE",
+      "description": "Determines how text is split: `DEFAULT` mode splits text into chunks based on the character limit, while `SENTENCE` mode splits text at detected sentence boundaries (which may yield smaller segments) unless the character limit is reached.",
+      "type": "STRING",
+      "defaultValue": "DEFAULT"
+    },
+    {
+      "name": "SENTENCE_SPLITTER_NEWLINE_BEHAVIOR",
+      "description": "The text splitter treats newline characters as sentence boundaries. To prevent this, newlines can be removed from the input text during splitting. 
Valid values are SPACE (replace with space character), REMOVE (remove newlines), NONE (leave newlines as they are), and GUESS (If source language is Chinese or Japanese use REMOVE, else use SPACE).", + "type": "STRING", + "defaultValue": "GUESS" } ] } diff --git a/python/NllbTranslation/tests/test_nllb_translation.py b/python/NllbTranslation/tests/test_nllb_translation.py index e9c66e45..754c70f3 100644 --- a/python/NllbTranslation/tests/test_nllb_translation.py +++ b/python/NllbTranslation/tests/test_nllb_translation.py @@ -112,7 +112,7 @@ def test_audio_job(self): self.assertEqual(self.OUTPUT_0, props["TRANSLATION"]) def test_video_job(self): - + ff_track = mpf.VideoTrack( 0, 1, -1, { @@ -120,7 +120,7 @@ def test_video_job(self): 1: mpf.ImageLocation(0, 10, 10, 10, -1, dict(TRANSCRIPT=self.SAMPLE_2)) }, dict(TEXT=self.SAMPLE_0)) - + #set default props test_generic_job_props: dict[str, str] = dict(self.defaultProps) #load source language @@ -161,8 +161,8 @@ def test_plaintext_job(self): test_generic_job_props['DEFAULT_SOURCE_LANGUAGE'] = 'deu' test_generic_job_props['DEFAULT_SOURCE_SCRIPT'] = 'Latn' - job = mpf.GenericJob('Test Plaintext', - str(Path(__file__).parent / 'data' / 'translation.txt'), + job = mpf.GenericJob('Test Plaintext', + str(Path(__file__).parent / 'data' / 'translation.txt'), test_generic_job_props, {}) result_track: Sequence[mpf.GenericTrack] = self.component.get_detections_from_generic(job) @@ -185,7 +185,7 @@ def test_translate_first_ff_property(self): 1: mpf.ImageLocation(0, 10, 10, 10, -1, dict(TEXT=self.SAMPLE_0,TRANSCRIPT=self.SAMPLE_2)) }, dict(TRANSCRIPT=self.SAMPLE_0)) - + job = mpf.VideoJob('Test Video', 'test.mp4', 0, 1, test_generic_job_props, @@ -247,7 +247,7 @@ def test_translate_all_ff_properties(self): frame_2_props = result[0].frame_locations[2].detection_properties self.assertNotIn("OTHER TRANSLATION", frame_2_props) self.assertIn("OTHER", frame_2_props) - + def test_translate_first_frame_location_property(self): # set default props test_generic_job_props: dict[str, str] = dict(self.defaultProps) @@ -264,7 +264,7 @@ def test_translate_first_frame_location_property(self): 0: mpf.ImageLocation(0, 0, 10, 10, -1, dict(OTHER_PROPERTY="Other prop text", TEXT=self.SAMPLE_1)), 1: mpf.ImageLocation(0, 10, 10, 10, -1, dict(TRANSCRIPT=self.SAMPLE_2)) }) - + job = mpf.VideoJob('Test Video', 'test.mp4', 0, 1, test_generic_job_props, @@ -388,7 +388,7 @@ def test_feed_forward_language(self): #set default props test_generic_job_props: dict[str, str] = dict(self.defaultProps) - ff_track = mpf.GenericTrack(-1, dict(TEXT=self.SAMPLE_0, + ff_track = mpf.GenericTrack(-1, dict(TEXT=self.SAMPLE_0, LANGUAGE='deu', ISO_SCRIPT='Latn')) job = mpf.GenericJob('Test Generic', 'test.pdf', test_generic_job_props, {}, ff_track) @@ -401,7 +401,7 @@ def test_eng_to_eng_translation(self): #set default props test_generic_job_props: dict[str, str] = dict(self.defaultProps) - ff_track = mpf.GenericTrack(-1, dict(TEXT='This is English text that should not be translated.', + ff_track = mpf.GenericTrack(-1, dict(TEXT='This is English text that should not be translated.', LANGUAGE='eng', ISO_SCRIPT='Latn')) job = mpf.GenericJob('Test Generic', 'test.pdf', test_generic_job_props, {}, ff_track) @@ -476,6 +476,8 @@ def test_paragraph_split_job(self): #load source language test_generic_job_props['DEFAULT_SOURCE_LANGUAGE'] = 'por' test_generic_job_props['DEFAULT_SOURCE_SCRIPT'] = 'Latn' + test_generic_job_props['SENTENCE_SPLITTER_MODE'] = 'DEFAULT' + 
test_generic_job_props['SENTENCE_SPLITTER_NEWLINE_BEHAVIOR'] = 'GUESS' # excerpt from https://www.gutenberg.org/ebooks/16443 pt_text="""Teimam de facto estes em que são indispensaveis os vividos raios do @@ -496,7 +498,7 @@ def test_paragraph_split_job(self): satisfeitos do mundo, satisfeitos dos homens e, muito especialmente, satisfeitos de si. """ - pt_text_translation = "They fear, indeed, those in whom the vivid rays of our unblinking sun, or the unclouded face of the moon in the peninsular firmament, where it has not, like that of London--to break at the cost of a plumbeo heaven--are indispensable, to pour joy into the soul and send to the semblances the reflection of them; they imagine fatally pursued from _spleen_, hopelessly gloomy and sullen, as if at every moment they were emerging from the subterranean galleries of a pit-coal mine, our British allies. How they deceive themselves or how they intend to deceive us! This is an illusion or bad faith, against which much is vainly complained the unlevel and accentuated expression of bliss, which shines through on the face. The European Parliament has been a great help to the people of Europe in the past, and it is a great help to us in the present." + pt_text_translation = "They fear, indeed, those in whom the vivid rays of our unblinking sun, or the unclouded face of the moon in the peninsular firmament, where it has not, like that of London--to break at the cost of a plumbeo heaven--are indispensable, to pour joy into the soul and send to the semblances the reflection of them; they imagine fatally pursued from _spleen_, hopelessly gloomy and dreary, as if every moment they came out of the underground galleries of a pit-coal mine, How they deceive or how they intend to deceive us! is this an illusion or bad faith, against which there is much claim in vain the indelevel and accented expression of beatitude, which shines on the illuminated face of the men from beyond the Manch, who seem to walk among us, wrapped in dense atmosphere of perennial contentment, satisfied The European Union is a global community of nations, which is not only a community of nations, but also a community of nations." ff_track = mpf.GenericTrack(-1, dict(TEXT=pt_text)) job = mpf.GenericJob('Test Generic', 'test.pdf', test_generic_job_props, {}, ff_track) @@ -505,6 +507,30 @@ def test_paragraph_split_job(self): result_props: dict[str, str] = result_track[0].detection_properties self.assertEqual(pt_text_translation, result_props["TRANSLATION"]) + + test_generic_job_props['SENTENCE_SPLITTER_MODE'] = 'SENTENCE' + test_generic_job_props['SENTENCE_SPLITTER_NEWLINE_BEHAVIOR'] = 'GUESS' + + pt_text_translation = "They fear, indeed, those in whom the vivid rays of our unblinking sun, or the unclouded face of the moon in the peninsular firmament, where it has not, like that of London--to break at the cost of a plumbeo heaven--are indispensable to pour joy into the soul and send to the countenances the reflection of them; They imagine themselves fatally haunted by spleen, hopelessly gloomy and sullen, as if at every moment they were emerging from the underground galleries of a pit-coal mine, Our British allies. How they deceive themselves or how they intend to deceive us! 
Is this an illusion or bad faith, against which there is much to be lamented in vain the indelevel and accentuated expression of beatitude, which shines through the illuminated faces of the men from beyond the Channel, who seem to walk among us, wrapped in a dense atmosphere of perenne contentment, satisfied with the world, satisfied with men and, very especially, satisfied with themselves? i. the" + ff_track = mpf.GenericTrack(-1, dict(TEXT=pt_text)) + job = mpf.GenericJob('Test Generic', 'test.pdf', test_generic_job_props, {}, ff_track) + result_track: Sequence[mpf.GenericTrack] = self.component.get_detections_from_generic(job) + + result_props: dict[str, str] = result_track[0].detection_properties + self.assertEqual(pt_text_translation, result_props["TRANSLATION"]) + + + test_generic_job_props['SENTENCE_SPLITTER_MODE'] = 'DEFAULT' + test_generic_job_props['SENTENCE_SPLITTER_NEWLINE_BEHAVIOR'] = 'NONE' + + pt_text_translation = "They fear, indeed, those in whom the vivid rays of our unblinking sun, or the unclouded face of the moon in the peninsular firmament, where it has not, like that of London--to break at the cost of a plumbeo heaven--are indispensable, to pour joy into the soul and send to the semblances the reflection of them; they imagine fatally pursued from _spleen_, hopelessly gloomy and sullen, as if at every moment they were emerging from the subterranean galleries of a pit-coal mine, our British allies. How they deceive themselves or how they intend to deceive us! This is an illusion or bad faith, against which much is vainly complained the unlevel and accentuated expression of bliss, which shines through on the face. The European Parliament has been a great help to the people of Europe in the past, and it is a great help to us in the present." 
+ ff_track = mpf.GenericTrack(-1, dict(TEXT=pt_text)) + job = mpf.GenericJob('Test Generic', 'test.pdf', test_generic_job_props, {}, ff_track) + result_track: Sequence[mpf.GenericTrack] = self.component.get_detections_from_generic(job) + + result_props: dict[str, str] = result_track[0].detection_properties + self.assertEqual(pt_text_translation, result_props["TRANSLATION"]) + def test_wtp_with_flores_iso_lookup(self): #set default props test_generic_job_props: dict[str, str] = dict(self.defaultProps) @@ -612,11 +638,11 @@ def test_should_translate(self): self.assertFalse(should_translate("꩐꩑꩒꩓꩔꩕꩖꩗꩘꩙")) # Cham digits (\uAA50-\uAA59) self.assertFalse(should_translate("꯰꯱꯲꯳꯴꯵꯶꯷꯸꯹")) # Meetei Mayek digits (\uABF0-\uABF9) self.assertFalse(should_translate("0123456789")) # Full width digits (\uFF10-\uFF19) - + with self.subTest('Letter_Number: a letterlike numeric character'): letter_numbers = "ᛮᛯᛰⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫⅬⅭⅮⅯⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹⅺⅻⅼⅽⅾⅿↀↁↂↅↆↇↈ〇〡〢〣〤〥〦〧〨〩〸〹〺ꛦꛧꛨꛩꛪꛫꛬꛭꛮꛯ" self.assertFalse(should_translate(letter_numbers)) - + with self.subTest('Other_Number: a numeric character of other type'): other_numbers1 = "²³¹¼½¾৴৵৶৷৸৹୲୳୴୵୶୷௰௱௲౸౹౺౻౼౽౾൘൙൚൛൜൝൞൰൱൲൳൴൵൶൷൸༪༫༬༭༮༯༰༱༲༳፩፪፫፬፭፮፯፰፱፲፳፴፵፶፷፸፹፺፻፼" other_numbers2 = "៰៱៲៳៴៵៶៷៸៹᧚⁰⁴⁵⁶⁷⁸⁹₀₁₂₃₄₅₆₇₈₉⅐⅑⅒⅓⅔⅕⅖⅗⅘⅙⅚⅛⅜⅝⅞⅟↉①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳" @@ -854,7 +880,7 @@ def test_wtp_iso_conversion(self): self.assertEqual(WtpLanguageSettings.convert_to_iso(NllbLanguageMapper.get_normalized_iso('zul_Latn')), 'zu') # languages supported by NLLB but not supported by WTP Splitter - self.assertIsNone(WtpLanguageSettings.convert_to_iso(NllbLanguageMapper.get_normalized_iso('aka_Latn'))) # 'ak' Akan + self.assertIsNone(WtpLanguageSettings.convert_to_iso(NllbLanguageMapper.get_normalized_iso('aka_Latn'))) # 'ak' Akan self.assertIsNone(WtpLanguageSettings.convert_to_iso(NllbLanguageMapper.get_normalized_iso('bem_Latn'))) # 'sw' Bemba self.assertIsNone(WtpLanguageSettings.convert_to_iso(NllbLanguageMapper.get_normalized_iso('bod_Tibt'))) # 'bo' Tibetan self.assertIsNone(WtpLanguageSettings.convert_to_iso(NllbLanguageMapper.get_normalized_iso('bos_Latn'))) # 'bs' Bosnian
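Taken together, the changes above thread two new keyword arguments, `split_mode` and `newline_behavior`, from the job properties down to `TextSplitter.split` in both components. Below is a minimal sketch of how the updated splitter might be exercised on its own. It is inferred from the call sites in this diff: the `nlp_text_splitter` import path and the positional parameter order (text, character limit, boundary character count, character-count function, model, language) are assumptions based on the code shown here, not a verified public API:

```python
# Sketch only: mirrors the TextSplitter.split call sites in this diff.
# The import path below is assumed from the "nlp_text_splitter utility"
# named in the LICENSE files; adjust it to the actual package layout.
from nlp_text_splitter import TextSplitter, TextSplitterModel

# Same constructor arguments used in test_acs_translation.py:
# (model name, device, default adaptor language).
model = TextSplitterModel("sat-3l-sm", "cpu", "en")

text = "First sentence. Second sentence.\nThird sentence after a newline."

chunks = list(TextSplitter.split(
    text,
    40,    # maximum characters per chunk (e.g. NLLB character limit)
    0,     # boundary characters to inspect, as in the NLLB call site
    len,   # character-count function; Azure uses get_azure_char_count instead
    model,
    "en",  # optional input language hint (SENTENCE_SPLITTER_INCLUDE_INPUT_LANG)
    split_mode="SENTENCE",      # SENTENCE_SPLITTER_MODE: DEFAULT | SENTENCE
    newline_behavior="GUESS"))  # SENTENCE_SPLITTER_NEWLINE_BEHAVIOR: SPACE | REMOVE | NONE | GUESS

for chunk in chunks:
    print(repr(chunk))
```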