25 changes: 14 additions & 11 deletions python/AzureTranslation/README.md
@@ -87,25 +87,28 @@ must be provided. Neither has a default value.
The following settings control the behavior of dividing input text into acceptable chunks
for processing.

-Through preliminary investigation, we identified the [WtP library ("Where's the
+Through preliminary investigation, we identified the [SaT/WtP library ("Segment any Text" / "Where's the
Point")](https://github.com/bminixhofer/wtpsplit) and [spaCy's multilingual sentence
detection model](https://spacy.io/models) for identifying sentence breaks
in a large section of text.

-WtP models are trained to split up multilingual text by sentence without the need of an
+SaT/WtP models are trained to split up multilingual text by sentence without the need of an
input language tag. The disadvantage is that the most accurate WtP models will need ~3.5
-GB of GPU memory. On the other hand, spaCy has a single multilingual sentence detection
+GB of GPU memory. SaT models are a more recent addition and are considered a more accurate
+set of sentence segmentation models; their resource costs are similar to WtP's.
+
+On the other hand, spaCy has a single multilingual sentence detection model
that appears to work better for splitting up English text in certain cases; unfortunately,
this model lacks support for handling Chinese punctuation.
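
For reference, here is a minimal sketch of how each library splits raw text. This is illustrative only (not this component's code); it follows the `wtpsplit` and spaCy documentation and assumes both libraries are installed, along with the `xx_sent_ud_sm` model:

```python
# Illustrative sketch; assumes: pip install wtpsplit spacy
# and: python -m spacy download xx_sent_ud_sm
from wtpsplit import SaT
import spacy

text = "This is a test This is another test."

# SaT/WtP: splits multilingual text without an input language tag.
# The model name matches the SENTENCE_MODEL values described below.
sat = SaT("sat-3l-sm")
print(sat.split(text))

# spaCy: the single multilingual sentence detection model.
nlp = spacy.load("xx_sent_ud_sm")
print([sent.text for sent in nlp(text).sents])
```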

-- `SENTENCE_MODEL`: Specifies the desired WtP or spaCy sentence detection model. For CPU
-  and runtime considerations, the author of WtP recommends using `wtp-bert-mini`. More
-  advanced WtP models that use GPU resources (up to ~8 GB) are also available. See list of
-  WtP model names
+- `SENTENCE_MODEL`: Specifies the desired SaT/WtP or spaCy sentence detection model. For CPU
+  and runtime considerations, the authors of SaT/WtP recommend using `sat-3l-sm` or `wtp-bert-mini`.
+  More advanced SaT/WtP models that use GPU resources (up to ~8 GB for WtP) are also available.
+  See the list of model names
[here](https://github.com/bminixhofer/wtpsplit?tab=readme-ov-file#available-models). The
only available spaCy model (for text with unknown language) is `xx_sent_ud_sm`.

-Review the list of languages supported by WtP
+Review the list of languages supported by SaT/WtP
[here](https://github.com/bminixhofer/wtpsplit?tab=readme-ov-file#supported-languages).
Review models and languages supported by spaCy [here](https://spacy.io/models).

@@ -116,15 +119,15 @@ this model lacks support for handling Chinese punctuation.
[here](https://discourse.mozilla.org/t/proposal-sentences-lenght-limit-from-14-words-to-100-characters).

- `SENTENCE_SPLITTER_INCLUDE_INPUT_LANG`: Specifies whether to pass the input language to the
-  sentence splitter algorithm. Currently, only WtP supports model threshold adjustments by
+  sentence splitter algorithm. Currently, only SaT/WtP supports model threshold adjustments by
  input language (see the sketch after this list).

- `SENTENCE_MODEL_CPU_ONLY`: If set to TRUE, only use CPU resources for the sentence
  detection model. If set to FALSE, allow the sentence model to also use GPU resources.
-  For most runs using spaCy `xx_sent_ud_sm` or `wtp-bert-mini` models, GPU resources
+  For most runs using the spaCy `xx_sent_ud_sm`, `sat-3l-sm`, or `wtp-bert-mini` models, GPU resources
  are not required. If using more advanced WtP models like `wtp-canine-s-12l`,
  it is recommended to set `SENTENCE_MODEL_CPU_ONLY=FALSE` to improve performance.
-  That model can use up to ~3.5 GB of GPU memory.
+  That WtP model can use up to ~3.5 GB of GPU memory (see the sketch below).

Please note that to fully enable this option, you must also rebuild the Docker container
with the following change: within the Dockerfile, set `ARG BUILD_TYPE=gpu`.
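
As referenced in the two properties above, the following is a hedged sketch of how an input language and GPU placement are passed to the underlying `wtpsplit` models. The calls follow the `wtpsplit` README; the values shown are illustrative and are not this component's wiring:

```python
# Illustrative sketch following the wtpsplit README; not this component's code.
from wtpsplit import SaT, WtP

# SENTENCE_SPLITTER_INCLUDE_INPUT_LANG: WtP models can adjust their
# splitting threshold when given the input language code.
wtp = WtP("wtp-bert-mini")
sentences = wtp.split("兵者,國之大事。死生之地,存亡之道。", lang_code="zh")

# SENTENCE_MODEL_CPU_ONLY=FALSE: larger models benefit from GPU resources.
sat = SaT("sat-3l-sm")
sat.half().to("cuda")  # requires a CUDA-enabled build of PyTorch
```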
@@ -95,7 +95,7 @@
    },
    {
      "name": "SENTENCE_MODEL",
-     "description": "Name of the sentence segmentation model. Supported options are spaCy's multilingual `xx_sent_ud_sm` model and the Where's the Point (WtP) `wtp-bert-mini` model.",
+     "description": "Name of the sentence segmentation model. Supported options are spaCy's multilingual `xx_sent_ud_sm` model, the Segment any Text (SaT) `sat-3l-sm` model, and the Where's the Point (WtP) `wtp-bert-mini` model.",
      "type": "STRING",
      "defaultValue": "wtp-bert-mini"
    },
@@ -107,7 +107,7 @@
    },
    {
      "name": "SENTENCE_MODEL_WTP_DEFAULT_ADAPTOR_LANGUAGE",
-     "description": "More advanced WTP models will require a target language. This property sets the default language to use for sentence splitting, unless `FROM_LANGUAGE`, `SUGGESTED_FROM_LANGUAGE`, or Azure language detection returns a different, WtP-supported language option.",
+     "description": "More advanced SaT/WtP models will require a target language. This property sets the default language to use for sentence splitting, unless `FROM_LANGUAGE`, `SUGGESTED_FROM_LANGUAGE`, or Azure language detection returns a different, SaT/WtP-supported language option.",
      "type": "STRING",
      "defaultValue": "en"
    },
75 changes: 75 additions & 0 deletions python/AzureTranslation/tests/test_acs_translation.py
@@ -65,12 +65,14 @@ class TestAcsTranslation(unittest.TestCase):

    mock_server: ClassVar['MockServer']
    wtp_model: ClassVar['TextSplitterModel']
    sat_model: ClassVar['TextSplitterModel']
    spacy_model: ClassVar['TextSplitterModel']

    @classmethod
    def setUpClass(cls):
        cls.mock_server = MockServer()
        cls.wtp_model = TextSplitterModel("wtp-bert-mini", "cpu", "en")
        cls.sat_model = TextSplitterModel("sat-3l-sm", "cpu", "en")
        cls.spacy_model = TextSplitterModel("xx_sent_ud_sm", "cpu", "en")


@@ -669,6 +671,79 @@ def test_split_wtp_unknown_lang(self, _):
                      'Spaces should be kept due to incorrect language detection.')


    @mock.patch.object(TranslationClient, 'DETECT_MAX_CHARS', new_callable=lambda: 150)
    def test_split_sat_unknown_lang(self, _):
        # Check that the text splitter does not have an issue
        # processing an unknown detected language.
        self.set_results_file('invalid-lang-detect-result.json')
        self.set_results_file('split-sentence/art-of-war-translation-1.json')
        self.set_results_file('split-sentence/art-of-war-translation-2.json')
        self.set_results_file('split-sentence/art-of-war-translation-3.json')
        self.set_results_file('split-sentence/art-of-war-translation-4.json')

        text = (TEST_DATA / 'split-sentence/art-of-war.txt').read_text()
        detection_props = dict(TEXT=text)
        TranslationClient(get_test_properties(), self.sat_model).add_translations(detection_props)

        self.assertEqual(5, len(detection_props))
        self.assertEqual(text, detection_props['TEXT'])

        expected_translation = (TEST_DATA / 'split-sentence/art-war-translation.txt') \
            .read_text().strip()
        self.assertEqual(expected_translation, detection_props['TRANSLATION'])
        self.assertEqual('EN', detection_props['TRANSLATION TO LANGUAGE'])

        self.assertEqual('fake-lang', detection_props['TRANSLATION SOURCE LANGUAGE'])
        self.assertAlmostEqual(
            1.0, float(detection_props['TRANSLATION SOURCE LANGUAGE CONFIDENCE']))

        detect_request_text = self.get_request_body()[0]['Text']
        self.assertEqual(text[0:TranslationClient.DETECT_MAX_CHARS], detect_request_text)

        expected_chunk_lengths = [88, 118, 116, 106]
        self.assertEqual(sum(expected_chunk_lengths), len(text))

        # Due to an incorrect language detection, newlines are
        # not properly replaced for Chinese text, and
        # additional whitespace is present in the text.
        # This alters the behavior of SaT sentence splitting.
        translation_request1 = self.get_request_body()[0]['Text']
        self.assertEqual(expected_chunk_lengths[0], len(translation_request1))
        self.assertTrue(translation_request1.startswith('兵者,'))
        self.assertTrue(translation_request1.endswith('而不危也;'))
        self.assertNotIn('\n', translation_request1,
                         'Newlines were not properly removed')
        self.assertIn(' ', translation_request1,
                      'Spaces should be kept due to incorrect language detection.')

        translation_request2 = self.get_request_body()[0]['Text']
        self.assertEqual(expected_chunk_lengths[1], len(translation_request2))
        self.assertTrue(translation_request2.startswith('天者,陰陽'))
        self.assertTrue(translation_request2.endswith('兵眾孰強?'))
        self.assertNotIn('\n', translation_request2,
                         'Newlines were not properly removed')
        self.assertIn(' ', translation_request2,
                      'Spaces should be kept due to incorrect language detection.')

        translation_request3 = self.get_request_body()[0]['Text']
        self.assertEqual(expected_chunk_lengths[2], len(translation_request3))
        self.assertTrue(translation_request3.startswith('士卒孰練?'))
        self.assertTrue(translation_request3.endswith('亂而取之, '))
        self.assertNotIn('\n', translation_request3,
                         'Newlines were not properly removed')
        self.assertIn(' ', translation_request3,
                      'Spaces should be kept due to incorrect language detection.')

        translation_request4 = self.get_request_body()[0]['Text']
        self.assertEqual(expected_chunk_lengths[3], len(translation_request4))
        self.assertTrue(translation_request4.startswith('實而備之,'))
        self.assertTrue(translation_request4.endswith('勝負見矣。 '))
        self.assertNotIn('\n', translation_request4,
                         'Newlines were not properly removed')
        self.assertIn(' ', translation_request4,
                      'Spaces should be kept due to incorrect language detection.')


    def test_newline_removal(self):

        def replace(text):