diff --git a/LICENSE b/LICENSE
index c4d1011..f79bf7b 100644
--- a/LICENSE
+++ b/LICENSE
@@ -25,13 +25,14 @@ The nlp_text_splitter utility uses the following sentence detection libraries:
 
 *****************************************************************************
 
-The WtP, "Where the Point", sentence segmentation library falls under the MIT License:
+The WtP ("Where's the Point") and SaT ("Segment any Text") sentence segmentation
+libraries fall under the MIT License:
 
-https://github.com/bminixhofer/wtpsplit/blob/main/LICENSE
+https://github.com/segment-any-text/wtpsplit/blob/main/LICENSE
 
 MIT License
 
-Copyright (c) 2024 Benjamin Minixhofer
+Copyright (c) 2024 Benjamin Minixhofer, Markus Frohmann, Igor Sterner
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/detection/nlp_text_splitter/README.md b/detection/nlp_text_splitter/README.md
index e8c7d14..f050a41 100644
--- a/detection/nlp_text_splitter/README.md
+++ b/detection/nlp_text_splitter/README.md
@@ -1,8 +1,8 @@
 # Overview
 
 This directory contains the source code, test examples, and installation script
-for the OpenMPF NlpTextSplitter tool, which uses WtP and spaCy libraries
-to detect sentences in a given chunk of text.
+for the OpenMPF NlpTextSplitter tool, which uses **SaT (Segment any Text)**,
+**WtP (Where's the Point)**, and **spaCy** to detect sentences in a given chunk of text.
 
 # Background
 
@@ -10,16 +10,48 @@ Our primary motivation for creating this tool was to find a lightweight, accurate
 sentence detection capability to support a large variety of text processing tasks
 including translation and tagging.
 
-Through preliminary investigation, we identified the [WtP library ("Where's the
-Point")](https://github.com/bminixhofer/wtpsplit) and [spaCy's multilingual sentence
+Through preliminary investigation, we identified the [WtP/SaT library ("Where's the
+Point"/"Segment any Text")](https://github.com/bminixhofer/wtpsplit) and [spaCy's multilingual sentence
 detection model](https://spacy.io/models) for identifying sentence breaks in a large
 section of text.
 
 WtP models are trained to split up multilingual text by sentence without the need
 of an input language tag. The disadvantage is that the most accurate WtP models will need ~3.5
-GB of GPU memory. On the other hand, spaCy has a single multilingual sentence detection
-that appears to work better for splitting up English text in certain cases. Unfortunately
-this model lacks support handling for Chinese punctuation.
+GB of GPU memory. SaT is the newer successor to WtP from the same authors and
+generally offers better accuracy and efficiency.
+
+On the other hand, spaCy has a single multilingual sentence detection model
+that appears to work better for splitting up English text in certain cases.
+
+This component has been updated to use the Azure Translation Component's NewLineBehavior class
+to replace newlines with whitespace, or remove them altogether, based on the detected script.
+
+We need to consider the script/character encoding because some languages assign different
+meanings to whitespace between words. For instance, in Chinese, `电脑` means `computer`,
+but `电 脑` means `electricity brain`.
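+
+As a concrete illustration of the `GUESS` behavior described below, here is a minimal
+sketch (assuming the `nlp_text_splitter.newline_behavior` module added in this patch is
+importable) showing how a single newline is resolved differently per script:
+
+```
+from nlp_text_splitter.newline_behavior import NewLineBehavior
+
+# GUESS infers the separator from the script of the first alphabetic character.
+guess = NewLineBehavior.get('GUESS')
+
+# Space-separated script: the newline becomes a single space.
+print(guess('first line\nsecond line', None))  # 'first line second line'
+
+# Chinese script: the newline is removed so no spurious space splits a word.
+print(guess('电\n脑', None))  # '电脑'
+```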
+
+When calling the NLP text splitter, users can adjust the following parameters to control
+sentence-splitting behavior:
+
+- `split_mode`: set to `DEFAULT` to split by chunk size, or `SENTENCE` to split by sentences.
+
+- `newline_behavior`: controls how newlines in the submitted input text are handled. Options include:
+  - `GUESS` to choose `' '` for space-separated scripts and `''` for scripts written without
+    spaces between words (e.g., Chinese, Japanese).
+  - `SPACE` to always replace a newline with a single space.
+  - `REMOVE` to always remove a newline (no space).
+  - `NONE` to leave newlines unchanged.
+
+For instance:
+```
+    result = list(TextSplitter.split(input_text,
+                                     ...
+                                     self.sat_model,
+                                     split_mode='DEFAULT',
+                                     newline_behavior='NONE'))
+```
+This will attempt to split using an SaT model with the default chunking parameters and no
+newline adjustments.
+
 
 # Installation
 
@@ -40,12 +72,13 @@ Please note that several customizations are supported:
   setup a PyTorch installation with CUDA (GPU) libraries.
 
 - `--wtp-models-dir <path>|-m <path>`: Add this parameter to
-  change the default WtP model installation directory
+  change the default WtP/SaT model installation directory
   (default: `/opt/wtp/models`).
 
 - `--install-wtp-model|-w <model name>`: Add this parameter to specify
-  additional WTP models for installation. This parameter can be provided
-  multiple times to install more than one model.
+  additional WtP/SaT models for installation. Accepts both WtP names
+  (e.g., `wtp-bert-mini`) and SaT names (e.g., `sat-3l-sm`).
+  This parameter can be provided multiple times to install more than one model.
 
 - `--install-spacy-model|-s <model name>`: Add this parameter to specify
   additional spaCy models for installation. This parameter can be provided
diff --git a/detection/nlp_text_splitter/install.sh b/detection/nlp_text_splitter/install.sh
index 38d1f5c..749b682 100755
--- a/detection/nlp_text_splitter/install.sh
+++ b/detection/nlp_text_splitter/install.sh
@@ -7,11 +7,11 @@
 # under contract, and is subject to the Rights in Data-General Clause       #
 # 52.227-14, Alt. IV (DEC 2007).                                            #
 #                                                                           #
-# Copyright 2024 The MITRE Corporation. All Rights Reserved.                #
+# Copyright 2025 The MITRE Corporation. All Rights Reserved.                #
 #############################################################################
 
 #############################################################################
-# Copyright 2024 The MITRE Corporation                                      #
+# Copyright 2025 The MITRE Corporation                                      #
 #                                                                           #
 # Licensed under the Apache License, Version 2.0 (the "License");           #
 # you may not use this file except in compliance with the License.          #
@@ -37,7 +37,7 @@ main() {
     fi
     eval set -- "$options"
     local wtp_models_dir=/opt/wtp/models
-    local wtp_models=("wtp-bert-mini")
+    local wtp_models=("wtp-bert-mini" "sat-3l-sm")
    local spacy_models=("xx_sent_ud_sm")
     while true; do
         case "$1" in
@@ -107,10 +107,20 @@ download_wtp_models() {
 
     for model_name in "${model_names[@]}"; do
         echo "Downloading the $model_name model to $wtp_models_dir."
-        local wtp_model_dir="$wtp_models_dir/$model_name"
+        local model_dir="$wtp_models_dir/$model_name"
+
+        # Decide which Hugging Face org to use based on the model prefix:
+        #   - WtP: benjamin/
+        #   - SaT: segment-any-text/
+        local hf_owner="benjamin"
+        case "$model_name" in
+            sat-*) hf_owner="segment-any-text" ;;
+        esac
+
         python3 -c \
             "from huggingface_hub import snapshot_download; \
-            snapshot_download('benjamin/$model_name', local_dir='$wtp_model_dir')"
+            snapshot_download(repo_id='${hf_owner}/${model_name}', local_dir='${model_dir}')"
+
     done
 }
 
@@ -149,12 +159,12 @@ Options
 --text-splitter-dir, -t <path>: Path to text splitter source code.
    (defaults to the same directory as this script)
 --gpu, -g: Install the GPU version of PyTorch
---wtp-models-dir <path>, -m <path>: Path where WTP models will be stored.
+--wtp-models-dir <path>, -m <path>: Path where WtP/SaT models will be stored.
    (defaults to /opt/wtp/models)
---install-wtp-model, -w <model name>: Name of a WTP model to install in addtion to wtp-bert-mini.
+--install-wtp-model, -w <model name>: Name of a WtP or SaT model to install in addition to 'wtp-bert-mini' and 'sat-3l-sm'.
    This option can be provided more than once to specify multiple models.
---install-spacy-model | -s <model name>: Names of a spaCy model to install in addtion to
+--install-spacy-model | -s <model name>: Name of a spaCy model to install in addition to
    xx_sent_ud_sm. The option can be provided more than once to specify multiple models.
 "
diff --git a/detection/nlp_text_splitter/nlp_text_splitter/__init__.py b/detection/nlp_text_splitter/nlp_text_splitter/__init__.py
index 3913b9a..804dc83 100644
--- a/detection/nlp_text_splitter/nlp_text_splitter/__init__.py
+++ b/detection/nlp_text_splitter/nlp_text_splitter/__init__.py
@@ -5,11 +5,11 @@
 # under contract, and is subject to the Rights in Data-General Clause       #
 # 52.227-14, Alt. IV (DEC 2007).                                            #
 #                                                                           #
-# Copyright 2024 The MITRE Corporation. All Rights Reserved.                #
+# Copyright 2025 The MITRE Corporation. All Rights Reserved.                #
 #############################################################################
 
 #############################################################################
-# Copyright 2024 The MITRE Corporation                                      #
+# Copyright 2025 The MITRE Corporation                                      #
 #                                                                           #
 # Licensed under the Apache License, Version 2.0 (the "License");           #
 # you may not use this file except in compliance with the License.          #
@@ -30,35 +30,36 @@
 from importlib.resources.abc import Traversable
 
 import spacy
-from wtpsplit import WtP
-from typing import Callable, List, Optional, Tuple
-
-from .wtp_lang_settings import WtpLanguageSettings
-
 import torch
+from wtpsplit import WtP, SaT
+from typing import Callable, List, Optional, Tuple, Union
+
+from .wtp_lang_settings import WtpLanguageSettings
+from .newline_behavior import NewLineBehavior
 
 DEFAULT_WTP_MODELS = "/opt/wtp/models"
 
 # If we want to package model installation with this utility in the future:
-WTP_MODELS_PATH: Traversable = importlib.resources.files(__name__) / 'models'
+MODELS_PATH: Traversable = importlib.resources.files(__name__) / 'models'
 
 log = logging.getLogger(__name__)
 
+
 # These models must have a specified language during sentence splitting.
-WTP_MANDATORY_ADAPTOR = ['wtp-canine-s-1l',
-                         'wtp-canine-s-3l',
-                         'wtp-canine-s-6l',
-                         'wtp-canine-s-9l',
-                         'wtp-canine-s-12l']
+WTP_MANDATORY_ADAPTOR = {
+    'wtp-canine-s-1l',
+    'wtp-canine-s-3l',
+    'wtp-canine-s-6l',
+    'wtp-canine-s-9l',
+    'wtp-canine-s-12l',
+}
 
-GPU_AVAILABLE = False
-if torch.cuda.is_available():
-    GPU_AVAILABLE = True
+GPU_AVAILABLE = torch.cuda.is_available()
 
 
 class TextSplitterModel:
-    # To hold spaCy, WtP, and other potential sentence detection models in cache
+    # To hold spaCy, WtP, SaT, and other potential sentence detection models in cache
 
     def __init__(self, model_name: str, model_setting: str, default_lang: str = "en") -> None:
         self._model_name = ""
@@ -68,68 +69,95 @@ def __init__(self, model_name: str, model_setting: str, default_lang: str = "en"
         self.split = lambda t, **param: [t]
         self.update_model(model_name, model_setting, default_lang)
 
-    def update_model(self, model_name: str, model_setting: str = "cpu", default_lang: str="en"):
-        if model_name:
-            if "wtp" in model_name:
-                self._update_wtp_model(model_name, model_setting, default_lang)
-                self.split = self._split_wtp
-                log.info(f"Setup WtP model: {model_name}")
-            else:
-                self._update_spacy_model(model_name)
-                self.split = self._split_spacy
-                log.info(f"Setup spaCy model: {model_name}")
-
-    def _update_wtp_model(self, wtp_model_name: str,
-                          model_setting: str,
-                          default_lang: str) -> None:
+    def update_model(self, model_name: str, model_setting: str = "cpu", default_lang: str = "en"):
+        if not model_name:
+            return
+
+        lower_name = model_name.lower()
+        if lower_name.startswith("wtp"):
+            self._update_wtp_model(model_name, model_setting, default_lang)
+            self.split = self._split_wtp
+            log.info(f"Setup WtP model: {model_name}")
+        elif lower_name.startswith("sat"):
+            self._update_sat_model(model_name, model_setting, default_lang)
+            self.split = self._split_sat
+            log.info(f"Setup SaT model: {model_name}")
+        else:
+            self._update_spacy_model(model_name)
+            self.split = self._split_spacy
+            log.info(f"Setup spaCy model: {model_name}")
 
-        if model_setting == "gpu" or model_setting == "cuda":
+    def _resolve_cpu_gpu_device(self, model_setting: str) -> str:
+        if model_setting in ("gpu", "cuda"):
             if GPU_AVAILABLE:
-                model_setting = "cuda"
+                return "cuda"
             else:
                 log.warning("PyTorch determined that CUDA is not available. "
                             "You may need to update the NVIDIA driver for the host system, "
                             "or reinstall PyTorch with GPU support by setting "
                             "ARGS BUILD_TYPE=gpu in the Dockerfile when building this component.")
-                model_setting = "cpu"
-        elif model_setting != "cpu":
-            log.warning("Invalid WtP model setting. Only `cpu` and `cuda` "
-                        "(or `gpu`) WtP model options available at this time. "
+                return "cpu"
+        if model_setting != "cpu":
+            log.warning(
+                f"Invalid model setting `{model_setting}`. Only `cpu` and `cuda` "
+                "(or `gpu`) are supported for WtP/SaT models at this time. "
" "Defaulting to `cpu` mode.") - model_setting = "cpu" + return "cpu" + + def _find_local_model_path(self, model_name: str) -> Optional[str]: + candidate = MODELS_PATH / model_name + if candidate.is_file() or candidate.is_dir(): + with importlib.resources.as_file(candidate) as path: + return str(path) - if wtp_model_name in WTP_MANDATORY_ADAPTOR: - self._mandatory_wtp_language = True - self._default_lang = default_lang + fallback = os.path.join(DEFAULT_WTP_MODELS, model_name) + if os.path.exists(fallback): + return fallback + return None - if self._model_name == wtp_model_name and self._model_setting == model_setting: - log.info(f"Using cached model, running on {self._model_setting}: " - f"{self._model_name}") + def _update_wtp_model(self, wtp_model_name: str, + model_setting: str, + default_lang: str) -> None: + device = self._resolve_cpu_gpu_device(model_setting) + + self._model_name = wtp_model_name + self._model_setting = device + self._default_lang = default_lang + self._mandatory_wtp_language = (wtp_model_name in WTP_MANDATORY_ADAPTOR) + + local_path = self._find_local_model_path(wtp_model_name) + + if local_path: + log.info(f"Using downloaded WtP model at {local_path}") + self.wtp_model = WtP(local_path) else: - self._model_setting = model_setting - self._model_name = wtp_model_name - # Check if model has been downloaded - if (WTP_MODELS_PATH / wtp_model_name).is_file(): - log.info(f"Using downloaded {wtp_model_name} model.") - with importlib.resources.as_file(WTP_MODELS_PATH / wtp_model_name) as path: - self.wtp_model = WtP(str(path)) - elif os.path.exists(os.path.join(DEFAULT_WTP_MODELS, - wtp_model_name)): - - log.info(f"Using downloaded {wtp_model_name} model.") - wtp_model_name = os.path.join(DEFAULT_WTP_MODELS, - wtp_model_name) - self.wtp_model = WtP(wtp_model_name) - else: - log.warning(f"Model {wtp_model_name} not found, " - "downloading from hugging face.") - self.wtp_model = WtP(wtp_model_name) + log.warning(f"WtP model {wtp_model_name} not found locally; downloading from Hugging Face.") + self.wtp_model = WtP(wtp_model_name) + self.wtp_model.to(device) + + def _update_sat_model(self, sat_model_name: str, model_setting: str, default_lang: str) -> None: + device = self._resolve_cpu_gpu_device(model_setting) + + self._model_name = sat_model_name + self._model_setting = device + self._default_lang = default_lang + self._mandatory_wtp_language = (sat_model_name in WTP_MANDATORY_ADAPTOR) + + local_path = self._find_local_model_path(sat_model_name) + + if local_path: + log.info(f"Using downloaded SaT model at {local_path}") + self.sat_model = SaT(local_path) + else: + log.warning(f"SaT model {sat_model_name} not found locally; downloading from Hugging Face.") + self.sat_model = SaT(sat_model_name) + + # Move model to device; SaT benefits from half precision on GPU. + if device == "cuda": + self.sat_model.half().to("cuda") + else: + self.sat_model.to("cpu") - if model_setting != "cpu" and model_setting != "cuda": - log.warning(f"Invalid setting for WtP runtime {model_setting}. " - "Defaulting to CPU mode.") - model_setting = "cpu" - self.wtp_model.to(model_setting) def _split_wtp(self, text: str, lang: Optional[str] = None) -> List[str]: if lang: @@ -152,6 +180,10 @@ def _update_spacy_model(self, spacy_model_name: str): self.spacy_model = spacy.load(spacy_model_name, exclude=["parser"]) self.spacy_model.enable_pipe("senter") + def _split_sat(self, text: str, lang: Optional[str] = None) -> List[str]: + # TODO: For now, we'll only use the SaT models that are language agnostic. 
+        return self.sat_model.split(text)
+
     def _split_spacy(self, text: str, lang: Optional[str] = None) -> List[str]:
         # TODO: We may add an auto model selection for spaCy in the future.
         # However, the drawback is we will also need to
@@ -160,29 +192,46 @@ def _split_spacy(self, text: str, lang: Optional[str] = None) -> List[str]:
         return [sent.text_with_ws for sent in processed_text.sents]
 
 class TextSplitter:
+    NewLineBehaviorType = Union[
+        NewLineBehavior.Behavior,  # 'GUESS' | 'SPACE' | 'REMOVE' | 'NONE' | callable | None
+    ]
 
     def __init__(
             self, text: str, limit: int, num_boundary_chars: int,
             get_text_size: Callable[[str], int],
             sentence_model: TextSplitterModel,
-            in_lang: Optional[str] = None) -> None:
+            in_lang: Optional[str] = None,
+            split_mode: str = 'DEFAULT',
+            newline_behavior: NewLineBehaviorType = 'GUESS'
+    ) -> None:
+
         self._sentence_model = sentence_model
         self._limit = limit
         self._num_boundary_chars = num_boundary_chars
         self._get_text_size = get_text_size
+        self._in_lang = in_lang
+        self._split_mode = split_mode
+
+        self._newline_fn: Callable[[str, Optional[str]], str] = NewLineBehavior.get(newline_behavior)
 
         self._text = ""
         self._text_full_size = 0
         self._overhead_size = 0
         self._soft_limit = self._limit
-        self._in_lang = in_lang
 
         if text:
             self.set_text(text)
 
     def set_text(self, text: str):
-        self._text = text
-        self._text_full_size = self._get_text_size(text)
-        chars_per_size = len(text) / self._text_full_size
+
+        if text:
+            self._text = self._newline_fn(text, self._in_lang)
+        else:
+            self._text = text
+
+        self._text_full_size = self._get_text_size(self._text)
+
+        # Guard against division by zero for empty input.
+        text_size = self._text_full_size if self._text_full_size > 0 else 1
+        chars_per_size = len(self._text) / text_size
 
         self._overhead_size = self._get_text_size('')
         self._soft_limit = int(self._limit * chars_per_size) - self._overhead_size
@@ -194,7 +243,6 @@ def set_text(self, text: str):
         # before applying chars_per_size weighting.
         self._soft_limit = max(1, int((self._limit - self._overhead_size) * chars_per_size))
 
-
     def _isolate_largest_section(self, text:str) -> str:
         # Using cached word splitting model, isolate largest section of text
         string_length = len(text)
@@ -209,7 +257,7 @@ def _isolate_largest_section(self, text:str) -> str:
             substring_list = self._sentence_model.split(substring, lang = self._in_lang)
 
             div_index = string_length - len(substring_list[-1])
-            if div_index==start_indx:
+            if div_index == start_indx:
                 return text
 
             return text[0:div_index]
@@ -218,17 +266,48 @@ def _isolate_largest_section(self, text:str) -> str:
     def split(cls, text: str, limit: int, num_boundary_chars: int,
               get_text_size: Callable[[str], int],
               sentence_model: TextSplitterModel,
-              in_lang: Optional[str] = None
-              ):
-        return cls(text, limit, num_boundary_chars, get_text_size, sentence_model, in_lang)._split()
-
+              in_lang: Optional[str] = None,
+              split_mode: str = 'DEFAULT',
+              newline_behavior: NewLineBehavior.Behavior = 'GUESS'
+              ):
+        return cls(
+            text, limit, num_boundary_chars, get_text_size,
+            sentence_model, in_lang, split_mode, newline_behavior
+        )._split()
 
     def _split(self):
+        if self._split_mode == 'SENTENCE':
+            yield from self._split_sentences_individually()
+        else:
+            yield from self._split_default()
+
+    def _split_default(self):
         if self._text_full_size <= self._limit:
             yield self._text
         else:
             yield from self._split_internal(self._text)
 
+    def _split_sentences_individually(self):
+        """
+        Yield one sentence at a time. If any individual sentence exceeds the limit,
+        reuse the internal chunking logic to subdivide that sentence.
+        """
+        sentences = self._sentence_model.split(self._text, lang=self._in_lang)
+        for sentence in sentences:
+            if self._get_text_size(sentence) <= self._limit:
+                yield sentence
+            else:
+                # Split oversized sentence using the default internal logic.
+                yield from self._split_sentence_text(sentence)
+
+    def _split_sentence_text(self, text: str):
+        # Temporarily re-point the splitter at the oversized sentence, then restore state.
+        saved = (self._text, self._text_full_size, self._overhead_size, self._soft_limit)
+        try:
+            self.set_text(text)
+            # Use the normalized text stored by set_text().
+            yield from self._split_internal(self._text)
+        finally:
+            self._text, self._text_full_size, self._overhead_size, self._soft_limit = saved
+
     def _split_internal(self, text):
         right = text
         while True:
@@ -250,9 +329,7 @@ def _divide(self, text) -> Tuple[str, str]:
             left = self._isolate_largest_section(left)
             return left, text[len(left):]
 
-        char_per_size = len(left) / left_size
-
-
+        # Guard against division by zero when the measured size is zero.
+        char_per_size = len(left) / max(left_size, 1)
         limit = int(self._limit * char_per_size) - self._overhead_size
 
         if limit < 1:
diff --git a/detection/nlp_text_splitter/nlp_text_splitter/newline_behavior.py b/detection/nlp_text_splitter/nlp_text_splitter/newline_behavior.py
new file mode 100644
index 0000000..8424438
--- /dev/null
+++ b/detection/nlp_text_splitter/nlp_text_splitter/newline_behavior.py
@@ -0,0 +1,155 @@
+#############################################################################
+# NOTICE                                                                    #
+#                                                                           #
+# This software (or technical data) was produced for the U.S. Government   #
+# under contract, and is subject to the Rights in Data-General Clause      #
+# 52.227-14, Alt. IV (DEC 2007).                                           #
+#                                                                           #
+# Copyright 2025 The MITRE Corporation. All Rights Reserved.                #
+#############################################################################
+
+#############################################################################
+# Copyright 2025 The MITRE Corporation                                      #
+#                                                                           #
+# Licensed under the Apache License, Version 2.0 (the "License");           #
+# you may not use this file except in compliance with the License.          #
+# You may obtain a copy of the License at                                   #
+#                                                                           #
+#    http://www.apache.org/licenses/LICENSE-2.0                             #
+#                                                                           #
+# Unless required by applicable law or agreed to in writing, software       #
+# distributed under the License is distributed on an "AS IS" BASIS,         #
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  #
+# See the License for the specific language governing permissions and       #
+# limitations under the License.                                            #
+#############################################################################
+
+from __future__ import annotations
+
+import bisect
+import re
+from typing import Callable, Literal, Match, Optional, Union
+
+import mpf_component_api as mpf
+
+# Languages that typically do NOT use spaces between words
+NO_SPACE_LANGS = ('JA', 'YUE', 'ZH-HANS', 'ZH-HANT')
+
+class ChineseAndJapaneseCodePoints:
+    # From http://www.unicode.org/charts/
+    RANGES = sorted((
+        range(0x2e80, 0x2fe0),
+        range(0x2ff0, 0x3130),
+        range(0x3190, 0x3300),
+        range(0x3400, 0x4dc0),
+        range(0x4e00, 0xa4d0),
+        range(0xf900, 0xfb00),
+        range(0xfe10, 0xfe20),
+        range(0xfe30, 0xfe70),
+        range(0xff00, 0xffa0),
+        range(0x16f00, 0x16fa0),
+        range(0x16fe0, 0x18d09),
+        range(0x1b000, 0x1b300),
+        range(0x1f200, 0x1f300),
+        range(0x20000, 0x2a6de),
+        range(0x2a700, 0x2ebe1),
+        range(0x2f800, 0x2fa20),
+        range(0x30000, 0x3134b)
+    ), key=lambda r: r.start)
+
+    RANGE_BEGINS = [r.start for r in RANGES]
+
+    @classmethod
+    def check_char(cls, char: str) -> bool:
+        """
+        Determine whether or not the given character is in the Unicode code point ranges assigned
+        to Chinese and Japanese.
+        """
+        code_point = ord(char[0])
+        if code_point < cls.RANGE_BEGINS[0]:
+            return False
+        else:
+            idx = bisect.bisect_right(cls.RANGE_BEGINS, code_point)
+            return code_point in cls.RANGES[idx - 1]
+
+
+class NewLineBehavior:
+    """
+    Provides a callable to normalize *single* newline events while preserving intended breaks.
+    Modes:
+      - 'GUESS'  : choose ' ' for space-separated scripts; '' for Chinese/Japanese text.
+      - 'SPACE'  : always replace with a single space.
+      - 'REMOVE' : always remove (no space).
+      - 'NONE'   : no change.
+
+    Users can also provide a custom callable to augment NewLineBehavior.
+    """
+
+    Behavior = Union[
+        Literal['GUESS', 'SPACE', 'REMOVE', 'NONE'],
+        Callable[[str, Optional[str]], str],
+        None
+    ]
+
+    @classmethod
+    def get(cls, behavior: Behavior) -> Callable[[str, Optional[str]], str]:
+        if callable(behavior):
+            return behavior
+
+        # Default to GUESS if None
+        if behavior is None:
+            behavior = 'GUESS'
+
+        behavior = behavior.upper()
+
+        if behavior == 'GUESS':
+            return lambda s, l: cls._replace_new_lines(s, cls._guess_lang_separator(s, l))
+        elif behavior == 'REMOVE':
+            return lambda s, _: cls._replace_new_lines(s, '')
+        elif behavior == 'SPACE':
+            return lambda s, _: cls._replace_new_lines(s, ' ')
+        elif behavior == 'NONE':
+            return lambda s, _: s
+        else:
+            raise mpf.DetectionError.INVALID_PROPERTY.exception(
+                f'"{behavior}" is not a valid value for the "STRIP_NEW_LINE_BEHAVIOR" property. '
+                'Valid values are GUESS, REMOVE, SPACE, NONE.')
+
+    @staticmethod
+    def _guess_lang_separator(text: str, language: Optional[str]) -> Literal['', ' ']:
+        if language:
+            if language.upper() in NO_SPACE_LANGS:
+                return ''
+            else:
+                return ' '
+        else:
+            first_alpha_letter = next((ch for ch in text if ch.isalpha()), 'a')
+            if ChineseAndJapaneseCodePoints.check_char(first_alpha_letter):
+                return ''
+            else:
+                return ' '
+
+    # Match a lone newline (not part of a blank-line paragraph break), optionally
+    # with one adjacent whitespace character on either side.
+    REPLACE_NEW_LINE_REGEX = re.compile(r'''
+        \s? # Include preceding whitespace character if present
+        (?<!\n)\n(?!\n) # A single newline, not preceded or followed by another newline
+        \s? # Include following whitespace character if present
+        ''', re.VERBOSE)
+
+    @classmethod
+    def _replace_new_lines(cls, text: str, replacement: str) -> str:
+
+        def do_replacement(match: Match[str]) -> str:
+            match_text = match.group(0)
+            if match_text == '\n':
+                # Surrounding characters are not whitespace.
+                return replacement
+            else:
+                # There is already whitespace next to the newline character, so it can just be removed.
+                return match_text.replace('\n', '', 1)
+
+        return cls.REPLACE_NEW_LINE_REGEX.sub(do_replacement, text)
\ No newline at end of file
diff --git a/detection/nlp_text_splitter/nlp_text_splitter/wtp_lang_settings.py b/detection/nlp_text_splitter/nlp_text_splitter/wtp_lang_settings.py
index c682fd3..05a3936 100644
--- a/detection/nlp_text_splitter/nlp_text_splitter/wtp_lang_settings.py
+++ b/detection/nlp_text_splitter/nlp_text_splitter/wtp_lang_settings.py
@@ -5,11 +5,11 @@
 # under contract, and is subject to the Rights in Data-General Clause       #
 # 52.227-14, Alt. IV (DEC 2007).                                            #
 #                                                                           #
-# Copyright 2024 The MITRE Corporation. All Rights Reserved.                #
+# Copyright 2025 The MITRE Corporation. All Rights Reserved.                #
 #############################################################################
 
 #############################################################################
-# Copyright 2024 The MITRE Corporation                                      #
+# Copyright 2025 The MITRE Corporation                                      #
 #                                                                           #
 # Licensed under the Apache License, Version 2.0 (the "License");           #
 # you may not use this file except in compliance with the License.          #
diff --git a/detection/nlp_text_splitter/pyproject.toml b/detection/nlp_text_splitter/pyproject.toml
index 992a847..cffb338 100644
--- a/detection/nlp_text_splitter/pyproject.toml
+++ b/detection/nlp_text_splitter/pyproject.toml
@@ -5,11 +5,11 @@
 # under contract, and is subject to the Rights in Data-General Clause       #
 # 52.227-14, Alt. IV (DEC 2007).                                            #
 #                                                                           #
-# Copyright 2024 The MITRE Corporation. All Rights Reserved.                #
+# Copyright 2025 The MITRE Corporation. All Rights Reserved.                #
 #############################################################################
 
 #############################################################################
-# Copyright 2024 The MITRE Corporation                                      #
+# Copyright 2025 The MITRE Corporation                                      #
 #                                                                           #
 # Licensed under the Apache License, Version 2.0 (the "License");           #
 # you may not use this file except in compliance with the License.          #
diff --git a/detection/nlp_text_splitter/tests/test_text_splitter.py b/detection/nlp_text_splitter/tests/test_text_splitter.py
index 9782870..030f3db 100644
--- a/detection/nlp_text_splitter/tests/test_text_splitter.py
+++ b/detection/nlp_text_splitter/tests/test_text_splitter.py
@@ -5,11 +5,11 @@
 # under contract, and is subject to the Rights in Data-General Clause       #
 # 52.227-14, Alt. IV (DEC 2007).                                            #
 #                                                                           #
-# Copyright 2024 The MITRE Corporation. All Rights Reserved.                #
+# Copyright 2025 The MITRE Corporation. All Rights Reserved.                #
 #############################################################################
 
 #############################################################################
-# Copyright 2024 The MITRE Corporation                                      #
+# Copyright 2025 The MITRE Corporation                                      #
 #                                                                           #
 # Licensed under the Apache License, Version 2.0 (the "License");           #
 # you may not use this file except in compliance with the License.          #
@@ -38,6 +38,21 @@ def setUpClass(cls):
         cls.wtp_model = TextSplitterModel("wtp-bert-mini", "cpu", "en")
         cls.wtp_adv_model = TextSplitterModel("wtp-canine-s-1l", "cpu", "zh")
         cls.spacy_model = TextSplitterModel("xx_sent_ud_sm", "cpu", "en")
+        cls.sat_model = TextSplitterModel("sat-3l-sm", "cpu", "en")
+
+    def test_sat_basic_sentence_split(self):
+        input_text = 'Hello, what is your name? My name is John.'
+        actual = list(TextSplitter.split(input_text,
+                                         100,
+                                         100,
+                                         len,
+                                         self.sat_model,
+                                         split_mode='SENTENCE'))
+        self.assertEqual(2, len(actual))
+        self.assertEqual('Hello, what is your name? ', actual[0])
+        self.assertEqual('My name is John.', actual[1])
+
+
     def test_split_engine_difference(self):
 
         # Note: Only WtP's multilingual models
@@ -58,8 +73,14 @@ def test_split_engine_difference(self):
         actual = self.wtp_model._split_wtp(text)
         self.assertEqual(10, len(actual))
 
+        # SaT splits on additional features, not just newlines.
+        actual = self.sat_model._split_sat(text)
+        self.assertEqual(19, len(actual))
+
     def test_guess_split_simple_sentence(self):
-        input_text = 'Hello, what is your name? My name is John.'
+        input_text = 'Hello, what is your name? My name is John. C. Finn.'
+
+        # WtP produces a clean split.
         actual = list(TextSplitter.split(input_text,
                                          28,
                                          28,
@@ -71,9 +92,24 @@
         # "Hello, what is your name?"
         self.assertEqual('Hello, what is your name? ', actual[0])
         # " My name is John."
-        self.assertEqual('My name is John.', actual[1])
+        self.assertEqual('My name is John. C. Finn.', actual[1])
+
+        # SaT is a bit more aggressive at splitting text.
+        actual = list(TextSplitter.split(input_text,
+                                         500,
+                                         500,
+                                         len,
+                                         self.sat_model,
+                                         split_mode='SENTENCE'))
+        self.assertEqual(input_text, ''.join(actual))
+        self.assertEqual(3, len(actual))
+
+        # "Hello, what is your name?"
+        self.assertEqual('Hello, what is your name? ', actual[0])
+        # " My name is John. C. Finn." is split into two sentences.
+        self.assertEqual('My name is John. ', actual[1])
+        self.assertEqual('C. Finn.', actual[2])
 
-        input_text = 'Hello, what is your name? My name is John.'
         actual = list(TextSplitter.split(input_text,
                                          28,
                                          28,
@@ -85,7 +121,7 @@
         # "Hello, what is your name?"
         self.assertEqual('Hello, what is your name? ', actual[0])
         # " My name is John."
-        self.assertEqual('My name is John.', actual[1])
+        self.assertEqual('My name is John. C. Finn.', actual[1])
 
     def test_split_sentence_end_punctuation(self):
         input_text = 'Hello. How are you? asdfasdf'
@@ -124,7 +160,8 @@ def test_guess_split_edge_cases(self):
                                          30,
                                          30,
                                          len,
-                                         self.wtp_model))
+                                         self.wtp_model,
+                                         newline_behavior="NONE"))
 
         self.assertEqual(input_text, ''.join(actual))
         self.assertEqual(4, len(actual))
 
         self.assertEqual("Maybe...maybe not? \n ", actual[2])
         self.assertEqual("All done, I think!", actual[3])
 
+        # Split using the WtP model with GUESS newline behavior.
+        actual = list(TextSplitter.split(input_text,
+                                         30,
+                                         30,
+                                         len,
+                                         self.wtp_model,
+                                         newline_behavior="GUESS"))
+
+        self.assertEqual(input_text.replace('\n', ''), ''.join(actual))
+        self.assertEqual(4, len(actual))
+
+        # WtP should detect and split out each sentence.
+        self.assertEqual("This is a sentence (Dr.Test). ", actual[0])
+        self.assertEqual("Is this, a sentence as well? ", actual[1])
+        self.assertEqual("Maybe...maybe not? ", actual[2])
+        self.assertEqual("All done, I think!", actual[3])
+
         actual = list(TextSplitter.split(input_text,
                                          35,
                                          35,
                                          len,
-                                         self.spacy_model))
+                                         self.spacy_model,
+                                         newline_behavior="NONE"))
 
         self.assertEqual(input_text, ''.join(actual))
         self.assertEqual(4, len(actual))
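
Taken together, the changes above can be exercised end to end. The following is a minimal usage sketch, not part of the patch itself; it assumes the package is installed and that `sat-3l-sm` is available locally under `/opt/wtp/models` or can be fetched from Hugging Face:

```
from nlp_text_splitter import TextSplitter, TextSplitterModel

# Load the small, language-agnostic SaT model on CPU.
sat_model = TextSplitterModel("sat-3l-sm", "cpu", "en")

text = "Hello, what is your name? My name is John."

# SENTENCE mode yields one sentence per chunk; any sentence larger than
# `limit` is subdivided by the default size-based chunking logic.
for chunk in TextSplitter.split(text,
                                limit=100,
                                num_boundary_chars=100,
                                get_text_size=len,
                                sentence_model=sat_model,
                                split_mode='SENTENCE',
                                newline_behavior='GUESS'):
    print(repr(chunk))
```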