Skip to content

Commit 5d04a9b

Browse files
authored
Merge pull request #3673 from flairNLP/staccato_mod
Improve StaccatoTokenizer
2 parents fa9e439 + 6e39cfc commit 5d04a9b

File tree

2 files changed

+104
-3
lines changed

2 files changed

+104
-3
lines changed

flair/tokenization.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -495,10 +495,18 @@ def __init__(self):
495495
self.digits = r"\d+" # One or more digits
496496
self.kanji = r"[\u4e00-\u9fff]" # Kanji characters
497497

498+
# Base pattern for letters in Latin-based scripts, including diacritics
499+
latin_chars = r"[a-zA-Z\u00C0-\u02AF\u1E00-\u1EFF]"
500+
501+
# Pattern to capture abbreviations with at least two periods (e.g., "U.S.", "e.g.")
502+
# This prevents matching single words at the end of a sentence (e.g., "X.").
503+
abbrev_segment = f"{latin_chars}{{1,3}}"
504+
self.abbreviations = rf"\b(?:{abbrev_segment}\.){{2,}}"
505+
498506
# Unicode ranges for various alphabets and scripts
499507
# This includes Latin, Cyrillic, Greek, Hebrew, Arabic, Japanese Kana, Korean Hangul, etc.
500508
alphabets_list = [
501-
r"[a-zA-Z]+", # Latin
509+
rf"{latin_chars}+", # Latin
502510
r"[\u0400-\u04FF\u0500-\u052F]+", # Cyrillic and Cyrillic Supplement
503511
r"[\u0370-\u03FF\u1F00-\u1FFF]+", # Greek and Coptic
504512
r"[\u0590-\u05FF]+", # Hebrew
@@ -512,8 +520,10 @@ def __init__(self):
512520
self.alphabet_pattern = "|".join(alphabets_list)
513521

514522
# Combined pattern for re.findall:
515-
# Captures letter sequences OR digit sequences OR Kanji OR punctuation/symbols
516-
combined_pattern = f"({self.alphabet_pattern})|({self.digits})|({self.kanji})|({self.punctuation})"
523+
# Captures abbreviations OR letter sequences OR digit sequences OR Kanji OR punctuation/symbols
524+
combined_pattern = (
525+
f"({self.abbreviations})|({self.alphabet_pattern})|({self.digits})|({self.kanji})|({self.punctuation})"
526+
)
517527
# Pre-compile the regex for efficiency
518528
self.token_pattern = re.compile(combined_pattern)
519529

tests/test_tokenize_sentence.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -568,6 +568,97 @@ def test_create_sentence_with_staccato_tokenizer():
568568
assert sentence.tokens[3].text == "."
569569

570570

571+
def test_staccato_tokenizer_with_umlauts():
    """Diacritic-bearing letters must stay attached to their words.

    StaccatoTokenizer's Latin pattern covers the Latin-1 Supplement /
    Latin Extended ranges, so umlauts and accents may not split tokens.
    """
    tokenizer = StaccatoTokenizer()

    # German umlauts: "Präsident", "bösen", "Füchse" are kept whole,
    # while the hyphen in "US-Präsident" is still its own token.
    sentence = Sentence("US-Präsident Trump und die bösen Füchse.", use_tokenizer=tokenizer)
    assert [t.text for t in sentence.tokens] == [
        "US", "-", "Präsident", "Trump", "und", "die", "bösen", "Füchse", ".",
    ]

    # Mixed French/Spanish diacritics; the apostrophe in "l'été" still splits.
    sentence = Sentence("Voilà, el pingüino se quejó de l'été.", use_tokenizer=tokenizer)
    assert [t.text for t in sentence.tokens] == [
        "Voilà", ",", "el", "pingüino", "se", "quejó", "de", "l", "'", "été", ".",
    ]
def test_staccato_tokenizer_abbreviations():
    """Abbreviations with two or more periods stay single tokens.

    Single-period words ("Inc.", "Dr.", sentence-final "X.") must still
    be split, so the abbreviation pattern only fires on forms like
    "U.S.", "U.S.A.", "i.e.", "e.g.".
    """
    tokenizer = StaccatoTokenizer()

    cases = [
        # Multi-period abbreviations kept whole; "Inc." split as usual.
        (
            "The firm is U.S.A. Inc. and i.e. in the U.S. we use e.g. to give examples.",
            [
                "The", "firm", "is", "U.S.A.", "Inc", ".", "and", "i.e.", "in",
                "the", "U.S.", "we", "use", "e.g.", "to", "give", "examples", ".",
            ],
        ),
        # Short word + sentence-final dot ("X.") and single-period "Dr." are split.
        (
            "He wrote on X. Then Dr. Smith arrived.",
            [
                "He", "wrote", "on", "X", ".", "Then", "Dr", ".", "Smith",
                "arrived", ".",
            ],
        ),
        # Abbreviations next to parentheses and a closing sentence period.
        (
            "The item is from the U.K. (i.e. not the U.S.A.).",
            [
                "The", "item", "is", "from", "the", "U.K.", "(", "i.e.", "not",
                "the", "U.S.A.", ")", ".",
            ],
        ),
    ]

    for text, expected in cases:
        sentence = Sentence(text, use_tokenizer=tokenizer)
        assert [token.text for token in sentence.tokens] == expected
571662
def test_staccato_tokenizer_with_numbers_and_punctuation():
572663
sentence = Sentence("It's 03-16-2025", use_tokenizer=StaccatoTokenizer())
573664

0 commit comments

Comments
 (0)