Skip to content

Commit 5d04a9b

Browse files
authored
Merge pull request #3673 from flairNLP/staccato_mod
Improve StaccatoTokenizer
2 parents fa9e439 + 6e39cfc commit 5d04a9b

File tree

2 files changed

+104
-3
lines changed

2 files changed

+104
-3
lines changed

flair/tokenization.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -495,10 +495,18 @@ def __init__(self):
495495
self.digits = r"\d+" # One or more digits
496496
self.kanji = r"[\u4e00-\u9fff]" # Kanji characters
497497

498+
# Base pattern for letters in Latin-based scripts, including diacritics
499+
latin_chars = r"[a-zA-Z\u00C0-\u02AF\u1E00-\u1EFF]"
500+
501+
# Pattern to capture abbreviations with at least two periods (e.g., "U.S.", "e.g.")
502+
# This prevents matching single words at the end of a sentence (e.g., "X.").
503+
abbrev_segment = f"{latin_chars}{{1,3}}"
504+
self.abbreviations = rf"\b(?:{abbrev_segment}\.){{2,}}"
505+
498506
# Unicode ranges for various alphabets and scripts
499507
# This includes Latin, Cyrillic, Greek, Hebrew, Arabic, Japanese Kana, Korean Hangul, etc.
500508
alphabets_list = [
501-
r"[a-zA-Z]+", # Latin
509+
rf"{latin_chars}+", # Latin
502510
r"[\u0400-\u04FF\u0500-\u052F]+", # Cyrillic and Cyrillic Supplement
503511
r"[\u0370-\u03FF\u1F00-\u1FFF]+", # Greek and Coptic
504512
r"[\u0590-\u05FF]+", # Hebrew
@@ -512,8 +520,10 @@ def __init__(self):
512520
self.alphabet_pattern = "|".join(alphabets_list)
513521

514522
# Combined pattern for re.findall:
515-
# Captures letter sequences OR digit sequences OR Kanji OR punctuation/symbols
516-
combined_pattern = f"({self.alphabet_pattern})|({self.digits})|({self.kanji})|({self.punctuation})"
523+
# Captures abbreviations OR letter sequences OR digit sequences OR Kanji OR punctuation/symbols
524+
combined_pattern = (
525+
f"({self.abbreviations})|({self.alphabet_pattern})|({self.digits})|({self.kanji})|({self.punctuation})"
526+
)
517527
# Pre-compile the regex for efficiency
518528
self.token_pattern = re.compile(combined_pattern)
519529

tests/test_tokenize_sentence.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -568,6 +568,97 @@ def test_create_sentence_with_staccato_tokenizer():
568568
assert sentence.tokens[3].text == "."
569569

570570

571+
def test_staccato_tokenizer_with_umlauts():
    """Diacritic-bearing letters must stay attached to their words.

    StaccatoTokenizer's Latin pattern covers the Latin-1 Supplement /
    Latin Extended ranges, so umlauts and accents may not split tokens.
    """
    tokenizer = StaccatoTokenizer()

    # German umlauts: "Präsident", "bösen", "Füchse" are kept whole,
    # while the hyphen in "US-Präsident" is still its own token.
    sentence = Sentence("US-Präsident Trump und die bösen Füchse.", use_tokenizer=tokenizer)
    assert [t.text for t in sentence.tokens] == [
        "US", "-", "Präsident", "Trump", "und", "die", "bösen", "Füchse", ".",
    ]

    # Mixed French/Spanish diacritics; the apostrophe in "l'été" still splits.
    sentence = Sentence("Voilà, el pingüino se quejó de l'été.", use_tokenizer=tokenizer)
    assert [t.text for t in sentence.tokens] == [
        "Voilà", ",", "el", "pingüino", "se", "quejó", "de", "l", "'", "été", ".",
    ]
def test_staccato_tokenizer_abbreviations():
    """Abbreviations with two or more periods stay single tokens.

    Single-period words ("Inc.", "Dr.", sentence-final "X.") must still
    be split, so the abbreviation pattern only fires on forms like
    "U.S.", "U.S.A.", "i.e.", "e.g.".
    """
    tokenizer = StaccatoTokenizer()

    cases = [
        # Multi-period abbreviations kept whole; "Inc." split as usual.
        (
            "The firm is U.S.A. Inc. and i.e. in the U.S. we use e.g. to give examples.",
            [
                "The", "firm", "is", "U.S.A.", "Inc", ".", "and", "i.e.", "in",
                "the", "U.S.", "we", "use", "e.g.", "to", "give", "examples", ".",
            ],
        ),
        # Short word + sentence-final dot ("X.") and single-period "Dr." are split.
        (
            "He wrote on X. Then Dr. Smith arrived.",
            [
                "He", "wrote", "on", "X", ".", "Then", "Dr", ".", "Smith",
                "arrived", ".",
            ],
        ),
        # Abbreviations next to parentheses and a closing sentence period.
        (
            "The item is from the U.K. (i.e. not the U.S.A.).",
            [
                "The", "item", "is", "from", "the", "U.K.", "(", "i.e.", "not",
                "the", "U.S.A.", ")", ".",
            ],
        ),
    ]

    for text, expected in cases:
        sentence = Sentence(text, use_tokenizer=tokenizer)
        assert [token.text for token in sentence.tokens] == expected
571662
def test_staccato_tokenizer_with_numbers_and_punctuation():
572663
sentence = Sentence("It's 03-16-2025", use_tokenizer=StaccatoTokenizer())
573664

0 commit comments

Comments
 (0)