@@ -568,6 +568,97 @@ def test_create_sentence_with_staccato_tokenizer():
568568 assert sentence .tokens [3 ].text == "."
569569
570570
def test_staccato_tokenizer_with_umlauts():
    """StaccatoTokenizer must keep umlauts and other diacritics attached to their words.

    Covers German umlauts (ä, ö, ü) as well as French/Spanish accents and
    the apostrophe-split case (l'été), which should separate into three tokens.
    """
    cases = [
        # German umlauts inside words must not trigger a split.
        (
            "US-Präsident Trump und die bösen Füchse.",
            ["US", "-", "Präsident", "Trump", "und", "die", "bösen", "Füchse", "."],
        ),
        # Mixed diacritics (grave, acute, diaeresis) plus an apostrophe.
        (
            "Voilà, el pingüino se quejó de l'été.",
            ["Voilà", ",", "el", "pingüino", "se", "quejó", "de", "l", "'", "été", "."],
        ),
    ]
    for text, expected in cases:
        sentence = Sentence(text, use_tokenizer=StaccatoTokenizer())
        assert [token.text for token in sentence.tokens] == expected
593+
594+
def test_staccato_tokenizer_abbreviations():
    """StaccatoTokenizer handling of dotted abbreviations.

    Multi-period abbreviations (U.S.A., i.e., e.g.) stay as single tokens,
    while a sentence-final period after a single capital or a short word
    (X., Inc., Dr.) is split off as its own token.
    """
    tokenizer = StaccatoTokenizer()

    cases = [
        # Case 1: abbreviations with multiple internal periods are one token;
        # "Inc." ends in a single period and is split.
        (
            "The firm is U.S.A. Inc. and i.e. in the U.S. we use e.g. to give examples.",
            [
                "The", "firm", "is", "U.S.A.", "Inc", ".", "and", "i.e.",
                "in", "the", "U.S.", "we", "use", "e.g.", "to", "give",
                "examples", ".",
            ],
        ),
        # Case 2: a lone letter or short word followed by a dot at sentence
        # end is split into word + period.
        (
            "He wrote on X. Then Dr. Smith arrived.",
            [
                "He", "wrote", "on", "X", ".", "Then", "Dr", ".", "Smith",
                "arrived", ".",
            ],
        ),
        # Case 3: abbreviations mixed with bracketing punctuation.
        (
            "The item is from the U.K. (i.e. not the U.S.A.).",
            [
                "The", "item", "is", "from", "the", "U.K.", "(", "i.e.",
                "not", "the", "U.S.A.", ")", ".",
            ],
        ),
    ]

    for text, expected in cases:
        sentence = Sentence(text, use_tokenizer=tokenizer)
        assert [token.text for token in sentence.tokens] == expected
660+
661+
571662def test_staccato_tokenizer_with_numbers_and_punctuation ():
572663 sentence = Sentence ("It's 03-16-2025" , use_tokenizer = StaccatoTokenizer ())
573664
0 commit comments