Skip to content

Commit 6e39cfc

Browse files
committed
Black formatting
1 parent 88ef4e4 commit 6e39cfc

File tree

2 files changed

+45
-6
lines changed

2 files changed

+45
-6
lines changed

flair/tokenization.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -521,7 +521,9 @@ def __init__(self):
521521

522522
# Combined pattern for re.findall:
523523
# Captures abbreviations OR letter sequences OR digit sequences OR Kanji OR punctuation/symbols
524-
combined_pattern = f"({self.abbreviations})|({self.alphabet_pattern})|({self.digits})|({self.kanji})|({self.punctuation})"
524+
combined_pattern = (
525+
f"({self.abbreviations})|({self.alphabet_pattern})|({self.digits})|({self.kanji})|({self.punctuation})"
526+
)
525527
# Pre-compile the regex for efficiency
526528
self.token_pattern = re.compile(combined_pattern)
527529

tests/test_tokenize_sentence.py

Lines changed: 42 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -599,25 +599,62 @@ def test_staccato_tokenizer_abbreviations():
599599
text_1 = "The firm is U.S.A. Inc. and i.e. in the U.S. we use e.g. to give examples."
600600
sentence_1 = Sentence(text_1, use_tokenizer=tokenizer)
601601
expected_tokens_1 = [
602-
"The", "firm", "is", "U.S.A.", "Inc", ".", "and", "i.e.", "in", "the",
603-
"U.S.", "we", "use", "e.g.", "to", "give", "examples", ".",
602+
"The",
603+
"firm",
604+
"is",
605+
"U.S.A.",
606+
"Inc",
607+
".",
608+
"and",
609+
"i.e.",
610+
"in",
611+
"the",
612+
"U.S.",
613+
"we",
614+
"use",
615+
"e.g.",
616+
"to",
617+
"give",
618+
"examples",
619+
".",
604620
]
605621
assert [token.text for token in sentence_1.tokens] == expected_tokens_1
606622

607623
# Case 2: Single letter/short word with a dot at sentence end should be split
608624
text_2 = "He wrote on X. Then Dr. Smith arrived."
609625
sentence_2 = Sentence(text_2, use_tokenizer=tokenizer)
610626
expected_tokens_2 = [
611-
"He", "wrote", "on", "X", ".", "Then", "Dr", ".", "Smith", "arrived", ".",
627+
"He",
628+
"wrote",
629+
"on",
630+
"X",
631+
".",
632+
"Then",
633+
"Dr",
634+
".",
635+
"Smith",
636+
"arrived",
637+
".",
612638
]
613639
assert [token.text for token in sentence_2.tokens] == expected_tokens_2
614640

615641
# Case 3: A mix of cases
616642
text_3 = "The item is from the U.K. (i.e. not the U.S.A.)."
617643
sentence_3 = Sentence(text_3, use_tokenizer=tokenizer)
618644
expected_tokens_3 = [
619-
"The", "item", "is", "from", "the", "U.K.", "(", "i.e.",
620-
"not", "the", "U.S.A.", ")", ".",
645+
"The",
646+
"item",
647+
"is",
648+
"from",
649+
"the",
650+
"U.K.",
651+
"(",
652+
"i.e.",
653+
"not",
654+
"the",
655+
"U.S.A.",
656+
")",
657+
".",
621658
]
622659
assert [token.text for token in sentence_3.tokens] == expected_tokens_3
623660

0 commit comments

Comments (0)