From 5e1ee975c94287a984bc231fe82d02d8153c8d7e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 27 May 2025 11:09:37 +0200 Subject: [PATCH 1/3] Fix quirk of enum values in Python After the Cython 3 change, the types of enum members such as spacy.parts_of_speech.NOUN became 'flag', rather than simple 'int'. This change mostly doesn't matter because the flag type does duck-type like an int -- it compares, additions, prints etc the same. However, it doesn't repr the same and if you do an isinstance check it will fail. It's therefore better to just make them ints like they were before. --- spacy/attrs.pyx | 3 +++ spacy/parts_of_speech.pyx | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index 363dd094dcd..50b868bc410 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -91,6 +91,9 @@ IDS = { "MORPH": MORPH, "IDX": IDX, } +# Make these ints in Python, so that we don't get this unexpected 'flag' type +# This will match the behaviour before Cython 3 +IDS = {name: int(value) for name, value in IDS.items()} # ATTR IDs, in order of the symbol diff --git a/spacy/parts_of_speech.pyx b/spacy/parts_of_speech.pyx index 1e643c09923..9e539c16cb6 100644 --- a/spacy/parts_of_speech.pyx +++ b/spacy/parts_of_speech.pyx @@ -23,7 +23,9 @@ IDS = { "SPACE": SPACE } - +# Make these ints in Python, so that we don't get this unexpected 'flag' type +# This will match the behaviour before Cython 3 +IDS = {name: int(value) for name, value in IDS.items()} NAMES = {value: key for key, value in IDS.items()} # As of Cython 3.1, the global Python namespace no longer has the enum From 80aa445f343ca33a21060eab70235600427d25e9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 28 May 2025 17:27:36 +0200 Subject: [PATCH 2/3] Format --- spacy/lang/ht/__init__.py | 3 + spacy/lang/ht/lex_attrs.py | 3 + spacy/lang/ht/punctuation.py | 61 +++--- spacy/lang/ht/stop_words.py | 3 +- spacy/lang/ht/tag_map.py | 20 +- spacy/lang/ht/tokenizer_exceptions.py | 195 +++++++++--------- spacy/tests/lang/ht/test_exceptions.py | 14 +- .../tests/lang/ht/test_prefix_suffix_infix.py | 4 +- spacy/tests/lang/ht/test_text.py | 7 +- 9 files changed, 183 insertions(+), 127 deletions(-) diff --git a/spacy/lang/ht/__init__.py b/spacy/lang/ht/__init__.py index e5c1c27702a..9fc2df40ce8 100644 --- a/spacy/lang/ht/__init__.py +++ b/spacy/lang/ht/__init__.py @@ -22,10 +22,12 @@ class HaitianCreoleDefaults(BaseDefaults): stop_words = STOP_WORDS tag_map = TAG_MAP + class HaitianCreole(Language): lang = "ht" Defaults = HaitianCreoleDefaults + @HaitianCreole.factory( "lemmatizer", assigns=["token.lemma"], @@ -49,4 +51,5 @@ def make_lemmatizer( nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer ) + __all__ = ["HaitianCreole"] diff --git a/spacy/lang/ht/lex_attrs.py b/spacy/lang/ht/lex_attrs.py index 8a3ec1ff9ee..ab1a39a8234 100644 --- a/spacy/lang/ht/lex_attrs.py +++ b/spacy/lang/ht/lex_attrs.py @@ -49,6 +49,7 @@ "P": "Pa", } + def like_num(text): text = text.strip().lower() if text.startswith(("+", "-", "±", "~")): @@ -69,9 +70,11 @@ def like_num(text): return True return False + def norm_custom(text): return NORM_MAP.get(text, text.lower()) + LEX_ATTRS = { LIKE_NUM: like_num, NORM: norm_custom, diff --git a/spacy/lang/ht/punctuation.py b/spacy/lang/ht/punctuation.py index 61d88d6e1a5..0077db1c032 100644 --- a/spacy/lang/ht/punctuation.py +++ b/spacy/lang/ht/punctuation.py @@ -16,28 +16,43 @@ _prefixes_elision = "m n l y t k w" _prefixes_elision += " " + 
_prefixes_elision.upper() -TOKENIZER_PREFIXES = LIST_PUNCT + LIST_QUOTES + [ - r"(?:({pe})[{el}])(?=[{a}])".format( - a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision) - ) -] +TOKENIZER_PREFIXES = ( + LIST_PUNCT + + LIST_QUOTES + + [ + r"(?:({pe})[{el}])(?=[{a}])".format( + a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision) + ) + ] +) -TOKENIZER_SUFFIXES = LIST_PUNCT + LIST_QUOTES + LIST_ELLIPSES + [ - r"(?<=[0-9])%", # numbers like 10% - r"(?<=[0-9])(?:{h})".format(h=HYPHENS), # hyphens after numbers - r"(?<=[{a}])['’]".format(a=ALPHA), # apostrophes after letters - r"(?<=[{a}])['’][mwlnytk](?=\s|$)".format(a=ALPHA), # contractions - r"(?<=[{a}0-9])\)", # right parenthesis after letter/number - r"(?<=[{a}])\.(?=\s|$)".format(a=ALPHA), # period after letter if space or end of string - r"(?<=\))[\.\?!]", # punctuation immediately after right parenthesis -] +TOKENIZER_SUFFIXES = ( + LIST_PUNCT + + LIST_QUOTES + + LIST_ELLIPSES + + [ + r"(?<=[0-9])%", # numbers like 10% + r"(?<=[0-9])(?:{h})".format(h=HYPHENS), # hyphens after numbers + r"(?<=[{a}])['’]".format(a=ALPHA), # apostrophes after letters + r"(?<=[{a}])['’][mwlnytk](?=\s|$)".format(a=ALPHA), # contractions + r"(?<=[{a}0-9])\)", # right parenthesis after letter/number + r"(?<=[{a}])\.(?=\s|$)".format( + a=ALPHA + ), # period after letter if space or end of string + r"(?<=\))[\.\?!]", # punctuation immediately after right parenthesis + ] +) -TOKENIZER_INFIXES = LIST_ELLIPSES + LIST_ICONS + [ - r"(?<=[0-9])[+\-\*^](?=[0-9-])", - r"(?<=[{al}{q}])\.(?=[{au}{q}])".format( - al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES - ), - r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), - r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS), - r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION), -] +TOKENIZER_INFIXES = ( + LIST_ELLIPSES + + LIST_ICONS + + [ + r"(?<=[0-9])[+\-\*^](?=[0-9-])", + r"(?<=[{al}{q}])\.(?=[{au}{q}])".format( + al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES + ), + r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS), + r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION), + ] +) diff --git a/spacy/lang/ht/stop_words.py b/spacy/lang/ht/stop_words.py index 6243887a4dc..50998e0e5ff 100644 --- a/spacy/lang/ht/stop_words.py +++ b/spacy/lang/ht/stop_words.py @@ -39,8 +39,7 @@ men mèsi oswa osinon -""" -.split() +""".split() ) # Add common contractions, with and without apostrophe variants diff --git a/spacy/lang/ht/tag_map.py b/spacy/lang/ht/tag_map.py index 8c9cdd6d49b..261d1aef3b0 100644 --- a/spacy/lang/ht/tag_map.py +++ b/spacy/lang/ht/tag_map.py @@ -1,4 +1,22 @@ -from spacy.symbols import NOUN, VERB, AUX, ADJ, ADV, PRON, DET, ADP, SCONJ, CCONJ, PART, INTJ, NUM, PROPN, PUNCT, SYM, X +from spacy.symbols import ( + NOUN, + VERB, + AUX, + ADJ, + ADV, + PRON, + DET, + ADP, + SCONJ, + CCONJ, + PART, + INTJ, + NUM, + PROPN, + PUNCT, + SYM, + X, +) TAG_MAP = { "NOUN": {"pos": NOUN}, diff --git a/spacy/lang/ht/tokenizer_exceptions.py b/spacy/lang/ht/tokenizer_exceptions.py index b44ad7a6fbc..4d617fd3683 100644 --- a/spacy/lang/ht/tokenizer_exceptions.py +++ b/spacy/lang/ht/tokenizer_exceptions.py @@ -1,5 +1,6 @@ from spacy.symbols import ORTH, NORM + def make_variants(base, first_norm, second_orth, second_norm): return { base: [ @@ -7,14 +8,16 @@ def make_variants(base, first_norm, second_orth, second_norm): {ORTH: second_orth, NORM: second_norm}, ], base.capitalize(): [ - {ORTH: base.split("'")[0].capitalize() + "'", NORM: first_norm.capitalize()}, + { + ORTH: 
base.split("'")[0].capitalize() + "'", + NORM: first_norm.capitalize(), + }, {ORTH: second_orth, NORM: second_norm}, - ] + ], } -TOKENIZER_EXCEPTIONS = { - "Dr.": [{ORTH: "Dr."}] -} + +TOKENIZER_EXCEPTIONS = {"Dr.": [{ORTH: "Dr."}]} # Apostrophe forms TOKENIZER_EXCEPTIONS.update(make_variants("m'ap", "mwen", "ap", "ap")) @@ -29,93 +32,95 @@ def make_variants(base, first_norm, second_orth, second_norm): TOKENIZER_EXCEPTIONS.update(make_variants("t'ap", "te", "ap", "ap")) # Non-apostrophe contractions (with capitalized variants) -TOKENIZER_EXCEPTIONS.update({ - "map": [ - {ORTH: "m", NORM: "mwen"}, - {ORTH: "ap", NORM: "ap"}, - ], - "Map": [ - {ORTH: "M", NORM: "Mwen"}, - {ORTH: "ap", NORM: "ap"}, - ], - "lem": [ - {ORTH: "le", NORM: "le"}, - {ORTH: "m", NORM: "mwen"}, - ], - "Lem": [ - {ORTH: "Le", NORM: "Le"}, - {ORTH: "m", NORM: "mwen"}, - ], - "lew": [ - {ORTH: "le", NORM: "le"}, - {ORTH: "w", NORM: "ou"}, - ], - "Lew": [ - {ORTH: "Le", NORM: "Le"}, - {ORTH: "w", NORM: "ou"}, - ], - "nap": [ - {ORTH: "n", NORM: "nou"}, - {ORTH: "ap", NORM: "ap"}, - ], - "Nap": [ - {ORTH: "N", NORM: "Nou"}, - {ORTH: "ap", NORM: "ap"}, - ], - "lap": [ - {ORTH: "l", NORM: "li"}, - {ORTH: "ap", NORM: "ap"}, - ], - "Lap": [ - {ORTH: "L", NORM: "Li"}, - {ORTH: "ap", NORM: "ap"}, - ], - "yap": [ - {ORTH: "y", NORM: "yo"}, - {ORTH: "ap", NORM: "ap"}, - ], - "Yap": [ - {ORTH: "Y", NORM: "Yo"}, - {ORTH: "ap", NORM: "ap"}, - ], - "mte": [ - {ORTH: "m", NORM: "mwen"}, - {ORTH: "te", NORM: "te"}, - ], - "Mte": [ - {ORTH: "M", NORM: "Mwen"}, - {ORTH: "te", NORM: "te"}, - ], - "mpral": [ - {ORTH: "m", NORM: "mwen"}, - {ORTH: "pral", NORM: "pral"}, - ], - "Mpral": [ - {ORTH: "M", NORM: "Mwen"}, - {ORTH: "pral", NORM: "pral"}, - ], - "wap": [ - {ORTH: "w", NORM: "ou"}, - {ORTH: "ap", NORM: "ap"}, - ], - "Wap": [ - {ORTH: "W", NORM: "Ou"}, - {ORTH: "ap", NORM: "ap"}, - ], - "kap": [ - {ORTH: "k", NORM: "ki"}, - {ORTH: "ap", NORM: "ap"}, - ], - "Kap": [ - {ORTH: "K", NORM: "Ki"}, - {ORTH: "ap", NORM: "ap"}, - ], - "tap": [ - {ORTH: "t", NORM: "te"}, - {ORTH: "ap", NORM: "ap"}, - ], - "Tap": [ - {ORTH: "T", NORM: "Te"}, - {ORTH: "ap", NORM: "ap"}, - ], -}) +TOKENIZER_EXCEPTIONS.update( + { + "map": [ + {ORTH: "m", NORM: "mwen"}, + {ORTH: "ap", NORM: "ap"}, + ], + "Map": [ + {ORTH: "M", NORM: "Mwen"}, + {ORTH: "ap", NORM: "ap"}, + ], + "lem": [ + {ORTH: "le", NORM: "le"}, + {ORTH: "m", NORM: "mwen"}, + ], + "Lem": [ + {ORTH: "Le", NORM: "Le"}, + {ORTH: "m", NORM: "mwen"}, + ], + "lew": [ + {ORTH: "le", NORM: "le"}, + {ORTH: "w", NORM: "ou"}, + ], + "Lew": [ + {ORTH: "Le", NORM: "Le"}, + {ORTH: "w", NORM: "ou"}, + ], + "nap": [ + {ORTH: "n", NORM: "nou"}, + {ORTH: "ap", NORM: "ap"}, + ], + "Nap": [ + {ORTH: "N", NORM: "Nou"}, + {ORTH: "ap", NORM: "ap"}, + ], + "lap": [ + {ORTH: "l", NORM: "li"}, + {ORTH: "ap", NORM: "ap"}, + ], + "Lap": [ + {ORTH: "L", NORM: "Li"}, + {ORTH: "ap", NORM: "ap"}, + ], + "yap": [ + {ORTH: "y", NORM: "yo"}, + {ORTH: "ap", NORM: "ap"}, + ], + "Yap": [ + {ORTH: "Y", NORM: "Yo"}, + {ORTH: "ap", NORM: "ap"}, + ], + "mte": [ + {ORTH: "m", NORM: "mwen"}, + {ORTH: "te", NORM: "te"}, + ], + "Mte": [ + {ORTH: "M", NORM: "Mwen"}, + {ORTH: "te", NORM: "te"}, + ], + "mpral": [ + {ORTH: "m", NORM: "mwen"}, + {ORTH: "pral", NORM: "pral"}, + ], + "Mpral": [ + {ORTH: "M", NORM: "Mwen"}, + {ORTH: "pral", NORM: "pral"}, + ], + "wap": [ + {ORTH: "w", NORM: "ou"}, + {ORTH: "ap", NORM: "ap"}, + ], + "Wap": [ + {ORTH: "W", NORM: "Ou"}, + {ORTH: "ap", NORM: "ap"}, + ], + "kap": [ + {ORTH: "k", NORM: "ki"}, + {ORTH: "ap", 
NORM: "ap"}, + ], + "Kap": [ + {ORTH: "K", NORM: "Ki"}, + {ORTH: "ap", NORM: "ap"}, + ], + "tap": [ + {ORTH: "t", NORM: "te"}, + {ORTH: "ap", NORM: "ap"}, + ], + "Tap": [ + {ORTH: "T", NORM: "Te"}, + {ORTH: "ap", NORM: "ap"}, + ], + } +) diff --git a/spacy/tests/lang/ht/test_exceptions.py b/spacy/tests/lang/ht/test_exceptions.py index 685b72c0767..ea2e2b2046a 100644 --- a/spacy/tests/lang/ht/test_exceptions.py +++ b/spacy/tests/lang/ht/test_exceptions.py @@ -29,4 +29,16 @@ def test_ht_tokenizer_handles_basic_abbreviation(ht_tokenizer, text): def test_ht_tokenizer_full_sentence(ht_tokenizer): text = "Si'm ka vini, m'ap pale ak li." tokens = [t.text for t in ht_tokenizer(text)] - assert tokens == ["Si", "'m", "ka", "vini", ",", "m'", "ap", "pale", "ak", "li", "."] + assert tokens == [ + "Si", + "'m", + "ka", + "vini", + ",", + "m'", + "ap", + "pale", + "ak", + "li", + ".", + ] diff --git a/spacy/tests/lang/ht/test_prefix_suffix_infix.py b/spacy/tests/lang/ht/test_prefix_suffix_infix.py index 7dabec17aff..5ff409cd9e1 100644 --- a/spacy/tests/lang/ht/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/ht/test_prefix_suffix_infix.py @@ -37,7 +37,9 @@ def test_ht_tokenizer_splits_uneven_wrap(ht_tokenizer, text): assert len(tokens) == 5 -@pytest.mark.parametrize("text,length", [("Ozetazini.", 2), ("Frans.", 2), ("(Ozetazini.", 3)]) +@pytest.mark.parametrize( + "text,length", [("Ozetazini.", 2), ("Frans.", 2), ("(Ozetazini.", 3)] +) def test_ht_tokenizer_splits_prefix_interact(ht_tokenizer, text, length): tokens = ht_tokenizer(text) assert len(tokens) == length diff --git a/spacy/tests/lang/ht/test_text.py b/spacy/tests/lang/ht/test_text.py index f396e352af6..e63299fc097 100644 --- a/spacy/tests/lang/ht/test_text.py +++ b/spacy/tests/lang/ht/test_text.py @@ -16,7 +16,6 @@ def test_ht_tokenizer_handles_long_text(ht_tokenizer): assert len(tokens) == 84 - @pytest.mark.parametrize( "text,length", [ @@ -66,14 +65,14 @@ def test_ht_lex_attrs_capitals(word): @pytest.mark.parametrize( - "word, expected", [ + "word, expected", + [ ("'m", "mwen"), ("'n", "nou"), ("'l", "li"), ("'y", "yo"), ("'w", "ou"), - ] + ], ) def test_ht_lex_attrs_norm_custom(word, expected): assert norm_custom(word) == expected - From c015dd1fa6f28a324340dacf4a409e92af8a3af8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 28 May 2025 17:27:59 +0200 Subject: [PATCH 3/3] isort --- spacy/lang/ht/__init__.py | 4 ++-- spacy/lang/ht/lemmatizer.py | 2 +- spacy/lang/ht/punctuation.py | 4 ++-- spacy/lang/ht/tag_map.py | 16 ++++++++-------- spacy/lang/ht/tokenizer_exceptions.py | 2 +- spacy/tests/lang/ht/test_noun_chunks.py | 1 + 6 files changed, 15 insertions(+), 14 deletions(-) diff --git a/spacy/lang/ht/__init__.py b/spacy/lang/ht/__init__.py index 9fc2df40ce8..7f9feb0575f 100644 --- a/spacy/lang/ht/__init__.py +++ b/spacy/lang/ht/__init__.py @@ -5,11 +5,11 @@ from ...language import BaseDefaults, Language from .lemmatizer import HaitianCreoleLemmatizer from .lex_attrs import LEX_ATTRS -from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES, TOKENIZER_SUFFIXES +from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .stop_words import STOP_WORDS from .syntax_iterators import SYNTAX_ITERATORS -from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tag_map import TAG_MAP +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS class HaitianCreoleDefaults(BaseDefaults): diff --git a/spacy/lang/ht/lemmatizer.py b/spacy/lang/ht/lemmatizer.py index 9ac096f6df1..52bf23d2390 100644 --- 
a/spacy/lang/ht/lemmatizer.py +++ b/spacy/lang/ht/lemmatizer.py @@ -1,8 +1,8 @@ from typing import List, Tuple +from ...lookups import Lookups from ...pipeline import Lemmatizer from ...tokens import Token -from ...lookups import Lookups class HaitianCreoleLemmatizer(Lemmatizer): diff --git a/spacy/lang/ht/punctuation.py b/spacy/lang/ht/punctuation.py index 0077db1c032..c4a5d090ee5 100644 --- a/spacy/lang/ht/punctuation.py +++ b/spacy/lang/ht/punctuation.py @@ -4,10 +4,10 @@ ALPHA_UPPER, CONCAT_QUOTES, HYPHENS, - LIST_PUNCT, - LIST_QUOTES, LIST_ELLIPSES, LIST_ICONS, + LIST_PUNCT, + LIST_QUOTES, merge_chars, ) diff --git a/spacy/lang/ht/tag_map.py b/spacy/lang/ht/tag_map.py index 261d1aef3b0..a190984a61a 100644 --- a/spacy/lang/ht/tag_map.py +++ b/spacy/lang/ht/tag_map.py @@ -1,20 +1,20 @@ from spacy.symbols import ( - NOUN, - VERB, - AUX, ADJ, - ADV, - PRON, - DET, ADP, - SCONJ, + ADV, + AUX, CCONJ, - PART, + DET, INTJ, + NOUN, NUM, + PART, + PRON, PROPN, PUNCT, + SCONJ, SYM, + VERB, X, ) diff --git a/spacy/lang/ht/tokenizer_exceptions.py b/spacy/lang/ht/tokenizer_exceptions.py index 4d617fd3683..deb152c25e8 100644 --- a/spacy/lang/ht/tokenizer_exceptions.py +++ b/spacy/lang/ht/tokenizer_exceptions.py @@ -1,4 +1,4 @@ -from spacy.symbols import ORTH, NORM +from spacy.symbols import NORM, ORTH def make_variants(base, first_norm, second_orth, second_norm): diff --git a/spacy/tests/lang/ht/test_noun_chunks.py b/spacy/tests/lang/ht/test_noun_chunks.py index 76c5a1df32d..fcefd7dfd37 100644 --- a/spacy/tests/lang/ht/test_noun_chunks.py +++ b/spacy/tests/lang/ht/test_noun_chunks.py @@ -1,4 +1,5 @@ import pytest + from spacy.tokens import Doc
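
A short illustration of the behaviour PATCH 1/3 describes. This is a standalone sketch, not part of the patch series; it assumes an importable spaCy build, and the printed results depend on whether that build includes the fix.

```python
# Illustrative sketch only -- not part of the patches above.
# Assumes spaCy is installed; results differ before/after PATCH 1/3.
from spacy import parts_of_speech

noun = parts_of_speech.IDS["NOUN"]

# Duck-typing works either way: the value compares, adds and prints like an int.
print(noun == noun + 0)

# These are the two behaviours the patch is about. On a Cython 3 build
# without the fix, the member is a 'flag'-like enum value, so isinstance
# can return False and repr() does not look like a plain integer.
print(isinstance(noun, int))
print(repr(noun))

# The fix applied in attrs.pyx and parts_of_speech.pyx is a plain coercion,
# restoring the pre-Cython-3 behaviour of ordinary Python ints:
# IDS = {name: int(value) for name, value in IDS.items()}
```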