diff --git a/src/risk_assessment/classification/__init__.py b/src/risk_assessment/classification/__init__.py index 60ebdc0..503d4e6 100644 --- a/src/risk_assessment/classification/__init__.py +++ b/src/risk_assessment/classification/__init__.py @@ -130,7 +130,7 @@ class DatasetClassificationConfiguration: >>> config = DatasetClassificationConfiguration( ... identifiers=[Email(), Phone()], ... mark_unknown=True, - ... unknonw_type="UNKNOWN" + ... unknown_type="UNKNOWN" ... ) """ @@ -139,7 +139,7 @@ def __init__( identifiers: list[Identifier | str], strategy: DatasetClassificationStrategy = FrequencyBasedDatasetClassificationStrategy(), mark_unknown: bool = True, - unknonw_type: str = "UNKNOWN", + unknown_type: str = "UNKNOWN", ) -> None: """Initialize the classification configuration. @@ -147,12 +147,12 @@ def __init__( identifiers: List of Identifier instances or fully qualified name strings. strategy: Classification strategy to use (default: frequency-based). mark_unknown: Whether to mark unidentified values as unknown (default: True). - unknonw_type: Label for unknown values (default: "UNKNOWN"). + unknown_type: Label for unknown values (default: "UNKNOWN"). """ self.identifiers = build_identifiers(identifiers) self.strategy = strategy self.mark_unknown = mark_unknown - self.unknown_type = unknonw_type + self.unknown_type = unknown_type @dataclass diff --git a/src/risk_assessment/classification/identifiers/age.py b/src/risk_assessment/classification/identifiers/age.py index 26a3f91..7c37b40 100644 --- a/src/risk_assessment/classification/identifiers/age.py +++ b/src/risk_assessment/classification/identifiers/age.py @@ -140,8 +140,8 @@ class AgeImproved(Identifier): compile(r"^deceased\s+([0-9]+)$", I | U), compile(r"^died\s+at\s+([0-9]+)$", I | U), compile(r"^died\s+([0-9]+)-old\s+age$", I | U), - compile(r"^died\s+of\s+([\w|'|-]+\s+){1,3}at\s+([0-9]+)$", I | U), - compile(r"^died\s+of\s+([\w|'|-]+\s+){1,3}at\s+age\s+(of\s+)?([0-9]+)$", I | U), + compile(r"^died\s+of\s+([\w'|-]+\s+){1,3}at\s+([0-9]+)$", I | U), + compile(r"^died\s+of\s+([\w'|-]+\s+){1,3}at\s+age\s+(of\s+)?([0-9]+)$", I | U), compile(r"^passed\s+away\s+at\s+age\s+([0-9]+)$", I | U), ] diff --git a/src/risk_assessment/classification/identifiers/geography.py b/src/risk_assessment/classification/identifiers/geography.py index 7f844a8..0caea36 100644 --- a/src/risk_assessment/classification/identifiers/geography.py +++ b/src/risk_assessment/classification/identifiers/geography.py @@ -16,7 +16,7 @@ logger = logging.getLogger(__name__) -def _extract_all_langugage_city_names(file: str) -> list[str]: +def _extract_all_language_city_names(file: str) -> list[str]: """Extract city names from a multi-language file. Args: @@ -683,7 +683,7 @@ def __init__(self) -> None: "GU", "Guam", "VI", - "Vigin Islands", + "Virgin Islands", "PR", "Puerto Rico", "FM", diff --git a/src/risk_assessment/classification/identifiers/us_postal_address.py b/src/risk_assessment/classification/identifiers/us_postal_address.py index d36cc03..ce16807 100644 --- a/src/risk_assessment/classification/identifiers/us_postal_address.py +++ b/src/risk_assessment/classification/identifiers/us_postal_address.py @@ -1,5 +1,3 @@ -import re - import re2 from risk_assessment.classification.identifiers import Identifier @@ -633,7 +631,7 @@ r"|TX|Texas" r"|UT|Utah" r"|VT|Vermont" - r"|VA|Virginia[H]" + r"|VA|Virginia" r"|WA|Washington" r"|WV|West Virginia" r"|WI|Wisconsin" @@ -705,7 +703,7 @@ def __init__(self) -> None: + r",?(?:\s+\w{3,})+,?(?:\s+\w{3,})+,?\s+" + STATE_AND_POSSESSIONS + r")" - r"|(?:\w{3,}(:?\s+\w{3,})*,?\s+" + STATE_AND_POSSESSIONS + r",?\s+" + ZIP_CODE + r"(?:\s+\w{2,})+)" + r"|(?:\w{3,}(?:\s+\w{3,})*,?\s+" + STATE_AND_POSSESSIONS + r",?\s+" + ZIP_CODE + r"(?:\s+\w{2,})+)" r"|(?:\d+(?:\s+\w{3,})*(?:\s+" + SUFFIX + r")?,?\s+" @@ -768,7 +766,7 @@ def _check_that_case_is_consistent(text: str) -> bool: False if mixed case is detected """ # Split on whitespace and commas using proper regex - tokens = re.split(r"[\s,]+", text) + tokens = re2.split(r"[\s,]+", text) upper_count = 0 lower_count = 0 diff --git a/tests/classification/identifiers/test_geography.py b/tests/classification/identifiers/test_geography.py index 4375cff..8567190 100644 --- a/tests/classification/identifiers/test_geography.py +++ b/tests/classification/identifiers/test_geography.py @@ -4,7 +4,7 @@ from risk_assessment.classification.identifiers.geography import ( UKPostCode, UnitedStateState, - _extract_all_langugage_city_names, + _extract_all_language_city_names, ) @@ -182,7 +182,7 @@ def test_uk_postcode_suppors_for_known_formats(): def test_all_city_names(): - identifier = City("data/all_language_city_names.txt", _extract_all_langugage_city_names) + identifier = City("data/all_language_city_names.txt", _extract_all_language_city_names) assert len(identifier.data) == 930425, len(identifier.data) diff --git a/tests/classification/unstructured/test_aggregator.py b/tests/classification/unstructured/test_aggregator.py index fe8d310..988f842 100644 --- a/tests/classification/unstructured/test_aggregator.py +++ b/tests/classification/unstructured/test_aggregator.py @@ -333,7 +333,6 @@ def test_aggregation_different_tokenizers(): entities1 = [Entity(len("my_email is: "), len(data), "Email", frozenset(["DRL"]))] entities2 = [Entity(len("my_email is: "), len(data), "URI", frozenset(["STANZA"]))] - entities3 = [Entity(len("my_email is: "), len("my_email is: john"), "NAME", frozenset(["DRL2"]))] entities3 = [ Entity(len("my_email is: john"), len("my_email is: john.doe"), "NAME", frozenset(["SPACY"])), Entity(len("my_email is: john.doe"), len(data), "URI", frozenset(["Spacy"])),