Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions src/risk_assessment/classification/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ class DatasetClassificationConfiguration:
>>> config = DatasetClassificationConfiguration(
... identifiers=[Email(), Phone()],
... mark_unknown=True,
... unknonw_type="UNKNOWN"
... unknown_type="UNKNOWN"
... )
"""

Expand All @@ -139,20 +139,20 @@ def __init__(
identifiers: list[Identifier | str],
strategy: DatasetClassificationStrategy = FrequencyBasedDatasetClassificationStrategy(),
mark_unknown: bool = True,
unknonw_type: str = "UNKNOWN",
unknown_type: str = "UNKNOWN",
) -> None:
"""Initialize the classification configuration.

Args:
identifiers: List of Identifier instances or fully qualified name strings.
strategy: Classification strategy to use (default: frequency-based).
mark_unknown: Whether to mark unidentified values as unknown (default: True).
unknonw_type: Label for unknown values (default: "UNKNOWN").
unknown_type: Label for unknown values (default: "UNKNOWN").
"""
self.identifiers = build_identifiers(identifiers)
self.strategy = strategy
self.mark_unknown = mark_unknown
self.unknown_type = unknonw_type
self.unknown_type = unknown_type


@dataclass
Expand Down
4 changes: 2 additions & 2 deletions src/risk_assessment/classification/identifiers/age.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,8 @@ class AgeImproved(Identifier):
compile(r"^deceased\s+([0-9]+)$", I | U),
compile(r"^died\s+at\s+([0-9]+)$", I | U),
compile(r"^died\s+([0-9]+)-old\s+age$", I | U),
compile(r"^died\s+of\s+([\w|'|-]+\s+){1,3}at\s+([0-9]+)$", I | U),
compile(r"^died\s+of\s+([\w|'|-]+\s+){1,3}at\s+age\s+(of\s+)?([0-9]+)$", I | U),
compile(r"^died\s+of\s+([\w'|-]+\s+){1,3}at\s+([0-9]+)$", I | U),
compile(r"^died\s+of\s+([\w'|-]+\s+){1,3}at\s+age\s+(of\s+)?([0-9]+)$", I | U),
compile(r"^passed\s+away\s+at\s+age\s+([0-9]+)$", I | U),
]

Expand Down
4 changes: 2 additions & 2 deletions src/risk_assessment/classification/identifiers/geography.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
logger = logging.getLogger(__name__)


def _extract_all_langugage_city_names(file: str) -> list[str]:
def _extract_all_language_city_names(file: str) -> list[str]:
"""Extract city names from a multi-language file.

Args:
Expand Down Expand Up @@ -683,7 +683,7 @@ def __init__(self) -> None:
"GU",
"Guam",
"VI",
"Vigin Islands",
"Virgin Islands",
"PR",
"Puerto Rico",
"FM",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import re

import re2

from risk_assessment.classification.identifiers import Identifier
Expand Down Expand Up @@ -633,7 +631,7 @@
r"|TX|Texas"
r"|UT|Utah"
r"|VT|Vermont"
r"|VA|Virginia[H]"
r"|VA|Virginia"
r"|WA|Washington"
r"|WV|West Virginia"
r"|WI|Wisconsin"
Expand Down Expand Up @@ -705,7 +703,7 @@ def __init__(self) -> None:
+ r",?(?:\s+\w{3,})+,?(?:\s+\w{3,})+,?\s+"
+ STATE_AND_POSSESSIONS
+ r")"
r"|(?:\w{3,}(:?\s+\w{3,})*,?\s+" + STATE_AND_POSSESSIONS + r",?\s+" + ZIP_CODE + r"(?:\s+\w{2,})+)"
r"|(?:\w{3,}(?:\s+\w{3,})*,?\s+" + STATE_AND_POSSESSIONS + r",?\s+" + ZIP_CODE + r"(?:\s+\w{2,})+)"
r"|(?:\d+(?:\s+\w{3,})*(?:\s+"
+ SUFFIX
+ r")?,?\s+"
Expand Down Expand Up @@ -768,7 +766,7 @@ def _check_that_case_is_consistent(text: str) -> bool:
False if mixed case is detected
"""
# Split on whitespace and commas using proper regex
tokens = re.split(r"[\s,]+", text)
tokens = re2.split(r"[\s,]+", text)

upper_count = 0
lower_count = 0
Expand Down
4 changes: 2 additions & 2 deletions tests/classification/identifiers/test_geography.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from risk_assessment.classification.identifiers.geography import (
UKPostCode,
UnitedStateState,
_extract_all_langugage_city_names,
_extract_all_language_city_names,
)


Expand Down Expand Up @@ -182,7 +182,7 @@ def test_uk_postcode_suppors_for_known_formats():


def test_all_city_names():
identifier = City("data/all_language_city_names.txt", _extract_all_langugage_city_names)
identifier = City("data/all_language_city_names.txt", _extract_all_language_city_names)

assert len(identifier.data) == 930425, len(identifier.data)

Expand Down
1 change: 0 additions & 1 deletion tests/classification/unstructured/test_aggregator.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,6 @@ def test_aggregation_different_tokenizers():

entities1 = [Entity(len("my_email is: "), len(data), "Email", frozenset(["DRL"]))]
entities2 = [Entity(len("my_email is: "), len(data), "URI", frozenset(["STANZA"]))]
entities3 = [Entity(len("my_email is: "), len("my_email is: john"), "NAME", frozenset(["DRL2"]))]
entities3 = [
Entity(len("my_email is: john"), len("my_email is: john.doe"), "NAME", frozenset(["SPACY"])),
Entity(len("my_email is: john.doe"), len(data), "URI", frozenset(["Spacy"])),
Expand Down
Loading