Skip to content
Merged
8 changes: 4 additions & 4 deletions .secrets.baseline
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"files": "^.secrets.baseline$",
"lines": null
},
"generated_at": "2026-05-15T10:59:03Z",
"generated_at": "2026-05-16T20:54:57Z",
"plugins_used": [
{
"name": "AWSKeyDetector"
Expand Down Expand Up @@ -82,7 +82,7 @@
"hashed_secret": "f3e0d184814b86dc1c4eb623edde7610cf212567",
"is_secret": false,
"is_verified": false,
"line_number": 845,
"line_number": 860,
"type": "Hex High Entropy String",
"verified_result": null
}
Expand Down Expand Up @@ -188,15 +188,15 @@
"hashed_secret": "f3e0d184814b86dc1c4eb623edde7610cf212567",
"is_secret": false,
"is_verified": false,
"line_number": 163,
"line_number": 161,
"type": "Hex High Entropy String",
"verified_result": null
},
{
"hashed_secret": "804ec071803318791b835cffd6e509c8d32239db",
"is_secret": false,
"is_verified": false,
"line_number": 165,
"line_number": 163,
"type": "Hex High Entropy String",
"verified_result": null
}
Expand Down
21 changes: 17 additions & 4 deletions src/risk_assessment/classification/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
"""

from collections import defaultdict
from contextlib import suppress
from dataclasses import dataclass
from typing import Any, cast

Expand Down Expand Up @@ -45,12 +46,24 @@ def create_instance(identifier_fqn: str) -> Identifier:
for comp in parts[1:]:
module = getattr(module, comp)

if type(module) is type(Identifier):
m = module()
return cast(Identifier, m)
# Verify that module is a class (type) and is a subclass of Identifier
if not isinstance(module, type):
raise ValueError(
f"{identifier_fqn} is not a class. "
"Expected a subclass of `risk_assessment.classification.identifiers.Identifier`, "
f"but got {type(module).__name__}"
)

try:
if issubclass(module, Identifier):
return module()
except TypeError as e:
# issubclass() raises TypeError if module is not a class (shouldn't happen due to isinstance check above)
raise ValueError(f"{identifier_fqn} cannot be checked as a subclass: {e}") from e

raise ValueError(
f"{identifier_fqn} does not exists or is not a subclass of `risk_assessment.classification.identifiers.Identifier`"
f"{identifier_fqn} is not a subclass of `risk_assessment.classification.identifiers.Identifier`. "
f"Found class: {module.__name__}"
)


Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import re
from re import Pattern

from risk_assessment.classification.identifiers import Identifier

Expand All @@ -20,7 +19,7 @@


class AccountsOfficeReferenceNumber(Identifier):
pattern: Pattern[str] = re.compile(r"^\d{3}P[a-z]\d{7}(?:\d|X)(\d{4})?$", re.I) # 13 characters, or 17 with the optional 4-digit suffix
pattern: re.Pattern[str] = re.compile(r"^\d{3}P[a-z]\d{7}(?:\d|X)(\d{4})?$", re.I) # 13 characters, or 17 with the optional 4-digit suffix

def is_of_this_type(self, text: str) -> bool:
match = self.pattern.match(text)
Expand Down
5 changes: 2 additions & 3 deletions src/risk_assessment/classification/identifiers/age.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
age expressions with units (years, months, weeks), and age-related phrases.
"""

from contextlib import suppress
from re import I, Pattern, U, compile

from word2number.w2n import word_to_num
Expand Down Expand Up @@ -38,13 +39,11 @@ def is_of_this_type(self, text: str | int) -> bool:
int_value: int = 10_000_000

if isinstance(text, str):
try:
with suppress(ValueError):
int_value = int(text, base=10)

if text != str(int_value):
return False
except ValueError:
pass

elif isinstance(text, int):
int_value = text
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import re
from re import Pattern

from risk_assessment.classification.identifiers import Identifier

Expand Down Expand Up @@ -57,7 +56,7 @@ def _validate_checksum(federal_reserve_routing: str, aba_institution: str, check


class AmericanBankersAssociationNumber(Identifier):
pattern: Pattern[str] = re.compile(r"^(\d{4})(\d{4})(\d)$")
pattern: re.Pattern[str] = re.compile(r"^(\d{4})(\d{4})(\d)$")

def is_of_this_type(self, text: str) -> bool:
match = self.pattern.match(text)
Expand Down
19 changes: 9 additions & 10 deletions src/risk_assessment/classification/identifiers/date.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
import re
from collections.abc import Callable, Iterable
from datetime import datetime
from re import Match, Pattern
from typing import Any

import re2
Expand All @@ -17,9 +16,9 @@


def _compute_unique_patterns(
patterns: dict[str, Pattern[str]],
ampm_patterns: dict[str, Pattern[str]],
patterns_with_processing: dict[str, tuple[Pattern[str], Callable[[Match[str]], str]]],
patterns: dict[str, re.Pattern[str]],
ampm_patterns: dict[str, re.Pattern[str]],
patterns_with_processing: dict[str, tuple[re.Pattern[str], Callable[[re.Match[str]], str]]],
) -> str:
"""Compute unique regex patterns from multiple pattern dictionaries.

Expand All @@ -43,7 +42,7 @@ def _compute_unique_patterns(
return "|".join(unique_patterns)


_RePatternLike = Pattern[str] | Any
_RePatternLike = re.Pattern[str] | Any


class DateTime(Identifier):
Expand Down Expand Up @@ -77,7 +76,7 @@ class DateTime(Identifier):
True
"""

patterns: dict[str, Pattern[str]] = {
patterns: dict[str, re.Pattern[str]] = {
r"%d %b %Y %H:%M:%S %z": re.compile(
r"^\d{1,2} \w{3} \d{4} \d{1,2}:\d{1,2}:\d{1,2} [+-]?\d{2}\d{2}(?:\d{2}(?:\.\d{6})?)?$", re.I | re.U
),
Expand Down Expand Up @@ -158,7 +157,7 @@ class DateTime(Identifier):
r"%y年%m・%d": re.compile(r"^\d{2}年\d{1,2}・\d{1,2}$", re.I | re.U),
r"%y年%m": re.compile(r"^\d{2}年\d{1,2}$", re.I | re.U),
}
ampm_patterns: dict[str, Pattern[str]] = {
ampm_patterns: dict[str, re.Pattern[str]] = {
r"%B %d, %Y %I:%M %p": re.compile(r"^\w{4,} \d{1,2}, \d{4} \d{1,2}:\d{1,2} [AP]M$", re.I | re.U),
r"%a %b %d, %Y %I:%M %p": re.compile(r"^\w{3} \w{3} \d{1,2}, \d{4} \d{1,2}:\d{1,2} [AP]M$", re.I | re.U),
r"%d/%m/%Y %I:%M %p": re.compile(r"^\d{1,2}/\d{1,2}/\d{4} \d{1,2}:\d{1,2} [AP]M$", re.I | re.U),
Expand All @@ -180,7 +179,7 @@ class DateTime(Identifier):
r"^\d{4}/\d{1,2}/\d{1,2} \d{1,2}:\d{1,2}:\d{1,2} [AP]M GMT[+-]\d{1,2}$", re.I | re.U
),
}
patterns_with_processing: dict[str, tuple[Pattern[str], Callable[[Match[str]], str]]] = {
patterns_with_processing: dict[str, tuple[re.Pattern[str], Callable[[re.Match[str]], str]]] = {
r"%Y/%m/%d %I:%M:%S %p %Z": (
re.compile(r"^(\d{4}/\d{1,2}/\d{1,2} \d{1,2}:\d{1,2}:\d{1,2} [AP]M (?:\w{3}))[\+-]\d+$", re.I | re.U),
lambda m: m.group(1),
Expand Down Expand Up @@ -388,7 +387,7 @@ def is_of_this_type(self, text: str) -> bool:


def _match_patterns_with_code(
patterns: Iterable[tuple[str, tuple[Pattern[str], Callable[[Match[str]], str]]]], text: str
patterns: Iterable[tuple[str, tuple[re.Pattern[str], Callable[[re.Match[str]], str]]]], text: str
) -> bool:
"""Match text against patterns that require preprocessing.

Expand Down Expand Up @@ -439,7 +438,7 @@ def _match_format(format: str, text: str) -> bool:
return False


def _match_patterns(patterns: Iterable[tuple[str, Pattern[str]]], text: str) -> bool:
def _match_patterns(patterns: Iterable[tuple[str, re.Pattern[str]]], text: str) -> bool:
"""Match text against multiple datetime patterns.

Args:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import re
from pathlib import Path
from re import Pattern

from risk_assessment.classification.identifiers import Identifier

Expand All @@ -20,7 +19,7 @@ def _load_valid_zipcodes() -> dict[str, set[str]]:


class FrenchPostalCode(Identifier):
pattern: Pattern[str] = re.compile(r"^(\d{2})(\d{3})$")
pattern: re.Pattern[str] = re.compile(r"^(\d{2})(\d{3})$")
departments: dict[str, set[str]] = _load_valid_zipcodes()

def is_of_this_type(self, text: str) -> bool:
Expand Down
5 changes: 2 additions & 3 deletions src/risk_assessment/classification/identifiers/geography.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import logging
import re
from collections.abc import Callable, Iterable
from contextlib import suppress
from pathlib import Path

from risk_assessment.classification.identifiers import DictionaryIdentifier, Identifier
Expand Down Expand Up @@ -512,13 +513,11 @@ def is_of_this_type(self, text: str) -> bool:
"""
text = text.strip()
if len(text) == 5:
try:
with suppress(ValueError):
int_code = int(text, base=10)
for _, (m, M) in self.valid_codes.items():
if m <= int_code <= M:
return True
except ValueError:
pass

return False

Expand Down
27 changes: 13 additions & 14 deletions src/risk_assessment/classification/identifiers/japan_address.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,31 @@
import re
from re import Pattern
from re import I, Pattern, U, compile

from risk_assessment.classification.identifiers import Identifier


class JapanAddress(Identifier):
patterns: list[Pattern[str]] = [
re.compile(
r"^\d+\s+\w+\s+\w{3,}-\w{2,5}\s+\w{3,}(?:-\w{2,3})?,\s+\w{3,}\s+(?:〒\s*)?\d{3}-\d{4}\s+JAPAN$", re.I | re.U
compile(
r"^\d+\s+\w+\s+\w{3,}-\w{2,5}\s+\w{3,}(?:-\w{2,3})?,\s+\w{3,}\s+(?:〒\s*)?\d{3}-\d{4}\s+JAPAN$", I | U
), # rural
re.compile(
compile(
r"^\d+-\d+,\s+\w{3,}\s+\d+-chome\s+\w{3,}(?:-\w{3,})*-(?:shi|gun|ku|machi|cho),\s+\w+(?:-(?:ken|fu|to))?\s+(?:〒\s*)?\d{3}-\d{4}\s+JAPAN$",
re.I | re.U,
I | U,
), # city
re.compile(
compile(
r"^\d+-\d+-\d+,\s+\w{3,}\s+\w{3,}(?:-\w{3,})*-(?:shi|gun|ku|machi|cho),\s+\w+(?:-(?:ken|fu|to))?\s+(?:〒\s*)?\d{3}-\d{4}\s+JAPAN$",
re.I | re.U,
), # city, compressed # re.compile(r"", re.I | re.U), # city
re.compile(
I | U,
), # city, compressed
compile(
r"^\d+-\d+,\s+\w{3,}\s+\w{3,}(?:-\w{3,})*-(?:shi|gun|ku|machi|cho),\s+\w+(?:-(?:ken|fu|to))?\s+(?:〒\s*)?\d{3}-\d{4}\s+JAPAN$$",
re.I | re.U,
I | U,
), # city as prefecture
re.compile(
compile(
r"^JAPAN\s+(?:〒\s*)?\d{3}-\d{4}\s+\w+(?:-(?:ken|fu|to))?\s+\w{3,}(?:-\w{3,})*-(?:shi|gun|ku|machi|cho)\s+\w{3,}\s+\d+(?:-chome)?(?:\s+|-)\d+-\d+$",
re.I | re.U,
I | U,
), # oneliner
# from RWD
re.compile(r"^〒?(:?\d+-\d+)\s+\w+\s*\d+$"),
compile(r"^〒?(:?\d+-\d+)\s+\w+\s*\d+$"),
]

def is_of_this_type(self, text: str) -> bool:
Expand Down
Loading
Loading