From 9e035823de872dd999a3736bfa23ff3676996998 Mon Sep 17 00:00:00 2001 From: Stefano Braghin <527806+stefano81@users.noreply.github.com> Date: Sat, 16 May 2026 20:20:27 +0100 Subject: [PATCH 1/9] fix: address "Module is imported with `import` and `import from`" issue from CodeQL Signed-off-by: Stefano Braghin <527806+stefano81@users.noreply.github.com> --- .../accounts_office_reference_number.py | 3 +-- .../american_bankers_association.py | 3 +-- .../classification/identifiers/date.py | 19 +++++++------ .../identifiers/french_postal_code.py | 3 +-- .../identifiers/japan_address.py | 27 +++++++++---------- .../classification/identifiers/vehicle.py | 5 ++-- 6 files changed, 27 insertions(+), 33 deletions(-) diff --git a/src/risk_assessment/classification/identifiers/accounts_office_reference_number.py b/src/risk_assessment/classification/identifiers/accounts_office_reference_number.py index 3a64ad3..d96de3c 100644 --- a/src/risk_assessment/classification/identifiers/accounts_office_reference_number.py +++ b/src/risk_assessment/classification/identifiers/accounts_office_reference_number.py @@ -1,5 +1,4 @@ import re -from re import Pattern from risk_assessment.classification.identifiers import Identifier @@ -20,7 +19,7 @@ class AccountsOfficeReferenceNumber(Identifier): - pattern: Pattern[str] = re.compile(r"^\d{3}P[a-z]\d{7}(?:\d|X)(\d{4})?$", re.I) # 13 or 16 characters + pattern: re.Pattern[str] = re.compile(r"^\d{3}P[a-z]\d{7}(?:\d|X)(\d{4})?$", re.I) # 13 or 16 characters def is_of_this_type(self, text: str) -> bool: match = self.pattern.match(text) diff --git a/src/risk_assessment/classification/identifiers/american_bankers_association.py b/src/risk_assessment/classification/identifiers/american_bankers_association.py index 24d01f6..53df487 100644 --- a/src/risk_assessment/classification/identifiers/american_bankers_association.py +++ b/src/risk_assessment/classification/identifiers/american_bankers_association.py @@ -1,5 +1,4 @@ import re -from re import Pattern from risk_assessment.classification.identifiers import Identifier @@ -57,7 +56,7 @@ def _validate_checksum(federal_reserve_routing: str, aba_institution: str, check class AmericanBankersAssociationNumber(Identifier): - pattern: Pattern[str] = re.compile(r"^(\d{4})(\d{4})(\d)$") + pattern: re.Pattern[str] = re.compile(r"^(\d{4})(\d{4})(\d)$") def is_of_this_type(self, text: str) -> bool: match = self.pattern.match(text) diff --git a/src/risk_assessment/classification/identifiers/date.py b/src/risk_assessment/classification/identifiers/date.py index fbfb410..93e6aec 100644 --- a/src/risk_assessment/classification/identifiers/date.py +++ b/src/risk_assessment/classification/identifiers/date.py @@ -8,7 +8,6 @@ import re from collections.abc import Callable, Iterable from datetime import datetime -from re import Match, Pattern from typing import Any import re2 @@ -17,9 +16,9 @@ def _compute_unique_patterns( - patterns: dict[str, Pattern[str]], - ampm_patterns: dict[str, Pattern[str]], - patterns_with_processing: dict[str, tuple[Pattern[str], Callable[[Match[str]], str]]], + patterns: dict[str, re.Pattern[str]], + ampm_patterns: dict[str, re.Pattern[str]], + patterns_with_processing: dict[str, tuple[re.Pattern[str], Callable[[re.Match[str]], str]]], ) -> str: """Compute unique regex patterns from multiple pattern dictionaries. @@ -43,7 +42,7 @@ def _compute_unique_patterns( return "|".join(unique_patterns) -_RePatternLike = Pattern[str] | Any +_RePatternLike = re.Pattern[str] | Any class DateTime(Identifier): @@ -77,7 +76,7 @@ class DateTime(Identifier): True """ - patterns: dict[str, Pattern[str]] = { + patterns: dict[str, re.Pattern[str]] = { r"%d %b %Y %H:%M:%S %z": re.compile( r"^\d{1,2} \w{3} \d{4} \d{1,2}:\d{1,2}:\d{1,2} [+-]?\d{2}\d{2}(?:\d{2}(?:\.\d{6})?)?$", re.I | re.U ), @@ -158,7 +157,7 @@ class DateTime(Identifier): r"%y年%m・%d": re.compile(r"^\d{2}年\d{1,2}・\d{1,2}$", re.I | re.U), r"%y年%m": re.compile(r"^\d{2}年\d{1,2}$", re.I | re.U), } - ampm_patterns: dict[str, Pattern[str]] = { + ampm_patterns: dict[str, re.Pattern[str]] = { r"%B %d, %Y %I:%M %p": re.compile(r"^\w{4,} \d{1,2}, \d{4} \d{1,2}:\d{1,2} [AP]M$", re.I | re.U), r"%a %b %d, %Y %I:%M %p": re.compile(r"^\w{3} \w{3} \d{1,2}, \d{4} \d{1,2}:\d{1,2} [AP]M$", re.I | re.U), r"%d/%m/%Y %I:%M %p": re.compile(r"^\d{1,2}/\d{1,2}/\d{4} \d{1,2}:\d{1,2} [AP]M$", re.I | re.U), @@ -180,7 +179,7 @@ class DateTime(Identifier): r"^\d{4}/\d{1,2}/\d{1,2} \d{1,2}:\d{1,2}:\d{1,2} [AP]M GMT[+-]\d{1,2}$", re.I | re.U ), } - patterns_with_processing: dict[str, tuple[Pattern[str], Callable[[Match[str]], str]]] = { + patterns_with_processing: dict[str, tuple[re.Pattern[str], Callable[[re.Match[str]], str]]] = { r"%Y/%m/%d %I:%M:%S %p %Z": ( re.compile(r"^(\d{4}/\d{1,2}/\d{1,2} \d{1,2}:\d{1,2}:\d{1,2} [AP]M (?:\w{3}))[\+-]\d+$", re.I | re.U), lambda m: m.group(1), @@ -388,7 +387,7 @@ def is_of_this_type(self, text: str) -> bool: def _match_patterns_with_code( - patterns: Iterable[tuple[str, tuple[Pattern[str], Callable[[Match[str]], str]]]], text: str + patterns: Iterable[tuple[str, tuple[re.Pattern[str], Callable[[re.Match[str]], str]]]], text: str ) -> bool: """Match text against patterns that require preprocessing. @@ -439,7 +438,7 @@ def _match_format(format: str, text: str) -> bool: return False -def _match_patterns(patterns: Iterable[tuple[str, Pattern[str]]], text: str) -> bool: +def _match_patterns(patterns: Iterable[tuple[str, re.Pattern[str]]], text: str) -> bool: """Match text against multiple datetime patterns. Args: diff --git a/src/risk_assessment/classification/identifiers/french_postal_code.py b/src/risk_assessment/classification/identifiers/french_postal_code.py index aa10323..0e253a8 100644 --- a/src/risk_assessment/classification/identifiers/french_postal_code.py +++ b/src/risk_assessment/classification/identifiers/french_postal_code.py @@ -1,6 +1,5 @@ import re from pathlib import Path -from re import Pattern from risk_assessment.classification.identifiers import Identifier @@ -20,7 +19,7 @@ def _load_valid_zipcodes() -> dict[str, set[str]]: class FrenchPostalCode(Identifier): - pattern: Pattern[str] = re.compile(r"^(\d{2})(\d{3})$") + pattern: re.Pattern[str] = re.compile(r"^(\d{2})(\d{3})$") departments: dict[str, set[str]] = _load_valid_zipcodes() def is_of_this_type(self, text: str) -> bool: diff --git a/src/risk_assessment/classification/identifiers/japan_address.py b/src/risk_assessment/classification/identifiers/japan_address.py index 356cd37..59fdb76 100644 --- a/src/risk_assessment/classification/identifiers/japan_address.py +++ b/src/risk_assessment/classification/identifiers/japan_address.py @@ -1,32 +1,31 @@ -import re -from re import Pattern +from re import I, Pattern, U, compile from risk_assessment.classification.identifiers import Identifier class JapanAddress(Identifier): patterns: list[Pattern[str]] = [ - re.compile( - r"^\d+\s+\w+\s+\w{3,}-\w{2,5}\s+\w{3,}(?:-\w{2,3})?,\s+\w{3,}\s+(?:〒\s*)?\d{3}-\d{4}\s+JAPAN$", re.I | re.U + compile( + r"^\d+\s+\w+\s+\w{3,}-\w{2,5}\s+\w{3,}(?:-\w{2,3})?,\s+\w{3,}\s+(?:〒\s*)?\d{3}-\d{4}\s+JAPAN$", I | U ), # rural - re.compile( + compile( r"^\d+-\d+,\s+\w{3,}\s+\d+-chome\s+\w{3,}(?:-\w{3,})*-(?:shi|gun|ku|machi|cho),\s+\w+(?:-(?:ken|fu|to))?\s+(?:〒\s*)?\d{3}-\d{4}\s+JAPAN$", - re.I | re.U, + I | U, ), # city - re.compile( + compile( r"^\d+-\d+-\d+,\s+\w{3,}\s+\w{3,}(?:-\w{3,})*-(?:shi|gun|ku|machi|cho),\s+\w+(?:-(?:ken|fu|to))?\s+(?:〒\s*)?\d{3}-\d{4}\s+JAPAN$", - re.I | re.U, - ), # city, compressed # re.compile(r"", re.I | re.U), # city - re.compile( + I | U, + ), # city, compressed + compile( r"^\d+-\d+,\s+\w{3,}\s+\w{3,}(?:-\w{3,})*-(?:shi|gun|ku|machi|cho),\s+\w+(?:-(?:ken|fu|to))?\s+(?:〒\s*)?\d{3}-\d{4}\s+JAPAN$$", - re.I | re.U, + I | U, ), # city as prefecture - re.compile( + compile( r"^JAPAN\s+(?:〒\s*)?\d{3}-\d{4}\s+\w+(?:-(?:ken|fu|to))?\s+\w{3,}(?:-\w{3,})*-(?:shi|gun|ku|machi|cho)\s+\w{3,}\s+\d+(?:-chome)?(?:\s+|-)\d+-\d+$", - re.I | re.U, + I | U, ), # oneliner # from RWD - re.compile(r"^〒?(:?\d+-\d+)\s+\w+\s*\d+$"), + compile(r"^〒?(:?\d+-\d+)\s+\w+\s*\d+$"), ] def is_of_this_type(self, text: str) -> bool: diff --git a/src/risk_assessment/classification/identifiers/vehicle.py b/src/risk_assessment/classification/identifiers/vehicle.py index 50e0a06..7d7f55a 100644 --- a/src/risk_assessment/classification/identifiers/vehicle.py +++ b/src/risk_assessment/classification/identifiers/vehicle.py @@ -4,8 +4,7 @@ with checksum verification and World Manufacturer Identifier validation. """ -import re -from re import Pattern +from re import Pattern, compile from risk_assessment.classification.identifiers import Identifier @@ -32,7 +31,7 @@ class VehicleIdentificationNumber(Identifier): True """ - pattern: Pattern[str] = re.compile( + pattern: Pattern[str] = compile( r"^([ABCDEFGHJKLMNPRSTUVWXYZ0-9]{3})([ABCDEFGHJKLMNPRSTUVWXYZ0-9]{6})([ABCDEFGHJKLMNPRSTUVWXYZ0-9]{8})$" ) From fd3e5841b1c2f4df65e044e522b6be766fb39856 Mon Sep 17 00:00:00 2001 From: Stefano Braghin <527806+stefano81@users.noreply.github.com> Date: Sat, 16 May 2026 20:28:01 +0100 Subject: [PATCH 2/9] fix: address "Empty except" issue from CodeQL Signed-off-by: Stefano Braghin <527806+stefano81@users.noreply.github.com> --- .../classification/identifiers/age.py | 5 ++--- .../classification/identifiers/geography.py | 5 ++--- .../classification/identifiers/network.py | 21 +++++++------------ 3 files changed, 11 insertions(+), 20 deletions(-) diff --git a/src/risk_assessment/classification/identifiers/age.py b/src/risk_assessment/classification/identifiers/age.py index 2732119..26a3f91 100644 --- a/src/risk_assessment/classification/identifiers/age.py +++ b/src/risk_assessment/classification/identifiers/age.py @@ -4,6 +4,7 @@ age expressions with units (years, months, weeks), and age-related phrases. """ +from contextlib import suppress from re import I, Pattern, U, compile from word2number.w2n import word_to_num @@ -38,13 +39,11 @@ def is_of_this_type(self, text: str | int) -> bool: int_value: int = 10_000_000 if isinstance(text, str): - try: + with suppress(ValueError): int_value = int(text, base=10) if text != str(int_value): return False - except ValueError: - pass elif isinstance(text, int): int_value = text diff --git a/src/risk_assessment/classification/identifiers/geography.py b/src/risk_assessment/classification/identifiers/geography.py index 2193d7e..7f844a8 100644 --- a/src/risk_assessment/classification/identifiers/geography.py +++ b/src/risk_assessment/classification/identifiers/geography.py @@ -8,6 +8,7 @@ import logging import re from collections.abc import Callable, Iterable +from contextlib import suppress from pathlib import Path from risk_assessment.classification.identifiers import DictionaryIdentifier, Identifier @@ -512,13 +513,11 @@ def is_of_this_type(self, text: str) -> bool: """ text = text.strip() if len(text) == 5: - try: + with suppress(ValueError): int_code = int(text, base=10) for _, (m, M) in self.valid_codes.items(): if m <= int_code <= M: return True - except ValueError: - pass return False diff --git a/src/risk_assessment/classification/identifiers/network.py b/src/risk_assessment/classification/identifiers/network.py index f92e0b3..c004cdf 100644 --- a/src/risk_assessment/classification/identifiers/network.py +++ b/src/risk_assessment/classification/identifiers/network.py @@ -4,6 +4,7 @@ IP addresses (both versions), and URIs/URLs. """ +from contextlib import suppress from ipaddress import AddressValueError, IPv4Address, IPv6Address from logging import getLogger from pathlib import Path @@ -63,11 +64,9 @@ def _valid_ipv6_hostname(text: str) -> bool: Returns: True if text is a valid IPv6 address, False otherwise. """ - try: - if IPv6Address(text) is not None: + with suppress(AddressValueError): + if IPv6Address(text): return True - except AddressValueError: - pass return False @@ -106,11 +105,9 @@ def is_of_this_type(self, text: str) -> bool: Returns: True if text is a valid IPv4 address, False otherwise. """ - try: + with suppress(AddressValueError): if IPv4Address(text) is not None: return True - except AddressValueError: - pass return False @@ -136,13 +133,11 @@ def is_of_this_type(self, text: str, allow_double_colon: bool = True) -> bool: Returns: True if text is a valid IPv6 address, False otherwise. """ - try: + with suppress(AddressValueError): if IPv6Address(text) is not None: if text == "::" and not allow_double_colon: return False return True - except AddressValueError: - pass return False @@ -211,7 +206,8 @@ def is_of_this_type(self, text: str) -> bool: """ if len(text.strip()) != len(text): return False - try: + + with suppress(Exception): result = urlparse(text) if result is not None: @@ -223,7 +219,4 @@ def is_of_this_type(self, text: str) -> bool: if text.startswith("www.") or text.startswith("mail."): return self.is_of_this_type(f"http://{text}") - except Exception: - return False - return False From b4dca85cf8bdd150c5b5987f2ca246d3d869d642 Mon Sep 17 00:00:00 2001 From: Stefano Braghin <527806+stefano81@users.noreply.github.com> Date: Sat, 16 May 2026 20:29:44 +0100 Subject: [PATCH 3/9] fix: address "Unused import" from CodeQL Signed-off-by: Stefano Braghin <527806+stefano81@users.noreply.github.com> --- tests/classification/identifiers/test_credit_card.py | 3 --- tests/classification/identifiers/test_us_address.py | 4 ---- tests/classification/unstructured/test_utility.py | 2 -- 3 files changed, 9 deletions(-) diff --git a/tests/classification/identifiers/test_credit_card.py b/tests/classification/identifiers/test_credit_card.py index c7f0638..c00e7bb 100644 --- a/tests/classification/identifiers/test_credit_card.py +++ b/tests/classification/identifiers/test_credit_card.py @@ -1,6 +1,3 @@ -import json -from pathlib import Path - import pytest from risk_assessment.classification.identifiers import CreditCard diff --git a/tests/classification/identifiers/test_us_address.py b/tests/classification/identifiers/test_us_address.py index 92c405a..9332331 100644 --- a/tests/classification/identifiers/test_us_address.py +++ b/tests/classification/identifiers/test_us_address.py @@ -1,7 +1,3 @@ -import datetime - -import pytest - from risk_assessment.classification.identifiers import USPostalAddress diff --git a/tests/classification/unstructured/test_utility.py b/tests/classification/unstructured/test_utility.py index 4844a3a..ffa266d 100644 --- a/tests/classification/unstructured/test_utility.py +++ b/tests/classification/unstructured/test_utility.py @@ -1,8 +1,6 @@ import json from pathlib import Path -import pytest - from risk_assessment.classification.unstructured import Entity from risk_assessment.classification.unstructured.utility import AnnotationWriter From 139c0b8c7edd0bb36435990bc75dc6b5c5976ad8 Mon Sep 17 00:00:00 2001 From: Stefano Braghin <527806+stefano81@users.noreply.github.com> Date: Sat, 16 May 2026 20:34:28 +0100 Subject: [PATCH 4/9] fix: address "Unmatchable caret in regular expression" from CodeQL Signed-off-by: Stefano Braghin <527806+stefano81@users.noreply.github.com> --- .../classification/identifiers/national_identifier.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/risk_assessment/classification/identifiers/national_identifier.py b/src/risk_assessment/classification/identifiers/national_identifier.py index 7184090..7596c1c 100644 --- a/src/risk_assessment/classification/identifiers/national_identifier.py +++ b/src/risk_assessment/classification/identifiers/national_identifier.py @@ -81,7 +81,7 @@ class NIRFrance(Identifier): """ pattern = re.compile( - r"^([7182]\d{14})$|^([7182])( )(\d{2})( )(\d{2})( )(\d{2})( )(\d{3})( )(\d{3})( )(\d{2})$|^([7128]\d{12} \d{2})$|^([7182]\d{4}2[AB]\d{8})$|^([7182])( )(\d{2})( )(\d{2})( )(2[AB])( )(\d{3})( )(\d{3})( )(\d{2})$" + r"^(([7182]\d{14})|([7182])( )(\d{2})( )(\d{2})( )(\d{2})( )(\d{3})( )(\d{3})( )(\d{2})|([7128]\d{12} \d{2})|([7182]\d{4}2[AB]\d{8})|([7182])( )(\d{2})( )(\d{2})( )(2[AB])( )(\d{3})( )(\d{3})( )(\d{2}))$" ) def is_of_this_type(self, text: str) -> bool: @@ -117,7 +117,7 @@ class TINGermany(Identifier): """ pattern = re.compile( - r"^([1-9]\d{11})$|^([1-9]\d \d{3} \d{3} \d{3})$|^([1-9][0-9])([,])(d{3})([,])(\d{3})([,])(\d{3})$|^([1-9][0-9])([.])(d{3})([.])(\d{3})([.])(\d{3})$|^([1-9][0-9])([\/])(d{3})([\/])(\d{3})([\/])(\d{3})$|^([1-9]\d{10})$" + r"^(([1-9]\d{11})|([1-9]\d \d{3} \d{3} \d{3})|([1-9][0-9])([,])(d{3})([,])(\d{3})([,])(\d{3})|([1-9][0-9])([.])(d{3})([.])(\d{3})([.])(\d{3})|([1-9][0-9])([\/])(d{3})([\/])(\d{3})([\/])(\d{3})|([1-9]\d{10}))$" ) # noqa def check_last_digit(self, first_ten_digits: list[str], check_digit: str) -> bool: @@ -182,7 +182,7 @@ def is_of_this_type(self, text: str) -> bool: class AadhaarNumber(Identifier): - pattern = re.compile(r"^([2-9]\d{3} \d{4} \d{4})$|^([2-9]\d{11})$") + pattern = re.compile(r"^(([2-9]\d{3} \d{4} \d{4})|([2-9]\d{11}))$") def is_of_this_type(self, text: str) -> bool: # https://en.wikipedia.org/wiki/Aadhaar @@ -312,7 +312,7 @@ def validate_checksum(self, text: str) -> bool: class MyNumberJapan(Identifier): pattern = re.compile( - r"^(\d{4}.\d{4}.\d{4})$|^(\d{4},\d{4},\d{4})$|^(\d{4}-\d{4}-\d{4})|(\d{4} \d{4} \d{4})$|^(\d{12})$" + r"^((\d{4}.\d{4}.\d{4})|(\d{4},\d{4},\d{4})|(\d{4}-\d{4}-\d{4})|(\d{4} \d{4} \d{4})|(\d{12}))$" ) def is_of_this_type(self, text: str) -> bool: From 73d61a39f27c190b09122c3fb0db031da73bdaca Mon Sep 17 00:00:00 2001 From: Stefano Braghin <527806+stefano81@users.noreply.github.com> Date: Sat, 16 May 2026 20:35:31 +0100 Subject: [PATCH 5/9] fix: address "Commented-out code" from CodeQL Signed-off-by: Stefano Braghin <527806+stefano81@users.noreply.github.com> --- .../classification/unstructured/aggregator.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/risk_assessment/classification/unstructured/aggregator.py b/src/risk_assessment/classification/unstructured/aggregator.py index 02456d2..b7599be 100644 --- a/src/risk_assessment/classification/unstructured/aggregator.py +++ b/src/risk_assessment/classification/unstructured/aggregator.py @@ -335,12 +335,6 @@ def find_split_point(self, entities: list[Entity]) -> list[int]: return sorted(split_points) def validate_entities(self, entity_list: list[Entity], text: str) -> list[Entity]: - # validated_entitites: list[Entity] = [] - # for entity in entity_list: - # if self.validate_entity(entity, text): - # validated_entitites.append(entity) - # return validated_entitites - return [entity for entity in entity_list if self.validate_entity(entity, text)] def validate_entity(self, entity: Entity, text: str) -> bool: From b0131fdfd8ee2a9fba30b24f6b5954f99cc46312 Mon Sep 17 00:00:00 2001 From: Stefano Braghin <527806+stefano81@users.noreply.github.com> Date: Sat, 16 May 2026 20:39:42 +0100 Subject: [PATCH 6/9] fix: address "Non-callable called" from CodeQL Signed-off-by: Stefano Braghin <527806+stefano81@users.noreply.github.com> --- .../classification/__init__.py | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/src/risk_assessment/classification/__init__.py b/src/risk_assessment/classification/__init__.py index 58e571e..60ebdc0 100644 --- a/src/risk_assessment/classification/__init__.py +++ b/src/risk_assessment/classification/__init__.py @@ -6,6 +6,7 @@ """ from collections import defaultdict +from contextlib import suppress from dataclasses import dataclass from typing import Any, cast @@ -45,12 +46,24 @@ def create_instance(identifier_fqn: str) -> Identifier: for comp in parts[1:]: module = getattr(module, comp) - if type(module) is type(Identifier): - m = module() - return cast(Identifier, m) + # Verify that module is a class (type) and is a subclass of Identifier + if not isinstance(module, type): + raise ValueError( + f"{identifier_fqn} is not a class. " + "Expected a subclass of `risk_assessment.classification.identifiers.Identifier`, " + f"but got {type(module).__name__}" + ) + + try: + if issubclass(module, Identifier): + return module() + except TypeError as e: + # issubclass() raises TypeError if module is not a class (shouldn't happen due to isinstance check above) + raise ValueError(f"{identifier_fqn} cannot be checked as a subclass: {e}") from e raise ValueError( - f"{identifier_fqn} does not exists or is not a subclass of `risk_assessment.classification.identifiers.Identifier`" + f"{identifier_fqn} is not a subclass of `risk_assessment.classification.identifiers.Identifier`. " + f"Found class: {module.__name__}" ) From c7000f3846886a13b392bea10e5b3b7813c4a35b Mon Sep 17 00:00:00 2001 From: Stefano Braghin <527806+stefano81@users.noreply.github.com> Date: Sat, 16 May 2026 20:41:49 +0100 Subject: [PATCH 7/9] fix: address "Redundant comparison" from CodeQL Signed-off-by: Stefano Braghin <527806+stefano81@users.noreply.github.com> --- .../identifiers/us_postal_address.py | 36 +++++++++++++------ 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/src/risk_assessment/classification/identifiers/us_postal_address.py b/src/risk_assessment/classification/identifiers/us_postal_address.py index 19b5d9d..e829c4e 100644 --- a/src/risk_assessment/classification/identifiers/us_postal_address.py +++ b/src/risk_assessment/classification/identifiers/us_postal_address.py @@ -758,24 +758,40 @@ def _quick_check_there_are_multiple_tokens(text: str) -> bool: def _check_that_case_is_consistent(text: str) -> bool: - tokens = text.split(r"[\s+|,]") + """Check if the text has consistent casing (all uppercase or all lowercase initial letters). + + Args: + text: The text to check for case consistency + + Returns: + bool: True if all alphabetic initial letters are consistently upper or lower case, + False if mixed case is detected + """ + import re + + # Split on whitespace and commas using proper regex + tokens = re.split(r"[\s,]+", text) upper_count = 0 lower_count = 0 for token in tokens: - if len(token.strip()) == 0: + token = token.strip() + if not token: continue - begin = token[0] + # Get first character + first_char = token[0] - if begin.isalpha(): - if begin.islower(): + if first_char.isalpha(): + if first_char.islower(): lower_count += 1 - elif begin.isupper(): + elif first_char.isupper(): upper_count += 1 - else: - # raise ValueError() - return False + # Note: Non-ASCII alphabetic characters that are neither upper nor lower + # are ignored rather than causing the function to return False - return not (lower_count > 0 and lower_count > 0) # either all upper or all lower + # Return True if case is consistent: either all upper OR all lower (not both) + # If no alphabetic characters found, consider it consistent (return True) + has_mixed_case = lower_count > 0 and upper_count > 0 + return not has_mixed_case From 66c22eb21d9eab235d361c524a076e12a2a32ab2 Mon Sep 17 00:00:00 2001 From: Stefano Braghin <527806+stefano81@users.noreply.github.com> Date: Sat, 16 May 2026 20:45:45 +0100 Subject: [PATCH 8/9] fix: tests were not passing "correctly", re-classified examples Signed-off-by: Stefano Braghin <527806+stefano81@users.noreply.github.com> --- tests/classification/identifiers/test_us_address.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/classification/identifiers/test_us_address.py b/tests/classification/identifiers/test_us_address.py index 9332331..88dcb07 100644 --- a/tests/classification/identifiers/test_us_address.py +++ b/tests/classification/identifiers/test_us_address.py @@ -55,24 +55,22 @@ def test_from_rwd(): assert identifier.is_of_this_type("1160 South Main Street 322, Middletown, Connecticut, U.S.A. 06457-5044") assert identifier.is_of_this_type("12489 W 84Th DR, ARVADA, Jefferson County, CO") - assert identifier.is_of_this_type("12489 W 84Th DR, ARVADA, Jefferson County, CO") - assert identifier.is_of_this_type("13992 E 107Th AVE, COMMERCE CITY, Adams County, CO") - assert identifier.is_of_this_type("13992 E 107Th AVE, COMMERCE CITY, Adams County, CO") assert identifier.is_of_this_type("13992 E 107Th AVE, COMMERCE CITY, Adams County, CO") assert identifier.is_of_this_type("16101 Road J, CORTEZ, Montezuma County, CO") assert identifier.is_of_this_type("16122 W 70Th AVE, ARVADA, Jefferson County, CO") + assert identifier.is_of_this_type("1824 Alto Ln, Lutz, FL 33558") assert identifier.is_of_this_type("1824 Alto Ln, Lutz") - assert identifier.is_of_this_type("1824 Alto Ln, Lutz, FL 33558 Republican Party of Florida") assert identifier.is_of_this_type("415 W. Route 66, 201") assert identifier.is_of_this_type("52 Canyon Cove LN, DRAKE, Larimer County, CO") assert identifier.is_of_this_type("646 Riverview Trace Ct, Fort Myers, Florida 33916") - assert identifier.is_of_this_type("Lutz, FL 33558 Republican Party of Florida") assert not identifier.is_of_this_type("1160 South Main Street 322, Middletown, Connecticut, U.S.A. 06457-5044.") assert not identifier.is_of_this_type("1824 Alto Ln, Lutz ") assert not identifier.is_of_this_type("1824 Alto Ln, Lutz, FL 33558 Republican Party of Florida.") + assert not identifier.is_of_this_type("1824 Alto Ln, Lutz, FL 33558 Republican Party of Florida") assert not identifier.is_of_this_type("Camera LensesSimply put, the better you understand different") assert not identifier.is_of_this_type("Dec 2018 00:00:00 -0000Template - Content graphicsAlex") assert not identifier.is_of_this_type("Lutz, FL 33558 Republican Party of Florida.") + assert not identifier.is_of_this_type("Lutz, FL 33558 Republican Party of Florida") def test_invalid(): From 6466d3460c83df026f4078e9b03fe556e5161e66 Mon Sep 17 00:00:00 2001 From: Stefano Braghin <527806+stefano81@users.noreply.github.com> Date: Sat, 16 May 2026 21:54:59 +0100 Subject: [PATCH 9/9] fix: fix "Unused local variable" from CodeQL, and make the Mexican CURP work properly Signed-off-by: Stefano Braghin <527806+stefano81@users.noreply.github.com> --- .secrets.baseline | 8 +-- .../identifiers/national_identifier.py | 71 +++++++++++-------- .../identifiers/test_national_id.py | 10 ++- 3 files changed, 51 insertions(+), 38 deletions(-) diff --git a/.secrets.baseline b/.secrets.baseline index d309f75..22df845 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -3,7 +3,7 @@ "files": "^.secrets.baseline$", "lines": null }, - "generated_at": "2026-05-15T10:59:03Z", + "generated_at": "2026-05-16T20:54:57Z", "plugins_used": [ { "name": "AWSKeyDetector" @@ -82,7 +82,7 @@ "hashed_secret": "f3e0d184814b86dc1c4eb623edde7610cf212567", "is_secret": false, "is_verified": false, - "line_number": 845, + "line_number": 860, "type": "Hex High Entropy String", "verified_result": null } @@ -188,7 +188,7 @@ "hashed_secret": "f3e0d184814b86dc1c4eb623edde7610cf212567", "is_secret": false, "is_verified": false, - "line_number": 163, + "line_number": 161, "type": "Hex High Entropy String", "verified_result": null }, @@ -196,7 +196,7 @@ "hashed_secret": "804ec071803318791b835cffd6e509c8d32239db", "is_secret": false, "is_verified": false, - "line_number": 165, + "line_number": 163, "type": "Hex High Entropy String", "verified_result": null } diff --git a/src/risk_assessment/classification/identifiers/national_identifier.py b/src/risk_assessment/classification/identifiers/national_identifier.py index 7596c1c..bb4c364 100644 --- a/src/risk_assessment/classification/identifiers/national_identifier.py +++ b/src/risk_assessment/classification/identifiers/national_identifier.py @@ -364,46 +364,44 @@ def is_of_this_type(self, text: str) -> bool: class MexicoCURP(Identifier): - pattern = re.compile( - r"^[A-Z][AEIOU][A-Z]{2}(\d{2})(\d{2})(\d{2})[HMX]([A-Z]{2})[BCDFGHJKLMNPQRSTVWXYZ]{3}([A-Z0-9])(\d)" - ) + pattern = re.compile(r"^[A-Z][AEIOU][A-Z]{2}(\d{2})(\d{2})(\d{2})[HMX]([A-Z]{2})[A-Z]{3}([A-Z0-9])(\d)") # http://www.statoids.com/umx.html states: set[str] = { - "AG", - "BN", + "AS", + "BC", "BS", - "CA", + "CC", + "CS", "CH", + "DF", "CL", "CM", - "COCP", - "DF", - "DU", - "GJ", + "DG", + "GT", "GR", - "HI", - "JA", + "HG", + "JC", "MC", - "MR", - "MX", - "NA", + "MN", + "MS", + "NT", "NL", - "OA", - "PU", - "QE", + "OC", + "PL", + "QO", "QR", - "SI", + "SP", "SL", - "SO", - "TB", + "SR", + "TC", + "TS", "TL", - "TM", - "VE", "VZ", - "YU", - "ZA", + "YN", + "ZS", "NE", # code for people born abroad } + CURP_CHARACTERS = "0123456789ABCDEFGHIJKLMNÑOPQRSTUVWXYZ" def is_of_this_type(self, text: str) -> bool: match = MexicoCURP.pattern.match(text) @@ -414,19 +412,36 @@ def is_of_this_type(self, text: str) -> bool: day = int(match.group(3), base=10) state = match.group(4) century_flag = match.group(5) - parity = match.group(6) # noqa + # parity = match.group(6) # noqa if all(c.isdigit() for c in century_flag): year = int(f"20{year_2d}") else: year = int(f"19{year_2d}") - if state in self.states and _valid_birth_date(day, month, year): - return True + if state in self.states: + if _valid_birth_date(day, month, year): + if _valid_curp_parity(text): + return True return False +def _valid_curp_parity(text: str) -> bool: + """Validate the parity check digit of a Mexican CURP. + + Args: + text: The CURP string to validate. + + Returns: + True if the parity check digit is valid, False otherwise. + """ + start = 18 + return text[-1] == str( + -sum((start - i) * MexicoCURP.CURP_CHARACTERS.index(n) for i, n in enumerate(text[:-1])) % 10 + ) + + class CanadaSIN(LuhnIdentifier): _pattern = re.compile(r"^\d{3}[\- ]?\d{3}[\- ]?\d{3}$") diff --git a/tests/classification/identifiers/test_national_id.py b/tests/classification/identifiers/test_national_id.py index 5a2a313..e870f14 100644 --- a/tests/classification/identifiers/test_national_id.py +++ b/tests/classification/identifiers/test_national_id.py @@ -1,4 +1,5 @@ import pytest +from faker import Faker from risk_assessment.classification.identifiers import IsraelID from risk_assessment.classification.identifiers.national_identifier import ( @@ -39,14 +40,11 @@ def test_israel_national_id(faker): assert identifier.is_of_this_type(example), example -def test_mexican_curb(): - examples = [ - "HEGG560427MVZRRL04", - ] - +def test_mexican_curp(): identifier = MexicoCURP() - for example in examples: + faker = Faker("es_MX") + for example in [faker.curp() for _ in range(100)]: assert identifier.is_of_this_type(example), example