Skip to content

Commit 2f33c18

Browse files
authored
feat: Add Nigeria recognizers (National Identity Number and Vehicle Registration) (#1863)
1 parent 5404c84 commit 2f33c18

File tree

9 files changed

+483
-0
lines changed

9 files changed

+483
-0
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ All notable changes to this project will be documented in this file.
55
## [unreleased]
66
### Analyzer
77
#### Added
8+
- Nigerian National Identification Number (NG_NIN) recognizer with Verhoeff checksum validation
9+
- Nigerian Vehicle Registration (NG_VEHICLE_REGISTRATION) recognizer for current format plates (2011+)
810
- US_NPI recognizer for National Provider Identifier with Luhn checksum validation and context support (#1847) (Thanks @stevenelliottjr)
911
- UK Postcode (UK_POSTCODE) recognizer with pattern matching and context support
1012

docs/supported_entities.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,12 @@ For more information, refer to the [adding new recognizers documentation](analyz
110110
| KR_RRN | The Korean Resident Registration Number (RRN) is a 13-digit number issued to all Korean residents. | Pattern match, context and custom logic. |
111111

112112

113+
### Nigeria
114+
| FieldType | Description | Detection Method |
115+
|------------|---------------------------------------------------------------------------------------------------------|------------------------------------------|
116+
| NG_NIN | The Nigerian National Identification Number (NIN) is a unique 11-digit number issued by the National Identity Management Commission (NIMC). | Pattern match, context, and checksum |
117+
| NG_VEHICLE_REGISTRATION | Nigerian vehicle registration plate number in the current format (2011+): 3 letters (LGA code), 3 digits (serial), 2 letters (year/batch). | Pattern match and context |
118+
113119
### Thai
114120
| FieldType | Description | Detection Method |
115121
|------------|---------------------------------------------------------------------------------------------------------|------------------------------------------|

presidio-analyzer/presidio_analyzer/conf/default_recognizers.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,18 @@ recognizers:
108108
type: predefined
109109
enabled: false
110110

111+
- name: NgNinRecognizer
112+
supported_languages:
113+
- en
114+
type: predefined
115+
enabled: false
116+
117+
- name: NgVehicleRegistrationRecognizer
118+
supported_languages:
119+
- en
120+
type: predefined
121+
enabled: false
122+
111123
- name: InPanRecognizer
112124
supported_languages:
113125
- en

presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,12 @@
4343
from .country_specific.korea.kr_passport_recognizer import KrPassportRecognizer
4444
from .country_specific.korea.kr_rrn_recognizer import KrRrnRecognizer
4545

46+
# Nigeria recognizers
47+
from .country_specific.nigeria.ng_nin_recognizer import NgNinRecognizer
48+
from .country_specific.nigeria.ng_vehicle_registration_recognizer import (
49+
NgVehicleRegistrationRecognizer,
50+
)
51+
4652
# Poland recognizers
4753
from .country_specific.poland.pl_pesel_recognizer import PlPeselRecognizer
4854

@@ -180,5 +186,7 @@
180186
"AzureOpenAILangExtractRecognizer",
181187
"BasicLangExtractRecognizer",
182188
"KrPassportRecognizer",
189+
"NgNinRecognizer",
190+
"NgVehicleRegistrationRecognizer",
183191
"MedicalNERRecognizer",
184192
]
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
"""Nigeria-specific recognizers."""
2+
3+
from .ng_nin_recognizer import NgNinRecognizer
4+
from .ng_vehicle_registration_recognizer import NgVehicleRegistrationRecognizer
5+
6+
__all__ = [
7+
"NgNinRecognizer",
8+
"NgVehicleRegistrationRecognizer",
9+
]
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
from typing import List, Optional
2+
3+
from presidio_analyzer import Pattern, PatternRecognizer
4+
5+
6+
class NgNinRecognizer(PatternRecognizer):
    """
    Recognizes Nigerian National Identification Number (NIN).

    The NIN is an 11-digit number issued by the National Identity Management
    Commission (NIMC). The last digit is a Verhoeff checksum.

    Candidates are found with a deliberately low-scoring 11-digit pattern and
    are then confirmed by :meth:`validate_result`. The checksum is computed on
    the matched text itself (not on an integer conversion of it), so numbers
    that start with ``0`` keep all 11 digit positions.

    Reference: https://nimc.gov.ng/

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    :param name: Optional recognizer name, forwarded to ``PatternRecognizer``
    """

    PATTERNS = [
        Pattern(
            "NIN (Very Weak)",
            r"\b\d{11}\b",
            0.01,
        ),
    ]

    CONTEXT = [
        "nin",
        "national identification number",
        "national identity number",
        "nimc",
        "national identity",
        "nigeria id",
        "nigerian identification",
    ]

    # Verhoeff multiplication table (dihedral group D5), indexed [c][digit].
    _VERHOEFF_D = (
        (0, 1, 2, 3, 4, 5, 6, 7, 8, 9),
        (1, 2, 3, 4, 0, 6, 7, 8, 9, 5),
        (2, 3, 4, 0, 1, 7, 8, 9, 5, 6),
        (3, 4, 0, 1, 2, 8, 9, 5, 6, 7),
        (4, 0, 1, 2, 3, 9, 5, 6, 7, 8),
        (5, 9, 8, 7, 6, 0, 4, 3, 2, 1),
        (6, 5, 9, 8, 7, 1, 0, 4, 3, 2),
        (7, 6, 5, 9, 8, 2, 1, 0, 4, 3),
        (8, 7, 6, 5, 9, 3, 2, 1, 0, 4),
        (9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
    )

    # Verhoeff permutation table, indexed [position % 8][digit].
    _VERHOEFF_P = (
        (0, 1, 2, 3, 4, 5, 6, 7, 8, 9),
        (1, 5, 7, 6, 2, 8, 3, 0, 9, 4),
        (5, 8, 0, 3, 7, 9, 6, 1, 4, 2),
        (8, 9, 1, 6, 0, 4, 3, 5, 2, 7),
        (9, 4, 5, 3, 1, 2, 6, 8, 7, 0),
        (4, 2, 8, 6, 5, 7, 3, 9, 0, 1),
        (2, 7, 9, 3, 8, 0, 6, 4, 1, 5),
        (7, 0, 4, 6, 9, 1, 3, 2, 5, 8),
    )

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "NG_NIN",
        name: Optional[str] = None,
    ) -> None:
        # Fall back to the class defaults when nothing (or an empty list) is given.
        patterns = patterns if patterns else self.PATTERNS
        context = context if context else self.CONTEXT
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns,
            context=context,
            supported_language=supported_language,
            name=name,
        )

    def validate_result(self, pattern_text: str) -> bool:
        """
        Validate the NIN by checking length, digits, and Verhoeff checksum.

        :param pattern_text: The text matched by one of the patterns
        :return: True if the text is 11 decimal digits with a valid checksum
        """
        return self.__check_nin(pattern_text)

    def __check_nin(self, value: str) -> bool:
        # The string is validated as-is: converting to int first would strip
        # leading zeros, and Verhoeff depends on every digit's position.
        return (
            len(value) == 11
            and value.isdecimal()
            and self._is_verhoeff_digits(value)
        )

    @staticmethod
    def _is_verhoeff_digits(digits: str) -> bool:
        """
        Check whether a string of decimal digits passes the Verhoeff checksum.

        Unlike :meth:`_is_verhoeff_number`, leading zeros are preserved.

        :param digits: Digit string to validate, check digit last
        :return: True if the string is non-empty, all decimal digits, and
            passes the Verhoeff checksum
        """
        if not digits.isdecimal():
            return False

        d_table = NgNinRecognizer._VERHOEFF_D
        p_table = NgNinRecognizer._VERHOEFF_P
        c = 0
        # Walk from the check digit (position 0) towards the first digit.
        for position, char in enumerate(reversed(digits)):
            c = d_table[c][p_table[position % 8][int(char)]]
        # The inverse of 0 is 0, so a zero accumulator means a valid number.
        return c == 0

    @staticmethod
    def _is_verhoeff_number(input_number: int) -> bool:
        """
        Check if the input number is a true Verhoeff number.

        The number is checked using its decimal representation, which has no
        leading zeros; use :meth:`_is_verhoeff_digits` for fixed-width values.

        :param input_number: Number to validate
        :return: True if the number passes the Verhoeff checksum
        """
        return NgNinRecognizer._is_verhoeff_digits(str(input_number))
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
from typing import List, Optional
2+
3+
from presidio_analyzer import Pattern, PatternRecognizer
4+
5+
6+
class NgVehicleRegistrationRecognizer(PatternRecognizer):
    """
    Detect Nigerian vehicle registration plates in the current (2011+) format.

    A plate looks like ``ABC-123DE``:

    - three letters: the LGA (Local Government Area) code
    - a hyphen, which may also be a space or be left out entirely
    - three digits: the serial number (001-999)
    - two letters: year code followed by batch code

    Reference: https://en.wikipedia.org/wiki/Vehicle_registration_plates_of_Nigeria

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    :param name: Optional recognizer name, forwarded to ``PatternRecognizer``
    """

    PATTERNS = [
        Pattern("Nigeria Vehicle Registration", r"\b[A-Z]{3}[- ]?\d{3}[A-Z]{2}\b", 0.5),
    ]

    CONTEXT = [
        "plate number",
        "vehicle registration",
        "license plate",
        "number plate",
        "plate",
        "vehicle",
        "registration",
    ]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "NG_VEHICLE_REGISTRATION",
        name: Optional[str] = None,
    ) -> None:
        # A missing or empty argument selects the class-level defaults.
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns or self.PATTERNS,
            context=context or self.CONTEXT,
            supported_language=supported_language,
            name=name,
        )
Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
import pytest
2+
from presidio_analyzer.predefined_recognizers import NgNinRecognizer
3+
4+
from tests.assertions import assert_result_within_score_range
5+
6+
7+
def _generate_verhoeff_digit(num_str: str) -> str:
8+
"""Generate a Verhoeff check digit for a numeric string."""
9+
__d__ = [
10+
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
11+
[1, 2, 3, 4, 0, 6, 7, 8, 9, 5],
12+
[2, 3, 4, 0, 1, 7, 8, 9, 5, 6],
13+
[3, 4, 0, 1, 2, 8, 9, 5, 6, 7],
14+
[4, 0, 1, 2, 3, 9, 5, 6, 7, 8],
15+
[5, 9, 8, 7, 6, 0, 4, 3, 2, 1],
16+
[6, 5, 9, 8, 7, 1, 0, 4, 3, 2],
17+
[7, 6, 5, 9, 8, 2, 1, 0, 4, 3],
18+
[8, 7, 6, 5, 9, 3, 2, 1, 0, 4],
19+
[9, 8, 7, 6, 5, 4, 3, 2, 1, 0],
20+
]
21+
__p__ = [
22+
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
23+
[1, 5, 7, 6, 2, 8, 3, 0, 9, 4],
24+
[5, 8, 0, 3, 7, 9, 6, 1, 4, 2],
25+
[8, 9, 1, 6, 0, 4, 3, 5, 2, 7],
26+
[9, 4, 5, 3, 1, 2, 6, 8, 7, 0],
27+
[4, 2, 8, 6, 5, 7, 3, 9, 0, 1],
28+
[2, 7, 9, 3, 8, 0, 6, 4, 1, 5],
29+
[7, 0, 4, 6, 9, 1, 3, 2, 5, 8],
30+
]
31+
__inv__ = [0, 4, 3, 2, 1, 5, 6, 7, 8, 9]
32+
33+
c = 0
34+
digits = list(map(int, reversed(num_str)))
35+
for i in range(len(digits)):
36+
c = __d__[c][__p__[(i + 1) % 8][digits[i]]]
37+
return str(__inv__[c])
38+
39+
40+
# Sample 11-digit NINs: ten payload digits followed by their Verhoeff check digit.
VALID_NIN_1, VALID_NIN_2, VALID_NIN_3 = (
    payload + _generate_verhoeff_digit(payload)
    for payload in ("1234567890", "9876543210", "5551234567")
)


@pytest.fixture(scope="module")
def recognizer():
    """Provide one NgNinRecognizer with default settings for the whole module."""
    return NgNinRecognizer()


@pytest.fixture(scope="module")
def entities():
    """Provide the entity list passed to ``recognizer.analyze``."""
    return ["NG_NIN"]
54+
55+
56+
@pytest.mark.parametrize(
    "text, expected_len, expected_positions, expected_score_ranges",
    [
        # fmt: off
        # Checksum-valid NINs: a passing validate_result lifts the score to 1.0.
        (VALID_NIN_1, 1, ((0, 11),), ((1.0, 1.0),)),
        (f"NIN: {VALID_NIN_2}", 1, ((5, 16),), ((1.0, 1.0),)),
        (
            f"My NIN is {VALID_NIN_1} and yours is {VALID_NIN_3}",
            2,
            ((10, 21), (35, 46)),
            ((1.0, 1.0), (1.0, 1.0)),
        ),
        # Eleven digits whose last digit is not the Verhoeff check digit.
        ("12345678901", 0, (), ()),
        # Too short: 10 digits.
        ("1234567890", 0, (), ()),
        # Too long: 12 digits.
        ("123456789012", 0, (), ()),
        # A valid NIN buried inside a longer digit run has no word boundary.
        (f"99{VALID_NIN_1}88", 0, (), ()),
        # Contains a non-digit character.
        ("1234567890a", 0, (), ()),
        # fmt: on
    ],
)
def test_when_nin_in_text_then_all_ng_nins_found(
    text,
    expected_len,
    expected_positions,
    expected_score_ranges,
    recognizer,
    entities,
):
    """Check the number, span and score of NG_NIN results for each sample text."""
    found = recognizer.analyze(text, entities)
    assert len(found) == expected_len

    expectations = zip(expected_positions, expected_score_ranges)
    for result, ((start, end), (low, high)) in zip(found, expectations):
        assert_result_within_score_range(
            result, entities[0], start, end, low, high
        )
134+
135+
136+
class TestVerhoeffChecksum:
    """Exercise ``NgNinRecognizer._is_verhoeff_number`` directly."""

    def test_when_valid_verhoeff_then_returns_true(self):
        assert NgNinRecognizer._is_verhoeff_number(int(VALID_NIN_1)) is True

    def test_when_invalid_verhoeff_then_returns_false(self):
        # Bump the check digit by one (mod 10) so it no longer matches.
        payload, check_digit = VALID_NIN_1[:-1], int(VALID_NIN_1[-1])
        tampered = payload + str((check_digit + 1) % 10)
        assert NgNinRecognizer._is_verhoeff_number(int(tampered)) is False

    def test_when_all_zeros_then_returns_true(self):
        # The integer 0 is checked via its decimal form, the single digit "0",
        # which passes trivially. NOTE(review): this does not exercise the
        # 11-character string "00000000000".
        assert NgNinRecognizer._is_verhoeff_number(0) is True

0 commit comments

Comments
 (0)