Skip to content

Commit 5eb2078

Browse files
RektPunk and SharonHart authored
[Feature] add Korean Foreigner Registration Number recognizer (#1825)
* add frn recognizer * make copilot happy --------- Co-authored-by: Sharon Hart <sharonh.dev@gmail.com>
1 parent 7850688 commit 5eb2078

File tree

7 files changed

+182
-20
lines changed

7 files changed

+182
-20
lines changed

docs/supported_entities.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,10 +99,12 @@ For more information, refer to the [adding new recognizers documentation](analyz
9999
### Korea
100100
| FieldType | Description | Detection Method |
101101
|------------|---------------------------------------------------------------------------------------------------------|------------------------------------------|
102-
| KR_BRN | The Korean Business Registration Number (BRN) is a 10-digit number assigned to business entities for taxation purposes. | Pattern match, context and custom logic. |
103-
| KR_RRN | The Korean Resident Registration Number (RRN) is a 13-digit number issued to all Korean residents. | Pattern match, context and custom logic. |
104102
| KR_DRIVER_LICENSE | The Korean driver license number is a 12-digit number. | Pattern match, context and custom logic. |
103+
| KR_FRN | The Korean Foreigner Registration Number (FRN) is a 13-digit number issued to registered foreigners in Korea. | Pattern match, context and custom logic. |
105104
| KR_PASSPORT| The Korean Passport Number | Pattern match, context. |
105+
| KR_BRN | The Korean Business Registration Number (BRN) is a 10-digit number assigned to business entities for taxation purposes. | Pattern match, context and custom logic. |
106+
| KR_RRN | The Korean Resident Registration Number (RRN) is a 13-digit number issued to all Korean residents. | Pattern match, context and custom logic. |
107+
106108

107109
### Thai
108110
| FieldType | Description | Detection Method |

presidio-analyzer/presidio_analyzer/conf/default_recognizers.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,13 @@ recognizers:
175175
type: predefined
176176
enabled: false
177177

178+
- name: KrFrnRecognizer
179+
supported_languages:
180+
- ko
181+
- kr
182+
type: predefined
183+
enabled: false
184+
178185
- name: ThTninRecognizer
179186
supported_languages:
180187
- th

presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
from .country_specific.korea.kr_driver_license_recognizer import (
4040
KrDriverLicenseRecognizer,
4141
)
42+
from .country_specific.korea.kr_frn_recognizer import KrFrnRecognizer
4243
from .country_specific.korea.kr_passport_recognizer import KrPassportRecognizer
4344
from .country_specific.korea.kr_rrn_recognizer import KrRrnRecognizer
4445

@@ -162,6 +163,7 @@
162163
"KrBrnRecognizer",
163164
"KrRrnRecognizer",
164165
"KrDriverLicenseRecognizer",
166+
"KrFrnRecognizer",
165167
"ThTninRecognizer",
166168
"LangExtractRecognizer",
167169
"AzureOpenAILangExtractRecognizer",

presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/korea/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,14 @@
22

33
from .kr_brn_recognizer import KrBrnRecognizer
44
from .kr_driver_license_recognizer import KrDriverLicenseRecognizer
5+
from .kr_frn_recognizer import KrFrnRecognizer
56
from .kr_passport_recognizer import KrPassportRecognizer
67
from .kr_rrn_recognizer import KrRrnRecognizer
78

89
__all__ = [
10+
"KrDriverLicenseRecognizer",
11+
"KrPassportRecognizer",
12+
"KrFrnRecognizer",
913
"KrBrnRecognizer",
1014
"KrDriverLicenseRecognizer",
1115
"KrPassportRecognizer",
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
from typing import List, Optional, Tuple
2+
3+
from presidio_analyzer import Pattern
4+
from presidio_analyzer.predefined_recognizers.country_specific.korea.kr_rrn_recognizer import ( # noqa: E501
5+
KrRrnRecognizer,
6+
)
7+
8+
9+
class KrFrnRecognizer(KrRrnRecognizer):
    """
    Recognizer for the Korean Foreigner Registration Number (FRN).

    An FRN is a 13-digit identifier issued to foreigners registered in Korea.
    It is written as ``YYMMDD-GHIJKLX``; the default pattern also accepts the
    number without the dash.

    - ``YYMMDD`` is the date of birth.
    - ``G`` encodes gender and century of birth, using 5-8 for foreigners:
        - 5: male, born 1900-1999
        - 6: female, born 1900-1999
        - 7: male, born 2000-2099
        - 8: female, born 2000-2099
      (9 and 0 are sometimes used for births in the 1800s but are rare;
      the default pattern does not match them.)
    - For FRNs issued before October 2020, ``HIJKL`` is a serial number
      assigned by district and ``X`` is a check digit computed from the
      preceding 12 digits.
    - For FRNs issued after October 2020, ``HIJKLX`` is a random number.

    Everything except the defaults and the check-digit formula is inherited
    from ``KrRrnRecognizer``.

    Reference: https://en.wikipedia.org/wiki/Resident_registration_number

    :param patterns: Patterns to use; falls back to ``PATTERNS`` when empty
        or not given
    :param context: Context words that raise confidence; falls back to
        ``CONTEXT`` when empty or not given
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    :param replacement_pairs: List of (old, new) string pairs forwarded to the
        parent recognizer, e.g. to strip dashes before validation
    :param name: Optional recognizer name forwarded to the parent recognizer
    """

    # Birth date (valid month/day), optional dash, gender digit 5-8, then six
    # more digits; the lookarounds reject matches embedded in longer digit runs.
    PATTERNS = [
        Pattern(
            "FRN (Medium)",
            r"(?<!\d)\d{2}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])(-?)[5-8]\d{6}(?!\d)",
            0.5,
        )
    ]

    CONTEXT = [
        "외국인등록번호",
        "Korean FRN",
        "FRN",
        "Foreigner Registration Number",
        "Korean Foreigner Registration Number",
        "외국인번호",
    ]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "ko",
        supported_entity: str = "KR_FRN",
        replacement_pairs: Optional[List[Tuple[str, str]]] = None,
        name: Optional[str] = None,
    ):
        # Empty or missing arguments fall back to the class-level defaults.
        super().__init__(
            patterns=patterns or self.PATTERNS,
            context=context or self.CONTEXT,
            supported_language=supported_language,
            supported_entity=supported_entity,
            replacement_pairs=replacement_pairs,
            name=name,
        )

    def _validate_checksum(self, frn: str) -> bool:
        """
        Check the final digit of an FRN against its computed check digit.

        With A..L denoting the first 12 digits, the expected check digit is
        X = (13 - (2A+3B+4C+5D+6E+7F+8G+9H+2I+3J+4K+5L) mod 11) mod 10

        :param frn: The FRN to validate; the 13th character (index 12) is
            read as the check digit
        :return: True if the check digit matches, False otherwise
        """
        weighted_sum = super()._compute_checksum(frn)
        expected_digit = (13 - (weighted_sum % 11)) % 10
        return expected_digit == int(frn[12])

presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/korea/kr_rrn_recognizer.py

Lines changed: 17 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import List, Optional, Tuple, Union
1+
from typing import List, Optional, Tuple
22

33
from presidio_analyzer import EntityRecognizer, Pattern, PatternRecognizer
44

@@ -70,7 +70,7 @@ def __init__(
7070
name=name,
7171
)
7272

73-
def validate_result(self, pattern_text: str) -> Union[bool, None]:
73+
def validate_result(self, pattern_text: str) -> Optional[bool]:
7474
"""
7575
Validate the pattern logic e.g., by running checksum on a detected pattern.
7676
@@ -112,6 +112,20 @@ def _validate_region_code(self, region_code: int) -> bool:
112112
"""
113113
return True if 0 <= region_code <= 95 else False
114114

115+
def _compute_checksum(self, rn: str) -> int:
116+
"""
117+
Compute the weighted digit sum used for the RRN checksum calculation.
118+
119+
The sum is calculated over the first 12 digits of the RRN using
120+
the weights [2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5].
121+
122+
:param rn: The resident registration number as a string.
123+
Only the first 12 digits are used in the computation.
124+
:return: The integer sum of the products of digits and weights.
125+
"""
126+
weights = [2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5]
127+
return sum(int(rn[i]) * weights[i] for i in range(12))
128+
115129
def _validate_checksum(self, rrn: str) -> bool:
116130
"""
117131
Validate the checksum of Korean RRN.
@@ -122,21 +136,6 @@ def _validate_checksum(self, rrn: str) -> bool:
122136
:param rrn: The RRN to validate
123137
:return: True if checksum is valid, False otherwise
124138
"""
125-
126-
digit_sum = (
127-
2 * int(rrn[0])
128-
+ 3 * int(rrn[1])
129-
+ 4 * int(rrn[2])
130-
+ 5 * int(rrn[3])
131-
+ 6 * int(rrn[4])
132-
+ 7 * int(rrn[5])
133-
+ 8 * int(rrn[6])
134-
+ 9 * int(rrn[7])
135-
+ 2 * int(rrn[8])
136-
+ 3 * int(rrn[9])
137-
+ 4 * int(rrn[10])
138-
+ 5 * int(rrn[11])
139-
)
139+
digit_sum = self._compute_checksum(rrn)
140140
checksum = (11 - (digit_sum % 11)) % 10
141-
142141
return checksum == int(rrn[12])
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
import pytest
2+
3+
from tests import assert_result_within_score_range
4+
from presidio_analyzer.predefined_recognizers.country_specific.korea.kr_frn_recognizer import KrFrnRecognizer
5+
6+
@pytest.fixture(scope="module")
def recognizer():
    """Provide one default-configured KrFrnRecognizer for the whole module."""
    frn_recognizer = KrFrnRecognizer()
    return frn_recognizer
9+
10+
@pytest.fixture(scope="module")
def entities():
    """Provide the list of entity names requested from the recognizer."""
    requested_entities = ["KR_FRN"]
    return requested_entities
13+
14+
@pytest.mark.parametrize(
    "text, expected_len, expected_positions, expected_score_ranges",
    [
        # Well-formed FRNs whose check digit does not match: pattern score only
        ("911124-5678901", 1, ((0, 14),), ((0.5, 0.5),)),
        ("9111245678901", 1, ((0, 13),), ((0.5, 0.5),)),
        ("000505-7637892", 1, ((0, 14),), ((0.5, 0.5),)),
        ("0005056637892", 1, ((0, 13),), ((0.5, 0.5),)),
        ("His Korean FRN is 911124-5678901", 1, ((18, 32),), ((0.5, 0.5),)),
        # Well-formed FRNs with a matching check digit: full score
        ("911124-5678906", 1, ((0, 14),), ((1.0, 1.0),)),
        ("9111245678906", 1, ((0, 13),), ((1.0, 1.0),)),
        ("050912-6000012", 1, ((0, 14),), ((1.0, 1.0),)),
        ("0509126000012", 1, ((0, 13),), ((1.0, 1.0),)),
        ("His FRN is 9111245678906", 1, ((11, 24),), ((1.0, 1.0),)),
        # Not FRNs: invalid date, wrong separator, extra digit,
        # or a gender digit outside 5-8
        ("001332-1234567", 0, (), ()),
        ("0013321234567", 0, (), ()),
        ("960121+1021413", 0, (), ()),
        ("960111-10214131", 0, (), ()),
        ("960303-0021413", 0, (), ()),
        ("760413-1212134", 0, (), ()),
        ("000402-2214431", 0, (), ()),
        ("051102-9234110", 0, (), ()),
    ],
)
def test_when_all_frns_then_succeed(
    text,
    expected_len,
    expected_positions,
    expected_score_ranges,
    recognizer,
    entities,
    max_score,
):
    """
    Analyze each sample text and compare the findings with the expectations.

    The number of results must equal ``expected_len``; each result is then
    checked for entity type, start/end offsets and score range. An upper
    score bound given as the string "max" is replaced by ``max_score``.
    """
    results = recognizer.analyze(text, entities)
    assert len(results) == expected_len

    expectations = zip(results, expected_positions, expected_score_ranges)
    for result, position, score_range in expectations:
        start, end = position
        low_score, high_score = score_range
        if high_score == "max":
            high_score = max_score
        assert_result_within_score_range(
            result, entities[0], start, end, low_score, high_score
        )

0 commit comments

Comments
 (0)