Skip to content

Commit 48dd576

Browse files
authored
Merge pull request #94 from maxmind/horgh/normalize
Replace fewer TLDs when normalizing
2 parents 308d8fe + 8728b00 commit 48dd576

File tree

3 files changed

+63
-10
lines changed

3 files changed

+63
-10
lines changed

CHANGELOG.md

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,8 @@
1717
* Duplicate `.com`s are now removed from email domain names when
1818
`hash_address` is used. For example, `example.com.com` will become
1919
`example.com`.
20-
* Extraneous characters after `.com` are now removed from email domain
21-
names when `hash_address` is used. For example, `example.comfoo` will
22-
become `example.com`.
23-
* Certain `.com` typos are now normalized to `.com` when `hash_address` is
24-
used. For example, `example.cam` will become `example.com`.
20+
* Certain TLD typos are now normalized when `hash_address` is used. For
21+
example, `example.comcom` will become `example.com`.
2522
* Additional `gmail.com` domain names with leading digits are now
2623
normalized when `hash_address` is used. For example, `100gmail.com` will
2724
become `gmail.com`.

lib/minfraud/components/email.rb

Lines changed: 57 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,55 @@ def clean_email_address(address)
131131
}.freeze
132132
private_constant :TYPO_DOMAINS
133133

134+
TYPO_TLDS = {
135+
'comm' => 'com',
136+
'commm' => 'com',
137+
'commmm' => 'com',
138+
'comn' => 'com',
139+
140+
'cbm' => 'com',
141+
'ccm' => 'com',
142+
'cdm' => 'com',
143+
'cem' => 'com',
144+
'cfm' => 'com',
145+
'cgm' => 'com',
146+
'chm' => 'com',
147+
'cim' => 'com',
148+
'cjm' => 'com',
149+
'ckm' => 'com',
150+
'clm' => 'com',
151+
'cmm' => 'com',
152+
'cnm' => 'com',
153+
'cpm' => 'com',
154+
'cqm' => 'com',
155+
'crm' => 'com',
156+
'csm' => 'com',
157+
'ctm' => 'com',
158+
'cum' => 'com',
159+
'cvm' => 'com',
160+
'cwm' => 'com',
161+
'cxm' => 'com',
162+
'cym' => 'com',
163+
'czm' => 'com',
164+
165+
'col' => 'com',
166+
'con' => 'com',
167+
168+
'dom' => 'com',
169+
'don' => 'com',
170+
'som' => 'com',
171+
'son' => 'com',
172+
'vom' => 'com',
173+
'von' => 'com',
174+
'xom' => 'com',
175+
'xon' => 'com',
176+
177+
'clam' => 'com',
178+
'colm' => 'com',
179+
'comcom' => 'com',
180+
}.freeze
181+
private_constant :TYPO_TLDS
182+
134183
EQUIVALENT_DOMAINS = {
135184
'googlemail.com' => 'gmail.com',
136185
'pm.me' => 'protonmail.com',
@@ -330,10 +379,16 @@ def clean_domain(domain)
330379
domain = SimpleIDN.to_ascii(domain)
331380

332381
domain.sub!(/(?:\.com){2,}$/, '.com')
333-
domain.sub!(/\.com[^.]+$/, '.com')
334-
domain.sub!(/(?:\.(?:com|c[a-z]{1,2}m|co[ln]|[dsvx]o[mn]|))$/, '.com')
335382
domain.sub!(/^\d+(?:gmail?\.com)$/, 'gmail.com')
336383

384+
idx = domain.rindex('.')
385+
if !idx.nil?
386+
tld = domain[idx + 1..]
387+
if TYPO_TLDS.key?(tld)
388+
domain = "#{domain[0, idx]}.#{TYPO_TLDS[tld]}"
389+
end
390+
end
391+
337392
if TYPO_DOMAINS.key?(domain)
338393
domain = TYPO_DOMAINS[domain]
339394
end

spec/components/email_spec.rb

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@
6363
{ input: ' [email protected]', output: '[email protected]' },
6464
{
6565
input: '[email protected]|abc124472372',
66-
output: '[email protected]',
66+
output: '[email protected]|abc124472372',
6767
},
6868
{ input: '[email protected]', output: '[email protected]' },
6969
{ input: '[email protected]', output: '[email protected]' },
@@ -76,9 +76,10 @@
7676
{ input: '[email protected]', output: '[email protected]' },
7777
{ input: '[email protected]', output: '[email protected]' },
7878
{ input: '[email protected]', output: '[email protected]' },
79-
{ input: 'foo@example.comfoo', output: 'foo@example.com' },
80-
{ input: 'foo@example.cam', output: 'foo@example.com' },
79+
{ input: 'foo@example.comfoo', output: 'foo@example.comfoo' },
80+
{ input: 'foo@example.cam', output: 'foo@example.cam' },
8181
{ input: '[email protected]', output: '[email protected]' },
82+
{ input: '[email protected]', output: '[email protected]' },
8283
]
8384

8485
tests.each do |i|

0 commit comments

Comments
 (0)