Skip to content

Commit 308d8fe

Browse files
authored
Merge pull request #93 from maxmind/horgh/email-normalize
Add additional email normalization
2 parents e9d72f1 + 86cc6bf commit 308d8fe

File tree

3 files changed

+259
-6
lines changed

3 files changed

+259
-6
lines changed

CHANGELOG.md

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,33 @@
11
# Changelog
22

3+
## v2.5.0
4+
5+
* Equivalent domain names are now normalized when `hash_address` is used.
6+
For example, `googlemail.com` will become `gmail.com`.
7+
* Periods are now removed from `gmail.com` email address local parts when
8+
`hash_address` is used. For example, `f.o.o@gmail.com` will become
9+
`foo@gmail.com`.
10+
* Fastmail alias subdomain email addresses are now normalized when
11+
`hash_address` is used. For example, `alias@user.fastmail.com` will
12+
become `user@fastmail.com`.
13+
* Additional `yahoo.com` email addresses now have aliases removed from
14+
their local part when `hash_address` is used. For example,
15+
`foo-bar@yahoo.com` will become `foo@yahoo.com` for additional
16+
`yahoo.com` domains.
17+
* Duplicate `.com`s are now removed from email domain names when
18+
`hash_address` is used. For example, `example.com.com` will become
19+
`example.com`.
20+
* Extraneous characters after `.com` are now removed from email domain
21+
names when `hash_address` is used. For example, `example.comfoo` will
22+
become `example.com`.
23+
* Certain `.com` typos are now normalized to `.com` when `hash_address` is
24+
used. For example, `example.cam` will become `example.com`.
25+
* Additional `gmail.com` domain names with leading digits are now
26+
normalized when `hash_address` is used. For example, `100gmail.com` will
27+
become `gmail.com`.
28+
* Additional `gmail.com` typos are now normalized when `hash_address` is
29+
used. For example, `gmali.com` will become `gmail.com`.
30+
331
## v2.4.0 (2024-01-12)
432

533
* Ruby 2.7+ is now required. If you're using Ruby 2.5 or 2.6, please use

lib/minfraud/components/email.rb

Lines changed: 221 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -90,29 +90,237 @@ def clean_email_address(address)
9090

9191
domain = clean_domain(domain)
9292

93-
if domain == 'yahoo.com'
93+
if YAHOO_DOMAINS.key?(domain)
9494
local_part.sub!(/\A([^-]+)-.*\z/, '\1')
9595
else
9696
local_part.sub!(/\A([^+]+)\+.*\z/, '\1')
9797
end
9898

99+
if domain == 'gmail.com'
100+
local_part.gsub!('.', '')
101+
end
102+
103+
domain_parts = domain.split('.')
104+
if domain_parts.length > 2
105+
possible_domain = domain_parts[1..].join('.')
106+
if FASTMAIL_DOMAINS.key?(possible_domain)
107+
domain = possible_domain
108+
if local_part != ''
109+
local_part = domain_parts[0]
110+
end
111+
end
112+
end
113+
99114
"#{local_part}@#{domain}"
100115
end
101116

102117
TYPO_DOMAINS = {
103118
# gmail.com
104-
'35gmai.com' => 'gmail.com',
105-
'636gmail.com' => 'gmail.com',
119+
'gmai.com' => 'gmail.com',
106120
'gamil.com' => 'gmail.com',
107-
'gmail.comu' => 'gmail.com',
121+
'gmali.com' => 'gmail.com',
108122
'gmial.com' => 'gmail.com',
109123
'gmil.com' => 'gmail.com',
124+
'gmaill.com' => 'gmail.com',
125+
'gmailm.com' => 'gmail.com',
126+
'gmailo.com' => 'gmail.com',
127+
'gmailyhoo.com' => 'gmail.com',
110128
'yahoogmail.com' => 'gmail.com',
111129
# outlook.com
112130
'putlook.com' => 'outlook.com',
113131
}.freeze
114132
private_constant :TYPO_DOMAINS
115133

134+
EQUIVALENT_DOMAINS = {
135+
'googlemail.com' => 'gmail.com',
136+
'pm.me' => 'protonmail.com',
137+
'proton.me' => 'protonmail.com',
138+
'yandex.by' => 'yandex.ru',
139+
'yandex.com' => 'yandex.ru',
140+
'yandex.kz' => 'yandex.ru',
141+
'yandex.ua' => 'yandex.ru',
142+
'ya.ru' => 'yandex.ru',
143+
}.freeze
144+
private_constant :EQUIVALENT_DOMAINS
145+
146+
FASTMAIL_DOMAINS = {
147+
'123mail.org' => true,
148+
'150mail.com' => true,
149+
'150ml.com' => true,
150+
'16mail.com' => true,
151+
'2-mail.com' => true,
152+
'4email.net' => true,
153+
'50mail.com' => true,
154+
'airpost.net' => true,
155+
'allmail.net' => true,
156+
'bestmail.us' => true,
157+
'cluemail.com' => true,
158+
'elitemail.org' => true,
159+
'emailcorner.net' => true,
160+
'emailengine.net' => true,
161+
'emailengine.org' => true,
162+
'emailgroups.net' => true,
163+
'emailplus.org' => true,
164+
'emailuser.net' => true,
165+
'eml.cc' => true,
166+
'f-m.fm' => true,
167+
'fast-email.com' => true,
168+
'fast-mail.org' => true,
169+
'fastem.com' => true,
170+
'fastemail.us' => true,
171+
'fastemailer.com' => true,
172+
'fastest.cc' => true,
173+
'fastimap.com' => true,
174+
'fastmail.cn' => true,
175+
'fastmail.co.uk' => true,
176+
'fastmail.com' => true,
177+
'fastmail.com.au' => true,
178+
'fastmail.de' => true,
179+
'fastmail.es' => true,
180+
'fastmail.fm' => true,
181+
'fastmail.fr' => true,
182+
'fastmail.im' => true,
183+
'fastmail.in' => true,
184+
'fastmail.jp' => true,
185+
'fastmail.mx' => true,
186+
'fastmail.net' => true,
187+
'fastmail.nl' => true,
188+
'fastmail.org' => true,
189+
'fastmail.se' => true,
190+
'fastmail.to' => true,
191+
'fastmail.tw' => true,
192+
'fastmail.uk' => true,
193+
'fastmail.us' => true,
194+
'fastmailbox.net' => true,
195+
'fastmessaging.com' => true,
196+
'fea.st' => true,
197+
'fmail.co.uk' => true,
198+
'fmailbox.com' => true,
199+
'fmgirl.com' => true,
200+
'fmguy.com' => true,
201+
'ftml.net' => true,
202+
'h-mail.us' => true,
203+
'hailmail.net' => true,
204+
'imap-mail.com' => true,
205+
'imap.cc' => true,
206+
'imapmail.org' => true,
207+
'inoutbox.com' => true,
208+
'internet-e-mail.com' => true,
209+
'internet-mail.org' => true,
210+
'internetemails.net' => true,
211+
'internetmailing.net' => true,
212+
'jetemail.net' => true,
213+
'justemail.net' => true,
214+
'letterboxes.org' => true,
215+
'mail-central.com' => true,
216+
'mail-page.com' => true,
217+
'mailandftp.com' => true,
218+
'mailas.com' => true,
219+
'mailbolt.com' => true,
220+
'mailc.net' => true,
221+
'mailcan.com' => true,
222+
'mailforce.net' => true,
223+
'mailftp.com' => true,
224+
'mailhaven.com' => true,
225+
'mailingaddress.org' => true,
226+
'mailite.com' => true,
227+
'mailmight.com' => true,
228+
'mailnew.com' => true,
229+
'mailsent.net' => true,
230+
'mailservice.ms' => true,
231+
'mailup.net' => true,
232+
'mailworks.org' => true,
233+
'ml1.net' => true,
234+
'mm.st' => true,
235+
'myfastmail.com' => true,
236+
'mymacmail.com' => true,
237+
'nospammail.net' => true,
238+
'ownmail.net' => true,
239+
'petml.com' => true,
240+
'postinbox.com' => true,
241+
'postpro.net' => true,
242+
'proinbox.com' => true,
243+
'promessage.com' => true,
244+
'realemail.net' => true,
245+
'reallyfast.biz' => true,
246+
'reallyfast.info' => true,
247+
'rushpost.com' => true,
248+
'sent.as' => true,
249+
'sent.at' => true,
250+
'sent.com' => true,
251+
'speedpost.net' => true,
252+
'speedymail.org' => true,
253+
'ssl-mail.com' => true,
254+
'swift-mail.com' => true,
255+
'the-fastest.net' => true,
256+
'the-quickest.com' => true,
257+
'theinternetemail.com' => true,
258+
'veryfast.biz' => true,
259+
'veryspeedy.net' => true,
260+
'warpmail.net' => true,
261+
'xsmail.com' => true,
262+
'yepmail.net' => true,
263+
'your-mail.com' => true,
264+
}.freeze
265+
private_constant :FASTMAIL_DOMAINS
266+
267+
YAHOO_DOMAINS = {
268+
'y7mail.com' => true,
269+
'yahoo.at' => true,
270+
'yahoo.be' => true,
271+
'yahoo.bg' => true,
272+
'yahoo.ca' => true,
273+
'yahoo.cl' => true,
274+
'yahoo.co.id' => true,
275+
'yahoo.co.il' => true,
276+
'yahoo.co.in' => true,
277+
'yahoo.co.kr' => true,
278+
'yahoo.co.nz' => true,
279+
'yahoo.co.th' => true,
280+
'yahoo.co.uk' => true,
281+
'yahoo.co.za' => true,
282+
'yahoo.com' => true,
283+
'yahoo.com.ar' => true,
284+
'yahoo.com.au' => true,
285+
'yahoo.com.br' => true,
286+
'yahoo.com.co' => true,
287+
'yahoo.com.hk' => true,
288+
'yahoo.com.hr' => true,
289+
'yahoo.com.mx' => true,
290+
'yahoo.com.my' => true,
291+
'yahoo.com.pe' => true,
292+
'yahoo.com.ph' => true,
293+
'yahoo.com.sg' => true,
294+
'yahoo.com.tr' => true,
295+
'yahoo.com.tw' => true,
296+
'yahoo.com.ua' => true,
297+
'yahoo.com.ve' => true,
298+
'yahoo.com.vn' => true,
299+
'yahoo.cz' => true,
300+
'yahoo.de' => true,
301+
'yahoo.dk' => true,
302+
'yahoo.ee' => true,
303+
'yahoo.es' => true,
304+
'yahoo.fi' => true,
305+
'yahoo.fr' => true,
306+
'yahoo.gr' => true,
307+
'yahoo.hu' => true,
308+
'yahoo.ie' => true,
309+
'yahoo.in' => true,
310+
'yahoo.it' => true,
311+
'yahoo.lt' => true,
312+
'yahoo.lv' => true,
313+
'yahoo.nl' => true,
314+
'yahoo.no' => true,
315+
'yahoo.pl' => true,
316+
'yahoo.pt' => true,
317+
'yahoo.ro' => true,
318+
'yahoo.se' => true,
319+
'yahoo.sk' => true,
320+
'ymail.com' => true,
321+
}.freeze
322+
private_constant :YAHOO_DOMAINS
323+
116324
def clean_domain(domain)
117325
domain = domain.strip
118326

@@ -121,10 +329,19 @@ def clean_domain(domain)
121329

122330
domain = SimpleIDN.to_ascii(domain)
123331

332+
domain.sub!(/(?:\.com){2,}$/, '.com')
333+
domain.sub!(/\.com[^.]+$/, '.com')
334+
domain.sub!(/(?:\.(?:com|c[a-z]{1,2}m|co[ln]|[dsvx]o[mn]|))$/, '.com')
335+
domain.sub!(/^\d+(?:gmail?\.com)$/, 'gmail.com')
336+
124337
if TYPO_DOMAINS.key?(domain)
125338
domain = TYPO_DOMAINS[domain]
126339
end
127340

341+
if EQUIVALENT_DOMAINS.key?(domain)
342+
domain = EQUIVALENT_DOMAINS[domain]
343+
end
344+
128345
domain
129346
end
130347
end

spec/components/email_spec.rb

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,14 +63,22 @@
6363
{ input: ' [email protected]', output: '[email protected]' },
6464
{
6565
input: '[email protected]|abc124472372',
66-
output: '[email protected]|abc124472372',
66+
output: '[email protected]',
6767
},
6868
{ input: '[email protected]', output: '[email protected]' },
6969
{ input: '[email protected]', output: '[email protected]' },
7070
{ input: '[email protected]', output: '[email protected]' },
7171
{ input: '[email protected]', output: '[email protected]' },
72-
{ input: '[email protected]', output: 'gamil.com@gmail.com' },
72+
{ input: '[email protected]', output: 'gamilcom@gmail.com' },
7373
{ input: 'Test+alias@bücher.com', output: '[email protected]' },
74+
{ input: '[email protected]', output: '[email protected]' },
75+
{ input: '[email protected]', output: '[email protected]' },
76+
{ input: '[email protected]', output: '[email protected]' },
77+
{ input: '[email protected]', output: '[email protected]' },
78+
{ input: '[email protected]', output: '[email protected]' },
79+
{ input: '[email protected]', output: '[email protected]' },
80+
{ input: '[email protected]', output: '[email protected]' },
81+
{ input: '[email protected]', output: '[email protected]' },
7482
]
7583

7684
tests.each do |i|

0 commit comments

Comments
 (0)