Skip to content

Commit 1989968

Browse files
author
Gal Ben David
committed
- changed the hash set implementation from Tessil robin-map to the robin-hood-hashing library
- fix: the library was changing the original domain when it included uppercase characters
- updated public suffix list
1 parent e8229fe commit 1989968

File tree

10 files changed

+2465
-3379
lines changed

10 files changed

+2465
-3379
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ PyDomainExtractor is a library intended for parsing domain names into their part
3939
### Built With
4040

4141
* [GNU libidn2](https://www.gnu.org/software/libidn/#libidn2)
42-
* [Tessil/robin-map](https://github.com/Tessil/robin-map)
42+
* [robin-hood-hashing](https://github.com/martinus/robin-hood-hashing)
4343
* [Public Suffix List](https://publicsuffix.org/)
4444

4545

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
setuptools.setup(
77
name='PyDomainExtractor',
8-
version='0.8.4',
8+
version='0.8.5',
99
author='Gal Ben David',
1010
author_email='[email protected]',
1111
url='https://github.com/Intsights/PyDomainExtractor',

src/public_suffix_list.h

Lines changed: 28 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1157,7 +1157,7 @@ gov.gr
11571157
// gs : https://en.wikipedia.org/wiki/.gs
11581158
gs
11591159
1160-
// gt : http://www.gt/politicas_de_registro.html
1160+
// gt : https://www.gt/sitio/registration_policy.php?lang=en
11611161
gt
11621162
com.gt
11631163
edu.gt
@@ -7115,7 +7115,7 @@ org.zw
71157115
71167116
// newGTLDs
71177117
7118-
// List of new gTLDs imported from https://www.icann.org/resources/registries/gtlds/v2/gtlds.json on 2020-10-08T17:45:32Z
7118+
// List of new gTLDs imported from https://www.icann.org/resources/registries/gtlds/v2/gtlds.json on 2020-10-28T17:55:28Z
71197119
// This list is auto-generated, don't edit it manually.
71207120
// aaa : 2015-02-26 American Automobile Association, Inc.
71217121
aaa
@@ -7333,7 +7333,7 @@ author
73337333
// auto : 2014-11-13 XYZ.COM LLC
73347334
auto
73357335
7336-
// autos : 2014-01-09 DERAutos, LLC
7336+
// autos : 2014-01-09 XYZ.COM LLC
73377337
autos
73387338
73397339
// avianca : 2015-01-08 Avianca Holdings S.A.
@@ -7483,7 +7483,7 @@ bmw
74837483
// bnpparibas : 2014-05-29 BNP Paribas
74847484
bnpparibas
74857485
7486-
// boats : 2014-12-04 DERBoats, LLC
7486+
// boats : 2014-12-04 XYZ.COM LLC
74877487
boats
74887488
74897489
// boehringer : 2015-07-09 Boehringer Ingelheim International GmbH
@@ -7522,7 +7522,7 @@ bot
75227522
// boutique : 2013-11-14 Binky Moon, LLC
75237523
boutique
75247524
7525-
// box : 2015-11-12 .BOX INC.
7525+
// box : 2015-11-12 Intercap Registry Inc.
75267526
box
75277527
75287528
// bradesco : 2014-12-18 Banco Bradesco S.A.
@@ -8506,7 +8506,7 @@ homedepot
85068506
// homegoods : 2015-07-16 The TJX Companies, Inc.
85078507
homegoods
85088508
8509-
// homes : 2014-01-09 DERHomes, LLC
8509+
// homes : 2014-01-09 XYZ.COM LLC
85108510
homes
85118511
85128512
// homesense : 2015-07-16 The TJX Companies, Inc.
@@ -9082,7 +9082,7 @@ moscow
90829082
// moto : 2015-06-04 Motorola Trademark Holdings, LLC
90839083
moto
90849084
9085-
// motorcycles : 2014-01-09 DERMotorcycles, LLC
9085+
// motorcycles : 2014-01-09 XYZ.COM LLC
90869086
motorcycles
90879087
90889088
// mov : 2014-01-30 Charleston Road Registry Inc.
@@ -9247,7 +9247,7 @@ one
92479247
// ong : 2014-03-06 Public Interest Registry
92489248
ong
92499249
9250-
// onl : 2013-09-16 I-Registry Ltd.
9250+
// onl : 2013-09-16 iRegistry GmbH
92519251
onl
92529252
92539253
// online : 2015-01-15 DotOnline Inc.
@@ -9544,7 +9544,7 @@ reviews
95449544
// rexroth : 2015-06-18 Robert Bosch GMBH
95459545
rexroth
95469546
9547-
// rich : 2013-11-21 I-Registry Ltd.
9547+
// rich : 2013-11-21 iRegistry GmbH
95489548
rich
95499549
95509550
// richardli : 2015-05-14 Pacific Century Asset Management (HK) Limited
@@ -10600,7 +10600,7 @@ vermögensberatung
1060010600
// xyz : 2013-12-05 XYZ.COM LLC
1060110601
xyz
1060210602
10603-
// yachts : 2014-01-09 DERYachts, LLC
10603+
// yachts : 2014-01-09 XYZ.COM LLC
1060410604
yachts
1060510605
1060610606
// yahoo : 2015-04-02 Yahoo! Domain Services Inc.
@@ -10685,12 +10685,6 @@ barsy.ca
1068510685
// Submitted by Werner Kaltofen <[email protected]>
1068610686
kasserver.com
1068710687
10688-
// Algorithmia, Inc. : algorithmia.com
10689-
// Submitted by Eli Perelman <[email protected]>
10690-
*.algorithmia.com
10691-
!teams.algorithmia.com
10692-
!test.algorithmia.com
10693-
1069410688
// Altervista: https://www.altervista.org
1069510689
// Submitted by Carlo Cannas <[email protected]>
1069610690
altervista.org
@@ -11807,6 +11801,10 @@ ukco.me
1180711801
// submitted by Koen Van Isterdael <[email protected]>
1180811802
mydobiss.com
1180911803
11804+
// FH Muenster : https://www.fh-muenster.de
11805+
// Submitted by Robin Naundorf <[email protected]>
11806+
fh-muenster.io
11807+
1181011808
// Filegear Inc. : https://www.filegear.com
1181111809
// Submitted by Jason Zhu <[email protected]>
1181211810
filegear.me
@@ -11877,6 +11875,7 @@ usercontent.jp
1187711875
gentapps.com
1187811876
gentlentapis.com
1187911877
lab.ms
11878+
cdn-edges.net
1188011879
1188111880
// GitHub, Inc.
1188211881
// Submitted by Patrick Toomey <[email protected]>
@@ -11936,9 +11935,10 @@ pagespeedmobilizer.com
1193611935
publishproxy.com
1193711936
withgoogle.com
1193811937
withyoutube.com
11939-
cloudfunctions.net
11938+
*.gateway.dev
1194011939
cloud.goog
1194111940
translate.goog
11941+
cloudfunctions.net
1194211942
1194311943
blogspot.ae
1194411944
blogspot.al
@@ -12451,11 +12451,17 @@ eu.meteorapp.com
1245112451
co.pl
1245212452
1245312453
// Microsoft Corporation : http://microsoft.com
12454-
// Submitted by Mostafa Elzeiny <moelzein@microsoft.com>
12454+
// Submitted by Mitch Webster <miwebst@microsoft.com>
1245512455
*.azurecontainer.io
1245612456
azurewebsites.net
1245712457
azure-mobile.net
1245812458
cloudapp.net
12459+
azurestaticapps.net
12460+
centralus.azurestaticapps.net
12461+
eastasia.azurestaticapps.net
12462+
eastus2.azurestaticapps.net
12463+
westeurope.azurestaticapps.net
12464+
westus2.azurestaticapps.net
1245912465
1246012466
// minion.systems : http://minion.systems
1246112467
// Submitted by Robert Böttinger <[email protected]>
@@ -12890,6 +12896,10 @@ byen.site
1289012896
// Submitted by Kor Nielsen <[email protected]>
1289112897
pubtls.org
1289212898
12899+
// QOTO, Org.
12900+
// Submitted by Jeffrey Phillips Freeman <[email protected]>
12901+
qoto.io
12902+
1289312903
// Qualifio : https://qualifio.com/
1289412904
// Submitted by Xavier De Cock <[email protected]>
1289512905
qualifioapp.com

src/pydomainextractor.cpp

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@
1010
#include <codecvt>
1111
#include <memory>
1212
#include <idn2.h>
13-
#include <tsl/robin_set.h>
1413

14+
#include "robin_hood.h"
1515
#include "public_suffix_list.h"
1616

1717

@@ -262,12 +262,12 @@ class DomainExtractor {
262262
return true;
263263
}
264264

265-
tsl::robin_set<std::string> known_tlds;
266-
tsl::robin_set<std::string> blacklisted_tlds;
267-
tsl::robin_set<std::string> wildcard_tlds;
268-
tsl::robin_set<std::string_view> known_tlds_views;
269-
tsl::robin_set<std::string_view> blacklisted_tlds_views;
270-
tsl::robin_set<std::string_view> wildcard_tlds_views;
265+
robin_hood::unordered_set<std::string> known_tlds;
266+
robin_hood::unordered_set<std::string> blacklisted_tlds;
267+
robin_hood::unordered_set<std::string> wildcard_tlds;
268+
robin_hood::unordered_set<std::string_view> known_tlds_views;
269+
robin_hood::unordered_set<std::string_view> blacklisted_tlds_views;
270+
robin_hood::unordered_set<std::string_view> wildcard_tlds_views;
271271
};
272272

273273

@@ -326,7 +326,7 @@ static PyObject * DomainExtractor_extract(
326326
DomainExtractorObject * self,
327327
PyObject * arg
328328
) {
329-
const char * input = PyUnicode_AsUTF8(arg);
329+
std::string input(PyUnicode_AsUTF8(arg));
330330

331331
try {
332332
auto extracted_domain = self->domain_extractor->extract(input);
@@ -382,7 +382,7 @@ static PyObject * DomainExtractor_extract_from_url(
382382
DomainExtractorObject * self,
383383
PyObject * arg
384384
) {
385-
const char * input = PyUnicode_AsUTF8(arg);
385+
std::string input(PyUnicode_AsUTF8(arg));
386386
std::string_view url(input);
387387

388388
std::size_t scheme_separator_position = url.find("//");
@@ -463,7 +463,7 @@ static PyObject * DomainExtractor_is_valid_domain(
463463
DomainExtractorObject * self,
464464
PyObject * arg
465465
) {
466-
const char * input = PyUnicode_AsUTF8(arg);
466+
std::string input(PyUnicode_AsUTF8(arg));
467467

468468
auto valid_domain = self->domain_extractor->is_valid_domain(std::string(input));
469469
if (valid_domain == true) {

0 commit comments

Comments
 (0)