Skip to content

Commit 3d292e4

Browse files
author
Gal Ben David
committed
Improve performance. Update PublicSuffixList
1 parent 63c7f51 commit 3d292e4

File tree

4 files changed

+113
-53
lines changed

4 files changed

+113
-53
lines changed

LICENSE

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
MIT License
22

3-
Copyright (c) 2019 Gal Ben David
3+
Copyright (c) 2022 Gal Ben David
44

55
Permission is hereby granted, free of charge, to any person obtaining a copy
66
of this software and associated documentation files (the "Software"), to deal

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
setuptools.setup(
77
name='PyDomainExtractor',
8-
version='0.9.4',
8+
version='0.10.0',
99
author='Gal Ben David',
1010
author_email='[email protected]',
1111
url='https://github.com/Intsights/PyDomainExtractor',

src/extractor.cpp

Lines changed: 69 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -322,6 +322,7 @@ static PyMemberDef DomainExtractor_members[] = {
322322
PyObject * subdomain_key_py = PyUnicode_FromString("subdomain");
323323
PyObject * domain_key_py = PyUnicode_FromString("domain");
324324
PyObject * suffix_key_py = PyUnicode_FromString("suffix");
325+
PyObject * empty_string_py = PyUnicode_FromString("");
325326
static PyObject * DomainExtractor_extract(
326327
DomainExtractorObject * self,
327328
PyObject * arg
@@ -333,40 +334,55 @@ static PyObject * DomainExtractor_extract(
333334

334335
PyObject * dict = PyDict_New();
335336

336-
PyObject * subdomain_py = PyUnicode_DecodeUTF8(
337-
std::get<0>(extracted_domain).data(),
338-
std::get<0>(extracted_domain).size(),
339-
NULL
340-
);
341-
PyObject * domain_py = PyUnicode_DecodeUTF8(
342-
std::get<1>(extracted_domain).data(),
343-
std::get<1>(extracted_domain).size(),
344-
NULL
345-
);
346-
PyObject * suffix_py = PyUnicode_DecodeUTF8(
347-
std::get<2>(extracted_domain).data(),
348-
std::get<2>(extracted_domain).size(),
349-
NULL
350-
);
351-
337+
PyObject * subdomain_py;
338+
if (std::get<0>(extracted_domain).size() == 0) {
339+
subdomain_py = PyUnicode_FromObject(empty_string_py);
340+
} else {
341+
subdomain_py = PyUnicode_DecodeUTF8(
342+
std::get<0>(extracted_domain).data(),
343+
std::get<0>(extracted_domain).size(),
344+
NULL
345+
);
346+
}
352347
PyDict_SetItem(
353348
dict,
354349
PyUnicode_FromObject(subdomain_key_py),
355350
subdomain_py
356351
);
352+
Py_DECREF(subdomain_py);
353+
354+
PyObject * domain_py;
355+
if (std::get<1>(extracted_domain).size() == 0) {
356+
domain_py = PyUnicode_FromObject(empty_string_py);
357+
} else {
358+
domain_py = PyUnicode_DecodeUTF8(
359+
std::get<1>(extracted_domain).data(),
360+
std::get<1>(extracted_domain).size(),
361+
NULL
362+
);
363+
}
357364
PyDict_SetItem(
358365
dict,
359366
PyUnicode_FromObject(domain_key_py),
360367
domain_py
361368
);
369+
Py_DECREF(domain_py);
370+
371+
PyObject * suffix_py;
372+
if (std::get<2>(extracted_domain).size() == 0) {
373+
suffix_py = PyUnicode_FromObject(empty_string_py);
374+
} else {
375+
suffix_py = PyUnicode_DecodeUTF8(
376+
std::get<2>(extracted_domain).data(),
377+
std::get<2>(extracted_domain).size(),
378+
NULL
379+
);
380+
}
362381
PyDict_SetItem(
363382
dict,
364383
PyUnicode_FromObject(suffix_key_py),
365384
suffix_py
366385
);
367-
368-
Py_DECREF(subdomain_py);
369-
Py_DECREF(domain_py);
370386
Py_DECREF(suffix_py);
371387

372388
return dict;
@@ -413,44 +429,58 @@ static PyObject * DomainExtractor_extract_from_url(
413429

414430
PyObject * dict = PyDict_New();
415431

416-
PyObject * subdomain_py = PyUnicode_DecodeUTF8(
417-
std::get<0>(extracted_domain).data(),
418-
std::get<0>(extracted_domain).size(),
419-
NULL
420-
);
421-
PyObject * domain_py = PyUnicode_DecodeUTF8(
422-
std::get<1>(extracted_domain).data(),
423-
std::get<1>(extracted_domain).size(),
424-
NULL
425-
);
426-
PyObject * suffix_py = PyUnicode_DecodeUTF8(
427-
std::get<2>(extracted_domain).data(),
428-
std::get<2>(extracted_domain).size(),
429-
NULL
430-
);
431-
432+
PyObject * subdomain_py;
433+
if (std::get<0>(extracted_domain).size() == 0) {
434+
subdomain_py = PyUnicode_FromObject(empty_string_py);
435+
} else {
436+
subdomain_py = PyUnicode_DecodeUTF8(
437+
std::get<0>(extracted_domain).data(),
438+
std::get<0>(extracted_domain).size(),
439+
NULL
440+
);
441+
}
432442
PyDict_SetItem(
433443
dict,
434444
PyUnicode_FromObject(subdomain_key_py),
435445
subdomain_py
436446
);
447+
Py_DECREF(subdomain_py);
448+
449+
PyObject * domain_py;
450+
if (std::get<1>(extracted_domain).size() == 0) {
451+
domain_py = PyUnicode_FromObject(empty_string_py);
452+
} else {
453+
domain_py = PyUnicode_DecodeUTF8(
454+
std::get<1>(extracted_domain).data(),
455+
std::get<1>(extracted_domain).size(),
456+
NULL
457+
);
458+
}
437459
PyDict_SetItem(
438460
dict,
439461
PyUnicode_FromObject(domain_key_py),
440462
domain_py
441463
);
464+
Py_DECREF(domain_py);
465+
466+
PyObject * suffix_py;
467+
if (std::get<2>(extracted_domain).size() == 0) {
468+
suffix_py = PyUnicode_FromObject(empty_string_py);
469+
} else {
470+
suffix_py = PyUnicode_DecodeUTF8(
471+
std::get<2>(extracted_domain).data(),
472+
std::get<2>(extracted_domain).size(),
473+
NULL
474+
);
475+
}
442476
PyDict_SetItem(
443477
dict,
444478
PyUnicode_FromObject(suffix_key_py),
445479
suffix_py
446480
);
447-
448-
Py_DECREF(subdomain_py);
449-
Py_DECREF(domain_py);
450481
Py_DECREF(suffix_py);
451482

452483
return dict;
453-
454484
} catch (const std::runtime_error &exception) {
455485
PyErr_SetString(PyExc_ValueError, exception.what());
456486

src/public_suffix_list.h

Lines changed: 42 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -870,17 +870,17 @@ gov.cx
870870
871871
// cy : http://www.nic.cy/
872872
// Submitted by registry Panayiotou Fotia <[email protected]>
873+
// namespace policies URL https://www.nic.cy/portal//sites/default/files/symfonia_gia_eggrafi.pdf
873874
cy
874875
ac.cy
875876
biz.cy
876877
com.cy
877878
ekloges.cy
878879
gov.cy
879880
ltd.cy
880-
name.cy
881+
mil.cy
881882
net.cy
882883
org.cy
883-
parliament.cy
884884
press.cy
885885
pro.cy
886886
tm.cy
@@ -7136,7 +7136,7 @@ org.zw
71367136
71377137
// newGTLDs
71387138
7139-
// List of new gTLDs imported from https://www.icann.org/resources/registries/gtlds/v2/gtlds.json on 2021-12-30T15:13:57Z
7139+
// List of new gTLDs imported from https://www.icann.org/resources/registries/gtlds/v2/gtlds.json on 2022-02-18T15:13:38Z
71407140
// This list is auto-generated, don't edit it manually.
71417141
// aaa : 2015-02-26 American Automobile Association, Inc.
71427142
aaa
@@ -7561,9 +7561,6 @@ brother
75617561
// brussels : 2014-02-06 DNS.be vzw
75627562
brussels
75637563
7564-
// budapest : 2013-11-21 Minds + Machines Group Limited
7565-
budapest
7566-
75677564
// bugatti : 2015-07-23 Bugatti International SA
75687565
bugatti
75697566
@@ -7600,7 +7597,7 @@ call
76007597
// calvinklein : 2015-07-30 PVH gTLD Holdings LLC
76017598
calvinklein
76027599
7603-
// cam : 2016-04-21 AC Webconnecting Holding B.V.
7600+
// cam : 2016-04-21 Cam Connecting SARL
76047601
cam
76057602
76067603
// camera : 2013-08-27 Binky Moon, LLC
@@ -7873,9 +7870,6 @@ cruise
78737870
// cruises : 2013-12-05 Binky Moon, LLC
78747871
cruises
78757872
7876-
// csc : 2014-09-25 Alliance-One Services, Inc.
7877-
csc
7878-
78797873
// cuisinella : 2014-04-03 SCHMIDT GROUPE S.A.S.
78807874
cuisinella
78817875
@@ -10816,6 +10810,10 @@ myasustor.com
1081610810
// Submitted by Sam Smyth <[email protected]>
1081710811
cdn.prod.atlassian-dev.net
1081810812
10813+
// Authentick UG (haftungsbeschränkt) : https://authentick.net
10814+
// Submitted by Lukas Reschke <[email protected]>
10815+
translated.page
10816+
1081910817
// AVM : https://avm.de
1082010818
// Submitted by Andreas Weise <[email protected]>
1082110819
myfritz.net
@@ -10869,6 +10867,10 @@ theshop.jp
1086910867
shopselect.net
1087010868
base.shop
1087110869
10870+
// Beget Ltd
10871+
// Submitted by Lev Nekrasov <[email protected]>
10872+
*.beget.app
10873+
1087210874
// BetaInABox
1087310875
// Submitted by Adrian <[email protected]>
1087410876
betainabox.com
@@ -11245,6 +11247,11 @@ dedyn.io
1124511247
*.rss.my.id
1124611248
*.diher.solutions
1124711249
11250+
// Discord Inc : https://discord.com
11251+
// Submitted by Sahn Lam <[email protected]>
11252+
discordsays.com
11253+
discordsez.com
11254+
1124811255
// DNS Africa Ltd https://dns.business
1124911256
// Submitted by Calvin Browne <[email protected]>
1125011257
jozi.biz
@@ -11969,10 +11976,21 @@ futuremailing.at
1196911976
*.kunden.ortsinfo.at
1197011977
*.statics.cloud
1197111978
11972-
// GDS : https://www.gov.uk/service-manual/operations/operating-servicegovuk-subdomains
11973-
// Submitted by David Illsley <[email protected]>
11979+
// GDS : https://www.gov.uk/service-manual/technology/managing-domain-names
11980+
// Submitted by Stephen Ford <[email protected]>
11981+
independent-commission.uk
11982+
independent-inquest.uk
11983+
independent-inquiry.uk
11984+
independent-panel.uk
11985+
independent-review.uk
11986+
public-inquiry.uk
11987+
royal-commission.uk
1197411988
service.gov.uk
1197511989
11990+
// CDDO : https://www.gov.uk/guidance/get-an-api-domain-on-govuk
11991+
// Submitted by Jamie Tanna <[email protected]>
11992+
api.gov.uk
11993+
1197611994
// Gehirn Inc. : https://www.gehirn.co.jp/
1197711995
// Submitted by Kohei YOSHIDA <[email protected]>
1197811996
gehirn.ne.jp
@@ -12442,6 +12460,10 @@ js.org
1244212460
kaas.gg
1244312461
khplay.nl
1244412462
12463+
// Kapsi : https://kapsi.fi
12464+
// Submitted by Tomi Juntunen <[email protected]>
12465+
kapsi.fi
12466+
1244512467
// Keyweb AG : https://www.keyweb.de
1244612468
// Submitted by Martin Dannehl <[email protected]>
1244712469
keymachine.de
@@ -13051,6 +13073,10 @@ pleskns.com
1305113073
// Submitted by Maximilian Schieder <[email protected]>
1305213074
dyn53.io
1305313075
13076+
// Porter : https://porter.run/
13077+
// Submitted by Rudraksh MK <[email protected]>
13078+
onporter.run
13079+
1305413080
// Positive Codes Technology Company : http://co.bn/faq.html
1305513081
// Submitted by Zulfais <[email protected]>
1305613082
co.bn
@@ -13695,6 +13721,10 @@ me.vu
1369513721
// Submitted by Serhii Rostilo <[email protected]>
1369613722
v.ua
1369713723
13724+
// Vultr Objects : https://www.vultr.com/products/object-storage/
13725+
// Submitted by Niels Maumenee <[email protected]>
13726+
*.vultrobjects.com
13727+
1369813728
// Waffle Computer Inc., Ltd. : https://docs.waffleinfo.com
1369913729
// Submitted by Masayuki Note <[email protected]>
1370013730
wafflecell.com

0 commit comments

Comments
 (0)