Skip to content
This repository was archived by the owner on Feb 3, 2024. It is now read-only.

Commit d42a7f5

Browse files
author
DannyCork
authored
Merge pull request #258 from maarten-boot/development
convert file with supported tld's to Dict
2 parents eceae65 + 7fd6d6b commit d42a7f5

File tree

15 files changed

+3793
-918
lines changed

15 files changed

+3793
-918
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,3 +72,4 @@ typescript
7272
test.out
7373
diff.out
7474
tmp/
75+
1

DONE

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,3 +40,8 @@ DONE
4040

4141
- add nic to default test group for makeTestdataAll.sh
4242

43+
- convert the list of tld to Dict
44+
- allow override or change and adding new domains without needing a new version directly
45+
- tested with existing testdomains, all responses will now report the true tld, not the one with an underscore
46+
47+
- add simple autodetect based on tld from IANA, try to use the .com patterns to see if we get something useful

README.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,15 @@ Raise an issue https://github.com/DannyCork/python-whois/issues/new
7676
2023-01-18: sorrowless
7777
* add an opportunity to specify maximum cache age
7878

79+
2023-01-25: maarten_boot
80+
* convert the tld file to a Dict, we now no longer need a mapper for python keywords or second level domains.
81+
* utf8 level domains also need no mapper anymore and can be added as is with a translation to xn--<something>
82+
* added xn-- tlds for all known utf-8 domains we currently have
83+
* we can now add new domains on the fly or change them: whois.mergeExternalDictWithRegex(aDictToOverride) see example testExtend.py
84+
85+
2023-01-27: maarten_boot
86+
* add autodetect via iana tld file (this has only tld's)
87+
7988
## Support
8089
* Python 3.x is supported.
8190
* Python 2.x IS NOT supported.

TODO

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
TODO
22

3-
# pt is difficult it often gives no data, it works in aws frankfurt through
3+
# pt is difficult it often gives no data, it works in aws frankfurt though
44
ERROR: output; missing nameserver 'ns1.dnscpanel.com.' for tld: pt
55
ERROR: output; missing nameserver 'ns2.dnscpanel.com.' for tld: pt

analize_patterns.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
#! /usr/bin/env python3
2+
3+
import sys
4+
import re
5+
from typing import (
6+
# Optional,
7+
# List,
8+
Dict,
9+
)
10+
11+
# most likely we can now introduce trailing whitespace trim on all lines from whois,
12+
# and simplify trailing whitespace rules
13+
# as \r is already gone now and that was the most distinct line ending
14+
# occasionally we need to detect \n\s+ for groups that belong together
15+
# mostly with indented blocks of nameservers
16+
17+
# import whois
18+
from whois.tld_regexpr import ZZ
19+
20+
21+
def buildRegCollection(zz: Dict):
    """Collect the distinct regex strings used per field across all tld entries.

    Args:
        zz: mapping of tld name -> per-tld mapping of field name -> regex
            string (or None).

    Returns:
        A dict mapping each field name to a dict of
        ``{regex string: compiled pattern}``. Patterns are compiled with
        ``re.IGNORECASE``. A non-string, non-None value is recorded with
        ``None`` instead of a compiled pattern. A field whose values are all
        None is still present, with an empty dict.
    """
    regCollection = {}

    for name in zz:
        z = zz[name]
        for key in z:
            # Keys starting with "_" and the "extend" key are skipped.
            # Non-string keys (including None) are skipped as well, so that
            # the startswith() test can never fail on them.
            if not isinstance(key, str) or key.startswith("_") or key == "extend":
                continue

            perField = regCollection.setdefault(key, {})

            reg = z[key]
            # Nothing to record for None; a regex already collected for this
            # field does not need to be compiled a second time.
            if reg is None or reg in perField:
                continue

            if isinstance(reg, str):
                perField[reg] = re.compile(reg, flags=re.IGNORECASE)
            else:
                perField[reg] = None

    return regCollection
49+
50+
51+
if __name__ == "__main__":
    # Report every collected regex, grouped by field name:
    # the group header goes to stderr, the "field: regex" lines to stdout.
    collected = buildRegCollection(ZZ)

    for field in sorted(collected):
        print(f"## {field}", file=sys.stderr)
        for pattern in sorted(collected[field]):
            if not pattern:
                continue
            print(f"{field}: {pattern}")

compare_known_tld.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
#! /usr/bin/env python3
2+
3+
# clone https://github.com/jophy/iana_tld_list in ./tmp
4+
5+
import urllib.request
6+
7+
from tmp.iana_tld_list.iana import IANA
8+
9+
import whois
10+
from whois._1_query import _do_whois_query
11+
12+
# Compare the tld's known to python-whois with the IANA tld list and try to
# auto-detect usable settings for tld's that python-whois does not know yet.

# allow verbose messages during testing (all on stderr)
verbose = False

# by default the all-tld file will be refreshed every 24 hours,
# but you can force a new download at any time
forceDownloadTld = False

# do you want to overwrite the results file ?
overwrite = True

# do you want interactive questions if files will be re-written?
interactive = False

# if autoProcessAll is True: all tld's will be processed (initial run > 20 minutes)
autoProcessAll = False

with_test_original = True

dirName = "/tmp/iana_data"

i = IANA(
    dirName=dirName,
    verbose=verbose,
    overwrite=overwrite,
    interactive=interactive,
    autoProcessAll=autoProcessAll,
    forceDownloadTld=forceDownloadTld,
)

# get python whois known tld's and second level domains
known = sorted(whois.validTlds())

# get iana data; the context manager closes the connection when done
URL = "https://data.iana.org/TLD/tlds-alpha-by-domain.txt"
with urllib.request.urlopen(URL) as response:
    data = response.read().decode("utf-8").lower()

# The IANA file starts with a "# Version ..." comment line: drop comment and
# blank lines so they are never mistaken for a tld further down.
dataList = sorted(
    line.strip()
    for line in data.splitlines()
    if line.strip() and not line.lstrip().startswith("#")
)

# sets give O(1) membership tests in the loops below
ianaTlds = set(dataList)
knownTlds = set(known)

# filter out known names and try to detect names not known by iana
for name in known:
    # names with a "." are second level domains, IANA only lists tld's
    if name in ianaTlds or "." in name:
        continue
    print(f"{name} tld name from python_whois is not known in IANA list")

# the IANA tld's that python-whois does not know yet
dataList2 = [name for name in dataList if name not in knownTlds]

# Try to auto detect new domains via IANA and some known common regex lists like .com
found = {}
for tld in dataList2:
    info, status = i.getInfoOnOneTld(tld)

    xtest = info and ("whois" in info) and (info["whois"]) and (info["whois"] != "NULL")
    if not xtest:
        print(f"no whois info for tld: {tld} {info}")
        continue

    wh = info["whois"]
    if wh.endswith(f".{tld}"):
        # the whois server itself lives under this tld: query its own domain
        dd = wh.split(".")[-2:]
    else:
        dd = ["meta", tld]

    print(f"try: {tld}")

    pp = {"_server": wh, "extend": "com"}
    aDictToTestOverride = {tld: pp}

    try:
        # Kept inside the try so that one failing whois server is reported
        # and skipped instead of aborting the whole scan.
        # NOTE(review): the result is unused; presumably the call is kept
        # for its side effects -- confirm against whois._1_query.
        _do_whois_query(
            dd,
            ignore_returncode=False,
            server=wh,
        )

        whois.mergeExternalDictWithRegex(aDictToTestOverride)
        d = whois.query(".".join(dd))
        if d:
            print(d.__dict__)
            if d.name_servers:
                found[tld] = pp
                print(f"## ZZ['{tld}'] = {found[tld]} # auto-detected via IANA tld")
    except Exception as e:
        print(e)

convert_to_dict.sh

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
#! /usr/bin/env bash

# Rewrite the flat "name = {...}" tld assignments of the source file into
# ZZ["name"] = {...} dictionary entries; the result is shown on stdout and
# also written to the destination file.

SRC="whois/tld_regexpr.py"
DST="whois/tld_regexpr2.py"

perl -p -e '
    # translate all tld to DICT and substitute for the real tld in case of _
    s/^([a-z]+)_([a-z]+)\s+=/ZZ["$1.$2"] =/;
    s/^([a-z]+)\s+=/ZZ["$1"] =/;
    # if we refer to a tld also change _ to .
    s/"extend":\s+"(\w+)_(\w+)"/"extend": "$1.$2"/;
' < "$SRC" |
tee "$DST"

testExtend.py

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
#!/usr/bin/python3
2+
import whois
3+
4+
Verbose = True
5+
6+
"""
7+
initial testing had errors for these
8+
we DONT have xn--3ds443g 在线 (online)
9+
we DONT have xn--45q11c 八卦 (gossip)
10+
we DONT have xn--czru2d 商城 (mall)
11+
we DONT have xn--fiq228c5hs 中文网 (website)
12+
we DONT have xn--hxt814e 网店 (webshop)
13+
"""
14+
15+
16+
def t1(domain: str, text: str):
    """Query one domain through whois and print what came back.

    Args:
        domain: the domain name handed to ``whois.query``.
        text: a label printed in front of the domain (e.g. "BEFORE").

    Any exception is caught and printed together with the domain; nothing
    is returned.
    """
    print(f"{text}: {domain}")
    try:
        result = whois.query(domain)
        # a truthy result is shown as its attribute dict, anything else as is
        print(result.__dict__ if result else result)
    except Exception as err:
        print(domain, err)
26+
27+
28+
def xMain():
    """Query the test domains before and after merging override regexes.

    Builds an override dict for the "si" and "mk" tld's, runs ``t1`` on
    every test domain, merges the overrides with
    ``whois.mergeExternalDictWithRegex`` and runs ``t1`` on every test
    domain again.
    """
    # changing an existing one
    siOverride = {
        "domain_name": r"domain:\s+(.+)",
        "status": r"status:\s+(.+)",
        "registrar": r"registrar:\s+(.+)",
        "name_servers": r"nameserver:\s*(.+)",
        "creation_date": r"created:\s+(.+)",
        "expiration_date": None,
        "updated_date": None,
        "registrant_country": None,
    }

    # defining a non-existent one; meanwhile this tld is supported,
    # so this part of the test is meaningless now
    mkOverride = {
        "extend": "com",
        "domain_name": r"domain:\s+(.+)",
        "status": r"status:\s+(.+)",
        "registrar": r"registrar:\s+(.+)",
        "name_servers": r"nserver:\s*(.+)",
        "creation_date": r"registered:\s+(.+)",
        "expiration_date": r"expire:\s+(.+)",
        "updated_date": r"changed:\s+(.+)",
        "registrant_country": None,
        "registrant": r"registrant:\s+(.+)",
    }

    aDictToTestOverride = {"si": siOverride, "mk": mkOverride}

    domains = ["google.si", "google.mk"]

    def report(label):
        # run the query for every test domain under the given label
        for domain in domains:
            t1(domain, label)

    report("BEFORE")
    whois.mergeExternalDictWithRegex(aDictToTestOverride)
    report("AFTER")
65+
66+
67+
xMain()
68+
69+
"""
70+
71+
% Domain Information over Whois protocol
72+
%
73+
% Whoisd Server Version: 3.9.0
74+
% Timestamp: Fri Nov 25 16:49:33 2022
75+
76+
domain: google.mk
77+
registrant: UNET-R11
78+
admin-c: UNET-C12
79+
nsset: UNET-NS191
80+
registrar: UNET-REG
81+
registered: 13.05.2008 14:00:00
82+
changed: 17.04.2014 12:50:32
83+
expire: 13.05.2023
84+
85+
contact: UNET-R11
86+
org: Google LLC
87+
name: Google LLC
88+
address: Amphiteatre Parkway 1600
89+
address: Mountain View
90+
address: 94043
91+
address: US
92+
phone: +1.6502530000
93+
fax-no: +1.6502530000
94+
95+
registrar: UNET-REG
96+
created: 25.03.2014 11:48:02
97+
changed: 29.09.2021 16:26:23
98+
99+
contact: UNET-C12
100+
name: Mark Monitor Inc.
101+
address: 3540 East Longwing Lane Suite 300
102+
address: Meridian
103+
address: 83646
104+
address: US
105+
phone: +1.2083895740
106+
107+
registrar: UNET-REG
108+
created: 25.03.2014 11:48:00
109+
changed: 19.11.2019 16:47:01
110+
111+
nsset: UNET-NS191
112+
nserver: ns2.google.com
113+
nserver: ns1.google.com
114+
tech-c: UNET-C12
115+
registrar: UNET-REG
116+
created: 17.04.2014 12:50:22
117+
changed: 17.04.2014 21:02:14
118+
119+
120+
"""

testdata/meta.rk/input

Lines changed: 0 additions & 38 deletions
This file was deleted.

testdata/meta.rk/output

Lines changed: 0 additions & 6 deletions
This file was deleted.

0 commit comments

Comments
 (0)