Skip to content
This repository was archived by the owner on Feb 3, 2024. It is now read-only.

Commit b943d44

Browse files
authored
Merge branch 'DannyCork:master' into master
2 parents 6905eeb + fdde1a6 commit b943d44

File tree

15 files changed

+1709
-320
lines changed

15 files changed

+1709
-320
lines changed

.github/workflows/python-publish.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,8 @@ jobs:
3333
- name: Build package
3434
run: python -m build
3535
- name: Publish package
36-
uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
36+
# uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
37+
uses: pypa/gh-action-pypi-publish@release/v1
3738
with:
3839
user: __token__
3940
password: ${{ secrets.PYPI_API_TOKEN }}

TODO

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
Investigate why adding UTF-8 (IDN / punycode "xn--") based domains produces the following errors:
2+
3+
UnknownTld meta.xn--11b4c The TLD xn--11b4c is currently not supported by this package. Use validTlds() to see what toplevel domains are supported.
4+
UnknownTld meta.xn--11b5bs3a The TLD xn--11b5bs3a is currently not supported by this package. Use validTlds() to see what toplevel domains are supported.
5+
UnknownTld meta.xn--2scrj The TLD xn--2scrj is currently not supported by this package. Use validTlds() to see what toplevel domains are supported.
6+
UnknownTld meta.xn--31bsy5d The TLD xn--31bsy5d is currently not supported by this package. Use validTlds() to see what toplevel domains are supported.
7+
UnknownTld meta.xn--3hcrj The TLD xn--3hcrj is currently not supported by this package. Use validTlds() to see what toplevel domains are supported.
8+
UnknownTld meta.xn--45br5r The TLD xn--45br5r is currently not supported by this package. Use validTlds() to see what toplevel domains are supported.
9+
UnknownTld meta.xn--45brj The TLD xn--45brj is currently not supported by this package. Use validTlds() to see what toplevel domains are supported.
10+
UnknownTld meta.xn--c2br The TLD xn--c2br is currently not supported by this package. Use validTlds() to see what toplevel domains are supported.
11+
UnknownTld meta.xn--clchc0ea0b The TLD xn--clchc0ea0b is currently not supported by this package. Use validTlds() to see what toplevel domains are supported.
12+
UnknownTld meta.xn--fpcrj The TLD xn--fpcrj is currently not supported by this package. Use validTlds() to see what toplevel domains are supported.
13+
UnknownTld meta.xn--gecrj The TLD xn--gecrj is currently not supported by this package. Use validTlds() to see what toplevel domains are supported.
14+
UnknownTld meta.xn--h2breg The TLD xn--h2breg is currently not supported by this package. Use validTlds() to see what toplevel domains are supported.
15+
UnknownTld meta.xn--h2brj The TLD xn--h2brj is currently not supported by this package. Use validTlds() to see what toplevel domains are supported.
16+
UnknownTld meta.xn--hlcj6aya The TLD xn--hlcj6aya is currently not supported by this package. Use validTlds() to see what toplevel domains are supported.
17+
UnknownTld meta.xn--qwcrj The TLD xn--qwcrj is currently not supported by this package. Use validTlds() to see what toplevel domains are supported.
18+
UnknownTld meta.xn--s9brj The TLD xn--s9brj is currently not supported by this package. Use validTlds() to see what toplevel domains are supported.
19+
UnknownTld meta.xn--xkc2dl3a The TLD xn--xkc2dl3a is currently not supported by this package. Use validTlds() to see what toplevel domains are supported.
20+

analizer/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
*_cache/

analizer/analizeIanaTld.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
#! /usr/bin/env python3
2+
from typing import (
3+
Any,
4+
)
5+
6+
import io
7+
import re
8+
from dns.resolver import (
9+
Resolver,
10+
LRUCache,
11+
)
12+
13+
import json
14+
15+
from ianaCrawler import IanaCrawler
16+
from pslGrabber import PslGrabber
17+
from ianaDatabase import IanaDatabase
18+
19+
20+
def xMain() -> None:
    """Build or refresh the IanaDb.sqlite database.

    Two phases:
      1. Crawl the IANA root-zone registry (via IanaCrawler) and
         insert/update one row per TLD.
      2. Download the public suffix list (via PslGrabber) and
         insert/update one row per ICANN suffix (the PRIVATE section
         is skipped).
    """
    verbose: bool = True
    dbFileName: str = "IanaDb.sqlite"

    iad: Any = IanaDatabase(verbose=verbose)
    iad.connectDb(dbFileName)
    iad.createTableTld()
    iad.createTablePsl()

    resolver: Resolver = Resolver()
    resolver.cache = LRUCache()  # type: ignore

    iac = IanaCrawler(verbose=verbose, resolver=resolver)
    iac.getTldInfo()
    iac.addInfoToAllTld()
    xx = iac.getResults()
    for item in xx["data"]:
        sql, data = iad.makeInsOrUpdSqlTld(xx["header"], item)
        iad.doSql(sql, data)
    if verbose:
        # reuse the already-fetched results instead of calling getResults() again
        print(json.dumps(xx, indent=2, ensure_ascii=False))

    pg = PslGrabber()
    response = pg.getData(pg.getUrl())
    buf = io.StringIO(response.text)

    section = ""
    while True:
        line = buf.readline()
        if not line:
            break

        z = line.strip()
        if not z:
            continue

        # section markers toggle which part of the PSL we are in
        if "// ===END " in z:
            section = ""
        if "// ===BEGIN ICANN" in z:
            section = "ICANN"
        if "// ===BEGIN PRIVATE" in z:
            section = "PRIVATE"

        if section == "PRIVATE":
            # only the ICANN section is stored
            continue

        if re.match(r"^\s*//", z):
            # skip comment lines
            continue

        # keep only the first whitespace-separated token; derive the tld
        # (last label) and the label count n for multi-label suffixes
        n = 0
        z = z.split()[0]
        if "." in z:
            tld = z.split(".")[-1]
            n = len(z.split("."))
        else:
            tld = z

        sql, data = iad.makeInsOrUpdSqlPsl(pg.ColumnsPsl(), [tld, z, n, section, None])
        if verbose:
            print(data)
        iad.doSql(sql, data)


if __name__ == "__main__":
    xMain()

analizer/ianaCrawler.py

Lines changed: 200 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,200 @@
1+
#! /usr/bin/env python3
2+
from typing import (
3+
Optional,
4+
List,
5+
Dict,
6+
Any,
7+
# Tuple,
8+
)
9+
10+
import sys
11+
from bs4 import BeautifulSoup
12+
import time
13+
import requests_cache
14+
15+
16+
class IanaCrawler:
    """Scrape the IANA root-zone database and enrich each TLD record.

    Fetches the TLD overview table from the IANA root db page, then visits
    each per-TLD detail page to extract the WHOIS server and the URL for
    registration services, and (when a resolver is supplied) resolves the
    WHOIS host to its DNS A records. HTTP responses are cached on disk
    through requests_cache.
    """

    URL: str = "https://www.iana.org/domains/root/db"
    CacheTime: int = 3600 * 24  # default cache lifetime: 24 hours
    Session: Any = None  # requests_cache.CachedSession, created in __init__
    cacheName: str = ".iana_cache"
    verbose: bool = False
    cacheBackend: str = "filesystem"
    resolver: Any = None  # optional dns.resolver.Resolver-like object

    def __init__(
        self,
        verbose: bool = False,
        resolver: Any = None,
    ):
        self.verbose = verbose
        self.resolver = resolver
        # records/columns are created per instance (not as shared class-level
        # mutable lists) so two crawler instances never share scrape state
        self.records: List[Any] = []
        self.columns: List[Any] = []
        self.Session = requests_cache.CachedSession(
            self.cacheName,
            backend=self.cacheBackend,
        )

    def getUrl(self) -> str:
        """Return the IANA root-zone database overview URL."""
        return self.URL

    def getBasicBs(
        self,
        url: str,
    ) -> BeautifulSoup:
        """GET *url* through the cached session and parse with html.parser.

        On any request failure, back off 15 seconds and retry once; a second
        failure propagates to the caller.
        """
        try:
            response = self.Session.get(url)
        except Exception as e:
            # transient network issue: sleep and try one more time
            print(e, file=sys.stderr)
            time.sleep(15)
            response = self.Session.get(url)

        return BeautifulSoup(response.text, "html.parser")

    def getAdditionalItem(
        self,
        what: str,
        data: List[str],
    ) -> Optional[str]:
        """Find a "<what>: value" entry in the first two lines of *data*.

        Returns the stripped value, or None when no entry is present.
        """
        marker: str = f"{what}:"
        for entry in data[:2]:
            if marker in entry:
                return entry.replace(marker, "").strip()
        return None

    def getTldInfo(self) -> None:
        """Download the overview page and populate self.records/self.columns."""
        soup = self.getBasicBs(self.getUrl())
        table: Any = soup.find("table")  # the first table has the tld data

        self.records = []
        self.columns = []
        for tr in table.findAll("tr"):
            # header row: collect the column names
            ths = tr.findAll("th")
            if ths != []:
                for each in ths:
                    self.columns.append(each.text)
                continue

            # data row: the first cell links to the per-TLD detail page;
            # keep the page name (without .html) as the record key
            record = []
            for each in tr.findAll("td"):
                try:
                    link = each.find("a")["href"]
                    aa = link.split("/")
                    record.append(aa[-1].replace(".html", ""))
                    record.append(each.text.strip())
                except Exception as _:
                    _ = _
                    # cell without a link: keep its raw text only
                    record.append(each.text)
            self.records.append(record)

        self.columns.insert(0, "Link")

    def getTldPWithString(
        self,
        url: str,
        text: str,
    ) -> Optional[str]:
        """Return the text of the first <p> on *url* that contains *text*.

        Returns None when no such paragraph exists.
        """
        soup = self.getBasicBs(url)
        gfg: List[Any] = soup.find_all(lambda tag: tag.name == "p" and text in tag.text)
        if len(gfg):
            s: str = gfg[0].text.strip()
            return s
        return None

    def resolveWhois(
        self,
        whois: str,
    ) -> List[Any]:
        """Resolve *whois* to DNS A records via self.resolver.

        Retries once after a 30 second pause on failure. Returns a list of
        answer strings (multi-line answers are split into lists); returns an
        empty list when no resolver is configured.
        """
        ll: List[Any] = []
        if self.resolver:
            try:
                answer = list(self.resolver.resolve(whois, "A").response.answer)
            except Exception as e:
                print(whois, e, file=sys.stderr)
                time.sleep(30)
                answer = list(self.resolver.resolve(whois, "A").response.answer)

            # NOTE: the loop stays inside the resolver guard so `answer`
            # is never referenced when no resolver is configured
            for a in answer:
                s = str(a)
                if "\n" in s:
                    ll.append(s.split("\n"))
                else:
                    ll.append(s)

                if self.verbose:
                    print(a)
        return ll

    def addInfoToOneTld(
        self,
        tldItem: List[Any],
    ) -> List[str]:
        """Augment one overview record with Whois, RegistrationUrl and DNS data.

        Appends three fields to *tldItem* (WHOIS server, registration URL,
        A-record resolution of the WHOIS server) and returns it.
        """
        url = tldItem[0]

        if self.verbose:
            print(url, file=sys.stderr)

        # normalize IANA's "Not assigned" placeholder to None
        if tldItem[3] == "Not assigned":
            tldItem[3] = None

        zz = {
            "Whois": "WHOIS Server",
            "RegistrationUrl": "URL for registration services",
        }
        for key, val in zz.items():
            regDataW = self.getTldPWithString(self.getUrl() + "/" + url + ".html", val)
            if regDataW:
                regDataW = regDataW.replace(val, key)
                regDataA = regDataW.split("\n")

                for s in [key]:
                    tldItem.append(self.getAdditionalItem(s, regDataA))
            else:
                tldItem.append(None)

        # tldItem[4] is the WHOIS server appended just above
        if tldItem[4]:
            ll = self.resolveWhois(tldItem[4])
            tldItem.append(ll)
        else:
            tldItem.append(None)

        if self.verbose:
            print(url, tldItem, file=sys.stderr)

        return tldItem

    def addInfoToAllTld(self) -> None:
        """Run addInfoToOneTld over every record and extend the header row."""
        records2 = []
        for tldItem in self.records:
            rr = self.addInfoToOneTld(tldItem)
            if self.verbose:
                print(len(rr), rr)
            records2.append(rr)
        self.columns.insert(4, "Whois")
        self.columns.insert(5, "RegistrationUrl")
        self.columns.insert(6, "DnsResolve-A")
        self.records = records2
        self.columns[3] = self.columns[3].replace(" ", "_")

    def getResults(self) -> Dict[str, Any]:
        """Return {"header": columns, "data": records}.

        Column 3 is normalized (spaces -> underscores) so it can be used
        directly as a SQL column name.
        """
        ll = list(self.columns)
        ll[3] = ll[3].replace(" ", "_")
        return {
            "header": ll,
            "data": self.records,
        }

0 commit comments

Comments
 (0)