Skip to content
This repository was archived by the owner on Feb 3, 2024. It is now read-only.

Commit b943d44

Browse files
authored
Merge branch 'DannyCork:master' into master
2 parents 6905eeb + fdde1a6 commit b943d44

File tree

15 files changed

+1709
-320
lines changed

15 files changed

+1709
-320
lines changed

.github/workflows/python-publish.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,8 @@ jobs:
3333
- name: Build package
3434
run: python -m build
3535
- name: Publish package
36-
uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
36+
# uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
37+
uses: pypa/gh-action-pypi-publish@release/v1
3738
with:
3839
user: __token__
3940
password: ${{ secrets.PYPI_API_TOKEN }}

TODO

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
Investigate why adding UTF-8 (IDN / punycode "xn--") based domains produces the following errors:
2+
3+
UnknownTld meta.xn--11b4c The TLD xn--11b4c is currently not supported by this package. Use validTlds() to see what toplevel domains are supported.
4+
UnknownTld meta.xn--11b5bs3a The TLD xn--11b5bs3a is currently not supported by this package. Use validTlds() to see what toplevel domains are supported.
5+
UnknownTld meta.xn--2scrj The TLD xn--2scrj is currently not supported by this package. Use validTlds() to see what toplevel domains are supported.
6+
UnknownTld meta.xn--31bsy5d The TLD xn--31bsy5d is currently not supported by this package. Use validTlds() to see what toplevel domains are supported.
7+
UnknownTld meta.xn--3hcrj The TLD xn--3hcrj is currently not supported by this package. Use validTlds() to see what toplevel domains are supported.
8+
UnknownTld meta.xn--45br5r The TLD xn--45br5r is currently not supported by this package. Use validTlds() to see what toplevel domains are supported.
9+
UnknownTld meta.xn--45brj The TLD xn--45brj is currently not supported by this package. Use validTlds() to see what toplevel domains are supported.
10+
UnknownTld meta.xn--c2br The TLD xn--c2br is currently not supported by this package. Use validTlds() to see what toplevel domains are supported.
11+
UnknownTld meta.xn--clchc0ea0b The TLD xn--clchc0ea0b is currently not supported by this package. Use validTlds() to see what toplevel domains are supported.
12+
UnknownTld meta.xn--fpcrj The TLD xn--fpcrj is currently not supported by this package. Use validTlds() to see what toplevel domains are supported.
13+
UnknownTld meta.xn--gecrj The TLD xn--gecrj is currently not supported by this package. Use validTlds() to see what toplevel domains are supported.
14+
UnknownTld meta.xn--h2breg The TLD xn--h2breg is currently not supported by this package. Use validTlds() to see what toplevel domains are supported.
15+
UnknownTld meta.xn--h2brj The TLD xn--h2brj is currently not supported by this package. Use validTlds() to see what toplevel domains are supported.
16+
UnknownTld meta.xn--hlcj6aya The TLD xn--hlcj6aya is currently not supported by this package. Use validTlds() to see what toplevel domains are supported.
17+
UnknownTld meta.xn--qwcrj The TLD xn--qwcrj is currently not supported by this package. Use validTlds() to see what toplevel domains are supported.
18+
UnknownTld meta.xn--s9brj The TLD xn--s9brj is currently not supported by this package. Use validTlds() to see what toplevel domains are supported.
19+
UnknownTld meta.xn--xkc2dl3a The TLD xn--xkc2dl3a is currently not supported by this package. Use validTlds() to see what toplevel domains are supported.
20+

analizer/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
*_cache/

analizer/analizeIanaTld.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
#! /usr/bin/env python3
2+
from typing import (
3+
Any,
4+
)
5+
6+
import io
7+
import re
8+
from dns.resolver import (
9+
Resolver,
10+
LRUCache,
11+
)
12+
13+
import json
14+
15+
from ianaCrawler import IanaCrawler
16+
from pslGrabber import PslGrabber
17+
from ianaDatabase import IanaDatabase
18+
19+
20+
def xMain() -> None:
    """Build or refresh the IanaDb.sqlite database.

    Two phases:
      1. Crawl the IANA root-zone registry (via IanaCrawler) and
         insert/update one row per TLD.
      2. Download the public suffix list (via PslGrabber) and
         insert/update one row per ICANN suffix (the PRIVATE section
         is skipped).
    """
    verbose: bool = True
    dbFileName: str = "IanaDb.sqlite"

    iad: Any = IanaDatabase(verbose=verbose)
    iad.connectDb(dbFileName)
    iad.createTableTld()
    iad.createTablePsl()

    resolver: Resolver = Resolver()
    resolver.cache = LRUCache()  # type: ignore

    iac = IanaCrawler(verbose=verbose, resolver=resolver)
    iac.getTldInfo()
    iac.addInfoToAllTld()
    xx = iac.getResults()
    for item in xx["data"]:
        sql, data = iad.makeInsOrUpdSqlTld(xx["header"], item)
        iad.doSql(sql, data)
    if verbose:
        # reuse the already-fetched results instead of calling getResults() again
        print(json.dumps(xx, indent=2, ensure_ascii=False))

    pg = PslGrabber()
    response = pg.getData(pg.getUrl())
    buf = io.StringIO(response.text)

    section = ""
    while True:
        line = buf.readline()
        if not line:
            break

        z = line.strip()
        if not z:
            continue

        # section markers toggle which part of the PSL we are in
        if "// ===END " in z:
            section = ""
        if "// ===BEGIN ICANN" in z:
            section = "ICANN"
        if "// ===BEGIN PRIVATE" in z:
            section = "PRIVATE"

        if section == "PRIVATE":
            # only the ICANN section is stored
            continue

        if re.match(r"^\s*//", z):
            # skip comment lines
            continue

        # keep only the first whitespace-separated token; derive the tld
        # (last label) and the label count n for multi-label suffixes
        n = 0
        z = z.split()[0]
        if "." in z:
            tld = z.split(".")[-1]
            n = len(z.split("."))
        else:
            tld = z

        sql, data = iad.makeInsOrUpdSqlPsl(pg.ColumnsPsl(), [tld, z, n, section, None])
        if verbose:
            print(data)
        iad.doSql(sql, data)


if __name__ == "__main__":
    xMain()

analizer/ianaCrawler.py

Lines changed: 200 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,200 @@
1+
#! /usr/bin/env python3
2+
from typing import (
3+
Optional,
4+
List,
5+
Dict,
6+
Any,
7+
# Tuple,
8+
)
9+
10+
import sys
11+
from bs4 import BeautifulSoup
12+
import time
13+
import requests_cache
14+
15+
16+
class IanaCrawler:
    """Scrape the IANA root-zone database and enrich each TLD record.

    Fetches the TLD overview table from the IANA root db page, then visits
    each per-TLD detail page to extract the WHOIS server and the URL for
    registration services, and (when a resolver is supplied) resolves the
    WHOIS host to its DNS A records. HTTP responses are cached on disk
    through requests_cache.
    """

    URL: str = "https://www.iana.org/domains/root/db"
    CacheTime: int = 3600 * 24  # default cache lifetime: 24 hours
    Session: Any = None  # requests_cache.CachedSession, created in __init__
    cacheName: str = ".iana_cache"
    verbose: bool = False
    cacheBackend: str = "filesystem"
    resolver: Any = None  # optional dns.resolver.Resolver-like object

    def __init__(
        self,
        verbose: bool = False,
        resolver: Any = None,
    ):
        self.verbose = verbose
        self.resolver = resolver
        # records/columns are created per instance (not as shared class-level
        # mutable lists) so two crawler instances never share scrape state
        self.records: List[Any] = []
        self.columns: List[Any] = []
        self.Session = requests_cache.CachedSession(
            self.cacheName,
            backend=self.cacheBackend,
        )

    def getUrl(self) -> str:
        """Return the IANA root-zone database overview URL."""
        return self.URL

    def getBasicBs(
        self,
        url: str,
    ) -> BeautifulSoup:
        """GET *url* through the cached session and parse with html.parser.

        On any request failure, back off 15 seconds and retry once; a second
        failure propagates to the caller.
        """
        try:
            response = self.Session.get(url)
        except Exception as e:
            # transient network issue: sleep and try one more time
            print(e, file=sys.stderr)
            time.sleep(15)
            response = self.Session.get(url)

        return BeautifulSoup(response.text, "html.parser")

    def getAdditionalItem(
        self,
        what: str,
        data: List[str],
    ) -> Optional[str]:
        """Find a "<what>: value" entry in the first two lines of *data*.

        Returns the stripped value, or None when no entry is present.
        """
        marker: str = f"{what}:"
        for entry in data[:2]:
            if marker in entry:
                return entry.replace(marker, "").strip()
        return None

    def getTldInfo(self) -> None:
        """Download the overview page and populate self.records/self.columns."""
        soup = self.getBasicBs(self.getUrl())
        table: Any = soup.find("table")  # the first table has the tld data

        self.records = []
        self.columns = []
        for tr in table.findAll("tr"):
            # header row: collect the column names
            ths = tr.findAll("th")
            if ths != []:
                for each in ths:
                    self.columns.append(each.text)
                continue

            # data row: the first cell links to the per-TLD detail page;
            # keep the page name (without .html) as the record key
            record = []
            for each in tr.findAll("td"):
                try:
                    link = each.find("a")["href"]
                    aa = link.split("/")
                    record.append(aa[-1].replace(".html", ""))
                    record.append(each.text.strip())
                except Exception as _:
                    _ = _
                    # cell without a link: keep its raw text only
                    record.append(each.text)
            self.records.append(record)

        self.columns.insert(0, "Link")

    def getTldPWithString(
        self,
        url: str,
        text: str,
    ) -> Optional[str]:
        """Return the text of the first <p> on *url* that contains *text*.

        Returns None when no such paragraph exists.
        """
        soup = self.getBasicBs(url)
        gfg: List[Any] = soup.find_all(lambda tag: tag.name == "p" and text in tag.text)
        if len(gfg):
            s: str = gfg[0].text.strip()
            return s
        return None

    def resolveWhois(
        self,
        whois: str,
    ) -> List[Any]:
        """Resolve *whois* to DNS A records via self.resolver.

        Retries once after a 30 second pause on failure. Returns a list of
        answer strings (multi-line answers are split into lists); returns an
        empty list when no resolver is configured.
        """
        ll: List[Any] = []
        if self.resolver:
            try:
                answer = list(self.resolver.resolve(whois, "A").response.answer)
            except Exception as e:
                print(whois, e, file=sys.stderr)
                time.sleep(30)
                answer = list(self.resolver.resolve(whois, "A").response.answer)

            # NOTE: the loop stays inside the resolver guard so `answer`
            # is never referenced when no resolver is configured
            for a in answer:
                s = str(a)
                if "\n" in s:
                    ll.append(s.split("\n"))
                else:
                    ll.append(s)

                if self.verbose:
                    print(a)
        return ll

    def addInfoToOneTld(
        self,
        tldItem: List[Any],
    ) -> List[str]:
        """Augment one overview record with Whois, RegistrationUrl and DNS data.

        Appends three fields to *tldItem* (WHOIS server, registration URL,
        A-record resolution of the WHOIS server) and returns it.
        """
        url = tldItem[0]

        if self.verbose:
            print(url, file=sys.stderr)

        # normalize IANA's "Not assigned" placeholder to None
        if tldItem[3] == "Not assigned":
            tldItem[3] = None

        zz = {
            "Whois": "WHOIS Server",
            "RegistrationUrl": "URL for registration services",
        }
        for key, val in zz.items():
            regDataW = self.getTldPWithString(self.getUrl() + "/" + url + ".html", val)
            if regDataW:
                regDataW = regDataW.replace(val, key)
                regDataA = regDataW.split("\n")

                for s in [key]:
                    tldItem.append(self.getAdditionalItem(s, regDataA))
            else:
                tldItem.append(None)

        # tldItem[4] is the WHOIS server appended just above
        if tldItem[4]:
            ll = self.resolveWhois(tldItem[4])
            tldItem.append(ll)
        else:
            tldItem.append(None)

        if self.verbose:
            print(url, tldItem, file=sys.stderr)

        return tldItem

    def addInfoToAllTld(self) -> None:
        """Run addInfoToOneTld over every record and extend the header row."""
        records2 = []
        for tldItem in self.records:
            rr = self.addInfoToOneTld(tldItem)
            if self.verbose:
                print(len(rr), rr)
            records2.append(rr)
        self.columns.insert(4, "Whois")
        self.columns.insert(5, "RegistrationUrl")
        self.columns.insert(6, "DnsResolve-A")
        self.records = records2
        self.columns[3] = self.columns[3].replace(" ", "_")

    def getResults(self) -> Dict[str, Any]:
        """Return {"header": columns, "data": records}.

        Column 3 is normalized (spaces -> underscores) so it can be used
        directly as a SQL column name.
        """
        ll = list(self.columns)
        ll[3] = ll[3].replace(" ", "_")
        return {
            "header": ll,
            "data": self.records,
        }

0 commit comments

Comments
 (0)