Merge pull request #233 from maarten-boot/temp2

MooCow · web-flow · commit 38fb737c9d0d · 2022-11-04T13:17:54.000Z
add support for iana domains
diff --git a/README.md b/README.md
@@ -62,6 +62,9 @@ Raise an issue https://github.com/DannyCork/python-whois/issues/new
  * all tests from the original program are now files in the ./tests directory
  * test can be done on all supported tld's with -a or --all and limitest by regex with -r <pattern> or --reg=<pattern>
 
+2022-11-04: maarten_boot
+ * add support for the IANA reserved domains example.com and example.net
+
 ## Support
  * Python 3.x is supported.
  * Python 2.x IS NOT supported.
diff --git a/test2.py b/test2.py
@@ -4,7 +4,6 @@
 import re
 import getopt
 import sys
-import json
 
 Verbose = False
 Failures = {}
diff --git a/whois/_2_parse.py b/whois/_2_parse.py
@@ -63,44 +63,34 @@ def cleanupWhoisResponse(
     verbose: bool = False,
     with_cleanup_results: bool = False,
 ) -> str:
-
-    if 0:
-        if verbose:
-            print(f"BEFORE cleanup: \n{response}", file=sys.stderr)
+    tmp2 = []
 
     tmp: List = response.split("\n")
-
-    tmp2 = []
     for line in tmp:
         # some servers respond with: % Quota exceeded in the comment section (lines starting with %)
         if "quota exceeded" in line.lower():
             raise WhoisQuotaExceeded(response)
 
-        if with_cleanup_results is True and line.startswith("%"):
+        if with_cleanup_results is True and line.startswith("%"):  # only remove if requested
             continue
 
-        if "REDACTED FOR PRIVACY" in line:
+        if "REDACTED FOR PRIVACY" in line:  # these lines contribute nothing, so ignore them
             continue
 
-        if line.startswith("Terms of Use:"):
+        if line.startswith("Terms of Use:"):  # these lines contribute nothing, so ignore them
             continue
 
         tmp2.append(line)
 
-    response = "\n".join(tmp2)
-    if 0:
-        if verbose:
-            print(f"AFTER cleanup: \n{response}", file=sys.stderr)
-
-    return response
+    return "\n".join(tmp2)
 
 
 def handleShortResponse(
     tld: str,
     dl: List,
     whois_str: str,
     verbose: bool = False,
-): # returns None or raises one of (WhoisQuotaExceeded, FailedParsingWhoisOutput)
+):  # returns None or raises one of (WhoisQuotaExceeded, FailedParsingWhoisOutput)
     if verbose:
         d = ".".join(dl)
         print(f"line count < 5:: {tld} {d} {whois_str}", file=sys.stderr)
@@ -150,6 +140,7 @@ def handleShortResponse(
     # ---------------------------------
     raise FailedParsingWhoisOutput(whois_str)
 
+
 def doDnsSec(whois_str: str) -> bool:
     whois_dnssec: Any = whois_str.split("DNSSEC:")
     if len(whois_dnssec) >= 2:
@@ -158,35 +149,79 @@ def doDnsSec(whois_str: str) -> bool:
             return True
     return False
 
-def doSourceIana(whois_str: str, verbose: bool = False) -> str:
-    # here we can handle the example.com and example.net permanent IANA domains
-
-    if verbose:
-        msg = f"i have seen source: IANA"
-        print(msg, file=sys.stderr)
-
-    whois_splitted = whois_str.split("source:       IANA")
-    if len(whois_splitted) == 2:
-        whois_str = whois_splitted[1] # often this is actually just whitespace
-    return whois_str
 
 def doIfServerNameLookForDomainName(whois_str: str, verbose: bool = False) -> str:
     # not often available anymore
     if re.findall(r"Server Name:\s?(.+)", whois_str, re.IGNORECASE):
         if verbose:
-            msg = f"i have seen Server Name:, looking for Domain Name:"
+            msg = "i have seen Server Name:, looking for Domain Name:"
             print(msg, file=sys.stderr)
         whois_str = whois_str[whois_str.find("Domain Name:") :]
     return whois_str
 
+
+def doExtractPattensIanaFromWhoisString(tld: str, r: Dict, whois_str: str, verbose: bool = False):
+    # now handle the actual format of this whois response
+    iana = {
+        "domain_name": r"domain:\s?([^\n]+)",
+        "registrar": r"organisation:\s?([^\n]+)",
+        "creation_date": r"created:\s?([^\n]+)",
+    }
+    for k, v in iana.items():
+        zz = re.findall(v, whois_str)
+        if zz:
+            if verbose:
+                print(tld, zz, file=sys.stderr)
+            r[k] = zz
+    return r
+
+
+def doSourceIana(tld: str, r: Dict, whois_str: str, verbose: bool = False) -> str:
+    # here we can handle the example.com and example.net permanent IANA domains
+
+    if verbose:
+        msg = "i have seen source: IANA"
+        print(msg, file=sys.stderr)
+
+    whois_splitted = whois_str.split("source:       IANA")
+    if len(whois_splitted) == 2 and whois_splitted[1].strip() != "":
+        # if we see source: IANA and the part after is not only whitespace
+        if verbose:
+            msg = f"after IANA: {whois_splitted[1]}"
+            print(msg, file=sys.stderr)
+
+        return whois_splitted[1], None
+
+    # try to parse this as an IANA domain, as the part after "source: IANA" is only whitespace
+    r = doExtractPattensFromWhoisString(tld, r, whois_str, verbose)  # set default values
+
+    # now handle the actual format of this whois response
+    r = doExtractPattensIanaFromWhoisString(tld, r, whois_str, verbose)
+
+    return whois_str, r
+
+
+def doExtractPattensFromWhoisString(tld: str, r: Dict, whois_str: str, verbose: bool = False):
+    for k, v in TLD_RE.get(tld, TLD_RE["com"]).items():  # use TLD_RE["com"] as default if a regex is missing
+        if k.startswith("_"):  # skip meta element like: _server or _privateRegistry
+            continue
+
+        # Historical: here we use 'empty string' as default, not None
+        if v is None:
+            r[k] = [""]
+        else:
+            r[k] = v.findall(whois_str) or [""]
+
+    return r
+
+
 def do_parse(
     whois_str: str,
     tld: str,
     dl: List[str],
     verbose: bool = False,
     with_cleanup_results=False,
 ) -> Optional[Dict[str, Any]]:
-    r: Dict[str, Any] = {"tld": tld}
 
     whois_str = cleanupWhoisResponse(
         response=whois_str,
@@ -197,22 +232,17 @@ def do_parse(
     if whois_str.count("\n") < 5:
         return handleShortResponse(tld, dl, whois_str, verbose)
 
-    r["DNSSEC"] = doDnsSec(whois_str) # check the status of DNSSEC
+    r: Dict[str, Any] = {
+        "tld": tld,
+        "DNSSEC": doDnsSec(whois_str),
+    }
 
-    if "source:       IANA" in whois_str: # prepare for handling historical IANA domains
-        whois_str = doSourceIana(whois_str, verbose)
+    if "source:       IANA" in whois_str:  # prepare for handling historical IANA domains
+        whois_str, ianaDomain = doSourceIana(tld, r, whois_str, verbose)
+        if ianaDomain is not None:
+            return ianaDomain
 
-    if "Server Name" in whois_str: # handle old type Server Name (not very common anymore)
+    if "Server Name" in whois_str:  # handle old type Server Name (not very common anymore)
         whois_str = doIfServerNameLookForDomainName(whois_str, verbose)
 
-    for k, v in TLD_RE.get(tld, TLD_RE["com"]).items(): # use TLD_RE["com"] as default if a regex is missing
-        if k.startswith("_"): # skip meta element like: _server or _privateRegistry
-            continue
-
-        # Historical: here we use 'empty string' as default, not None
-        if v is None:
-            r[k] = [""]
-        else:
-            r[k] = v.findall(whois_str) or [""]
-
-    return r
+    return doExtractPattensFromWhoisString(tld, r, whois_str, verbose)