Skip to content

Commit e8fbd73

Browse files
committed
Fix .ai parsing and normalization (joepie91#45), add 'role' field, improve output format of test runner
1 parent baee878 commit e8fbd73

File tree

18 files changed

+241
-41
lines changed

18 files changed

+241
-41
lines changed

pwhois

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ else:
7575
data_map = OrderedDict({})
7676
data_map["handle"] ="NIC handle"
7777
data_map["name"] ="Name"
78+
data_map["role"] ="Role"
7879
data_map["organization"] = "Organization"
7980
data_map["street"] = "Street address"
8081
data_map["postalcode"] = "Postal code"

pythonwhois/parse.py

Lines changed: 85 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -317,7 +317,7 @@ def allow_trailing_comma_dict(regexes):
317317
" Technical Contact:\n (?P<name>.+) (?P<email>.+)\n (?P<phone>.*)\n (?P<fax>.*)\n", # .com.tw (Western registrars)
318318
"Technical Contact Information:\n\n(?:Given name: (?P<firstname>.+)\n)?(?:Family name: (?P<lastname>.+)\n)?(?:Company name: (?P<organization>.+)\n)?Address: (?P<street>.+)\nCountry: (?P<country>.+)\nPhone: (?P<phone>.*)\nFax: (?P<fax>.*)\nEmail: (?P<email>.+)\n(?:Account Name: (?P<handle>.+)\n)?", # HKDNR (.hk)
319319
"TECH ID:(?P<handle>.+)\nTECH Name:(?P<name>.*)\n(?:TECH Organization:(?P<organization>.*)\n)?TECH Street1:(?P<street1>.+?)\n(?:TECH Street2:(?P<street2>.+?)\n(?:TECH Street3:(?P<street3>.+?)\n)?)?TECH City:(?P<city>.+)\nTECH State:(?P<state>.*)\nTECH Postal Code:(?P<postalcode>.+)\nTECH Country:(?P<country>[A-Z]+)\nTECH Phone:(?P<phone>.*?)\nTECH Fax:(?P<fax>.*)\nTECH Email:(?P<email>.+)\n", # Realtime Register
320-
"Technical Contact\n NIC Handle \(if known\)\.+:(?P<handle>.*)\n \(I\)ndividual \(R\)ole\.+:.*\n Name \(Last, First\)\.+:(?P<name>.*)\n Organization Name\.+:(?P<organization>.*)\n Street Address\.+:(?P<street1>.*)\n City\.+: (?P<city>.*)\n State\.+: (?P<state>.*)\n Postal Code\.+:(?P<postalcode>.*)\n Country\.+:(?P<country>.*)\n Phone Number\.+:(?P<phone>.*)\n Fax Number\.+:(?P<fax>.*)\n E-Mailbox\.+:(?P<email>.*)", # .ai
320+
"Technical Contact\n NIC Handle \(if known\)\.+:(?P<handle>.*)\n \(I\)ndividual \(R\)ole\.+:(?P<role>.*)\n Name \(Last, First\)\.+:(?P<name>.*)\n Organization Name\.+:(?P<organization>.*)\n Street Address\.+:(?P<street1>.*)\n City\.+: (?P<city>.*)\n State\.+: (?P<state>.*)\n Postal Code\.+:(?P<postalcode>.*)\n Country\.+:(?P<country>.*)\n Phone Number\.+:(?P<phone>.*)\n Fax Number\.+:(?P<fax>.*)\n E-Mailbox\.+:(?P<email>.*)", # .ai
321321
]
322322

323323
admin_contact_regexes = [
@@ -344,7 +344,7 @@ def allow_trailing_comma_dict(regexes):
344344
" Administrative Contact:\n (?P<name>.+) (?P<email>.+)\n (?P<phone>.*)\n (?P<fax>.*)\n", # .com.tw (Western registrars)
345345
"Administrative Contact Information:\n\n(?:Given name: (?P<firstname>.+)\n)?(?:Family name: (?P<lastname>.+)\n)?(?:Company name: (?P<organization>.+)\n)?Address: (?P<street>.+)\nCountry: (?P<country>.+)\nPhone: (?P<phone>.*)\nFax: (?P<fax>.*)\nEmail: (?P<email>.+)\n(?:Account Name: (?P<handle>.+)\n)?", # HKDNR (.hk)
346346
"ADMIN ID:(?P<handle>.+)\nADMIN Name:(?P<name>.*)\n(?:ADMIN Organization:(?P<organization>.*)\n)?ADMIN Street1:(?P<street1>.+?)\n(?:ADMIN Street2:(?P<street2>.+?)\n(?:ADMIN Street3:(?P<street3>.+?)\n)?)?ADMIN City:(?P<city>.+)\nADMIN State:(?P<state>.*)\nADMIN Postal Code:(?P<postalcode>.+)\nADMIN Country:(?P<country>[A-Z]+)\nADMIN Phone:(?P<phone>.*?)\nADMIN Fax:(?P<fax>.*)\nADMIN Email:(?P<email>.+)\n", # Realtime Register
347-
"Administrative Contact\n NIC Handle \(if known\)\.+:(?P<handle>.*)\n \(I\)ndividual \(R\)ole\.+:.*\n Name \(Last, First\)\.+:(?P<name>.*)\n Organization Name\.+:(?P<organization>.*)\n Street Address\.+:(?P<street1>.*)\n City\.+: (?P<city>.*)\n State\.+: (?P<state>.*)\n Postal Code\.+:(?P<postalcode>.*)\n Country\.+:(?P<country>.*)\n Phone Number\.+:(?P<phone>.*)\n Fax Number\.+:(?P<fax>.*)\n E-Mailbox\.+:(?P<email>.*)", # .ai
347+
"Administrative Contact\n NIC Handle \(if known\)\.+:(?P<handle>.*)\n \(I\)ndividual \(R\)ole\.+:(?P<role>.*)\n Name \(Last, First\)\.+:(?P<name>.*)\n Organization Name\.+:(?P<organization>.*)\n Street Address\.+:(?P<street1>.*)\n City\.+: (?P<city>.*)\n State\.+: (?P<state>.*)\n Postal Code\.+:(?P<postalcode>.*)\n Country\.+:(?P<country>.*)\n Phone Number\.+:(?P<phone>.*)\n Fax Number\.+:(?P<fax>.*)\n E-Mailbox\.+:(?P<email>.*)", # .ai
348348
]
349349

350350
billing_contact_regexes = [
@@ -362,7 +362,7 @@ def allow_trailing_comma_dict(regexes):
362362
"Billing Contact Information :[ ]*\n[ ]+(?P<firstname>.*)\n[ ]+(?P<lastname>.*)\n[ ]+(?P<organization>.*)\n[ ]+(?P<email>.*)\n[ ]+(?P<street>.*)\n[ ]+(?P<city>.*)\n[ ]+(?P<postalcode>.*)\n[ ]+(?P<phone>.*)\n[ ]+(?P<fax>.*)\n\n", # GAL Communication
363363
"Billing Contact:\n Name: (?P<name>.+)\n City: (?P<city>.+)\n State: (?P<state>.+)\n Country: (?P<country>.+)\n", # Akky (.com.mx)
364364
"BILLING ID:(?P<handle>.+)\nBILLING Name:(?P<name>.*)\n(?:BILLING Organization:(?P<organization>.*)\n)?BILLING Street1:(?P<street1>.+?)\n(?:BILLING Street2:(?P<street2>.+?)\n(?:BILLING Street3:(?P<street3>.+?)\n)?)?BILLING City:(?P<city>.+)\nBILLING State:(?P<state>.*)\nBILLING Postal Code:(?P<postalcode>.+)\nBILLING Country:(?P<country>[A-Z]+)\nBILLING Phone:(?P<phone>.*?)\nBILLING Fax:(?P<fax>.*)\nBILLING Email:(?P<email>.+)\n", # Realtime Register
365-
"Billing Contact\n NIC Handle \(if known\)\.+:(?P<handle>.*)\n \(I\)ndividual \(R\)ole\.+:.*\n Name \(Last, First\)\.+:(?P<name>.*)\n Organization Name\.+:(?P<organization>.*)\n Street Address\.+:(?P<street1>.*)\n City\.+: (?P<city>.*)\n State\.+: (?P<state>.*)\n Postal Code\.+:(?P<postalcode>.*)\n Country\.+:(?P<country>.*)\n Phone Number\.+:(?P<phone>.*)\n Fax Number\.+:(?P<fax>.*)\n E-Mailbox\.+:(?P<email>.*)", # .ai
365+
"Billing Contact\n NIC Handle \(if known\)\.+:(?P<handle>.*)\n \(I\)ndividual \(R\)ole\.+:(?P<role>.*)\n Name \(Last, First\)\.+:(?P<name>.*)\n Organization Name\.+:(?P<organization>.*)\n Street Address\.+:(?P<street1>.*)\n City\.+: (?P<city>.*)\n State\.+: (?P<state>.*)\n Postal Code\.+:(?P<postalcode>.*)\n Country\.+:(?P<country>.*)\n Phone Number\.+:(?P<phone>.*)\n Fax Number\.+:(?P<fax>.*)\n E-Mailbox\.+:(?P<email>.*)", # .ai
366366
]
367367

368368
# Some registries use NIC handle references instead of directly listing contacts...
@@ -433,17 +433,19 @@ def allow_trailing_comma_dict(regexes):
433433
)
434434

435435
organization_regexes = (
436-
r"(?:^|\s|,)limited\.?($|\s)",
437-
r"(?:^|\s|,)holdings\.?($|\s)",
438-
r"(?:^|\s|,)(?:in)?corporat(?:ed?|ion)\.?($|\s)",
439-
r"(?:^|\s|,)company\.?($|\s)",
440-
r"(?:^|\s|,)operations\.?($|\s)",
441-
r"(?:^|\s|,)association\.?($|\s)",
442-
r"(?:^|\s|,)council\.?($|\s)",
443-
r"(?:^|\s|,)university\.?($|\s)",
444-
r"(?:^|\s|,)college\.?($|\s)",
445-
r"(?:^|\s|,)services?\.?($|\s)",
446-
r"(?:^|\s|,)cabinet\.?($|\s)",
436+
r"(?:^|\s|,)limited\.?($|\s|,)",
437+
r"(?:^|\s|,)holdings\.?($|\s|,)",
438+
r"(?:^|\s|,)(?:in)?corporat(?:ed?|ion)\.?($|\s|,)",
439+
r"(?:^|\s|,)company\.?($|\s|,)",
440+
r"(?:^|\s|,)operations\.?($|\s|,)",
441+
r"(?:^|\s|,)association\.?($|\s|,)",
442+
r"(?:^|\s|,)council\.?($|\s|,)",
443+
r"(?:^|\s|,)university\.?($|\s|,)",
444+
r"(?:^|\s|,)college\.?($|\s|,)",
445+
r"(?:^|\s|,)services?\.?($|\s|,)",
446+
r"(?:^|\s|,)cabinet\.?($|\s|,)",
447+
r"(?:^|\s|,)billing\.?($|\s|,)",
448+
r"(?:^|\s|,)administration\.?($|\s|,)",
447449
)
448450

449451
known_abbreviations = allow_trailing_comma_dict({
@@ -459,6 +461,10 @@ def allow_trailing_comma_dict(regexes):
459461
"d/b/a": r"^(?:d\/b\/a|dba)$",
460462
})
461463

464+
role_regexes = (
465+
r"(?:^|\s|,)administrator\.?($|\s|,)",
466+
)
467+
462468
country_regexes = [r"(?:\s|,)" + dotify(country_code.upper()) + r"($|\s)" for country_code in countries.keys()]
463469

464470
for key in ('id', 'status', 'creation_date', 'expiration_date', 'updated_date', 'registrar', 'whois_server', 'nameservers', 'emails'):
@@ -475,6 +481,7 @@ def allow_trailing_comma_dict(regexes):
475481
admin_contact_regexes = precompile_regexes(admin_contact_regexes)
476482
nic_contact_regexes = precompile_regexes(nic_contact_regexes)
477483

484+
role_regexes = precompile_regexes(role_regexes, re.IGNORECASE)
478485
organization_regexes = precompile_regexes(organization_regexes, re.IGNORECASE)
479486
abbreviated_organization_regexes = precompile_regexes(abbreviated_organization_regexes, re.IGNORECASE)
480487
country_regexes = precompile_regexes(country_regexes)
@@ -750,6 +757,11 @@ def normalize_data(data, normalized):
750757
is_organization = is_organization_name(line) or is_fuzzy_duplicate(line, organization_lines)
751758

752759
if is_organization:
760+
if "," in line:
761+
name_words = re.split(name_separators, line)
762+
if is_full_incorporation_form(name_words[0]):
763+
line = reverse_name_comma(line)
764+
753765
new_organization_lines.append(line)
754766
del name_lines[i]
755767

@@ -775,31 +787,66 @@ def normalize_data(data, normalized):
775787
elif 'organization' in contact:
776788
del contact["organization"]
777789

790+
new_roles = []
791+
778792
if 'name' in contact:
779793
# Check whether the name is reversed; first name last, last name first.
780794
names = contact['name'].splitlines()
781795
unswapped_names = []
782796

783797
for name in names:
784798
if "," in name:
785-
name_segments = [segment.strip() for segment in name.split(",")]
786-
first_segment = name_segments.pop()
787-
name = first_segment + " " + ', '.join(name_segments)
799+
name = reverse_name_comma(name)
788800
else:
789801
# Split the name into normalized (i.e. alpha-only) 'words' for comparison. We only care about ASCII, as our first-name
790802
# list currently only contains English names.
791803
name_words = [filter_characters(segment, non_name_characters) for segment in name.split()]
792804

793805
if len(name_words) > 1 and is_first_name(name_words[-1]) and not is_first_name(name_words[0]):
794806
# The last 'word' was in the common first names, but the first one was not. Likely swapped around.
795-
name_segments = re.split(name_separators, name)
796-
name_segments.insert(0, name_segments.pop())
797-
name = ' '.join(name_segments)
807+
name = reverse_name(name)
798808

799-
unswapped_names.append(name)
809+
if is_role(name):
810+
new_roles.append(name)
811+
else:
812+
unswapped_names.append(name)
800813

801-
contact['name'] = "\n".join(unswapped_names)
802-
814+
if len(unswapped_names) > 0:
815+
contact['name'] = "\n".join(unswapped_names)
816+
else:
817+
del contact['name']
818+
819+
if 'organization' in contact:
820+
organizations = contact['organization'].splitlines()
821+
new_organizations = []
822+
823+
for organization in organizations:
824+
if is_role(organization):
825+
new_roles.append(organization)
826+
else:
827+
new_organizations.append(organization)
828+
829+
if len(new_organizations) > 0:
830+
contact['organization'] = "\n".join(new_organizations)
831+
else:
832+
del contact['organization']
833+
834+
if 'street' in contact:
835+
streets = contact['street'].splitlines()
836+
837+
if is_role(streets[0]):
838+
new_roles.append(streets[0])
839+
streets = streets[1:]
840+
841+
contact['street'] = "\n".join(streets)
842+
843+
if 'role' in contact:
844+
existing_roles = contact['role'].splitlines()
845+
else:
846+
existing_roles = []
847+
848+
if len(new_roles) > 0:
849+
contact['role'] = "\n".join(new_roles + existing_roles)
803850

804851
if "street" in contact:
805852
lines = [x.strip() for x in contact["street"].splitlines()]
@@ -833,7 +880,7 @@ def normalize_data(data, normalized):
833880
if key in contact and contact[key] is not None and (normalized == True or key in normalized):
834881
contact[key] = normalize_name(contact[key], abbreviation_threshold=3)
835882

836-
for key in ("city", "organization", "state", "country"):
883+
for key in ("role", "city", "organization", "state", "country"):
837884
if key in contact and contact[key] is not None and (normalized == True or key in normalized):
838885
contact[key] = normalize_name(contact[key], abbreviation_threshold=3, length_threshold=3, check_known_incorrect=True)
839886

@@ -945,7 +992,10 @@ def is_full_incorporation_form(word):
945992
return match_regexes(word, organization_regexes)
946993

947994
def is_abbreviated_incorporation_form(word):
948-
return match_regexes(word, abbreviated_organization_regexes)
995+
return match_regexes(word, abbreviated_organization_regexes)
996+
997+
def is_role(line):
    """Report whether `line` matches any of the role patterns.

    Delegates to `match_regexes` with the module-level `role_regexes` and
    hands back whatever that helper returns.
    """
    role_match = match_regexes(line, role_regexes)
    return role_match
949999

9501000
def is_country(word):
9511001
return match_regexes(word, country_regexes)
@@ -986,6 +1036,16 @@ def match_regexes_dict(string, regexes):
9861036
def capitalize_words(line):
9871037
return ' '.join([word.capitalize() for word in line.split(" ")])
9881038

1039+
def reverse_name(name):
    """Move the last segment of `name` to the front.

    The name is split on the module-level `name_separators` pattern and the
    resulting segments are re-joined with single spaces, with the final
    segment placed first.
    """
    segments = re.split(name_separators, name)
    # re.split always yields at least one element, so pop() is safe here.
    trailing = segments.pop()
    return ' '.join([trailing] + segments)
1043+
1044+
def reverse_name_comma(name):
    """Turn a comma-separated "Last, First" value into "First Last".

    The text after the final comma is moved to the front; the remaining
    segments keep their original order and are re-joined with ", ".

    Empty segments (from a trailing comma, doubled commas, or an empty
    input) are discarded so the result never carries stray leading or
    trailing padding. A value without any comma is returned stripped and
    otherwise unchanged.

    :param name: The raw name string, typically containing one comma.
    :returns: The re-ordered name as a string ("" when nothing is left).
    """
    segments = [segment.strip() for segment in name.split(",")]
    # Drop blanks so "Smith," or "Doe,, John" cannot produce dangling
    # separators or whitespace in the output.
    segments = [segment for segment in segments if segment]

    if not segments:
        return ""

    leading = segments.pop()

    if not segments:
        # Only one real segment: there is nothing to swap around.
        return leading

    return leading + " " + ', '.join(segments)
1048+
9891049
def normalize_word(word, abbreviation_threshold=4, lowercase_domains=True):
9901050
if is_known_abbreviation(word):
9911051
return get_known_abbreviation(word)

test.py

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,15 @@
77
unicode_stdout = codecs.getwriter(sys.stdout.encoding)(sys.stdout)
88
unicode_stderr = codecs.getwriter(sys.stderr.encoding)(sys.stderr)
99

10+
def is_string(data):
    """Test whether `data` is a string on both Python 2 and Python 3.

    On Python 2 the check is made against `basestring`; on Python 3 it is
    made against `str`.
    """
    if sys.version_info < (3, 0):
        # `basestring` only exists on Python 2, so it is referenced solely
        # inside this branch.
        return isinstance(data, basestring)

    return isinstance(data, str)
18+
1019
# FIXME: The testing script is currently incapable of testing referenced NIC handles that are
1120
# retrieved separately, such as is the case with the JPRS registry for .co.jp. This
1221
# really needs to be fixed, to ensure that contact parsing for this doesn't break.
@@ -28,7 +37,6 @@ def read_encoded_file(file_path):
2837
except Exception:
2938
pass
3039

31-
3240
parser = argparse.ArgumentParser(description="Runs or modifies the test suite for python-whois.")
3341
parser.add_argument("mode", nargs=1, choices=["run", "update"], default="run", help="Whether to run or update the tests. Only update if you know what you're doing!")
3442
parser.add_argument("target", nargs="+", help="The targets to run/modify tests for. Use 'all' to run the full test suite.")
@@ -47,9 +55,14 @@ def encoded_json_dumps(obj):
4755
def json_fallback(obj):
    """Convert `obj` into a form suitable for JSON output and comparison.

    - `datetime.datetime` values become their ISO 8601 string.
    - Strings are passed through `indent_values`.
    - Anything else is returned untouched.
    """
    if isinstance(obj, datetime.datetime):
        return obj.isoformat()

    if is_string(obj):
        return indent_values(obj)

    return obj
5262

63+
def indent_values(string):
    """Return `string` with a space inserted after every newline.

    Continuation lines of a multi-line value are thereby indented; the
    first line is left as it is.
    """
    lines = string.split("\n")
    return "\n ".join(lines)
65+
5366
def recursive_encode(obj, encoding):
5467
for key in list(obj.keys()):
5568
if isinstance(obj[key], dict):
@@ -69,11 +82,13 @@ def recursive_compare(obj1, obj2, chain=[]):
6982
s1 = set(obj1.keys())
7083
s2 = set(obj2.keys())
7184

72-
for item in s1.difference(s2):
73-
errors.append("(%s) Key present in previous data, but missing in current data: %s" % (chain_name, item))
85+
for key in s1.difference(s2):
86+
value = json_fallback(obj1[key])
87+
errors.append("(%s) Key present in previous data, but missing in current data: `%s`\n [---] %s" % (chain_name, key, value))
7488

75-
for item in s2.difference(s1):
76-
errors.append("(%s) New key present in current data, but missing in previous data: %s" % (chain_name, item))
89+
for key in s2.difference(s1):
90+
value = json_fallback(obj2[key])
91+
errors.append("(%s) New key present in current data, but missing in previous data: `%s`\n [+++] %s" % (chain_name, key, value))
7792

7893
for key in s1.intersection(s2):
7994
if isinstance(obj1[key], dict) and isinstance(obj2[key], dict):
@@ -82,10 +97,10 @@ def recursive_compare(obj1, obj2, chain=[]):
8297
lst1 = [json_fallback(x) for x in obj1[key]]
8398
lst2 = [json_fallback(x) for x in obj2[key]]
8499
if set(lst1) != set(lst2):
85-
errors.append("(%s) List mismatch in key %s.\n [old] %s\n [new] %s" % (chain_name, key, set(lst1), set(lst2)))
100+
errors.append("(%s) List mismatch in key `%s`.\n [old] %s\n [new] %s" % (chain_name, key, set(lst1), set(lst2)))
86101
else:
87102
if json_fallback(obj1[key]) != json_fallback(obj2[key]):
88-
errors.append("(%s) Data mismatch in key %s.\n [old] %s\n [new] %s" % (chain_name, key, json_fallback(obj1[key]), json_fallback(obj2[key])))
103+
errors.append("(%s) Data mismatch in key `%s`.\n [old] %s\n [new] %s" % (chain_name, key, json_fallback(obj1[key]), json_fallback(obj2[key])))
89104

90105
return errors
91106

test/data/apple.ai

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
DOMAIN INFORMATION
2+
3+
Complete Domain Name........: apple.ai
4+
Organization Using Domain Name
5+
Organization Name..........: Apple Inc.
6+
Street Address.............: 1 Infinite Loop
7+
City.......................: Cupertino
8+
State......................: CA
9+
Postal Code................: 95014
10+
Country....................: United States
11+
Administrative Contact
12+
NIC Handle (if known)......:
13+
(I)ndividual (R)ole........:
14+
Name (Last, First).........:
15+
Organization Name..........: Apple Inc.
16+
Street Address.............: 1 Infinite Loop
17+
City.......................: Cupertino
18+
State......................: CA
19+
Postal Code................: 95014
20+
Country....................: United States
21+
Phone Number...............: +1.4089961010
22+
Fax Number.................: +1.4089741560
23+
E-Mailbox..................: [email protected]
24+
Technical Contact
25+
NIC Handle (if known)......:
26+
(I)ndividual (R)ole........:
27+
Name (Last, First).........: Administrator, DNS
28+
Organization Name..........: CSC Corporate Domains, Inc.
29+
Street Address.............: 2711 Centerville Rd.
30+
City.......................: Wilmington
31+
State......................: DE
32+
Postal Code................: 19808
33+
Country....................: United States
34+
Phone Number...............: +1.3026365400
35+
Fax Number.................: +1.3026365454
36+
E-Mailbox..................: [email protected]
37+
Billing Contact
38+
NIC Handle (if known)......:
39+
(I)ndividual (R)ole........:
40+
Name (Last, First).........: Billing, ccTLD
41+
Organization Name..........: Corporation Service Company
42+
Street Address.............: 2711 Centerville Road Suite 400
43+
City.......................: Wilmington
44+
State......................: DE
45+
Postal Code................: 19808
46+
Country....................: United States
47+
Phone Number...............: +1.3026365400
48+
Fax Number.................: +1.3026365454
49+
E-Mailbox..................: [email protected]
50+
Nameservers
51+
Primary Server Hostname....: nserver.apple.com
52+
Primary Server Netaddress..:
53+
Secondary Server Hostname..: nserver2.apple.com
54+
Secondary Server Netaddress:
55+
Third Server Hostname......: nserver3.apple.com
56+
Fourth Server Hostname.....: nserver4.apple.com
57+
58+
59+
Last paid by: [email protected]
60+

0 commit comments

Comments
 (0)