@@ -317,7 +317,7 @@ def allow_trailing_comma_dict(regexes):
317317 " Technical Contact:\n (?P<name>.+) (?P<email>.+)\n (?P<phone>.*)\n (?P<fax>.*)\n " , # .com.tw (Western registrars)
318318 "Technical Contact Information:\n \n (?:Given name: (?P<firstname>.+)\n )?(?:Family name: (?P<lastname>.+)\n )?(?:Company name: (?P<organization>.+)\n )?Address: (?P<street>.+)\n Country: (?P<country>.+)\n Phone: (?P<phone>.*)\n Fax: (?P<fax>.*)\n Email: (?P<email>.+)\n (?:Account Name: (?P<handle>.+)\n )?" , # HKDNR (.hk)
319319 "TECH ID:(?P<handle>.+)\n TECH Name:(?P<name>.*)\n (?:TECH Organization:(?P<organization>.*)\n )?TECH Street1:(?P<street1>.+?)\n (?:TECH Street2:(?P<street2>.+?)\n (?:TECH Street3:(?P<street3>.+?)\n )?)?TECH City:(?P<city>.+)\n TECH State:(?P<state>.*)\n TECH Postal Code:(?P<postalcode>.+)\n TECH Country:(?P<country>[A-Z]+)\n TECH Phone:(?P<phone>.*?)\n TECH Fax:(?P<fax>.*)\n TECH Email:(?P<email>.+)\n " , # Realtime Register
320- "Technical Contact\n NIC Handle \(if known\)\.+:(?P<handle>.*)\n \(I\)ndividual \(R\)ole\.+:.* \n Name \(Last, First\)\.+:(?P<name>.*)\n Organization Name\.+:(?P<organization>.*)\n Street Address\.+:(?P<street1>.*)\n City\.+: (?P<city>.*)\n State\.+: (?P<state>.*)\n Postal Code\.+:(?P<postalcode>.*)\n Country\.+:(?P<country>.*)\n Phone Number\.+:(?P<phone>.*)\n Fax Number\.+:(?P<fax>.*)\n E-Mailbox\.+:(?P<email>.*)" , # .ai
320+ "Technical Contact\n NIC Handle \(if known\)\.+:(?P<handle>.*)\n \(I\)ndividual \(R\)ole\.+:(?P<role>.*) \n Name \(Last, First\)\.+:(?P<name>.*)\n Organization Name\.+:(?P<organization>.*)\n Street Address\.+:(?P<street1>.*)\n City\.+: (?P<city>.*)\n State\.+: (?P<state>.*)\n Postal Code\.+:(?P<postalcode>.*)\n Country\.+:(?P<country>.*)\n Phone Number\.+:(?P<phone>.*)\n Fax Number\.+:(?P<fax>.*)\n E-Mailbox\.+:(?P<email>.*)" , # .ai
321321]
322322
323323admin_contact_regexes = [
@@ -344,7 +344,7 @@ def allow_trailing_comma_dict(regexes):
344344 " Administrative Contact:\n (?P<name>.+) (?P<email>.+)\n (?P<phone>.*)\n (?P<fax>.*)\n " , # .com.tw (Western registrars)
345345 "Administrative Contact Information:\n \n (?:Given name: (?P<firstname>.+)\n )?(?:Family name: (?P<lastname>.+)\n )?(?:Company name: (?P<organization>.+)\n )?Address: (?P<street>.+)\n Country: (?P<country>.+)\n Phone: (?P<phone>.*)\n Fax: (?P<fax>.*)\n Email: (?P<email>.+)\n (?:Account Name: (?P<handle>.+)\n )?" , # HKDNR (.hk)
346346 "ADMIN ID:(?P<handle>.+)\n ADMIN Name:(?P<name>.*)\n (?:ADMIN Organization:(?P<organization>.*)\n )?ADMIN Street1:(?P<street1>.+?)\n (?:ADMIN Street2:(?P<street2>.+?)\n (?:ADMIN Street3:(?P<street3>.+?)\n )?)?ADMIN City:(?P<city>.+)\n ADMIN State:(?P<state>.*)\n ADMIN Postal Code:(?P<postalcode>.+)\n ADMIN Country:(?P<country>[A-Z]+)\n ADMIN Phone:(?P<phone>.*?)\n ADMIN Fax:(?P<fax>.*)\n ADMIN Email:(?P<email>.+)\n " , # Realtime Register
347- "Administrative Contact\n NIC Handle \(if known\)\.+:(?P<handle>.*)\n \(I\)ndividual \(R\)ole\.+:.* \n Name \(Last, First\)\.+:(?P<name>.*)\n Organization Name\.+:(?P<organization>.*)\n Street Address\.+:(?P<street1>.*)\n City\.+: (?P<city>.*)\n State\.+: (?P<state>.*)\n Postal Code\.+:(?P<postalcode>.*)\n Country\.+:(?P<country>.*)\n Phone Number\.+:(?P<phone>.*)\n Fax Number\.+:(?P<fax>.*)\n E-Mailbox\.+:(?P<email>.*)" , # .ai
347+ "Administrative Contact\n NIC Handle \(if known\)\.+:(?P<handle>.*)\n \(I\)ndividual \(R\)ole\.+:(?P<role>.*) \n Name \(Last, First\)\.+:(?P<name>.*)\n Organization Name\.+:(?P<organization>.*)\n Street Address\.+:(?P<street1>.*)\n City\.+: (?P<city>.*)\n State\.+: (?P<state>.*)\n Postal Code\.+:(?P<postalcode>.*)\n Country\.+:(?P<country>.*)\n Phone Number\.+:(?P<phone>.*)\n Fax Number\.+:(?P<fax>.*)\n E-Mailbox\.+:(?P<email>.*)" , # .ai
348348]
349349
350350billing_contact_regexes = [
@@ -362,7 +362,7 @@ def allow_trailing_comma_dict(regexes):
362362 "Billing Contact Information :[ ]*\n [ ]+(?P<firstname>.*)\n [ ]+(?P<lastname>.*)\n [ ]+(?P<organization>.*)\n [ ]+(?P<email>.*)\n [ ]+(?P<street>.*)\n [ ]+(?P<city>.*)\n [ ]+(?P<postalcode>.*)\n [ ]+(?P<phone>.*)\n [ ]+(?P<fax>.*)\n \n " , # GAL Communication
363363 "Billing Contact:\n Name: (?P<name>.+)\n City: (?P<city>.+)\n State: (?P<state>.+)\n Country: (?P<country>.+)\n " , # Akky (.com.mx)
364364 "BILLING ID:(?P<handle>.+)\n BILLING Name:(?P<name>.*)\n (?:BILLING Organization:(?P<organization>.*)\n )?BILLING Street1:(?P<street1>.+?)\n (?:BILLING Street2:(?P<street2>.+?)\n (?:BILLING Street3:(?P<street3>.+?)\n )?)?BILLING City:(?P<city>.+)\n BILLING State:(?P<state>.*)\n BILLING Postal Code:(?P<postalcode>.+)\n BILLING Country:(?P<country>[A-Z]+)\n BILLING Phone:(?P<phone>.*?)\n BILLING Fax:(?P<fax>.*)\n BILLING Email:(?P<email>.+)\n " , # Realtime Register
365- "Billing Contact\n NIC Handle \(if known\)\.+:(?P<handle>.*)\n \(I\)ndividual \(R\)ole\.+:.* \n Name \(Last, First\)\.+:(?P<name>.*)\n Organization Name\.+:(?P<organization>.*)\n Street Address\.+:(?P<street1>.*)\n City\.+: (?P<city>.*)\n State\.+: (?P<state>.*)\n Postal Code\.+:(?P<postalcode>.*)\n Country\.+:(?P<country>.*)\n Phone Number\.+:(?P<phone>.*)\n Fax Number\.+:(?P<fax>.*)\n E-Mailbox\.+:(?P<email>.*)" , # .ai
365+ "Billing Contact\n NIC Handle \(if known\)\.+:(?P<handle>.*)\n \(I\)ndividual \(R\)ole\.+:(?P<role>.*) \n Name \(Last, First\)\.+:(?P<name>.*)\n Organization Name\.+:(?P<organization>.*)\n Street Address\.+:(?P<street1>.*)\n City\.+: (?P<city>.*)\n State\.+: (?P<state>.*)\n Postal Code\.+:(?P<postalcode>.*)\n Country\.+:(?P<country>.*)\n Phone Number\.+:(?P<phone>.*)\n Fax Number\.+:(?P<fax>.*)\n E-Mailbox\.+:(?P<email>.*)" , # .ai
366366]
367367
368368# Some registries use NIC handle references instead of directly listing contacts...
@@ -433,17 +433,19 @@ def allow_trailing_comma_dict(regexes):
433433)
434434
# Patterns that mark a line as an organization name. Every pattern wraps one
# keyword so that it only matches as a whole word: preceded by start-of-string,
# whitespace or a comma, optionally followed by a period, and then followed by
# end-of-string, whitespace or a comma.
organization_regexes = tuple(
    r"(?:^|\s|,)" + keyword + r"\.?($|\s|,)"
    for keyword in (
        r"limited",
        r"holdings",
        r"(?:in)?corporat(?:ed?|ion)",
        r"company",
        r"operations",
        r"association",
        r"council",
        r"university",
        r"college",
        r"services?",
        r"cabinet",
        r"billing",
        r"administration",
    )
)
448450
449451known_abbreviations = allow_trailing_comma_dict ({
@@ -459,6 +461,10 @@ def allow_trailing_comma_dict(regexes):
459461 "d/b/a" : r"^(?:d\/b\/a|dba)$" ,
460462})
461463
# Patterns that mark a line as a role rather than a person or organization.
# The keyword must stand alone: bounded by start/end of string, whitespace or
# a comma, with an optional trailing period.
role_regexes = (r"(?:^|\s|,)administrator\.?($|\s|,)",)
467+
462468country_regexes = [r"(?:\s|,)" + dotify (country_code .upper ()) + r"($|\s)" for country_code in countries .keys ()]
463469
464470for key in ('id' , 'status' , 'creation_date' , 'expiration_date' , 'updated_date' , 'registrar' , 'whois_server' , 'nameservers' , 'emails' ):
@@ -475,6 +481,7 @@ def allow_trailing_comma_dict(regexes):
475481admin_contact_regexes = precompile_regexes (admin_contact_regexes )
476482nic_contact_regexes = precompile_regexes (nic_contact_regexes )
477483
484+ role_regexes = precompile_regexes (role_regexes , re .IGNORECASE )
478485organization_regexes = precompile_regexes (organization_regexes , re .IGNORECASE )
479486abbreviated_organization_regexes = precompile_regexes (abbreviated_organization_regexes , re .IGNORECASE )
480487country_regexes = precompile_regexes (country_regexes )
@@ -750,6 +757,11 @@ def normalize_data(data, normalized):
750757 is_organization = is_organization_name (line ) or is_fuzzy_duplicate (line , organization_lines )
751758
752759 if is_organization :
760+ if "," in line :
761+ name_words = re .split (name_separators , line )
762+ if is_full_incorporation_form (name_words [0 ]):
763+ line = reverse_name_comma (line )
764+
753765 new_organization_lines .append (line )
754766 del name_lines [i ]
755767
@@ -775,31 +787,66 @@ def normalize_data(data, normalized):
775787 elif 'organization' in contact :
776788 del contact ["organization" ]
777789
790+ new_roles = []
791+
778792 if 'name' in contact :
779793 # Check whether the name is reversed; first name last, last name first.
780794 names = contact ['name' ].splitlines ()
781795 unswapped_names = []
782796
783797 for name in names :
784798 if "," in name :
785- name_segments = [segment .strip () for segment in name .split ("," )]
786- first_segment = name_segments .pop ()
787- name = first_segment + " " + ', ' .join (name_segments )
799+ name = reverse_name_comma (name )
788800 else :
789801 # Split the name into normalized (ie. alpha-only) 'words' for comparison. We only care about ASCII, as our first-name
790802 # list currently only contains English names.
791803 name_words = [filter_characters (segment , non_name_characters ) for segment in name .split ()]
792804
793805 if len (name_words ) > 1 and is_first_name (name_words [- 1 ]) and not is_first_name (name_words [0 ]):
794806 # The last 'word' was in the common first names, but the first one was not. Likely swapped around.
795- name_segments = re .split (name_separators , name )
796- name_segments .insert (0 , name_segments .pop ())
797- name = ' ' .join (name_segments )
807+ name = reverse_name (name )
798808
799- unswapped_names .append (name )
809+ if is_role (name ):
810+ new_roles .append (name )
811+ else :
812+ unswapped_names .append (name )
800813
801- contact ['name' ] = "\n " .join (unswapped_names )
802-
814+ if len (unswapped_names ) > 0 :
815+ contact ['name' ] = "\n " .join (unswapped_names )
816+ else :
817+ del contact ['name' ]
818+
819+ if 'organization' in contact :
820+ organizations = contact ['organization' ].splitlines ()
821+ new_organizations = []
822+
823+ for organization in organizations :
824+ if is_role (organization ):
825+ new_roles .append (organization )
826+ else :
827+ new_organizations .append (organization )
828+
829+ if len (new_organizations ) > 0 :
830+ contact ['organization' ] = "\n " .join (new_organizations )
831+ else :
832+ del contact ['organization' ]
833+
834+ if 'street' in contact :
835+ streets = contact ['street' ].splitlines ()
836+
837+ if is_role (streets [0 ]):
838+ new_roles .append (streets [0 ])
839+ streets = streets [1 :]
840+
841+ contact ['street' ] = "\n " .join (streets )
842+
843+ if 'role' in contact :
844+ existing_roles = contact ['role' ].splitlines ()
845+ else :
846+ existing_roles = []
847+
848+ if len (new_roles ) > 0 :
849+ contact ['role' ] = "\n " .join (new_roles + existing_roles )
803850
804851 if "street" in contact :
805852 lines = [x .strip () for x in contact ["street" ].splitlines ()]
@@ -833,7 +880,7 @@ def normalize_data(data, normalized):
833880 if key in contact and contact [key ] is not None and (normalized == True or key in normalized ):
834881 contact [key ] = normalize_name (contact [key ], abbreviation_threshold = 3 )
835882
836- for key in ("city" , "organization" , "state" , "country" ):
883+ for key in ("role" , " city" , "organization" , "state" , "country" ):
837884 if key in contact and contact [key ] is not None and (normalized == True or key in normalized ):
838885 contact [key ] = normalize_name (contact [key ], abbreviation_threshold = 3 , length_threshold = 3 , check_known_incorrect = True )
839886
@@ -945,7 +992,10 @@ def is_full_incorporation_form(word):
945992 return match_regexes (word , organization_regexes )
946993
def is_abbreviated_incorporation_form(word):
    """Check `word` against the abbreviated organization patterns.

    Returns whatever `match_regexes` reports for `word` when tested against
    `abbreviated_organization_regexes`.
    """
    abbreviation_patterns = abbreviated_organization_regexes
    return match_regexes(word, abbreviation_patterns)
996+
def is_role(line):
    """Check `line` against the role patterns.

    Returns whatever `match_regexes` reports for `line` when tested against
    `role_regexes`.
    """
    role_patterns = role_regexes
    return match_regexes(line, role_patterns)
949999
def is_country(word):
    """Check `word` against the country patterns.

    Returns whatever `match_regexes` reports for `word` when tested against
    `country_regexes`.
    """
    country_patterns = country_regexes
    return match_regexes(word, country_patterns)
@@ -986,6 +1036,16 @@ def match_regexes_dict(string, regexes):
def capitalize_words(line):
    """Capitalize every space-separated word in `line`.

    The line is split on single space characters only, so runs of spaces are
    preserved in the result. Each word goes through `capitalize()`, which
    upper-cases its first character and lower-cases the rest.
    """
    capitalized = []
    for token in line.split(" "):
        capitalized.append(token.capitalize())
    return " ".join(capitalized)
9881038
def reverse_name(name):
    """Move the last segment of `name` to the front.

    `name` is split with the module-level `name_separators` pattern; the final
    segment is rotated to the first position and the segments are re-joined
    with single spaces.
    """
    parts = re.split(name_separators, name)
    # Rotate right by one: the last part first, then everything before it.
    rotated = parts[-1:] + parts[:-1]
    return ' '.join(rotated)
1043+
def reverse_name_comma(name):
    """Turn a comma-reversed name ("Last, First") into natural order.

    The name is split on commas and each segment is stripped. The final
    segment is moved to the front, followed by a space and the remaining
    segments re-joined with ", ".

    Empty segments (from leading, trailing or doubled commas) are discarded,
    so the result never carries a stray leading/trailing space or a dangling
    comma. A name with a single non-empty segment is returned as that segment;
    a name with no non-empty segments yields an empty string.
    """
    segments = [segment.strip() for segment in name.split(",")]
    # Drop blanks so inputs such as "Doe," or "Doe,, John" don't leak whitespace.
    segments = [segment for segment in segments if segment]

    if not segments:
        return ""

    first_segment = segments.pop()

    if not segments:
        return first_segment

    return first_segment + " " + ', '.join(segments)
1048+
9891049def normalize_word (word , abbreviation_threshold = 4 , lowercase_domains = True ):
9901050 if is_known_abbreviation (word ):
9911051 return get_known_abbreviation (word )
0 commit comments