Skip to content

Commit 9203d83

Browse files
committed
Benchmarking and optimizations
1 parent 672f649 commit 9203d83

File tree

3 files changed: 58 additions and 6 deletions.

pwhois

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -101,7 +101,12 @@ else:
101101
if key in contact_data and contact_data[key] is not None:
102102
label = " " + value + (" " * (widest_label - len(value))) + " :"
103103
if sys.version_info < (3, 0):
104-
actual_data = unicode(contact_data[key])
104+
if type(contact_data[key]) == str:
105+
actual_data = contact_data[key].decode("utf-8")
106+
elif type(contact_data[key]) == datetime.datetime:
107+
actual_data = unicode(contact_data[key])
108+
else:
109+
actual_data = contact_data[key]
105110
else:
106111
actual_data = str(contact_data[key])
107112
if "\n" in actual_data: # Indent multi-line values properly

pythonwhois/parse.py

Lines changed: 31 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -48,6 +48,9 @@ def read_dataset(filename, destination, abbrev_key, name_key, is_dict=False):
4848
read_dataset("states_us.dat", states_us, "abbreviation", "name", is_dict=True)
4949
read_dataset("states_ca.dat", states_ca, "abbreviation", "name", is_dict=True)
5050

51+
def precompile_regexes(source, flags=0):
52+
return [re.compile(regex, flags) for regex in source]
53+
5154
grammar = {
5255
"_data": {
5356
'id': ['Domain ID:[ ]*(?P<val>.+)'],
@@ -389,6 +392,30 @@ def preprocess_regex(regex):
389392
r"\ss\.?a\.?r\.?l\.?($|\s)",
390393
)
391394

395+
grammar["_data"]["id"] = precompile_regexes(grammar["_data"]["id"], re.IGNORECASE)
396+
grammar["_data"]["status"] = precompile_regexes(grammar["_data"]["status"], re.IGNORECASE)
397+
grammar["_data"]["creation_date"] = precompile_regexes(grammar["_data"]["creation_date"], re.IGNORECASE)
398+
grammar["_data"]["expiration_date"] = precompile_regexes(grammar["_data"]["expiration_date"], re.IGNORECASE)
399+
grammar["_data"]["updated_date"] = precompile_regexes(grammar["_data"]["updated_date"], re.IGNORECASE)
400+
grammar["_data"]["registrar"] = precompile_regexes(grammar["_data"]["registrar"], re.IGNORECASE)
401+
grammar["_data"]["whois_server"] = precompile_regexes(grammar["_data"]["whois_server"], re.IGNORECASE)
402+
grammar["_data"]["nameservers"] = precompile_regexes(grammar["_data"]["nameservers"], re.IGNORECASE)
403+
grammar["_data"]["emails"] = precompile_regexes(grammar["_data"]["emails"], re.IGNORECASE)
404+
405+
grammar["_dateformats"] = precompile_regexes(grammar["_dateformats"], re.IGNORECASE)
406+
407+
registrant_regexes = precompile_regexes(registrant_regexes)
408+
tech_contact_regexes = precompile_regexes(tech_contact_regexes)
409+
billing_contact_regexes = precompile_regexes(billing_contact_regexes)
410+
admin_contact_regexes = precompile_regexes(admin_contact_regexes)
411+
nic_contact_regexes = precompile_regexes(nic_contact_regexes)
412+
organization_regexes = precompile_regexes(organization_regexes, re.IGNORECASE)
413+
414+
nic_contact_references["registrant"] = precompile_regexes(nic_contact_references["registrant"])
415+
nic_contact_references["tech"] = precompile_regexes(nic_contact_references["tech"])
416+
nic_contact_references["admin"] = precompile_regexes(nic_contact_references["admin"])
417+
nic_contact_references["billing"] = precompile_regexes(nic_contact_references["billing"])
418+
392419
if sys.version_info < (3, 0):
393420
def is_string(data):
394421
"""Test for string with support for python 2."""
@@ -409,7 +436,7 @@ def parse_raw_whois(raw_data, normalized=[], never_query_handles=True, handle_se
409436
if (rule_key in data) == False:
410437
for line in segment.splitlines():
411438
for regex in rule_regexes:
412-
result = re.search(regex, line, re.IGNORECASE)
439+
result = re.search(regex, line)
413440

414441
if result is not None:
415442
val = result.group("val").strip()
@@ -634,7 +661,7 @@ def normalize_data(data, normalized):
634661
new_lines = []
635662
for i, line in enumerate(lines):
636663
for regex in organization_regexes:
637-
if re.search(regex, line, re.IGNORECASE):
664+
if re.search(regex, line):
638665
new_lines.append(line)
639666
del lines[i]
640667
break
@@ -650,7 +677,7 @@ def normalize_data(data, normalized):
650677
lines = [x.strip() for x in contact["street"].splitlines()]
651678
if len(lines) > 1:
652679
for regex in organization_regexes:
653-
if re.search(regex, lines[0], re.IGNORECASE):
680+
if re.search(regex, lines[0]):
654681
contact["organization"] = lines[0]
655682
contact["street"] = "\n".join(lines[1:])
656683
break
@@ -714,7 +741,7 @@ def parse_dates(dates):
714741

715742
for date in dates:
716743
for rule in grammar['_dateformats']:
717-
result = re.match(rule, date, re.IGNORECASE)
744+
result = re.match(rule, date)
718745

719746
if result is not None:
720747
try:

test.py

Lines changed: 21 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
#!/usr/bin/env python2
22

3-
import sys, argparse, os, pythonwhois, json, datetime, codecs
3+
import sys, argparse, os, pythonwhois, json, datetime, codecs, time
44
import pkgutil
55
import encodings
66

@@ -94,6 +94,8 @@ def recursive_compare(obj1, obj2, chain=[]):
9494
targets.sort()
9595

9696
if args.mode[0] == "run":
97+
times_default = []
98+
times_normalized = []
9799
errors = False
98100
suites = []
99101
for target in targets:
@@ -134,7 +136,9 @@ def recursive_compare(obj1, obj2, chain=[]):
134136
total = len(suites) * 2
135137
for target, data, target_default, target_normalized in suites:
136138
for normalization in (True, []):
139+
start_time = time.time()
137140
parsed = pythonwhois.parse.parse_raw_whois(data, normalized=normalization)
141+
time_taken = (time.time() - start_time) * 1000 # in ms
138142
parsed = json.loads(encoded_json_dumps(parsed)) # Stupid Unicode hack
139143

140144
if normalization == True:
@@ -155,6 +159,10 @@ def recursive_compare(obj1, obj2, chain=[]):
155159
sys.stdout.write(OK)
156160
sys.stdout.write(progress_prefix + "%s passed in %s mode.\n" % (target, mode))
157161
sys.stderr.write(ENDC)
162+
if normalization == True:
163+
times_normalized.append(time_taken)
164+
else:
165+
times_default.append(time_taken)
158166
total_passed += 1
159167
else:
160168
sys.stderr.write(FAIL)
@@ -169,6 +177,18 @@ def recursive_compare(obj1, obj2, chain=[]):
169177
total_failed += 1
170178
done += 1
171179

180+
if len(times_default) > 0:
181+
average_default = int(sum(times_default) / float(len(times_default)))
182+
min_default = min(times_default)
183+
max_default = max(times_default)
184+
sys.stdout.write("Timing in default mode: %dms avg, %dms min, %dms max\n" % (average_default, min_default, max_default))
185+
186+
if len(times_normalized) > 0:
187+
average_normalized = int(sum(times_normalized) / float(len(times_normalized)))
188+
min_normalized = min(times_normalized)
189+
max_normalized = max(times_normalized)
190+
sys.stdout.write("Timing in normalized mode: %dms avg, %dms min, %dms max\n" % (average_normalized, min_normalized, max_normalized))
191+
172192
if total_failed == 0:
173193
sys.stdout.write(OK)
174194
sys.stdout.write("All tests passed!\n")

0 commit comments

Comments
 (0)