Benchmarking and optimizations

joepie91 · joepie91 · commit 9203d83c0358 · 2014-06-28T17:03:43.000+02:00
diff --git a/pwhois b/pwhois
@@ -101,7 +101,12 @@ else:
 						if key in contact_data and contact_data[key] is not None:
 							label = "    " + value + (" " * (widest_label - len(value))) + " :"
 							if sys.version_info < (3, 0):
-								actual_data = unicode(contact_data[key])
+								if type(contact_data[key]) == str:
+									actual_data = contact_data[key].decode("utf-8")
+								elif type(contact_data[key]) == datetime.datetime:
+									actual_data = unicode(contact_data[key])
+								else:
+									actual_data = contact_data[key]
 							else:
 								actual_data = str(contact_data[key])
 							if "\n" in actual_data: # Indent multi-line values properly
diff --git a/pythonwhois/parse.py b/pythonwhois/parse.py
@@ -48,6 +48,9 @@ def read_dataset(filename, destination, abbrev_key, name_key, is_dict=False):
 read_dataset("states_us.dat", states_us, "abbreviation", "name", is_dict=True)
 read_dataset("states_ca.dat", states_ca, "abbreviation", "name", is_dict=True)
 
+def precompile_regexes(source, flags=0): # Compiles every pattern in `source` with `flags`; builds a new list and leaves `source` unmodified
+	return [re.compile(regex, flags) for regex in source] # One compiled pattern object per entry, in the same order as `source`
+	
 grammar = {
 	"_data": {
 		'id':			['Domain ID:[ ]*(?P<val>.+)'],
@@ -389,6 +392,30 @@ def preprocess_regex(regex):
 	r"\ss\.?a\.?r\.?l\.?($|\s)",
 )
 
+grammar["_data"]["id"] = precompile_regexes(grammar["_data"]["id"], re.IGNORECASE) # Each grammar["_data"] list is replaced in place by its compiled, case-insensitive equivalent
+grammar["_data"]["status"] = precompile_regexes(grammar["_data"]["status"], re.IGNORECASE)
+grammar["_data"]["creation_date"] = precompile_regexes(grammar["_data"]["creation_date"], re.IGNORECASE)
+grammar["_data"]["expiration_date"] = precompile_regexes(grammar["_data"]["expiration_date"], re.IGNORECASE)
+grammar["_data"]["updated_date"] = precompile_regexes(grammar["_data"]["updated_date"], re.IGNORECASE)
+grammar["_data"]["registrar"] = precompile_regexes(grammar["_data"]["registrar"], re.IGNORECASE)
+grammar["_data"]["whois_server"] = precompile_regexes(grammar["_data"]["whois_server"], re.IGNORECASE)
+grammar["_data"]["nameservers"] = precompile_regexes(grammar["_data"]["nameservers"], re.IGNORECASE)
+grammar["_data"]["emails"] = precompile_regexes(grammar["_data"]["emails"], re.IGNORECASE)
+
+grammar["_dateformats"] = precompile_regexes(grammar["_dateformats"], re.IGNORECASE) # IGNORECASE is baked in here; parse_dates() matches these rules without passing flags
+
+registrant_regexes = precompile_regexes(registrant_regexes) # No flags argument: the contact patterns are compiled with the default flags=0 (no IGNORECASE)
+tech_contact_regexes = precompile_regexes(tech_contact_regexes)
+billing_contact_regexes = precompile_regexes(billing_contact_regexes)
+admin_contact_regexes = precompile_regexes(admin_contact_regexes)
+nic_contact_regexes = precompile_regexes(nic_contact_regexes)
+organization_regexes = precompile_regexes(organization_regexes, re.IGNORECASE) # IGNORECASE is baked in here; normalize_data() searches with these without passing flags
+
+nic_contact_references["registrant"] = precompile_regexes(nic_contact_references["registrant"]) # Per-role reference patterns, also compiled with the default flags=0
+nic_contact_references["tech"] = precompile_regexes(nic_contact_references["tech"])
+nic_contact_references["admin"] = precompile_regexes(nic_contact_references["admin"])
+nic_contact_references["billing"] = precompile_regexes(nic_contact_references["billing"])
+
 if sys.version_info < (3, 0):
 	def is_string(data):
 		"""Test for string with support for python 2."""
@@ -409,7 +436,7 @@ def parse_raw_whois(raw_data, normalized=[], never_query_handles=True, handle_se
 			if (rule_key in data) == False:
 				for line in segment.splitlines():
 					for regex in rule_regexes:
-						result = re.search(regex, line, re.IGNORECASE)
+						result = re.search(regex, line)
 
 						if result is not None:
 							val = result.group("val").strip()
@@ -634,7 +661,7 @@ def normalize_data(data, normalized):
 				new_lines = []
 				for i, line in enumerate(lines):
 					for regex in organization_regexes:
-						if re.search(regex, line, re.IGNORECASE):
+						if re.search(regex, line):
 							new_lines.append(line)
 							del lines[i]
 							break
@@ -650,7 +677,7 @@ def normalize_data(data, normalized):
 				lines = [x.strip() for x in contact["street"].splitlines()]
 				if len(lines) > 1:
 					for regex in organization_regexes:
-						if re.search(regex, lines[0], re.IGNORECASE):
+						if re.search(regex, lines[0]):
 							contact["organization"] = lines[0]
 							contact["street"] = "\n".join(lines[1:])
 							break
@@ -714,7 +741,7 @@ def parse_dates(dates):
 
 	for date in dates:
 		for rule in grammar['_dateformats']:
-			result = re.match(rule, date, re.IGNORECASE)
+			result = re.match(rule, date)
 
 			if result is not None:
 				try:
diff --git a/test.py b/test.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python2
 
-import sys, argparse, os, pythonwhois, json, datetime, codecs
+import sys, argparse, os, pythonwhois, json, datetime, codecs, time
 import pkgutil
 import encodings
 
@@ -94,6 +94,8 @@ def recursive_compare(obj1, obj2, chain=[]):
 targets.sort()
 
 if args.mode[0] == "run":
+	times_default = []
+	times_normalized = []
 	errors = False
 	suites = []
 	for target in targets:
@@ -134,7 +136,9 @@ def recursive_compare(obj1, obj2, chain=[]):
 	total = len(suites) * 2
 	for target, data, target_default, target_normalized in suites:
 		for normalization in (True, []):
+			start_time = time.time()
 			parsed = pythonwhois.parse.parse_raw_whois(data, normalized=normalization)
+			time_taken = (time.time() - start_time) * 1000 # in ms
 			parsed = json.loads(encoded_json_dumps(parsed)) # Stupid Unicode hack
 			
 			if normalization == True:
@@ -155,6 +159,10 @@ def recursive_compare(obj1, obj2, chain=[]):
 				sys.stdout.write(OK)
 				sys.stdout.write(progress_prefix + "%s passed in %s mode.\n" % (target, mode))
 				sys.stderr.write(ENDC)
+				if normalization == True:
+					times_normalized.append(time_taken)
+				else:
+					times_default.append(time_taken)
 				total_passed += 1
 			else:
 				sys.stderr.write(FAIL)
@@ -169,6 +177,18 @@ def recursive_compare(obj1, obj2, chain=[]):
 				total_failed += 1
 			done += 1
 		
+	if len(times_default) > 0:
+		average_default = int(sum(times_default) / float(len(times_default)))
+		min_default = min(times_default)
+		max_default = max(times_default)
+		sys.stdout.write("Timing in default mode: %dms avg, %dms min, %dms max\n" % (average_default, min_default, max_default))
+		
+	if len(times_normalized) > 0:
+		average_normalized = int(sum(times_normalized) / float(len(times_normalized)))
+		min_normalized = min(times_normalized)
+		max_normalized = max(times_normalized)
+		sys.stdout.write("Timing in normalized mode: %dms avg, %dms min, %dms max\n" % (average_normalized, min_normalized, max_normalized))
+		
 	if total_failed == 0:
 		sys.stdout.write(OK)
 		sys.stdout.write("All tests passed!\n")