4 changes: 2 additions & 2 deletions test_perf.sh
@@ -3,10 +3,10 @@
CURDIR=$(pwd)
BASEDIR=$(readlink -f $(dirname $0 ))

echo "Use vbench to compare HEAD against a known-good baseline."
echo "Use vbench to compare the performance of one commit against another."
echo "Make sure the python 'vbench' library is installed..\n"

cd "$BASEDIR/vb_suite/"
python test_perf.py
python test_perf.py "$@"

cd "$CURDIR"
142 changes: 114 additions & 28 deletions vb_suite/test_perf.py
@@ -1,4 +1,4 @@
#!/usr/bin/python
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
@@ -27,21 +27,38 @@
everything and calculate a ratio for the timing information.
7) print the results to the log file and to stdout.

Known Issues: vbench fails to locate a baseline if HEAD is not a descendant of the baseline commit.
"""
import sys
import shutil

from pandas import *
from vbench.api import BenchmarkRunner
from vbench.db import BenchmarkDB
from vbench.git import GitRepo
import shutil
import os
import argparse
import tempfile

from suite import *

BASELINE_COMMIT = 'bdbca8e' # v0.9.1 + regression fix
LOG_FILE = os.path.abspath(os.path.join(REPO_PATH, 'vb_suite.log'))
from pandas import DataFrame

DEFAULT_MIN_DURATION = 0.01
BASELINE_COMMIT = 'bdbca8e3dc' # v0.9.1 + regression fix # TODO: detect upstream/master

parser = argparse.ArgumentParser(description='Use vbench to generate a report comparing performance between two commits.')
parser.add_argument('-a', '--auto',
help='Execute a run using the defaults for the base and target commits.',
action='store_true',
default=False)
parser.add_argument('-b','--base-commit',
help='The commit serving as performance baseline (default: %s).' % BASELINE_COMMIT,
type=str)
parser.add_argument('-t','--target-commit',
help='The commit to compare against the baseline (default: HEAD).',
type=str)
parser.add_argument('-m', '--min-duration',
help='Minimum duration (in ms) of baseline test for inclusion in report (default: %.3f).' % DEFAULT_MIN_DURATION,
type=float,
default=DEFAULT_MIN_DURATION)
parser.add_argument('-o', '--output',
metavar="<file>",
dest='log_file',
help='path of file in which to save the report (default: vb_suite.log).')
args = parser.parse_args()
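
# For illustration (hypothetical invocation; the target hash below is a placeholder):
#
#   python test_perf.py -b bdbca8e3dc -t <target-sha> -m 0.05 -o vb_report.log
#
# yields args.base_commit='bdbca8e3dc', args.target_commit='<target-sha>',
# args.min_duration=0.05 and args.log_file='vb_report.log'; any option left
# out falls back to the defaults filled in at the top of main() below.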

def get_results_df(db,rev):
"""Takes a git commit hash and returns a Dataframe of benchmark results
@@ -59,44 +76,68 @@ def prprint(s):
print("*** %s"%s)

def main():
from vbench.api import BenchmarkRunner
from vbench.db import BenchmarkDB
from suite import REPO_PATH, BUILD, DB_PATH, PREPARE, dependencies, benchmarks

if not args.base_commit:
args.base_commit = BASELINE_COMMIT

# GitRepo wants exactly 7 character hash?
args.base_commit = args.base_commit[:7]
if args.target_commit:
args.target_commit = args.target_commit[:7]

if not args.log_file:
args.log_file = os.path.abspath(os.path.join(REPO_PATH, 'vb_suite.log'))

TMP_DIR = tempfile.mkdtemp()
prprint("TMP_DIR = %s" % TMP_DIR)
prprint("LOG_FILE = %s\n" % LOG_FILE)
prprint("LOG_FILE = %s\n" % args.log_file)

try:
logfile = open(LOG_FILE, 'w')
logfile = open(args.log_file, 'w')

prprint( "Processing Repo at '%s'..." % REPO_PATH)
repo = GitRepo(REPO_PATH)

# get hashes of baseline and current head
h_head = repo.shas[-1]
h_baseline = BASELINE_COMMIT

prprint( "Opening DB at '%s'...\n" % DB_PATH)
db = BenchmarkDB(DB_PATH)

prprint( 'Comparing Head [%s] : %s ' % (h_head, repo.messages.get(h_head,"")))
prprint( 'Against baseline [%s] : %s \n' % (h_baseline,
repo.messages.get(h_baseline,"")))

prprint("Initializing Runner...")
runner = BenchmarkRunner(benchmarks, REPO_PATH, REPO_PATH, BUILD, DB_PATH,
TMP_DIR, PREPARE, always_clean=True,
# run_option='eod', start_date=START_DATE,
module_dependencies=dependencies)

repo = runner.repo #(steal the parsed git repo used by runner)

# ARGH. reparse the repo, not discarding any commits,
# and overwrite the previous parse results
#prprint ("Slaughtering kittens..." )
(repo.shas, repo.messages,
repo.timestamps, repo.authors) = _parse_commit_log(REPO_PATH)

h_head = args.target_commit or repo.shas[-1]
h_baseline = args.base_commit

prprint('Target [%s] : %s\n' % (h_head, repo.messages.get(h_head,"")))
prprint('Baseline [%s] : %s\n' % (h_baseline,repo.messages.get(h_baseline,"")))


prprint ("removing any previous measurements for the commits." )
db.delete_rev_results(h_baseline)
db.delete_rev_results(h_head)

# TODO: we could skip this, but we need to make sure all
# results are in the DB, which is a little tricky with
# start dates and so on.
prprint( "Running benchmarks for baseline commit '%s'" % h_baseline)
prprint( "Running benchmarks for baseline [%s]" % h_baseline)
runner._run_and_write_results(h_baseline)

prprint ("Running benchmarks for current HEAD '%s'" % h_head)
prprint ("Running benchmarks for target [%s]" % h_head)
runner._run_and_write_results(h_head)

prprint( 'Processing results...')
@@ -108,26 +149,71 @@ def main():
t_baseline=baseline_res['timing'],
ratio=ratio,
name=baseline_res.name),columns=["t_head","t_baseline","ratio","name"])
totals = totals.ix[totals.t_head > 0.010] # ignore sub 10micros
totals = totals.ix[totals.t_head > args.min_duration] # ignore below threshold
totals = totals.dropna().sort("ratio").set_index('name') # sort in ascending order

s = "\n\nResults:\n" + totals.to_string(float_format=lambda x: "%0.4f" %x) + "\n\n"
s += "Columns: test_name | head_time [ms] | baseline_time [ms] | ratio\n\n"
s += "- a Ratio of 1.30 means HEAD is 30% slower then the Baseline.\n\n"
s += "Columns: test_name | target_duration [ms] | baseline_duration [ms] | ratio\n\n"
s += "- a Ratio of 1.30 means the target commit is 30% slower then the baseline.\n\n"

s += 'Head [%s] : %s\n' % (h_head, repo.messages.get(h_head,""))
s += 'Target [%s] : %s\n' % (h_head, repo.messages.get(h_head,""))
s += 'Baseline [%s] : %s\n\n' % (h_baseline,repo.messages.get(h_baseline,""))
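
# Worked example (assuming ratio = t_head / t_baseline, as the wording above
# implies): a target time of 13.0 ms against a baseline of 10.0 ms gives
# ratio = 13.0 / 10.0 = 1.30, i.e. the target commit is 30% slower; a ratio
# of 0.80 would mean the target commit is 20% faster than the baseline.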

logfile.write(s)
logfile.close()

prprint(s)
prprint("Results were also written to the logfile at '%s'\n" % LOG_FILE)
prprint("Results were also written to the logfile at '%s'\n" % args.log_file)

finally:
# print("Disposing of TMP_DIR: %s" % TMP_DIR)
shutil.rmtree(TMP_DIR)
logfile.close()


# hack: vbench.git ignores some commits, but we
# need to be able to reference any commit.
# modified from vbench.git
def _parse_commit_log(repo_path):
from vbench.git import parser, _convert_timezones
from pandas import Series
git_cmd = 'git --git-dir=%s/.git --work-tree=%s ' % (repo_path, repo_path)
githist = git_cmd + ('log --graph --pretty=format:'
'\"::%h::%cd::%s::%an\" > githist.txt')
os.system(githist)
githist = open('githist.txt').read()
os.remove('githist.txt')

shas = []
timestamps = []
messages = []
authors = []
for line in githist.split('\n'):
if '*' not in line.split("::")[0]: # skip non-commit lines
continue

_, sha, stamp, message, author = line.split('::', 4)

# parse timestamp into datetime object
stamp = parser.parse(stamp)

shas.append(sha)
timestamps.append(stamp)
messages.append(message)
authors.append(author)

# to UTC for now
timestamps = _convert_timezones(timestamps)

shas = Series(shas, timestamps)
messages = Series(messages, shas)
timestamps = Series(timestamps, shas)
authors = Series(authors, shas)
return shas[::-1], messages[::-1], timestamps[::-1], authors[::-1]


if __name__ == '__main__':
main()
if not args.auto and not args.base_commit and not args.target_commit:
parser.print_help()
else:
main()