Skip to content

Commit 5718b76

Browse files
committed
[process-stats-dir] Support (re)setting & comparing to CSV baselines.
1 parent 043918f commit 5718b76

File tree

1 file changed

+127
-34
lines changed

1 file changed

+127
-34
lines changed

utils/process-stats-dir.py

Lines changed: 127 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -294,40 +294,127 @@ def show_incrementality(args):
294294
incrementality=pct))
295295

296296

297-
def compare_frontend_stats(args):
298-
assert(len(args.remainder) == 2)
299-
(olddir, newdir) = args.remainder
297+
def diff_and_pct(old, new):
    """Return (delta, delta_pct) describing the change from old to new.

    delta_pct is rounded to two decimal places. A counter that springs
    into existence (old == 0, new != 0) is reported as a 100% change,
    which also guards the division below against a zero denominator.
    """
    if old == 0:
        return (0, 0.0) if new == 0 else (new, 100.0)
    delta = new - old
    delta_pct = round((float(delta) / float(old)) * 100.0, 2)
    return (delta, delta_pct)
306+
307+
308+
def update_epoch_value(d, name, epoch, value):
    """Store (epoch, value) for name in dict d, resolving conflicts.

    A pre-existing entry with a strictly newer epoch wins outright; an
    entry with an equal value keeps its original epoch; otherwise the
    incoming value replaces the old one and the change is counted.
    Returns the (epoch, value, changed) triple actually stored, where
    changed is 1 if the stored value differs from the previous one.
    """
    changed = 0
    existing = d.get(name)
    if existing is not None:
        (existing_epoch, existing_value) = existing
        if existing_epoch > epoch:
            # The table already holds a more recent measurement; keep it.
            print("note: keeping newer value %d from epoch %d for %s"
                  % (existing_value, existing_epoch, name))
            (epoch, value) = existing
        elif existing_value == value:
            # Same value: preserve the original reset timestamp.
            epoch = existing_epoch
        else:
            (_, delta_pct) = diff_and_pct(existing_value, value)
            print ("note: changing value %d -> %d (%.2f%%) for %s" %
                   (existing_value, value, delta_pct, name))
            changed = 1
    d[name] = (epoch, value)
    return (epoch, value, changed)
326+
327+
328+
def read_stats_dict_from_csv(f):
    """Read a tab-separated (epoch, name, value) CSV from file object f.

    Returns a dict mapping name -> (epoch, value). Duplicate names are
    resolved through update_epoch_value, so the entry with the newest
    epoch wins (this is what makes union-merged baselines harmless).
    """
    reader = csv.DictReader(f, ["epoch", "name", "value"],
                            dialect='excel-tab',
                            quoting=csv.QUOTE_NONNUMERIC)
    d = {}
    for row in reader:
        update_epoch_value(d, row["name"],
                           int(row["epoch"]), int(row["value"]))
    return d
340+
341+
342+
# The idea here is that a "baseline" is a (tab-separated) CSV file full of
# the counters you want to track, each prefixed by an epoch timestamp of
# the last time the value was reset.
#
# When you set a fresh baseline, all stats in the provided stats dir are
# written to the baseline. When you set against an _existing_ baseline,
# only the counters mentioned in the existing baseline are updated, and
# only if their values differ.
#
# Finally, since it's a line-oriented CSV file, you can put:
#
#    mybaseline.csv merge=union
#
# in your .gitattributes file, and forget about merge conflicts. The reader
# function above will take the later epoch anytime it detects duplicates,
# so union-merging is harmless. Duplicates will be eliminated whenever the
# next baseline-set is done.
def set_csv_baseline(args):
    """Create or update the CSV baseline named by args.set_csv_baseline
    from the stats dirs listed in args.remainder.

    Returns 0 so the result can be used directly as an exit status.
    """
    existing = None
    if os.path.exists(args.set_csv_baseline):
        with open(args.set_csv_baseline, "r") as f:
            existing = read_stats_dict_from_csv(f)
            print ("updating %d baseline entries in %s" %
                   (len(existing), args.set_csv_baseline))
    else:
        # Parenthesized (rather than a bare py2 print statement) for
        # consistency with the other prints and python3 compatibility.
        print("making new baseline " + args.set_csv_baseline)
    fieldnames = ["epoch", "name", "value"]
    with open(args.set_csv_baseline, "wb") as f:
        out = csv.DictWriter(f, fieldnames, dialect='excel-tab',
                             quoting=csv.QUOTE_NONNUMERIC)
        m = merge_all_jobstats([s for d in args.remainder
                                for s in load_stats_dir(d)])
        changed = 0
        newepoch = int(time.time())
        for name in sorted(m.stats.keys()):
            epoch = newepoch
            value = m.stats[name]
            # When updating an existing baseline, only touch counters it
            # already mentions, and only when their values changed.
            if existing is not None:
                if name not in existing:
                    continue
                (epoch, value, chg) = update_epoch_value(existing, name,
                                                         epoch, value)
                changed += chg
            out.writerow(dict(epoch=int(epoch),
                              name=name,
                              value=int(value)))
    if existing is not None:
        print("changed %d entries in baseline" % changed)
    return 0
391+
392+
393+
def compare_to_csv_baseline(args):
    """Compare merged stats from the dirs in args.remainder against the
    CSV baseline file object args.compare_to_csv_baseline, writing a
    tab-separated delta report to args.output.

    Returns the number of counters that regressed (i.e. grew).
    """
    old_stats = read_stats_dict_from_csv(args.compare_to_csv_baseline)
    merged = merge_all_jobstats([s for d in args.remainder
                                 for s in load_stats_dir(d)])
    new_stats = merged.stats

    outfieldnames = ["old", "new", "delta_pct", "name"]
    out = csv.DictWriter(args.output, outfieldnames, dialect='excel-tab')
    out.writeheader()

    regressions = 0
    for stat_name, (_, old) in sorted(old_stats.items()):
        new = new_stats.get(stat_name, 0)
        (delta, delta_pct) = diff_and_pct(old, new)
        # Suppress sub-threshold noise: tiny absolute deltas on timers,
        # and tiny relative deltas on everything.
        is_timer = stat_name.startswith("time.")
        if is_timer and abs(delta) < args.delta_usec_thresh:
            continue
        if abs(delta_pct) < args.delta_pct_thresh:
            continue
        out.writerow(dict(name=stat_name,
                          old=int(old), new=int(new),
                          delta_pct=delta_pct))
        if delta > 0:
            regressions += 1
    return regressions
332419

333420

@@ -364,8 +451,12 @@ def main():
364451
help="emit a 'catapult'-compatible trace of events")
365452
modes.add_argument("--incrementality", action="store_true",
366453
help="summarize the 'incrementality' of a build")
367-
modes.add_argument("--compare-frontend-stats", action="store_true",
368-
help="Compare frontend stats from two stats-dirs")
454+
modes.add_argument("--set-csv-baseline", type=str, default=None,
455+
help="Merge stats from a stats-dir into a CSV baseline")
456+
modes.add_argument("--compare-to-csv-baseline",
457+
type=argparse.FileType('rb', 0),
458+
metavar="BASELINE.csv",
459+
help="Compare stats dir to named CSV baseline")
369460
modes.add_argument("--lnt", action="store_true",
370461
help="Emit an LNT-compatible test summary")
371462
parser.add_argument('remainder', nargs=argparse.REMAINDER,
@@ -377,8 +468,10 @@ def main():
377468
return 1
378469
if args.catapult:
379470
write_catapult_trace(args)
380-
elif args.compare_frontend_stats:
381-
return compare_frontend_stats(args)
471+
elif args.set_csv_baseline is not None:
472+
return set_csv_baseline(args)
473+
elif args.compare_to_csv_baseline:
474+
return compare_to_csv_baseline(args)
382475
elif args.incrementality:
383476
if args.paired:
384477
show_paired_incrementality(args)

0 commit comments

Comments
 (0)