nvbench/python/scripts/nvbench_compare.py at f77d001206a94a0fc7e57e155a4a30f0806fac8c · NVIDIA/nvbench · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
#!/usr/bin/env python

import argparse
import math
import os
import sys

import jsondiff
import tabulate
from colorama import Fore

try:
    from nvbench_json import reader
except ImportError:
    from scripts.nvbench_json import reader


def version_tuple(v):
    """Parse a dotted version string into a tuple of ints.

    "x.y.z" -> (x, y, z). Only the leading numeric release segment is
    used: parsing stops at the first component that is not purely decimal
    digits, keeping any digits that start that component. This lets
    pre-release and development versions be compared instead of raising
    ValueError, e.g. "0.9.0rc1" -> (0, 9, 0) and
    "0.9.1.dev5+gabc" -> (0, 9, 1).

    Args:
        v: Version string.

    Returns:
        Tuple of ints; empty if ``v`` does not start with a digit.
    """
    digits = "0123456789"
    parts = []
    for piece in v.split("."):
        piece = piece.strip()
        end = 0
        while end < len(piece) and piece[end] in digits:
            end += 1
        if end == 0:
            # Component such as "dev5": no numeric prefix, release part is over.
            break
        parts.append(int(piece[:end]))
        if end < len(piece):
            # Component such as "0rc1": keep the 0, ignore the suffix and the rest.
            break
    return tuple(parts)


# Installed tabulate version as a tuple of ints; compare_benches() checks it
# against (0, 8, 3) to decide which tabulate options it may pass.
tabulate_version = version_tuple(tabulate.__version__)

# "devices" sections of the reference / compare files currently being
# processed. main() reassigns both for every file pair; compare_benches()
# reads them to resolve device ids into device entries.
all_ref_devices = []
all_cmp_devices = []
# Running totals across all compared files: incremented by compare_benches()
# and printed in the summary by main().
config_count = 0
unknown_count = 0
failure_count = 0
pass_count = 0


def find_matching_bench(needle, haystack):
    """Return the first entry of ``haystack`` whose "name" equals ``needle``'s.

    Returns None when no entry matches.
    """
    candidates = (bench for bench in haystack if bench["name"] == needle["name"])
    return next(candidates, None)


def find_device_by_id(device_id, all_devices):
    """Return the first entry of ``all_devices`` whose "id" equals ``device_id``.

    Returns None when no entry matches.
    """
    matches = (entry for entry in all_devices if entry["id"] == device_id)
    return next(matches, None)


def format_int64_axis_value(axis_name, axis_value, axes):
    """Render an int64 axis value as text.

    The axis named ``axis_name`` is looked up in ``axes``. If its "flags"
    entry is "pow2" the value is shown as ``2^k`` (k = log2 of the value,
    truncated by ``%d``); otherwise it is shown as a plain decimal integer.
    """
    axis = next(ax for ax in axes if ax["name"] == axis_name)
    flags = axis["flags"]
    number = int(axis_value["value"])
    if flags != "pow2":
        return "%d" % number
    exponent = math.log2(number)
    return "2^%d" % exponent


def format_float64_axis_value(axis_name, axis_value, axes):
    """Render a float64 axis value with five significant digits (``.5g``).

    ``axis_name`` and ``axes`` are not used by this formatter.
    """
    number = float(axis_value["value"])
    return format(number, ".5g")


def format_type_axis_value(axis_name, axis_value, axes):
    """Render a type axis value via ``%s`` string conversion.

    ``axis_name`` and ``axes`` are not used by this formatter.
    """
    raw = axis_value["value"]
    return "%s" % raw


def format_string_axis_value(axis_name, axis_value, axes):
    """Render a string axis value via ``%s`` string conversion.

    ``axis_name`` and ``axes`` are not used by this formatter.
    """
    raw = axis_value["value"]
    return "%s" % raw


def format_axis_value(axis_name, axis_value, axes):
    """Format ``axis_value`` with the formatter matching its axis "type".

    The axis named ``axis_name`` is looked up in ``axes``; its "type" entry
    ("int64", "float64", "type" or "string") selects the formatter. Any
    other type yields None.
    """
    kind = next(ax for ax in axes if ax["name"] == axis_name)["type"]

    # Pick the formatter first, then make a single call at the end.
    if kind == "int64":
        formatter = format_int64_axis_value
    elif kind == "float64":
        formatter = format_float64_axis_value
    elif kind == "type":
        formatter = format_type_axis_value
    elif kind == "string":
        formatter = format_string_axis_value
    else:
        return None
    return formatter(axis_name, axis_value, axes)


def format_duration(seconds):
    """Format a duration in seconds using a unit chosen by its magnitude.

    The unit is picked from the absolute value, so negative durations (for
    example a time difference where the compared run was faster) get the
    same unit as a positive duration of equal size instead of always
    falling through to microseconds.

    Args:
        seconds: Duration in seconds; may be negative.

    Returns:
        Text such as "1.500 ms": three decimals followed by "s", "ms" or
        "us". Magnitudes below one microsecond are still shown in "us".
    """
    magnitude = abs(seconds)
    if magnitude >= 1:
        multiplier, units = 1.0, "s"
    elif magnitude >= 1e-3:
        multiplier, units = 1e3, "ms"
    else:
        multiplier, units = 1e6, "us"
    return "%0.3f %s" % (seconds * multiplier, units)


def format_percentage(percentage):
    """Format a fraction as a percentage with two decimals, e.g. 0.5 -> "50.00%".

    None is rendered as "inf".
    """
    if percentage is not None:
        return "{:0.2f}%".format(percentage * 100.0)
    # Noise is recorded as infinity when there are too few samples for a
    # meaningful measurement; JSON cannot represent inf, so it arrives here
    # as null/None.
    return "inf"


def _lookup_summary(summaries, tag):
    """Return the first summary whose "tag" equals ``tag``, or None if absent."""
    return next(filter(lambda s: s["tag"] == tag, summaries), None)


def _extract_summary_value(summary):
    """Return the "value" field of the entry named "value" in a summary's "data".

    The field is returned as stored; callers convert it to float as needed.
    """
    value_data = next(filter(lambda v: v["name"] == "value", summary["data"]))
    assert value_data["type"] == "float64"
    return value_data["value"]


def compare_benches(ref_benches, cmp_benches, threshold, plot):
    """Print comparison tables for benchmarks present in both result sets.

    Benchmarks are paired by "name" and their states are paired by "name".
    For each pair the cold GPU mean time and relative noise are compared; a
    state is SAME when |relative diff| <= min(ref noise, cmp noise), FAST or
    SLOW otherwise, and "????" when no usable noise value exists. If both
    states also carry a batch GPU mean time, a second set of "B ..." columns
    is added, classified against a fixed 1% tolerance.

    Side effects: prints markdown headings and tables to stdout, and updates
    the module-level counters ``config_count``, ``unknown_count``,
    ``pass_count`` and ``failure_count``. The counters are updated for every
    compared state, whether or not its row is displayed.

    Args:
        ref_benches: "benchmarks" list of the reference file.
        cmp_benches: "benchmarks" list of the file being compared.
        threshold: Only states with |relative diff| >= threshold get a row.
            The comparison is against the fraction, not the percentage.
        plot: Name of the axis to plot along, or a falsy value to disable
            plotting. When set, matplotlib and seaborn are imported lazily.
    """
    global config_count
    global unknown_count
    global pass_count
    global failure_count

    if plot:
        import matplotlib.pyplot as plt
        import seaborn as sns

        sns.set()

    for cmp_bench in cmp_benches:
        ref_bench = find_matching_bench(cmp_bench, ref_benches)
        if not ref_bench:
            continue

        print("# %s\n" % (cmp_bench["name"]))

        cmp_device_ids = cmp_bench["devices"]
        axes = cmp_bench["axes"]
        ref_states = ref_bench["states"]
        cmp_states = cmp_bench["states"]

        axes = axes if axes else []

        # One centered column per axis, followed by the fixed result columns.
        headers = [x["name"] for x in axes]
        colalign = ["center"] * len(headers)
        for title, align in (
            ("Ref Time", "right"),
            ("Ref Noise", "right"),
            ("Cmp Time", "right"),
            ("Cmp Noise", "right"),
            ("Diff", "right"),
            ("%Diff", "right"),
            ("Status", "center"),
        ):
            headers.append(title)
            colalign.append(align)

        # Batch columns are appended at most once per benchmark, the first
        # time a state pair with batch data is seen.
        added_batch_headers = False

        # NOTE(review): states are not filtered by cmp_device_id here, so every
        # device iteration visits all states of the benchmark -- confirm that
        # this is intended for multi-device result files.
        for cmp_device_id in cmp_device_ids:
            rows = []
            # Device id of the reference state behind the most recent row.
            ref_device_id = None
            plot_data = {"cmp": {}, "ref": {}, "cmp_noise": {}, "ref_noise": {}}

            for cmp_state in cmp_states:
                cmp_state_name = cmp_state["name"]
                ref_state = next(
                    filter(lambda st: st["name"] == cmp_state_name, ref_states), None
                )
                if not ref_state:
                    continue

                axis_values = cmp_state["axis_values"]
                if not axis_values:
                    axis_values = []

                row = []
                for axis_value in axis_values:
                    axis_value_name = axis_value["name"]
                    row.append(format_axis_value(axis_value_name, axis_value, axes))

                cmp_summaries = cmp_state["summaries"]
                ref_summaries = ref_state["summaries"]

                if not ref_summaries or not cmp_summaries:
                    continue

                cmp_time_summary = _lookup_summary(
                    cmp_summaries, "nv/cold/time/gpu/mean"
                )
                ref_time_summary = _lookup_summary(
                    ref_summaries, "nv/cold/time/gpu/mean"
                )
                cmp_noise_summary = _lookup_summary(
                    cmp_summaries, "nv/cold/time/gpu/stdev/relative"
                )
                ref_noise_summary = _lookup_summary(
                    ref_summaries, "nv/cold/time/gpu/stdev/relative"
                )
                cmp_batch_summary = _lookup_summary(
                    cmp_summaries, "nv/batch/time/gpu/mean"
                )
                ref_batch_summary = _lookup_summary(
                    ref_summaries, "nv/batch/time/gpu/mean"
                )

                # TODO: Use other timings, too. Maybe multiple rows, with a
                # "Timing" column + values "CPU/GPU/Batch"?
                if not all(
                    [
                        cmp_time_summary,
                        ref_time_summary,
                        cmp_noise_summary,
                        ref_noise_summary,
                    ]
                ):
                    continue

                has_batch_data = cmp_batch_summary and ref_batch_summary
                if has_batch_data and not added_batch_headers:
                    for title, align in (
                        ("B Ref Time", "right"),
                        ("B Cmp Time", "right"),
                        ("B Diff", "right"),
                        ("B %Diff", "right"),
                        ("B Status", "center"),
                    ):
                        headers.append(title)
                        colalign.append(align)
                    added_batch_headers = True

                # Convert string encoding to expected numerics:
                cmp_time = float(_extract_summary_value(cmp_time_summary))
                ref_time = float(_extract_summary_value(ref_time_summary))
                cmp_noise = _extract_summary_value(cmp_noise_summary)
                ref_noise = _extract_summary_value(ref_noise_summary)

                diff = cmp_time - ref_time
                frac_diff = diff / ref_time

                if has_batch_data:
                    cmp_batch_time = float(_extract_summary_value(cmp_batch_summary))
                    ref_batch_time = float(_extract_summary_value(ref_batch_summary))
                    diff_batch = cmp_batch_time - ref_batch_time
                    frac_diff_batch = diff_batch / ref_batch_time

                # A falsy noise value (e.g. JSON null for infinite noise) is
                # left unconverted; the smaller usable noise is the tolerance.
                if ref_noise and cmp_noise:
                    ref_noise = float(ref_noise)
                    cmp_noise = float(cmp_noise)
                    min_noise = min(ref_noise, cmp_noise)
                elif ref_noise:
                    ref_noise = float(ref_noise)
                    min_noise = ref_noise
                elif cmp_noise:
                    cmp_noise = float(cmp_noise)
                    min_noise = cmp_noise
                else:
                    min_noise = None  # Noise is inf

                if plot:
                    # The plotted axis supplies the x value; all remaining
                    # axes together name the series the point belongs to.
                    other_axes = []
                    plot_x = "--"
                    for entry in axis_values:
                        if entry["name"] != plot:
                            other_axes.append(
                                "{} = {}".format(entry["name"], entry["value"])
                            )
                        else:
                            plot_x = float(entry["value"])
                    series_name = ", ".join(other_axes)

                    if series_name not in plot_data["cmp"]:
                        plot_data["cmp"][series_name] = {}
                        plot_data["ref"][series_name] = {}
                        plot_data["cmp_noise"][series_name] = {}
                        plot_data["ref_noise"][series_name] = {}

                    plot_data["cmp"][series_name][plot_x] = cmp_time
                    plot_data["ref"][series_name][plot_x] = ref_time
                    plot_data["cmp_noise"][series_name][plot_x] = cmp_noise
                    plot_data["ref_noise"][series_name][plot_x] = ref_noise

                config_count += 1
                if not min_noise:
                    unknown_count += 1
                    status = Fore.YELLOW + "????" + Fore.RESET
                elif abs(frac_diff) <= min_noise:
                    pass_count += 1
                    status = Fore.BLUE + "SAME" + Fore.RESET
                elif diff < 0:
                    failure_count += 1
                    status = Fore.GREEN + "FAST" + Fore.RESET
                else:
                    failure_count += 1
                    status = Fore.RED + "SLOW" + Fore.RESET

                if has_batch_data:
                    if (
                        abs(frac_diff_batch) <= 0.01
                    ):  # TODO(bgruber): what value to use here?
                        pass_count += 1
                        batch_status = Fore.BLUE + "SAME" + Fore.RESET
                    elif diff_batch < 0:
                        failure_count += 1
                        batch_status = Fore.GREEN + "FAST" + Fore.RESET
                    else:
                        failure_count += 1
                        batch_status = Fore.RED + "SLOW" + Fore.RESET

                if abs(frac_diff) >= threshold:
                    row.append(format_duration(ref_time))
                    row.append(format_percentage(ref_noise))
                    row.append(format_duration(cmp_time))
                    row.append(format_percentage(cmp_noise))
                    row.append(format_duration(diff))
                    row.append(format_percentage(frac_diff))
                    row.append(status)

                    if has_batch_data:
                        row.append(format_duration(ref_batch_time))
                        row.append(format_duration(cmp_batch_time))
                        row.append(format_duration(diff_batch))
                        row.append(format_percentage(frac_diff_batch))
                        row.append(batch_status)

                    rows.append(row)
                    # Remember the device here rather than reading the loop
                    # variable after the loop: the last state visited may have
                    # had no reference match (ref_state is None) or no row.
                    ref_device_id = ref_state["device"]

            if not rows:
                continue

            cmp_device = find_device_by_id(cmp_device_id, all_cmp_devices)
            ref_device = find_device_by_id(ref_device_id, all_ref_devices)

            if cmp_device == ref_device:
                print("## [%d] %s\n" % (cmp_device["id"], cmp_device["name"]))
            else:
                print(
                    "## [%d] %s vs. [%d] %s\n"
                    % (
                        ref_device["id"],
                        ref_device["name"],
                        cmp_device["id"],
                        cmp_device["name"],
                    )
                )
            # colalign and github format require tabulate 0.8.3
            if tabulate_version >= (0, 8, 3):
                print(
                    tabulate.tabulate(
                        rows, headers=headers, colalign=colalign, tablefmt="github"
                    )
                )
            else:
                print(tabulate.tabulate(rows, headers=headers, tablefmt="markdown"))

            print("")

            if plot:
                plt.xscale("log")
                plt.yscale("log")
                plt.xlabel(plot)
                plt.ylabel("time [s]")
                plt.title(cmp_device["name"])

                def plot_line(key, series, shape, label):
                    # Draw one series plus a shaded band of +/- (time * noise).
                    x = [float(v) for v in plot_data[key][series].keys()]
                    y = list(plot_data[key][series].values())

                    noise = list(plot_data[key + "_noise"][series].values())

                    top = [y[i] + y[i] * noise[i] for i in range(len(x))]
                    bottom = [y[i] - y[i] * noise[i] for i in range(len(x))]

                    p = plt.plot(x, y, shape, marker="o", label=label)
                    plt.fill_between(x, bottom, top, color=p[0].get_color(), alpha=0.1)

                for series in plot_data["cmp"].keys():
                    plot_line("cmp", series, "-", series)
                    plot_line("ref", series, "--", series + " ref")

                plt.legend()
                plt.show()


def main():
    """Command-line entry point: compare two result files or two directories.

    Exactly two positional arguments are expected. If both are directories,
    every non-empty ``*.json`` file of the second directory that also exists
    (non-empty) under the same name in the first is compared, in sorted file
    name order so the report is reproducible. Otherwise the two arguments
    are treated as the reference file and the compare file.

    Prints the per-benchmark tables followed by a summary. Exits with status
    1 (via ``sys.exit``) when the argument count is wrong, or when the
    device sections differ and ``--ignore-devices`` was not given.

    Returns:
        The accumulated ``failure_count``.
    """
    global all_ref_devices
    global all_cmp_devices

    help_text = "%(prog)s [reference.json compare.json | reference_dir/ compare_dir/]"
    parser = argparse.ArgumentParser(prog="nvbench_compare", usage=help_text)
    parser.add_argument(
        "--ignore-devices",
        dest="ignore_devices",
        default=False,
        help="Ignore differences in the device sections and compare anyway",
        action="store_true",
    )
    parser.add_argument(
        "--threshold-diff",
        type=float,
        dest="threshold",
        default=0.0,
        help="only show benchmarks where percentage diff is >= THRESHOLD",
    )
    parser.add_argument(
        "--plot-along", type=str, dest="plot", default=None, help="plot results"
    )

    # Anything argparse does not recognise is collected as a positional input.
    args, files_or_dirs = parser.parse_known_args()
    print(files_or_dirs)

    if len(files_or_dirs) != 2:
        parser.print_help()
        sys.exit(1)

    # if provided two directories, find all the exactly named files
    # in both and treat them as the reference and compare
    to_compare = []
    if os.path.isdir(files_or_dirs[0]) and os.path.isdir(files_or_dirs[1]):
        # os.listdir() returns names in arbitrary order; sort for a stable report.
        for f in sorted(os.listdir(files_or_dirs[1])):
            if os.path.splitext(f)[1] != ".json":
                continue
            r = os.path.join(files_or_dirs[0], f)
            c = os.path.join(files_or_dirs[1], f)
            if (
                os.path.isfile(r)
                and os.path.isfile(c)
                and os.path.getsize(r) > 0
                and os.path.getsize(c) > 0
            ):
                to_compare.append((r, c))
    else:
        to_compare = [(files_or_dirs[0], files_or_dirs[1])]

    for ref, comp in to_compare:
        ref_root = reader.read_file(ref)
        cmp_root = reader.read_file(comp)

        # Published as module globals because compare_benches() resolves
        # device ids through them.
        all_ref_devices = ref_root["devices"]
        all_cmp_devices = cmp_root["devices"]

        if ref_root["devices"] != cmp_root["devices"]:
            print(
                (Fore.YELLOW if args.ignore_devices else Fore.RED)
                + "Device sections do not match:"
                + Fore.RESET
            )
            print(
                jsondiff.diff(
                    ref_root["devices"], cmp_root["devices"], syntax="symmetric"
                )
            )
            if not args.ignore_devices:
                sys.exit(1)

        compare_benches(
            ref_root["benchmarks"], cmp_root["benchmarks"], args.threshold, args.plot
        )

    print("# Summary\n")
    print("- Total Matches: %d" % config_count)
    print("  - Pass    (diff <= min_noise): %d" % pass_count)
    print("  - Unknown (infinite noise):    %d" % unknown_count)
    print("  - Failure (diff > min_noise):  %d" % failure_count)
    return failure_count


if __name__ == "__main__":
    # main() returns the failure count. Exit statuses are truncated to 8 bits
    # on POSIX, so a count that is a multiple of 256 would be reported as
    # success; clamp it so any failure yields a non-zero status.
    sys.exit(min(main(), 255))