|
1 | 1 | import time
|
| 2 | +import os |
| 3 | +import json |
2 | 4 |
|
3 | 5 | import torch
|
4 |
| -from torch.profiler import profile |
| 6 | +from torch.profiler import profile, ProfilerActivity |
5 | 7 |
|
6 | 8 |
|
def synchronize():
    """No-op placeholder for a device synchronization hook."""
|
9 | 11 |
|
10 | 12 |
|
class NullContext:
    """A do-nothing context manager, used when no optimize context is supplied."""

    def __enter__(self):
        # Yields nothing: `with NullContext() as x:` binds x to None.
        return None

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Returns None (falsy), so exceptions are never suppressed.
        return None
| 19 | + |
| 20 | + |
11 | 21 | def dump_chrome_trace(f, input, trace_filename, optimize_ctx, activities, num_runs=1,
|
12 | 22 | devices=None, kwargs_for_f=None, kwargs_for_profiler=None):
|
13 | 23 | """
|
@@ -55,3 +65,136 @@ def dump_chrome_trace(f, input, trace_filename, optimize_ctx, activities, num_ru
|
55 | 65 | prof.export_chrome_trace(trace_filename)
|
56 | 66 |
|
57 | 67 | return timing
|
| 68 | + |
| 69 | + |
def get_chrome_trace_events(filename):
    """Load a chrome trace JSON file and return its list of trace events.

    Args:
        filename: path to a chrome trace file, e.g. one produced by
            torch.profiler's ``export_chrome_trace``.

    Returns:
        list: the ``"traceEvents"`` entries of the trace.
    """
    # Use a context manager so the handle is closed even if json.load
    # raises; the original opened the file and never closed it.
    with open(filename) as f:
        data = json.load(f)
    return data["traceEvents"]
| 75 | + |
| 76 | + |
def is_gpu_compute_event(event):
    """Return True if *event* is a complete ('X' phase) event emitted by a
    process whose pid was recorded in the module-level ``gpu_pids`` list."""
    global gpu_pids
    if "pid" not in event or "ph" not in event:
        return False
    return event["ph"] == "X" and event["pid"] in gpu_pids
| 80 | + |
| 81 | + |
def get_sorted_gpu_events(events):
    """Filter *events* down to GPU compute events, ordered by start time."""
    gpu_events = [event for event in events if is_gpu_compute_event(event)]
    return sorted(gpu_events, key=lambda ev: ev["ts"])
| 89 | + |
| 90 | + |
def get_duration(sorted_gpu_events):
    """Total busy time covered by *sorted_gpu_events* in trace time units.

    Events must be sorted by "ts"; overlapping intervals are merged so
    overlapped time is counted once (i.e. the measure of the union).
    """
    if not sorted_gpu_events:
        return 0
    first = sorted_gpu_events[0]
    covered_until = first["ts"] + first["dur"]
    total = first["dur"]
    for ev in sorted_gpu_events[1:]:
        begin = ev["ts"]
        finish = begin + ev["dur"]
        # Entirely inside already-counted time: contributes nothing.
        if finish <= covered_until:
            continue
        # Count only the part extending past what is already covered.
        total += finish - max(begin, covered_until)
        covered_until = finish
    return total
| 103 | + |
| 104 | + |
def get_sorted_gpu_mm_conv_events(events):
    """Return the ts-sorted GPU compute events whose kernel names suggest
    a matmul or convolution (gemm / conv / cutlass / wgrad)."""
    keywords = ("gemm", "conv", "cutlass", "wgrad")

    def is_mm_conv_event(event):
        # Substring match against the kernel name, if one is present.
        return "name" in event and any(kw in event["name"] for kw in keywords)

    return [ev for ev in get_sorted_gpu_events(events) if is_mm_conv_event(ev)]
| 116 | + |
| 117 | + |
# pids of the trace processes labeled "GPU"; populated by
# compute_utilization() and read by is_gpu_compute_event().
gpu_pids = []
| 119 | + |
| 120 | + |
def compute_utilization(filename: str, total_length: float):
    """Parse a pytorch-profiler chrome trace and report GPU usage ratios.

    Args:
        filename(str): path of the chrome trace file written by the profiler

        total_length(float): wall-clock length, in seconds, of the same work
            when run without the profiler

    Return:
        tuple: (GPU utilization, fraction of time spent on matmul and convolution)
    """
    events = get_chrome_trace_events(filename)

    # Record which trace pids belong to processes labeled "GPU"; the
    # event filters consult this module-level list.
    global gpu_pids
    gpu_pids = []
    for event in events:
        if event.get("name") == 'process_labels' and "GPU" in event["args"]["labels"]:
            gpu_pids.append(event["pid"])

    # Trace timestamps/durations are in microseconds; convert the baseline.
    total_length = total_length * 1e6

    busy_time = get_duration(get_sorted_gpu_events(events))
    mm_conv_busy_time = get_duration(get_sorted_gpu_mm_conv_events(events))

    return busy_time / total_length, mm_conv_busy_time / total_length
| 153 | + |
| 154 | + |
def benchmark_utilization(f, input, trace_folder, optimize_ctx=None, trace_file_name="tmp_chrome_trace", num_runs=1):
    """Measure GPU utilization and matmul/convolution share for ``f(input)``.

    Runs ``f`` on ``input`` under ``optimize_ctx`` ``num_runs`` times with the
    pytorch profiler, dumping a chrome trace to
    ``trace_folder/trace_file_name.json``, then post-processes that trace.

    Example:

    ```
    def f(a):
        return a.sum()
    a = torch.rand(2**20, device="cuda")
    utilization, mm_conv_utilization = benchmark_utilization(f, a, "tmp", trace_file_name = "tmp_chrome_trace")
    ```

    Args:
        f: function to benchmark

        input: input to :attr:`f`

        trace_folder: folder in which the chrome trace is stored (created if absent)

        optimize_ctx: context manager in which f runs; a no-op context if None

        trace_file_name: basename of the dumped chrome trace file,
            default "tmp_chrome_trace"

        num_runs: number of timed runs of f (warm-up excluded), default 1

    Return:
        tuple: (GPU utilization, fraction of time spent on matmul and convolution)
    """
    if not os.path.exists(trace_folder):
        os.makedirs(trace_folder)
        print("create folder " + trace_folder)

    ctx = NullContext() if optimize_ctx is None else optimize_ctx

    trace_path = os.path.join(trace_folder, trace_file_name + ".json")
    # dump_chrome_trace returns the timed length (seconds) of the runs,
    # which becomes the denominator for the utilization ratios.
    elapsed = dump_chrome_trace(f, input, trace_path, ctx,
                                [ProfilerActivity.CUDA], num_runs=num_runs, devices="cuda")

    return compute_utilization(trace_path, elapsed)
0 commit comments