This repository was archived by the owner on Aug 21, 2025. It is now read-only.

Commit 137e4e1

Utilities to get GPU Utilization from chrome trace dumps (#868)
1 parent a228a1d commit 137e4e1
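As a rough usage sketch (the trace and CSV file names below are placeholders; only the --runtime/-runf, --filename/-f, and --folder/-fd flags come from the script added in this commit):

python benchmarks/chrome_trace_parser.py --runtime runtimes.csv --folder ./traces
python benchmarks/chrome_trace_parser.py --runtime runtimes.csv -f resnet50_chrome_trace_1.json

The -f flag can be repeated to process several individual trace files in one run.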

File tree

1 file changed (+121, -0)

benchmarks/chrome_trace_parser.py

Lines changed: 121 additions & 0 deletions
@@ -0,0 +1,121 @@
#!/usr/bin/env python
import argparse
import json
import os
import logging
import pandas as pd

# Process the chrome traces output by the PyTorch profiler.
# The json input file's name must be in the format {model_name}_chrome_trace_*.json.
# The runtimes file should have the format (model_name, time).

gpu_pids = []

def is_gpu_compute_event(event):
    global gpu_pids
    return "pid" in event and event["pid"] in gpu_pids and "ph" in event and event["ph"] == "X"

def get_events(filename):
    with open(filename) as f:
        data = json.load(f)
    return data["traceEvents"]

def get_sorted_gpu_events(events):
    sorted_gpu_events = []
    for event in events:
        if not is_gpu_compute_event(event):
            continue
        sorted_gpu_events.append(event)
    return sorted(sorted_gpu_events, key=lambda x: x["ts"])

def get_sorted_gpu_mm_conv_events(events):
    def is_mm_conv_event(event):
        return "name" in event and ("gemm" in event["name"] or "conv" in event["name"]
                                    or "cutlass" in event["name"] or "wgrad" in event["name"])
    gpu_events = get_sorted_gpu_events(events)
    sorted_events = []
    for event in gpu_events:
        if not is_mm_conv_event(event):
            continue
        sorted_events.append(event)
    return sorted_events

def get_duration(sorted_gpu_events):
    # Sum the union of the (start, end) intervals so overlapping kernels
    # are not double-counted. Events must be sorted by start time.
    if not sorted_gpu_events:
        return 0
    event = sorted_gpu_events[0]
    current_end_time = event["ts"] + event["dur"]
    total_duration = event["dur"]
    for event in sorted_gpu_events[1:]:
        start_time = max(event["ts"], current_end_time)
        end_time = event["ts"] + event["dur"]
        total_duration = total_duration + max(end_time - start_time, 0)
        current_end_time = max(current_end_time, end_time)
    return total_duration

def get_model_name(filename):
    _, tail = os.path.split(filename)
    modelname = tail[:tail.find("_chrome_trace")]
    return modelname

def get_total_length(run_times_df, modelname):
    return float(run_times_df[run_times_df["name"] == modelname]["runtime"].iloc[0])

def main():
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    parser.add_argument(
        "--runtime", "-runf", help="file name of the runtime file", required=True
    )
    group.add_argument(
        "--filename", "-f", action="append", help="a filename of the json file to process"
    )
    group.add_argument(
        "--folder", "-fd", help="a folder of the json files to process"
    )
    args = parser.parse_args()

    run_times_df = pd.read_csv(args.runtime)

    if args.filename:
        filenames = args.filename
    elif args.folder:
        filenames = []
        directory = args.folder
        for filename in os.listdir(directory):
            f = os.path.join(directory, filename)
            if os.path.isfile(f) and f.endswith(".json"):
                filenames.append(f)
    else:
        print("Please provide a filename or a folder name")
        return

    print("modelname, GPU Utilization, MM and Conv time")

    for filename in filenames:
        try:
            events = get_events(filename)

            # get pids of GPU events
            global gpu_pids
            for event in events:
                if "name" not in event:
                    continue
                if event["name"] == "process_labels" and "GPU" in event["args"]["labels"]:
                    gpu_pids.append(event["pid"])

            modelname = get_model_name(filename)
            total_length = get_total_length(run_times_df, modelname) * 1e6

            sorted_gpu_events = get_sorted_gpu_events(events)
            utilization = get_duration(sorted_gpu_events) / total_length

            sorted_gpu_mm_conv_events = get_sorted_gpu_mm_conv_events(events)
            mm_conv_utilization = get_duration(sorted_gpu_mm_conv_events) / total_length

            print(f"{modelname}, {utilization}, {mm_conv_utilization}")
        except Exception:
            logging.exception(f"{filename}, ERROR")
            print(f"{filename}, ERROR")


if __name__ == "__main__":
    main()
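For reference, get_total_length looks up each model in the CSV passed via --runtime by its "name" and "runtime" columns, so a minimal runtimes file might look like the sketch below (model names and times are made up; the 1e6 factor applied in main suggests the runtime column is in seconds, matching the microsecond timestamps in the chrome trace):

name,runtime
resnet50,0.0123
hf_Bert,0.0311

With traces named like resnet50_chrome_trace_1.json, the script prints one line per trace under the header "modelname, GPU Utilization, MM and Conv time"; the numbers below are purely illustrative:

resnet50, 0.71, 0.43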

0 commit comments
