12
12
import torch
13
13
import torch ._dynamo .config
14
14
import torch ._inductor .config
15
+ torch ._inductor .config .cpp .enable_kernel_profile = True
15
16
16
17
def device_sync (device ):
17
18
if "cuda" in device :
@@ -132,7 +133,7 @@ def encode_tokens(tokenizer, string, bos=True, device='cuda'):
132
133
tokens = tokenizer .encode (string )
133
134
if bos :
134
135
tokens = [tokenizer .bos_id ()] + tokens
135
- return torch .tensor (tokens , dtype = torch .int , device = device )
136
+ return torch .tensor (tokens , dtype = torch .int , device = args . device )
136
137
137
138
def _load_model (checkpoint_path , device , precision , use_tp ):
138
139
with torch .device ('meta' ):
@@ -248,8 +249,13 @@ def callback(x):
248
249
if (i != num_samples - 1 or not profile ) or (use_tp and rank != 0 ):
249
250
prof = contextlib .nullcontext ()
250
251
else :
251
- torch .profiler ._utils ._init_for_cuda_graphs ()
252
- prof = torch .profiler .profile ()
252
+ if device == 'cuda' :
253
+ torch .profiler ._utils ._init_for_cuda_graphs ()
254
+ prof = torch .profiler .profile (activities = [torch .profiler .ProfilerActivity .CPU , torch .profiler .ProfilerActivity .CUDA ], use_cuda = True )
255
+ profile_sort = 'self_cuda_time_total'
256
+ elif device == 'cpu' :
257
+ prof = torch .profiler .profile (activities = [torch .profiler .ProfilerActivity .CPU ])
258
+ profile_sort = 'self_cpu_time_total'
253
259
with prof :
254
260
y = generate (
255
261
model ,
@@ -263,6 +269,8 @@ def callback(x):
263
269
if i == - 1 :
264
270
print (f"Compilation time: { time .perf_counter () - t0 :.2f} seconds" )
265
271
continue
272
+ if hasattr (prof , "key_averages" ):
273
+ print (prof .key_averages ().table (sort_by = profile_sort , row_limit = - 1 ))
266
274
if hasattr (prof , "export_chrome_trace" ):
267
275
if use_tp :
268
276
prof .export_chrome_trace (f"{ profile } _rank_{ rank } .json" )
0 commit comments