3636from tensorflow .python .saved_model import signature_constants
3737from tensorflow .python .saved_model import tag_constants
3838
39-
4039__all__ = ["BaseBenchmarkRunner" ]
4140
4241
@@ -71,7 +70,8 @@ def __init__(self, args):
7170
7271 if args .use_xla_auto_jit :
7372 print ("[Benchmark] - Activating XLA JIT Auto Clustering" )
74- os .environ ["TF_XLA_FLAGS" ] = "--tf_xla_auto_jit=2 --tf_xla_cpu_global_jit"
73+ os .environ ["TF_XLA_FLAGS" ] = "--tf_xla_auto_jit=2"
74+ os .environ ["TF_XLA_FLAGS" ] += " --tf_xla_cpu_global_jit"
7575
7676 if args .no_tf32 :
7777 print ("[Benchmark] - Deactivating the use of TF32 format" )
@@ -111,10 +111,14 @@ def _config_gpu_memory(self, gpu_mem_cap):
111111 else :
112112 try :
113113 set_virtual_device_configuration = tf .config .set_virtual_device_configuration
114- device_config = tf .config .LogicalDeviceConfiguration (memory_limit = gpu_mem_cap )
114+ device_config = tf .config .LogicalDeviceConfiguration (
115+ memory_limit = gpu_mem_cap
116+ )
115117 except AttributeError :
116118 set_virtual_device_configuration = tf .config .experimental .set_virtual_device_configuration
117- device_config = tf .config .experimental .VirtualDeviceConfiguration (memory_limit = gpu_mem_cap )
119+ device_config = tf .config .experimental .VirtualDeviceConfiguration (
120+ memory_limit = gpu_mem_cap
121+ )
118122
119123 set_virtual_device_configuration (gpu , [device_config ])
120124 except RuntimeError as e :
@@ -133,9 +137,9 @@ def _export_runtime_metrics_to_json(self, metric_dict):
133137 return
134138
135139 metric_dict = {
136- # Creating a copy to avoid modifying the original
137- "results" : copy .deepcopy (metric_dict ),
138- "runtime_arguments" : vars (self ._args )
140+ # Creating a copy to avoid modifying the original
141+ "results" : copy .deepcopy (metric_dict ),
142+ "runtime_arguments" : vars (self ._args )
139143 }
140144
141145 with open (file_path , 'w' ) as json_f :
@@ -160,6 +164,7 @@ def _export_runtime_metrics_to_csv(self, metric_dict):
160164
161165 data = {f"metric_{ k } " : v for k , v in metric_dict .items ()}
162166
167+ # yapf: disable
163168 args_to_save = [
164169 "batch_size" ,
165170 "input_saved_model_dir" ,
@@ -172,6 +177,7 @@ def _export_runtime_metrics_to_csv(self, metric_dict):
172177 "use_xla" ,
173178 "use_xla_auto_jit"
174179 ]
180+ # yapf: enable
175181
176182 runtime_arguments = vars (self ._args )
177183 for key in args_to_save :
@@ -181,11 +187,15 @@ def _export_runtime_metrics_to_csv(self, metric_dict):
181187
182188 if not os .path .isfile (file_path ):
183189 with open (file_path , 'w' ) as outcsv :
184- writer = csv .DictWriter (outcsv , fieldnames = fieldnames , delimiter = ',' )
190+ writer = csv .DictWriter (
191+ outcsv , fieldnames = fieldnames , delimiter = ','
192+ )
185193 writer .writeheader ()
186194
187195 with open (file_path , 'a' ) as outcsv :
188- writer = csv .DictWriter (outcsv , fieldnames = fieldnames , delimiter = ',' )
196+ writer = csv .DictWriter (
197+ outcsv , fieldnames = fieldnames , delimiter = ','
198+ )
189199 writer .writerow (data )
190200
191201 except Exception as e :
@@ -209,7 +219,9 @@ def load_model_from_disk(
209219 graph_func = saved_model_loaded .signatures [signature_key ]
210220
211221 if precision == "FP16" :
212- tf .config .optimizer .set_experimental_options ({"auto_mixed_precision" : True })
222+ tf .config .optimizer .set_experimental_options ({
223+ "auto_mixed_precision" : True
224+ })
213225
214226 # Known TF Issue: https://github.com/tensorflow/tensorflow/issues/37615#issuecomment-767804930
215227 # it looks like if the original trackable object is released by
@@ -429,7 +441,10 @@ def infer_batch(x):
429441 memcopy_times = []
430442 dequeue_times = []
431443
432- def log_step (step_idx , display_every , iter_time , memcpyHtoD_time , dequeue_time ):
444+ def log_step (
445+ step_idx , display_every , iter_time , memcpyHtoD_time ,
446+ dequeue_time
447+ ):
433448 if step_idx % display_every == 0 :
434449 print (
435450 f"step { step_idx :04d} , "
@@ -439,6 +454,7 @@ def log_step(step_idx, display_every, iter_time, memcpyHtoD_time, dequeue_time):
439454 )
440455
441456 if self ._args .tf_profile_export_path :
457+
442458 def start_profiling ():
443459 if self ._args .tf_profile_verbose :
444460 profiler_opts = tf .profiler .experimental .ProfilerOptions (
@@ -482,9 +498,9 @@ def start_profiling():
482498 ds_iter = iter (dataset )
483499
484500 dequeue_batch_fn = get_dequeue_batch_fn (
485- ds_iter ,
486- use_xla = self ._args .use_xla ,
487- use_synthetic_data = self ._args .use_synthetic_data
501+ ds_iter ,
502+ use_xla = self ._args .use_xla ,
503+ use_synthetic_data = self ._args .use_synthetic_data
488504 )
489505
490506 force_data_on_gpu_fn = get_force_data_on_gpu_fn (
@@ -500,10 +516,8 @@ def start_profiling():
500516 if step_idx == self ._args .num_warmup_iterations - 5 :
501517 start_profiling ()
502518
503- if (
504- self ._args .num_iterations is not None and
505- step_idx > self ._args .num_iterations
506- ):
519+ if (self ._args .num_iterations is not None and
520+ step_idx > self ._args .num_iterations ):
507521 break
508522
509523 with tracing_ctx ('' , step_num = step_idx , _r = 1 ):
@@ -534,28 +548,44 @@ def start_profiling():
534548 log_step (
535549 step_idx ,
536550 display_every = self ._args .display_every ,
537- iter_time = np .mean (iter_times [- self ._args .display_every :]) * 1000 ,
538- memcpyHtoD_time = np .mean (memcopy_times [- self ._args .display_every :]) * 1000 ,
539- dequeue_time = np .mean (dequeue_times [- self ._args .display_every :]) * 1000
551+ iter_time = np .mean (
552+ iter_times [- self ._args .display_every :]
553+ ) * 1000 ,
554+ memcpyHtoD_time = np .mean (
555+ memcopy_times [- self ._args .display_every :]
556+ ) * 1000 ,
557+ dequeue_time = np .mean (
558+ dequeue_times [- self ._args .display_every :]
559+ ) * 1000
540560 )
541561 else :
542- print (f"{ 'GPU Iteration Time' :18s} : { iter_times [- 1 ]:08.4f} s" )
543- print (f"{ 'Data MemCopyHtoD Time' :18s} : { memcpyHtoD_time [- 1 ]:08.4f} s" )
544- print (f"{ 'Data Dequeue Time' :18s} : { dequeue_times [- 1 ]:08.4f} s" )
562+ print (
563+ f"{ 'GPU Iteration Time' :18s} : { iter_times [- 1 ]:08.4f} s"
564+ )
565+ print (
566+ f"{ 'Data MemCopyHtoD Time' :18s} : { memcpyHtoD_time [- 1 ]:08.4f} s"
567+ )
568+ print (
569+ f"{ 'Data Dequeue Time' :18s} : { dequeue_times [- 1 ]:08.4f} s"
570+ )
545571
546572 if not self ._args .use_synthetic_data :
547573 data_aggregator .aggregate_data (y_pred , y )
548574
549- if (
550- not self ._args .debug_performance and
551- step_idx % self ._args .display_every != 0
552- ): # avoids double printing
575+ if (not self ._args .debug_performance and
576+ step_idx % self ._args .display_every !=
577+ 0 ): # avoids double printing
553578 log_step (
554579 step_idx ,
555580 display_every = 1 , # force print
556- iter_time = np .mean (iter_times [- self ._args .display_every :]) * 1000 ,
557- memcpyHtoD_time = np .mean (memcopy_times [- self ._args .display_every :]) * 1000 ,
558- dequeue_time = np .mean (dequeue_times [- self ._args .display_every :]) * 1000
581+ iter_time = np .mean (iter_times [- self ._args .display_every :]) *
582+ 1000 ,
583+ memcpyHtoD_time = np .mean (
584+ memcopy_times [- self ._args .display_every :]
585+ ) * 1000 ,
586+ dequeue_time = np .mean (
587+ dequeue_times [- self ._args .display_every :]
588+ ) * 1000
559589 )
560590
561591 if step_idx >= 100 :
@@ -588,13 +618,17 @@ def start_profiling():
588618
589619 metrics ['Total GPU Time (s)' ] = int (np .ceil (np .sum (iter_times )))
590620 metrics ['Throughput (samples/sec)' ] = (
591- self ._args .batch_size / sp .stats .trim_mean (
592- iter_times , self ._args .trim_mean_percentage ))
621+ self ._args .batch_size /
622+ sp .stats .trim_mean (iter_times , self ._args .trim_mean_percentage )
623+ )
593624
594625 def timing_metrics (time_arr , log_prefix ):
595626 data = dict ()
596- data [f"{ log_prefix } Trim Mean [{ self ._args .trim_mean_percentage * 100 } %] (ms)" ] = (
597- sp .stats .trim_mean (time_arr , self ._args .trim_mean_percentage ) * 1000
627+ data [
628+ f"{ log_prefix } Trim Mean [{ self ._args .trim_mean_percentage * 100 } %] (ms)"
629+ ] = (
630+ sp .stats .
631+ trim_mean (time_arr , self ._args .trim_mean_percentage ) * 1000
598632 )
599633 data [f"{ log_prefix } 99th_percentile (ms)" ] = np .percentile (
600634 time_arr , q = 99 , interpolation = 'lower'
@@ -606,8 +640,12 @@ def timing_metrics(time_arr, log_prefix):
606640 return data
607641
608642 metrics .update (timing_metrics (iter_times , "GPU Latency" ))
609- metrics .update (timing_metrics (dequeue_times , "Data Batch Dequeue Time" ))
610- metrics .update (timing_metrics (memcopy_times , "Data MemCopyHtoD Time" ))
643+ metrics .update (
644+ timing_metrics (dequeue_times , "Data Batch Dequeue Time" )
645+ )
646+ metrics .update (
647+ timing_metrics (memcopy_times , "Data MemCopyHtoD Time" )
648+ )
611649
612650 self ._export_runtime_metrics_to_json (metrics )
613651 self ._export_runtime_metrics_to_csv (metrics )
0 commit comments