 from benchmark_utils import force_gpu_resync
 from benchmark_utils import print_dict
 from benchmark_utils import timed_section
+from benchmark_utils import timed_dataset

 import numpy as np
 import tensorflow as tf

 from tensorflow.python.compiler.tensorrt import trt_convert as trt
-from tensorflow.python.saved_model import tag_constants

 __all__ = ["BaseBenchmarkRunner"]

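Note: `force_gpu_resync`, `print_dict`, `timed_section`, and the newly imported `timed_dataset` come from the repository's `benchmark_utils` module, which is not part of this diff. As a rough mental model only, a `timed_section`-style helper could be sketched as below; this is a hypothetical implementation, not the actual one.

```python
# Hypothetical sketch of a timed_section-style helper; the real
# benchmark_utils implementation may differ.
import time
from contextlib import contextmanager

@contextmanager
def timed_section(name):
    """Time the wrapped block and report it under `name`."""
    start = time.time()
    try:
        yield
    finally:
        print(f"[{name}] took {time.time() - start:.3f} s")
```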
@@ -102,7 +102,7 @@ def _get_graph_func(self):

        if not self._args.use_tftrt:

-            with timed_section('Loading TensorFlow native model...'):
+            with timed_section("Loading TensorFlow native model"):

                saved_model_loaded = tf.saved_model.load(
                    export_dir=self._args.input_saved_model_dir,
@@ -111,10 +111,18 @@ def _get_graph_func(self):

                graph_func = saved_model_loaded.signatures[
                    self._args.input_signature_key]
+                # from tensorflow.python.framework import convert_to_constants
                # graph_func = convert_to_constants.convert_variables_to_constants_v2(
                #     graph_func
                # )

+                # Known TF Issue: https://github.com/tensorflow/tensorflow/issues/37615#issuecomment-767804930
+                # The signature returned above does not keep a back-reference
+                # to the loaded object. If the original trackable object is
+                # released by the Python garbage collector once it goes out of
+                # scope, the signature becomes unusable, so keep a reference
+                # to it explicitly.
+                graph_func._backref_to_saved_model = saved_model_loaded
+
        else:

            def get_trt_precision(precision):
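The `_backref_to_saved_model` assignment above pins the loaded SavedModel onto the concrete function so the garbage collector cannot release it while the signature is still in use. A self-contained sketch of the same pattern follows; the directory and signature key are placeholders, not values from this repository.

```python
# Illustration of the back-reference workaround for TF issue #37615.
import tensorflow as tf

def load_signature(saved_model_dir, signature_key="serving_default"):
    loaded = tf.saved_model.load(saved_model_dir)
    func = loaded.signatures[signature_key]
    # Keep the loaded trackable alive: the signature alone does not hold a
    # reference back to it, so it could otherwise be garbage collected.
    func._backref_to_saved_model = loaded
    return func
```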
@@ -208,7 +216,7 @@ def engine_build_input_fn(num_batches, model_phase):
                    num_batches=1, model_phase="Building"
                )

-                with timed_section('Building TensorRT engines...'):
+                with timed_section("Building TensorRT engines"):
                    converter.build(
                        input_fn=tf.autograph.experimental.
                        do_not_convert(offline_opt_input_fn)
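For context, `converter.build()` calls the converted function once per batch produced by `input_fn`, so TensorRT engines are built ahead of time rather than during the first timed inference. A minimal standalone version of that flow might look like the sketch below; the model directories, input shape, and precision mode are assumed placeholders, not values from this script.

```python
# Hedged sketch of offline TF-TRT engine building; adjust paths and shapes.
import tensorflow as tf
from tensorflow.python.compiler.tensorrt import trt_convert as trt

converter = trt.TrtGraphConverterV2(
    input_saved_model_dir="/models/my_model",  # placeholder path
    conversion_params=trt.TrtConversionParams(precision_mode="FP16"),
)
converter.convert()

def engine_build_input_fn():
    # Yield one representative batch per input shape to pre-build engines.
    yield (tf.zeros([8, 224, 224, 3], dtype=tf.float32),)

converter.build(input_fn=engine_build_input_fn)
converter.save("/models/my_model_trt")  # placeholder path
```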
@@ -217,25 +225,30 @@ def engine_build_input_fn(num_batches, model_phase):

            if self._args.output_saved_model_dir is not None:

-                with timed_section('Saving converted graph with TF-TRT ...'):
+                with timed_section("Saving converted graph with TF-TRT"):
                    converter.save(self._args.output_saved_model_dir)
                    print(
                        f"Converted graph saved to "
                        f"`{self._args.output_saved_model_dir}`"
                    )

-        savedmodel_outputs = print_dict(
-            graph_func.structured_outputs,
-            redirect_to_str=True
-        )
+        if isinstance(graph_func.structured_outputs, (tuple, list)):
+            savedmodel_outputs = "\n  - ".join([
+                str(t) for t in graph_func.structured_outputs
+            ])
+            savedmodel_outputs = f"  - {savedmodel_outputs}"
+        else:
+            savedmodel_outputs = print_dict(
+                graph_func.structured_outputs, redirect_to_str=True
+            )
+        self._debug_print(f"Available Output Tensors:\n{savedmodel_outputs}")
+        print()  # visual spacing

        chosen_outputs = "\n  - ".join(
            sorted(self._args.output_tensors_name.split(","))
        )
-
-        self._debug_print(f"Available Output Tensors:\n{savedmodel_outputs}")
-        print()  # visual spacing
        self._debug_print(f"Chosen Output Tensor:\n  - {chosen_outputs}")
+        print()  # visual spacing

        return graph_func

@@ -244,111 +257,139 @@ def execute_benchmark(self):
        It consumes TFRecords with labels and reports accuracy.
        """

-        print("\nStart loading model ...")
-
-        graph_func = self._get_graph_func()
+        with timed_section("Model Loading"):
+            graph_func = self._get_graph_func()

-        dataset, bypass_data_to_eval = self.get_dataset_batches()
+        with timed_section("Model Inference"):
+            dataset, bypass_data_to_eval = self.get_dataset_batches()

-        print("\nStart inference ...")
+            if self._args.use_synthetic_data:
+                old_ds = dataset
+                try:
+                    dataset = dataset.take(count=1)  # loop over 1 batch
+                    dataset = dataset.cache()
+                    dataset = dataset.repeat()
+                    dataset = dataset.prefetch(
+                        buffer_size=tf.data.experimental.AUTOTUNE
+                    )
+                    self._debug_print(
+                        "Model dataset has been replaced by a synthetic data "
+                        "loader to minimize data loading jitter."
+                    )

-        @force_gpu_resync
-        @tf.function(jit_compile=self._args.use_xla)
-        def infer_batch(x):
-            if isinstance(x, (tuple, list)):
-                model_out = graph_func(*x)
-            elif isinstance(x, dict):
-                model_out = graph_func(**x)
-            else:
-                model_out = graph_func(x)
+                except Exception as e:
+                    dataset = old_ds
+                    print(
+                        f"[ERROR] Impossible to transform the dataset into a "
+                        f"synthetic dataset. Performance numbers will be "
+                        f"impacted.\nError: {str(e)}."
+                    )

-        if self._args.output_tensors_name is not None:
-            output_tensors_name = self._args.output_tensors_name.split(",")
-            if len(output_tensors_name) == 1:
-                return model_out[self._args.output_tensors_name]
+            @force_gpu_resync
+            @tf.function(jit_compile=self._args.use_xla)
+            def infer_batch(x):
+                if isinstance(x, (tuple, list)):
+                    model_out = graph_func(*x)
+                elif isinstance(x, dict):
+                    model_out = graph_func(**x)
                else:
-                    return {key: model_out[key] for key in output_tensors_name}
+                    model_out = graph_func(x)

-            return model_out
+                if self._args.output_tensors_name is not None:
+                    output_ts_name = self._args.output_tensors_name.split(",")
+                    if len(output_ts_name) == 1:
+                        return model_out[self._args.output_tensors_name]
+                    else:
+                        return {key: model_out[key] for key in output_ts_name}

-        data_aggregator = DataAggregator(
-            self.postprocess_model_outputs, args=self._args
-        )
+            return model_out

-        iter_times = []
+            if not self._args.use_synthetic_data:
+                data_aggregator = DataAggregator(
+                    self.postprocess_model_outputs, args=self._args
+                )

-        def log_step(step_idx, display_every, iter_time):
-            if step_idx % display_every == 0:
-                print("  step %04d, iter_time(ms)=%.3f" % (step_idx, iter_time))
+            iter_times = []

-        data_start_t = time.time()
-        for step_idx, data_batch in enumerate(dataset):
-            x, y = self.preprocess_model_inputs(data_batch)
+            def log_step(step_idx, display_every, iter_time):
+                if step_idx % display_every == 0:
+                    print(
+                        f"  step {step_idx:04d}, iter_time(ms)={iter_time:.3f}"
+                    )

-            if self._args.debug:
-                print("Step:", step_idx + 1)
-                print("Data Loading Time:", time.time() - data_start_t)
+            dataset = timed_dataset(
+                dataset, activate=self._args.debug_performance
+            )

-            start_time = time.time()
-            y_pred = infer_batch(x)
-            iter_times.append(time.time() - start_time)
+            for step_idx, data_batch in enumerate(dataset):
+                x, y = self.preprocess_model_inputs(data_batch)

-            if not self._args.debug:
-                log_step(
-                    step_idx + 1, self._args.display_every,
-                    np.mean(iter_times[-self._args.display_every:]) * 1000
-                )
-            else:
-                print("GPU Iteration Time:", iter_times[-1])
+                start_time = time.time()
+                y_pred = infer_batch(x)
+                iter_times.append(time.time() - start_time)

-            data_aggregator.aggregate_data(y_pred, y)
+                if not self._args.debug_performance:
+                    log_step(
+                        step_idx + 1, self._args.display_every,
+                        np.mean(iter_times[-self._args.display_every:]) * 1000
+                    )
+                else:
+                    print(f"{'GPU Iteration Time':18s}: {iter_times[-1]:.4f}s")

-            if (self._args.num_iterations is not None and
-                    step_idx + 1 >= self._args.num_iterations):
-                break
+                if not self._args.use_synthetic_data:
+                    data_aggregator.aggregate_data(y_pred, y)

-            if self._args.debug:
-                print("===============")
-                data_start_t = time.time()
+                if (self._args.num_iterations is not None and
+                        step_idx + 1 >= self._args.num_iterations):
+                    break

-        if step_idx % self._args.display_every != 0:  # avoids double printing
-            log_step(
-                step_idx + 1,
-                display_every=1,  # force print
-                iter_time=np.mean(iter_times[-self._args.display_every:]) * 1000
-            )
+            if (not self._args.debug_performance and
+                    step_idx % self._args.display_every != 0):  # avoids double printing
+                log_step(
+                    step_idx + 1,
+                    display_every=1,  # force print
+                    iter_time=(
+                        np.mean(iter_times[-self._args.display_every:]) * 1000
+                    )
+                )

-        print("\nEnd of inference. Computing metrics ...\n")
+        with timed_section("Metric Computation"):

-        metric, metric_units = self.evaluate_model(
-            data_aggregator.predicted_dict, data_aggregator.expected_dict,
-            bypass_data_to_eval
-        )
-        print(f"- {metric_units:35s}: {metric:.2f}")
+            if not self._args.use_synthetic_data:
+                metric, metric_units = self.evaluate_model(
+                    data_aggregator.predicted_dict,
+                    data_aggregator.expected_dict, bypass_data_to_eval
+                )
+                print(f"- {metric_units:35s}: {metric:.2f}")

-        metrics = dict()
+            metrics = dict()

-        metrics["Total Samples Processed"] = (
-            data_aggregator.total_samples_processed
-        )
+            if not self._args.use_synthetic_data:
+                metrics["Total Samples Processed"] = (
+                    data_aggregator.total_samples_processed
+                )

-        # Skipping last batch. Might have different batch_size
-        run_times = np.array(iter_times)[self._args.num_warmup_iterations:-1]
+            # Skipping last batch. Might have different batch_size
+            run_times = np.array(iter_times)
+            run_times = run_times[self._args.num_warmup_iterations:-1]

-        metrics['Total GPU Time (s)'] = int(np.ceil(np.sum(iter_times)))
-        metrics['Throughput (samples/sec)'] = np.mean(
-            self._args.batch_size / run_times
-        )
-        metrics['99th_percentile (ms)'] = np.percentile(
-            run_times, q=99, interpolation='lower'
-        ) * 1000
-        metrics['GPU Latency Mean (ms)'] = np.mean(run_times) * 1000
-        metrics['GPU Latency Median (ms)'] = np.median(run_times) * 1000
-        metrics['GPU Latency Min (ms)'] = np.min(run_times) * 1000
-        metrics['GPU Latency Max (ms)'] = np.max(run_times) * 1000
-
-        for key, val in sorted(metrics.items()):
-            if isinstance(val, int):
-                print(f"- {key:35s}: {val}")
-            else:
-                print(f"- {key:35s}: {val:.2f}")
+            metrics['Total GPU Time (s)'] = int(np.ceil(np.sum(iter_times)))
+            metrics['Throughput (samples/sec)'] = np.mean(
+                self._args.batch_size / run_times
+            )
+            metrics['99th_percentile (ms)'] = np.percentile(
+                run_times, q=99, interpolation='lower'
+            ) * 1000
+            metrics['GPU Latency Mean (ms)'] = np.mean(run_times) * 1000
+            metrics['GPU Latency Median (ms)'] = np.median(run_times) * 1000
+            metrics['GPU Latency Min (ms)'] = np.min(run_times) * 1000
+            metrics['GPU Latency Max (ms)'] = np.max(run_times) * 1000
+
+            for key, val in sorted(metrics.items()):
+                if isinstance(val, int):
+                    print(f"- {key:35s}: {val}")
+                else:
+                    print(f"- {key:35s}: {val:.2f}")
+
+            print()  # visual spacing
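The `use_synthetic_data` branch added in `execute_benchmark` relies on a small `tf.data` trick: take one real batch, cache it, and repeat it forever so the timing loop never waits on the input pipeline. The pattern in isolation is shown below, with a toy dataset and batch size standing in for the real ones.

```python
# Toy demonstration of the cached-single-batch ("synthetic data") pattern.
import tensorflow as tf

dataset = tf.data.Dataset.from_tensor_slices(
    tf.random.uniform([64, 224, 224, 3])
).batch(8)

dataset = dataset.take(1)      # keep exactly one real batch
dataset = dataset.cache()      # materialize it in memory
dataset = dataset.repeat()     # then loop over that batch indefinitely
dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

for step, batch in enumerate(dataset.take(5)):
    print(step, batch.shape)   # the same (8, 224, 224, 3) batch every step
```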