99
1010@rfm .simple_test
1111class tensorflow_cnn_check (rfm .RunOnlyRegressionTest , pin_prefix = True ):
12- '''Base class for the TensorFlow2 Horovod Test.
13-
14- TensorFlow is an end-to-end open source platform for machine
15- learning. It has a comprehensive, flexible ecosystem of tools,
16- libraries and community resources that lets researchers push the
17- state-of-the-art in ML and developers easily build and deploy ML
18- powered applications. (see tensorflow.org).
19-
20- Horovod is a distributed deep learning training
21- framework for TensorFlow, Keras, PyTorch, and Apache
22- MXNet. The goal of Horovod is to make distributed deep
23- learning fast and easy to use (see github.com/horovod/horovod).
24-
25- This test tests the performance of TensorFlow2 and Horovod using
26- classic deep learning model Inception v3. It checks whether learning is
27- performed to the end. The default assumption
28- is that TensorFlow2 and Horovod is already installed on the device
29- under test.
12+ '''Run a synthetic CNN benchmark with TensorFlow2 and Horovod.
13+
14+ TensorFlow is an end-to-end open source platform for machine learning. It
15+ has a comprehensive, flexible ecosystem of tools, libraries and community
16+ resources that lets researchers push the state-of-the-art in ML and
17+ developers easily build and deploy ML powered applications. For more
18+ information, refer to `<https://www.tensorflow.org/>`__.
19+
20+ Horovod is a distributed deep learning training framework for TensorFlow,
21+ Keras, PyTorch, and Apache MXNet. The goal of Horovod is to make
22+ distributed deep learning fast and easy to use. For more information, refer
23+ to `<https://github.com/horovod/horovod>`__.
24+
25+ This test runs the Horovod ``tensorflow2_synthetic_benchmark.py``
26+ example, checks its sanity and extracts the GPU performance.
3027 '''
3128
29+ #: The version of Horovod to use.
30+ #:
31+ #: :type: :class:`str`
32+ #: :default: ``'v0.21.0'``
3233 benchmark_version = variable (str , value = 'v0.21.0' )
3334
34- # Name of the model used for the testing
35+ #: The name of the model to use for this benchmark.
36+ #:
37+ #: :type: :class:`str`
38+ #: :default: ``'InceptionV3'``
3539 model = variable (str , value = 'InceptionV3' )
3640
37- # Size of the batch used during the learning of models
41+ #: The size of the batch used during the learning of models.
42+ #:
43+ #: :type: :class:`int`
44+ #: :default: ``32``
3845 batch_size = variable (int , value = 32 )
3946
47+ #: The number of iterations.
48+ #:
49+ #: :type: :class:`int`
50+ #: :default: ``5``
4051 num_iters = variable (int , value = 5 )
52+
53+ #: The number of batches per iteration.
54+ #:
55+ #: :type: :class:`int`
56+ #: :default: ``5``
4157 num_batches_per_iter = variable (int , value = 5 )
58+
59+ #: The number of warmup batches.
60+ #:
61+ #: :type: :class:`int`
62+ #: :default: ``5``
4263 num_warmup_batches = variable (int , value = 5 )
4364
4465 executable = 'python tensorflow2_synthetic_benchmark.py'
@@ -65,12 +86,14 @@ def prepare_test(self):
6586
6687 @performance_function ('images/s' )
6788 def throughput_iteration (self ):
89+ '''The average GPU throughput per iteration in ``images/s``.'''
6890 return sn .avg (
6991 sn .extractall (r'Img/sec per GPU: (\S+) \S+' , self .stdout , 1 , float )
7092 )
7193
7294 @performance_function ('images/s' )
7395 def throughput_total (self ):
96+ '''The total GPU throughput of the benchmark in ``images/s``.'''
7497 return sn .extractsingle (
7598 rf'Total img/sec on { self .num_tasks } GPU\(s\): (\S+) \S+' ,
7699 self .stdout , 1 , float
0 commit comments