Add wrapper for running experiments from TF Model Garden on GCP.

juanuribe28 · Tensorflow Cloud maintainers · commit f19e30866592 · 2021-07-12T08:52:54.000-07:00
PiperOrigin-RevId: 384244039
diff --git a/src/python/tensorflow_cloud/core/experimental/models.py b/src/python/tensorflow_cloud/core/experimental/models.py
@@ -15,12 +15,14 @@
 """Module that contains the `run_models` wrapper for training models from TF Model Garden."""
 
 import os
-from typing import Dict, Optional
+from typing import Any, Dict, Optional
 
+from .. import machine_config
 from .. import run
 import tensorflow as tf
 import tensorflow_datasets as tfds
 
+from official.core import train_lib
 from official.vision.image_classification.efficientnet import efficientnet_model
 from official.vision.image_classification.resnet import resnet_model
 
@@ -224,3 +226,70 @@ def data_pipeline(original_ds, image_size, width_ratio, batch_size, num_classes,
     ds = ds.batch(batch_size, drop_remainder=True)
     ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
     return ds
+
+
+def run_experiment_cloud(run_experiment_kwargs: Dict[str, Any],
+                         run_kwargs: Optional[Dict[str, Any]] = None,
+                         ) -> Optional[Dict[str, str]]:
+    """Runs a TF Model Garden experiment on GCP through the `run` API.
+
+    Wraps `tfc.run` and tf-models-official `train_lib.run_experiment`. When
+    `run.remote()` is true, the experiment is executed with a distribution
+    strategy derived from the machine configuration in `run_kwargs`. In
+    every case `run.run` is then called with `run_kwargs`. Neither input
+    dictionary is modified; updated copies are passed to the wrapped
+    functions instead.
+
+    Args:
+        run_experiment_kwargs: keyword arguments for
+            `train_lib.run_experiment`. The docs can be found at
+            https://github.com/tensorflow/models/blob/master/official/core/train_lib.py
+            Any `distribution_strategy` entry is ignored because the
+            distribution strategy is selected based on `run_kwargs`.
+        run_kwargs: keyword arguments for `tfc.run`. The docs can be found at
+            https://github.com/tensorflow/cloud/blob/master/src/python/tensorflow_cloud/core/run.py
+            The params `entry_point` and `distribution_strategy` are ignored
+            (both are always passed as None). Defaults to no arguments.
+
+    Returns:
+        The value returned by `run.run`: either None or a dictionary with
+        two keys.
+            1. 'job_id': the training job id.
+            2. 'docker_image': Docker image generated for the training job.
+    """
+    # Copy so that the caller's dictionary is never mutated.
+    run_kwargs = dict(run_kwargs) if run_kwargs is not None else {}
+
+    if run.remote():
+        # Machine settings missing from run_kwargs fall back to a T4_1X chief,
+        # zero workers and a T4_1X worker configuration.
+        default_machine_config = machine_config.COMMON_MACHINE_CONFIGS['T4_1X']
+        distribution_strategy = get_distribution_strategy(
+            run_kwargs.get('chief_config', default_machine_config),
+            run_kwargs.get('worker_count', 0),
+            run_kwargs.get('worker_config', default_machine_config))
+        # Build a new dict rather than updating the caller's in place.
+        experiment_kwargs = dict(run_experiment_kwargs,
+                                 distribution_strategy=distribution_strategy)
+        train_lib.run_experiment(**experiment_kwargs)
+
+    # Caller-supplied entry_point and distribution_strategy are overridden.
+    run_kwargs.update(entry_point=None,
+                      distribution_strategy=None)
+    return run.run(**run_kwargs)
+
+
+def get_distribution_strategy(chief_config, worker_count, worker_config):
+    """Picks the tf.distribute strategy matching the cloud machine setup."""
+    if worker_count <= 0:
+        # No workers: mirror over a multi-accelerator chief, else use '/gpu:0'.
+        if chief_config.accelerator_count > 1:
+            return tf.distribute.MirroredStrategy()
+        return tf.distribute.OneDeviceStrategy(device='/gpu:0')
+    if not machine_config.is_tpu_config(worker_config):
+        return tf.distribute.MultiWorkerMirroredStrategy()
+    # TPU workers: connect to the cluster and initialize the TPU system.
+    tpu_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
+    tf.config.experimental_connect_to_cluster(tpu_resolver)
+    tf.tpu.experimental.initialize_tpu_system(tpu_resolver)
+    return tf.distribute.TPUStrategy(tpu_resolver)
diff --git a/src/python/tensorflow_cloud/core/experimental/tests/unit/models_test.py b/src/python/tensorflow_cloud/core/experimental/tests/unit/models_test.py
@@ -18,8 +18,11 @@
 import mock
 
 import tensorflow as tf
+from tensorflow_cloud.core import machine_config
 from tensorflow_cloud.core import run
 from tensorflow_cloud.core.experimental import models
+from official.core import config_definitions
+from official.core import train_lib
 from official.vision.image_classification.efficientnet import efficientnet_model
 
 
@@ -41,12 +44,17 @@ def setup_normalize_img_and_label(self):
                                    3]
         self.label = tf.convert_to_tensor(4)
 
-    def setup_run_models(self, run_return_value=None, remote=True):
+    def setup_run(self, remote=True):
+        if remote:
+            self.run_return_value = None
+        else:
+            self.run_return_value = {'job_id': 'job_id',
+                                     'docker_image': 'docker_image'}
         self.run = mock.patch.object(
             run,
             'run',
             autospec=True,
-            return_value=run_return_value,
+            return_value=self.run_return_value,
         ).start()
 
         self.remote = mock.patch.object(
@@ -56,14 +64,29 @@ def setup_run_models(self, run_return_value=None, remote=True):
             return_value=remote,
         ).start()
 
+    def setup_run_models(self):
         self.classifier_trainer = mock.patch.object(
             models,
             'classifier_trainer',
             autospec=True,
         ).start()
 
-    def cleanup_run_models(self):
+    def setup_run_experiment(self):
+        """Builds default experiment kwargs and mocks `run_experiment`."""
+        experiment_config = config_definitions.ExperimentConfig()
+        self.run_experiment_kwargs = {
+            'task': experiment_config.task,
+            'mode': 'train_and_eval',
+            'params': experiment_config,
+            'model_dir': 'model_path',
+        }
+        run_experiment_patcher = mock.patch.object(
+            train_lib, 'run_experiment', autospec=True)
+        self.run_experiment = run_experiment_patcher.start()
+
+    def tearDown(self):
         mock.patch.stopall()
+        super(ModelsTest, self).tearDown()
 
     def test_get_model_resnet(self):
         self.setup_get_model()
@@ -114,10 +137,8 @@ def test_normalize_image_and_label_with_one_hot(self):
         self.assertTrue((result_label == expected_label).numpy().all())
 
     def test_run_models_locally(self):
-        run_return = {'job_id': 'job_id',
-                      'docker_image': 'docker_image'}
-
-        self.setup_run_models(run_return, remote=False)
+        self.setup_run(remote=False)
+        self.setup_run_models()
         run_kwargs = {'entry_point': 'entry_point',
                       'requirements_txt': 'requirements_txt',
                       'worker_count': 5,}
@@ -130,9 +151,8 @@ def test_run_models_locally(self):
                        'model_checkpoint', 'save_model']
         self.assertListEqual(list(result.keys()), return_keys)
 
-        self.cleanup_run_models()
-
     def test_run_models_remote(self):
+        self.setup_run()
         self.setup_run_models()
         result = models.run_models('dataset_name', 'model_name', 'gcs_bucket',
                                    'train')
@@ -142,7 +162,80 @@ def test_run_models_remote(self):
 
         self.assertIsNone(result)
 
-        self.cleanup_run_models()
+    def test_run_experiment_cloud_locally(self):
+        """With `remote` mocked False, only `run` is invoked."""
+        self.setup_run(remote=False)
+        self.setup_run_experiment()
+        experiment_kwargs = self.run_experiment_kwargs
+        models.run_experiment_cloud(run_experiment_kwargs=experiment_kwargs)
+        self.remote.assert_called()
+        self.run_experiment.assert_not_called()
+        self.run.assert_called()
+
+    def test_run_experiment_cloud_remote(self):
+        """With `remote` mocked True, the experiment and `run` both run."""
+        self.setup_run()
+        self.setup_run_experiment()
+        experiment_kwargs = self.run_experiment_kwargs
+        models.run_experiment_cloud(run_experiment_kwargs=experiment_kwargs)
+        self.remote.assert_called()
+        self.run_experiment.assert_called()
+        self.run.assert_called()
+
+    def setup_tpu(self):
+        """Patches the TPU APIs used by `get_distribution_strategy`."""
+        for owner, attr in ((tf.tpu.experimental, 'initialize_tpu_system'),
+                            (tf.config, 'experimental_connect_to_cluster')):
+            mock.patch.object(owner, attr, autospec=True).start()
+        target = 'tensorflow.distribute.cluster_resolver.TPUClusterResolver'
+        mock.patch(target).start()
+        # Mock instance that passes isinstance checks against the real
+        # TPUStrategy class (read here, before the class itself is patched).
+        fake_strategy = mock.MagicMock()
+        fake_strategy.__class__ = tf.distribute.TPUStrategy
+        mock.patch('tensorflow.distribute.TPUStrategy',
+                   return_value=fake_strategy).start()
+
+    def test_get_distribution_strategy_tpu(self):
+        """A TPU worker config should yield a TPUStrategy."""
+        # Grab the real class first: setup_tpu replaces it with a mock.
+        real_tpu_strategy_cls = tf.distribute.TPUStrategy
+        self.setup_tpu()
+        tpu_config = machine_config.COMMON_MACHINE_CONFIGS['TPU']
+        strategy = models.get_distribution_strategy(
+            chief_config=None,
+            worker_count=1,
+            worker_config=tpu_config)
+        self.assertIsInstance(strategy, real_tpu_strategy_cls)
+
+    def test_get_distribution_strategy_multi_mirror(self):
+        """One worker without a TPU config yields the multi-worker strategy."""
+        strategy = models.get_distribution_strategy(
+            chief_config=None,
+            worker_count=1,
+            worker_config=None,
+        )
+        expected_cls = tf.distribute.MultiWorkerMirroredStrategy
+        self.assertIsInstance(strategy, expected_cls)
+
+    def test_get_distribution_strategy_mirror(self):
+        """A K80_4X chief with no workers yields a MirroredStrategy."""
+        multi_gpu_chief = machine_config.COMMON_MACHINE_CONFIGS['K80_4X']
+        strategy = models.get_distribution_strategy(
+            chief_config=multi_gpu_chief,
+            worker_count=0,
+            worker_config=None)
+        self.assertIsInstance(strategy, tf.distribute.MirroredStrategy)
+
+    def test_get_distribution_strategy_one_device(self):
+        """A K80_1X chief with no workers yields a OneDeviceStrategy."""
+        single_gpu_chief = machine_config.COMMON_MACHINE_CONFIGS['K80_1X']
+        strategy = models.get_distribution_strategy(
+            chief_config=single_gpu_chief,
+            worker_count=0,
+            worker_config=None)
+        self.assertIsInstance(strategy, tf.distribute.OneDeviceStrategy)
+
 
 if __name__ == '__main__':
   absltest.main()