Remove the argument custom_training_loop (#2509)

workingloong · web-flow · commit 94029cad6038 · 2021-02-24T10:51:35.000+08:00
* Remove the argument custom_training_loop

* Pre-commit

* Remove code

* Fix unit tests

* pre-commit

* Fix the docstring
diff --git a/elasticdl/python/common/model_utils.py b/elasticdl/python/common/model_utils.py
@@ -191,40 +191,6 @@ def get_model_spec(
     )
 
 
-def get_training_func_spec(
-    model_zoo, model_def, feed, custom_data_reader,
-):
-    """Get the model spec items in a tuple.
-
-    Args:
-        model_zoo: String, the folder name of model files.
-        model_def: The import path to the model definition function/class in
-        the "model zoo".
-        feed: the function name in the model definition file to convert the
-        input data.
-        custom_data_reader: the function name in the model definition file
-        to read data from the storage.
-
-    The model spec tuple contains the following items in order:
-
-    * The `training_func` of training loop.
-    * The `feed`,
-    * The `custom_data_reader`
-    """
-    model_def_module_file = get_module_file_path(model_zoo, model_def)
-    default_module = load_module(model_def_module_file).__dict__
-    training_func_name = model_def.split(".")[-1]
-    training_func = _get_spec_value(
-        training_func_name, model_zoo, default_module, required=True
-    )
-
-    return (
-        training_func,
-        _get_spec_value(feed, model_zoo, default_module, required=False),
-        _get_spec_value(custom_data_reader, model_zoo, default_module),
-    )
-
-
 def find_layer(model, layer_class):
     """
     Find all layers in model that are instances of layer_class
diff --git a/elasticdl/python/master/elasticdl_job_service.py b/elasticdl/python/master/elasticdl_job_service.py
@@ -71,11 +71,7 @@ def __init__(self, args, task_manager, rendezvous_server=None):
             get_module_file_path(args.model_zoo, args.model_def)
         ).__dict__
 
-        self._optimizer = (
-            None
-            if args.custom_training_loop
-            else model_module[args.optimizer]()
-        )
+        self._optimizer = model_module[args.optimizer]()
 
         # TODO: Remove task manage and rendezvous server after
         # refactoring pod manager.
diff --git a/elasticdl/python/master/task_manager.py b/elasticdl/python/master/task_manager.py
@@ -155,8 +155,7 @@ def __init__(
             args.validation_data, args.data_reader_params
         )
         self._set_completed_steps_by_checkpoint(args.checkpoint_dir_for_init)
-        if not args.custom_training_loop:
-            self._add_deferred_callback_create_train_end_task()
+        self._add_deferred_callback_create_train_end_task()
 
         self._max_task_completed_times = {
             elasticai_api_pb2.EVALUATION: 0,
diff --git a/elasticdl/python/tests/elasticdl_job_service_test.py b/elasticdl/python/tests/elasticdl_job_service_test.py
@@ -62,7 +62,6 @@ def test_create_master_for_allreduce(self):
                 temp_dir=temp_dir_name,
             )
             self.arguments["training_data"] = temp_dir_name
-            self.arguments["custom_training_loop"] = "true"
             args = self._get_args()
             args = parse_master_args(args)
             master = ElasticdlJobService(args, TaskManager(args))
@@ -72,8 +71,7 @@ def test_create_master_without_eval(self):
         self.arguments[
             "distribution_strategy"
         ] = DistributionStrategy.ALLREDUCE
-        self.arguments["custom_training_loop"] = "true"
-        self.arguments["model_def"] = "mnist.mnist_train_tfv2.train"
+        self.arguments["model_def"] = "mnist.mnist_functional_api.custom_model"
         with tempfile.TemporaryDirectory() as temp_dir_name:
             create_recordio_file(
                 self._num_records,
diff --git a/elasticdl/python/tests/model_utils_test.py b/elasticdl/python/tests/model_utils_test.py
@@ -22,7 +22,6 @@
     get_model_spec,
     get_module_file_path,
     get_optimizer_info,
-    get_training_func_spec,
 )
 
 _model_zoo_path = os.path.dirname(os.path.realpath(__file__))
@@ -76,20 +75,6 @@ def test_get_model_spec(self):
             callbacks="callbacks",
         )
 
-    def test_training_func_spec(self):
-        model_zoo_path = os.path.join(
-            os.path.dirname(os.path.realpath(__file__)), "../../../model_zoo"
-        )
-        (train_spec, feed, data_reader,) = get_training_func_spec(
-            model_zoo=model_zoo_path,
-            model_def="mnist.mnist_train_tfv2.train",
-            feed="feed",
-            custom_data_reader="custom_data_reader",
-        )
-        self.assertIsNotNone(train_spec)
-        self.assertIsNotNone(feed)
-        self.assertIsNone(data_reader)
-
     def test_get_module_file_path(self):
         self.assertEqual(
             get_module_file_path(_model_zoo_path, "test_module.custom_model"),
diff --git a/elasticdl/python/tests/test_utils.py b/elasticdl/python/tests/test_utils.py
@@ -107,7 +107,6 @@ def __init__(
         model_def="",
         custom_data_reader="custom_data_reader",
         checkpoint_dir_for_init="",
-        custom_training_loop=False,
         task_fault_tolerance=True,
         relaunch_timeout_worker=True,
     ):
@@ -122,7 +121,6 @@ def __init__(
         self.model_def = model_def
         self.custom_data_reader = custom_data_reader
         self.checkpoint_dir_for_init = checkpoint_dir_for_init
-        self.custom_training_loop = custom_training_loop
         self.task_fault_tolerance = task_fault_tolerance
         self.relaunch_timeout_worker = relaunch_timeout_worker
 
diff --git a/elasticdl/python/tests/worker_test.py b/elasticdl/python/tests/worker_test.py
@@ -16,10 +16,8 @@
 
 import tensorflow as tf
 
-from elasticai_api.proto import elasticai_api_pb2
 from elasticdl.python.common.args import parse_worker_args
 from elasticdl.python.worker.worker import Worker
-from elasticdl_client.common.constants import DistributionStrategy
 
 
 class WorkerTest(unittest.TestCase):
@@ -35,29 +33,6 @@ def _create_worker(self, arguments):
         args = parse_worker_args(arguments)
         return Worker(args)
 
-    def test_init_training_func_from_args(self):
-        arguments = [
-            "--worker_id",
-            "0",
-            "--job_type",
-            elasticai_api_pb2.TRAINING,
-            "--minibatch_size",
-            self._batch_size,
-            "--model_zoo",
-            self._model_zoo_path,
-            "--model_def",
-            "mnist.mnist_train_tfv2.train",
-            "--distribution_strategy",
-            DistributionStrategy.ALLREDUCE,
-            "--custom_training_loop",
-            "true",
-        ]
-        worker = self._create_worker(arguments)
-        self.assertIsNotNone(worker._feed)
-        self.assertIsNotNone(worker._training_func)
-        self.assertEqual(worker._minibatch_size, 16)
-        self.assertIsNotNone(worker._task_data_service)
-
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/elasticdl/python/worker/worker.py b/elasticdl/python/worker/worker.py
@@ -25,7 +25,6 @@
 from elasticdl.python.common.model_utils import (
     get_dict_from_params_str,
     get_model_spec,
-    get_training_func_spec,
     set_callback_parameters,
 )
 from elasticdl.python.common.timing_utils import Timing
@@ -84,21 +83,16 @@ def __init__(
         self._timing = Timing(args.log_level.upper() == "DEBUG", self.logger)
         self._log_loss_count = 0
         self._var_created = False
-        self._custom_training_loop = args.custom_training_loop
         self._job_type = args.job_type
         self._minibatch_size = args.minibatch_size
         self._data_shard_service = DataShardService(
             self._mc, self._minibatch_size
         )
-        if self._custom_training_loop:
-            self._init_training_func_from_args(args)
-        else:
-            self._init_model_from_args(args)
+        self._init_model_from_args(args)
         self._init_task_data_service(args)
         self._init_default_feed_if_needed()
-        if not self._custom_training_loop:
-            self._init_callbacks(args)
-            self._init_trainer(args)
+        self._init_callbacks(args)
+        self._init_trainer(args)
 
     def _init_model_from_args(self, args):
         """
@@ -172,19 +166,6 @@ def _init_trainer(self, args):
                 self._model_inst, self._ps_client, self._timing, args
             )
 
-    def _init_training_func_from_args(self, args):
-        self._job_type = args.job_type
-        (
-            self._training_func,
-            self._feed,
-            self._custom_data_reader,
-        ) = get_training_func_spec(
-            model_zoo=args.model_zoo,
-            model_def=args.model_def,
-            feed=args.feed,
-            custom_data_reader=args.custom_data_reader,
-        )
-
     def _init_default_feed_if_needed(self):
         if self._feed is None:
             if hasattr(self._task_data_service.data_reader, "default_feed"):
@@ -465,48 +446,4 @@ def run(self):
         elif self._job_type == JobType.EVALUATION_ONLY:
             self._evaluate_only()
         else:
-            if self._custom_training_loop:
-                self._elastic_allreduce_train()
-            else:
-                self._train_and_evaluate()
-
-    def _elastic_allreduce_train(self):
-        """
-        Train and evaluate the model on the worker
-        """
-        if os.getenv("USE_TORCH", None):
-            from elasticai_api.pytorch.controller import (
-                PyTorchAllReduceController,
-            )
-
-            elastic_controller = PyTorchAllReduceController(
-                self._mc, self._data_shard_service
-            )
-        elif _IS_TF2:
-            from elasticai_api.tensorflow.controller import (
-                TensorFlowV2AllReduceController,
-            )
-
-            elastic_controller = TensorFlowV2AllReduceController(
-                self._mc, self._data_shard_service
-            )
-        else:
-            from elasticai_api.tensorflow.controller import (
-                TensorFlowV1AllReduceController,
-            )
-
-            elastic_controller = TensorFlowV1AllReduceController(
-                self._mc, self._master_addr
-            )
-        # Initialize Horovod locally to generate varibles of the model
-        # and optimizer.
-        elastic_controller.init_horovod_locally()
-        dataset = self._task_data_service.get_dataset()
-        dataset = self._feed(
-            dataset,
-            Mode.TRAINING,
-            self._task_data_service.data_reader.metadata,
-        )
-        dataset = dataset.batch(self._minibatch_size).prefetch(1)
-        self._training_func(dataset, elastic_controller)
-        del dataset
+            self._train_and_evaluate()
diff --git a/elasticdl_client/common/args.py b/elasticdl_client/common/args.py
@@ -140,13 +140,6 @@ def add_train_params(parser):
         help="If True, PS will modulate the learning rate with staleness "
         "in asynchronous SGD",
     )
-    add_bool_param(
-        parser=parser,
-        name="--custom_training_loop",
-        default=False,
-        help="If true, users need to define training loop by themselves. "
-        "Otherwise, users should define a Keras model",
-    )
     add_bool_param(
         parser=parser,
         name="--need_elasticdl_job_service",
diff --git a/model_zoo/mnist/mnist_pytorch.py b/model_zoo/mnist/mnist_pytorch.py
@@ -15,7 +15,7 @@
 Download the mnist dataset from
 https://s3.amazonaws.com/fast-ai-imageclas/mnist_png.tgz
 and then untar it into ${data_store_dir}. Using minikube, we can use the
-following command to submit a training job with the script.
+following command to submit a training job with this code.
 
 elasticdl train \
   --image_name=elasticdl:pt_mnist_allreduce  \
@@ -33,7 +33,6 @@
   --job_name=test-mnist-allreduce \
   --image_pull_policy=Never \
   --volume="host_path=${data_store_dir},mount_path=/local_data" \
-  --custom_training_loop=true \
   --distribution_strategy=AllreduceStrategy \
 """
 

Original file line number	Diff line number	Diff line change
`@@ -155,8 +155,7 @@ def __init__(`
`155`	`155`	`args.validation_data, args.data_reader_params`
`156`	`156`	`)`
`157`	`157`	`self._set_completed_steps_by_checkpoint(args.checkpoint_dir_for_init)`
`158`		`- if not args.custom_training_loop:`
`159`		`- self._add_deferred_callback_create_train_end_task()`
	`158`	`+ self._add_deferred_callback_create_train_end_task()`
`160`	`159`
`161`	`160`	`self._max_task_completed_times = {`
`162`	`161`	`elasticai_api_pb2.EVALUATION: 0,`