Commit 8a2222a

behavior version 2: disallowing boolean optimizer (#563)
1 parent: 8ae8d47

26 files changed: +50 −42 lines
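
Behavior version 2 rejects the old boolean optimizer flags in RETURNN configs (which are plain Python files): a setting like adam = True is now an error, and the optimizer is instead selected through a single dict-valued optimizer setting whose "class" key names the optimizer. A minimal before/after sketch of the migration, assuming a config that opts in via RETURNN's behavior_version setting (the companion values are illustrative, taken from the hunks below):

    # Old style, disallowed under behavior version 2:
    # adam = True

    # New style, as applied throughout this commit:
    behavior_version = 2           # opt into the stricter checks
    optimizer = {"class": "adam"}  # "class" names the optimizer
    optimizer_epsilon = 1e-8       # companion settings stay unchanged
    learning_rate = 0.01

The dict form has one clear advantage over per-optimizer booleans: there is exactly one place that says which optimizer is active, so two flags can never silently conflict.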

demos/demo-hyper-param-tuning.config (1 addition & 1 deletion)

@@ -36,7 +36,7 @@ network = {
 }
 
 # training
-adam = True
+optimizer = {"class": "adam"}
 optimizer_epsilon = HyperParam(float, [1e-16, 1], log=True, default=1e-16)
 decouple_constraints = HyperParam(bool)
 learning_rate = HyperParam(float, [1e-6, 1], log=True, default=0.01)

demos/demo-returnn-as-framework.py (1 addition & 1 deletion)

@@ -37,7 +37,7 @@
 },
 
 # training
-adam=True,
+optimizer={'class': 'adam'},
 learning_rate=0.01,
 num_epochs=5,
 debug_add_check_numerics_ops=True,

demos/demo-rhn-enwik8.config (1 addition & 1 deletion)

@@ -57,7 +57,7 @@ model = "/tmp/%s/returnn/%s/model" % (get_login_username(), demo_name)
 cleanup_old_models = True
 gradient_clip = 0
 #gradient_clip_global_norm = 1.0
-adam = True
+optimizer = {"class": "adam"}
 optimizer_epsilon = 1e-8
 #debug_add_check_numerics_ops = True
 #debug_add_check_numerics_on_output = True

demos/demo-tf-att-copy.config (1 addition & 1 deletion)

@@ -39,7 +39,7 @@ network = {
 "output": {"class": "softmax", "from": ["decoder"], "loss": "ce", "grad_filter": 1.0}
 }
 
-adam = True
+optimizer = {"class": "adam"}
 learning_rate = 0.01
 gradient_noise = 0.3
 gradient_clip = 2

demos/demo-tf-attention.config (1 addition & 1 deletion)

@@ -53,7 +53,7 @@ chunking = "0"
 truncation = -1
 #gradient_clip = 10
 gradient_nan_inf_filter = True
-adam = True
+optimizer = {"class": "adam"}
 gradient_noise = 0.3
 learning_rate = 0.0005
 learning_rate_control = "newbob"

demos/demo-tf-chunking-blstm.12ax.config (1 addition & 1 deletion)

@@ -34,7 +34,7 @@ network = {
 }
 
 # training
-adam = True
+optimizer = {"class": "adam"}
 learning_rate = 0.01
 model = "/tmp/%s/returnn/%s/model" % (get_login_username(), demo_name) # https://github.com/tensorflow/tensorflow/issues/6537
 num_epochs = 100

demos/demo-tf-contribrnn-lstm.12ax.config (1 addition & 1 deletion)

@@ -25,7 +25,7 @@ network = {
 }
 
 # training
-adam = True
+optimizer = {"class": "adam"}
 learning_rate = 0.01
 model = "/tmp/%s/returnn/%s/model" % (os.getlogin(), demo_name) # https://github.com/tensorflow/tensorflow/issues/6537
 num_epochs = 100

demos/demo-tf-enc-dec.config (1 addition & 1 deletion)

@@ -74,7 +74,7 @@ batching = "random"
 batch_size = 5000
 max_seqs = 40
 chunking = "0"
-adam = True
+optimizer = {"class": "adam"}
 learning_rate = 0.0005
 learning_rate_control = "newbob"
 learning_rate_control_relative_error_relative_lr = True

demos/demo-tf-hard-att-copy.config (1 addition & 1 deletion)

@@ -245,7 +245,7 @@ def pretrain_construct(idx, net_dict):
 pretrain = {"construction_algo": pretrain_construct}
 
 stop_on_nonfinite_train_score = False
-adam = True
+optimizer = {"class": "adam"}
 learning_rate = 0.01
 #learning_rate = 0.001
 model = "/tmp/%s/returnn/%s/model" % (get_login_username(), demo_name)

demos/demo-tf-lstm-benchmark.py (1 addition & 1 deletion)

@@ -121,7 +121,7 @@ def make_config_dict(lstm_unit, use_gpu):
 "max_seqs": base_settings["max_seqs"],
 "chunking": base_settings["chunking"],
 # optimization
-"adam": True,
+"optimizer": {"class": "adam"},
 "learning_rate": 0.01}
 
 