
Commit 9150f56

ds-hwang authored and copybara-github committed
Save the current EMA variables to the checkpoint, instead of t-1.

Currently, Lingvo saves EMA(t-1) to the t-th checkpoint, because the train_op is built as:

  def ConstructFPropBPropGraph(self):
    self.ApplyExponentialMovingAverage()  # <-- updates the EMA vars from var_t-1 (i.e. ema_t-1)
    self._task.FPropDefaultTheta()
    self._task.BProp()                    # <-- updates var_t-1 to var_t

Running ema.apply() after BProp makes more sense, since it would update var_t first and then compute ema_t at global step t. Lingvo runs ema.apply() first because this method is used both for graph construction and for the train step: during graph construction, self._task.FPropDefaultTheta() may need the EMA variables (e.g. EMBR, an EMA teacher, and so on), so ConstructFPropBPropGraph() calls ema.apply() first to ensure the EMA variables exist. This causes the following confusion:

* After train_op, var_t and ema_t-1 are saved into the checkpoint at step t.
* The Evaler/Decoder step_ops are very confusing.

This issue can be solved by separating EMA variable creation from the EMA update, although both use the same TF API, ema.apply(). In particular, it does not make sense for a graph-construction method to construct EMA variables. EMA variables are variables, so it makes more sense for the variable-creation method (i.e. mdl.Instantiate()) to create them. This CL makes that change.

After the change, ConstructFPropBPropGraph() no longer needs to call ema.apply() first, because the EMA variables already exist, and the odd ordering of the train_op is fixed. The effect is even more dramatic for the evaler and decoder step_ops: currently, the Evaler's ConstructFPropGraph() and the Decoder's ConstructDecodeGraph() have to call ema.apply() only because the EMA variables need to be created. The Evaler/Decoder must not update EMA variables; that is the Trainer's job. They use the EMA variables from the checkpoint or from the trainer. Now we can remove the odd ema.apply() calls from the evaler/decoder step_ops (i.e. pay down that tech debt).

PiperOrigin-RevId: 488507349
1 parent 3bdda3f commit 9150f56
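
To make the ordering issue concrete, here is a minimal runnable sketch in plain TF1-style graph mode (plain TensorFlow, not Lingvo code; the variable, decay value, and numbers are illustrative). It shows the two facts behind this change: ema.apply() both creates the shadow (EMA) variables and returns their update op, and only an update op sequenced after the gradient step averages var_t rather than var_t-1:

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

var = tf.get_variable('w', initializer=0.0)
ema = tf.train.ExponentialMovingAverage(decay=0.5)

grad_step = var.assign_add(1.0)  # Stand-in for BProp: var_t-1 -> var_t.
with tf.control_dependencies([grad_step]):
  # apply() creates the shadow variable AND returns its update op. Because the
  # update op is created under the control dependency, it averages var_t.
  ema_update = ema.apply([var])
train_op = tf.group(grad_step, ema_update, name='train_op')

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  sess.run(train_op)
  # var_t == 1.0 and ema_t == 0.5 * 0.0 + 0.5 * 1.0 == 0.5, so a checkpoint taken
  # now would hold ema_t instead of the stale ema_t-1 == 0.0.
  print(sess.run([var, ema.average(var)]))

Because creation and update share the single ema.apply() call, the old code had to invoke it during graph construction just to bring the EMA variables into existence; this commit splits those two roles so the update can be sequenced where it belongs.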

File tree

4 files changed: +84 -79 lines changed

  lingvo/core/base_layer.py
  lingvo/core/base_model.py
  lingvo/core/ema_test.py
  lingvo/core/program.py


lingvo/core/base_layer.py
Lines changed: 15 additions & 2 deletions

@@ -1113,18 +1113,31 @@ def MatchKeys(x, y):
     for k in self.theta.keys():
       assert k in self.vars or k in self._extra_theta
 
-  def PostTrainingStepUpdate(self):
+  def PostTrainingStepUpdate(self) -> tf.Operation:
     """Returns a TF op which will be invoked at each training step.
 
     Subclasses of `BaseLayer` can implement this method. The method should
-    return a TF op to be invoked during training after gradients are applied.
+    return a TF op to be invoked during training after gradients are applied and
+    before EMA is updated.
     """
     update_ops = [
         child.PostTrainingStepUpdate()
         for child in py_utils.Flatten(self._private_children)
     ]
     return tf.group(*update_ops)
 
+  def PostEmaUpdate(self) -> tf.Operation:
+    """Returns a TF op which will be invoked at each training step.
+
+    Subclasses of `BaseLayer` can implement this method. The method should
+    return a TF op to be invoked during training after EMA is updated.
+    """
+    update_ops = [
+        child.PostEmaUpdate()
+        for child in py_utils.Flatten(self._private_children)
+    ]
+    return tf.group(*update_ops)
+
   def _CastToFPropDtype(self, value):
 
     def _Cast(x):
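
The new PostEmaUpdate() hook mirrors PostTrainingStepUpdate() but fires after the EMA variables have been refreshed. A hedged sketch of how a subclass might use it; the layer and its quantization-flavored use case are illustrative, not part of this commit:

from lingvo import compat as tf
from lingvo.core import base_layer


class ExampleQuantLayer(base_layer.BaseLayer):
  """Hypothetical layer that consumes freshly updated EMA values."""

  def PostEmaUpdate(self) -> tf.Operation:
    # Runs after PostTrainingStepUpdate() and ApplyExponentialMovingAverage(),
    # i.e. once ema_t for this step is available (e.g. to refresh quant_vars).
    child_ops = super().PostEmaUpdate()         # Preserve the children's ops.
    my_op = tf.no_op(name='post_ema_example')   # Placeholder for real work.
    return tf.group(child_ops, my_op)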

lingvo/core/base_model.py
Lines changed: 65 additions & 59 deletions

@@ -16,6 +16,7 @@
 
 import collections
 import dataclasses
+import functools
 import re
 from typing import Dict, Union
 

@@ -748,12 +749,21 @@ def _BPropGenTrainOps(self, vmap, metrics=None, add_summary=True):
     var_update_ops = [
         tf.group(*tf.nest.flatten(train_ops), name='var_update_ops')
     ]
-    # Post training step update.
+    # Post training step update. It may update non-trainable vars, which have
+    # EMA variables.
     with tf.control_dependencies(var_update_ops):
       post_step_op = self.PostTrainingStepUpdate()
 
-    train_ops = {}
+    # EMA update after all EMA reference variables are updated.
     with tf.control_dependencies([post_step_op]):
+      ema_update_op = self.ApplyExponentialMovingAverage()
+
+    # Post EMA update, which depends on the updated EMA vars. e.g. quant_vars
+    with tf.control_dependencies([ema_update_op]):
+      post_ema_op = self.PostEmaUpdate()
+
+    train_ops = {}
+    with tf.control_dependencies([post_ema_op]):
       # Get the op to update the weight masks and thresholds
       mask_update_op = self._GetMaskUpdateOp()
       train_ops['mask_updates'] = mask_update_op

@@ -813,30 +823,28 @@ def _ComputeGradientMask(self, bprop_variable_filters):
       self._per_input_gradient_mask[var.name] += (
           tf.one_hot(i, len(bprop_variable_filters), dtype=tf.float32))
 
-  def ApplyExponentialMovingAverage(self, ema):
-    """Wraps `self.train_op` with an op updating exponential moving average."""
+  def CreateExponentialMovingAverage(self, ema):
+    """Create exponential moving average variables."""
     if not ema:
       # EMA not enabled.
       return
 
-    all_vars = _VariablesForEMA(self.params, self.vars.Flatten())
-    # For ExecutorTpu: `ema.apply()` below creates stateful variable update
-    # operations, and due to the use of tf.function in the tpu training loop,
-    # these update ops will be added as (implicit) control dependencies to
-    # the step function of eval/decode program. To avoid updating EMA variables,
-    # we run `ema.apply()` only in two cases: 1) in train program, or
-    # 2) in the first eval or decode program when there is no train program.
-    # It'll still apply the update in every eval/decode step even though
-    # the update is not materialized into checkpoint, but experiment shows it
-    # doesn't affect eval/decode metrics.
-    if self.do_eval:
-      need_ema_apply = any([ema.average(var) is None for var in all_vars])
-      if need_ema_apply:
-        assert all([ema.average(var) is None for var in all_vars
-                   ]), ('We never update EMA partially.')
-      else:
-        # Trainer already created EMA variables.
-        return
+    tf.logging.info('CreateExponentialMovingAverage on %s', self)
+    # Use empty name here so no prefix is added to the EMA variable names.
+    # Pin EMA variables to CPU if needed.
+    # The scope: GetLingvoVariableCreator(MaybePinVarsToCpu(_ApplyEMA(...)))
+    scoped_apply_ema = self._ApplyEMA
+    for scoped_creator in (py_utils.MaybePinVarsToCpu,
+                           py_utils.GetLingvoVariableCreator('', '')):
+      scoped_apply_ema = functools.partial(scoped_creator, scoped_apply_ema)
+    scoped_apply_ema(ema=ema)
+
+  def ApplyExponentialMovingAverage(self):
+    """Wraps `self.train_op` with an op updating exponential moving average."""
+    ema = self.ema
+    if not ema:
+      # EMA not enabled.
+      return tf.no_op()
 
     # Make sure this is called at most once in a graph. In eager mode, the outer
     # tf.function will be traced multiple times in different function graphs.

@@ -849,12 +857,14 @@ def ApplyExponentialMovingAverage(self, ema):
       self._graphs_applied_ema.add(graph)
 
     tf.logging.info('ApplyExponentialMovingAverage on %s', self)
-
-    def ApplyEma():
-      with tf.name_scope('moving_average'):
-        self._post_train_ops.append(ema.apply(all_vars))
     # Use empty name here so no prefix is added to the EMA variable names.
-    py_utils.GetLingvoVariableCreator('', '')(ApplyEma)
+    scoped_creator = py_utils.GetLingvoVariableCreator('', '')
+    return scoped_creator(self._ApplyEMA, ema=ema)
+
+  def _ApplyEMA(self, ema):
+    all_vars = _VariablesForEMA(self.params, self.vars.Flatten())
+    with tf.name_scope('moving_average'):
+      return ema.apply(all_vars)
 
   # TODO(blee): Rename Decode->DecodeWithDefaultTheta, DecodeWithTheta->Decode.
   def Decode(self, input_batch):

@@ -1199,18 +1209,15 @@ def _MakeEMAVariablesDict(self):
   def ConstructFPropBPropGraph(self):
     raise NotImplementedError('Abstract method')
 
-  def ConstructFPropGraph(self, apply_ema=False):
+  def ConstructFPropGraph(self):
     raise NotImplementedError('Abstract method')
 
-  def ConstructDecodeGraph(self, task_name=None, apply_ema=False):
+  def ConstructDecodeGraph(self, task_name=None):
     raise NotImplementedError('Abstract method')
 
   def ConstructPostTrainingLoop(self, outfeed=None):
     raise NotImplementedError('Abstract method')
 
-  def ApplyExponentialMovingAverage(self):
-    raise NotImplementedError('Abstract method')
-
   @property
   def tasks(self):
     """Returns a list of all tasks."""

@@ -1276,6 +1283,16 @@ class SingleTaskBase(BaseModel):
   def __init__(self, params, **kwargs):
     super().__init__(params, **kwargs)
 
+  def _CreateLayerVariables(self) -> None:
+    super()._CreateLayerVariables()
+    # CPU evaler doesn't create EMA variables. It loads EMA variables to
+    # regular variables.
+    use_ema = self.ema and (not self.do_eval or self.use_ema_for_theta)
+    # All variables of the model are created. Now create EMA variables.
+    if use_ema:
+      self._task.CreateExponentialMovingAverage(self.ema)
+      self._MakeEMAVariablesDict()
+
   @property
   def tasks(self):
     return [self._task]

@@ -1287,29 +1304,16 @@ def GetTask(self, task_name=None):
   def SampleTask(self, global_step):
     return self._task
 
-  def ApplyExponentialMovingAverage(self):
-    if self.ema:
-      self._task.ApplyExponentialMovingAverage(self.ema)
-      # ConstructFPropGraph/ConstructDecodeGraph also need this to ensure that
-      # ema vars are loaded from checkpoint even when no training is done.
-      self._MakeEMAVariablesDict()
-
   def ConstructFPropBPropGraph(self):
-    self.ApplyExponentialMovingAverage()
     self._task.FPropDefaultTheta()
     self._task.BProp()
 
-  def ConstructFPropGraph(self, apply_ema=False):
-    if apply_ema:
-      self.ApplyExponentialMovingAverage()
+  def ConstructFPropGraph(self):
     self._task.FPropDefaultTheta()
 
   def ConstructDecodeGraph(self,
                            task_name=None,
-                           apply_ema=False,
                            input_batch=None):
-    if apply_ema:
-      self.ApplyExponentialMovingAverage()
     with py_utils.TaskCallScope(self._task):
       if not input_batch:
         input_batch = self._task.GetInputBatch()

@@ -1511,6 +1515,19 @@ def __init__(self, params, **kwargs):
 
     self.CreateChild('task_schedule', p.task_schedule)
 
+  def _CreateLayerVariables(self) -> None:
+    super()._CreateLayerVariables()
+    # CPU evaler doesn't create EMA variables. It loads EMA variables to
+    # regular variables.
+    use_ema = self.ema and (not self.do_eval or self.use_ema_for_theta)
+    # All variables of the model are created. Now create EMA variables.
+    if use_ema:
+      for task_name in self.task_names:
+        with tf.name_scope(task_name):
+          task = self.GetTask(task_name)
+          task.CreateExponentialMovingAverage(self.ema)
+      self._MakeEMAVariablesDict()
+
   def _child_variable_scope_override(self):
     p = self.params
     res = super()._child_variable_scope_override()

@@ -1545,33 +1562,22 @@ def SampleTask(self, global_step):
     tf.logging.info('Sampled task: %s', sampled_task)
     return self.children[sampled_task]
 
-  def ApplyExponentialMovingAverage(self):
-    if self.ema:
-      for task_name in self.task_names:
-        with tf.name_scope(task_name):
-          task = self.GetTask(task_name)
-          task.ApplyExponentialMovingAverage(self.ema)
-      self._MakeEMAVariablesDict()
-
   def ConstructFPropBPropGraph(self):
     for task_name in self.task_names:
       with tf.name_scope(task_name):
-        self.ApplyExponentialMovingAverage()
         task = self.GetTask(task_name)
         task.FPropDefaultTheta()
         task.BProp()
 
-  def ConstructFPropGraph(self, apply_ema=False):
-    assert not apply_ema
+  def ConstructFPropGraph(self):
     for task_name in self.task_names:
       with tf.name_scope(task_name):
        task = self.GetTask(task_name)
        # Note: this is for CPU-based eval only where the variables are already
        # loaded as EMA variables, so we don't need to apply EMA.
        task.FPropDefaultTheta()
 
-  def ConstructDecodeGraph(self, task_name=None, apply_ema=False):
-    assert not apply_ema
+  def ConstructDecodeGraph(self, task_name=None):
     if not task_name:
       raise ValueError(
           'It can decode only one task at a time, but task_name is not set.')
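
The loop in CreateExponentialMovingAverage() above builds the nested call GetLingvoVariableCreator('', '')(MaybePinVarsToCpu(_ApplyEMA, ema=ema)) out of functools.partial. A small self-contained toy of that wrapping pattern, assuming each scoped helper takes a callable plus keyword arguments and invokes it inside its scope (the print-based scope functions below are stand-ins, not the real py_utils helpers):

import functools


def outer_scope(fn, **kwargs):   # Stand-in for py_utils.GetLingvoVariableCreator('', '').
  print('enter outer scope')
  result = fn(**kwargs)
  print('exit outer scope')
  return result


def inner_scope(fn, **kwargs):   # Stand-in for py_utils.MaybePinVarsToCpu.
  print('enter inner scope')
  result = fn(**kwargs)
  print('exit inner scope')
  return result


def apply_ema(ema):              # Stand-in for the diff's _ApplyEMA.
  print('ema.apply() would run here with', ema)


wrapped = apply_ema
for scoped_creator in (inner_scope, outer_scope):
  wrapped = functools.partial(scoped_creator, wrapped)

wrapped(ema='ema-object')
# Prints: enter outer scope / enter inner scope / ema.apply() would run here ...
#         / exit inner scope / exit outer scope

The outermost wrapper is the last entry in the tuple, which is why GetLingvoVariableCreator ends up as the outer scope in the comment above.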

lingvo/core/ema_test.py
Lines changed: 1 addition & 7 deletions

@@ -56,8 +56,6 @@ def testBatchNormLayer(self):
     model = p.Instantiate()
     self.assertIsNotNone(model.ema)
     task = model._task
-    task._train_op = tf.no_op()
-    task.ApplyExponentialMovingAverage(model.ema)
 
     layer = task.encoder
     self.assertLen(layer.vars, 4)

@@ -77,7 +75,7 @@ def testBatchNormLayer(self):
     self.evaluate(tf.assign(py_utils.GetOrCreateGlobalStepVar(), global_step))
     self.evaluate(tf.assign(beta, beta_1))
     self.evaluate(tf.assign(mean, mean_1))
-    self.evaluate(task._post_train_ops)
+    self.evaluate(task.ApplyExponentialMovingAverage())
 
     self.assertAllClose([beta_1, beta_1_ema, mean_1, mean_1_ema],
                         self.evaluate([

@@ -101,8 +99,6 @@ def testBatchNormLayer(self):
     model = p.Instantiate()
     self.assertIsNotNone(model.ema)
     task = model._task
-    task._train_op = tf.no_op()
-    task.ApplyExponentialMovingAverage(model.ema)
     layer = task.encoder
     for var in layer.vars.Flatten():
       self.assertIsNotNone(model.ema.average(var), msg=var.name)

@@ -155,8 +151,6 @@ def testBatchNormLayer(self):
     model = p.Instantiate(executor_ema=executor_ema)
     self.assertIsNotNone(model.ema)
     task = model._task
-    task._train_op = tf.no_op()
-    task.ApplyExponentialMovingAverage(model.ema)
     layer = task.encoder
     for var in layer.vars.Flatten():
       self.assertIsNotNone(model.ema.average(var), msg=var.name)

lingvo/core/program.py
Lines changed: 3 additions & 11 deletions

@@ -798,8 +798,7 @@ def TpuEvalStep(self, *args):
       Summed eval metrics.
     """
     with tf.name_scope('tpu_eval'):
-      # Applies EMA if applicable to support running only eval/decode programs.
-      self._model.ConstructFPropGraph(apply_ema=True)
+      self._model.ConstructFPropGraph()
       per_step_eval_metrics = self._eval_metrics.SetMetrics(
           self._task.eval_metrics, args)
       summed_metrics = []

@@ -1044,7 +1043,6 @@ def __init__(self, params, **kwargs):
     super().__init__(params, **kwargs)
     self._program_name = 'DecodeProgram'
     self._decode_out_dict_lst = []
-    self._ema_applied = False
     self._dataset_summaries = {}
     # TODO(xingwu): fully deprecate decode_until_out_of_range
     if self.params.decode_until_out_of_range:

@@ -1152,11 +1150,8 @@ def DecodeFunc(self, inp_instance):
 
     def _DecodeFn():
       """Decode call to be compiled for TPU."""
-      # Applies EMA if applicable to support running only eval/decode programs.
       _, decode_dict = self._model.ConstructDecodeGraph(
-          apply_ema=(not self._ema_applied),
           input_batch=inp_instance.TpuDequeueBatch())
-      self._ema_applied = True
       self.decode_nm = py_utils.NestedMap(decode_dict)
       return self.decode_nm.Flatten()
 

@@ -1636,8 +1631,7 @@ def DecodeFunc(self):
 
     def _DecodeStep():
       """Decode call to be compiled for TPU."""
-      # Applies EMA if applicable to support running only eval/decode programs.
-      _, decode_dict = self._model.ConstructDecodeGraph(apply_ema=True)
+      _, decode_dict = self._model.ConstructDecodeGraph()
       self.decode_nm = py_utils.NestedMap(decode_dict)
       return [self._OutfeedEnqueue(decode_dict)]
 

@@ -1911,9 +1905,7 @@ def TpuTrain():
     def _DecodeFn():
       """Decode call to be compiled for TPU."""
       with cluster_factory.SetEval(True):
-        # Applies EMA if applicable to support running only eval/decode
-        # programs.
-        _, decode_dict = self._decode_model.ConstructDecodeGraph(apply_ema=True)
+        _, decode_dict = self._decode_model.ConstructDecodeGraph()
       self.decode_nm = py_utils.NestedMap(decode_dict)
       return self.decode_nm.Flatten()
