Fix a name error and add more detailed summaries for the experts code.

William Fedus · Mesh TensorFlow Team · commit 3e8c165ef229 · 2020-12-30T10:29:35.000-08:00
PiperOrigin-RevId: 349574400
diff --git a/mesh_tensorflow/transformer/moe.py b/mesh_tensorflow/transformer/moe.py
@@ -946,6 +946,8 @@ def _rand_1_gating(
 
   if policy == "argmax" or policy == "input_dropout" or policy == "input_jitter":
     expert_gate, expert_index = mtf.top_1(raw_gates, reduced_dim=experts_dim)
+    if train:
+      mtf.scalar_summary("expert_gate", mtf.reduce_mean(expert_gate))
   elif policy == "sample":
     expert_index = mtf.sample_with_temperature(
         gate_logits, experts_dim, temperature=hparams.moe_rand_1_temperature)
@@ -1005,6 +1007,12 @@ def _rand_1_gating(
       dtype=raw_gates.dtype)
   expert_mask_flat = mtf.reduce_sum(expert_mask, reduced_dim=experts_dim)
 
+  if train:
+    total_routed = mtf.reduce_sum(expert_mask_flat)
+    importance = mtf.cast(importance, dtype=total_routed.dtype)
+    mtf.scalar_summary("fraction_routed",
+                       total_routed / mtf.reduce_sum(importance))
+
   # Mask out the experts that have overflowed expert capacity. Sparsify the
   # expert_gate.
   expert_gate *= expert_mask_flat
diff --git a/mesh_tensorflow/transformer/utils.py b/mesh_tensorflow/transformer/utils.py
@@ -665,9 +665,6 @@ def serialized_fn(mtf_features):
 
       if tpu_summaries:
         mtf.scalar_summary("loss", loss)
-        for g in var_grads:
-          grad_norm = mtf.sqrt(mtf.reduce_sum(mtf.square(g)))
-          mtf.scalar_summary("grads/norm" + g.name[:-2], grad_norm)
 
       if callable(learning_rate_schedule):
         # the following happens on CPU since TPU can't handle summaries.