
Commit b2756dd

nshazeer authored and Copybara-Service committed

Many changes to mesh-tensorflow - breaks existing mtf model checkpoints.

PiperOrigin-RevId: 219522764
1 parent: 58a1f2c

7 files changed: +362 additions, -110 deletions

examples/toy_model_tpu.py
Lines changed: 60 additions & 14 deletions
@@ -36,6 +36,11 @@
 tf.flags.DEFINE_integer('batch_size', 64, 'Training batch size.')
 tf.flags.DEFINE_integer('io_size', 2, 'Number of channels per feature.')
 tf.flags.DEFINE_integer('hidden_size', 2, 'Size of each hidden layer.')
+tf.flags.DEFINE_integer('num_hidden_layers', 1, 'Number of layers.')
+tf.flags.DEFINE_string('master_dtype', 'bfloat16', 'dtype for master vars.')
+tf.flags.DEFINE_string('slice_dtype', 'float32', 'dtype for slice vars.')
+tf.flags.DEFINE_string('activation_dtype', 'float32', 'dtype for activations.')
+tf.flags.DEFINE_string('optimizer', 'SGD', 'optimizer (SGD or Adafactor).')
 tf.flags.DEFINE_string('mesh_shape', 'all:8', 'mesh shape')
 tf.flags.DEFINE_string('layout', 'hidden:all', 'layout rules')
 tf.flags.DEFINE_integer('iterations', 100,
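For context, the mesh_shape and layout flags use Mesh TensorFlow's string syntax: a mesh shape names each mesh axis and its size, and a layout maps tensor-dimension names to mesh axes. A minimal sketch of how the defaults above parse (illustrative, not part of the commit; both helpers also appear in model_fn below):

import mesh_tensorflow as mtf

# 'all:8' is a one-axis mesh of 8 cores; 'hidden:all' is a rule that
# splits any tensor dimension named 'hidden' across that mesh axis.
mesh_shape = mtf.convert_to_shape('all:8')
layout_rules = mtf.convert_to_layout_rules('hidden:all')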
@@ -48,6 +53,7 @@
     'model_dir',
     default='',
     help='The directory where the model will be stored.')
+tf.flags.DEFINE_bool('use_tpu', True, 'use TPU')

 # Cloud TPU Cluster Resolvers
 tf.flags.DEFINE_string(
@@ -97,14 +103,31 @@ def __call__(self, params):
 def toy_model(features, mesh):
   """A toy model implemented by mesh tensorlfow."""
   batch_dim = mtf.Dimension('batch', FLAGS.batch_size)
-  hidden_dim = mtf.Dimension('hidden', FLAGS.hidden_size)
   io_dim = mtf.Dimension('io', FLAGS.io_size)

-  x = mtf.import_tf_tensor(mesh, features, mtf.Shape([batch_dim, io_dim]))
-  h = mtf.layers.dense(x, hidden_dim, name='layer1', use_bias=False)
-  y = mtf.layers.dense(h, io_dim, name='layer2', use_bias=False)
+  master_dtype = tf.as_dtype(FLAGS.master_dtype)
+  slice_dtype = tf.as_dtype(FLAGS.slice_dtype)
+  activation_dtype = tf.as_dtype(FLAGS.activation_dtype)

-  loss = mtf.reduce_sum(mtf.square(y - x))
+  x = mtf.import_tf_tensor(mesh, features, mtf.Shape([batch_dim, io_dim]))
+  x = mtf.cast(x, activation_dtype)
+  h = x
+  for lnum in xrange(FLAGS.num_hidden_layers + 1):
+    if lnum + 1 == FLAGS.num_hidden_layers + 1:
+      dim = io_dim
+    elif lnum % 2 == 0:
+      dim = mtf.Dimension('hidden_even', FLAGS.hidden_size)
+    else:
+      dim = mtf.Dimension('hidden_odd', FLAGS.hidden_size)
+    h = mtf.layers.dense(
+        h, dim,
+        use_bias=False,
+        master_dtype=master_dtype,
+        slice_dtype=slice_dtype,
+        name='layer_%d' % lnum)
+  y = h
+
+  loss = mtf.reduce_mean(mtf.square(y - x))
   return y, loss

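A note on the alternating 'hidden_even' / 'hidden_odd' names: a Mesh TensorFlow shape identifies dimensions by name and cannot contain the same name twice, so a dense layer whose input and output dimensions were both called 'hidden' would be ill-formed; alternating the names sidesteps this (our reading of the change). A minimal standalone sketch of the constraint, not taken from the commit:

import mesh_tensorflow as mtf
import tensorflow as tf

graph = mtf.Graph()
mesh = mtf.Mesh(graph, 'demo_mesh')
batch = mtf.Dimension('batch', 8)
hidden_even = mtf.Dimension('hidden_even', 4)
hidden_odd = mtf.Dimension('hidden_odd', 4)

x = mtf.zeros(mesh, mtf.Shape([batch, hidden_even]))
# The dense layer contracts over 'hidden_even' and introduces
# 'hidden_odd'; reusing the input dimension's name here would collide.
h = mtf.layers.dense(x, hidden_odd, use_bias=False, name='layer_demo')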
@@ -113,20 +136,43 @@ def model_fn(features, labels, mode, params):
   del labels
   global_step = tf.train.get_global_step()
   graph = mtf.Graph()
-  mesh = mtf.Mesh(graph, 'my_mesh')
   mesh_shape = mtf.convert_to_shape(FLAGS.mesh_shape)
-  mesh_devices = [''] * mesh_shape.size
-  mesh_impl = mtf.simd_mesh_impl.SimdMeshImpl(
-      mesh_shape, mtf.convert_to_layout_rules(FLAGS.layout),
-      mesh_devices, params['context'].device_assignment)
+  layout_rules = mtf.convert_to_layout_rules(FLAGS.layout)
+  if FLAGS.use_tpu:
+    ctx = params['context']
+    num_hosts = ctx.num_hosts
+    host_placement_fn = ctx.tpu_host_placement_function
+    device_list = [host_placement_fn(host_id=t) for t in range(num_hosts)]
+    tf.logging.info('device_list = %s' % device_list,)
+    # TODO(ylc): Better estimation of replica cache size?
+    replica_cache_size = 300 * 1000000  # 300M per replica
+    # Worker 0 caches all the TPU binaries.
+    worker0_mem = replica_cache_size * ctx.num_replicas
+    devices_memeory_usage = [worker0_mem] + [0] * (num_hosts - 1)
+    var_placer = mtf.utils.BalancedVariablePlacer(device_list,
+                                                  devices_memeory_usage)
+    mesh_devices = [''] * mesh_shape.size
+    mesh_impl = mtf.simd_mesh_impl.SimdMeshImpl(
+        mesh_shape, layout_rules, mesh_devices, ctx.device_assignment)
+  else:
+    var_placer = None
+    mesh_devices = [''] * mesh_shape.size
+    mesh_impl = mtf.placement_mesh_impl.PlacementMeshImpl(
+        mesh_shape, layout_rules, mesh_devices)
+  mesh = mtf.Mesh(graph, 'my_mesh', var_placer)
+
   with mtf.utils.outside_all_rewrites():
     logits, loss = toy_model(features, mesh)

   # TRAIN mode
   if mode == tf.estimator.ModeKeys.TRAIN:
     var_grads = mtf.gradients([loss],
                               [v.outputs[0] for v in graph.trainable_variables])
-    optimizer = mtf.optimize.AdafactorOptimizer()
+    if FLAGS.optimizer == 'Adafactor':
+      optimizer = mtf.optimize.AdafactorOptimizer()
+    else:
+      assert FLAGS.optimizer == 'SGD'
+      optimizer = mtf.optimize.SgdOptimizer(lr=1e-4)
     update_ops = []
     for grad, var in zip(var_grads, graph.trainable_variables):
       update_ops.extend(optimizer.apply_grad(grad, var))
@@ -136,7 +182,7 @@ def model_fn(features, labels, mode, params):

   lowering = mtf.Lowering(graph, {mesh: mesh_impl})

-  tf_loss = lowering.export_to_tf_tensor(loss)
+  tf_loss = tf.to_float(lowering.export_to_tf_tensor(loss))

   if mode == tf.estimator.ModeKeys.TRAIN:
     tf_update_ops = [lowering.lowered_operation(op) for op in update_ops]
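The tf.to_float wrapper is presumably defensive: with activation_dtype=bfloat16, the loss exported from the lowering would itself be bfloat16, and tf.to_float (equivalent to tf.cast(x, tf.float32)) brings it back to float32 for the surrounding TensorFlow code. A tiny illustration:

import tensorflow as tf

loss_bf16 = tf.constant(0.5, dtype=tf.bfloat16)
loss_f32 = tf.to_float(loss_bf16)  # same as tf.cast(loss_bf16, tf.float32)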
@@ -173,8 +219,8 @@ def model_fn(features, labels, mode, params):
   elif mode == tf.estimator.ModeKeys.EVAL:

     def metric_fn(tf_logits):
-      mean_logitss = tf.metrics.mean(tf_logits)
-      return {'mean_logitss': mean_logitss}
+      mean_logits = tf.metrics.mean(tf_logits)
+      return {'mean_logits': mean_logits}

     eval_metrics = (metric_fn, [tf_logits])

mesh_tensorflow/beam_search.py
Lines changed: 15 additions & 11 deletions
@@ -95,7 +95,8 @@ def beam_search(logits_fn,
                 eos_id=EOS_ID,
                 stop_early=True,
                 decode_length=None,
-                use_tpu=True):
+                use_tpu=True,
+                dtype=tf.float32):
   """Beam search with length penalties.

   Requires a function that can take the currently decoded symbols and return
@@ -128,14 +129,15 @@
           step_num - mtf Scalar
           ids - mtf Tensor with shape [batch, beam, length]
         Should return:
-          logits - [batch, beam, vocab_size]
+          logits - [batch, beam, vocab_size], dtype=dtype
     initial_ids: a mtf.Tensor with shape [batch_dim, beam_dim, length_dim])
     alpha: alpha for length penalty.
     states: list of mtf.Tensor
     eos_id: ID for end of sentence.
     stop_early: a boolean - stop once best sequence is provably determined.
     decode_length: a mtf Scalar of dtype tf.int32 - maximum length of decodes
     use_tpu: a boolean
+    dtype: a tf.dtype
   Returns:
     Tuple of
     (decoded beams [batch, beam, length]
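To make the new argument concrete, here is a hypothetical call site; my_logits_fn and initial_ids are placeholders, keyword arguments are used because the full signature is abbreviated above, and the two return values assume the (beams, scores) tuple described in the Returns section:

beams, scores = beam_search(
    logits_fn=my_logits_fn,    # must return [batch, beam, vocab_size] logits in `dtype`
    initial_ids=initial_ids,   # mtf Tensor of shape [batch, beam, length]
    alpha=0.6,                 # length-penalty strength
    states=[],
    use_tpu=True,
    dtype=tf.bfloat16)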
@@ -150,7 +152,8 @@
           mtf.constant(mesh, 0, dtype=tf.int32),
           beam_dim,
           on_value=0.0,
-          off_value=-INF),
+          off_value=-INF,
+          dtype=dtype),
       batch_by_beam)

   length_scalar = mtf.constant(mesh, length_dim.size, dtype=tf.int32)
@@ -166,7 +169,7 @@
   # Finished log probs will be negative infinity in the beginning
   # finished_flags will keep track of booleans
   finished_seq = initial_ids
-  finished_scores = mtf.constant(mesh, -INF, batch_by_beam)
+  finished_scores = mtf.constant(mesh, -INF, batch_by_beam, dtype=dtype)

   # Setting the scores of the initial to negative infinity.
   finished_flags = mtf.constant(mesh, False, batch_by_beam, tf.bool)
@@ -197,7 +200,7 @@ def grow_finished(finished_seq, finished_scores, finished_flags, curr_seq,

     # Set the scores of the unfinished seq in curr_seq to large negative
     # values
-    curr_scores += (1. - mtf.to_float(curr_finished)) * -INF
+    curr_scores += (1. - mtf.cast(curr_finished, curr_scores.dtype)) * -INF
     unused_batch_dim, beam_dim, unused_length_dim = finished_seq.shape.dims
     # concatenating the sequences and scores along beam axis
     def _my_concat(a, b):
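This hunk and the ones below all follow one pattern: boolean masks that were forced to float32 via mtf.to_float are now cast to whatever dtype the scores already carry, so a bfloat16 search stays bfloat16 throughout. A standalone sketch of the pattern (the INF value here is an illustrative stand-in for the module's own constant):

import mesh_tensorflow as mtf
import tensorflow as tf

INF = 1e7  # illustrative; beam_search.py defines its own constant

graph = mtf.Graph()
mesh = mtf.Mesh(graph, 'cast_demo')
beam = mtf.Dimension('beam', 4)
scores = mtf.constant(mesh, 0.0, mtf.Shape([beam]), dtype=tf.bfloat16)
finished = mtf.constant(mesh, False, mtf.Shape([beam]), tf.bool)

# Mask finished entries with a large negative value, in the scores'
# own dtype (bfloat16 here) rather than float32.
scores += mtf.cast(finished, scores.dtype) * -INF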
@@ -232,7 +235,7 @@ def grow_alive(curr_seq, curr_scores, curr_log_probs, curr_finished, states):
     """
     # Set the scores of the finished seq in curr_seq to large negative
     # values
-    curr_scores += mtf.to_float(curr_finished) * -INF
+    curr_scores += mtf.cast(curr_finished, curr_scores.dtype) * -INF
     return compute_topk_scores_and_seq(curr_seq, curr_scores, curr_log_probs,
                                        curr_finished, beam_dim,
                                        "grow_alive", states)
@@ -273,7 +276,7 @@ def grow_topk(i, alive_seq, alive_log_probs, states=None):
     # (batch_size, beam_size, vocab_size) + (batch_size, beam_size, 1)
     log_probs = candidate_log_probs + alive_log_probs

-    length_penalty = mtf.pow(((5. + mtf.to_float(i + 1)) / 6.), alpha)
+    length_penalty = mtf.pow(((5. + mtf.cast(i + 1, logits.dtype)) / 6.), alpha)

     curr_scores = log_probs / length_penalty

@@ -401,7 +404,7 @@ def _is_finished(i, unused_alive_seq, alive_log_probs, unused_finished_seq,
     if not stop_early:
       return mtf.less(i, decode_length)
     max_length_penalty = mtf.pow(
-        ((5. + mtf.to_float(decode_length)) / 6.), alpha)
+        ((5. + mtf.cast(decode_length, finished_scores.dtype)) / 6.), alpha)
     # The best possible score of the most likely alive sequence.
     lower_bound_alive_scores = mtf.gather(
         alive_log_probs, mtf.constant(mesh, 0, dtype=tf.int32),
@@ -412,16 +415,17 @@ def _is_finished(i, unused_alive_seq, alive_log_probs, unused_finished_seq,
     # scores are all -ve, taking the min will give us the score of the lowest
     # finished item.
     lowest_score_of_finished_in_finished = mtf.reduce_min(
-        finished_scores * mtf.to_float(finished_in_finished),
+        finished_scores * mtf.cast(finished_in_finished, finished_scores.dtype),
         reduced_dim=beam_dim)

     # If none of the sequences have finished, then the min will be 0 and
     # we have to replace it by -ve INF if it is. The score of any seq in alive
     # will be much higher than -ve INF and the termination condition will not
     # be met.
     lowest_score_of_finished_in_finished += (
-        (1. - mtf.to_float(mtf.reduce_any(
-            finished_in_finished, reduced_dim=beam_dim))) * -INF)
+        (1. - mtf.cast(mtf.reduce_any(
+            finished_in_finished, reduced_dim=beam_dim),
+                       finished_scores.dtype)) * -INF)

     bound_is_met = mtf.reduce_all(
         mtf.greater(lowest_score_of_finished_in_finished,
