
Commit a2d19d3

Add TensorFlow profiler to training loop. (#296)
* Use the TF Profiler in sampling mode through the gRPC server API.
* This enables on-demand, remote sampling with TPUs or multiple workers.
* Add unit test for TF Profiler.
* Tests the profiling server by sending a request and ensuring the profile is
  written to the expected location.
1 parent 8ac007f commit a2d19d3

Files changed (3 files, +98 −8 lines):

* gematria/model/python/main_function.py
* gematria/model/python/main_function_test.py
* gematria/model/python/model_base.py

gematria/model/python/main_function.py

Lines changed: 20 additions & 0 deletions
@@ -424,6 +424,23 @@ def main(_):
     ),
 )
 
+_GEMATRIA_RUN_TF_PROFILER = flags.DEFINE_bool(
+    'gematria_run_tf_profiler',
+    False,
+    'Whether the TensorFlow profiler gRPC server is started or not. When set,'
+    ' the server will listen to `gematria_tf_profiler_port` for requests for'
+    ' on-demand profiling. Requests can be sent through'
+    ' `tf.profiler.experimental.client.trace` or through the TensorBoard GUI.',
+)
+_GEMATRIA_TF_PROFILER_PORT = flags.DEFINE_integer(
+    'gematria_tf_profiler_port',
+    6009,
+    (
+        'When running under the TensorFlow profiler, this is the port the'
+        ' gRPC server listens for tracing requests from.'
+    ),
+)
+
 
 @flags.validator(
     _COLLECTED_PERCENTILE_RANKS.name,
@@ -825,6 +842,9 @@ def checkpoint_model():
         _GEMATRIA_SUMMARY_DIR.value
     )
 
+    if _GEMATRIA_RUN_TF_PROFILER.value:
+      tf.profiler.experimental.server.start(_GEMATRIA_TF_PROFILER_PORT.value)
+
     with train_summary_writer.as_default(), tf.summary.record_if(
         lambda: tf.equal(
            model.global_step % _GEMATRIA_SAVE_SUMMARIES_EPOCHS.value, 0
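
Taken together, the two flags wire the TF Profiler's sampling-mode workflow into the training binary: the training process hosts a gRPC server, and a separate client asks it for a trace on demand. A minimal sketch of the same workflow outside of Gematria (the function names, port, and log directory below are illustrative, not part of this change):

import tensorflow as tf

def training_process(port=6009):
  # What --gematria_run_tf_profiler enables: start the profiler's gRPC
  # server next to the training loop. It stays idle until a trace request
  # arrives, so the steady-state overhead is negligible.
  tf.profiler.experimental.server.start(port)
  # ... run the training loop here ...

def profiling_client(logdir, port=6009):
  # Runs in a different process, possibly on a different machine, while
  # training is in progress. TensorBoard's "Capture Profile" button sends
  # the same kind of request.
  tf.profiler.experimental.client.trace(
      service_addr=f'grpc://localhost:{port}',
      logdir=logdir,  # Profiles land under <logdir>/plugins/profile/...
      duration_ms=2000,  # Sample for two seconds, then stop.
  )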

gematria/model/python/main_function_test.py

Lines changed: 67 additions & 0 deletions
@@ -17,6 +17,7 @@
 from os import path
 import os
 import re
+import threading
 from unittest import mock
 
 from absl import flags
@@ -769,6 +770,72 @@ def test_multi_task_flags(self):
     FLAGS.gematria_throughput_source_filter = ['alice', 'bob']
     FLAGS.validate_all_flags()
 
+  @flagsaver.flagsaver
+  def test_train_under_tf_profiler(self):
+    """Tests the profiling of model training using the TF Profiler.
+
+    The test prepares training data and runs the actual training for a small
+    number of epochs under the TF Profiler, then checks that the expected
+    profiles were recorded and stored in the expected directory.
+    """
+    num_epochs = 10
+    max_blocks_in_batch = 15
+    max_instructions_in_batch = 124
+    learning_rate = 0.321
+    randomize_batches = False
+    training_throughput_selection = io_options.ThroughputSelection.RANDOM
+    checkpoint_dir = path.join(self.work_directory.full_path, 'checkpoint')
+    summary_dir = path.join(self.work_directory.full_path, 'summary')
+    tf_profiler_port = 6009
+    use_seq2seq_loss = False  # The default is True.
+
+    model = None
+
+    def MockModel(*args, **kwargs):
+      nonlocal model
+      self.assertEqual(kwargs['learning_rate'], learning_rate)
+      model = TestModel(*args, **kwargs)
+      # Record calls to model.train(), but still call the original method.
+      mock_train = mock.MagicMock(side_effect=model.train)
+      model.train = mock_train
+      return model
+
+    FLAGS.gematria_action = model_options.Action.TRAIN
+    FLAGS.gematria_run_tf_profiler = True
+    FLAGS.gematria_tf_profiler_port = tf_profiler_port
+    FLAGS.gematria_input_file = (self.input_filename,)
+    FLAGS.gematria_checkpoint_dir = checkpoint_dir
+    FLAGS.gematria_summary_dir = summary_dir
+    FLAGS.gematria_training_num_epochs = num_epochs
+    FLAGS.gematria_training_randomize_batches = randomize_batches
+    FLAGS.gematria_max_blocks_in_batch = max_blocks_in_batch
+    FLAGS.gematria_max_instructions_in_batch = max_instructions_in_batch
+    FLAGS.gematria_use_seq2seq_loss = use_seq2seq_loss
+    FLAGS.gematria_learning_rate = learning_rate
+    FLAGS.gematria_training_throughput_selection = training_throughput_selection
+
+    # Set up a thread for the training process running the profiling server.
+    server_thread = threading.Thread(
+        target=main_function.run_gematria_model_from_command_line_flags,
+        args=(MockModel,),
+        kwargs={'dtype': tf.dtypes.float32},
+    )
+    server_thread.start()
+
+    # Try sending a trace request to the TF Profiler.
+    tf.profiler.experimental.client.trace(
+        service_addr=f'grpc://localhost:{tf_profiler_port}',
+        logdir=summary_dir,
+        duration_ms=1000,
+        num_tracing_attempts=4000,  # Keep trying until the server is ready.
+    )
+    server_thread.join()
+
+    # Check that the profile has been written to the expected location.
+    self._assert_file_exists(
+        f'summary/plugins/profile/*/localhost_{tf_profiler_port}.xplane.pb'
+    )
+
 
 if __name__ == '__main__':
   tf.test.main()

gematria/model/python/model_base.py

Lines changed: 11 additions & 8 deletions
@@ -1298,14 +1298,17 @@ def run_one_epoch():
 
     with timer.scoped('ModelBase.train - one batch', num_iterations=num_epochs):
       for epoch_index in range(num_epochs):
-        tf.summary.experimental.set_step(epoch_index)
-        stats = run_one_epoch()
-        logging.info('Training: %s', stats)
-        if not hooks:
-          continue
-        for epochs_every, hook_function in hooks:
-          if (epoch_index + 1) % epochs_every == 0:
-            hook_function()
+        with tf.profiler.experimental.Trace(
+            'train', step_num=epoch_index, _r=1
+        ):
+          tf.summary.experimental.set_step(epoch_index)
+          stats = run_one_epoch()
+          logging.info('Training: %s', stats)
+          if not hooks:
+            continue
+          for epochs_every, hook_function in hooks:
+            if (epoch_index + 1) % epochs_every == 0:
+              hook_function()
     return stats
 
   def _compute_loss(self, schedule: FeedDict) -> loss_utils.LossComputation:
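
The training-loop change wraps each epoch in tf.profiler.experimental.Trace so that captured traces attribute events to individual steps: step_num labels the step, and _r=1 marks the span as a step boundary that TensorBoard's overview page uses to compute per-step time. A minimal standalone sketch of the same pattern (the dummy workload is illustrative, not Gematria code):

import tensorflow as tf

def run_one_step():
  # Stand-in for one epoch of training work.
  return tf.random.normal((256, 256)) @ tf.random.normal((256, 256))

for step in range(3):
  # The Trace context records nothing unless a profiling session is
  # active, so it is safe to leave in the loop unconditionally.
  with tf.profiler.experimental.Trace('train', step_num=step, _r=1):
    run_one_step()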
