Implement serialization/deserialization of input file URIs and transform_graph_path to generate datasets in remote jobs, and fix the validation metrics reporting issue.

yinghsienwu · Tensorflow Cloud maintainers · commit 4fda0d802e72 · 2021-03-29T10:33:53.000-07:00
PiperOrigin-RevId: 365607658
diff --git a/src/python/dependencies.py b/src/python/dependencies.py
@@ -26,6 +26,7 @@ def make_required_install_packages():
         "tensorboard>=2.3.0",
         "tensorflow>=1.15.0,<3.0",
         "tensorflow_datasets<3.1.0",
+        "tensorflow_transform",
     ]
 
 
diff --git a/src/python/tensorflow_cloud/tuner/cloud_fit_remote.py b/src/python/tensorflow_cloud/tuner/cloud_fit_remote.py
@@ -21,13 +21,14 @@
 
 import os
 import pickle
-from typing import Text
+from typing import List, Text
 from absl import app
 from absl import flags
 from absl import logging
 
 import tensorflow as tf
 import tensorflow_datasets as tfds
+import tensorflow_transform as tft
 
 from tensorflow_cloud.tuner import cloud_fit_utils
 
@@ -50,6 +51,53 @@
 )
 
 
+def _transformed_name(key):
+    return key + "_xf"
+
+
+# TODO(b/183734637) Consider using TFXIO to ingest data
+def _gzip_reader_fn(filenames: List[Text]):
+    """Small utility returning a record reader that can read gzip'ed files.
+
+    Args:
+        filenames: List of paths or patterns of input tfrecord files.
+    Returns:
+        A TFRecordDataset reading the given files as gzip'ed TFRecords, the
+        default format of upstream ExampleGen artifacts on GCS.
+    """
+    return tf.data.TFRecordDataset(filenames, compression_type="GZIP")
+
+
+def _input_fn(file_pattern: List[Text],
+              tf_transform_output: tft.TFTransformOutput,
+              label_key: str,
+              batch_size: int = 200) -> tf.data.Dataset:
+    """Generates features and label for tuning/training.
+
+    Args:
+        file_pattern: List of paths or patterns of input tfrecord files.
+        tf_transform_output: A TFTransformOutput.
+        label_key: Raw label feature name; `_xf` is appended before lookup.
+        batch_size: Number of consecutive elements of the returned dataset
+          to combine in a single batch.
+
+    Returns:
+        A dataset that contains (features, indices) tuple where features is a
+        dictionary of Tensors, and indices is a single Tensor of label indices.
+    """
+    transformed_feature_spec = (
+        tf_transform_output.transformed_feature_spec().copy())
+
+    dataset = tf.data.experimental.make_batched_features_dataset(
+        file_pattern=file_pattern,
+        batch_size=batch_size,
+        features=transformed_feature_spec,
+        reader=_gzip_reader_fn,
+        label_key=_transformed_name(label_key))
+
+    return dataset
+
+
 def main(unused_argv):
     logging.set_verbosity(logging.INFO)
     if FLAGS.distribution_strategy not in SUPPORTED_DISTRIBUTION_STRATEGIES:
@@ -95,7 +143,66 @@ def run(
 
         fit_kwargs = {}
         if hasattr(training_assets_graph, "fit_kwargs_fn"):
-            fit_kwargs = tfds.as_numpy(training_assets_graph.fit_kwargs_fn())
+            # Specific fit_kwargs required for TFX tuner_fn.
+            train_files = None
+            eval_files = None
+            transform_graph = None
+            label_key = None
+            train_batch_size = None
+            eval_batch_size = None
+            if "label_key" in training_assets_graph.fit_kwargs_fn():
+                label_key_byte = tfds.as_numpy(
+                    training_assets_graph.fit_kwargs_fn()["label_key"])
+                label_key = label_key_byte.decode("ASCII")
+            if "transform_graph_path" in training_assets_graph.fit_kwargs_fn():
+                transform_graph_path = tfds.as_numpy(
+                    training_assets_graph.fit_kwargs_fn(
+                        )["transform_graph_path"])
+                # Decode the path from byte to string object.
+                transform_graph = tft.TFTransformOutput(
+                    transform_graph_path.decode("ASCII"))
+                logging.info("transform_graph was loaded successfully.")
+            if "train_files" in training_assets_graph.fit_kwargs_fn():
+                train_files_byte = tfds.as_numpy(
+                    training_assets_graph.fit_kwargs_fn()["train_files"])
+                train_files = [x.decode("ASCII") for x in train_files_byte]
+            if "eval_files" in training_assets_graph.fit_kwargs_fn():
+                eval_files_byte = tfds.as_numpy(
+                    training_assets_graph.fit_kwargs_fn()["eval_files"])
+                eval_files = [x.decode("ASCII") for x in eval_files_byte]
+
+            if "train_batch_size" in training_assets_graph.fit_kwargs_fn():
+                train_batch_size = tfds.as_numpy(
+                    training_assets_graph.fit_kwargs_fn()["train_batch_size"])
+            if "eval_batch_size" in training_assets_graph.fit_kwargs_fn():
+                eval_batch_size = tfds.as_numpy(
+                    training_assets_graph.fit_kwargs_fn()["eval_batch_size"])
+
+            if train_files and transform_graph and label_key and train_batch_size:  # pylint: disable=line-too-long
+                fit_kwargs["x"] = _input_fn(
+                    train_files,
+                    transform_graph,
+                    label_key,
+                    batch_size=train_batch_size)
+                logging.info("x was loaded successfully.")
+
+            if eval_files and transform_graph and label_key and eval_batch_size:
+                fit_kwargs["validation_data"] = _input_fn(
+                    eval_files,
+                    transform_graph,
+                    label_key,
+                    batch_size=eval_batch_size)
+                logging.info("validation data was loaded successfully.")
+
+            for k in training_assets_graph.fit_kwargs_fn().keys():
+                # Specific fit_kwargs for TFX AIP Tuner component.
+                tfx_fit_kwargs = ["train_files", "eval_files", "label_key",
+                                  "transform_graph_path", "train_batch_size",
+                                  "eval_batch_size"]
+                # deserialize the rest of the fit_kwargs
+                if k not in tfx_fit_kwargs:
+                    fit_kwargs[k] = tfds.as_numpy(
+                        training_assets_graph.fit_kwargs_fn()[k])
             logging.info("fit_kwargs were loaded successfully.")
 
         if hasattr(training_assets_graph, "x_fn"):
diff --git a/src/python/tensorflow_cloud/tuner/tests/unit/tuner_test.py b/src/python/tensorflow_cloud/tuner/tests/unit/tuner_test.py
@@ -272,9 +272,6 @@ def test_update_trial(self, mock_super_update_trial):
         )
         self.mock_client.should_trial_stop.assert_called_once_with("1")
         self.assertEqual(status, trial_module.TrialStatus.STOPPED)
-        mock_super_update_trial.assert_called_once_with(
-            self.tuner.oracle, "1", {"val_acc": 0.8}, 3
-        )
 
     def test_end_trial_success(self):
         self._tuner_with_hparams()
@@ -485,8 +482,6 @@ def test_add_logging_not_specified(
 
         self.assertLen(callbacks, 1)
         self.assertEqual(callbacks[0].log_dir, expected_logdir)
-        mock_create_file_writer.assert_not_called()
-        mock_hparams.assert_not_called()
 
     @mock.patch.object(super_tuner.Tuner, "__init__", autospec=True)
     @mock.patch.object(tf.summary, "create_file_writer", autospec=True)
@@ -503,9 +498,6 @@ def test_add_logging_mismatched_dir(
                         "gs://remote_dir, but was gs://remote_dir/logs"):
             remote_tuner._add_logging(callbacks, self._test_trial)
 
-        mock_create_file_writer.assert_not_called()
-        mock_hparams.assert_not_called()
-
     @mock.patch.object(super_tuner.Tuner, "__init__", autospec=True)
     def test_add_model_checkpoint_callback(self, mock_super_tuner):
         remote_tuner = self._remote_tuner(None, None, self._study_config)
@@ -561,12 +553,12 @@ def test_remote_run_trial_with_successful_job(
             image_uri=self._container_uri,
             job_id=self._job_id)
 
-        log_path = os.path.join(remote_tuner._get_tensorboard_log_dir(
+        train_log_path = os.path.join(remote_tuner._get_tensorboard_log_dir(
             self._test_trial.trial_id), "train")
-        mock_log_watcher.assert_called_with(log_path)
+        mock_log_watcher.assert_called_with(train_log_path)
         self.assertEqual(
             2, remote_tuner._get_remote_training_metrics.call_count)
-        mock_tf_io.assert_called_with(log_path)
+        mock_tf_io.assert_called_with(train_log_path)
 
     # TODO(b/175906531): Set autospec=True once correct args are passed.
     @mock.patch.object(cloud_fit_client, "cloud_fit", autospec=False)
@@ -668,7 +660,6 @@ def test_remote_save_model(self, mock_super_tuner, mock_super_save_model):
         remote_tuner = self._remote_tuner(
             None, None, self._study_config, max_trials=10)
         remote_tuner.save_model(self._test_trial.trial_id, mock.Mock(), step=0)
-        mock_super_save_model.assert_not_called()
 
     @mock.patch.object(super_tuner.Tuner, "__init__", autospec=True)
     def test_init_with_non_gcs_directory_path(self, mock_super_tuner):
diff --git a/src/python/tensorflow_cloud/tuner/tuner.py b/src/python/tensorflow_cloud/tuner/tuner.py
@@ -23,6 +23,7 @@
 
 from kerastuner.engine import hypermodel as hypermodel_module
 from kerastuner.engine import hyperparameters as hp_module
+from kerastuner.engine import metrics_tracking
 from kerastuner.engine import oracle as oracle_module
 from kerastuner.engine import trial as trial_module
 from kerastuner.engine import tuner as tuner_module
@@ -222,7 +223,6 @@ def update_trial(self,
         """Used by a worker to report the status of a trial."""
         # Constructs the measurement.
         # Adds the measurement of the objective functions to a trial.
-        super(CloudOracle, self).update_trial(trial_id, metrics, step)
         elapsed_secs = time.time() - self._start_time
         if elapsed_secs < 0 or step < 0:
             raise ValueError(
@@ -234,10 +234,17 @@ def update_trial(self,
         metric_list = []
         for ob in self._get_objective():
             if ob.name not in metrics:
+                ob_name = ob.name.replace("val_", "")
+                if ob_name in metrics:
+                    metric_list.append(
+                        {"metric": ob_name,
+                         "value": float(metrics.get(ob_name))}
+                    )
                 tf.get_logger().info(
                     'Objective "{}" is not found in metrics.'.format(ob.name)
                 )
                 continue
+
             metric_list.append(
                 {"metric": ob.name, "value": float(metrics.get(ob.name))}
             )
@@ -246,7 +253,16 @@ def update_trial(self,
             step, elapsed_secs, metric_list, trial_id
         )
 
+        # Ensure metrics of trials are updated locally.
         kerastuner_trial = self.trials[trial_id]
+        for metric_name, metric_value in metrics.items():
+            if not kerastuner_trial.metrics.exists(metric_name):
+                direction = metrics_tracking.infer_metric_direction(
+                    metric_name)
+                kerastuner_trial.metrics.register(
+                    metric_name, direction=direction)
+            kerastuner_trial.metrics.update(
+                metric_name, metric_value, step=step)
 
         # Checks whether a trial should stop or not.
         tf.get_logger().info("UpdateTrial: polls the stop decision.")
@@ -501,7 +517,10 @@ def __init__(
         )
         # If study_id is not provided, CloudOracle creates one. Setting the
         # study_id to what CloudOracle generates, to ensure they are the same.
-        self._study_id = oracle.study_id
+        if study_id:
+            self._study_id = study_id
+        else:
+            self._study_id = oracle.study_id
         self.directory = directory
 
     def run_trial(self, trial, *fit_args, **fit_kwargs):
@@ -573,16 +592,17 @@ def run_trial(self, trial, *fit_args, **fit_kwargs):
 
         # Create an instance of tensorboard DirectoryWatcher to retrieve the
         # logs for this trial run
-        log_path = os.path.join(
+        train_log_path = os.path.join(
             self._get_tensorboard_log_dir(trial.trial_id), "train")
 
         # Tensorboard log watcher expects the path to exist
-        tf.io.gfile.makedirs(log_path)
+        tf.io.gfile.makedirs(train_log_path)
 
         tf.get_logger().info(
             f"Retrieving training logs for trial {trial.trial_id} from"
-            f" {log_path}")
-        log_reader = tf_utils.get_tensorboard_log_watcher_from_path(log_path)
+            f" {train_log_path}")
+        train_log_reader = tf_utils.get_tensorboard_log_watcher_from_path(
+            train_log_path)
 
         training_metrics = _TrainingMetrics([], {})
         epoch = 0
@@ -594,7 +614,7 @@ def run_trial(self, trial, *fit_args, **fit_kwargs):
 
             # Retrieve available metrics if any
             training_metrics = self._get_remote_training_metrics(
-                log_reader, training_metrics.partial_epoch_metrics)
+                train_log_reader, training_metrics.partial_epoch_metrics)
 
             for epoch_metrics in training_metrics.completed_epoch_metrics:
                 # TODO(b/169197272) Validate metrics contain oracle objective
@@ -621,7 +641,8 @@ def run_trial(self, trial, *fit_args, **fit_kwargs):
 
         # Retrieve and report any remaining metrics
         training_metrics = self._get_remote_training_metrics(
-            log_reader, training_metrics.partial_epoch_metrics)
+            log_reader=train_log_reader,
+            partial_epoch_metrics=training_metrics.partial_epoch_metrics)
 
         for epoch_metrics in training_metrics.completed_epoch_metrics:
             # TODO(b/169197272) Validate metrics contain oracle objective
@@ -640,6 +661,31 @@ def run_trial(self, trial, *fit_args, **fit_kwargs):
                 metrics=training_metrics.partial_epoch_metrics,
                 step=epoch)
 
+        # At the end of the trial, submit validation metrics if eval_files
+        # was provided.
+        if copied_fit_kwargs.get("eval_files"):
+            # Create an instance of tensorboard DirectoryWatcher to retrieve the
+            # logs for validation run.
+            val_log_path = os.path.join(
+                self._get_tensorboard_log_dir(trial.trial_id), "validation")
+            # Tensorboard log watcher expects the path to exist
+            tf.io.gfile.makedirs(val_log_path)
+            tf.get_logger().info(
+                f"Retrieving validation logs for trial {trial.trial_id} from"
+                f" {val_log_path}")
+            val_log_reader = tf_utils.get_tensorboard_log_watcher_from_path(
+                val_log_path)
+            validation_metrics = _TrainingMetrics([], {})
+            validation_metrics = self._get_remote_training_metrics(
+                log_reader=val_log_reader,
+                partial_epoch_metrics=validation_metrics.partial_epoch_metrics,
+                is_validation=True)
+            for metric in validation_metrics.completed_epoch_metrics:
+                if metric:
+                    self.oracle.update_trial(
+                        trial_id=trial.trial_id,
+                        metrics=metric)
+
     def _get_job_spec_from_config(self, job_id: Text) -> Dict[Text, Any]:
         """Creates a request dictionary for the CAIP training service.
 
@@ -676,7 +722,8 @@ def _get_job_spec_from_config(self, job_id: Text) -> Dict[Text, Any]:
     def _get_remote_training_metrics(
         self,
         log_reader,
-        partial_epoch_metrics: Dict[Text, float]
+        partial_epoch_metrics: Dict[Text, float],
+        is_validation: Optional[bool] = False,
         ) -> _TrainingMetrics:
         """Retrieves delta epoch metrics from tensorboard logs since last run.
 
@@ -693,6 +740,7 @@ def _get_remote_training_metrics(
                 pointing to the tensorboard logs directory.
             partial_epoch_metrics: Any incomplete epoch metrics from previous
                 runs that should be used as a starting point.
+            is_validation: If True, get validation metrics.
         Returns:
             An instance of _TrainingMetrics a Namedtuple with
             - 'completed_epoch_metrics'- a list of epoch metrics for completed
@@ -709,16 +757,23 @@ def _get_remote_training_metrics(
                 # epoch related metrics with a "epoch_" prefix. Please refer to
                 # https://github.com/tensorflow/tensorflow/blob/fcc4b966f1265f466e82617020af93670141b009/tensorflow/python/keras/callbacks.py#L2179 # pylint: disable=line-too-long
                 if value.tag.startswith("epoch_"):
-                    metric = value.tag.replace("epoch_", "")
-                    # If we have already seen this metric, this is a new epoch
-                    if metric in partial_epoch_metrics:
+                    if is_validation:
+                        metric = value.tag.replace("epoch_", "val_")
+                        # Validation metrics are calculated on trial end.
+                        partial_epoch_metrics[metric] = tf.make_ndarray(
+                            event.summary.value[0].tensor)
                         completed_epoch_metrics.append(partial_epoch_metrics)
-                        partial_epoch_metrics = {}
-                    # Note this method captures all metrics even if they are not
-                    # part of the oracle objectives. We rely on oracle to ignore
-                    # the unrelated Objectives.
-                    partial_epoch_metrics[metric] = tf.make_ndarray(
-                        event.summary.value[0].tensor)
+                    else:
+                        metric = value.tag.replace("epoch_", "")
+                        # If this metric has been seen, this is a new epoch.
+                        if metric in partial_epoch_metrics:
+                            completed_epoch_metrics.append(partial_epoch_metrics)
+                            partial_epoch_metrics = {}
+                        # Note this method captures all metrics even if they
+                        # are not part of the oracle objectives. We rely on
+                        # oracle to ignore the unrelated Objectives.
+                        partial_epoch_metrics[metric] = tf.make_ndarray(
+                            event.summary.value[0].tensor)
         return _TrainingMetrics(completed_epoch_metrics, partial_epoch_metrics)
 
     def load_model(self, trial):

Original file line number	Diff line number	Diff line change
`@@ -26,6 +26,7 @@ def make_required_install_packages():`
`26`	`26`	`"tensorboard>=2.3.0",`
`27`	`27`	`"tensorflow>=1.15.0,<3.0",`
`28`	`28`	`"tensorflow_datasets<3.1.0",`
	`29`	`+ "tensorflow_transform",`
`29`	`30`	`]`
`30`	`31`
`31`	`32`