Sync copybara

RecML authors · qinyiyan · commit 03e1d28d210f · 2025-05-19T19:51:23.000Z
PiperOrigin-RevId: 759293017
diff --git a/recml/__init__.py b/recml/__init__.py
@@ -11,3 +11,30 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""Public API for RecML."""
+
+# pylint: disable=g-importing-member
+
+from recml.core import data
+from recml.core import metrics
+from recml.core import utils
+from recml.core.metrics.base_metrics import Metric
+from recml.core.training.core import Experiment
+from recml.core.training.core import run_experiment
+from recml.core.training.core import Trainer
+from recml.core.training.jax_trainer import JaxState
+from recml.core.training.jax_trainer import JaxTask
+from recml.core.training.jax_trainer import JaxTrainer
+from recml.core.training.jax_trainer import KerasState
+from recml.core.training.keras_trainer import KerasTask
+from recml.core.training.keras_trainer import KerasTrainer
+from recml.core.training.optax_factory import AdagradFactory
+from recml.core.training.optax_factory import AdamFactory
+from recml.core.training.optax_factory import OptimizerFactory
+from recml.core.training.partitioning import DataParallelPartitioner
+from recml.core.training.partitioning import ModelParallelPartitioner
+from recml.core.training.partitioning import NullPartitioner
+from recml.core.training.partitioning import Partitioner
+from recml.core.utils.types import Factory
+from recml.core.utils.types import FactoryProtocol
+from recml.core.utils.types import ObjectFactory
diff --git a/recml/core/__init__.py b/recml/core/__init__.py
diff --git a/recml/core/data/__init__.py b/recml/core/data/__init__.py
@@ -0,0 +1,23 @@
+# Copyright 2024 RecML authors <recommendations-ml@google.com>.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Public API for RecML data."""
+
+# pylint: disable=g-importing-member
+
+from recml.core.data.iterator import Iterator
+from recml.core.data.iterator import TFDatasetIterator
+from recml.core.data.preprocessing import PreprocessingMode
+from recml.core.data.tf_dataset_factory import DatasetShardingInfo
+from recml.core.data.tf_dataset_factory import TFDatasetFactory
+from recml.core.data.tf_dataset_factory import TFDSMetadata
diff --git a/recml/core/data/iterator.py b/recml/core/data/iterator.py
@@ -23,7 +23,7 @@
 import tensorflow as tf
 
 
-DatasetIterator = clu_data.DatasetIterator
+Iterator = clu_data.DatasetIterator
 
 
 class TFDatasetIterator(clu_data.DatasetIterator):
diff --git a/recml/core/metrics/tools.py b/recml/core/metrics/tools.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Tools for MLRX metrics."""
+"""Tools for RecML metrics."""
 
 from collections.abc import Mapping
 import concurrent.futures
diff --git a/recml/core/training/core.py b/recml/core/training/core.py
@@ -46,9 +46,9 @@
     tf.data.Dataset,
     tuple[tf.data.Dataset, tf.data.Dataset],
     tuple[tf.data.Dataset, Mapping[str, tf.data.Dataset]],
-    iterator.DatasetIterator,
-    tuple[iterator.DatasetIterator, iterator.DatasetIterator],
-    tuple[iterator.DatasetIterator, Mapping[str, iterator.DatasetIterator]],
+    iterator.Iterator,
+    tuple[iterator.Iterator, iterator.Iterator],
+    tuple[iterator.Iterator, Mapping[str, iterator.Iterator]],
 )
 MetaT = TypeVar("MetaT")
 Logs = Any  # Any metric logs returned by the training or evaluation task.
@@ -120,9 +120,9 @@ def run_experiment(
 
 def get_iterators(
     datasets: DatasetT,
-) -> tuple[iterator.DatasetIterator, Mapping[str, iterator.DatasetIterator]]:
+) -> tuple[iterator.Iterator, Mapping[str, iterator.Iterator]]:
   """Creates and unpacks the datasets returned by the task."""
-  if isinstance(datasets, (iterator.DatasetIterator, tf.data.Dataset)):
+  if isinstance(datasets, (iterator.Iterator, tf.data.Dataset)):
     if isinstance(datasets, tf.data.Dataset):
       datasets = iterator.TFDatasetIterator(datasets)
     return datasets, {}
@@ -133,7 +133,7 @@ def get_iterators(
     )
 
   train_dataset, eval_datasets = datasets
-  if isinstance(train_dataset, (iterator.DatasetIterator, tf.data.Dataset)):
+  if isinstance(train_dataset, (iterator.Iterator, tf.data.Dataset)):
     if isinstance(train_dataset, tf.data.Dataset):
       train_dataset = iterator.TFDatasetIterator(train_dataset)
   else:
@@ -143,7 +143,7 @@ def get_iterators(
         f" {type(train_dataset)}."
     )
 
-  if isinstance(eval_datasets, (iterator.DatasetIterator, tf.data.Dataset)):
+  if isinstance(eval_datasets, (iterator.Iterator, tf.data.Dataset)):
     if isinstance(eval_datasets, tf.data.Dataset):
       eval_datasets = iterator.TFDatasetIterator(eval_datasets)
     return train_dataset, {"": eval_datasets}
@@ -162,7 +162,7 @@ def get_iterators(
     }
 
   if not all(
-      isinstance(v, iterator.DatasetIterator) for v in eval_datasets.values()
+      isinstance(v, iterator.Iterator) for v in eval_datasets.values()
   ):
     raise ValueError(
         "Expected all values in the evaluation datasets mapping to be either"
diff --git a/recml/core/training/jax_trainer.py b/recml/core/training/jax_trainer.py
@@ -298,14 +298,14 @@ class JaxTask(abc.ABC, Generic[StateT]):
   def create_datasets(self) -> core.DatasetT:
     """Creates training and evaluation datasets.
 
-    Returns:
+    Returns:
       One of the following:
-        1) A `tf.data.Dataset` or CLU `DatasetIterator instance that will be
+        1) A `tf.data.Dataset` or `Iterator` instance that will be
            used for training.
-        2) A tuple of `tf.data.Dataset` or CLU `DatasetIterator` instances where
+        2) A tuple of `tf.data.Dataset` or `Iterator` instances where
            the first element is the training dataset and the second element is
            the evaluation dataset.
-        3) A tuple of `tf.data.Dataset` or CLU `DatasetIterator` instances where
+        3) A tuple of `tf.data.Dataset` or `Iterator` instances where
            the first element is the training dataset and the second element is a
            dictionary of evaluation datasets keyed by name.
     """
@@ -601,8 +601,8 @@ def _evaluate_n_steps(
   def process_task(
       self, task: JaxTask, *, training: bool, check_for_checkpoints: bool
   ) -> tuple[
-      iterator_lib.DatasetIterator,
-      Mapping[str, iterator_lib.DatasetIterator],
+      iterator_lib.Iterator,
+      Mapping[str, iterator_lib.Iterator],
       State,
       partitioning.StepFn,
       partitioning.StepFn,
diff --git a/recml/core/training/keras_trainer.py b/recml/core/training/keras_trainer.py
@@ -17,6 +17,7 @@
 
 import abc
 from collections.abc import Mapping
+import dataclasses
 import gc
 import os
 import time
@@ -125,7 +126,7 @@ def __init__(
       model_dir = "/tmp"
 
     # This should be set before any layers are constructed and this is a
-    # fallback incase the trainer binary doesn't already do this.
+    # fallback in case the trainer binary doesn't already do this.
     if (
         isinstance(
             distribution,
@@ -204,7 +205,7 @@ def _maybe_get_model_kws(
     if py_utils.has_argument(task.create_model, "input_shapes"):
       batch = next(iter(dataset))
       x, *_ = keras.utils.unpack_x_y_sample_weight(batch)
-      kws["input_shapes"]: keras.tree.map_structure(core.get_shape, x)
+      kws["input_shapes"] = keras.tree.map_structure(core.get_shape, x)
 
     return kws
 
@@ -232,6 +233,27 @@ def evaluate(self, task: KerasTask) -> core.Logs:
     model = task.create_model_for_eval(
         **self._maybe_get_model_kws(task, dataset)
     )
+
+    if keras.backend.backend() == "jax":
+      [tb_cbk] = [
+          cbk
+          for cbk in self._eval_callbacks
+          if isinstance(cbk, keras_utils.EpochSummaryCallback)
+      ]
+      epoch_start_time = time.time()
+      history = model.evaluate(
+          dataset,
+          steps=self._steps_per_eval,
+          callbacks=self._eval_callbacks,
+          return_dict=True,
+      )
+      epoch_dt = time.time() - epoch_start_time
+      steps_per_second = self._steps_per_eval / epoch_dt
+      val_logs = {"val_" + k: v for k, v in history.items()}
+      val_logs["val_steps_per_second"] = steps_per_second
+      tb_cbk.on_epoch_end(0, val_logs)
+      return history
+
     return model.evaluate(
         dataset,
         steps=self._steps_per_eval,
diff --git a/recml/core/utils/keras_utils.py b/recml/core/utils/keras_utils.py
@@ -191,8 +191,11 @@ def restore_keras_model(
   Args:
     model: The Keras model to restore.
     checkpoint_dir: The directory containing the Orbax checkpoints.
-    step: The step to restore the model to. If `None` then the latest checkpoint
-      will be restored.
+    step: The checkpoint step to resume training from. If set, it requires a
+      checkpoint with the same step number to be present in the model directory.
+      If not set, will resume training from the last checkpoint. Depending on
+      the value of `max_checkpoints_to_keep`, the model directory only contains
+      a certain number of the latest checkpoints.
     restore_optimizer_vars: Whether to restore the optimizer variables.
     restore_steps: Whether to restore the model's steps. If `True` then the
       model will continue training from the step the checkpoint was saved at. If
diff --git a/recml/examples/dlrm_experiment.py b/recml/examples/dlrm_experiment.py
diff --git a/recml/examples/dlrm_experiment_test.py b/recml/examples/dlrm_experiment_test.py
diff --git a/recml/layers/linen/sparsecore_test.py b/recml/layers/linen/sparsecore_test.py