Commit cb47516

Merge pull request #2544 from AI-Hypercomputer:xibin/diagon_sdk
PiperOrigin-RevId: 839818265
2 parents 73e449d + ee281ce commit cb47516

11 files changed (+165, -8 lines)

dependencies/requirements/base_requirements/requirements.txt
Lines changed: 1 addition & 0 deletions

@@ -8,6 +8,7 @@ flax
 gcsfs
 google-api-python-client
 google-cloud-aiplatform
+google-cloud-mldiagnostics
 google-cloud-monitoring
 grain[parquet]
 huggingface_hub

dependencies/requirements/generated_requirements/cuda12-requirements.txt
Lines changed: 2 additions & 3 deletions

@@ -66,12 +66,12 @@ google-cloud-audit-log>=0.4.0
 google-cloud-bigquery>=3.38.0
 google-cloud-core>=2.5.0
 google-cloud-logging>=3.12.1
+google-cloud-mldiagnostics>=0.5.5
 google-cloud-monitoring>=2.28.0
 google-cloud-resource-manager>=1.15.0
 google-cloud-storage>=3.6.0
 google-crc32c>=1.7.1
 google-genai>=1.52.0
-google-jetstream @ https://github.com/AI-Hypercomputer/JetStream/archive/29329e8e73820993f77cfc8efe34eb2a73f5de98.zip
 google-pasta>=0.2.0
 google-resumable-media>=2.8.0
 googleapis-common-protos>=1.72.0
@@ -120,7 +120,6 @@ mdurl>=0.1.2
 ml-collections>=1.1.0
 ml-dtypes>=0.5.4
 ml-goodput-measurement>=0.0.15
-mlperf-logging @ https://github.com/mlcommons/logging/archive/38ab22670527888c8eb7825a4ece176fcc36a95d.zip
 more-itertools>=10.8.0
 mpmath>=1.3.0
 msgpack>=1.1.2
@@ -255,4 +254,4 @@ xprof>=2.21.1
 xxhash>=3.6.0
 yarl>=1.22.0
 zipp>=3.23.0
-zstandard>=0.25.0
+zstandard>=0.25.0

dependencies/requirements/generated_requirements/tpu-requirements.txt
Lines changed: 2 additions & 3 deletions

@@ -66,12 +66,12 @@ google-cloud-audit-log>=0.4.0
 google-cloud-bigquery>=3.38.0
 google-cloud-core>=2.5.0
 google-cloud-logging>=3.12.1
+google-cloud-mldiagnostics>=0.5.5
 google-cloud-monitoring>=2.28.0
 google-cloud-resource-manager>=1.15.0
 google-cloud-storage>=3.6.0
 google-crc32c>=1.7.1
 google-genai>=1.52.0
-google-jetstream @ https://github.com/AI-Hypercomputer/JetStream/archive/29329e8e73820993f77cfc8efe34eb2a73f5de98.zip
 google-pasta>=0.2.0
 google-resumable-media>=2.8.0
 google-tunix>=0.1.3
@@ -123,7 +123,6 @@ mdurl>=0.1.2
 ml-collections>=1.1.0
 ml-dtypes>=0.5.4
 ml-goodput-measurement>=0.0.15
-mlperf-logging @ https://github.com/mlcommons/logging/archive/38ab22670527888c8eb7825a4ece176fcc36a95d.zip
 more-itertools>=10.8.0
 mpmath>=1.3.0
 msgpack>=1.1.2
@@ -245,4 +244,4 @@ xprof>=2.21.1
 xxhash>=3.6.0
 yarl>=1.22.0
 zipp>=3.23.0
-zstandard>=0.25.0
+zstandard>=0.25.0

dependencies/requirements/requirements.txt
Lines changed: 1 addition & 0 deletions

@@ -8,6 +8,7 @@ flax
 gcsfs
 google-api-python-client
 google-cloud-aiplatform
+google-cloud-mldiagnostics
 google-cloud-monitoring
 grain[parquet]
 huggingface_hub

dependencies/requirements/requirements_with_jax_ai_image.txt
Lines changed: 1 addition & 0 deletions

@@ -3,6 +3,7 @@
 datasets @ https://github.com/huggingface/datasets/archive/6790e138c00b87a1ddc72184f89e7814cf784360.zip
 flax>=0.11.0
 google-api-python-client
+google-cloud-mldiagnostics
 google-jetstream @ https://github.com/AI-Hypercomputer/JetStream/archive/29329e8e73820993f77cfc8efe34eb2a73f5de98.zip
 grain[parquet]>=0.2.15
 jaxtyping

src/MaxText/configs/base.yml
Lines changed: 7 additions & 1 deletion

@@ -612,7 +612,7 @@ colocated_python_data_input: False # experimental feature, under testing

 # Training loop
 steps: 150_001 # If set to -1 then will inherit value from learning_rate_schedule_steps
-log_period: 100 # Flushes Tensorboard
+log_period: 100 # The frequency of Tensorboard flushes, GCS metrics writes, and managed profiler metric updates.

 jax_distributed_initialization_timeout: 300 # This is the default timeout in https://github.com/jax-ml/jax/blob/main/jax/_src/distributed.py
 # Note there are two separate initializations - the jax coordination service (aka jax.distributed.initialize) and the backend (e.g. PjRT), the timeout above refers
@@ -658,6 +658,12 @@ profile_cleanly: True # If set to true, adds a block_until_ready on train state
 profile_periodically_period: -1 # If set to a positive integer, profile every profile_periodically_period steps.
 # This is useful to debug scenarios where performance is changing.

+# Managed ML diagnostics settings. If the feature is enabled, it will
+# - create a managed ML diagnostics run with all the MaxText configs
+# - upload xplane profiles, if profiling is enabled.
+# - upload training metrics at the defined log_period interval.
+managed_mldiagnostics: False # Whether to enable managed ML diagnostics.
+managed_mldiagnostics_run_group: "" # Optional. Used to group multiple runs.

 # Dump HLO options
 dump_hlo: False
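
Illustration only (not part of the commit): with the base.yml defaults above, the `(step + 1) % log_period` check used by the metric logger later in this commit picks the following upload steps.

# Illustration only, assuming the base.yml defaults log_period=100 and steps=150_001.
log_period, steps = 100, 150_001
upload_steps = [s for s in range(steps) if (s + 1) % log_period == 0 or s == steps - 1]
print(upload_steps[:3], upload_steps[-1])  # [99, 199, 299] 150000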

src/MaxText/configs/types.py
Lines changed: 14 additions & 0 deletions

@@ -1169,6 +1169,13 @@ class Metrics(BaseModel):
   )


+class ManagedMLDiagnostics(BaseModel):
+  """Configuration for managed mldiagnostics."""
+
+  managed_mldiagnostics: bool = Field(False, description="Enable managed mldiagnostics.")
+  managed_mldiagnostics_run_group: str = Field("", description="Name used to group multiple runs.")
+
+
 class Goodput(BaseModel):
   """Configuration for goodput monitoring."""

@@ -1428,6 +1435,10 @@ class DerivedValues(BaseModel):
       None,
       description="The full path to the tensorboard directory, derived from `run_name`.",
   )
+  managed_mldiagnostics_dir: None | str = Field(
+      None,
+      description="The full path to the managed mldiagnostics directory, derived from `run_name`.",
+  )

   rampup_end_step: None | int = Field(None, description="The step at which the batch size ramp-up phase concludes.")
   tensors_on_device: None | list[str] = Field(
@@ -1543,6 +1554,7 @@ class MaxTextConfig(
     Goodput,
     GcpMonitoring,
     Tensorboard,
+    ManagedMLDiagnostics,
     # Multimodal
     MultimodalGeneral,
     VisionTower,
@@ -1588,6 +1600,8 def set_derived_and_validate_values(self) -> "MaxTextConfig":
       self.checkpoint_dir = os.path.join(output_dir, "checkpoints", "")
       self.metrics_dir = os.path.join(output_dir, "metrics", "")
       self.tensorboard_dir = os.path.join(output_dir, "tensorboard", "")
+      # To work around SDK bug b/454725283, omit the trailing slash from managed_mldiagnostics_dir.
+      self.managed_mldiagnostics_dir = os.path.join(output_dir, "managed-mldiagnostics")
     else:
       self.checkpoint_dir, self.metrics_dir, self.tensorboard_dir = None, None, None
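
Illustration only (not part of the commit): the derived managed_mldiagnostics_dir deliberately skips the trailing separator that the other derived directories keep, as a sketch with a hypothetical output_dir shows.

# Illustration only; "gs://my-bucket/my-run" is a hypothetical output_dir.
import os

output_dir = "gs://my-bucket/my-run"
tensorboard_dir = os.path.join(output_dir, "tensorboard", "")  # keeps a trailing slash
managed_mldiagnostics_dir = os.path.join(output_dir, "managed-mldiagnostics")  # no trailing slash (b/454725283)
print(tensorboard_dir)            # gs://my-bucket/my-run/tensorboard/
print(managed_mldiagnostics_dir)  # gs://my-bucket/my-run/managed-mldiagnostics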

src/MaxText/managed_mldiagnostics.py (new file)
Lines changed: 76 additions & 0 deletions

@@ -0,0 +1,76 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Create the managed mldiagnostics run."""
+import json
+from typing import Any
+
+import google_cloud_mldiagnostics as mldiag
+
+from MaxText.pyconfig import KEYS_NO_LOGGING
+
+
+class ManagedMLDiagnostics:
+  """
+  ML Diagnostics Run, implemented with the Singleton pattern.
+  Ensures that only one instance of the class can exist.
+  """
+
+  _instance = None  # Class attribute to hold the single instance.
+
+  def __new__(cls, *args: Any, **kwargs: Any):
+    """
+    Overrides the instance creation method.
+    If an instance already exists, it is returned instead of creating a new one.
+    """
+    if cls._instance is None:
+      cls._instance = super(ManagedMLDiagnostics, cls).__new__(cls)
+
+    return cls._instance
+
+  def __init__(self, config):
+    """
+    Initializes the ManagedMLDiagnostics, ensuring this method runs only once.
+    """
+    # We need a flag to ensure __init__ only runs once,
+    # as the object is returned multiple times by __new__.
+    if hasattr(self, "_initialized"):
+      return
+    self._initialized = True
+    if not config.managed_mldiagnostics:
+      return
+
+    # Set up the managed mldiagnostics for profiling and metrics uploading.
+    def should_log_key(key, value):
+      if key in KEYS_NO_LOGGING:
+        return False
+      try:
+        # Verify the value can be serialized to json. If not, we'll skip it.
+        json.dumps(value, allow_nan=False)
+      except TypeError:
+        return False
+      return True
+
+    config_dict = {key: value for key, value in config.get_keys().items() if should_log_key(key, value)}
+
+    # Create a run for the managed mldiagnostics, and upload the configuration.
+    mldiag.machinelearning_run(
+        name=f"{config.run_name}",
+        run_group=config.managed_mldiagnostics_run_group,
+        configs=config_dict,
+        gcs_path=config.managed_mldiagnostics_dir,
+        # TODO: b/455623960 - Remove the following once multi-region and prod support are enabled.
+        region="us-central1",
+        environment="autopush",  # Default would be "prod" for formal launch.
+    )
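
Illustration only (not part of the commit): a minimal sketch of the singleton contract the class relies on. The import path and the SimpleNamespace stub are assumptions, and managed_mldiagnostics is left False so no SDK call is made.

# Sketch of the singleton behavior; stub config keeps the feature disabled.
from types import SimpleNamespace

from MaxText.managed_mldiagnostics import ManagedMLDiagnostics  # assumed import path

stub_config = SimpleNamespace(managed_mldiagnostics=False)  # hypothetical config stub
first = ManagedMLDiagnostics(stub_config)
second = ManagedMLDiagnostics(stub_config)
assert first is second  # __new__ returns the cached instance; __init__ body runs only once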

src/MaxText/metric_logger.py
Lines changed: 33 additions & 0 deletions

@@ -25,15 +25,31 @@

 import jax

+import google_cloud_mldiagnostics as mldiag
+
 from MaxText import max_logging
 from MaxText import max_utils
 from MaxText import maxtext_utils
+from MaxText.managed_mldiagnostics import ManagedMLDiagnostics
 from MaxText.utils import gcs_utils
 from MaxText.gcp_workload_monitor import GCPWorkloadMonitor
 from MaxText.globals import EPS

 from collections import defaultdict

+# Mapping MaxText metrics to managed profiler metrics
+_METRICS_TO_MANAGED = {
+    "learning/current_learning_rate": "learning_rate",
+    "learning/loss": "loss",
+    "learning/grad_norm": "gradient_norm",
+    "learning/total_weights": "total_weights",
+    "perf/step_time_seconds": "step_time",
+    "perf/per_device_tokens_per_sec": "throughput",
+    "perf/per_device_tflops_per_sec": "tflops",
+    # There are no mappings to the following metrics yet:
+    # "latency", "mfu"
+}
+

 def _prepare_metrics_for_json(metrics, step, run_name):
   """Converts metric dictionary into json supported types (e.g. float)"""
@@ -82,6 +98,8 @@ def __init__(self, config, learning_rate_schedule):
     self.learning_rate_schedule = learning_rate_schedule
     self.cumulative_eval_metrics = {"scalar": defaultdict(float)}
     self.buffered_train_metrics = None
+    if self.config.managed_mldiagnostics:
+      ManagedMLDiagnostics(config)  # Initialize the MLRun instance.

   def reset_eval_metrics(self):
     """Resets the cumulative metrics dictionary for a new evaluation run."""
@@ -101,6 +119,9 @@ def write_metrics(self, metrics, step, is_training=True):
     if self.config.gcs_metrics and jax.process_index() == 0:
       self.write_metrics_for_gcs(metrics, step, is_training)

+    if self.config.managed_mldiagnostics:
+      self.write_metrics_to_managed_mldiagnostics(metrics, step)
+
   def log_metrics(self, metrics, step, is_training):
     """Logs metrics via max_logging."""
     if is_training:
@@ -233,6 +254,18 @@ def write_metrics_to_tensorboard(self, metrics, step, is_training):
       max_logging.log(f"To see full metrics 'tensorboard --logdir={self.config.tensorboard_dir}'")
       self.writer.flush()

+  def write_metrics_to_managed_mldiagnostics(self, metrics, step):
+    """Write metrics to managed profiler."""
+    if (step + 1) % self.config.log_period == 0 or step == self.config.steps - 1:
+      for metric_name in metrics.get("scalar", []):
+        value = metrics["scalar"][metric_name]
+        # For NumPy/JAX array objects (including single-element arrays), use .item()
+        # to extract the native Python scalar.
+        if hasattr(value, "item"):
+          value = value.item()
+        mapped_metric_name = _METRICS_TO_MANAGED.get(metric_name, metric_name)
+        mldiag.metrics.record(mapped_metric_name, value, step=int(step))
+
   def write_setup_info_to_tensorboard(self, params):
     """Writes setup information like train config params, num model params, and XLA flags to TensorBoard."""
     num_model_parameters = max_utils.calculate_num_params_from_pytree(params)
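
Illustration only (not part of the commit): the mapping falls back to the original metric name when no managed name exists, and array-like scalars are converted with .item() before recording.

# Illustration only: a pared-down excerpt of the mapping with hypothetical values.
import numpy as np

_METRICS_TO_MANAGED = {"learning/loss": "loss", "perf/step_time_seconds": "step_time"}  # excerpt
scalars = {"learning/loss": np.float32(2.5), "perf/mfu": 0.41}  # hypothetical metric values

for name, value in scalars.items():
  if hasattr(value, "item"):
    value = value.item()  # NumPy/JAX scalar -> native Python float
  print(_METRICS_TO_MANAGED.get(name, name), value)
# loss 2.5
# perf/mfu 0.41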

src/MaxText/profiler.py
Lines changed: 24 additions & 0 deletions

@@ -21,7 +21,10 @@

 import jax

+import google_cloud_mldiagnostics as mldiag
+
 from MaxText import max_logging
+from MaxText.managed_mldiagnostics import ManagedMLDiagnostics


 class Profiler:
@@ -40,6 +43,10 @@ def __init__(self, config, offset_step=0):
     self.finished_initial_profile_step = self._set_last_profiler_step(config.profiler_steps, config.steps)
     if config.profiler != "" and self.start_initial_profile_step >= config.steps:
       raise ValueError("Profiling requested but initial profiling step set past training final step")
+    self.prof = None  # Managed mldiagnostics xprof collector.
+    self.managed_mldiagnostics = config.managed_mldiagnostics
+    if config.managed_mldiagnostics:
+      ManagedMLDiagnostics(config)  # Initialize the MLRun instance.

   def maybe_activate_profiler(self, step, state):
     """Conditionally activates the profiler based on the current step.
@@ -56,6 +63,16 @@ def activate(self, blocking_object=None, optional_postfix=""):
     nsys profiler becomes no-op when libcudart.so is not available on the system."""
     if self.profile_cleanly and blocking_object is not None:
       jax.block_until_ready(blocking_object)
+
+    if self.managed_mldiagnostics and self.mode == "xplane":
+      # Handle the special profiling logic for managed_mldiagnostics.
+      if self.prof is None:
+        # Start the xprof collector.
+        # Profile only the first process unless upload_all_profiler_results is set; None means all processes.
+        self.prof = mldiag.xprof(process_index_list=None if self.upload_all_profiler_results else [0])
+        self.prof.start()
+      return
+
     if not (self.upload_all_profiler_results or jax.process_index() == 0):
       return
     if self.mode != "":
@@ -84,6 +101,13 @@ def deactivate(self, blocking_object=None):
     The result is uploaded to the output bucket."""
     if self.profile_cleanly and blocking_object is not None:
       jax.block_until_ready(blocking_object)
+
+    if self.managed_mldiagnostics and self.mode == "xplane":
+      # Handle the special profiling logic for managed_mldiagnostics.
+      if self.prof is not None:
+        self.prof.stop()
+      return
+
     if not (self.upload_all_profiler_results or jax.process_index() == 0):
       return
     if self.mode == "nsys":
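
Illustration only (not part of the commit): a self-contained sketch of how the new guards short-circuit the pre-existing jax.profiler/nsys paths when managed xplane profiling is on. _FakeXprof and _MiniProfiler are stand-ins, and the placement of the early return reflects the reconstructed indentation above.

# Stand-in sketch of the activate/deactivate guard logic; not the real SDK.
class _FakeXprof:
  def start(self):
    print("managed xprof collection started")

  def stop(self):
    print("managed xprof collection stopped")


class _MiniProfiler:
  """Pared-down copy of the new activate/deactivate guards."""

  def __init__(self, managed_mldiagnostics, mode):
    self.managed_mldiagnostics, self.mode, self.prof = managed_mldiagnostics, mode, None

  def activate(self):
    if self.managed_mldiagnostics and self.mode == "xplane":
      if self.prof is None:
        self.prof = _FakeXprof()
        self.prof.start()
      return
    print("standard jax.profiler / nsys path")

  def deactivate(self):
    if self.managed_mldiagnostics and self.mode == "xplane":
      if self.prof is not None:
        self.prof.stop()
      return
    print("standard stop path")


prof = _MiniProfiler(managed_mldiagnostics=True, mode="xplane")
prof.activate()    # starts the collector once
prof.activate()    # collector already exists; still skips the standard path
prof.deactivate()  # stops the collector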
