meta-pytorch
diff --git a/torchrec/metrics/metric_job_types.py b/torchrec/metrics/metric_job_types.py
Lines changed: 87 additions & 0 deletions
diff --git a/torchrec/metrics/metric_module.py b/torchrec/metrics/metric_module.py
Lines changed: 10 additions & 1 deletion
diff --git a/torchrec/metrics/metric_state_snapshot.py b/torchrec/metrics/metric_state_snapshot.py
Lines changed: 112 additions & 0 deletions
@@ -0,0 +1,87 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+import concurrent.futures
+from typing import Any, Dict
+
+import torch
+from torchrec.metrics.metric_module import MetricValue
+from torchrec.metrics.metric_state_snapshot import MetricStateSnapshot
+
+
+class MetricUpdateJob:
+    """
+    Unit of work for the CPU-side metric update path: carries the intermediate
+    model outputs used to update each metric's state tensors.
+
+    Attributes:
+        model_out: intermediate model outputs to be used for metric updates
+        transfer_completed_event: cuda event to track when the transfer to CPU is completed
+        kwargs: additional arguments from the trainer platform
+    """
+
+    # Fixed attribute set; instances carry no per-instance __dict__.
+    __slots__ = [
+        "model_out",
+        "transfer_completed_event",
+        "kwargs",
+    ]
+
+    def __init__(
+        self,
+        model_out: Dict[str, torch.Tensor],
+        transfer_completed_event: torch.cuda.Event,
+        kwargs: Dict[str, Any],
+    ) -> None:
+        """Store the three job fields as given; nothing is validated or copied."""
+        self.kwargs: Dict[str, Any] = kwargs
+        self.transfer_completed_event: torch.cuda.Event = transfer_completed_event
+        self.model_out: Dict[str, torch.Tensor] = model_out
+
+
+class MetricComputeJob:
+    """
+    Unit of work for the CPU-side metric compute path: perform an all gather
+    across ranks, compute metrics, and return the result to be published.
+
+    Attributes:
+        future: future to set the result of the compute job. Contains the computed metrics.
+        metric_state_snapshot: snapshot of metric state tensors across all metrics types.
+    """
+
+    # Fixed attribute set; instances carry no per-instance __dict__.
+    __slots__ = [
+        "future",
+        "metric_state_snapshot",
+    ]
+
+    def __init__(
+        self,
+        future: concurrent.futures.Future[Dict[str, MetricValue]],
+        metric_state_snapshot: MetricStateSnapshot,
+    ) -> None:
+        """Store the future and the snapshot as given; nothing is validated or copied."""
+        self.metric_state_snapshot: MetricStateSnapshot = metric_state_snapshot
+        self.future: concurrent.futures.Future[Dict[str, MetricValue]] = future
+
+
+class SynchronizationMarker:
+    """
+    Represents the synchronization marker that is stored in the update queue. This is the point
+    we want to synchronize across all ranks to compute metrics.
+    When processed, this marker will convert to a MetricComputeJob in the compute queue.
+
+    This separation of synchronization marker and compute job is so that the metric compute job
+    accurately includes all of the metric jobs that were scheduled before it.
+    """
+
+    # A list (not a bare string) so that iterating __slots__ yields attribute
+    # names, consistent with the other job types in this module.
+    __slots__ = ["future"]
+
+    def __init__(
+        self,
+        future: concurrent.futures.Future[Dict[str, MetricValue]],
+    ) -> None:
+        """
+        Args:
+            future: future to set the result of the compute job. Passed to the MetricComputeJob.
+        """
+        self.future: concurrent.futures.Future[Dict[str, MetricValue]] = future
@@ -10,6 +10,7 @@
 #!/usr/bin/env python3
 
 import abc
+import concurrent
 import logging
 import time
 from collections import defaultdict
@@ -56,7 +57,7 @@
 from torchrec.metrics.precision import PrecisionMetric
 from torchrec.metrics.precision_session import PrecisionSessionMetric
 from torchrec.metrics.rauc import RAUCMetric
-from torchrec.metrics.rec_metric import RecMetric, RecMetricList
+from torchrec.metrics.rec_metric import RecMetric, RecMetricException, RecMetricList
 from torchrec.metrics.recall import RecallMetric
 from torchrec.metrics.recall_session import RecallSessionMetric
 from torchrec.metrics.scalar import ScalarMetric
@@ -486,6 +487,14 @@ def load_pre_compute_states(
             for name, buf in self.throughput_metric.named_buffers():  # pyre-ignore[16]
                 buf.copy_(states[name])
 
+    def shutdown(self) -> None:
+        """
+        Log that a graceful shutdown has been requested.
+
+        This implementation only emits an info-level log message and performs
+        no other work.
+        """
+        logger.info("Initiating graceful shutdown...")
+
+    def async_compute(
+        self, future: concurrent.futures.Future[Dict[str, MetricValue]]
+    ) -> None:
+        """
+        Reject asynchronous metric computation.
+
+        Args:
+            future: not used by this implementation.
+
+        Raises:
+            RecMetricException: always; this implementation does not support
+                async_compute.
+        """
+        # NOTE(review): presumably a subclass provides the real async path — confirm.
+        raise RecMetricException("async_compute is not supported in RecMetricModule")
+
 
 def _generate_rec_metrics(
     metrics_config: MetricsConfig,
 
@@ -0,0 +1,112 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+import copy
+from typing import Any, cast, Dict, Optional
+
+from torch import nn
+
+from torchrec.metrics.rec_metric import (
+    RecComputeMode,
+    RecMetric,
+    RecMetricComputation,
+    RecMetricList,
+)
+from torchrec.metrics.throughput import ThroughputMetric
+
+
+class MetricStateSnapshot:
+    """
+    Encapsulates both rec metrics reduced states and throughput metric snapshots
+    for thread-safe CPU offloaded metric computation (updates and computes).
+    """
+
+    def __init__(
+        self,
+        metric_states: Dict[str, Any],
+        throughput_metric: Optional[ThroughputMetric],
+    ) -> None:
+        """
+        Args:
+            metric_states (Dict[str, Any]): Reduced states from rec metrics
+            throughput_metric (Optional[ThroughputMetric]): Deep copy of throughput metric
+        """
+        self.metric_states = metric_states
+        self.throughput_metric = throughput_metric
+
+    @classmethod
+    def from_metrics(
+        cls,
+        rec_metrics: RecMetricList,
+        throughput_metric: Optional[ThroughputMetric] = None,
+    ) -> "MetricStateSnapshot":
+        """
+        Generate a MetricStateSnapshot before performing an all gather. This provides a consistent
+        view of the local metric states without accessing the original references.
+
+        Apply reductions BEFORE queuing to reduce memory footprint. For instance, AUC holds a list of
+        tensors which can be reduced to a list of a single tensor. Only reduce lists for
+        fused mode compatibility.
+
+        Args:
+            rec_metrics (RecMetricList): metrics whose computation states are snapshotted
+            throughput_metric (Optional[ThroughputMetric]): throughput metric to deep copy
+                into the snapshot; when None, the snapshot carries no throughput metric
+
+        Returns:
+            MetricStateSnapshot: snapshot holding the reduced states and the
+                throughput metric copy (or None)
+        """
+        reduced_states: Dict[str, Any] = {}
+
+        for metric in rec_metrics.rec_metrics:
+            metric = cast(RecMetric, metric)
+            compute_mode = metric._compute_mode
+            if compute_mode in (
+                RecComputeMode.FUSED_TASKS_COMPUTATION,
+                RecComputeMode.FUSED_TASKS_AND_STATES_COMPUTATION,
+            ):
+                # Fused modes: only the first computation is snapshotted, keyed by
+                # the compute mode name instead of a task name.
+                # NOTE(review): presumably fused modes hold a single computation — confirm.
+                computation = metric._metrics_computations[0]
+                _load_into_reduced_states(
+                    compute_mode.name, computation, reduced_states
+                )
+            else:
+                # Unfused: one computation per task, keyed by the task name.
+                for task, computation in zip(
+                    metric._tasks, metric._metrics_computations
+                ):
+                    _load_into_reduced_states(task.name, computation, reduced_states)
+
+        # Snapshot throughput metric. Compare against None explicitly so the
+        # decision never depends on the truthiness of the metric object.
+        throughput_snapshot = None
+        if throughput_metric is not None:
+            throughput_snapshot = copy.deepcopy(throughput_metric)
+
+        return cls(
+            metric_states=reduced_states,
+            throughput_metric=throughput_snapshot,
+        )
+
+
+def _load_into_reduced_states(
+    prefix: str,
+    computation: nn.Module,
+    reduced_states: Dict[str, Any],
+) -> None:
+    """
+    Record every state named in ``computation._reductions`` into ``reduced_states``.
+
+    List-valued states that have a callable reduction are passed through that
+    reduction first; every other state is stored as-is. Keys have the form
+    ``{prefix}_{ComputationClassName}_{state_name}``.
+
+    Args:
+        prefix (str): prefix for the metric computation
+        computation (nn.Module): metric computation
+        reduced_states (Dict[str, Any]): reduced states dict to load into
+    """
+    typed_computation = cast(RecMetricComputation, computation)
+    key_prefix = f"{prefix}_{typed_computation.__class__.__name__}"
+
+    for state_name, reducer in typed_computation._reductions.items():
+        state_value = getattr(typed_computation, state_name)
+        # Only list-valued states are reduced; anything else is kept unchanged.
+        should_reduce = isinstance(state_value, list) and callable(reducer)
+        reduced_states[f"{key_prefix}_{state_name}"] = (
+            reducer(state_value) if should_reduce else state_value
+        )