Commit d991b46

iamzainhuda authored and meta-codesync[bot] committed
add variable batch size support to tower QPS (#3438)
Summary: Pull Request resolved: #3438

Add variable batch size support to tower QPS. This applies under the fused RecMetrics task mode, because fusion concatenates the state tensors across tasks to compute the metric value more efficiently.

Future TODOs: examine other metrics with a batch size dependency, and move batch size scheduling to the module level (RecMetricModule/RecMetric) so that batch_size can be passed as a parameter to update() according to the schedule, rather than being set up on a per-metric basis.

Reviewed By: irobert0126, AKhazane

Differential Revision: D83700799

fbshipit-source-id: a9e36c8485c4fe893525fab5213219e6d06df60b
1 parent d3722b6 commit d991b46
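For readers unfamiliar with batch size stages, the schedule this commit relies on can be illustrated standalone. The sketch below mirrors the _get_batch_size loop added in torchrec/metrics/tower_qps.py; Stage and resolve_batch_size are hypothetical stand-ins for illustration (the shipped type is BatchSizeStage and the logic lives in a private method on the metric), not torchrec API.

# Minimal sketch; Stage/resolve_batch_size are assumed stand-ins, not torchrec API.
# Each stage's batch size applies while num_batch <= max_iters; a terminal stage
# with max_iters=None applies forever, mirroring _get_batch_size in this diff.
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class Stage:
    batch_size: int
    max_iters: Optional[int]

def resolve_batch_size(stages: List[Stage], num_batch: int) -> int:
    while stages:
        stage = stages[0]
        if stage.max_iters is None:
            return stage.batch_size  # terminal stage, applies indefinitely
        if stage.max_iters < num_batch:
            stages.pop(0)  # stage exhausted, drop it and try the next
            continue
        return stage.batch_size
    raise AssertionError("schedule must end with a max_iters=None stage")

stages = [Stage(256, 1), Stage(512, None)]
print(resolve_batch_size(stages, 1))  # 256: first update, stage 1 still active
print(resolve_batch_size(stages, 2))  # 512: stage 1 exhausted after 1 iteration

This matches the new test_batch_size_schedule test below: the first update counts 256 examples per task, the second counts 512, for 768 total.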

File tree

2 files changed: +283 −4 lines changed


torchrec/metrics/tests/test_tower_qps.py

Lines changed: 212 additions & 2 deletions

@@ -10,11 +10,14 @@
 
 import unittest
 from functools import partial, update_wrapper
-from typing import Callable, Dict, List, Optional, Tuple, Type
+from typing import Any, Callable, Dict, List, Optional, OrderedDict, Tuple, Type, Union
+from collections import OrderedDict
+from unittest.mock import Mock, patch
 
 import torch
 import torch.distributed as dist
-from torchrec.metrics.metrics_config import DefaultTaskInfo
+from torch import Tensor
+from torchrec.metrics.metrics_config import BatchSizeStage, DefaultTaskInfo
 from torchrec.metrics.model_utils import parse_task_model_outputs
 from torchrec.metrics.rec_metric import (
     RecComputeMode,
@@ -159,6 +162,10 @@ def compute(
 
 
 class TowerQPSMetricTest(unittest.TestCase):
+    def setUp(self) -> None:
+        self.world_size = 1
+        self.batch_size = 256
+
     target_clazz: Type[RecMetric] = TowerQPSMetric
     task_names: str = "qps"
 
@@ -377,3 +384,206 @@ def test_tower_qps_update_with_invalid_tensors(self) -> None:
                 "key_2": torch.rand(batch_size),
             },
         )
+
+    @patch("torchrec.metrics.tower_qps.time.monotonic")
+    def test_batch_size_schedule(self, time_mock: Mock) -> None:
+
+        def _gen_data_with_batch_size(
+            batch_size: int,
+        ) -> Dict[str, Union[Dict[str, Tensor], Tensor]]:
+            return {
+                "labels": {
+                    "t1": torch.rand(batch_size),
+                    "t2": torch.rand(batch_size),
+                    "t3": torch.rand(batch_size),
+                },
+                "predictions": torch.ones(batch_size),
+                "weights": torch.rand(batch_size),
+            }
+
+        batch_size_stages = [BatchSizeStage(256, 1), BatchSizeStage(512, None)]
+        time_mock.return_value = 1
+        batch_size = 256
+        task_names = ["t1", "t2", "t3"]
+        tasks = gen_test_tasks(task_names)
+        metric = TowerQPSMetric(
+            my_rank=0,
+            tasks=tasks,
+            batch_size=batch_size,
+            world_size=1,
+            window_size=1000,
+            batch_size_stages=batch_size_stages,
+            compute_mode=RecComputeMode.FUSED_TASKS_COMPUTATION,
+        )
+
+        data = _gen_data_with_batch_size(batch_size_stages[0].batch_size)
+        metric.update(**data)  # pyre-ignore[6]
+
+        self.assertEqual(
+            metric.compute(),
+            {
+                "qps-t1|lifetime_qps": 0,
+                "qps-t2|lifetime_qps": 0,
+                "qps-t3|lifetime_qps": 0,
+                "qps-t1|window_qps": 0,
+                "qps-t2|window_qps": 0,
+                "qps-t3|window_qps": 0,
+                "qps-t1|total_examples": 256,
+                "qps-t2|total_examples": 256,
+                "qps-t3|total_examples": 256,
+            },
+        )
+
+        data = _gen_data_with_batch_size(batch_size_stages[1].batch_size)
+        metric.update(**data)  # pyre-ignore[6]
+
+        self.assertEqual(
+            metric.compute(),
+            {
+                "qps-t1|lifetime_qps": 0,
+                "qps-t2|lifetime_qps": 0,
+                "qps-t3|lifetime_qps": 0,
+                "qps-t1|window_qps": 0,
+                "qps-t2|window_qps": 0,
+                "qps-t3|window_qps": 0,
+                "qps-t1|total_examples": 768,
+                "qps-t2|total_examples": 768,
+                "qps-t3|total_examples": 768,
+            },
+        )
+
+    def test_num_batch_without_batch_size_stages(self) -> None:
+        task_names = ["t1", "t2", "t3"]
+        tasks = gen_test_tasks(task_names)
+        metric = TowerQPSMetric(
+            my_rank=0,
+            tasks=tasks,
+            batch_size=self.batch_size,
+            world_size=self.world_size,
+            window_size=1000,
+            compute_mode=RecComputeMode.FUSED_TASKS_COMPUTATION,
+        )
+
+        self.assertFalse(hasattr(metric, "num_batch"))
+
+        metric.update(
+            labels={
+                "t1": torch.rand(self.batch_size),
+                "t2": torch.rand(self.batch_size),
+                "t3": torch.rand(self.batch_size),
+            },
+            predictions=torch.ones(self.batch_size),
+            weights=torch.rand(self.batch_size),
+        )
+        state_dict: Dict[str, Any] = metric.state_dict()
+        self.assertNotIn("num_batch", state_dict)
+
+    def test_state_dict_load_module_lifecycle(self) -> None:
+        task_names = ["t1", "t2", "t3"]
+        tasks = gen_test_tasks(task_names)
+        metric = TowerQPSMetric(
+            my_rank=0,
+            tasks=tasks,
+            batch_size=self.batch_size,
+            world_size=self.world_size,
+            window_size=1000,
+            compute_mode=RecComputeMode.FUSED_TASKS_COMPUTATION,
+            batch_size_stages=[BatchSizeStage(256, 1), BatchSizeStage(512, None)],
+        )
+
+        self.assertTrue(hasattr(metric, "_num_batch"))
+
+        metric.update(
+            labels={
+                "t1": torch.rand(self.batch_size),
+                "t2": torch.rand(self.batch_size),
+                "t3": torch.rand(self.batch_size),
+            },
+            predictions=torch.ones(self.batch_size),
+            weights=torch.rand(self.batch_size),
+        )
+        self.assertEqual(metric._num_batch, 1)
+        state_dict = metric.state_dict()
+        self.assertIn("num_batch", state_dict)
+        self.assertEqual(state_dict["num_batch"].item(), metric._num_batch)
+
+        new_metric = TowerQPSMetric(
+            my_rank=0,
+            tasks=tasks,
+            batch_size=self.batch_size,
+            world_size=self.world_size,
+            window_size=1000,
+            compute_mode=RecComputeMode.FUSED_TASKS_COMPUTATION,
+            batch_size_stages=[BatchSizeStage(256, 1), BatchSizeStage(512, None)],
+        )
+        self.assertEqual(new_metric._num_batch, 0)
+        new_metric.load_state_dict(state_dict)
+        self.assertEqual(new_metric._num_batch, 1)
+
+        state_dict = new_metric.state_dict()
+        self.assertIn("num_batch", state_dict)
+        self.assertEqual(state_dict["num_batch"].item(), new_metric._num_batch)
+
+    def test_state_dict_hook_adds_key(self) -> None:
+        task_names = ["t1", "t2", "t3"]
+        tasks = gen_test_tasks(task_names)
+        metric = TowerQPSMetric(
+            my_rank=0,
+            tasks=tasks,
+            batch_size=self.batch_size,
+            world_size=self.world_size,
+            window_size=1000,
+            compute_mode=RecComputeMode.FUSED_TASKS_COMPUTATION,
+            batch_size_stages=[BatchSizeStage(256, 1), BatchSizeStage(256, None)],
+        )
+
+        for _ in range(5):
+            metric.update(
+                labels={
+                    "t1": torch.rand(self.batch_size),
+                    "t2": torch.rand(self.batch_size),
+                    "t3": torch.rand(self.batch_size),
+                },
+                predictions=torch.ones(self.batch_size),
+                weights=torch.rand(self.batch_size),
+            )
+        state_dict: OrderedDict[str, torch.Tensor] = OrderedDict()
+        prefix: str = "test_prefix_"
+        metric.state_dict_hook(metric, state_dict, prefix, {})
+        self.assertIn(f"{prefix}num_batch", state_dict)
+        self.assertEqual(state_dict[f"{prefix}num_batch"].item(), 5)
+
+    def test_state_dict_hook_no_batch_size_stages(self) -> None:
+        task_names = ["t1", "t2", "t3"]
+        tasks = gen_test_tasks(task_names)
+        metric = TowerQPSMetric(
+            my_rank=0,
+            tasks=tasks,
+            batch_size=self.batch_size,
+            world_size=self.world_size,
+            window_size=1000,
+            compute_mode=RecComputeMode.FUSED_TASKS_COMPUTATION,
+            batch_size_stages=None,
+        )
+        state_dict: OrderedDict[str, torch.Tensor] = OrderedDict()
+        prefix: str = "test_prefix_"
+        metric.state_dict_hook(metric, state_dict, prefix, {})
+        self.assertNotIn(f"{prefix}num_batch", state_dict)
+
+    def test_load_state_dict_hook_restores_value(self) -> None:
+        task_names = ["t1", "t2", "t3"]
+        tasks = gen_test_tasks(task_names)
+        metric = TowerQPSMetric(
+            my_rank=0,
+            tasks=tasks,
+            batch_size=self.batch_size,
+            world_size=self.world_size,
+            window_size=1000,
+            compute_mode=RecComputeMode.FUSED_TASKS_COMPUTATION,
+            batch_size_stages=[BatchSizeStage(256, 1), BatchSizeStage(512, None)],
+        )
+        state_dict: OrderedDict[str, torch.Tensor] = OrderedDict()
+        prefix: str = "test_prefix_"
+        state_dict[f"{prefix}num_batch"] = torch.tensor(10, dtype=torch.long)
+        metric.load_state_dict_hook(state_dict, prefix, {}, True, [], [], [])
+        self.assertEqual(metric._num_batch, 10)

torchrec/metrics/tower_qps.py

Lines changed: 71 additions & 2 deletions

@@ -7,13 +7,16 @@
 
 # pyre-strict
 
+import copy
 import time
-from typing import Any, cast, Dict, List, Optional, Type
+from typing import Any, cast, Dict, List, Optional, OrderedDict, Type
 
 import torch
 import torch.distributed as dist
+from torch import nn
+from torchrec.distributed.utils import none_throws
 
-from torchrec.metrics.metrics_config import RecComputeMode, RecTaskInfo
+from torchrec.metrics.metrics_config import BatchSizeStage, RecComputeMode, RecTaskInfo
 from torchrec.metrics.metrics_namespace import MetricName, MetricNamespace, MetricPrefix
 from torchrec.metrics.rec_metric import (
     MetricComputationReport,
@@ -194,6 +197,7 @@ def __init__(
         fused_update_limit: int = 0,
         process_group: Optional[dist.ProcessGroup] = None,
         warmup_steps: int = WARMUP_STEPS,
+        batch_size_stages: Optional[List[BatchSizeStage]] = None,
         **kwargs: Any,
     ) -> None:
         if fused_update_limit > 0:
@@ -213,6 +217,18 @@
             **kwargs,
         )
 
+        self._batch_size = batch_size
+        self._world_size = world_size
+        self._batch_size_stages: Optional[List[BatchSizeStage]] = copy.deepcopy(
+            batch_size_stages
+        )
+
+        if self._batch_size_stages is not None:
+            self._num_batch: int = 0
+
+            self._register_load_state_dict_pre_hook(self.load_state_dict_hook)
+            self.register_state_dict_post_hook(self.state_dict_hook)
+
     def update(
         self,
         *,
@@ -221,6 +237,9 @@ def update(
         weights: Optional[RecModelOutput],
         **kwargs: Dict[str, Any],
     ) -> None:
+        if self._batch_size_stages is not None:
+            self._num_batch += 1
+            self._batch_size = self._get_batch_size()
         with torch.no_grad():
             if self._compute_mode in [
                 RecComputeMode.FUSED_TASKS_COMPUTATION,
@@ -313,3 +332,53 @@ def update(
                 labels=task_labels,
                 weights=None,
             )
+
+    def _get_batch_size(self) -> int:
+        if not self._batch_size_stages:
+            return self._batch_size
+
+        batch_size_stages = none_throws(self._batch_size_stages)
+        while self._batch_size_stages:
+            stage = self._batch_size_stages[0]
+            if stage.max_iters is None:
+                assert len(batch_size_stages) == 1
+                return stage.batch_size
+            if stage.max_iters < self._num_batch:
+                batch_size_stages.pop(0)
+                continue
+            return stage.batch_size
+        raise AssertionError("Unreachable, batch_size_stages should always have 1 item")
+
+    @staticmethod
+    def state_dict_hook(
+        module: nn.Module,
+        state_dict: OrderedDict[str, torch.Tensor],
+        prefix: str,
+        local_metadata: Dict[str, Any],
+    ) -> None:
+        """
+        The state dict hook and load state dict hook exist to ensure we load num_batch for a metric with
+        batch_size_stages set. The reason we take this approach, as opposed to saving num_batch as a buffer,
+        is that in some cases we access the value from a CPU workload while the tensors are on GPU. This
+        would incur a device-to-host call, which is expensive and blocking.
+        """
+        if module._batch_size_stages is not None:
+            num_batch_key = f"{prefix}num_batch"
+            state_dict[num_batch_key] = torch.tensor(
+                module._num_batch, dtype=torch.long
+            )
+
+    def load_state_dict_hook(
+        self,
+        state_dict: OrderedDict[str, torch.Tensor],
+        prefix: str,
+        local_metadata: Dict[str, Any],
+        strict: bool,
+        missing_keys: List[str],
+        unexpected_keys: List[str],
+        error_msgs: List[str],
+    ) -> None:
+        key = f"{prefix}num_batch"
+        if key in state_dict and self._batch_size_stages is not None:
+            num_batch_tensor = state_dict.pop(key)
+            self._num_batch = int(num_batch_tensor.item())
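
To make the hook lifecycle concrete, here is a hedged usage sketch of the checkpoint round-trip, modeled on test_state_dict_load_module_lifecycle above. It assumes gen_test_tasks is importable from torchrec.metrics.test_utils (as in the test file) and uses a single task for brevity; treat it as a sketch of the flow, not a verified snippet.

import torch
from torchrec.metrics.metrics_config import BatchSizeStage
from torchrec.metrics.rec_metric import RecComputeMode
from torchrec.metrics.test_utils import gen_test_tasks  # test helper, assumed importable
from torchrec.metrics.tower_qps import TowerQPSMetric

tasks = gen_test_tasks(["t1"])
stages = [BatchSizeStage(256, 1), BatchSizeStage(512, None)]
metric = TowerQPSMetric(
    my_rank=0,
    tasks=tasks,
    batch_size=256,
    world_size=1,
    window_size=1000,
    batch_size_stages=stages,  # the constructor deep-copies, so reuse below is safe
    compute_mode=RecComputeMode.FUSED_TASKS_COMPUTATION,
)
metric.update(
    labels={"t1": torch.rand(256)},
    predictions=torch.ones(256),
    weights=torch.rand(256),
)

checkpoint = metric.state_dict()  # post-hook adds the "num_batch" key

restored = TowerQPSMetric(
    my_rank=0,
    tasks=tasks,
    batch_size=256,
    world_size=1,
    window_size=1000,
    batch_size_stages=stages,
    compute_mode=RecComputeMode.FUSED_TASKS_COMPUTATION,
)
restored.load_state_dict(checkpoint)  # pre-hook pops "num_batch" and restores it
assert restored._num_batch == metric._num_batch  # schedule resumes where it left off

Keeping num_batch out of the module's buffers avoids a device-to-host sync when a CPU workload reads the counter, which is the design rationale stated in the state_dict_hook docstring.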
