
Commit 072f236

[None][fix] Fully resolve the tactic recovery issues in AutoTuner serialized cache (#9835)
Restrict tactic types to those compatible with AutoTuner cache serialization and deserialization.

Signed-off-by: Yukun He <[email protected]>
1 parent df1adfb commit 072f236
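The fix boils down to a single invariant: a tactic may only enter the serialized cache if it survives a `repr()` → `ast.literal_eval()` round trip. Below is a minimal standalone sketch of that invariant; the helper name `is_cache_serializable` is illustrative and not part of this commit.

```python
import ast


def is_cache_serializable(tactic) -> bool:
    """Return True if `tactic` survives the repr -> ast.literal_eval round trip
    that the serialized AutoTuner cache relies on."""
    try:
        return tactic == ast.literal_eval(repr(tactic))
    except Exception:
        # repr() of an arbitrary object (e.g. '<object object at 0x...>')
        # cannot be parsed back, so such a tactic cannot be recovered.
        return False


# Literals and containers of literals round-trip, so they are cache-safe.
assert is_cache_serializable({"tile": (128, 128), "cluster": [1, 1, 1]})
# An arbitrary Python object is not.
assert not is_cache_serializable(object())
```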


2 files changed: +70 −26 lines changed


tensorrt_llm/_torch/autotuner.py

Lines changed: 22 additions & 10 deletions
@@ -169,6 +169,10 @@ def get_valid_tactics(self, inputs: List[torch.Tensor],
         means. User can choose to implement their own types of tactic for flexibility, such as using a dict-typed
         to represent a collection of named configs.
 
+        The type of the tactic is arbitrary. But serialization/deserialization of the cache requires that the type is compatible with json.dumps/json.loads.
+        To evaluate if a type of tactic is compatible with current workflow, try the following code:
+            * assert YOUR_TACTIC_OBJECT == eval(repr(YOUR_TACTIC_OBJECT))
+
         tactic==-1 has special meaning, means the fallback kernel which should be able to implement any shapes
         This fallback tactic is needed for 2 reasons:
             * when the autotuner cannot find a valid tactic in it's cache.
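The `eval(repr(...))` guidance added to the docstring can be tried directly on candidate tactic values. The values below are illustrative only, not tied to any real runner; the example uses `ast.literal_eval`, the stricter form that the cache code below actually relies on.

```python
import ast

# Compatible tactic types: ints, tuples, lists, and dicts of literals.
for tactic in (-1, (128, 128), [1, 1, 1],
               {"tile": (256, 256), "cluster": [2, 2, 1]}):
    assert tactic == ast.literal_eval(repr(tactic))


# Incompatible: anything whose repr() is not a Python literal expression.
class OpaqueTactic:
    pass


try:
    ast.literal_eval(repr(OpaqueTactic()))
except (ValueError, SyntaxError):
    print("opaque objects cannot be recovered from the serialized cache")
```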
@@ -475,14 +479,22 @@ def _serialize_cache_to_json(self) -> Dict[str, Any]:
         }
 
         for key, value in self.cache.items():
-            # Convert tuple key to string for JSON compatibility
+            # Convert any simple object to string for JSON compatibility
            key_str = str(key)
-
             runner_id, tactic, min_time = value
+            tactic_str = repr(tactic)
+            try:
+                assert tactic == ast.literal_eval(
+                    tactic_str
+                ), f"Tactic is not compatible with json.dumps/json.loads"
+            except Exception as e:
+                logger.warning_once(
+                    f"[AutoTuner] Could not serialize tactic: {tactic_str} for cache key {key_str} due to {e}. Deserialization may fail.",
+                    key=tactic_str)
 
             serializable_cache["cache_data"][key_str] = {
                 "runner_id": runner_id,
-                "tactic": tactic,
+                "tactic": tactic_str,
                 "min_time": min_time,
             }
 
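With this change the cache file stores each tactic as its `repr()` string. A hypothetical serialized entry could look as follows; the cache key, runner id, and timing are made-up values for illustration.

```python
import json

cache_data = {
    "('gemm', (32, 64))": {  # str(key) of a made-up cache key
        "runner_id": 0,
        "tactic": "{'tuple_tile_size': (128, 128), 'list_cluster_size': [1, 1, 1]}",
        "min_time": 0.42,
    }
}
# The whole structure is now plain strings and numbers, so json.dumps works.
print(json.dumps({"cache_data": cache_data}, indent=2))
```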
@@ -511,22 +523,22 @@ def _deserialize_cache_from_json(
         cache = {}
         cache_data = serializable_cache["cache_data"]
 
-        def lists_to_tuples(obj):
-            if isinstance(obj, list):
-                return tuple(lists_to_tuples(x) for x in obj)
-            return obj
-
         for key_str, value in cache_data.items():
             # Reconstruct the tuple key safely
             try:
-                key = ast.literal_eval(key_str)  # Safer than eval()
+                key = ast.literal_eval(key_str)
             except (ValueError, SyntaxError):
                 logger.warning(
                     f"[AutoTuner] Could not reconstruct cache key: {key_str}")
                 continue
+            try:
+                tactic = ast.literal_eval(value["tactic"])
+            except (ValueError, TypeError):
+                logger.warning_once(
+                    f"[AutoTuner] Could not deserialize tactic: {value['tactic']} for cache key {key_str}",
+                    key=value["tactic"])
 
             runner_id = value["runner_id"]
-            tactic = lists_to_tuples(value["tactic"])
             min_time = value["min_time"]
 
             cache[key] = (runner_id, tactic, min_time)
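Dropping `lists_to_tuples` in favor of `ast.literal_eval` is what preserves the tactic's original container types. A small sketch of the behavioral difference, using a made-up tactic value:

```python
import ast
import json

tactic = {"tuple_tile_size": (256, 256), "list_cluster_size": [2, 2, 1]}

# Previously the raw tactic went through JSON, so tuples came back as lists
# and had to be coerced, erasing the original list/tuple distinction.
old_roundtrip = json.loads(json.dumps(tactic))
assert isinstance(old_roundtrip["tuple_tile_size"], list)  # type information lost

# Now the repr string passes through JSON untouched and literal_eval
# rebuilds the tactic with its original types.
new_roundtrip = ast.literal_eval(json.loads(json.dumps(repr(tactic))))
assert new_roundtrip == tactic
assert isinstance(new_roundtrip["tuple_tile_size"], tuple)
assert isinstance(new_roundtrip["list_cluster_size"], list)
```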

tests/unittest/_torch/misc/test_autotuner.py

Lines changed: 48 additions & 16 deletions
@@ -1,6 +1,7 @@
+import itertools
 import os
 import tempfile
-from typing import Dict, List
+from typing import Any, List
 
 import torch
 
@@ -327,48 +328,63 @@ def test_multiple_dynamic_shapes_cache():
 
 
 class GemmRunnerComplexTuningConfigs(TunableRunner):
+
+    # test serialization of different types of tactics
     valid_tactic_ids = [-1, 0, 1]
+    valid_tile_sizes = [(128, 128), (256, 256)]
+    valid_cluster_sizes = [[1, 1, 1], [2, 2, 1]]
+
     tune_max_num_tokens = 32
 
     def get_valid_tactics(
         self,
         inputs: List[FakeTensor],
         profile: OptimizationProfile,
         **kwargs,
-    ) -> List[Dict[str, int]]:
+    ) -> List[Any]:
         # During the tuning process, we verify if the tuning config behaves as expected
-
         assert inputs[0].shape[0] <= self.tune_max_num_tokens, \
             f"Input shape {inputs[0].shape[0]} is larger than the max num tokens {self.tune_max_num_tokens}"
 
         assert inputs[0][-1, 0] == inputs[0].shape[0], \
             f"Input shape {inputs[0].shape[0]} is not set through the pre_hook correctly"
 
-        # The simulated delay is not deterministic, so we need to return specific tactics here
         return [{
-            "block_size": block_size,
-            "tactic_id": tactic_id
-        } for tactic_id in self.valid_tactic_ids for block_size in [128, 256]]
+            "int_tactic_id": tactic_id,
+            "tuple_tile_size": tile_size,
+            "list_cluster_size": cluster_size,
+        } for tactic_id, tile_size, cluster_size in itertools.product(
+            self.valid_tactic_ids,
+            self.valid_tile_sizes,
+            self.valid_cluster_sizes,
+        )]
 
     def forward(
         self,
         /,
         inputs: List[torch.Tensor],
         *,
-        tactic: dict = {},
+        tactic: Any = -1,
     ) -> torch.Tensor:
         # Notice that in fallback case tactic is -1
         if tactic == -1:
             # assign default configs for fallback case
-            block_size, tactic_id = 128, -1
+            tactic_id, tile_size, cluster_size = -1, (128, 256), [1, 1, 1]
         else:
-            block_size, tactic_id = tactic["block_size"], tactic["tactic_id"]
-        assert tactic_id in self.valid_tactic_ids
+            tactic_id, tile_size, cluster_size = tactic[
+                "int_tactic_id"], tactic["tuple_tile_size"], tactic[
+                    "list_cluster_size"]
+
+        assert isinstance(tactic_id, int) and tactic_id in self.valid_tactic_ids
+        assert isinstance(tile_size, tuple) and len(tile_size) == 2 \
+            and tile_size in self.valid_tile_sizes
+        assert isinstance(cluster_size, list) and len(cluster_size) == 3 \
+            and cluster_size in self.valid_cluster_sizes
         return [gemm_0, gemm_1, gemm_fallback][tactic_id](*inputs)
 
     @staticmethod
     def inputs_pre_hook(inputs: List[torch.Tensor]):
-        # always set the first element to bo iota in x
+        # always set the first element to be the number of tokens in x
         x, w = inputs
         x_hooked = torch.zeros_like(x)
         x_hooked[-1, 0] = x.shape[0]
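For reference, the `itertools.product` call above enumerates every combination of the three class-level lists, so the test runner advertises 3 × 2 × 2 = 12 dict-typed tactics mixing int, tuple, and list fields. A standalone sketch using the same constants:

```python
import itertools

valid_tactic_ids = [-1, 0, 1]
valid_tile_sizes = [(128, 128), (256, 256)]
valid_cluster_sizes = [[1, 1, 1], [2, 2, 1]]

tactics = [{
    "int_tactic_id": tactic_id,
    "tuple_tile_size": tile_size,
    "list_cluster_size": cluster_size,
} for tactic_id, tile_size, cluster_size in itertools.product(
    valid_tactic_ids, valid_tile_sizes, valid_cluster_sizes)]

assert len(tactics) == 12
```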
@@ -389,13 +405,29 @@ def test_autotuner_tuning_configs():
         # Test if the number of tuning tokens is clipped to 32
         tune_max_num_tokens=GemmRunnerComplexTuningConfigs.tune_max_num_tokens,
         inputs_pre_hook=GemmRunnerComplexTuningConfigs.inputs_pre_hook,
+        use_cold_l2_cache=True,
+        use_cuda_graph=False,
     )
-    with autotune():
+    temp_dir = tempfile.TemporaryDirectory()
+    with autotune(cache_path=os.path.join(
+            temp_dir.name, "test_autotuner_tactic_configs.json")):
         tuner = AutoTuner.get()
-        runner, tactic = tuner.choose_one("test_autotuner_tactic_configs",
-                                          runners, tuning_config, [x, w])
+        runner, best_tactic = tuner.choose_one("test_autotuner_tactic_configs",
+                                               runners, tuning_config, [x, w])
+
+    runner_0([x, w], tactic=best_tactic)
+
+    # Test if the tactic can be loaded from cache correctly
+    AutoTuner.get().profiling_cache.clear()
+    AutoTuner.get().profiling_cache.load_cache(
+        os.path.join(temp_dir.name, "test_autotuner_tactic_configs.rank0.json"))
+
+    # No further tuning should be performed.
+    runner, deserialized_tactic = tuner.choose_one(
+        "test_autotuner_tactic_configs", runners, tuning_config, [x, w])
+    assert best_tactic == deserialized_tactic, "Tactic should be the same after deserialization"
 
-    runner_0.forward(inputs=[x, w], tactic=tactic)
+    runner_0([x, w], tactic=deserialized_tactic)
 
 
 def test_kernel_testing_single_context():
