Change default strategy back to INDEPENDENT. Send all contents in cache to the next PP rank.

hyukn · hyukn · commit 2b0ead5af4cf · 2025-12-17T07:31:27.000Z
Signed-off-by: Yukun He <23156053+hyukn@users.noreply.github.com>
diff --git a/tensorrt_llm/_torch/autotuner.py b/tensorrt_llm/_torch/autotuner.py
@@ -125,7 +125,7 @@ class TuningConfig:
     inputs_pre_hook: Callable = None
     use_cold_l2_cache: bool = False
     use_cuda_graph: bool = True
-    distributed_tuning_strategy: DistributedTuningStrategy = DistributedTuningStrategy.BROADCAST
+    distributed_tuning_strategy: DistributedTuningStrategy = DistributedTuningStrategy.INDEPENDENT
 
 
 @dataclass(unsafe_hash=True)
@@ -1524,16 +1524,9 @@ def cache_sync_pp_recv(self):
             self.profiling_cache.merge_cache_data(profiling_cache)
 
     def cache_sync_pp_send(self):
-        # Op with INDEPENDENT strategy shall not be send
+        # Send all cache contents to next pp rank
         if self.mapping.has_pp() and not self.mapping.is_last_pp_rank:
-            dependent_custom_ops = [
-                op for op, strategy in
-                self._map_op_to_distributed_strategy.items()
-                if strategy != DistributedTuningStrategy.INDEPENDENT
-            ]
-            dependent_custom_ops_cache = dict()
-            for op in dependent_custom_ops:
-                dependent_custom_ops_cache.update(
-                    self.profiling_cache.get_specific_custom_op(op))
-            self._dist.send_object(dependent_custom_ops_cache,
-                                   self.mapping.next_pp_rank())
+            self._dist.send_object(
+                self.profiling_cache.cache,
+                self.mapping.next_pp_rank(),
+            )