[inductor] turn on loaf (for oss) by default (pytorch#162030)

shunting314 · pytorchmergebot · commit a8e9ed2407bd · 2025-09-24T06:02:02.000Z
Pull Request resolved: pytorch#162030 Approved by: https://github.com/eellison, https://github.com/jansel
diff --git a/test/dynamo/test_logging.py b/test/dynamo/test_logging.py
@@ -137,7 +137,11 @@ def test_fusion(self, records):
         fn_opt = torch.compile(inductor_schedule_fn, backend="inductor")
         fn_opt(torch.ones(1000, 1000, device=device_type))
         self.assertGreater(len(records), 0)
-        self.assertLess(len(records), 8)
+
+        # LOAF will add an extra round of fusion and result in more logs
+        self.assertLess(
+            len(records), 8 * (1 + torch._inductor.config.loop_ordering_after_fusion)
+        )
 
     @requires_cuda_and_triton
     @make_logging_test(cudagraphs=True)
diff --git a/test/inductor/test_torchinductor_strided_blocks.py b/test/inductor/test_torchinductor_strided_blocks.py
@@ -1104,6 +1104,8 @@ def foo(x, length):
     # bernoulli operation
     # TODO: fails for triton CPU "Failed to convert to LLVM IR"
     @test_torchinductor.xfail_if_triton_cpu
+    # Disable split_reductions on this test for now due to the interaction with LOAF
+    @config.patch(split_reductions=False)
     def test_removed_buffers(self):
         from torch.ops import aten
 
@@ -1114,8 +1116,8 @@ def fn(a):
         result, code = self._run_and_compare(
             fn,
             *[torch.ones(200, 200, device=self.device) * p],
-            expected_num_triton_kernels=2,
-            expected_num_block_pointers=3,
+            expected_num_triton_kernels=1,
+            expected_num_block_pointers=1,
             atol=p * 0.06,
             rtol=0.06,
         )
diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py
@@ -649,7 +649,10 @@ def use_autoheuristic(name: str) -> bool:
 benchmark_fusion: bool = os.environ.get("TORCHINDUCTOR_BENCHMARK_FUSION") == "1"
 enabled_metric_tables = os.environ.get("TORCHINDUCTOR_ENABLED_METRIC_TABLES", "")
 loop_ordering_after_fusion: bool = (
-    os.environ.get("TORCHINDUCTOR_LOOP_ORDERING_AFTER_FUSION", "0") == "1"
+    os.environ.get(
+        "TORCHINDUCTOR_LOOP_ORDERING_AFTER_FUSION", "0" if is_fbcode() else "1"
+    )
+    == "1"
 )
 
 # If fusing two nodes only save less then score_fusion_memory_threshold memory,