
Commit f742b32

StrongerXi authored and pytorchmergebot committed
[dynamo] Avoid recompiling over unused objects (pytorch#156891)
Dynamo was aggressively specializing on lazy VTs: `set_name_hint` in `STORE_FAST` (and similar instructions) and `isinstance` in `LOAD_FAST_CHECK` both forced realization. This caused regional `torch.compile`, when used to optimize ComfyUI GGUF + LoRA, to either (1) exceed the recompilation limit of 8, resulting in suboptimal performance, or (2) when the recompilation limit was raised, spend unnecessarily long compiling (180s vs. 20s for Flux). This patch fixes the recompilation issue.

Pull Request resolved: pytorch#156891
Approved by: https://github.com/williamwen42, https://github.com/mlazos
1 parent 317520b commit f742b32
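
A minimal sketch of the failure mode described in the commit message (illustrative only, not the ComfyUI code; `apply` and its arguments are made up for this example). Dynamo's recompile logging makes the pre-patch guard churn visible:

import torch

# Each call binds fresh, never-used objects to locals inside the compiled frame.
@torch.compile(backend="eager")
def apply(x, items):
    for key, patch in items:  # `key`/`patch` are stored but never used
        pass
    return x * x + 1

torch._logging.set_logs(recompiles=True)  # report the guard failures behind recompiles
x = torch.rand(8)
apply(x, [("a", 1)])
apply(x, [("b", 2)])  # before this patch, specializing on `key`/`patch` could recompile here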

12 files changed (+76, -21 lines)

benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_torchbench_inference.csv

Lines changed: 2 additions & 2 deletions
@@ -210,7 +210,7 @@ mobilenet_v2,pass,0
-mobilenet_v2_quantized_qat,pass,2
+mobilenet_v2_quantized_qat,pass,3
@@ -274,7 +274,7 @@ resnet50,pass,0
-resnet50_quantized_qat,pass,2
+resnet50_quantized_qat,pass,3

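(In these expected-results CSVs the columns are model name, accuracy status, and expected graph-break count; the 2 -> 3 bumps for the quantized-QAT models here and in the sibling CSVs appear to track a behavior change introduced by this patch, as recorded by CI.)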
benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_torchbench_inference.csv

Lines changed: 2 additions & 2 deletions
@@ -210,7 +210,7 @@ mobilenet_v2,pass,0
-mobilenet_v2_quantized_qat,pass,2
+mobilenet_v2_quantized_qat,pass,3
@@ -274,7 +274,7 @@ resnet50,pass,0
-resnet50_quantized_qat,pass,2
+resnet50_quantized_qat,pass,3

benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_torchbench_inference.csv

Lines changed: 2 additions & 2 deletions
@@ -210,7 +210,7 @@ mobilenet_v2,pass,0
-mobilenet_v2_quantized_qat,pass,2
+mobilenet_v2_quantized_qat,pass,3
@@ -274,7 +274,7 @@ resnet50,pass,0
-resnet50_quantized_qat,pass,2
+resnet50_quantized_qat,pass,3

benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_torchbench_inference.csv

Lines changed: 2 additions & 2 deletions
@@ -194,7 +194,7 @@ mobilenet_v2,pass,0
-mobilenet_v2_quantized_qat,pass,2
+mobilenet_v2_quantized_qat,pass,3
@@ -258,7 +258,7 @@ resnet50,pass,0
-resnet50_quantized_qat,pass,2
+resnet50_quantized_qat,pass,3

benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_torchbench_inference.csv

Lines changed: 2 additions & 2 deletions
@@ -210,7 +210,7 @@ mobilenet_v2,pass,0
-mobilenet_v2_quantized_qat,pass,2
+mobilenet_v2_quantized_qat,pass,3
@@ -274,7 +274,7 @@ resnet50,pass,0
-resnet50_quantized_qat,pass,2
+resnet50_quantized_qat,pass,3

test/dynamo/test_higher_order_ops.py

Lines changed: 6 additions & 8 deletions
@@ -4483,15 +4483,13 @@ def wrapper_fn(model, params, buffers, inputs):
         if torch._dynamo.config.inline_inbuilt_nn_modules:
             expected = """\
 class GraphModule(torch.nn.Module):
-    def forward(self, L_params_l1_weight_: "f32[1, 1]", L_params_l1_bias_: "f32[1]", L_buffers_buffer_: "f32[1]", L_inputs_: "f32[1, 1]"):
-        l_params_l1_weight_ = L_params_l1_weight_
-        l_params_l1_bias_ = L_params_l1_bias_
-        l_buffers_buffer_ = L_buffers_buffer_
+    def forward(self, L_inputs_: "f32[1, 1]", L_model_modules_l1_parameters_weight_: "f32[1, 1]", L_model_modules_l1_parameters_bias_: "f32[1]", L_model_buffers_buffer_: "f32[1]"):
         l_inputs_ = L_inputs_
-
-        linear: "f32[1, 1]" = torch._C._nn.linear(l_inputs_, l_params_l1_weight_, l_params_l1_bias_); l_inputs_ = l_params_l1_weight_ = l_params_l1_bias_ = None
-
-        add: "f32[1, 1]" = linear + l_buffers_buffer_; linear = l_buffers_buffer_ = None
+        l_model_modules_l1_parameters_weight_ = L_model_modules_l1_parameters_weight_
+        l_model_modules_l1_parameters_bias_ = L_model_modules_l1_parameters_bias_
+        l_model_buffers_buffer_ = L_model_buffers_buffer_
+        linear: "f32[1, 1]" = torch._C._nn.linear(l_inputs_, l_model_modules_l1_parameters_weight_, l_model_modules_l1_parameters_bias_); l_inputs_ = l_model_modules_l1_parameters_weight_ = l_model_modules_l1_parameters_bias_ = None
+        add: "f32[1, 1]" = linear + l_model_buffers_buffer_; linear = l_model_buffers_buffer_ = None
         return (add,)
 """
             # We found Windows/Linux have some empty line difference, empty_line_normalizer will help fix it.

test/dynamo/test_misc.py

Lines changed: 1 addition & 1 deletion
@@ -6823,7 +6823,7 @@ def fn(x):
             # assign fstring to a variable causes the fstring to be used,
             # which realizes the variable tracker.
             f_str = f"{x.shape[0]}"
-            return x.sin()
+            return x.sin(), f_str
 
         guard_failure = None

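A plausible reading of this test change: after this patch, STORE_FAST no longer realizes a lazy VariableTracker just to set a name hint, so merely assigning the f-string to f_str would no longer exercise the realization path the comment describes; returning f_str makes the value genuinely used. A standalone sketch of the pattern (names are illustrative):

import torch

@torch.compile(backend="eager")
def fn(x):
    f_str = f"{x.shape[0]}"  # assignment alone no longer forces realization
    return x.sin(), f_str    # returning the string does force it

fn(torch.rand(3))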
test/dynamo/test_recompiles.py

Lines changed: 23 additions & 0 deletions
@@ -499,6 +499,29 @@ def f(x, foo):
         f(x, foo1)
         self.assertEqual(counter.frame_count, 2)
 
+    def test_no_recompile_over_unused_objects(self):
+        # This is a regression test case that imitates
+        # https://github.com/city96/ComfyUI-GGUF/blob/47bec6147569a138dd30ad3e14f190a36a3be456/ops.py#L169-L182
+        counter = torch._dynamo.testing.CompileCounter()
+
+        def f(x, key, patches):
+            return x * x + 1
+
+        @torch.compile(backend=counter, fullgraph=True)
+        def apply_patches(f, x, keys):
+            patches = []
+            for key, patch in keys:  # noqa: F402
+                patches.append(patch)
+                x = f(x, key, patches)
+            return x
+
+        # no recompilation
+        x = torch.rand(10)
+        apply_patches(f, x, [("a", 1), ("b", 2)])
+        self.assertEqual(counter.frame_count, 1)
+        apply_patches(f, x, [("c", 3), ("d", 4)])
+        self.assertEqual(counter.frame_count, 1)
+
 
 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests

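The new test drives compilation through torch._dynamo.testing.CompileCounter, which counts compiled frames via its frame_count attribute. A self-contained sketch of the same no-recompile check (the function and inputs are illustrative, not from the test suite):

import torch
import torch._dynamo.testing

counter = torch._dynamo.testing.CompileCounter()

@torch.compile(backend=counter, fullgraph=True)
def g(x, tag):
    return x + 1  # `tag` is accepted but never used

g(torch.rand(4), "a")
g(torch.rand(4), "b")
assert counter.frame_count == 1  # the unused `tag` must not cause a recompile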
test/torch_np/numpy_tests/core/test_multiarray.py

Lines changed: 1 addition & 0 deletions
@@ -3779,6 +3779,7 @@ def test_datetime(self):
         expected_idx = np.array([2, 1, 0])
         assert_array_equal(idx, expected_idx)
 
+    @xfail  # GH issue #157720
     def test_object(self):  # gh-6312
         a = np.random.choice(10, 1000)
         b = np.random.choice(["abc", "xy", "wz", "efghi", "qwst", "x"], 1000)

torch/_dynamo/symbolic_convert.py

Lines changed: 1 addition & 1 deletion
@@ -3035,7 +3035,7 @@ def END_FOR(self, inst):
         self.popn(2)
 
     def LOAD_FAST_CHECK(self, inst):
-        if isinstance(self.symbolic_locals.get(inst.argval, None), NullVariable):
+        if istype(self.symbolic_locals.get(inst.argval, None), NullVariable):
             unimplemented_v2(
                 gb_type="LOAD_FAST_CHECK on uninitialized variable",
                 context=inst.argval,

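The one-line fix swaps isinstance for Dynamo's istype. isinstance on a symbolic local can realize a LazyVariableTracker (the VariableTracker metaclass hooks __instancecheck__), installing guards and inviting recompiles; istype compares exact types and leaves lazy trackers alone. Roughly, simplified from torch/_dynamo/utils.py:

def istype(obj, allowed_types):
    # Exact type match: never consults __instancecheck__, so it cannot
    # trigger realization of a LazyVariableTracker the way isinstance can.
    if isinstance(allowed_types, (set, tuple, list)):
        return type(obj) in allowed_types
    return type(obj) is allowed_types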