[pt2e] Avoid getting model device once per node

andrewor14 · andrewor14 · commit 4a08e16a4945 · 2025-08-05T14:22:23.000-07:00
**Summary:** Previously, we called `assert_and_get_unique_device` once per node in convert. This is expensive and unnecessary because the model device is the same across all nodes, so we should call it just once at the beginning and reuse that model device for every node. This is the torchao version of pytorch/pytorch#159901. **Test Plan:** ``` python test/quantization/pt2e/test_quantize_pt2e.py ```
diff --git a/torchao/quantization/pt2e/convert.py b/torchao/quantization/pt2e/convert.py
@@ -49,9 +49,7 @@
 )
 from torch.ao.quantization.fx.utils import (
     _get_module,
-    assert_and_get_unique_device,
     collect_producer_nodes,
-    create_getattr_from_value,
     graph_module_from_producer_nodes,
     node_arg_is_weight,
 )
@@ -73,7 +71,11 @@
 
 from torchao.quantization.pt2e import FROM_NODE_KEY
 from torchao.quantization.pt2e.observer import _is_activation_post_process
-from torchao.utils import TORCH_VERSION_AT_LEAST_2_6
+from torchao.quantization.pt2e.utils import create_getattr_from_value
+from torchao.utils import (
+    TORCH_VERSION_AT_LEAST_2_6,
+    _assert_and_get_unique_device,
+)
 
 if TORCH_VERSION_AT_LEAST_2_6:
     from torch.fx.traceback import NodeSource, NodeSourceAction
@@ -132,6 +134,7 @@ def _replace_observer_with_quantize_dequantize_node_decomposed(
     modules: dict[str, torch.nn.Module],
     node_name_to_scope: dict[str, tuple[str, type]],
     node_name_to_qconfig: dict[str, QConfigAny],
+    model_device: Optional[torch.device] = None,
 ) -> None:
     """Replace activation_post_process module call node with quantize and
     dequantize node working with decomposed Tensor
@@ -260,7 +263,11 @@ def add_quantize_dequantize_node_info(qdq_node, original_node):
                     # sure that the default overload can be used.
                     # TODO: maybe need more complex attr name here
                     qparam_node = create_getattr_from_value(
-                        model, graph, module_path + prefix + key, value_or_node
+                        model,
+                        graph,
+                        module_path + prefix + key,
+                        value_or_node,
+                        model_device,
                     )
                     quantize_op_inputs.append(qparam_node)
                 else:
@@ -407,6 +414,7 @@ def _replace_observer_with_quantize_dequantize_node(
     modules: dict[str, torch.nn.Module],
     node_name_to_scope: dict[str, tuple[str, type]],
     node_name_to_qconfig: dict[str, QConfigAny],
+    model_device: Optional[torch.device] = None,
 ) -> None:
     """Replace activation_post_process module call node with quantize and
     dequantize node
@@ -487,7 +495,11 @@ def _replace_observer_with_quantize_dequantize_node(
                     # For scale and zero_point values we register them as buffers in the root module.
                     # TODO: maybe need more complex attr name here
                     qparam_node = create_getattr_from_value(
-                        model, graph, module_path + prefix + key, value_or_node
+                        model,
+                        graph,
+                        module_path + prefix + key,
+                        value_or_node,
+                        model_device,
                     )
                     quantize_op_inputs.append(qparam_node)
                 else:
@@ -785,6 +797,7 @@ def convert_weighted_module(
     backend_config: BackendConfig,
     is_decomposed: bool = False,
     is_reference: bool = False,
+    model_device: Optional[torch.device] = None,
 ) -> None:
     """Convert a weighted module to reference quantized module in the model
     If the QConfig of a QAT module is not set, the module will still be converted to
@@ -873,7 +886,10 @@ def convert_weighted_module(
         is_ptq = weight_post_process is None
         if is_ptq:
             weight_post_process = qconfig.weight()  # type: ignore[union-attr, operator]
-            device = assert_and_get_unique_device(float_module)
+            if model_device is not None:
+                device = model_device
+            else:
+                device = _assert_and_get_unique_device(float_module)
             if device:
                 weight_post_process.to(device)
 
@@ -1076,6 +1092,7 @@ def convert(
     root_module_classes = tuple(root_module_to_quantized_reference_module.keys())
     qat_module_classes = get_qat_module_classes(backend_config)
     fused_module_classes = get_fused_module_classes(backend_config)
+    model_device = _assert_and_get_unique_device(model)
 
     for node in list(model.graph.nodes):
         if node.op == "placeholder":
@@ -1123,6 +1140,7 @@ def convert(
                         modules,
                         node_name_to_scope,
                         node_name_to_qconfig,
+                        model_device,
                     )
                 else:
                     _replace_observer_with_quantize_dequantize_node(
@@ -1131,6 +1149,7 @@ def convert(
                         modules,
                         node_name_to_scope,
                         node_name_to_qconfig,
+                        model_device,
                     )
             elif isinstance(mod, DeQuantStub):
                 _replace_observer_or_dequant_stub_with_dequantize_node(
@@ -1160,6 +1179,7 @@ def convert(
                     backend_config,
                     is_decomposed,
                     is_reference,
+                    model_device,
                 )
 
     # remove deadcode after converting observers to quant/dequant ops
diff --git a/torchao/quantization/pt2e/observer.py b/torchao/quantization/pt2e/observer.py
@@ -1885,7 +1885,9 @@ def convert(self, model: torch.fx.GraphModule, observer_node: Node):
             )
 
         from torchao.quantization.pt2e.utils import create_getattr_from_value
+        from torchao.utils import _assert_and_get_unique_device
 
+        model_device = _assert_and_get_unique_device(model)
         with model.graph.inserting_before(observer_node):
             assert self.block_size is not None, "Expecting block_size to be populated"
             assert self.original_dtype is not None, (
@@ -1915,10 +1917,18 @@ def convert(self, model: torch.fx.GraphModule, observer_node: Node):
             else:
                 scale, zero_point = self.calculate_qparams()
                 scale_node = create_getattr_from_value(
-                    model, model.graph, "_scale", scale
+                    model,
+                    model.graph,
+                    "_scale",
+                    scale,
+                    model_device,
                 )
                 zero_point_node = create_getattr_from_value(
-                    model, model.graph, "_zero_point", zero_point
+                    model,
+                    model.graph,
+                    "_zero_point",
+                    zero_point,
+                    model_device,
                 )
 
             q_node = model.graph.call_function(
diff --git a/torchao/quantization/pt2e/utils.py b/torchao/quantization/pt2e/utils.py
@@ -525,15 +525,20 @@ def get_attr_name(i: int):
 
 
 def create_getattr_from_value(
-    module: torch.nn.Module, graph: Graph, prefix: str, value: Any
+    module: torch.nn.Module,
+    graph: Graph,
+    prefix: str,
+    value: Any,
+    device: Optional[torch.device] = None,
 ) -> Node:
     """
     Given a value of any type, creates a getattr node corresponding to the value and
     registers the value as a buffer to the module.
     """
     get_new_attr_name = get_new_attr_name_with_prefix(prefix)
     attr_name = get_new_attr_name(module)
-    device = _assert_and_get_unique_device(module)
+    if device is None:
+        device = _assert_and_get_unique_device(module)
     new_value = (
         value.detach().clone()
         if isinstance(value, torch.Tensor)