
Commit b47abb4

Authored by h-guo18 and Fridah-nv
Move quantization & quant_moe to new inf optimizer (#112)
* refactor: move quantization and quant_moe to new inf optimizer (Signed-off-by: haoguo <[email protected]>)
* refactor: use quant_config from factory instead of new config type (Signed-off-by: haoguo <[email protected]>)
* refactor: del old files; update default.yaml (Signed-off-by: haoguo <[email protected]>)
* move helper class FakeFactory to _graph_test_helpers.py (Signed-off-by: haoguo <[email protected]>)
* polish: remove unreachable branch in quantization.py (Co-authored-by: Fridah-nv <[email protected]>; Signed-off-by: h-guo18 <[email protected]>)
* style: run pre-commit (Signed-off-by: haoguo <[email protected]>)
* fix to fetch hf_quant_config from fetched dir (Signed-off-by: Frida Hou <[email protected]>)

Signed-off-by: haoguo <[email protected]>
Signed-off-by: h-guo18 <[email protected]>
Signed-off-by: Frida Hou <[email protected]>
Co-authored-by: Fridah-nv <[email protected]>
1 parent 7810dd0 · commit b47abb4

File tree: 9 files changed (+207 −119 lines)

tensorrt_llm/_torch/auto_deploy/config/default.yaml

Lines changed: 4 additions & 0 deletions
@@ -19,3 +19,7 @@ transforms:
     stage: post_export
   cleanup_input_constraints:
     stage: post_export
+  quantize:
+    stage: pattern_matcher
+  quantize_moe:
+    stage: pattern_matcher
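These two entries are what lets the config-driven optimizer pick up the moved transforms by name. As a rough illustration only (the real InferenceOptimizer and registry APIs may differ; the runner below is a hypothetical sketch, not code from this PR), a stage runner driven by this YAML could look like:

# Hypothetical sketch of config-driven dispatch for the "pattern_matcher" stage
# entries added above. Registry lookup and call convention are assumptions.
from typing import Any, Dict

def run_stage(stage: str, transforms_cfg: Dict[str, Dict[str, Any]],
              registry: Dict[str, Any], gm, cm, factory):
    for name, cfg in transforms_cfg.items():
        if cfg.get("stage") != stage:
            continue
        transform = registry[name]()  # e.g. Quantization / QuantizeMOE from this PR
        gm, info = transform._apply(gm, cm, factory)  # real code likely calls a public entry point
        print(f"{name}: skipped={info.skipped}, matches={info.num_matches}")
    return gm

# Usage mirroring default.yaml:
# run_stage("pattern_matcher",
#           {"quantize": {"stage": "pattern_matcher"},
#            "quantize_moe": {"stage": "pattern_matcher"}},
#           registry, gm, cm, factory)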

tensorrt_llm/_torch/auto_deploy/models/hf.py

Lines changed: 3 additions & 3 deletions
@@ -305,7 +305,7 @@ def _prefetch_checkpoint(self, model_name_or_path: str, skip_prefetch_weights: b
         # at this point it should be a directory (either the original one or the download dir)
         assert os.path.isdir(fetched_dir), f"Checkpoint path {fetched_dir} is not a directory."

-        self._load_quantization_config()
+        self._load_quantization_config(fetched_dir)

         return fetched_dir

@@ -323,13 +323,13 @@ def _load_checkpoint(self, model: nn.Module, device: DeviceLikeType):
         # model-transformed weights,leading to unexpected key mismatches or format issues.
         load_checkpoint_in_model(model, checkpoint=ckpt_file, full_state_dict=False)

-    def _load_quantization_config(self):
+    def _load_quantization_config(self, fetched_dir: str):
         """Load the quantization config from the model directory if not done already."""
         if self._quant_config is not None:
             return

         assert self.model
-        hf_quant_config_file = os.path.join(self.model, "hf_quant_config.json")
+        hf_quant_config_file = os.path.join(fetched_dir, "hf_quant_config.json")
         if os.path.exists(hf_quant_config_file):
             with open(hf_quant_config_file, "r") as file:
                 quantization_config = json.load(file)
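The fix reads hf_quant_config.json from the prefetched checkpoint directory instead of from self.model, which may still be a hub model ID rather than a local path. A minimal standalone sketch of the corrected lookup (paraphrasing the diff above, not the full HF factory code):

# Minimal sketch, assuming a local fetched_dir that may contain hf_quant_config.json.
import json
import os

def load_quant_config(fetched_dir: str):
    cfg_file = os.path.join(fetched_dir, "hf_quant_config.json")
    if not os.path.exists(cfg_file):
        return None  # no quantization config shipped with the checkpoint
    with open(cfg_file, "r") as f:
        return json.load(f)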

tensorrt_llm/_torch/auto_deploy/transformations/library/quantization.py renamed to tensorrt_llm/_torch/auto_deploy/transform/library/quantization.py

Lines changed: 64 additions & 50 deletions
@@ -1,11 +1,12 @@
 from collections import defaultdict
 from functools import partial
-from typing import Any, Dict
+from typing import Dict, Tuple

 import torch.nn as nn
 from torch.fx import GraphModule, Node

-from ...utils.logger import ad_logger
+from ...models.factory import ModelFactory
+from ...shim.interface import CachedSequenceInterface
 from ...utils.node_utils import (
     extract_param_names_from_lin_node,
     get_quantization_params_from_linear_node,

@@ -20,7 +21,7 @@
     remove_output_quantizers,
     should_skip_quantization,
 )
-from .._graph import canonicalize_graph
+from ..interface import BaseTransform, TransformInfo, TransformRegistry


 def _insert_quantized_linear(

@@ -138,12 +139,8 @@ def get_scale_name(scale_name):
         scale_target_module = gm  # Register in root module
         scale_name_prefix = ""

-        ad_logger.info(f"Quantized BMM with dynamic weight tensor for node {node}")
     else:
         # If we can't determine the shape, skip quantization
-        ad_logger.warning(
-            f"BMM weight is dynamic tensor without shape metadata, skipping quantization for node {node}"
-        )
         return

     # Common logic for both parameter and dynamic tensor cases

@@ -169,53 +166,70 @@ def get_scale_name(scale_name):
     node.args = (*node.args, *scale_values)


-def quantize(gm: GraphModule, quant_config: Dict[str, Any]) -> None:
-    """Quantize the GraphModule and replace linear with quantized linear."""
-    # extract info from quant_config
-    is_quant_graph = is_quantized_graph(gm)
-    quant_algo = quant_config.get("quant_algo")
-    excluded_patterns = quant_config.get("exclude_modules", [])
-
-    # no quantization to do
-    if not (is_quant_graph or quant_config):
-        ad_logger.info("No quantization to do.")
-        return
-
-    # tracking quantized operations in the graph
-    quantized_nodes: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
-    for n in gm.graph.nodes:
-        if should_skip_quantization(n, excluded_patterns):
-            continue
-
-        # Process linear operations
-        if is_linear_op(n, include_quantization=False):
-            # get per-layer quantization format from the node
-            quant_algo_n: str = (
-                get_quantization_from_linear_node(n) if is_quant_graph else quant_algo
-            )
-            if not quant_algo_n:
-                continue
-
-            # insert quantized linear node
-            _insert_quantized_linear(gm, n, QuantizationImpl.create(quant_algo_n), is_quant_graph)
-            quantized_nodes[quant_algo_n]["linear"] += 1
-
-        # Process BMM operations
-        elif is_bmm_op(n):
-            if not quant_algo:
-                continue
-
-            # insert quantized bmm node
-            _insert_quantized_bmm(
-                gm, n, QuantizationImpl.create(quant_algo, is_bmm=True), is_quant_graph
-            )
-            quantized_nodes[quant_algo]["bmm"] += 1
-
-    if is_quant_graph:
-        remove_output_quantizers(gm)
-
-    canonicalize_graph(gm)
-    for quant_algo in quantized_nodes:
-        for op_type, count in quantized_nodes[quant_algo].items():
-            ad_logger.info(f"Found {count} {quant_algo} quantized {op_type} nodes.")
-    ad_logger.debug("After quantization: " + str(gm))
+@TransformRegistry.register("quantize")
+class Quantization(BaseTransform):
+    """Quantize the GraphModule and replace linear/BMM with quantized linear/BMM."""
+
+    def _apply(
+        self, gm: GraphModule, cm: CachedSequenceInterface, factory: ModelFactory
+    ) -> Tuple[GraphModule, TransformInfo]:
+        # extract info from quant_config
+        quant_config = factory.get_quant_config()
+        if not quant_config:
+            return gm, TransformInfo(
+                skipped=True, num_matches=0, is_clean=True, has_valid_shapes=True
+            )
+
+        is_quant_graph = is_quantized_graph(gm)
+        quant_algo = quant_config.get("quant_algo")
+        excluded_patterns = quant_config.get("exclude_modules", [])
+        if not quant_algo:
+            return gm, TransformInfo(
+                skipped=True, num_matches=0, is_clean=True, has_valid_shapes=True
+            )
+
+        # tracking quantized operations in the graph
+        quantized_nodes: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
+        for n in gm.graph.nodes:
+            if should_skip_quantization(n, excluded_patterns):
+                continue
+
+            # Process linear operations
+            if is_linear_op(n, include_quantization=False):
+                # get per-layer quantization format from the node
+                quant_algo_n: str = (
+                    get_quantization_from_linear_node(n) if is_quant_graph else quant_algo
+                )
+                if not quant_algo_n:
+                    continue
+
+                # insert quantized linear node
+                _insert_quantized_linear(
+                    gm, n, QuantizationImpl.create(quant_algo_n), is_quant_graph
+                )
+                quantized_nodes[quant_algo_n]["linear"] += 1
+
+            # Process BMM operations
+            elif is_bmm_op(n):
+                if not quant_algo:
+                    continue
+
+                # insert quantized bmm node
+                _insert_quantized_bmm(
+                    gm, n, QuantizationImpl.create(quant_algo, is_bmm=True), is_quant_graph
+                )
+                quantized_nodes[quant_algo]["bmm"] += 1
+
+        if is_quant_graph:
+            remove_output_quantizers(gm)
+
+        num_matches = 0
+        for quant_algo in quantized_nodes:
+            for op_type, count in quantized_nodes[quant_algo].items():
+                num_matches += count
+
+        info = TransformInfo(
+            skipped=False, num_matches=num_matches, is_clean=False, has_valid_shapes=True
+        )
+
+        return gm, info
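The free function is replaced by a class registered under the name "quantize"; logging and canonicalize_graph calls move out of the transform, which now reports its work through TransformInfo. For readers unfamiliar with this pattern, here is a stripped-down, hypothetical rendering of the registry machinery (the real BaseTransform, TransformInfo, and TransformRegistry live in ..interface and carry more state; this only illustrates the shape of the API the diff relies on):

# Hypothetical simplification of the registry pattern used above; not the actual
# tensorrt_llm implementation.
from dataclasses import dataclass
from typing import Callable, Dict

@dataclass
class TransformInfo:
    skipped: bool            # transform decided there was nothing to do
    num_matches: int         # how many graph nodes were rewritten
    is_clean: bool           # whether the graph still needs canonicalization
    has_valid_shapes: bool   # whether shape metadata is still trustworthy

class TransformRegistry:
    _registry: Dict[str, Callable] = {}

    @classmethod
    def register(cls, name: str):
        # used as @TransformRegistry.register("quantize") on a transform class
        def wrap(transform_cls):
            cls._registry[name] = transform_cls
            return transform_cls
        return wrap

    @classmethod
    def get(cls, name: str):
        return cls._registry[name]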

tensorrt_llm/_torch/auto_deploy/transformations/library/quantize_moe.py renamed to tensorrt_llm/_torch/auto_deploy/transform/library/quantize_moe.py

Lines changed: 52 additions & 40 deletions
@@ -1,14 +1,15 @@
 from functools import partial
-from typing import Any, Callable, Dict, List, Tuple
+from typing import Callable, List, Tuple

 import torch
 import torch.nn as nn
 from torch.fx import GraphModule, Node

-from ...utils.logger import ad_logger
+from ...models.factory import ModelFactory
+from ...shim.interface import CachedSequenceInterface
 from ...utils.node_utils import is_op
 from ...utils.quantization_utils import QuantizationImpl, should_skip_quantization
-from .._graph import canonicalize_graph
+from ..interface import BaseTransform, TransformInfo, TransformRegistry

 quantized_moe_op_map = {
     "FP8": torch.ops.auto_deploy.torch_quant_fp8_moe,

@@ -92,47 +93,10 @@ def collect_scales(index: int) -> Tuple[List[Node], List[Node], List[Node]]:
         quantized_op,
         args=tuple(args),
     )
-    ad_logger.debug(f"Updating {node.name} args to {new_node.args}")
     node.replace_all_uses_with(new_node)
     gm.graph.erase_node(node)


-def quantize_moe(gm: GraphModule, quant_config: Dict[str, Any]) -> None:
-    """
-    Traverse gm, find every torch.ops.auto_deploy.torch_moe, and replace it with the
-    quantized version using the quant_algo from quant_config.
-    """
-    quant_algo = quant_config.get("quant_algo")
-    if not quant_algo:
-        ad_logger.info("No quantization to do.")
-        return gm
-    excluded_patterns = quant_config.get("exclude_modules", [])
-
-    quant_impl = QuantizationImpl.create(quant_algo)
-    quantized_op = quantized_moe_op_map[quant_algo]
-
-    count = 0
-
-    for node in list(gm.graph.nodes):
-        if is_op(node, torch.ops.auto_deploy.torch_moe):
-            # Check that all expert weights should be quantized
-            w1_names, w2_names, w3_names = _extract_moe_weight_param_lists(node)
-            if any(
-                should_skip_quantization(n, excluded_patterns)
-                for n in w1_names + w2_names + w3_names
-            ):
-                continue
-            _quantize_moe_node(gm, node, quant_impl, quantized_op)
-            count += 1
-
-    if count == 0:
-        return gm
-
-    gm = canonicalize_graph(gm)
-    ad_logger.info(f"Found {count} {quant_algo} quantized {quantized_op} nodes.")
-    return
-
-
 # TODO(Fridah-nv): robust handling similar to `extract_param_names_from_lin_node` or expand it
 def _extract_moe_weight_param_lists(moe_node: Node) -> Tuple[List[str], List[str], List[str]]:
     """

@@ -165,3 +129,51 @@ def _unwrap_list(arg) -> List[str]:
     w3_names = _unwrap_list(w3_list)

     return w1_names, w2_names, w3_names
+
+
+@TransformRegistry.register("quantize_moe")
+class QuantizeMOE(BaseTransform):
+    """
+    Traverse gm, find every torch.ops.auto_deploy.torch_moe, and replace it with the
+    quantized version using the quant_algo from quant_config.
+    """
+
+    def _apply(
+        self, gm: GraphModule, cm: CachedSequenceInterface, factory: ModelFactory
+    ) -> Tuple[GraphModule, TransformInfo]:
+        quant_config = factory.get_quant_config()
+        quant_algo = quant_config.get("quant_algo") if quant_config else None
+
+        if not quant_config or not quant_algo:
+            return gm, TransformInfo(
+                skipped=True, num_matches=0, is_clean=True, has_valid_shapes=True
+            )
+        excluded_patterns = quant_config.get("exclude_modules", [])
+
+        quant_impl = QuantizationImpl.create(quant_algo)
+        quantized_op = quantized_moe_op_map[quant_algo]
+
+        count = 0
+
+        for node in list(gm.graph.nodes):
+            if is_op(node, torch.ops.auto_deploy.torch_moe):
+                # Check that all expert weights should be quantized
+                w1_names, w2_names, w3_names = _extract_moe_weight_param_lists(node)
+                if any(
+                    should_skip_quantization(n, excluded_patterns)
+                    for n in w1_names + w2_names + w3_names
+                ):
+                    continue
+                _quantize_moe_node(gm, node, quant_impl, quantized_op)
+                count += 1
+
+        if count == 0:
+            return gm, TransformInfo(
+                skipped=False, num_matches=0, is_clean=True, has_valid_shapes=True
+            )
+
+        info = TransformInfo(
+            skipped=False, num_matches=count, is_clean=False, has_valid_shapes=False
+        )
+
+        return gm, info
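Note that QuantizeMOE now distinguishes three outcomes instead of logging and returning: skipped (no quant config), ran but matched nothing, or rewrote the graph. A short illustrative helper (not library code) summarizing what each TransformInfo returned above means downstream:

# Illustrative only: interprets the three TransformInfo return paths of QuantizeMOE.
def summarize(info) -> str:
    if info.skipped:
        return "no quant_config/quant_algo: transform skipped, graph untouched"
    if info.num_matches == 0:
        return "ran, but no torch_moe node qualified: graph still clean"
    return (f"replaced {info.num_matches} MoE node(s); graph marked dirty "
            f"(is_clean=False) and shapes need re-validation (has_valid_shapes=False)")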

tensorrt_llm/_torch/auto_deploy/transformations/library/__init__.py

Lines changed: 0 additions & 2 deletions
@@ -6,8 +6,6 @@
 from .fused_moe import *
 from .fusion import *
 from .kvcache import *
-from .quantization import *
-from .quantize_moe import *
 from .rms_norm import *
 from .rope import *
 from .sharding import *

tensorrt_llm/_torch/auto_deploy/transformations/transform.py

Lines changed: 0 additions & 5 deletions
@@ -32,8 +32,6 @@
     match_rope_layout,
     match_rope_pattern,
     optimize_rope,
-    quantize,
-    quantize_moe,
     resize_kv_cache,
     sharding_transform_executor,
     update_in_out_nodes,

@@ -70,9 +68,6 @@ def __call__(self, cm: CachedSequenceInterface) -> nn.Module:
         ############################################################################################
         # RUN PATTERN MATCHER TRANSFORMATIONS TO STANDARDIZE GRAPH REPRESENTATION
         ############################################################################################
-        # quantization
-        quantize(egm, self.factory.get_quant_config())
-        quantize_moe(egm, self.factory.get_quant_config())

         # Match MoE pattern
         match_moe_pattern(egm)

tests/unittest/_torch/auto_deploy/_utils_test/_graph_test_helpers.py

Lines changed: 22 additions & 5 deletions
@@ -10,15 +10,32 @@

 from tensorrt_llm._torch.auto_deploy.custom_ops.attention_interface import SequenceInfo
 from tensorrt_llm._torch.auto_deploy.export import torch_export_to_gm
+from tensorrt_llm._torch.auto_deploy.models.factory import ModelFactory
 from tensorrt_llm._torch.auto_deploy.transformations.library.sharding import ShardingTransformInfo


-class FakeFactory:
-    def __init__(self, model: nn.Module):
-        self.model = model
+class FakeFactory(ModelFactory):
+    """Dummy factory to pass cache_config for testing."""

-    def build_model(self, device: str) -> nn.Module:
-        return self.model.to(device=device)
+    def __init__(self, model=None, cache_config=None, quant_config=None):
+        self._model = model
+        self.cache_config = cache_config
+        self.quant_config = quant_config
+
+    def build_model(self, device: str):
+        return self._model.to(device=device) if self._model else None
+
+    def _build_model(self, device: str):
+        return
+
+    def _load_checkpoint(self, model, device):
+        return
+
+    def get_cache_config(self):
+        return self.cache_config
+
+    def get_quant_config(self):
+        return self.quant_config


 class SequenceEmbeddingInfo(SequenceInfo):
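With the helper now subclassing ModelFactory and accepting a quant_config, tests can feed quantization settings into the new factory-driven transforms. A hypothetical test snippet (the import path assumes the _utils_test directory is on sys.path, as the unit tests arrange; the config values are examples):

# Example usage of the extended FakeFactory in a unit test.
import torch.nn as nn
from _graph_test_helpers import FakeFactory  # assumes _utils_test is on sys.path

factory = FakeFactory(
    model=nn.Linear(16, 16),
    quant_config={"quant_algo": "FP8", "exclude_modules": []},
)
assert factory.get_quant_config()["quant_algo"] == "FP8"
assert factory.build_model("cpu") is not None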
