
Commit 831c32d

Lint
Signed-off-by: Jingyu Xin <[email protected]>
1 parent 071f167 commit 831c32d

4 files changed: +33 -20 lines changed


examples/diffusers/quantization/config.py

Lines changed: 1 addition & 1 deletion
@@ -36,7 +36,7 @@
         "*weight_quantizer": {"num_bits": (4, 3), "axis": None},
         "*input_quantizer": {"num_bits": (4, 3), "axis": None},
         "*output_quantizer": {"enable": False},
-        "*[qkv]_bmm_quantizer": {"type": "dynamic", "num_bits": (4, 3),"block_sizes": {-2: 32}},
+        "*[qkv]_bmm_quantizer": {"type": "dynamic", "num_bits": (4, 3), "block_sizes": {-2: 32}},
         "*softmax_quantizer": {
             "num_bits": (4, 3),
             "axis": None,

modelopt/torch/quantization/export_onnx.py

Lines changed: 8 additions & 5 deletions
@@ -234,6 +234,7 @@ def _fp8_quantize(
     )
     return q_op
 
+
 def _fp8_block_quantize(
     g: torch.onnx._internal.jit_utils.GraphContext,
     inputs: torch.Value,

@@ -289,13 +290,14 @@ def _fp8_dequantize(
     out = g.op("Cast", out, to_i=onnx_dtype_map[otype])  # type: ignore[index]
     return out
 
+
 def _fp8_block_dequantize(
     g: torch.onnx._internal.jit_utils.GraphContext,
     inputs: torch.Value,
     scales: torch.Value,
     trt_high_precision_dtype: str,
     otype: str | None = None,
-    block_sizes: list = [1,1,128,1]
+    block_sizes: list = [1, 1, 128, 1],
 ):
     """Helper Function for Dequantization."""
     output_shape = sym_help._get_tensor_sizes(inputs)

@@ -339,8 +341,7 @@ def export_fp8(
         )
         return _fp8_block_dequantize(
             g, q_tensor, scales_output, trt_high_precision_dtype, otype, block_sizes
-            )
-
+        )
 
 
 def scaled_dot_product_attention(

@@ -498,7 +499,9 @@ def export_fp8_mha(
     v_input_dtype = value.type().scalarType()
     if {q_input_dtype, k_input_dtype, v_input_dtype} != {high_precision_flag}:
         raise ValueError("The quantized MHA must have 16-bit inputs.")
-    query_scaled = export_fp8(g, query_scaled, q_quantized_scale, high_precision_flag, q_block_shape)
+    query_scaled = export_fp8(
+        g, query_scaled, q_quantized_scale, high_precision_flag, q_block_shape
+    )
     query_scaled = g.op("Cast", query_scaled, to_i=onnx_dtype_map["Float"])
     key_transposed_scaled = export_fp8(
         g, key_transposed_scaled, k_quantized_scale, high_precision_flag, k_block_shape

@@ -531,7 +534,7 @@ def export_fp8_mha(
 
     if not disable_fp8_mha:
         # Softmax's output scale is hard coded to 1.0
-        # We cannot do block quant for the softmax's output
+        # We cannot do block quant for the softmax's output
         attn_weight = export_fp8(g, attn_weight, 1.0, high_precision_flag, None)
         attn_weight = g.op("Cast", attn_weight, to_i=onnx_dtype_map["Float"])
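As a sanity check on the new block_sizes argument (default [1, 1, 128, 1]), here is a small sketch of how a per-dimension block-size list relates an input shape to the shape of its per-block scales; block_scale_shape is a hypothetical helper written for illustration, assuming one scale per block along each dimension:

# Hypothetical helper, not part of the diff: one scale per block per dimension.
def block_scale_shape(tensor_shape: list[int], block_sizes: list[int]) -> list[int]:
    assert len(tensor_shape) == len(block_sizes)
    # Ceil-divide each dimension by its block size; a block size of 1 keeps the dim.
    return [-(-dim // block) for dim, block in zip(tensor_shape, block_sizes)]

# A [2, 24, 4608, 128] attention input with block_sizes [1, 1, 32, 1]
# would carry a [2, 24, 144, 128] scale tensor.
print(block_scale_shape([2, 24, 4608, 128], [1, 1, 32, 1]))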

modelopt/torch/quantization/nn/modules/tensor_quantizer.py

Lines changed: 6 additions & 6 deletions
@@ -650,27 +650,27 @@ def _real_quantize(self, inputs):
 
     def _get_block_sizes_list(self, shape):
         """Convert block_sizes dict to list format based on tensor shape.
-
+
         Args:
             shape: The tensor shape to use for conversion (can be tuple or torch.Size)
-
+
         Returns:
             List of block sizes for each dimension, or None if block_sizes is None
-
+
         Example:
             block_sizes = {-2: 32} with shape [2, 24, 4608, 128] -> [1, 1, 32, 1]
         """
         if self.block_sizes is None:
             return None
-
+
         block_sizes_list = []
         for dim in range(len(shape)):
             # Check both positive and negative dimension indices
             dim_negative = dim - len(shape)
             block_size = self.block_sizes.get(dim, None) or self.block_sizes.get(dim_negative, None)
             block_sizes_list.append(block_size if block_size is not None else 1)
         return block_sizes_list
-
+
     def _fake_quantize(self, inputs):
         """Fake quantization."""
         amax = None

@@ -956,7 +956,7 @@ def forward(self, inputs):
             and self.block_sizes.get("type", None) != "dynamic"
             and self._fake_quant
         ):
-            # Reshape is required if the logic isn’t handled in the simulation kernel
+            # Reshape is required if the logic isn't handled in the simulation kernel
            self._setup_for_blockquant(inputs)
            setattr(self, "_original_input_shape", inputs.shape)
            inputs = self._process_for_blockquant(inputs)
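The new _get_block_sizes_list helper is self-contained enough to restate outside the class; a standalone sketch (the free-function name is ours, not the module's) that reproduces the docstring example:

# Standalone restatement of TensorQuantizer._get_block_sizes_list (sketch only).
def get_block_sizes_list(block_sizes: dict | None, shape) -> list[int] | None:
    if block_sizes is None:
        return None
    block_sizes_list = []
    for dim in range(len(shape)):
        # A dimension may be keyed either positively (e.g. 2) or negatively (e.g. -2).
        dim_negative = dim - len(shape)
        block_size = block_sizes.get(dim) or block_sizes.get(dim_negative)
        block_sizes_list.append(block_size if block_size is not None else 1)
    return block_sizes_list

# Docstring example: {-2: 32} with shape [2, 24, 4608, 128] -> [1, 1, 32, 1]
print(get_block_sizes_list({-2: 32}, [2, 24, 4608, 128]))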

modelopt/torch/quantization/plugins/diffusers.py

Lines changed: 18 additions & 8 deletions
@@ -114,18 +114,26 @@ def _quantized_sdpa(self, *args, **kwargs):
     key = self.k_bmm_quantizer(key)
     value = self.v_bmm_quantizer(value)
 
-    if not self.q_bmm_quantizer._dynamic and not self.k_bmm_quantizer._dynamic and not self.v_bmm_quantizer._dynamic:
+    if (
+        not self.q_bmm_quantizer._dynamic
+        and not self.k_bmm_quantizer._dynamic
+        and not self.v_bmm_quantizer._dynamic
+    ):
         q_quantized_scale = self.q_bmm_quantizer._get_amax(query)
         k_quantized_scale = self.k_bmm_quantizer._get_amax(key)
         v_quantized_scale = self.v_bmm_quantizer._get_amax(value)
     else:
-        assert self.q_bmm_quantizer._dynamic and self.k_bmm_quantizer._dynamic and self.v_bmm_quantizer._dynamic, "QKV QDQS must be in the same type"
+        assert (
+            self.q_bmm_quantizer._dynamic
+            and self.k_bmm_quantizer._dynamic
+            and self.v_bmm_quantizer._dynamic
+        ), "QKV QDQS must be in the same type"
         q_quantized_scale, k_quantized_scale, v_quantized_scale = None, None, None
-
+
     # Get block sizes lists for each quantizer if needed
-    q_block_sizes = self.q_bmm_quantizer._get_block_sizes_list(query.shape)
-    k_block_sizes = self.k_bmm_quantizer._get_block_sizes_list(key.shape)
-    v_block_sizes = self.v_bmm_quantizer._get_block_sizes_list(value.shape)
+    q_block_sizes = self.q_bmm_quantizer._get_block_sizes_list(query.shape)  # type: ignore[union-attr]
+    k_block_sizes = self.k_bmm_quantizer._get_block_sizes_list(key.shape)  # type: ignore[union-attr]
+    v_block_sizes = self.v_bmm_quantizer._get_block_sizes_list(value.shape)  # type: ignore[union-attr]
 
     # We don't need to calibrate the output of softmax
     return self.bmm2_output_quantizer(

@@ -142,7 +150,7 @@ def _quantized_sdpa(self, *args, **kwargs):
             else "Half",
             self._disable_fp8_mha if hasattr(self, "_disable_fp8_mha") else True,
             q_block_sizes,
-            k_block_sizes,
+            k_block_sizes,
             v_block_sizes,
         )
     )

@@ -218,7 +226,9 @@ def forward(
     )
 
     @staticmethod
-    @symbolic_helper.parse_args("v", "v", "v", "v", "f", "b", "v", "t", "t", "t", "s", "b", "is", "is", "is")
+    @symbolic_helper.parse_args(
+        "v", "v", "v", "v", "f", "b", "v", "t", "t", "t", "s", "b", "is", "is", "is"
+    )
     def symbolic(
         g: jit_utils.GraphContext,
         query: torch._C.Value,
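The reformatted guard encodes an all-or-nothing rule: the Q, K and V BMM quantizers must either all be statically calibrated or all be dynamic. A minimal sketch of that rule in isolation (the function and argument names are ours, not the plugin's):

# Sketch only: static quantizers export their calibrated amax values;
# dynamic quantizers export no scales because they are computed at runtime.
def resolve_qkv_scales(q_quant, k_quant, v_quant, query, key, value):
    flags = (q_quant._dynamic, k_quant._dynamic, v_quant._dynamic)
    if not any(flags):
        # Static path: per-tensor amax values recorded during calibration.
        return (
            q_quant._get_amax(query),
            k_quant._get_amax(key),
            v_quant._get_amax(value),
        )
    assert all(flags), "Q, K and V quantizers must all be static or all dynamic"
    return None, None, None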
