Skip to content

Commit cf9843b

Browse files
committed
tmp fix
Signed-off-by: yiliu30 <yi4.liu@intel.com>
1 parent f950fb7 commit cf9843b

File tree

2 files changed

+71
-45
lines changed

2 files changed

+71
-45
lines changed

examples/quantization_w4a4_fp4/llama3_example.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ def tokenize(sample):
8585
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
8686
model.device
8787
)
88-
output = model.generate(input_ids, max_new_tokens=100)
88+
output = model.generate(input_ids, max_new_tokens=10)
8989

9090
print(tokenizer.decode(output[0]))
9191
print("==========================================\n\n")

src/llmcompressor/transformers/compression/quantization_format.py

Lines changed: 70 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ def _get_quant_compression_format(
2424
is_weight_only = weight_args is not None and input_args is None
2525

2626
if weight_args.num_bits == 4 and weight_args.type == QuantizationType.FLOAT.value:
27+
if weight_args.is_mx:
28+
return CompressionFormat.mxfp4_pack_quantized
2729
return CompressionFormat.nvfp4_pack_quantized
2830

2931
if is_weight_only: # w4a16 and w8a16
@@ -55,6 +57,30 @@ def _get_quant_compression_format(
5557
return CompressionFormat.naive_quantized
5658

5759

60+
def _get_unique_quant_args(model):
61+
"""
62+
Gets a list of all the unique quantization settings present in model
63+
"""
64+
from compressed_tensors.quantization.utils import (
65+
is_model_quantized,
66+
is_module_quantized,
67+
iter_named_leaf_modules,
68+
)
69+
quant_info_weight = []
70+
quant_info_inputs = []
71+
for _, submodule in iter_named_leaf_modules(model):
72+
if is_module_quantized(submodule):
73+
weight_scheme = submodule.quantization_scheme.weights
74+
input_scheme = submodule.quantization_scheme.input_activations
75+
if weight_scheme is not None:
76+
if weight_scheme not in quant_info_weight:
77+
quant_info_weight.append(weight_scheme)
78+
if input_scheme is not None:
79+
if input_scheme not in quant_info_inputs:
80+
quant_info_inputs.append(input_scheme)
81+
82+
return quant_info_weight, quant_info_inputs
83+
5884
def infer_and_set_per_module_quantization_format(
5985
model,
6086
quantization_format: Optional[str] = None,
@@ -79,50 +105,50 @@ def infer_and_set_per_module_quantization_format(
79105

80106
if not save_compressed:
81107
return None
82-
if save_compressed:
83-
weight_args, input_args = _get_unique_quant_args(model)
84-
is_24_structure = (
85-
SparsityStructure(sparsity_structure) == SparsityStructure.TWO_FOUR
86-
)
87-
is_weight_only = len(input_args) == 0 and len(weight_args) > 0
88-
if (
89-
weight_args[0].num_bits == 4
90-
and weight_args[0].type == QuantizationType.FLOAT.value
91-
):
92-
if weight_args[0].is_mx:
93-
return CompressionFormat.mxfp4_pack_quantized
94-
else:
95-
return CompressionFormat.nvfp4_pack_quantized
96-
97-
if is_weight_only: # w4a16 and w8a16
98-
is_valid_pack = all(
99-
weight_arg.num_bits in [4, 8]
100-
and weight_arg.type == QuantizationType.INT.value
101-
for weight_arg in weight_args
102-
)
103-
if not is_valid_pack: # packing only valid for int4 and int 8
104-
return CompressionFormat.naive_quantized
105-
if is_24_structure:
106-
for arg in weight_args:
107-
if (
108-
arg.strategy is not QuantizationStrategy.CHANNEL.value
109-
and arg.strategy is not QuantizationStrategy.GROUP.value
110-
):
111-
# marlin24 kernel only applicable for channel/group quantization
112-
return CompressionFormat.pack_quantized
113-
return CompressionFormat.marlin_24
114-
return CompressionFormat.pack_quantized
115-
else: # w8a8 float and int
116-
if len(weight_args) == 1:
117-
if (
118-
weight_args[0].type == QuantizationType.FLOAT.value
119-
and weight_args[0].num_bits == 8
120-
):
121-
return CompressionFormat.float_quantized
122-
if weight_args[0].type == QuantizationType.INT.value:
123-
return CompressionFormat.int_quantized
124-
125-
return CompressionFormat.naive_quantized
108+
# if save_compressed:
109+
# weight_args, input_args = _get_unique_quant_args(model)
110+
# is_24_structure = (
111+
# SparsityStructure(sparsity_structure) == SparsityStructure.TWO_FOUR
112+
# )
113+
# is_weight_only = len(input_args) == 0 and len(weight_args) > 0
114+
# if (
115+
# weight_args[0].num_bits == 4
116+
# and weight_args[0].type == QuantizationType.FLOAT.value
117+
# ):
118+
# if weight_args[0].is_mx:
119+
# return CompressionFormat.mxfp4_pack_quantized
120+
# else:
121+
# return CompressionFormat.nvfp4_pack_quantized
122+
123+
# if is_weight_only: # w4a16 and w8a16
124+
# is_valid_pack = all(
125+
# weight_arg.num_bits in [4, 8]
126+
# and weight_arg.type == QuantizationType.INT.value
127+
# for weight_arg in weight_args
128+
# )
129+
# if not is_valid_pack: # packing only valid for int4 and int 8
130+
# return CompressionFormat.naive_quantized
131+
# if is_24_structure:
132+
# for arg in weight_args:
133+
# if (
134+
# arg.strategy is not QuantizationStrategy.CHANNEL.value
135+
# and arg.strategy is not QuantizationStrategy.GROUP.value
136+
# ):
137+
# # marlin24 kernel only applicable for channel/group quantization
138+
# return CompressionFormat.pack_quantized
139+
# return CompressionFormat.marlin_24
140+
# return CompressionFormat.pack_quantized
141+
# else: # w8a8 float and int
142+
# if len(weight_args) == 1:
143+
# if (
144+
# weight_args[0].type == QuantizationType.FLOAT.value
145+
# and weight_args[0].num_bits == 8
146+
# ):
147+
# return CompressionFormat.float_quantized
148+
# if weight_args[0].type == QuantizationType.INT.value:
149+
# return CompressionFormat.int_quantized
150+
151+
# return CompressionFormat.naive_quantized
126152

127153

128154
if quantization_format:

0 commit comments

Comments (0)