
Commit 4d62165

Fix INT4 ONNX quantization issue for ONNX versions > 1.18
Signed-off-by: Hrishith Thadicherla <[email protected]>
1 parent 4df4091 commit 4d62165
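
For context: per this commit, ONNX 1.19 materializes 4-bit tensors as ml_dtypes.int4/uint4 arrays rather than int8/uint8-packed buffers, which CuPy cannot consume. A minimal sketch of the kind of version gate involved, assuming the packaging library is available; the helper name is illustrative, not the repository's actual _onnx_supports_int4 or skip_if_onnx_version_above_1_18:

# Illustrative version gate (assumption: `packaging` is installed);
# not the repository's actual helper.
import onnx
from packaging.version import Version

def onnx_returns_ml_dtypes_int4() -> bool:
    # Per this commit's docstring, ONNX 1.19+ hands back ml_dtypes.int4
    # arrays for 4-bit tensors instead of int8/uint8-packed data.
    return Version(onnx.__version__) >= Version("1.19")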

3 files changed: +34 −11 lines changed

modelopt/onnx/quantization/gs_patching.py

Lines changed: 5 additions & 3 deletions
@@ -69,9 +69,11 @@ def _export_tensor_proto(tensor: gs.Constant) -> onnx.TensorProto:

     vals = tensor.values
     if _onnx_supports_int4() and dtype in [onnx.TensorProto.INT4, onnx.TensorProto.UINT4]:
-        signed = dtype == onnx.TensorProto.INT4
-        np_dtype = onnx.helper.tensor_dtype_to_np_dtype(dtype)
-        vals = pack_float32_to_4bit_cpp_based(tensor.values, signed=signed).astype(np_dtype)
+        signed = dtype == onnx.TensorProto.INT4
+        if signed:
+            vals = pack_float32_to_4bit_cpp_based(tensor.values, signed=signed).astype(np.int8)
+        else:
+            vals = pack_float32_to_4bit_cpp_based(tensor.values, signed=signed).astype(np.uint8)

     onnx_tensor = onnx.helper.make_tensor(
         tensor.name,
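
Why the branch: on newer ONNX, onnx.helper.tensor_dtype_to_np_dtype maps INT4/UINT4 to ml_dtypes types rather than the int8/uint8 containers that make_tensor expects for packed nibbles, so the container dtype is now chosen explicitly. A minimal NumPy sketch of the packing itself, standing in for pack_float32_to_4bit_cpp_based (assumption: that helper packs two 4-bit values per byte, low nibble first):

# Minimal NumPy sketch of 4-bit packing: two nibbles per byte, low nibble
# first. A stand-in for pack_float32_to_4bit_cpp_based, not its actual code.
import numpy as np

def pack_int4_pairs(values: np.ndarray, signed: bool = True) -> np.ndarray:
    container = np.int8 if signed else np.uint8
    v = np.asarray(values).astype(np.int64).ravel()
    if v.size % 2:  # pad odd-length input with a zero nibble
        v = np.append(v, 0)
    lo, hi = v[0::2] & 0x0F, v[1::2] & 0x0F
    return ((hi << 4) | lo).astype(np.uint8).view(container)

print(pack_int4_pairs(np.array([1, -2, 7, -8])))  # two packed int8 bytes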

modelopt/onnx/quantization/int4.py

Lines changed: 22 additions & 0 deletions
@@ -98,6 +98,28 @@
 # supported and working
 CLIP_MIN = 1e-5

+def convert_ml_dtypes_int4_to_int8_format(tensor):
+    """Convert an ml_dtypes.int4 tensor to numpy.int8 for CuPy compatibility.
+
+    In ONNX 1.19, int4 tensors use ml_dtypes.int4, which CuPy does not support.
+    This function converts them to regular numpy.int8 while preserving values.
+
+    Args:
+        tensor: numpy array that may have ml_dtypes.int4 dtype.
+
+    Returns:
+        A cupy array (or numpy array if CuPy is unavailable) with numpy.int8
+        dtype if the input was ml_dtypes.int4, otherwise the values unchanged.
+    """
+    try:
+        import ml_dtypes
+
+        if hasattr(tensor, "dtype") and tensor.dtype == ml_dtypes.int4:
+            return np.asarray(tensor.astype(numpy.int8))
+    except ImportError:
+        pass
+
+    return np.asarray(tensor)
+

 def _quantize_gather_nodes(
     graph: onnx.GraphProto,
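
A quick usage sketch of the new helper (assumes ml_dtypes and modelopt are installed; the int4 values are arbitrary):

# Usage sketch for convert_ml_dtypes_int4_to_int8_format. Assumes ml_dtypes
# is installed; int4.np resolves to CuPy or NumPy inside the module.
import ml_dtypes
import numpy

from modelopt.onnx.quantization import int4

t = numpy.array([1, -2, 7, -8], dtype=ml_dtypes.int4)
out = int4.convert_ml_dtypes_int4_to_int8_format(t)
print(out.dtype)  # int8, now safe to hand to CuPy-based dequantization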

tests/gpu/onnx/test_quantize_onnx_torch_int4_awq.py

Lines changed: 7 additions & 8 deletions
@@ -40,7 +40,7 @@


 def test_int4_awq(tmp_path):
-    skip_if_onnx_version_above_1_18()
+    # skip_if_onnx_version_above_1_18()

     def _forward_loop(model, dataloader):
         """Forward loop for calibration."""

@@ -94,20 +94,19 @@ def _forward_loop(model, dataloader):
         scale_awq_lite = find_init(onnx_model_awq_lite, scale_names[i])

         if int4.has_cupy:
-            wq_onnx_awq_lite = np.array(wq_onnx_awq_lite)
-            scale_awq_lite = np.array(scale_awq_lite)
+            wq_onnx_awq_lite = int4.convert_ml_dtypes_int4_to_int8_format(wq_onnx_awq_lite)
+            scale_awq_lite = int4.convert_ml_dtypes_int4_to_int8_format(scale_awq_lite)

         wq_onnx_awq_lite = dq_tensor(wq_onnx_awq_lite, scale_awq_lite, block_size)
-
         wq_torch_awq_clip = model_torch_copy.net[i * 2].weight_quantizer(
             model_torch_copy.net[i * 2].weight
         )
         wq_onnx_awq_clip = find_init(onnx_model_awq_clip, wq_names[i])
         scale_awq_clip = find_init(onnx_model_awq_clip, scale_names[i])
-
+
         if int4.has_cupy:
-            wq_onnx_awq_clip = np.array(wq_onnx_awq_clip)
-            scale_awq_clip = np.array(scale_awq_clip)
+            wq_onnx_awq_clip = int4.convert_ml_dtypes_int4_to_int8_format(wq_onnx_awq_clip)
+            scale_awq_clip = int4.convert_ml_dtypes_int4_to_int8_format(scale_awq_clip)

         wq_onnx_awq_clip = dq_tensor(wq_onnx_awq_clip, scale_awq_clip, block_size)

@@ -116,7 +115,7 @@ def _forward_loop(model, dataloader):


 def test_int4_awq_cuda(tmp_path):
-    skip_if_onnx_version_above_1_18()
+    # skip_if_onnx_version_above_1_18()
     skip_if_no_libcudnn()
     block_size = 128