ONNX 1.19 compatibility fix for INT4 quantization (#423)

hthadicherla · web-flow · commit 7ccaa53198d8 · 2025-10-22T12:18:34.000+05:30
Signed-off-by: Hrishith Thadicherla <hthadicherla@nvidia.com>
diff --git a/modelopt/onnx/quantization/gs_patching.py b/modelopt/onnx/quantization/gs_patching.py
@@ -70,8 +70,8 @@ def _export_tensor_proto(tensor: gs.Constant) -> onnx.TensorProto:
         vals = tensor.values
         if _onnx_supports_int4() and dtype in [onnx.TensorProto.INT4, onnx.TensorProto.UINT4]:
             signed = dtype == onnx.TensorProto.INT4
-            np_dtype = onnx.helper.tensor_dtype_to_np_dtype(dtype)
-            vals = pack_float32_to_4bit_cpp_based(tensor.values, signed=signed).astype(np_dtype)
+            packed_dtype = np.int8 if signed else np.uint8
+            vals = pack_float32_to_4bit_cpp_based(tensor.values, signed=signed).astype(packed_dtype)
 
         onnx_tensor = onnx.helper.make_tensor(
             tensor.name,
diff --git a/modelopt/onnx/quantization/int4.py b/modelopt/onnx/quantization/int4.py
@@ -99,6 +99,29 @@
 CLIP_MIN = 1e-5
 
 
+def safe_cupy_array(tensor):
+    """Convert ml_dtypes.int4 tensor to numpy.int8 for CuPy compatibility.
+
+    In ONNX 1.19, int4 tensors use ml_dtypes.int4 which CuPy doesn't support.
+    This function converts them to regular numpy.int8 while preserving values.
+
+    Args:
+        tensor: numpy array that may have ml_dtypes.int4 dtype
+    Returns:
+        cupy array (or numpy array when cupy is unavailable): numpy.int8 dtype if input was ml_dtypes.int4,
+        otherwise the input's dtype is kept
+    """
+    try:
+        import ml_dtypes
+
+        if hasattr(tensor, "dtype") and tensor.dtype == ml_dtypes.int4:
+            return np.asarray(tensor.astype(numpy.int8))
+    except ImportError:
+        pass
+
+    return np.asarray(tensor)
+
+
 def _quantize_gather_nodes(
     graph: onnx.GraphProto,
     nodes_to_exclude: list[str],
@@ -271,19 +294,26 @@ def quantize_rtn(
                 scales[name] = np.asnumpy(scales[name])
             gemm_weights_quantized[name] = numpy.asarray(qw)
         scales = reshape_scales_for_per_channel_nodes(scales, block_size, precision_info)
+        dq_node_attributes = {"axis": 0, "block_size": block_size}
         qdq.insert_dq_nodes(
             graph,
             scales,
             quantized_weights=gemm_weights_quantized,
+            attributes=dq_node_attributes,
             precision_info=precision_info,
         )
 
         if gather_w_map is not None:
             assert gather_s_map is not None, "scale-map not found for quantizable gather nodes"
+            gather_dq_node_attributes = {
+                "axis": gather_quantize_axis,
+                "block_size": gather_block_size,
+            }
             qdq.insert_dq_nodes(
                 graph,
                 gather_s_map,
                 quantized_weights=gather_w_map,
+                attributes=gather_dq_node_attributes,
                 precision_info=precision_info,
             )
     else:
@@ -299,7 +329,10 @@ def quantize_rtn(
             )
 
     logger.info(f"RTN quantization completed in {time.time() - t_start:.2f} seconds")
-    return gs.export_onnx(graph)
+    model = gs.export_onnx(graph)
+    model.ir_version = 10
+
+    return model
 
 
 class AWQClipHelper:
diff --git a/tests/gpu/onnx/test_quantize_onnx_torch_int4_awq.py b/tests/gpu/onnx/test_quantize_onnx_torch_int4_awq.py
@@ -20,7 +20,7 @@
 from functools import partial
 
 import torch
-from _test_utils.import_helper import skip_if_no_libcudnn, skip_if_onnx_version_above_1_18
+from _test_utils.import_helper import skip_if_no_libcudnn
 from _test_utils.onnx_quantization.lib_test_models import SimpleMLP, export_as_onnx, find_init
 from _test_utils.torch_quantization.quantize_common import get_awq_config
 
@@ -39,9 +39,45 @@
 #       test_qdq_utils_fp8.py::test_fused_q[bf16,fp16] fails if this script runs after the int4 test, but not before.
 
 
-def test_int4_awq(tmp_path):
-    skip_if_onnx_version_above_1_18()
+def test_safe_cupy_array(monkeypatch):
+    """Comprehensive test for safe_cupy_array covering all code paths."""
+    import builtins
+
+    import numpy  # Import actual numpy for creating int4 tensors
+
+    # Test 1: Regular numpy array (should fall through to the final np.asarray return)
+    result = int4.safe_cupy_array(numpy.array([1, 2, 3, 4], dtype=numpy.float32))
+    assert isinstance(result, np.ndarray)
+
+    # Test 2: With real ml_dtypes.int4 (covers the int4 -> numpy.int8 conversion branch)
+    try:
+        import ml_dtypes
+
+        int4_tensor = numpy.array([1, 2, -3, 4], dtype=numpy.float32).astype(ml_dtypes.int4)
+        result = int4.safe_cupy_array(int4_tensor)
+        assert isinstance(result, np.ndarray) and result.dtype == numpy.int8
+        expected = int4_tensor.astype(numpy.int8)
+        actual = result.get() if int4.has_cupy else result
+        np.testing.assert_array_equal(actual, expected)
+    except ImportError:
+        pass  # ml_dtypes not available
+
+    # Test 3: When ml_dtypes import fails (covers the ImportError catch and the fallback return)
+    original_import = builtins.__import__
 
+    def mock_import(name, *args, **kwargs):
+        if name == "ml_dtypes":
+            raise ImportError("ml_dtypes not available")
+        return original_import(name, *args, **kwargs)
+
+    monkeypatch.setattr(builtins, "__import__", mock_import)
+
+    # Use actual numpy for creating the array
+    result = int4.safe_cupy_array(numpy.array([5, 6, 7, 8], dtype=numpy.int8))
+    assert isinstance(result, np.ndarray)
+
+
+def test_int4_awq(tmp_path):
     def _forward_loop(model, dataloader):
         """Forward loop for calibration."""
         for data in dataloader:
@@ -94,20 +130,19 @@ def _forward_loop(model, dataloader):
         scale_awq_lite = find_init(onnx_model_awq_lite, scale_names[i])
 
         if int4.has_cupy:
-            wq_onnx_awq_lite = np.array(wq_onnx_awq_lite)
-            scale_awq_lite = np.array(scale_awq_lite)
+            wq_onnx_awq_lite = int4.safe_cupy_array(wq_onnx_awq_lite)
+            scale_awq_lite = int4.safe_cupy_array(scale_awq_lite)
 
         wq_onnx_awq_lite = dq_tensor(wq_onnx_awq_lite, scale_awq_lite, block_size)
-
         wq_torch_awq_clip = model_torch_copy.net[i * 2].weight_quantizer(
             model_torch_copy.net[i * 2].weight
         )
         wq_onnx_awq_clip = find_init(onnx_model_awq_clip, wq_names[i])
         scale_awq_clip = find_init(onnx_model_awq_clip, scale_names[i])
 
         if int4.has_cupy:
-            wq_onnx_awq_clip = np.array(wq_onnx_awq_clip)
-            scale_awq_clip = np.array(scale_awq_clip)
+            wq_onnx_awq_clip = int4.safe_cupy_array(wq_onnx_awq_clip)
+            scale_awq_clip = int4.safe_cupy_array(scale_awq_clip)
 
         wq_onnx_awq_clip = dq_tensor(wq_onnx_awq_clip, scale_awq_clip, block_size)