Merged
Changes from 9 commits
4 changes: 2 additions & 2 deletions modelopt/onnx/quantization/gs_patching.py
@@ -70,8 +70,8 @@ def _export_tensor_proto(tensor: gs.Constant) -> onnx.TensorProto:
    vals = tensor.values
    if _onnx_supports_int4() and dtype in [onnx.TensorProto.INT4, onnx.TensorProto.UINT4]:
        signed = dtype == onnx.TensorProto.INT4
-       np_dtype = onnx.helper.tensor_dtype_to_np_dtype(dtype)
-       vals = pack_float32_to_4bit_cpp_based(tensor.values, signed=signed).astype(np_dtype)
+       packed_dtype = np.int8 if signed else np.uint8
+       vals = pack_float32_to_4bit_cpp_based(tensor.values, signed=signed).astype(packed_dtype)

    onnx_tensor = onnx.helper.make_tensor(
        tensor.name,
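For context, a brief sketch of what the packed-dtype swap avoids, assuming (per the safe_cupy_array docstring added below) that ONNX 1.19 maps INT4 tensors to ml_dtypes.int4:

import numpy as np
import onnx

# On onnx >= 1.19 this is assumed to return ml_dtypes.int4 rather than a plain
# integer dtype; casting the packed nibble buffer to it would re-convert byte
# values instead of preserving the two-nibbles-per-byte layout.
int4_np_dtype = onnx.helper.tensor_dtype_to_np_dtype(onnx.TensorProto.INT4)

# The replacement pins the container to a dtype every backend understands:
packed_dtype = np.int8  # np.uint8 for the UINT4 case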
35 changes: 34 additions & 1 deletion modelopt/onnx/quantization/int4.py
@@ -99,6 +99,29 @@
CLIP_MIN = 1e-5


def safe_cupy_array(tensor):
    """Convert an ml_dtypes.int4 tensor to numpy.int8 for CuPy compatibility.

    In ONNX 1.19, int4 tensors use ml_dtypes.int4, which CuPy does not support.
    This function converts them to regular numpy.int8 while preserving values.

    Args:
        tensor: numpy array that may have ml_dtypes.int4 dtype.

    Returns:
        A cupy array (or numpy array if cupy is unavailable) with numpy.int8
        dtype if the input was ml_dtypes.int4, otherwise with dtype unchanged.
    """
    try:
        import ml_dtypes

        if hasattr(tensor, "dtype") and tensor.dtype == ml_dtypes.int4:
            return np.asarray(tensor.astype(numpy.int8))
    except ImportError:
        pass

    return np.asarray(tensor)

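For illustration, a minimal usage sketch of the helper above (assumes ml_dtypes is installed):

import ml_dtypes
import numpy

x = numpy.arange(4, dtype=numpy.float32).astype(ml_dtypes.int4)
y = safe_cupy_array(x)        # cupy array on GPU setups, numpy otherwise
assert y.dtype == numpy.int8  # int4 payload widened to int8, values preserved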
Comment on lines +102 to +123
🛠️ Refactor suggestion | 🟠 Major

🧩 Analysis chain

Well-implemented compatibility helper for ml_dtypes.int4.

The safe_cupy_array function correctly handles the conversion of ml_dtypes.int4 tensors to numpy.int8 for CuPy compatibility. The dynamic import of ml_dtypes and conditional casting ensure backward compatibility with ONNX versions that don't use ml_dtypes.int4.

However, verify that all weight-loading scenarios are covered. The previous review comment flagged that weights loaded via numpy_helper.to_array (e.g., lines 531, 557, 750, 842, 931, 1162, 1249) followed by np.asarray might fail with CuPy if those weights are ml_dtypes.int4. While the typical quantization flow loads float32 weights, confirm there are no edge cases where already-quantized INT4 weights are loaded and processed within this module.

Run the following script to identify all weight-loading patterns:


🏁 Script executed:

#!/bin/bash
# Description: Find all numpy_helper.to_array usage followed by np.asarray in int4.py

rg -n -A 5 'numpy_helper\.to_array' modelopt/onnx/quantization/int4.py | \
  rg -B 2 -A 2 'np\.asarray'

Length of output: 1093


Enforce safe_cupy_array on all weight loads
Wrap each raw np.asarray call that follows numpy_helper.to_array (lines 536, 562, 755, 847, 936, 1167, and 1253) with safe_cupy_array to convert any ml_dtypes.int4 payloads to numpy.int8 and ensure CuPy compatibility.
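For instance, a hedged sketch of the suggested pattern (variable names are illustrative, not taken from the module):

# Before: the raw conversion can hand CuPy an ml_dtypes.int4 array and fail.
#     weight = np.asarray(numpy_helper.to_array(initializer))
# After: route through the helper so int4 payloads are widened first.
weight = safe_cupy_array(numpy_helper.to_array(initializer))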


def _quantize_gather_nodes(
    graph: onnx.GraphProto,
    nodes_to_exclude: list[str],
@@ -271,19 +294,26 @@ def quantize_rtn(
        scales[name] = np.asnumpy(scales[name])
        gemm_weights_quantized[name] = numpy.asarray(qw)
    scales = reshape_scales_for_per_channel_nodes(scales, block_size, precision_info)
    dq_node_attributes = {"axis": 0, "block_size": block_size}
    qdq.insert_dq_nodes(
        graph,
        scales,
        quantized_weights=gemm_weights_quantized,
        attributes=dq_node_attributes,
        precision_info=precision_info,
    )

    if gather_w_map is not None:
        assert gather_s_map is not None, "scale-map not found for quantizable gather nodes"
        gather_dq_node_attributes = {
            "axis": gather_quantize_axis,
            "block_size": gather_block_size,
        }
        qdq.insert_dq_nodes(
            graph,
            gather_s_map,
            quantized_weights=gather_w_map,
            attributes=gather_dq_node_attributes,
            precision_info=precision_info,
        )
    else:
@@ -299,7 +329,10 @@
    )

    logger.info(f"RTN quantization completed in {time.time() - t_start:.2f} seconds")
-   return gs.export_onnx(graph)
+   model = gs.export_onnx(graph)
+   model.ir_version = 10
+
+   return model


class AWQClipHelper:
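One plausible reading of the new ir_version pin above (an assumption; the diff does not state the motivation): recent onnx releases stamp exported models with an IR version newer than some deployed ONNX Runtime builds accept, so the export is clamped to IR 10. A hypothetical standalone equivalent:

import onnx

model = onnx.load("quantized.onnx")  # hypothetical model path
print(model.ir_version)              # may be 11+ when exported with a recent onnx
model.ir_version = 10                # mirror the diff's pin for older runtimes
onnx.save(model, "quantized_ir10.onnx")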
52 changes: 43 additions & 9 deletions tests/gpu/onnx/test_quantize_onnx_torch_int4_awq.py
@@ -20,7 +20,7 @@
from functools import partial

import torch
-from _test_utils.import_helper import skip_if_no_libcudnn, skip_if_onnx_version_above_1_18
+from _test_utils.import_helper import skip_if_no_libcudnn
from _test_utils.onnx_quantization.lib_test_models import SimpleMLP, export_as_onnx, find_init
from _test_utils.torch_quantization.quantize_common import get_awq_config

@@ -39,9 +39,45 @@
# test_qdq_utils_fp8.py::test_fused_q[bf16,fp16] fails if this script runs after the int4 test, but not before.


-def test_int4_awq(tmp_path):
-    skip_if_onnx_version_above_1_18()
+def test_safe_cupy_array(monkeypatch):
    """Comprehensive test for safe_cupy_array covering all code paths."""
    import builtins

    import numpy  # Import actual numpy for creating int4 tensors

    # Test 1: Regular numpy array (should hit line 122)
    result = int4.safe_cupy_array(numpy.array([1, 2, 3, 4], dtype=numpy.float32))
    assert isinstance(result, np.ndarray)

    # Test 2: With real ml_dtypes.int4 (covers lines 117-118)
    try:
        import ml_dtypes

        int4_tensor = numpy.array([1, 2, -3, 4], dtype=numpy.float32).astype(ml_dtypes.int4)
        result = int4.safe_cupy_array(int4_tensor)
        assert isinstance(result, np.ndarray) and result.dtype == numpy.int8
        expected = int4_tensor.astype(numpy.int8)
        actual = result.get() if int4.has_cupy else result
        np.testing.assert_array_equal(actual, expected)
    except ImportError:
Comment on lines +58 to +62
⚠️ Potential issue | 🟠 Major

Use numpy.testing here to avoid cupy alias mismatch.

Under cupy, np is cupy, so np.testing.assert_array_equal may not accept NumPy arrays (actual/expected). Call numpy’s testing explicitly.

-        np.testing.assert_array_equal(actual, expected)
+        numpy.testing.assert_array_equal(actual, expected)
📝 Committable suggestion


Suggested change
        assert isinstance(result, np.ndarray) and result.dtype == numpy.int8
        expected = int4_tensor.astype(numpy.int8)
        actual = result.get() if int4.has_cupy else result
-       np.testing.assert_array_equal(actual, expected)
+       numpy.testing.assert_array_equal(actual, expected)
    except ImportError:
🤖 Prompt for AI Agents
In tests/gpu/onnx/test_quantize_onnx_torch_int4_awq.py around lines 58 to 62,
the test calls np.testing.assert_array_equal which under CuPy can resolve to
cupy.testing and fail when comparing NumPy arrays; replace that call with
numpy.testing.assert_array_equal so the NumPy testing function is used
explicitly (ensure the existing numpy import is used), i.e. change the assertion
to call numpy.testing.assert_array_equal(actual, expected).

        pass  # ml_dtypes not available

    # Test 3: When ml_dtypes import fails (covers ImportError catch and line 122)
    original_import = builtins.__import__

    def mock_import(name, *args, **kwargs):
        if name == "ml_dtypes":
            raise ImportError("ml_dtypes not available")
        return original_import(name, *args, **kwargs)

    monkeypatch.setattr(builtins, "__import__", mock_import)

    # Use actual numpy for creating the array
    result = int4.safe_cupy_array(numpy.array([5, 6, 7, 8], dtype=numpy.int8))
    assert isinstance(result, np.ndarray)


def test_int4_awq(tmp_path):
    def _forward_loop(model, dataloader):
        """Forward loop for calibration."""
        for data in dataloader:
@@ -94,20 +130,19 @@ def _forward_loop(model, dataloader):
        scale_awq_lite = find_init(onnx_model_awq_lite, scale_names[i])

        if int4.has_cupy:
-           wq_onnx_awq_lite = np.array(wq_onnx_awq_lite)
-           scale_awq_lite = np.array(scale_awq_lite)
+           wq_onnx_awq_lite = int4.safe_cupy_array(wq_onnx_awq_lite)
+           scale_awq_lite = int4.safe_cupy_array(scale_awq_lite)

        wq_onnx_awq_lite = dq_tensor(wq_onnx_awq_lite, scale_awq_lite, block_size)

        wq_torch_awq_clip = model_torch_copy.net[i * 2].weight_quantizer(
            model_torch_copy.net[i * 2].weight
        )
        wq_onnx_awq_clip = find_init(onnx_model_awq_clip, wq_names[i])
        scale_awq_clip = find_init(onnx_model_awq_clip, scale_names[i])

        if int4.has_cupy:
-           wq_onnx_awq_clip = np.array(wq_onnx_awq_clip)
-           scale_awq_clip = np.array(scale_awq_clip)
+           wq_onnx_awq_clip = int4.safe_cupy_array(wq_onnx_awq_clip)
+           scale_awq_clip = int4.safe_cupy_array(scale_awq_clip)

        wq_onnx_awq_clip = dq_tensor(wq_onnx_awq_clip, scale_awq_clip, block_size)
@@ -116,7 +151,6 @@


def test_int4_awq_cuda(tmp_path):
-   skip_if_onnx_version_above_1_18()
    skip_if_no_libcudnn()
    block_size = 128
