## Commit d373cf9: Change IO dtype for INT4 CUDA models (microsoft#1629)
### Description

This PR allows a user to set the IO dtype (i.e. the input/output dtype) for an INT4 CUDA ONNX model to bfloat16 precision instead of float16 precision. To enable it, pass `-p/--precision int4` and `-e/--execution_provider cuda`, and then set `--extra_options use_cuda_bf16=true/True/1`.

### Motivation and Context

Models lose accuracy when their weights are converted from their native bfloat16 precision to float16 precision. With the [recent support](microsoft/onnxruntime#25161) for bfloat16 precision in `MatMulNBits`, this conversion is no longer always needed.
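For illustration, a minimal sketch of a matching builder invocation driven from Python. The model id and output directory are placeholders, and the `-m`/`-o` flags are assumptions about the builder's standard interface; `-p`, `-e`, and `--extra_options` are the flags named above:

```python
import subprocess

# Hypothetical invocation of the model builder with the new option.
# "-m" (model id) and "-o" (output dir) are assumed builder flags;
# the precision/EP/extra_options values come from the commit description.
subprocess.run(
    [
        "python", "-m", "onnxruntime_genai.models.builder",
        "-m", "some-org/some-bf16-model",   # placeholder HF model id
        "-o", "./int4-cuda-bf16-model",     # placeholder output directory
        "-p", "int4",
        "-e", "cuda",
        "--extra_options", "use_cuda_bf16=true",
    ],
    check=True,
)
```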
1 parent 95d7a33 commit d373cf9

1 file changed — src/python/py/models/builder.py: 11 additions & 6 deletions
```diff
@@ -2417,8 +2417,7 @@ def make_relu(self, layer_id, root_input, activation):
 
     def make_relu_squared(self, layer_id, root_input, activation):
         relu_name = self.make_relu(layer_id, root_input, "Relu")
-        basename = f"/model/layers.{layer_id}/mlp/square/{activation}"
-        pow_name = f"{basename}/pow"
+        pow_name = f"/model/layers.{layer_id}/mlp/act_fn/Pow"
         pow_inputs = [f"{relu_name}/output_0", "/model/constants/INT32/[2]"]
         self.make_node("Pow", inputs=pow_inputs, outputs=[f"{pow_name}/output_0"], name=pow_name, domain="")
         self.make_value(f"{pow_name}/output_0", self.io_dtype, shape=['batch_size', 'sequence_length', self.intermediate_size])
```
```diff
@@ -3645,7 +3644,7 @@ def check_extra_options(kv_pairs):
     """
     Check key-value pairs and set values correctly
     """
-    bools = ["int4_is_symmetric", "exclude_embeds", "exclude_lm_head", "include_hidden_states", "enable_cuda_graph", "use_8bits_moe", "use_qdq", "use_webgpu_fp32"]
+    bools = ["int4_is_symmetric", "exclude_embeds", "exclude_lm_head", "include_hidden_states", "enable_cuda_graph", "use_8bits_moe", "use_qdq", "use_webgpu_fp32", "use_cuda_bf16"]
     for key in bools:
         if key in kv_pairs:
             if kv_pairs[key] in {"false", "False", "0"}:
```
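Since extra options arrive as strings from the command line, the check above normalizes them into booleans. A minimal standalone sketch of that normalization, assuming values outside the falsy set map to True (the falsy set comes from the hunk above; the truthy branch is an assumption):

```python
def normalize_bool_options(kv_pairs: dict, bools: list) -> dict:
    # Assumed behavior: "false"/"False"/"0" -> False (per the diff above),
    # any other string value -> True.
    for key in bools:
        if key in kv_pairs:
            kv_pairs[key] = kv_pairs[key] not in {"false", "False", "0"}
    return kv_pairs

# "use_cuda_bf16=1" on the command line becomes a real boolean here.
opts = normalize_bool_options({"use_cuda_bf16": "1"}, ["use_cuda_bf16"])
assert opts["use_cuda_bf16"] is True
```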
```diff
@@ -3710,11 +3709,15 @@ def parse_hf_token(hf_token):
 
 
 def set_io_dtype(precision, execution_provider, extra_options) -> ir.DataType:
-    if precision in {"int8", "fp32"} or (precision == "int4" and execution_provider == "cpu") or extra_options.get("use_webgpu_fp32", False):
+    int4_cpu = precision == "int4" and execution_provider == "cpu"
+    fp32_webgpu = execution_provider == "webgpu" and extra_options.get("use_webgpu_fp32", False)
+    bf16_cuda = precision == "int4" and execution_provider == "cuda" and extra_options.get("use_cuda_bf16", False)
+
+    if precision in {"int8", "fp32"} or int4_cpu or fp32_webgpu:
         # FP32 precision
         return ir.DataType.FLOAT
 
-    if precision == "bf16":
+    if precision == "bf16" or bf16_cuda:
         # BF16 precision
         return ir.DataType.BFLOAT16
 
```
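To make the new branch concrete, a self-contained sketch of the selection logic with a stub enum standing in for `ir.DataType` (the FLOAT16 fallback for the remaining cases is an assumption; the diff does not show the tail of the function):

```python
from enum import Enum

class DataType(Enum):
    # Stand-in for ir.DataType; values mirror ONNX element types.
    FLOAT = 1
    FLOAT16 = 10
    BFLOAT16 = 16

def set_io_dtype(precision, execution_provider, extra_options) -> DataType:
    int4_cpu = precision == "int4" and execution_provider == "cpu"
    fp32_webgpu = execution_provider == "webgpu" and extra_options.get("use_webgpu_fp32", False)
    bf16_cuda = precision == "int4" and execution_provider == "cuda" and extra_options.get("use_cuda_bf16", False)

    if precision in {"int8", "fp32"} or int4_cpu or fp32_webgpu:
        return DataType.FLOAT
    if precision == "bf16" or bf16_cuda:
        return DataType.BFLOAT16
    return DataType.FLOAT16  # assumed fallback for the remaining fp16/int4 GPU cases

# With the new flag, an INT4 CUDA model gets BF16 inputs/outputs...
assert set_io_dtype("int4", "cuda", {"use_cuda_bf16": True}) is DataType.BFLOAT16
# ...while the old FP16 behavior is unchanged without it.
assert set_io_dtype("int4", "cuda", {}) is DataType.FLOAT16
```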
```diff
@@ -3951,8 +3954,10 @@ def get_args():
                 If true, the QMoE op will use 8-bit quantization. If false, the QMoE op will use 4-bit quantization.
                 use_qdq = Use the QDQ decomposition for ops.
                 Use this option when you want to use quantize-dequantize ops. For example, you will have a quantized MatMul op instead of the MatMulNBits op.
-                use_webgpu_fp32 = Use FP32 for WebGPU EP.
+                use_webgpu_fp32 = Use FP32 I/O precision for WebGPU EP.
                 Use this option to enable GPUs that do not support FP16 on WebGPU (e.g. GTX 10xx).
+                use_cuda_bf16 = Use BF16 I/O precision in quantized ONNX models for CUDA EP.
+                Use this option to create quantized ONNX models that use BF16 precision.
                 adapter_path = Path to folder on disk containing the adapter files (adapter_config.json and adapter model weights).
                 Use this option for LoRA models.
             """),
```
