Skip to content

Conversation

@Prince781
Copy link
Contributor

Test how these intrinsics are handled by the NVPTX backend. Currently, these intrinsics are lowered to sequential reductions by the ExpandReductions pass.

Test how these intrinsics are handled by the NVPTX backend. Currently,
these intrinsics are lowered to sequential reductions by the
ExpandReductions pass.
@llvmbot
Copy link
Member

llvmbot commented Apr 18, 2025

@llvm/pr-subscribers-backend-nvptx

Author: Princeton Ferro (Prince781)

Changes

Test how these intrinsics are handled by the NVPTX backend. Currently, these intrinsics are lowered to sequential reductions by the ExpandReductions pass.


Patch is 79.90 KiB (truncated to 20.00 KiB below); full version: https://github.com/llvm/llvm-project/pull/136381.diff

1 file affected:

  • (added) llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll (+1958)
diff --git a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll
new file mode 100644
index 0000000000000..2a12e9b364a54
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll
@@ -0,0 +1,1958 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mcpu=sm_80 -mattr=+ptx70 -O0 \
+; RUN:      -disable-post-ra -verify-machineinstrs \
+; RUN: | FileCheck -check-prefixes CHECK,CHECK-SM80 %s
+; RUN: %if ptxas-12.8 %{ llc < %s -mcpu=sm_80 -mattr=+ptx70 -O0 \
+; RUN:      -disable-post-ra -verify-machineinstrs \
+; RUN: | %ptxas-verify -arch=sm_80 %}
+; RUN: llc < %s -mcpu=sm_100 -mattr=+ptx87 -O0 \
+; RUN:      -disable-post-ra -verify-machineinstrs \
+; RUN: | FileCheck -check-prefixes CHECK,CHECK-SM100 %s
+; RUN: %if ptxas-12.8 %{ llc < %s -mcpu=sm_100 -mattr=+ptx87 -O0 \
+; RUN:      -disable-post-ra -verify-machineinstrs \
+; RUN: | %ptxas-verify -arch=sm_100 %}
+target triple = "nvptx64-nvidia-cuda"
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+; Check straight line reduction.
+define half @reduce_fadd_half(<8 x half> %in) {
+; CHECK-LABEL: reduce_fadd_half(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<18>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT:    mov.b16 %rs3, 0x0000;
+; CHECK-NEXT:    add.rn.f16 %rs4, %rs1, %rs3;
+; CHECK-NEXT:    add.rn.f16 %rs5, %rs4, %rs2;
+; CHECK-NEXT:    mov.b32 {%rs6, %rs7}, %r2;
+; CHECK-NEXT:    add.rn.f16 %rs8, %rs5, %rs6;
+; CHECK-NEXT:    add.rn.f16 %rs9, %rs8, %rs7;
+; CHECK-NEXT:    mov.b32 {%rs10, %rs11}, %r3;
+; CHECK-NEXT:    add.rn.f16 %rs12, %rs9, %rs10;
+; CHECK-NEXT:    add.rn.f16 %rs13, %rs12, %rs11;
+; CHECK-NEXT:    mov.b32 {%rs14, %rs15}, %r4;
+; CHECK-NEXT:    add.rn.f16 %rs16, %rs13, %rs14;
+; CHECK-NEXT:    add.rn.f16 %rs17, %rs16, %rs15;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs17;
+; CHECK-NEXT:    ret;
+  %res = call half @llvm.vector.reduce.fadd(half 0.0, <8 x half> %in)
+  ret half %res
+}
+
+define half @reduce_fadd_half_reassoc(<8 x half> %in) {
+; CHECK-SM80-LABEL: reduce_fadd_half_reassoc(
+; CHECK-SM80:       {
+; CHECK-SM80-NEXT:    .reg .b16 %rs<6>;
+; CHECK-SM80-NEXT:    .reg .b32 %r<10>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT:  // %bb.0:
+; CHECK-SM80-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_reassoc_param_0];
+; CHECK-SM80-NEXT:    add.rn.f16x2 %r5, %r2, %r4;
+; CHECK-SM80-NEXT:    add.rn.f16x2 %r6, %r1, %r3;
+; CHECK-SM80-NEXT:    add.rn.f16x2 %r7, %r6, %r5;
+; CHECK-SM80-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; }
+; CHECK-SM80-NEXT:    // implicit-def: %rs2
+; CHECK-SM80-NEXT:    mov.b32 %r8, {%rs1, %rs2};
+; CHECK-SM80-NEXT:    add.rn.f16x2 %r9, %r7, %r8;
+; CHECK-SM80-NEXT:    { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; }
+; CHECK-SM80-NEXT:    mov.b16 %rs4, 0x0000;
+; CHECK-SM80-NEXT:    add.rn.f16 %rs5, %rs3, %rs4;
+; CHECK-SM80-NEXT:    st.param.b16 [func_retval0], %rs5;
+; CHECK-SM80-NEXT:    ret;
+;
+; CHECK-SM100-LABEL: reduce_fadd_half_reassoc(
+; CHECK-SM100:       {
+; CHECK-SM100-NEXT:    .reg .b16 %rs<6>;
+; CHECK-SM100-NEXT:    .reg .b32 %r<10>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT:  // %bb.0:
+; CHECK-SM100-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_reassoc_param_0];
+; CHECK-SM100-NEXT:    add.rn.f16x2 %r5, %r2, %r4;
+; CHECK-SM100-NEXT:    add.rn.f16x2 %r6, %r1, %r3;
+; CHECK-SM100-NEXT:    add.rn.f16x2 %r7, %r6, %r5;
+; CHECK-SM100-NEXT:    mov.b32 {_, %rs1}, %r7;
+; CHECK-SM100-NEXT:    // implicit-def: %rs2
+; CHECK-SM100-NEXT:    mov.b32 %r8, {%rs1, %rs2};
+; CHECK-SM100-NEXT:    add.rn.f16x2 %r9, %r7, %r8;
+; CHECK-SM100-NEXT:    mov.b32 {%rs3, _}, %r9;
+; CHECK-SM100-NEXT:    mov.b16 %rs4, 0x0000;
+; CHECK-SM100-NEXT:    add.rn.f16 %rs5, %rs3, %rs4;
+; CHECK-SM100-NEXT:    st.param.b16 [func_retval0], %rs5;
+; CHECK-SM100-NEXT:    ret;
+  %res = call reassoc half @llvm.vector.reduce.fadd(half 0.0, <8 x half> %in)
+  ret half %res
+}
+
+define half @reduce_fadd_half_reassoc_nonpow2(<7 x half> %in) {
+; CHECK-LABEL: reduce_fadd_half_reassoc_nonpow2(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<16>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [reduce_fadd_half_reassoc_nonpow2_param_0+8];
+; CHECK-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-NEXT:    ld.param.b16 %rs7, [reduce_fadd_half_reassoc_nonpow2_param_0+12];
+; CHECK-NEXT:    ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_fadd_half_reassoc_nonpow2_param_0];
+; CHECK-NEXT:    mov.b16 %rs8, 0x0000;
+; CHECK-NEXT:    add.rn.f16 %rs9, %rs1, %rs8;
+; CHECK-NEXT:    add.rn.f16 %rs10, %rs9, %rs2;
+; CHECK-NEXT:    add.rn.f16 %rs11, %rs10, %rs3;
+; CHECK-NEXT:    add.rn.f16 %rs12, %rs11, %rs4;
+; CHECK-NEXT:    add.rn.f16 %rs13, %rs12, %rs5;
+; CHECK-NEXT:    add.rn.f16 %rs14, %rs13, %rs6;
+; CHECK-NEXT:    add.rn.f16 %rs15, %rs14, %rs7;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs15;
+; CHECK-NEXT:    ret;
+  %res = call half @llvm.vector.reduce.fadd(half 0.0, <7 x half> %in)
+  ret half %res
+}
+
+; Check straight-line reduction.
+define float @reduce_fadd_float(<8 x float> %in) {
+; CHECK-LABEL: reduce_fadd_float(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<17>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fadd_float_param_0+16];
+; CHECK-NEXT:    ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fadd_float_param_0];
+; CHECK-NEXT:    add.rn.f32 %f9, %f1, 0f00000000;
+; CHECK-NEXT:    add.rn.f32 %f10, %f9, %f2;
+; CHECK-NEXT:    add.rn.f32 %f11, %f10, %f3;
+; CHECK-NEXT:    add.rn.f32 %f12, %f11, %f4;
+; CHECK-NEXT:    add.rn.f32 %f13, %f12, %f5;
+; CHECK-NEXT:    add.rn.f32 %f14, %f13, %f6;
+; CHECK-NEXT:    add.rn.f32 %f15, %f14, %f7;
+; CHECK-NEXT:    add.rn.f32 %f16, %f15, %f8;
+; CHECK-NEXT:    st.param.f32 [func_retval0], %f16;
+; CHECK-NEXT:    ret;
+  %res = call float @llvm.vector.reduce.fadd(float 0.0, <8 x float> %in)
+  ret float %res
+}
+
+define float @reduce_fadd_float_reassoc(<8 x float> %in) {
+; CHECK-LABEL: reduce_fadd_float_reassoc(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<17>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fadd_float_reassoc_param_0+16];
+; CHECK-NEXT:    ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fadd_float_reassoc_param_0];
+; CHECK-NEXT:    add.rn.f32 %f9, %f3, %f7;
+; CHECK-NEXT:    add.rn.f32 %f10, %f1, %f5;
+; CHECK-NEXT:    add.rn.f32 %f11, %f4, %f8;
+; CHECK-NEXT:    add.rn.f32 %f12, %f2, %f6;
+; CHECK-NEXT:    add.rn.f32 %f13, %f12, %f11;
+; CHECK-NEXT:    add.rn.f32 %f14, %f10, %f9;
+; CHECK-NEXT:    add.rn.f32 %f15, %f14, %f13;
+; CHECK-NEXT:    add.rn.f32 %f16, %f15, 0f00000000;
+; CHECK-NEXT:    st.param.f32 [func_retval0], %f16;
+; CHECK-NEXT:    ret;
+  %res = call reassoc float @llvm.vector.reduce.fadd(float 0.0, <8 x float> %in)
+  ret float %res
+}
+
+define float @reduce_fadd_float_reassoc_nonpow2(<7 x float> %in) {
+; CHECK-LABEL: reduce_fadd_float_reassoc_nonpow2(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<15>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f7, [reduce_fadd_float_reassoc_nonpow2_param_0+24];
+; CHECK-NEXT:    ld.param.v2.f32 {%f5, %f6}, [reduce_fadd_float_reassoc_nonpow2_param_0+16];
+; CHECK-NEXT:    ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fadd_float_reassoc_nonpow2_param_0];
+; CHECK-NEXT:    add.rn.f32 %f8, %f3, %f7;
+; CHECK-NEXT:    add.rn.f32 %f9, %f1, %f5;
+; CHECK-NEXT:    add.rn.f32 %f10, %f9, %f8;
+; CHECK-NEXT:    add.rn.f32 %f11, %f2, %f6;
+; CHECK-NEXT:    add.rn.f32 %f12, %f11, %f4;
+; CHECK-NEXT:    add.rn.f32 %f13, %f10, %f12;
+; CHECK-NEXT:    add.rn.f32 %f14, %f13, 0f00000000;
+; CHECK-NEXT:    st.param.f32 [func_retval0], %f14;
+; CHECK-NEXT:    ret;
+  %res = call reassoc float @llvm.vector.reduce.fadd(float 0.0, <7 x float> %in)
+  ret float %res
+}
+
+; Check straight line reduction.
+define half @reduce_fmul_half(<8 x half> %in) {
+; CHECK-LABEL: reduce_fmul_half(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<16>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT:    mul.rn.f16 %rs3, %rs1, %rs2;
+; CHECK-NEXT:    mov.b32 {%rs4, %rs5}, %r2;
+; CHECK-NEXT:    mul.rn.f16 %rs6, %rs3, %rs4;
+; CHECK-NEXT:    mul.rn.f16 %rs7, %rs6, %rs5;
+; CHECK-NEXT:    mov.b32 {%rs8, %rs9}, %r3;
+; CHECK-NEXT:    mul.rn.f16 %rs10, %rs7, %rs8;
+; CHECK-NEXT:    mul.rn.f16 %rs11, %rs10, %rs9;
+; CHECK-NEXT:    mov.b32 {%rs12, %rs13}, %r4;
+; CHECK-NEXT:    mul.rn.f16 %rs14, %rs11, %rs12;
+; CHECK-NEXT:    mul.rn.f16 %rs15, %rs14, %rs13;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs15;
+; CHECK-NEXT:    ret;
+  %res = call half @llvm.vector.reduce.fmul(half 1.0, <8 x half> %in)
+  ret half %res
+}
+
+define half @reduce_fmul_half_reassoc(<8 x half> %in) {
+; CHECK-SM80-LABEL: reduce_fmul_half_reassoc(
+; CHECK-SM80:       {
+; CHECK-SM80-NEXT:    .reg .b16 %rs<4>;
+; CHECK-SM80-NEXT:    .reg .b32 %r<10>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT:  // %bb.0:
+; CHECK-SM80-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_reassoc_param_0];
+; CHECK-SM80-NEXT:    mul.rn.f16x2 %r5, %r2, %r4;
+; CHECK-SM80-NEXT:    mul.rn.f16x2 %r6, %r1, %r3;
+; CHECK-SM80-NEXT:    mul.rn.f16x2 %r7, %r6, %r5;
+; CHECK-SM80-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; }
+; CHECK-SM80-NEXT:    // implicit-def: %rs2
+; CHECK-SM80-NEXT:    mov.b32 %r8, {%rs1, %rs2};
+; CHECK-SM80-NEXT:    mul.rn.f16x2 %r9, %r7, %r8;
+; CHECK-SM80-NEXT:    { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; }
+; CHECK-SM80-NEXT:    st.param.b16 [func_retval0], %rs3;
+; CHECK-SM80-NEXT:    ret;
+;
+; CHECK-SM100-LABEL: reduce_fmul_half_reassoc(
+; CHECK-SM100:       {
+; CHECK-SM100-NEXT:    .reg .b16 %rs<4>;
+; CHECK-SM100-NEXT:    .reg .b32 %r<10>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT:  // %bb.0:
+; CHECK-SM100-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_reassoc_param_0];
+; CHECK-SM100-NEXT:    mul.rn.f16x2 %r5, %r2, %r4;
+; CHECK-SM100-NEXT:    mul.rn.f16x2 %r6, %r1, %r3;
+; CHECK-SM100-NEXT:    mul.rn.f16x2 %r7, %r6, %r5;
+; CHECK-SM100-NEXT:    mov.b32 {_, %rs1}, %r7;
+; CHECK-SM100-NEXT:    // implicit-def: %rs2
+; CHECK-SM100-NEXT:    mov.b32 %r8, {%rs1, %rs2};
+; CHECK-SM100-NEXT:    mul.rn.f16x2 %r9, %r7, %r8;
+; CHECK-SM100-NEXT:    mov.b32 {%rs3, _}, %r9;
+; CHECK-SM100-NEXT:    st.param.b16 [func_retval0], %rs3;
+; CHECK-SM100-NEXT:    ret;
+  %res = call reassoc half @llvm.vector.reduce.fmul(half 1.0, <8 x half> %in)
+  ret half %res
+}
+
+define half @reduce_fmul_half_reassoc_nonpow2(<7 x half> %in) {
+; CHECK-LABEL: reduce_fmul_half_reassoc_nonpow2(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<12>;
+; CHECK-NEXT:    .reg .b32 %r<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [reduce_fmul_half_reassoc_nonpow2_param_0+8];
+; CHECK-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-NEXT:    ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_fmul_half_reassoc_nonpow2_param_0];
+; CHECK-NEXT:    mov.b32 %r2, {%rs1, %rs2};
+; CHECK-NEXT:    mov.b32 %r3, {%rs3, %rs4};
+; CHECK-NEXT:    ld.param.b16 %rs7, [reduce_fmul_half_reassoc_nonpow2_param_0+12];
+; CHECK-NEXT:    mov.b16 %rs8, 0x3C00;
+; CHECK-NEXT:    mov.b32 %r4, {%rs7, %rs8};
+; CHECK-NEXT:    mul.rn.f16x2 %r5, %r3, %r4;
+; CHECK-NEXT:    mul.rn.f16x2 %r6, %r2, %r1;
+; CHECK-NEXT:    mul.rn.f16x2 %r7, %r6, %r5;
+; CHECK-NEXT:    mov.b32 {%rs9, %rs10}, %r7;
+; CHECK-NEXT:    mul.rn.f16 %rs11, %rs9, %rs10;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs11;
+; CHECK-NEXT:    ret;
+  %res = call reassoc half @llvm.vector.reduce.fmul(half 1.0, <7 x half> %in)
+  ret half %res
+}
+
+; Check straight-line reduction.
+define float @reduce_fmul_float(<8 x float> %in) {
+; CHECK-LABEL: reduce_fmul_float(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<16>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmul_float_param_0+16];
+; CHECK-NEXT:    ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmul_float_param_0];
+; CHECK-NEXT:    mul.rn.f32 %f9, %f1, %f2;
+; CHECK-NEXT:    mul.rn.f32 %f10, %f9, %f3;
+; CHECK-NEXT:    mul.rn.f32 %f11, %f10, %f4;
+; CHECK-NEXT:    mul.rn.f32 %f12, %f11, %f5;
+; CHECK-NEXT:    mul.rn.f32 %f13, %f12, %f6;
+; CHECK-NEXT:    mul.rn.f32 %f14, %f13, %f7;
+; CHECK-NEXT:    mul.rn.f32 %f15, %f14, %f8;
+; CHECK-NEXT:    st.param.f32 [func_retval0], %f15;
+; CHECK-NEXT:    ret;
+  %res = call float @llvm.vector.reduce.fmul(float 1.0, <8 x float> %in)
+  ret float %res
+}
+
+define float @reduce_fmul_float_reassoc(<8 x float> %in) {
+; CHECK-LABEL: reduce_fmul_float_reassoc(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<16>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmul_float_reassoc_param_0+16];
+; CHECK-NEXT:    ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmul_float_reassoc_param_0];
+; CHECK-NEXT:    mul.rn.f32 %f9, %f3, %f7;
+; CHECK-NEXT:    mul.rn.f32 %f10, %f1, %f5;
+; CHECK-NEXT:    mul.rn.f32 %f11, %f4, %f8;
+; CHECK-NEXT:    mul.rn.f32 %f12, %f2, %f6;
+; CHECK-NEXT:    mul.rn.f32 %f13, %f12, %f11;
+; CHECK-NEXT:    mul.rn.f32 %f14, %f10, %f9;
+; CHECK-NEXT:    mul.rn.f32 %f15, %f14, %f13;
+; CHECK-NEXT:    st.param.f32 [func_retval0], %f15;
+; CHECK-NEXT:    ret;
+  %res = call reassoc float @llvm.vector.reduce.fmul(float 1.0, <8 x float> %in)
+  ret float %res
+}
+
+define float @reduce_fmul_float_reassoc_nonpow2(<7 x float> %in) {
+; CHECK-LABEL: reduce_fmul_float_reassoc_nonpow2(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<14>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f7, [reduce_fmul_float_reassoc_nonpow2_param_0+24];
+; CHECK-NEXT:    ld.param.v2.f32 {%f5, %f6}, [reduce_fmul_float_reassoc_nonpow2_param_0+16];
+; CHECK-NEXT:    ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmul_float_reassoc_nonpow2_param_0];
+; CHECK-NEXT:    mul.rn.f32 %f8, %f3, %f7;
+; CHECK-NEXT:    mul.rn.f32 %f9, %f1, %f5;
+; CHECK-NEXT:    mul.rn.f32 %f10, %f9, %f8;
+; CHECK-NEXT:    mul.rn.f32 %f11, %f2, %f6;
+; CHECK-NEXT:    mul.rn.f32 %f12, %f11, %f4;
+; CHECK-NEXT:    mul.rn.f32 %f13, %f10, %f12;
+; CHECK-NEXT:    st.param.f32 [func_retval0], %f13;
+; CHECK-NEXT:    ret;
+  %res = call reassoc float @llvm.vector.reduce.fmul(float 1.0, <7 x float> %in)
+  ret float %res
+}
+
+; Check straight line reduction.
+define half @reduce_fmax_half(<8 x half> %in) {
+; CHECK-LABEL: reduce_fmax_half(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<4>;
+; CHECK-NEXT:    .reg .b32 %r<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fmax_half_param_0];
+; CHECK-NEXT:    max.f16x2 %r5, %r2, %r4;
+; CHECK-NEXT:    max.f16x2 %r6, %r1, %r3;
+; CHECK-NEXT:    max.f16x2 %r7, %r6, %r5;
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r7;
+; CHECK-NEXT:    max.f16 %rs3, %rs1, %rs2;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs3;
+; CHECK-NEXT:    ret;
+  %res = call half @llvm.vector.reduce.fmax(<8 x half> %in)
+  ret half %res
+}
+
+define half @reduce_fmax_half_reassoc(<8 x half> %in) {
+; CHECK-LABEL: reduce_fmax_half_reassoc(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<4>;
+; CHECK-NEXT:    .reg .b32 %r<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fmax_half_reassoc_param_0];
+; CHECK-NEXT:    max.f16x2 %r5, %r2, %r4;
+; CHECK-NEXT:    max.f16x2 %r6, %r1, %r3;
+; CHECK-NEXT:    max.f16x2 %r7, %r6, %r5;
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r7;
+; CHECK-NEXT:    max.f16 %rs3, %rs1, %rs2;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs3;
+; CHECK-NEXT:    ret;
+  %res = call reassoc half @llvm.vector.reduce.fmax(<8 x half> %in)
+  ret half %res
+}
+
+define half @reduce_fmax_half_reassoc_nonpow2(<7 x half> %in) {
+; CHECK-LABEL: reduce_fmax_half_reassoc_nonpow2(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<12>;
+; CHECK-NEXT:    .reg .b32 %r<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [reduce_fmax_half_reassoc_nonpow2_param_0+8];
+; CHECK-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-NEXT:    ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_fmax_half_reassoc_nonpow2_param_0];
+; CHECK-NEXT:    mov.b32 %r2, {%rs1, %rs2};
+; CHECK-NEXT:    mov.b32 %r3, {%rs3, %rs4};
+; CHECK-NEXT:    ld.param.b16 %rs7, [reduce_fmax_half_reassoc_nonpow2_param_0+12];
+; CHECK-NEXT:    mov.b16 %rs8, 0xFE00;
+; CHECK-NEXT:    mov.b32 %r4, {%rs7, %rs8};
+; CHECK-NEXT:    max.f16x2 %r5, %r3, %r4;
+; CHECK-NEXT:    max.f16x2 %r6, %r2, %r1;
+; CHECK-NEXT:    max.f16x2 %r7, %r6, %r5;
+; CHECK-NEXT:    mov.b32 {%rs9, %rs10}, %r7;
+; CHECK-NEXT:    max.f16 %rs11, %rs9, %rs10;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs11;
+; CHECK-NEXT:    ret;
+  %res = call reassoc half @llvm.vector.reduce.fmax(<7 x half> %in)
+  ret half %res
+}
+
+; Check straight-line reduction.
+define float @reduce_fmax_float(<8 x float> %in) {
+;
+; CHECK-LABEL: reduce_fmax_float(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<16>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmax_float_param_0+16];
+; CHECK-NEXT:    ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmax_float_param_0];
+; CHECK-NEXT:    max.f32 %f9, %f4, %f8;
+; CHECK-NEXT:    max.f32 %f10, %f2, %f6;
+; CHECK-NEXT:    max.f32 %f11, %f10, %f9;
+; CHECK-NEXT:    max.f32 %f12, %f3, %f7;
+; CHECK-NEXT:    max.f32 %f13, %f1, %f5;
+; CHECK-NEXT:    max.f32 %f14, %f13, %f12;
+; CHECK-NEXT:    max.f32 %f15, %f14, %f11;
+; CHECK-NEXT:    st.param.f32 [func_retval0], %f15;
+; CHECK-NEXT:    ret;
+  %res = call float @llvm.vector.reduce.fmax(<8 x float> %in)
+  ret float %res
+}
+
+define float @reduce_fmax_float_reassoc(<8 x float> %in) {
+;
+; CHECK-LABEL: reduce_fmax_float_reassoc(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<16>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmax_float_reassoc_param_0+16];
+; CHECK-NEXT:    ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmax_float_reassoc_param_0];
+; CHECK-NEXT:    max.f32 %f9, %f4, %f8;
+; CHECK-NEXT:    max.f32 %f10, %f2, %f6;
+; CHECK-NEXT:    max.f32 %f11, %f10, %f9;
+; CHECK-NEXT:    max.f32 %f12, %f3, %f7;
+; CHECK-NEXT:    max.f32 %f13, %f1, %f5;
+; CHECK-NEXT:    max.f32 %f14, %f13, %f12;
+; CHECK-NEXT:    max.f32 %f15, %f14, %f11;
+; CHECK-NEXT:    st.param.f32 [func_retval0], %f15;
+; CHECK-NEXT:    ret;
+  %res = call reassoc float @llvm.vector.reduce.fmax(<8 x float> %in)
+  ret float %res
+}
+
+define float @reduce_fmax_float_reassoc_nonpow2(<7 x float> %in) {
+;
+; CHECK-LABEL: reduce_fmax_float_reassoc_nonpow2(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<14>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f7, [reduce_fmax_float_reassoc_nonpow2_param_0+24];
+; CHECK-NEXT:    ld.param.v2.f32 {%f5, %f6}, [reduce_fmax_float_reassoc_nonpow2_param_0+16];
+; CHECK-NEXT:    ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmax_float_reassoc_nonpow2_param_0];
+; CHECK-NEXT:    max.f32 %f8, %f3, %f7;
+; CHECK-NEXT:    max.f32 %f9, %f1, %f5;
+; CHECK-NEXT:    max.f32 %f10, %f9, %f8;
+; CHECK-NEXT:    max.f32 %f11, %f2, %f6;
+; CHECK-NEXT:    max.f32 %f12, %f11, %f4;
+; CHECK-NEXT:    max.f32 %f13, %f10, %f12;
+; CHECK-NEXT:    st.param.f32 [func_retval0], %f13;
+; CHECK-NEXT:    ret;
+  %res = call reassoc float @llvm.vector.reduce.fmax(<7 x float> %in)
+  ret float %res
+}
+
+; Check straight line reduction.
+define half @reduce_fmin_half(<8 x half> %in) {
+; CHECK-LABEL: reduce_fmin_half(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<4>;
+; CHECK-NEXT:    .reg .b32 %r<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fmin_half_param_0];
+; CHECK-NEXT:    min.f16x2 %r5, %r2, %r4;
+; CHECK-NEXT:    min.f16x2 %r6, %r1, %r3;
+; CHECK-NEXT:    min.f16x2 %r7, %r6, %r5;
+; CHE...
[truncated]

@Prince781
Copy link
Contributor Author

Note: test case extracted from #136253

@Prince781 Prince781 merged commit 92c0b42 into llvm:main Apr 21, 2025
13 checks passed
IanWood1 pushed a commit to IanWood1/llvm-project that referenced this pull request May 6, 2025
Test how these intrinsics are handled by the NVPTX backend. Currently, these intrinsics are lowered to sequential reductions by the ExpandReductions pass.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants