From a035402e9ff56cb3b13562d3e7775087e273b32c Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Tue, 20 May 2025 11:52:13 -0400 Subject: [PATCH 1/6] [AMDGPU] si-peephole-sdwa: Disable V_CNDMASK conversion with sext The sext modifier on an operand of V_CNDMASK_B32_sdwa gets erroneously turned into a neg modifier in the assembly output. As a workaround, to avoid miscompilation, this patch disables the conversion of V_CNDMASK_B32 to the SDWA form if any operand uses an sext modifier. --- llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 7 +++ .../AMDGPU/sdwa-peephole-cndmask-sext.ll | 47 +++++++++++++++++++ 2 files changed, 54 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index bd8baaaa3df20..70d448e75eb1a 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -430,6 +430,13 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { case AMDGPU::V_CVT_PK_F32_BF8_sdwa: // Does not support input modifiers: noabs, noneg, nosext. return false; + case AMDGPU::V_CNDMASK_B32_sdwa: + // FIXME SISrcMods uses the same bitmask for SEXT and NEG + // modifiers and hence each instruction can only support one type + // of modifier; SEXT gets turned into NEG for this instruction. + if (Sext) + return false; + break; } // Find operand in instruction that matches source operand and replace it with diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll new file mode 100644 index 0000000000000..4a6c4ae5e6c02 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll @@ -0,0 +1,47 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s +; XFAIL: * + +; FIXME The sext modifier is turned into a neg modifier in the asm output + +define void @widget(ptr addrspace(7) %arg, <1 x i1> %arg1, <1 x i1> %arg2) #0 { +; CHECK-LABEL: widget: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s0, 0 +; CHECK-NEXT: s_mov_b32 s1, s0 +; CHECK-NEXT: s_mov_b32 s2, s0 +; CHECK-NEXT: s_mov_b32 s3, s0 +; CHECK-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; CHECK-NEXT: v_and_b32_e32 v1, 1, v5 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_and_saveexec_b64 s[0:1], vcc +; CHECK-NEXT: ; %bb.1: ; %cond.load +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: ds_read_b32 v1, v1 +; CHECK-NEXT: ; %bb.2: ; %else +; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] +; CHECK-NEXT: s_and_saveexec_b64 s[0:1], vcc +; CHECK-NEXT: s_cbranch_execz .LBB0_4 +; CHECK-NEXT: ; %bb.3: ; %cond.store +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_cndmask_b32_sdwa v0, v2, sext(v0), vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_or_b32_e32 v0, v0, v1 +; CHECK-NEXT: ds_write_b32 v2, v0 +; CHECK-NEXT: .LBB0_4: ; %else1 +; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +bb: + %load = load <1 x i8>, ptr addrspace(7) null, align 1 + %sext = sext <1 x i8> %load to <1 x i32> + %select = select <1 x i1> %arg1, <1 x i32> %sext, <1 x i32> zeroinitializer + %call = tail call <1 x i32> @llvm.masked.load.v1i32.p3(ptr addrspace(3) null, i32 1, <1 x i1> %arg1, <1 x i32> zeroinitializer) + %or = or <1 x i32> %select, %call + tail call void @llvm.masked.store.v1i32.p3(<1 x i32> %or, ptr addrspace(3) null, i32 1, <1 x i1> %arg1) + tail call void @llvm.amdgcn.s.waitcnt(i32 0) + ret void +} From 6761e1b4bbf548eb4e1970e54f3ea1aa12582c5f Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Wed, 21 May 2025 02:15:12 -0400 Subject: [PATCH 2/6] Extend comment on SISrcMods and make FIXME more explicit --- llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 70d448e75eb1a..1e305c2efc8a0 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -431,9 +431,17 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { // Does not support input modifiers: noabs, noneg, nosext. return false; case AMDGPU::V_CNDMASK_B32_sdwa: - // FIXME SISrcMods uses the same bitmask for SEXT and NEG - // modifiers and hence each instruction can only support one type - // of modifier; SEXT gets turned into NEG for this instruction. + // SISrcMods uses the same bitmask for SEXT and NEG modifiers and + // hence the compiler can only support one type of modifier for + // each SDWA instruction. For V_CNDMASK_B32_sdwa, this is NEG + // since its operands get printed using + // AMDGPUInstPrinter::printOperandAndFPInputMods which produces + // the output intended for NEG if SEXT is set. + // + // The ISA does actually support both modifiers on most SDWA + // instructions. + // + // FIXME Accept SEXT here after fixing this issue. if (Sext) return false; break; From e3d9f8273d8e2fc3ac163078a7b590b73dc71bd4 Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Wed, 21 May 2025 03:15:15 -0400 Subject: [PATCH 3/6] Simplify test --- .../AMDGPU/sdwa-peephole-cndmask-sext.ll | 46 ++++--------------- 1 file changed, 10 insertions(+), 36 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll index 4a6c4ae5e6c02..47dc0335e6f05 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll @@ -1,47 +1,21 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s ; XFAIL: * ; FIXME The sext modifier is turned into a neg modifier in the asm output -define void @widget(ptr addrspace(7) %arg, <1 x i1> %arg1, <1 x i1> %arg2) #0 { -; CHECK-LABEL: widget: -; CHECK: ; %bb.0: ; %bb +define i32 @test_select_on_sext_sdwa(i8 %x, i32 %y, i1 %cond) { +; CHECK-LABEL: test_select_on_sext_sdwa: +; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s0, 0 -; CHECK-NEXT: s_mov_b32 s1, s0 -; CHECK-NEXT: s_mov_b32 s2, s0 -; CHECK-NEXT: s_mov_b32 s3, s0 -; CHECK-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 -; CHECK-NEXT: v_and_b32_e32 v1, 1, v5 -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: s_and_saveexec_b64 s[0:1], vcc -; CHECK-NEXT: ; %bb.1: ; %cond.load -; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: ds_read_b32 v1, v1 -; CHECK-NEXT: ; %bb.2: ; %else -; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] -; CHECK-NEXT: s_and_saveexec_b64 s[0:1], vcc -; CHECK-NEXT: s_cbranch_execz .LBB0_4 -; CHECK-NEXT: ; %bb.3: ; %cond.store +; CHECK-NEXT: v_and_b32_e32 v2, 1, v2 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: v_cndmask_b32_sdwa v0, v2, sext(v0), vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_or_b32_e32 v0, v0, v1 -; CHECK-NEXT: ds_write_b32 v2, v0 -; CHECK-NEXT: .LBB0_4: ; %else1 -; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] -bb: - %load = load <1 x i8>, ptr addrspace(7) null, align 1 - %sext = sext <1 x i8> %load to <1 x i32> - %select = select <1 x i1> %arg1, <1 x i32> %sext, <1 x i32> zeroinitializer - %call = tail call <1 x i32> @llvm.masked.load.v1i32.p3(ptr addrspace(3) null, i32 1, <1 x i1> %arg1, <1 x i32> zeroinitializer) - %or = or <1 x i32> %select, %call - tail call void @llvm.masked.store.v1i32.p3(<1 x i32> %or, ptr addrspace(3) null, i32 1, <1 x i1> %arg1) - tail call void @llvm.amdgcn.s.waitcnt(i32 0) - ret void + %sext = sext i8 %x to i32 + %select = select i1 %cond, i32 %sext, i32 zeroinitializer + %or = or i32 %select, %y + ret i32 %or } From f6d7513b483be8a9c3f545e2baef25224db70477 Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Wed, 21 May 2025 15:40:56 +0200 Subject: [PATCH 4/6] Update llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll Co-authored-by: Matt Arsenault --- llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll index 47dc0335e6f05..1d61dad704aec 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll @@ -15,7 +15,7 @@ define i32 @test_select_on_sext_sdwa(i8 %x, i32 %y, i1 %cond) { ; CHECK-NEXT: v_or_b32_e32 v0, v0, v1 ; CHECK-NEXT: s_setpc_b64 s[30:31] %sext = sext i8 %x to i32 - %select = select i1 %cond, i32 %sext, i32 zeroinitializer + %select = select i1 %cond, i32 %sext, i32 0 %or = or i32 %select, %y ret i32 %or } From f7b8411d4bf3ba17edf3dca18bc41fc5c177200f Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Wed, 21 May 2025 10:01:32 -0400 Subject: [PATCH 5/6] Move test case into existing test file --- .../AMDGPU/sdwa-peephole-cndmask-sext.ll | 21 ---------- .../AMDGPU/sdwa-peephole-cndmask-wave64.mir | 38 +++++++++++++++++++ 2 files changed, 38 insertions(+), 21 deletions(-) delete mode 100644 llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll deleted file mode 100644 index 1d61dad704aec..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll +++ /dev/null @@ -1,21 +0,0 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s -; XFAIL: * - -; FIXME The sext modifier is turned into a neg modifier in the asm output - -define i32 @test_select_on_sext_sdwa(i8 %x, i32 %y, i1 %cond) { -; CHECK-LABEL: test_select_on_sext_sdwa: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v2, 1, v2 -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: v_cndmask_b32_sdwa v0, v2, sext(v0), vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; CHECK-NEXT: v_or_b32_e32 v0, v0, v1 -; CHECK-NEXT: s_setpc_b64 s[30:31] - %sext = sext i8 %x to i32 - %select = select i1 %cond, i32 %sext, i32 0 - %or = or i32 %select, %y - ret i32 %or -} diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave64.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave64.mir index e243df4077ff4..52c06952aa9fd 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave64.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave64.mir @@ -231,3 +231,41 @@ body: | $vgpr0 = COPY %3 SI_RETURN implicit $vgpr0 ... + +# SDWA conversion of V_CNDMASK_B32 with V_BFE_I32 operand had to be +# disabled. +# FIXME sext modifier gets erroneously printed as neg modifier. + +... +--- +name: issue138766_cndmask_with_sext +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; CHECK-LABEL: name: issue138766_cndmask_with_sext + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 1, [[COPY]], implicit $exec + ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 killed [[V_AND_B32_e64_]], 1, implicit $exec + ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[COPY2]], 0, 8, implicit $exec + ; CHECK-NEXT: $vcc = COPY killed [[V_CMP_EQ_U32_e64_]] + ; CHECK-NEXT: [[V_CNDMASK_B32_e32_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e32 0, killed [[V_BFE_I32_e64_]], implicit $vcc, implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 killed [[V_CNDMASK_B32_e32_]], [[COPY1]], implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + %10:vgpr_32 = COPY $vgpr2 + %9:vgpr_32 = COPY $vgpr1 + %8:vgpr_32 = COPY $vgpr0 + %11:vgpr_32 = V_AND_B32_e64 1, %10, implicit $exec + %12:sreg_64_xexec = V_CMP_EQ_U32_e64 killed %11, 1, implicit $exec + %14:vgpr_32 = V_BFE_I32_e64 %8, 0, 8, implicit $exec + %16:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, killed %14, killed %12, implicit $exec + %18:vgpr_32 = V_OR_B32_e64 killed %16, %9, implicit $exec + $vgpr0 = COPY %18 + SI_RETURN implicit $vgpr0 +... From 0a887bfa26c7362b97b93f604bb3f55d6d8b1d59 Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Mon, 26 May 2025 06:34:54 +0000 Subject: [PATCH 6/6] Reintroduce IR test --- .../AMDGPU/sdwa-peephole-cndmask-sext.ll | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll new file mode 100644 index 0000000000000..2c7819a395c86 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll @@ -0,0 +1,21 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s + +; FIXME The sext modifier is turned into a neg modifier in the asm output + +define i32 @test_select_on_sext_sdwa(i8 %x, i32 %y, i1 %cond) { +; CHECK-LABEL: test_select_on_sext_sdwa: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_and_b32_e32 v2, 1, v2 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; CHECK-NEXT: v_bfe_i32 v0, v0, 0, 8 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; CHECK-NEXT: v_or_b32_e32 v0, v0, v1 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %sext = sext i8 %x to i32 + %select = select i1 %cond, i32 %sext, i32 0 + %or = or i32 %select, %y + ret i32 %or +}