diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 92937e33fd500..d81f25c57af60 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -895,6 +895,8 @@ SIFoldOperandsImpl::isRegSeqSplat(MachineInstr &RegSeq) const { if (!SrcRC) return {}; + // TODO: Recognize 64-bit splats broken into 32-bit pieces (i.e. recognize + // every other element is 0 for 64-bit immediates) int64_t Imm; for (unsigned I = 0, E = Defs.size(); I != E; ++I) { const MachineOperand *Op = Defs[I].first; @@ -924,10 +926,41 @@ MachineOperand *SIFoldOperandsImpl::tryFoldRegSeqSplat( if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx)) return nullptr; - // FIXME: Verify SplatRC is compatible with the use operand - uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType; - if (!TII->isInlineConstant(*SplatVal, OpTy) || - !TII->isOperandLegal(*UseMI, UseOpIdx, SplatVal)) + int16_t RCID = Desc.operands()[UseOpIdx].RegClass; + if (RCID == -1) + return nullptr; + + // Special case 0/-1, since when interpreted as a 64-bit element both halves + // have the same bits. Effectively this code does not handle 64-bit element + // operands correctly, as the incoming 64-bit constants are already split into + // 32-bit sequence elements. + // + // TODO: We should try to figure out how to interpret the reg_sequence as a + // split 64-bit splat constant, or use 64-bit pseudos for materializing f64 + // constants. + if (SplatVal->getImm() != 0 && SplatVal->getImm() != -1) { + const TargetRegisterClass *OpRC = TRI->getRegClass(RCID); + // We need to figure out the scalar type read by the operand. e.g. the MFMA + // operand will be AReg_128, and we want to check if it's compatible with an + // AReg_32 constant. 
+ uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType; + switch (OpTy) { + case AMDGPU::OPERAND_REG_INLINE_AC_INT32: + case AMDGPU::OPERAND_REG_INLINE_AC_FP32: + OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0); + break; + case AMDGPU::OPERAND_REG_INLINE_AC_FP64: + OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0_sub1); + break; + default: + return nullptr; + } + + if (!TRI->getCommonSubClass(OpRC, SplatRC)) + return nullptr; + } + + if (!TII->isOperandLegal(*UseMI, UseOpIdx, SplatVal)) return nullptr; return SplatVal; @@ -1039,14 +1072,13 @@ void SIFoldOperandsImpl::foldOperand( } } - if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI, OpNo, FoldList)) + if (RSUse->getSubReg() != RegSeqDstSubReg) continue; - if (RSUse->getSubReg() != RegSeqDstSubReg) + if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI, OpNo, FoldList)) continue; - foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList, - CopiesToReplace); + foldOperand(OpToFold, RSUseMI, OpNo, FoldList, CopiesToReplace); } return; diff --git a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll index af4ca2ad7120a..fb53e889b1158 100644 --- a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll +++ b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll @@ -192,8 +192,10 @@ define amdgpu_ps <4 x i32> @s_csh_v4i32(<4 x i32> inreg %a, <4 x i32> inreg %b) ; ; GISEL-LABEL: s_csh_v4i32: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], 31 -; GISEL-NEXT: s_and_b64 s[6:7], s[6:7], 31 +; GISEL-NEXT: s_mov_b32 s8, 31 +; GISEL-NEXT: s_mov_b32 s9, s8 +; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9] +; GISEL-NEXT: s_and_b64 s[6:7], s[6:7], s[8:9] ; GISEL-NEXT: s_lshl_b32 s8, s0, s4 ; GISEL-NEXT: s_lshl_b32 s9, s1, s5 ; GISEL-NEXT: s_lshl_b32 s10, s2, s6 diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll index 28245c538a04c..d588f0e0897b7 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll +++ 
b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll @@ -745,7 +745,10 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0x100000001(ptr addrspace(1) ; ; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_0x100000001: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], 1 +; GFX12-SDAG-NEXT: s_mov_b32 s0, 1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_mov_b32 s1, s0 +; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] ; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll index 86bfb694ab255..5d5dc01439fe4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll @@ -262,11 +262,19 @@ bb: ret void } -; FIXME: This should not be foldable as an inline immediate ; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and_low: -; GFX90A: v_mfma_f64_16x16x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 64{{$}} +; GCN: v_accvgpr_write_b32 a[[A_LOW_BITS_0:[0-9]+]], 64{{$}} +; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]] +; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]] +; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]] +; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]] +; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]] +; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]] +; GCN: v_accvgpr_mov_b32 a[[LAST_CONST_REG:[0-9]+]], a[[A_LOW_BITS_0]] + +; GFX90A: v_mfma_f64_16x16x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a{{\[}}[[A_LOW_BITS_0]]:[[LAST_CONST_REG]]{{\]$}} ; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 blgp:3 -; GFX942: v_mfma_f64_16x16x4_f64 
[[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 64{{$}} +; GFX942: v_mfma_f64_16x16x4_f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a{{\[}}[[A_LOW_BITS_0]]:[[LAST_CONST_REG]]{{\]$}} ; GFX942: v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 neg:[1,1,0] ; GCN: global_store_dwordx4 ; GCN: global_store_dwordx4 diff --git a/llvm/test/CodeGen/AMDGPU/operand-folding.ll b/llvm/test/CodeGen/AMDGPU/operand-folding.ll index ebfc5d02134c5..778d73fd919fc 100644 --- a/llvm/test/CodeGen/AMDGPU/operand-folding.ll +++ b/llvm/test/CodeGen/AMDGPU/operand-folding.ll @@ -155,7 +155,9 @@ define i32 @issue139908(i64 %in) { ; CHECK-LABEL: issue139908: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, 42, v[0:1] +; CHECK-NEXT: s_mov_b32 s4, 42 +; CHECK-NEXT: s_mov_b32 s5, s4 +; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] ; CHECK-NEXT: v_cndmask_b32_e64 v0, 2, 1, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %eq = icmp eq i64 %in, 180388626474 diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll index ddc3e770767b8..bef38c1a65ef8 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX900 %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope 
-check-prefixes=PACKED,PACKED-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG,GFX90A-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL,GFX90A-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG,GFX942-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL,GFX942-GISEL %s define amdgpu_kernel void @fadd_v2_vv(ptr addrspace(1) %a) { ; GFX900-LABEL: fadd_v2_vv: @@ -411,10 +411,12 @@ define amdgpu_kernel void @fadd_v2_v_lit_splat(ptr addrspace(1) %a) { ; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-GISEL-NEXT: s_mov_b32 s2, 1.0 +; PACKED-GISEL-NEXT: s_mov_b32 s3, s2 ; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] ; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) -; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], 1.0 +; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] ; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -1186,10 +1188,12 @@ define amdgpu_kernel void @fmul_v2_v_lit_splat(ptr addrspace(1) %a) { ; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-GISEL-NEXT: s_mov_b32 s2, 4.0 +; PACKED-GISEL-NEXT: s_mov_b32 s3, s2 ; 
PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] ; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) -; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], 4.0 +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3] ; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -1594,6 +1598,40 @@ define amdgpu_kernel void @fma_v2_v_imm(ptr addrspace(1) %a) { ; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3] op_sel_hi:[1,0,0] ; PACKED-SDAG-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] ; PACKED-SDAG-NEXT: s_endpgm +; +; GFX90A-GISEL-LABEL: fma_v2_v_imm: +; GFX90A-GISEL: ; %bb.0: +; GFX90A-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90A-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX90A-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX90A-GISEL-NEXT: s_mov_b32 s4, 0x43480000 +; GFX90A-GISEL-NEXT: s_mov_b32 s2, 0x42c80000 +; GFX90A-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; GFX90A-GISEL-NEXT: s_mov_b32 s5, s4 +; GFX90A-GISEL-NEXT: s_mov_b32 s3, s2 +; GFX90A-GISEL-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX90A-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3] +; GFX90A-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX90A-GISEL-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: fma_v2_v_imm: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX942-GISEL-NEXT: s_mov_b32 s4, 0x43480000 +; GFX942-GISEL-NEXT: s_mov_b32 s2, 0x42c80000 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; GFX942-GISEL-NEXT: s_mov_b32 s5, s4 +; GFX942-GISEL-NEXT: s_mov_b32 s3, s2 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX942-GISEL-NEXT: 
s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3] +; GFX942-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX942-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -1675,19 +1713,39 @@ define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) { ; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; PACKED-SDAG-NEXT: s_endpgm ; -; PACKED-GISEL-LABEL: fma_v2_v_lit_splat: -; PACKED-GISEL: ; %bb.0: -; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; PACKED-GISEL-NEXT: s_mov_b32 s2, 1.0 -; PACKED-GISEL-NEXT: s_mov_b32 s3, s2 -; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] -; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) -; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], 4.0, s[2:3] -; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; PACKED-GISEL-NEXT: s_endpgm +; GFX90A-GISEL-LABEL: fma_v2_v_lit_splat: +; GFX90A-GISEL: ; %bb.0: +; GFX90A-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90A-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX90A-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX90A-GISEL-NEXT: s_mov_b32 s4, 1.0 +; GFX90A-GISEL-NEXT: s_mov_b32 s2, 4.0 +; GFX90A-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; GFX90A-GISEL-NEXT: s_mov_b32 s5, s4 +; GFX90A-GISEL-NEXT: s_mov_b32 s3, s2 +; GFX90A-GISEL-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX90A-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3] +; GFX90A-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX90A-GISEL-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: fma_v2_v_lit_splat: +; GFX942-GISEL: ; %bb.0: +; 
GFX942-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX942-GISEL-NEXT: s_mov_b32 s4, 1.0 +; GFX942-GISEL-NEXT: s_mov_b32 s2, 4.0 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; GFX942-GISEL-NEXT: s_mov_b32 s5, s4 +; GFX942-GISEL-NEXT: s_mov_b32 s3, s2 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3] +; GFX942-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX942-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -1725,6 +1783,40 @@ define amdgpu_kernel void @fma_v2_v_unfoldable_lit(ptr addrspace(1) %a) { ; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3] ; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; PACKED-SDAG-NEXT: s_endpgm +; +; GFX90A-GISEL-LABEL: fma_v2_v_unfoldable_lit: +; GFX90A-GISEL: ; %bb.0: +; GFX90A-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90A-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX90A-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX90A-GISEL-NEXT: s_mov_b32 s4, 1.0 +; GFX90A-GISEL-NEXT: s_mov_b32 s2, 4.0 +; GFX90A-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; GFX90A-GISEL-NEXT: s_mov_b32 s5, 2.0 +; GFX90A-GISEL-NEXT: s_mov_b32 s3, 0x40400000 +; GFX90A-GISEL-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX90A-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3] +; GFX90A-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX90A-GISEL-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: fma_v2_v_unfoldable_lit: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x24 +; GFX942-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX942-GISEL-NEXT: s_mov_b32 s4, 1.0 +; GFX942-GISEL-NEXT: s_mov_b32 s2, 4.0 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; GFX942-GISEL-NEXT: s_mov_b32 s5, 2.0 +; GFX942-GISEL-NEXT: s_mov_b32 s3, 0x40400000 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3] +; GFX942-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX942-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -2059,6 +2151,37 @@ define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) { ; PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; PACKED-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; PACKED-SDAG-NEXT: s_endpgm +; +; GFX90A-GISEL-LABEL: fadd_fadd_fsub_0: +; GFX90A-GISEL: ; %bb.0: ; %bb +; GFX90A-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90A-GISEL-NEXT: s_mov_b32 s2, 0 +; GFX90A-GISEL-NEXT: s_mov_b32 s3, s2 +; GFX90A-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], v[0:1] +; GFX90A-GISEL-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], 0 +; GFX90A-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX90A-GISEL-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX90A-GISEL-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: fadd_fadd_fsub_0: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-GISEL-NEXT: s_mov_b32 s2, 0 +; GFX942-GISEL-NEXT: s_mov_b32 s3, s2 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX942-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], v[0:1] +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, v1 +; GFX942-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], 0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX942-GISEL-NEXT: s_endpgm bb: %i12 = fadd <2 x float> zeroinitializer, %arg %shift8 = shufflevector <2 x float> %i12, <2 x float> poison, <2 x i32> @@ -2099,6 +2222,40 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p ; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[2:3], s[2:3] neg_lo:[0,1] neg_hi:[0,1] ; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7] ; PACKED-SDAG-NEXT: s_endpgm +; +; GFX90A-GISEL-LABEL: fadd_fadd_fsub: +; GFX90A-GISEL: ; %bb.0: ; %bb +; GFX90A-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX90A-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], v[0:1] +; GFX90A-GISEL-NEXT: v_sub_f32_e32 v0, s0, v2 +; GFX90A-GISEL-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-GISEL-NEXT: v_pk_add_f32 v[2:3], s[2:3], v[2:3] +; GFX90A-GISEL-NEXT: v_subrev_f32_e32 v1, s3, v2 +; GFX90A-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX90A-GISEL-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: fadd_fadd_fsub: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], v[0:1] +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_sub_f32_e32 v0, s0, v2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, v1 +; 
GFX942-GISEL-NEXT: v_pk_add_f32 v[2:3], s[2:3], v[2:3] +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_subrev_f32_e32 v1, s3, v2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX942-GISEL-NEXT: s_endpgm bb: %i12 = fadd <2 x float> %arg, %arg1 %shift8 = shufflevector <2 x float> %i12, <2 x float> poison, <2 x i32> @@ -2251,3 +2408,6 @@ declare i32 @llvm.amdgcn.workitem.id.x() declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) declare <32 x float> @llvm.fma.v32f32(<32 x float>, <32 x float>, <32 x float>) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX90A-SDAG: {{.*}} +; GFX942-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.mir b/llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.mir index aa1a7441bc477..8d6c3efb5ded5 100644 --- a/llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.mir +++ b/llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.mir @@ -140,7 +140,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 42 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 - ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY]], 42, implicit-def $scc + ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY]], [[REG_SEQUENCE]], implicit-def $scc ; CHECK-NEXT: S_ENDPGM 0, implicit $scc %0:sgpr_64 = COPY $sgpr8_sgpr9 %1:sreg_32 = S_MOV_B32 42