Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 40 additions & 8 deletions llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -895,6 +895,8 @@ SIFoldOperandsImpl::isRegSeqSplat(MachineInstr &RegSeq) const {
if (!SrcRC)
return {};

// TODO: Recognize 64-bit splats broken into 32-bit pieces (i.e. recognize
// that every other element is 0 for 64-bit immediates)
int64_t Imm;
for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
const MachineOperand *Op = Defs[I].first;
Expand Down Expand Up @@ -924,10 +926,41 @@ MachineOperand *SIFoldOperandsImpl::tryFoldRegSeqSplat(
if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
return nullptr;

// FIXME: Verify SplatRC is compatible with the use operand
uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
if (!TII->isInlineConstant(*SplatVal, OpTy) ||
!TII->isOperandLegal(*UseMI, UseOpIdx, SplatVal))
int16_t RCID = Desc.operands()[UseOpIdx].RegClass;
if (RCID == -1)
return nullptr;

// Special case 0/-1, since when interpreted as a 64-bit element both halves
// have the same bits. Effectively this code does not handle 64-bit element
// operands correctly, as the incoming 64-bit constants are already split into
// 32-bit sequence elements.
//
// TODO: We should try to figure out how to interpret the reg_sequence as a
// split 64-bit splat constant, or use 64-bit pseudos for materializing f64
// constants.
if (SplatVal->getImm() != 0 && SplatVal->getImm() != -1) {
const TargetRegisterClass *OpRC = TRI->getRegClass(RCID);
// We need to figure out the scalar type read by the operand. e.g. the MFMA
// operand will be AReg_128, and we want to check if it's compatible with an
// AReg_32 constant.
uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
switch (OpTy) {
case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0);
break;
case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0_sub1);
break;
default:
return nullptr;
}

if (!TRI->getCommonSubClass(OpRC, SplatRC))
return nullptr;
}

if (!TII->isOperandLegal(*UseMI, UseOpIdx, SplatVal))
return nullptr;

return SplatVal;
Expand Down Expand Up @@ -1039,14 +1072,13 @@ void SIFoldOperandsImpl::foldOperand(
}
}

if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI, OpNo, FoldList))
if (RSUse->getSubReg() != RegSeqDstSubReg)
continue;

if (RSUse->getSubReg() != RegSeqDstSubReg)
if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI, OpNo, FoldList))
continue;

foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList,
CopiesToReplace);
foldOperand(OpToFold, RSUseMI, OpNo, FoldList, CopiesToReplace);
}

return;
Expand Down
6 changes: 4 additions & 2 deletions llvm/test/CodeGen/AMDGPU/constrained-shift.ll
Original file line number Diff line number Diff line change
Expand Up @@ -192,8 +192,10 @@ define amdgpu_ps <4 x i32> @s_csh_v4i32(<4 x i32> inreg %a, <4 x i32> inreg %b)
;
; GISEL-LABEL: s_csh_v4i32:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], 31
; GISEL-NEXT: s_and_b64 s[6:7], s[6:7], 31
; GISEL-NEXT: s_mov_b32 s8, 31
; GISEL-NEXT: s_mov_b32 s9, s8
; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9]
; GISEL-NEXT: s_and_b64 s[6:7], s[6:7], s[8:9]
; GISEL-NEXT: s_lshl_b32 s8, s0, s4
; GISEL-NEXT: s_lshl_b32 s9, s1, s5
; GISEL-NEXT: s_lshl_b32 s10, s2, s6
Expand Down
5 changes: 4 additions & 1 deletion llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
Original file line number Diff line number Diff line change
Expand Up @@ -745,7 +745,10 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0x100000001(ptr addrspace(1)
;
; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_0x100000001:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], 1
; GFX12-SDAG-NEXT: s_mov_b32 s0, 1
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: s_mov_b32 s1, s0
; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
Expand Down
14 changes: 11 additions & 3 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
Original file line number Diff line number Diff line change
Expand Up @@ -262,11 +262,19 @@ bb:
ret void
}

; FIXME: This should not be foldable as an inline immediate
; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and_low:
; GFX90A: v_mfma_f64_16x16x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 64{{$}}
; GCN: v_accvgpr_write_b32 a[[A_LOW_BITS_0:[0-9]+]], 64{{$}}
; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
; GCN: v_accvgpr_mov_b32 a[[LAST_CONST_REG:[0-9]+]], a[[A_LOW_BITS_0]]

; GFX90A: v_mfma_f64_16x16x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a{{\[}}[[A_LOW_BITS_0]]:[[LAST_CONST_REG]]{{\]$}}
; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 blgp:3
; GFX942: v_mfma_f64_16x16x4_f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 64{{$}}
; GFX942: v_mfma_f64_16x16x4_f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a{{\[}}[[A_LOW_BITS_0]]:[[LAST_CONST_REG]]{{\]$}}
; GFX942: v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 neg:[1,1,0]
; GCN: global_store_dwordx4
; GCN: global_store_dwordx4
Expand Down
4 changes: 3 additions & 1 deletion llvm/test/CodeGen/AMDGPU/operand-folding.ll
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,9 @@ define i32 @issue139908(i64 %in) {
; CHECK-LABEL: issue139908:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, 42, v[0:1]
; CHECK-NEXT: s_mov_b32 s4, 42
; CHECK-NEXT: s_mov_b32 s5, s4
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
; CHECK-NEXT: v_cndmask_b32_e64 v0, 2, 1, vcc
; CHECK-NEXT: s_setpc_b64 s[30:31]
%eq = icmp eq i64 %in, 180388626474
Expand Down
198 changes: 179 additions & 19 deletions llvm/test/CodeGen/AMDGPU/packed-fp32.ll
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX900 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG,GFX90A-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL,GFX90A-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG,GFX942-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL,GFX942-GISEL %s

define amdgpu_kernel void @fadd_v2_vv(ptr addrspace(1) %a) {
; GFX900-LABEL: fadd_v2_vv:
Expand Down Expand Up @@ -411,10 +411,12 @@ define amdgpu_kernel void @fadd_v2_v_lit_splat(ptr addrspace(1) %a) {
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; PACKED-GISEL-NEXT: s_mov_b32 s2, 1.0
; PACKED-GISEL-NEXT: s_mov_b32 s3, s2
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], 1.0
; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
Expand Down Expand Up @@ -1186,10 +1188,12 @@ define amdgpu_kernel void @fmul_v2_v_lit_splat(ptr addrspace(1) %a) {
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; PACKED-GISEL-NEXT: s_mov_b32 s2, 4.0
; PACKED-GISEL-NEXT: s_mov_b32 s3, s2
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], 4.0
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3]
; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
Expand Down Expand Up @@ -1594,6 +1598,40 @@ define amdgpu_kernel void @fma_v2_v_imm(ptr addrspace(1) %a) {
; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3] op_sel_hi:[1,0,0]
; PACKED-SDAG-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
; PACKED-SDAG-NEXT: s_endpgm
;
; GFX90A-GISEL-LABEL: fma_v2_v_imm:
; GFX90A-GISEL: ; %bb.0:
; GFX90A-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX90A-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX90A-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX90A-GISEL-NEXT: s_mov_b32 s4, 0x43480000
; GFX90A-GISEL-NEXT: s_mov_b32 s2, 0x42c80000
; GFX90A-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
; GFX90A-GISEL-NEXT: s_mov_b32 s5, s4
; GFX90A-GISEL-NEXT: s_mov_b32 s3, s2
; GFX90A-GISEL-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX90A-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
; GFX90A-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX90A-GISEL-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: fma_v2_v_imm:
; GFX942-GISEL: ; %bb.0:
; GFX942-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX942-GISEL-NEXT: s_mov_b32 s4, 0x43480000
; GFX942-GISEL-NEXT: s_mov_b32 s2, 0x42c80000
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
; GFX942-GISEL-NEXT: s_mov_b32 s5, s4
; GFX942-GISEL-NEXT: s_mov_b32 s3, s2
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX942-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
; GFX942-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX942-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
Expand Down Expand Up @@ -1675,19 +1713,39 @@ define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) {
; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-SDAG-NEXT: s_endpgm
;
; PACKED-GISEL-LABEL: fma_v2_v_lit_splat:
; PACKED-GISEL: ; %bb.0:
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; PACKED-GISEL-NEXT: s_mov_b32 s2, 1.0
; PACKED-GISEL-NEXT: s_mov_b32 s3, s2
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], 4.0, s[2:3]
; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
; GFX90A-GISEL-LABEL: fma_v2_v_lit_splat:
; GFX90A-GISEL: ; %bb.0:
; GFX90A-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX90A-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX90A-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX90A-GISEL-NEXT: s_mov_b32 s4, 1.0
; GFX90A-GISEL-NEXT: s_mov_b32 s2, 4.0
; GFX90A-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
; GFX90A-GISEL-NEXT: s_mov_b32 s5, s4
; GFX90A-GISEL-NEXT: s_mov_b32 s3, s2
; GFX90A-GISEL-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX90A-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
; GFX90A-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX90A-GISEL-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: fma_v2_v_lit_splat:
; GFX942-GISEL: ; %bb.0:
; GFX942-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX942-GISEL-NEXT: s_mov_b32 s4, 1.0
; GFX942-GISEL-NEXT: s_mov_b32 s2, 4.0
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
; GFX942-GISEL-NEXT: s_mov_b32 s5, s4
; GFX942-GISEL-NEXT: s_mov_b32 s3, s2
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX942-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
; GFX942-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX942-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
Expand Down Expand Up @@ -1725,6 +1783,40 @@ define amdgpu_kernel void @fma_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; PACKED-SDAG-NEXT: s_endpgm
;
; GFX90A-GISEL-LABEL: fma_v2_v_unfoldable_lit:
; GFX90A-GISEL: ; %bb.0:
; GFX90A-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX90A-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX90A-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX90A-GISEL-NEXT: s_mov_b32 s4, 1.0
; GFX90A-GISEL-NEXT: s_mov_b32 s2, 4.0
; GFX90A-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
; GFX90A-GISEL-NEXT: s_mov_b32 s5, 2.0
; GFX90A-GISEL-NEXT: s_mov_b32 s3, 0x40400000
; GFX90A-GISEL-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX90A-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
; GFX90A-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX90A-GISEL-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: fma_v2_v_unfoldable_lit:
; GFX942-GISEL: ; %bb.0:
; GFX942-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX942-GISEL-NEXT: s_mov_b32 s4, 1.0
; GFX942-GISEL-NEXT: s_mov_b32 s2, 4.0
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
; GFX942-GISEL-NEXT: s_mov_b32 s5, 2.0
; GFX942-GISEL-NEXT: s_mov_b32 s3, 0x40400000
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX942-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
; GFX942-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX942-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
Expand Down Expand Up @@ -2059,6 +2151,37 @@ define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) {
; PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s0
; PACKED-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; PACKED-SDAG-NEXT: s_endpgm
;
; GFX90A-GISEL-LABEL: fadd_fadd_fsub_0:
; GFX90A-GISEL: ; %bb.0: ; %bb
; GFX90A-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX90A-GISEL-NEXT: s_mov_b32 s2, 0
; GFX90A-GISEL-NEXT: s_mov_b32 s3, s2
; GFX90A-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], v[0:1]
; GFX90A-GISEL-NEXT: v_mov_b32_e32 v0, v1
; GFX90A-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], 0
; GFX90A-GISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX90A-GISEL-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX90A-GISEL-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: fadd_fadd_fsub_0:
; GFX942-GISEL: ; %bb.0: ; %bb
; GFX942-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-GISEL-NEXT: s_mov_b32 s2, 0
; GFX942-GISEL-NEXT: s_mov_b32 s3, s2
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], v[0:1]
; GFX942-GISEL-NEXT: s_nop 0
; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, v1
; GFX942-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], 0
; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, v0
; GFX942-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX942-GISEL-NEXT: s_endpgm
bb:
%i12 = fadd <2 x float> zeroinitializer, %arg
%shift8 = shufflevector <2 x float> %i12, <2 x float> poison, <2 x i32> <i32 1, i32 poison>
Expand Down Expand Up @@ -2099,6 +2222,40 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p
; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[2:3], s[2:3] neg_lo:[0,1] neg_hi:[0,1]
; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7]
; PACKED-SDAG-NEXT: s_endpgm
;
; GFX90A-GISEL-LABEL: fadd_fadd_fsub:
; GFX90A-GISEL: ; %bb.0: ; %bb
; GFX90A-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX90A-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-GISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], v[0:1]
; GFX90A-GISEL-NEXT: v_sub_f32_e32 v0, s0, v2
; GFX90A-GISEL-NEXT: v_mov_b32_e32 v2, v1
; GFX90A-GISEL-NEXT: v_pk_add_f32 v[2:3], s[2:3], v[2:3]
; GFX90A-GISEL-NEXT: v_subrev_f32_e32 v1, s3, v2
; GFX90A-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX90A-GISEL-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: fadd_fadd_fsub:
; GFX942-GISEL: ; %bb.0: ; %bb
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX942-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], v[0:1]
; GFX942-GISEL-NEXT: s_nop 0
; GFX942-GISEL-NEXT: v_sub_f32_e32 v0, s0, v2
; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, v1
; GFX942-GISEL-NEXT: v_pk_add_f32 v[2:3], s[2:3], v[2:3]
; GFX942-GISEL-NEXT: s_nop 0
; GFX942-GISEL-NEXT: v_subrev_f32_e32 v1, s3, v2
; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX942-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX942-GISEL-NEXT: s_endpgm
bb:
%i12 = fadd <2 x float> %arg, %arg1
%shift8 = shufflevector <2 x float> %i12, <2 x float> poison, <2 x i32> <i32 1, i32 poison>
Expand Down Expand Up @@ -2251,3 +2408,6 @@ declare i32 @llvm.amdgcn.workitem.id.x()
declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
declare <32 x float> @llvm.fma.v32f32(<32 x float>, <32 x float>, <32 x float>)
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX90A-SDAG: {{.*}}
; GFX942-SDAG: {{.*}}
Loading
Loading