llvm · arsenm · May 19, 2025 · May 16, 2025
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -895,6 +895,8 @@ SIFoldOperandsImpl::isRegSeqSplat(MachineInstr &RegSeq) const {
   if (!SrcRC)
     return {};
 
+  // TODO: Recognize 64-bit splats broken into 32-bit pieces (i.e. recognize
+  // every other other element is 0 for 64-bit immediates)
   int64_t Imm;
   for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
     const MachineOperand *Op = Defs[I].first;
@@ -924,10 +926,41 @@ MachineOperand *SIFoldOperandsImpl::tryFoldRegSeqSplat(
   if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
     return nullptr;
 
-  // FIXME: Verify SplatRC is compatible with the use operand
-  uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
-  if (!TII->isInlineConstant(*SplatVal, OpTy) ||
-      !TII->isOperandLegal(*UseMI, UseOpIdx, SplatVal))
+  int16_t RCID = Desc.operands()[UseOpIdx].RegClass;
+  if (RCID == -1)
+    return nullptr;
+
+  // Special case 0/-1, since when interpreted as a 64-bit element both halves
+  // have the same bits. Effectively this code does not handle 64-bit element
+  // operands correctly, as the incoming 64-bit constants are already split into
+  // 32-bit sequence elements.
+  //
+  // TODO: We should try to figure out how to interpret the reg_sequence as a
+  // split 64-bit splat constant, or use 64-bit pseudos for materializing f64
+  // constants.
+  if (SplatVal->getImm() != 0 && SplatVal->getImm() != -1) {
+    const TargetRegisterClass *OpRC = TRI->getRegClass(RCID);
+    // We need to figure out the scalar type read by the operand. e.g. the MFMA
+    // operand will be AReg_128, and we want to check if it's compatible with an
+    // AReg_32 constant.
+    uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
+    switch (OpTy) {
+    case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
+    case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
+      OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0);
+      break;
+    case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
+      OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0_sub1);
+      break;
+    default:
+      return nullptr;
+    }
+
+    if (!TRI->getCommonSubClass(OpRC, SplatRC))
+      return nullptr;
+  }
+
+  if (!TII->isOperandLegal(*UseMI, UseOpIdx, SplatVal))
     return nullptr;
 
   return SplatVal;
@@ -1039,14 +1072,13 @@ void SIFoldOperandsImpl::foldOperand(
         }
       }
 
-      if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI, OpNo, FoldList))
+      if (RSUse->getSubReg() != RegSeqDstSubReg)
         continue;
 
-      if (RSUse->getSubReg() != RegSeqDstSubReg)
+      if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI, OpNo, FoldList))
         continue;
 
-      foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList,
-                  CopiesToReplace);
+      foldOperand(OpToFold, RSUseMI, OpNo, FoldList, CopiesToReplace);
     }
 
     return;

diff --git a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll
@@ -192,8 +192,10 @@ define amdgpu_ps <4 x i32> @s_csh_v4i32(<4 x i32> inreg %a, <4 x i32> inreg %b)
 ;
 ; GISEL-LABEL: s_csh_v4i32:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_and_b64 s[4:5], s[4:5], 31
-; GISEL-NEXT:    s_and_b64 s[6:7], s[6:7], 31
+; GISEL-NEXT:    s_mov_b32 s8, 31
+; GISEL-NEXT:    s_mov_b32 s9, s8
+; GISEL-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
+; GISEL-NEXT:    s_and_b64 s[6:7], s[6:7], s[8:9]
 ; GISEL-NEXT:    s_lshl_b32 s8, s0, s4
 ; GISEL-NEXT:    s_lshl_b32 s9, s1, s5
 ; GISEL-NEXT:    s_lshl_b32 s10, s2, s6

diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@@ -745,7 +745,10 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0x100000001(ptr addrspace(1)
 ;
 ; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_0x100000001:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[2:3], 1
+; GFX12-SDAG-NEXT:    s_mov_b32 s0, 1
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT:    s_mov_b32 s1, s0
+; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[2:3], s[0:1]
 ; GFX12-SDAG-NEXT:    s_load_u8 s0, s[0:1], 0x0
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, s0

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
@@ -262,11 +262,19 @@ bb:
   ret void
 }
 
-; FIXME: This should not be foldable as an inline immediate
 ; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and_low:
-; GFX90A: v_mfma_f64_16x16x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 64{{$}}
+; GCN: v_accvgpr_write_b32 a[[A_LOW_BITS_0:[0-9]+]], 64{{$}}
+; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
+; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
+; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
+; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
+; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
+; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
+; GCN: v_accvgpr_mov_b32 a[[LAST_CONST_REG:[0-9]+]], a[[A_LOW_BITS_0]]
+
+; GFX90A: v_mfma_f64_16x16x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a{{\[}}[[A_LOW_BITS_0]]:[[LAST_CONST_REG]]{{\]$}}
 ; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 blgp:3
-; GFX942: v_mfma_f64_16x16x4_f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 64{{$}}
+; GFX942: v_mfma_f64_16x16x4_f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a{{\[}}[[A_LOW_BITS_0]]:[[LAST_CONST_REG]]{{\]$}}
 ; GFX942: v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 neg:[1,1,0]
 ; GCN:    global_store_dwordx4
 ; GCN:    global_store_dwordx4

diff --git a/llvm/test/CodeGen/AMDGPU/operand-folding.ll b/llvm/test/CodeGen/AMDGPU/operand-folding.ll
@@ -155,7 +155,9 @@ define i32 @issue139908(i64 %in) {
 ; CHECK-LABEL: issue139908:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_cmp_eq_u64_e32 vcc, 42, v[0:1]
+; CHECK-NEXT:    s_mov_b32 s4, 42
+; CHECK-NEXT:    s_mov_b32 s5, s4
+; CHECK-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 2, 1, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %eq = icmp eq i64 %in, 180388626474

diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX900 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG,GFX90A-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL,GFX90A-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG,GFX942-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL,GFX942-GISEL %s
 
 define amdgpu_kernel void @fadd_v2_vv(ptr addrspace(1) %a) {
 ; GFX900-LABEL: fadd_v2_vv:
@@ -411,10 +411,12 @@ define amdgpu_kernel void @fadd_v2_v_lit_splat(ptr addrspace(1) %a) {
 ; PACKED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; PACKED-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; PACKED-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-GISEL-NEXT:    s_mov_b32 s2, 1.0
+; PACKED-GISEL-NEXT:    s_mov_b32 s3, s2
 ; PACKED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; PACKED-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
 ; PACKED-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; PACKED-GISEL-NEXT:    v_pk_add_f32 v[0:1], v[0:1], 1.0
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[0:1], v[0:1], s[2:3]
 ; PACKED-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; PACKED-GISEL-NEXT:    s_endpgm
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -1186,10 +1188,12 @@ define amdgpu_kernel void @fmul_v2_v_lit_splat(ptr addrspace(1) %a) {
 ; PACKED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; PACKED-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; PACKED-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-GISEL-NEXT:    s_mov_b32 s2, 4.0
+; PACKED-GISEL-NEXT:    s_mov_b32 s3, s2
 ; PACKED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; PACKED-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
 ; PACKED-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; PACKED-GISEL-NEXT:    v_pk_mul_f32 v[0:1], v[0:1], 4.0
+; PACKED-GISEL-NEXT:    v_pk_mul_f32 v[0:1], v[0:1], s[2:3]
 ; PACKED-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; PACKED-GISEL-NEXT:    s_endpgm
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -1594,6 +1598,40 @@ define amdgpu_kernel void @fma_v2_v_imm(ptr addrspace(1) %a) {
 ; PACKED-SDAG-NEXT:    v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3] op_sel_hi:[1,0,0]
 ; PACKED-SDAG-NEXT:    global_store_dwordx2 v3, v[0:1], s[0:1]
 ; PACKED-SDAG-NEXT:    s_endpgm
+;
+; GFX90A-GISEL-LABEL: fma_v2_v_imm:
+; GFX90A-GISEL:       ; %bb.0:
+; GFX90A-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX90A-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; GFX90A-GISEL-NEXT:    s_mov_b32 s4, 0x43480000
+; GFX90A-GISEL-NEXT:    s_mov_b32 s2, 0x42c80000
+; GFX90A-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-GISEL-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX90A-GISEL-NEXT:    s_mov_b32 s5, s4
+; GFX90A-GISEL-NEXT:    s_mov_b32 s3, s2
+; GFX90A-GISEL-NEXT:    v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-GISEL-NEXT:    v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
+; GFX90A-GISEL-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX90A-GISEL-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: fma_v2_v_imm:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; GFX942-GISEL-NEXT:    s_mov_b32 s4, 0x43480000
+; GFX942-GISEL-NEXT:    s_mov_b32 s2, 0x42c80000
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX942-GISEL-NEXT:    s_mov_b32 s5, s4
+; GFX942-GISEL-NEXT:    s_mov_b32 s3, s2
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
+; GFX942-GISEL-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    s_endpgm
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
   %load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -1675,19 +1713,39 @@ define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) {
 ; PACKED-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; PACKED-SDAG-NEXT:    s_endpgm
 ;
-; PACKED-GISEL-LABEL: fma_v2_v_lit_splat:
-; PACKED-GISEL:       ; %bb.0:
-; PACKED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; PACKED-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; PACKED-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; PACKED-GISEL-NEXT:    s_mov_b32 s2, 1.0
-; PACKED-GISEL-NEXT:    s_mov_b32 s3, s2
-; PACKED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; PACKED-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
-; PACKED-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; PACKED-GISEL-NEXT:    v_pk_fma_f32 v[0:1], v[0:1], 4.0, s[2:3]
-; PACKED-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; PACKED-GISEL-NEXT:    s_endpgm
+; GFX90A-GISEL-LABEL: fma_v2_v_lit_splat:
+; GFX90A-GISEL:       ; %bb.0:
+; GFX90A-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX90A-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; GFX90A-GISEL-NEXT:    s_mov_b32 s4, 1.0
+; GFX90A-GISEL-NEXT:    s_mov_b32 s2, 4.0
+; GFX90A-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-GISEL-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX90A-GISEL-NEXT:    s_mov_b32 s5, s4
+; GFX90A-GISEL-NEXT:    s_mov_b32 s3, s2
+; GFX90A-GISEL-NEXT:    v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-GISEL-NEXT:    v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
+; GFX90A-GISEL-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX90A-GISEL-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: fma_v2_v_lit_splat:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; GFX942-GISEL-NEXT:    s_mov_b32 s4, 1.0
+; GFX942-GISEL-NEXT:    s_mov_b32 s2, 4.0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX942-GISEL-NEXT:    s_mov_b32 s5, s4
+; GFX942-GISEL-NEXT:    s_mov_b32 s3, s2
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
+; GFX942-GISEL-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    s_endpgm
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
   %load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -1725,6 +1783,40 @@ define amdgpu_kernel void @fma_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
 ; PACKED-SDAG-NEXT:    v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
 ; PACKED-SDAG-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
 ; PACKED-SDAG-NEXT:    s_endpgm
+;
+; GFX90A-GISEL-LABEL: fma_v2_v_unfoldable_lit:
+; GFX90A-GISEL:       ; %bb.0:
+; GFX90A-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX90A-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; GFX90A-GISEL-NEXT:    s_mov_b32 s4, 1.0
+; GFX90A-GISEL-NEXT:    s_mov_b32 s2, 4.0
+; GFX90A-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-GISEL-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX90A-GISEL-NEXT:    s_mov_b32 s5, 2.0
+; GFX90A-GISEL-NEXT:    s_mov_b32 s3, 0x40400000
+; GFX90A-GISEL-NEXT:    v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-GISEL-NEXT:    v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
+; GFX90A-GISEL-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX90A-GISEL-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: fma_v2_v_unfoldable_lit:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; GFX942-GISEL-NEXT:    s_mov_b32 s4, 1.0
+; GFX942-GISEL-NEXT:    s_mov_b32 s2, 4.0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX942-GISEL-NEXT:    s_mov_b32 s5, 2.0
+; GFX942-GISEL-NEXT:    s_mov_b32 s3, 0x40400000
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
+; GFX942-GISEL-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    s_endpgm
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
   %load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -2059,6 +2151,37 @@ define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) {
 ; PACKED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
 ; PACKED-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
 ; PACKED-SDAG-NEXT:    s_endpgm
+;
+; GFX90A-GISEL-LABEL: fadd_fadd_fsub_0:
+; GFX90A-GISEL:       ; %bb.0: ; %bb
+; GFX90A-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-GISEL-NEXT:    s_mov_b32 s2, 0
+; GFX90A-GISEL-NEXT:    s_mov_b32 s3, s2
+; GFX90A-GISEL-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-GISEL-NEXT:    v_pk_add_f32 v[0:1], s[0:1], v[0:1]
+; GFX90A-GISEL-NEXT:    v_mov_b32_e32 v0, v1
+; GFX90A-GISEL-NEXT:    v_pk_add_f32 v[0:1], v[0:1], 0
+; GFX90A-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX90A-GISEL-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX90A-GISEL-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: fadd_fadd_fsub_0:
+; GFX942-GISEL:       ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-GISEL-NEXT:    s_mov_b32 s2, 0
+; GFX942-GISEL-NEXT:    s_mov_b32 s3, s2
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_pk_add_f32 v[0:1], s[0:1], v[0:1]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, v1
+; GFX942-GISEL-NEXT:    v_pk_add_f32 v[0:1], v[0:1], 0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
 bb:
   %i12 = fadd <2 x float> zeroinitializer, %arg
   %shift8 = shufflevector <2 x float> %i12, <2 x float> poison, <2 x i32> <i32 1, i32 poison>
@@ -2099,6 +2222,40 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p
 ; PACKED-SDAG-NEXT:    v_pk_add_f32 v[0:1], v[2:3], s[2:3] neg_lo:[0,1] neg_hi:[0,1]
 ; PACKED-SDAG-NEXT:    global_store_dwordx2 v4, v[0:1], s[6:7]
 ; PACKED-SDAG-NEXT:    s_endpgm
+;
+; GFX90A-GISEL-LABEL: fadd_fadd_fsub:
+; GFX90A-GISEL:       ; %bb.0: ; %bb
+; GFX90A-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-GISEL-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-GISEL-NEXT:    v_pk_add_f32 v[0:1], s[0:1], v[0:1]
+; GFX90A-GISEL-NEXT:    v_sub_f32_e32 v0, s0, v2
+; GFX90A-GISEL-NEXT:    v_mov_b32_e32 v2, v1
+; GFX90A-GISEL-NEXT:    v_pk_add_f32 v[2:3], s[2:3], v[2:3]
+; GFX90A-GISEL-NEXT:    v_subrev_f32_e32 v1, s3, v2
+; GFX90A-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90A-GISEL-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: fadd_fadd_fsub:
+; GFX942-GISEL:       ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-GISEL-NEXT:    v_pk_add_f32 v[0:1], s[0:1], v[0:1]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_sub_f32_e32 v0, s0, v2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, v1
+; GFX942-GISEL-NEXT:    v_pk_add_f32 v[2:3], s[2:3], v[2:3]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subrev_f32_e32 v1, s3, v2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX942-GISEL-NEXT:    s_endpgm
 bb:
   %i12 = fadd <2 x float> %arg, %arg1
   %shift8 = shufflevector <2 x float> %i12, <2 x float> poison, <2 x i32> <i32 1, i32 poison>
@@ -2251,3 +2408,6 @@ declare i32 @llvm.amdgcn.workitem.id.x()
 declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
 declare <32 x float> @llvm.fma.v32f32(<32 x float>, <32 x float>, <32 x float>)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX90A-SDAG: {{.*}}
+; GFX942-SDAG: {{.*}}