Skip to content

Commit 4206dfe

Browse files
authored
Merge branch 'main' into distr_utils_fix
2 parents 5692546 + 1371684 commit 4206dfe

19 files changed

+2696
-1192
lines changed

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -709,7 +709,10 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
709709
// 16-bit SGPRs instead of 32-bit ones.
710710
if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
711711
Old.setSubReg(AMDGPU::NoSubRegister);
712-
Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
712+
if (New->getReg().isPhysical())
713+
Old.substPhysReg(New->getReg(), *TRI);
714+
else
715+
Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
713716
Old.setIsUndef(New->isUndef());
714717
return true;
715718
}
@@ -1986,7 +1989,9 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy(
19861989
if (!FoldingImm && !OpToFold.isReg())
19871990
return false;
19881991

1989-
if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
1992+
// Fold virtual registers and constant physical registers.
1993+
if (OpToFold.isReg() && OpToFold.getReg().isPhysical() &&
1994+
!TRI->isConstantPhysReg(OpToFold.getReg()))
19901995
return false;
19911996

19921997
// Prevent folding operands backwards in the function. For example,

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -15729,7 +15729,7 @@ SDValue SITargetLowering::performFDivCombine(SDNode *N,
1572915729
SelectionDAG &DAG = DCI.DAG;
1573015730
SDLoc SL(N);
1573115731
EVT VT = N->getValueType(0);
15732-
if (VT != MVT::f16 || !Subtarget->has16BitInsts())
15732+
if ((VT != MVT::f16 && VT != MVT::bf16) || !Subtarget->has16BitInsts())
1573315733
return SDValue();
1573415734

1573515735
SDValue LHS = N->getOperand(0);

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4241,6 +4241,7 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
42414241
MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
42424242
MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
42434243
MI.getOpcode() == AMDGPU::S_SETPRIO ||
4244+
MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG ||
42444245
changesVGPRIndexingMode(MI);
42454246
}
42464247

llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp

Lines changed: 174 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -3846,18 +3846,21 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
38463846
setOriginForNaryOp(I);
38473847
}
38483848

3849-
// Instrument multiply-add intrinsics.
3849+
// Instrument multiply-add(-accumulate)? intrinsics.
38503850
//
38513851
// e.g., Two operands:
38523852
// <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a, <8 x i16> %b)
38533853
//
38543854
// Two operands which require an EltSizeInBits override:
38553855
// <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> %a, <1 x i64> %b)
38563856
//
3857-
// Three operands are not implemented yet:
3857+
// Three operands:
38583858
// <4 x i32> @llvm.x86.avx512.vpdpbusd.128
38593859
// (<4 x i32> %s, <4 x i32> %a, <4 x i32> %b)
3860-
// (the result of multiply-add'ing %a and %b is accumulated with %s)
3860+
// (this is equivalent to multiply-add on %a and %b, followed by
3861+
// adding/"accumulating" %s. "Accumulation" stores the result in one
3862+
// of the source registers, but this accumulate vs. add distinction
3863+
// is lost when dealing with LLVM intrinsics.)
38613864
void handleVectorPmaddIntrinsic(IntrinsicInst &I, unsigned ReductionFactor,
38623865
unsigned EltSizeInBits = 0) {
38633866
IRBuilder<> IRB(&I);
@@ -3866,22 +3869,39 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
38663869
cast<FixedVectorType>(I.getType());
38673870
assert(isa<FixedVectorType>(ReturnType));
38683871

3869-
assert(I.arg_size() == 2);
3870-
38713872
// Vectors A and B, and shadows
3872-
Value *Va = I.getOperand(0);
3873-
Value *Vb = I.getOperand(1);
3873+
Value *Va = nullptr;
3874+
Value *Vb = nullptr;
3875+
Value *Sa = nullptr;
3876+
Value *Sb = nullptr;
38743877

3875-
Value *Sa = getShadow(&I, 0);
3876-
Value *Sb = getShadow(&I, 1);
3878+
assert(I.arg_size() == 2 || I.arg_size() == 3);
3879+
if (I.arg_size() == 2) {
3880+
Va = I.getOperand(0);
3881+
Vb = I.getOperand(1);
38773882

3878-
FixedVectorType *ParamType =
3879-
cast<FixedVectorType>(I.getArgOperand(0)->getType());
3880-
assert(ParamType == I.getArgOperand(1)->getType());
3883+
Sa = getShadow(&I, 0);
3884+
Sb = getShadow(&I, 1);
3885+
} else if (I.arg_size() == 3) {
3886+
// Operand 0 is the accumulator. We will deal with that below.
3887+
Va = I.getOperand(1);
3888+
Vb = I.getOperand(2);
3889+
3890+
Sa = getShadow(&I, 1);
3891+
Sb = getShadow(&I, 2);
3892+
}
3893+
3894+
FixedVectorType *ParamType = cast<FixedVectorType>(Va->getType());
3895+
assert(ParamType == Vb->getType());
38813896

38823897
assert(ParamType->getPrimitiveSizeInBits() ==
38833898
ReturnType->getPrimitiveSizeInBits());
38843899

3900+
if (I.arg_size() == 3) {
3901+
assert(ParamType == ReturnType);
3902+
assert(ParamType == I.getArgOperand(0)->getType());
3903+
}
3904+
38853905
FixedVectorType *ImplicitReturnType = ReturnType;
38863906
// Step 1: instrument multiplication of corresponding vector elements
38873907
if (EltSizeInBits) {
@@ -3944,10 +3964,14 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
39443964
Constant::getNullValue(Horizontal->getType())),
39453965
ImplicitReturnType);
39463966

3947-
// For MMX, cast it back to the required fake return type (<1 x i64>).
3967+
// Cast it back to the required fake return type (<1 x i64>).
39483968
if (EltSizeInBits)
39493969
OutShadow = CreateShadowCast(IRB, OutShadow, getShadowTy(&I));
39503970

3971+
// Step 3 (if applicable): instrument accumulator
3972+
if (I.arg_size() == 3)
3973+
OutShadow = IRB.CreateOr(OutShadow, getShadow(&I, 0));
3974+
39513975
setShadow(&I, OutShadow);
39523976
setOriginForNaryOp(I);
39533977
}
@@ -5525,6 +5549,143 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
55255549
handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16);
55265550
break;
55275551

5552+
// AVX Vector Neural Network Instructions: bytes
5553+
//
5554+
// Multiply and Add Packed Signed and Unsigned Bytes
5555+
// < 4 x i32> @llvm.x86.avx512.vpdpbusd.128
5556+
// (< 4 x i32>, < 4 x i32>, < 4 x i32>)
5557+
// < 8 x i32> @llvm.x86.avx512.vpdpbusd.256
5558+
// (< 8 x i32>, < 8 x i32>, < 8 x i32>)
5559+
// <16 x i32> @llvm.x86.avx512.vpdpbusd.512
5560+
// (<16 x i32>, <16 x i32>, <16 x i32>)
5561+
//
5562+
// Multiply and Add Unsigned and Signed Bytes With Saturation
5563+
// < 4 x i32> @llvm.x86.avx512.vpdpbusds.128
5564+
// (< 4 x i32>, < 4 x i32>, < 4 x i32>)
5565+
// < 8 x i32> @llvm.x86.avx512.vpdpbusds.256
5566+
// (< 8 x i32>, < 8 x i32>, < 8 x i32>)
5567+
// <16 x i32> @llvm.x86.avx512.vpdpbusds.512
5568+
// (<16 x i32>, <16 x i32>, <16 x i32>)
5569+
//
5570+
// < 4 x i32> @llvm.x86.avx2.vpdpbssd.128
5571+
// (< 4 x i32>, < 4 x i32>, < 4 x i32>)
5572+
// < 8 x i32> @llvm.x86.avx2.vpdpbssd.256
5573+
// (< 8 x i32>, < 8 x i32>, < 8 x i32>)
5574+
//
5575+
// < 4 x i32> @llvm.x86.avx2.vpdpbssds.128
5576+
// (< 4 x i32>, < 4 x i32>, < 4 x i32>)
5577+
// < 8 x i32> @llvm.x86.avx2.vpdpbssds.256
5578+
// (< 8 x i32>, < 8 x i32>, < 8 x i32>)
5579+
//
5580+
// <16 x i32> @llvm.x86.avx10.vpdpbssd.512
5581+
// (<16 x i32>, <16 x i32>, <16 x i32>)
5582+
// <16 x i32> @llvm.x86.avx10.vpdpbssds.512
5583+
// (<16 x i32>, <16 x i32>, <16 x i32>)
5584+
//
5585+
// These intrinsics are auto-upgraded into non-masked forms:
5586+
// <4 x i32> @llvm.x86.avx512.mask.vpdpbusd.128
5587+
// (<4 x i32>, <4 x i32>, <4 x i32>, i8)
5588+
// <4 x i32> @llvm.x86.avx512.maskz.vpdpbusd.128
5589+
// (<4 x i32>, <4 x i32>, <4 x i32>, i8)
5590+
// <8 x i32> @llvm.x86.avx512.mask.vpdpbusd.256
5591+
// (<8 x i32>, <8 x i32>, <8 x i32>, i8)
5592+
// <8 x i32> @llvm.x86.avx512.maskz.vpdpbusd.256
5593+
// (<8 x i32>, <8 x i32>, <8 x i32>, i8)
5594+
// <16 x i32> @llvm.x86.avx512.mask.vpdpbusd.512
5595+
// (<16 x i32>, <16 x i32>, <16 x i32>, i16)
5596+
// <16 x i32> @llvm.x86.avx512.maskz.vpdpbusd.512
5597+
// (<16 x i32>, <16 x i32>, <16 x i32>, i16)
5598+
//
5599+
// <4 x i32> @llvm.x86.avx512.mask.vpdpbusds.128
5600+
// (<4 x i32>, <4 x i32>, <4 x i32>, i8)
5601+
// <4 x i32> @llvm.x86.avx512.maskz.vpdpbusds.128
5602+
// (<4 x i32>, <4 x i32>, <4 x i32>, i8)
5603+
// <8 x i32> @llvm.x86.avx512.mask.vpdpbusds.256
5604+
// (<8 x i32>, <8 x i32>, <8 x i32>, i8)
5605+
// <8 x i32> @llvm.x86.avx512.maskz.vpdpbusds.256
5606+
// (<8 x i32>, <8 x i32>, <8 x i32>, i8)
5607+
// <16 x i32> @llvm.x86.avx512.mask.vpdpbusds.512
5608+
// (<16 x i32>, <16 x i32>, <16 x i32>, i16)
5609+
// <16 x i32> @llvm.x86.avx512.maskz.vpdpbusds.512
5610+
// (<16 x i32>, <16 x i32>, <16 x i32>, i16)
5611+
case Intrinsic::x86_avx512_vpdpbusd_128:
5612+
case Intrinsic::x86_avx512_vpdpbusd_256:
5613+
case Intrinsic::x86_avx512_vpdpbusd_512:
5614+
case Intrinsic::x86_avx512_vpdpbusds_128:
5615+
case Intrinsic::x86_avx512_vpdpbusds_256:
5616+
case Intrinsic::x86_avx512_vpdpbusds_512:
5617+
case Intrinsic::x86_avx2_vpdpbssd_128:
5618+
case Intrinsic::x86_avx2_vpdpbssd_256:
5619+
case Intrinsic::x86_avx2_vpdpbssds_128:
5620+
case Intrinsic::x86_avx2_vpdpbssds_256:
5621+
case Intrinsic::x86_avx10_vpdpbssd_512:
5622+
case Intrinsic::x86_avx10_vpdpbssds_512:
5623+
handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/4, /*EltSize=*/8);
5624+
break;
5625+
5626+
// AVX Vector Neural Network Instructions: words
5627+
//
5628+
// Multiply and Add Signed Word Integers
5629+
// < 4 x i32> @llvm.x86.avx512.vpdpwssd.128
5630+
// (< 4 x i32>, < 4 x i32>, < 4 x i32>)
5631+
// < 8 x i32> @llvm.x86.avx512.vpdpwssd.256
5632+
// (< 8 x i32>, < 8 x i32>, < 8 x i32>)
5633+
// <16 x i32> @llvm.x86.avx512.vpdpwssd.512
5634+
// (<16 x i32>, <16 x i32>, <16 x i32>)
5635+
//
5636+
// Multiply and Add Signed Word Integers With Saturation
5637+
// < 4 x i32> @llvm.x86.avx512.vpdpwssds.128
5638+
// (< 4 x i32>, < 4 x i32>, < 4 x i32>)
5639+
// < 8 x i32> @llvm.x86.avx512.vpdpwssds.256
5640+
// (< 8 x i32>, < 8 x i32>, < 8 x i32>)
5641+
// <16 x i32> @llvm.x86.avx512.vpdpwssds.512
5642+
// (<16 x i32>, <16 x i32>, <16 x i32>)
5643+
//
5644+
// These intrinsics are auto-upgraded into non-masked forms:
5645+
// <4 x i32> @llvm.x86.avx512.mask.vpdpwssd.128
5646+
// (<4 x i32>, <4 x i32>, <4 x i32>, i8)
5647+
// <4 x i32> @llvm.x86.avx512.maskz.vpdpwssd.128
5648+
// (<4 x i32>, <4 x i32>, <4 x i32>, i8)
5649+
// <8 x i32> @llvm.x86.avx512.mask.vpdpwssd.256
5650+
// (<8 x i32>, <8 x i32>, <8 x i32>, i8)
5651+
// <8 x i32> @llvm.x86.avx512.maskz.vpdpwssd.256
5652+
// (<8 x i32>, <8 x i32>, <8 x i32>, i8)
5653+
// <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512
5654+
// (<16 x i32>, <16 x i32>, <16 x i32>, i16)
5655+
// <16 x i32> @llvm.x86.avx512.maskz.vpdpwssd.512
5656+
// (<16 x i32>, <16 x i32>, <16 x i32>, i16)
5657+
//
5658+
// <4 x i32> @llvm.x86.avx512.mask.vpdpwssds.128
5659+
// (<4 x i32>, <4 x i32>, <4 x i32>, i8)
5660+
// <4 x i32> @llvm.x86.avx512.maskz.vpdpwssds.128
5661+
// (<4 x i32>, <4 x i32>, <4 x i32>, i8)
5662+
// <8 x i32> @llvm.x86.avx512.mask.vpdpwssds.256
5663+
// (<8 x i32>, <8 x i32>, <8 x i32>, i8)
5664+
// <8 x i32> @llvm.x86.avx512.maskz.vpdpwssds.256
5665+
// (<8 x i32>, <8 x i32>, <8 x i32>, i8)
5666+
// <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512
5667+
// (<16 x i32>, <16 x i32>, <16 x i32>, i16)
5668+
// <16 x i32> @llvm.x86.avx512.maskz.vpdpwssds.512
5669+
// (<16 x i32>, <16 x i32>, <16 x i32>, i16)
5670+
case Intrinsic::x86_avx512_vpdpwssd_128:
5671+
case Intrinsic::x86_avx512_vpdpwssd_256:
5672+
case Intrinsic::x86_avx512_vpdpwssd_512:
5673+
case Intrinsic::x86_avx512_vpdpwssds_128:
5674+
case Intrinsic::x86_avx512_vpdpwssds_256:
5675+
case Intrinsic::x86_avx512_vpdpwssds_512:
5676+
handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16);
5677+
break;
5678+
5679+
// TODO: Dot Product of BF16 Pairs Accumulated Into Packed Single
5680+
// Precision
5681+
// <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128
5682+
// (<4 x float>, <8 x bfloat>, <8 x bfloat>)
5683+
// <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256
5684+
// (<8 x float>, <16 x bfloat>, <16 x bfloat>)
5685+
// <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512
5686+
// (<16 x float>, <32 x bfloat>, <32 x bfloat>)
5687+
// handleVectorPmaddIntrinsic() currently only handles integer types.
5688+
55285689
case Intrinsic::x86_sse_cmp_ss:
55295690
case Intrinsic::x86_sse2_cmp_sd:
55305691
case Intrinsic::x86_sse_comieq_ss:

llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll

Lines changed: 22 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -9,15 +9,14 @@ target triple = "amdgcn-amd-amdhsa"
99
define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
1010
; GFX1250-SDAG-LABEL: use_private_to_flat_addrspacecast:
1111
; GFX1250-SDAG: ; %bb.0:
12-
; GFX1250-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x24
12+
; GFX1250-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24
1313
; GFX1250-SDAG-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
14-
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_flat_scratch_base_lo
1514
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
1615
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
17-
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_lshlrev_b32 v1, 20, v0
18-
; GFX1250-SDAG-NEXT: s_cmp_lg_u32 s2, -1
16+
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_lshlrev_b32 v1, 20, v0
17+
; GFX1250-SDAG-NEXT: s_cmp_lg_u32 s0, -1
1918
; GFX1250-SDAG-NEXT: s_cselect_b32 vcc_lo, -1, 0
20-
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
19+
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], src_flat_scratch_base_lo, v[0:1]
2120
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
2221
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, 0, v1
2322
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo
@@ -27,20 +26,20 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %p
2726
;
2827
; GFX1250-GISEL-LABEL: use_private_to_flat_addrspacecast:
2928
; GFX1250-GISEL: ; %bb.0:
30-
; GFX1250-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x24
31-
; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_flat_scratch_base_lo
29+
; GFX1250-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24
30+
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], src_flat_scratch_base_lo
3231
; GFX1250-GISEL-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0
33-
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
3432
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
35-
; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s2, -1
36-
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
37-
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, s2, v0
33+
; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, -1
34+
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
35+
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0
3836
; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2
39-
; GFX1250-GISEL-NEXT: s_cselect_b32 s0, 1, 0
40-
; GFX1250-GISEL-NEXT: s_and_b32 s0, 1, s0
41-
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
37+
; GFX1250-GISEL-NEXT: s_cselect_b32 s1, 1, 0
38+
; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
39+
; GFX1250-GISEL-NEXT: s_and_b32 s1, 1, s1
4240
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v2, v1, vcc_lo
43-
; GFX1250-GISEL-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
41+
; GFX1250-GISEL-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1
42+
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
4443
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, 0, v1
4544
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo
4645
; GFX1250-GISEL-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
@@ -56,27 +55,24 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast_nonnull(ptr addrspa
5655
; GFX1250-SDAG: ; %bb.0:
5756
; GFX1250-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24
5857
; GFX1250-SDAG-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
59-
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
58+
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
6059
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 20, v0
6160
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
6261
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, s0
63-
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_flat_scratch_base_lo
64-
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
65-
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
62+
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], src_flat_scratch_base_lo, v[0:1]
6663
; GFX1250-SDAG-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
6764
; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
6865
; GFX1250-SDAG-NEXT: s_endpgm
6966
;
7067
; GFX1250-GISEL-LABEL: use_private_to_flat_addrspacecast_nonnull:
7168
; GFX1250-GISEL: ; %bb.0:
72-
; GFX1250-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x24
73-
; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_flat_scratch_base_lo
69+
; GFX1250-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24
7470
; GFX1250-GISEL-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0
75-
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
71+
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], src_flat_scratch_base_lo
7672
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
7773
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v2, 20, v2
7874
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
79-
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, s2, v0
75+
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0
8076
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
8177
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v2, v1, vcc_lo
8278
; GFX1250-GISEL-NEXT: flat_store_b32 v[0:1], v3 scope:SCOPE_SYS
@@ -91,10 +87,9 @@ define amdgpu_kernel void @use_flat_to_private_addrspacecast(ptr %ptr) {
9187
; GFX1250-LABEL: use_flat_to_private_addrspacecast:
9288
; GFX1250: ; %bb.0:
9389
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
94-
; GFX1250-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo
9590
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
9691
; GFX1250-NEXT: s_wait_kmcnt 0x0
97-
; GFX1250-NEXT: s_sub_co_i32 s2, s0, s2
92+
; GFX1250-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo
9893
; GFX1250-NEXT: s_cmp_lg_u64 s[0:1], 0
9994
; GFX1250-NEXT: s_cselect_b32 s0, s2, -1
10095
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
@@ -110,9 +105,8 @@ define amdgpu_kernel void @use_flat_to_private_addrspacecast_nonnull(ptr %ptr) {
110105
; GFX1250-SDAG: ; %bb.0:
111106
; GFX1250-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24
112107
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0
113-
; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
114108
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
115-
; GFX1250-SDAG-NEXT: s_sub_co_i32 s0, s0, s1
109+
; GFX1250-SDAG-NEXT: s_sub_co_i32 s0, s0, src_flat_scratch_base_lo
116110
; GFX1250-SDAG-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
117111
; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
118112
; GFX1250-SDAG-NEXT: s_endpgm
@@ -122,9 +116,7 @@ define amdgpu_kernel void @use_flat_to_private_addrspacecast_nonnull(ptr %ptr) {
122116
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
123117
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0
124118
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
125-
; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
126-
; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
127-
; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s0, s1
119+
; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s0, src_flat_scratch_base_lo
128120
; GFX1250-GISEL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
129121
; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
130122
; GFX1250-GISEL-NEXT: s_endpgm

0 commit comments

Comments (0)