
Commit d544325

[SelectionDAG] Add DoNotPoisonEltMask to SimplifyMultipleUseDemandedBits/VectorElts
Add DoNotPoisonEltMask to SimplifyMultipleUseDemandedBits and SimplifyMultipleUseDemandedVectorElts. The goal is to reduce the number of regressions after the fix for #138513.
1 parent: ead9659

18 files changed, +1119 −1185 lines

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 11 additions & 0 deletions
@@ -4231,6 +4231,16 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase {
   /// More limited version of SimplifyDemandedBits that can be used to "look
   /// through" ops that don't contribute to the DemandedBits/DemandedElts -
   /// bitwise ops etc.
+  /// Vector elements that aren't demanded can be turned into poison unless the
+  /// corresponding bit in the \p DoNotPoisonEltMask is set.
+  SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits,
+                                          const APInt &DemandedElts,
+                                          const APInt &DoNotPoisonEltMask,
+                                          SelectionDAG &DAG,
+                                          unsigned Depth = 0) const;
+
+  /// Helper wrapper around SimplifyMultipleUseDemandedBits, with
+  /// DoNotPoisonEltMask being set to zero.
   SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits,
                                           const APInt &DemandedElts,
                                           SelectionDAG &DAG,
@@ -4246,6 +4256,7 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase {
   /// bits from only some vector elements.
   SDValue SimplifyMultipleUseDemandedVectorElts(SDValue Op,
                                                 const APInt &DemandedElts,
+                                                const APInt &DoNotPoisonEltMask,
                                                 SelectionDAG &DAG,
                                                 unsigned Depth = 0) const;
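For illustration only (not part of the commit): a minimal sketch of how a DAG combine might call the new vector-elements overload. The value Src, the v4i32 type, and the particular mask bits are made-up assumptions for the example.

// Hypothetical caller: demand only element 0 of a v4i32 value Src, but
// require element 1 to stay non-poison (e.g. another user still reads it).
// Elements 2 and 3 are neither demanded nor protected, so they may become
// poison in the simplified value.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned NumElts = Src.getValueType().getVectorNumElements(); // 4 in this example
APInt DemandedElts = APInt::getOneBitSet(NumElts, 0);     // demand element 0
APInt DoNotPoisonElts = APInt::getOneBitSet(NumElts, 1);  // keep element 1 non-poison
if (SDValue NewSrc = TLI.SimplifyMultipleUseDemandedVectorElts(
        Src, DemandedElts, DoNotPoisonElts, DAG, /*Depth=*/0))
  Src = NewSrc;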

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 89 additions & 52 deletions
Large diffs are not rendered by default.
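Since this diff is not rendered, here is a rough sketch of what the zero-mask wrapper described by the new header comment ("Helper wrapper ... with DoNotPoisonEltMask being set to zero") presumably looks like. This is an assumption based on that comment, not the actual rendered change.

SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
    SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
    SelectionDAG &DAG, unsigned Depth) const {
  // Forward to the new overload with an all-zero DoNotPoisonEltMask, i.e.
  // every non-demanded element may be turned into poison (the old behaviour).
  APInt DoNotPoisonEltMask = APInt::getZero(DemandedElts.getBitWidth());
  return SimplifyMultipleUseDemandedBits(Op, DemandedBits, DemandedElts,
                                         DoNotPoisonEltMask, DAG, Depth);
}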

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 14 additions & 11 deletions
@@ -41402,10 +41402,11 @@ static SDValue combineX86ShufflesRecursively(
       // The Op itself may be of different VT, so we need to scale the mask.
       unsigned NumOpElts = Op.getValueType().getVectorNumElements();
       APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
+      APInt DoNotPoisonElts = APInt::getZero(NumOpElts);
 
       // Can this operand be simplified any further, given it's demanded elements?
       if (SDValue NewOp = TLI.SimplifyMultipleUseDemandedVectorElts(
-              Op, OpScaledDemandedElts, DAG))
+              Op, OpScaledDemandedElts, DoNotPoisonElts, DAG))
         Op = NewOp;
     }
     // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
@@ -43418,12 +43419,13 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
     // Aggressively peek through ops to get at the demanded elts.
     if (!DemandedElts.isAllOnes()) {
       unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
-      APInt DemandedSrcElts =
-          APIntOps::ScaleBitMask(DemandedElts | DoNotPoisonEltMask, NumSrcElts);
+      APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
+      APInt DoNotPoisonSrcElts =
+          APIntOps::ScaleBitMask(DoNotPoisonEltMask, NumSrcElts);
       SDValue NewLHS = SimplifyMultipleUseDemandedVectorElts(
-          LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
+          LHS, DemandedSrcElts, DoNotPoisonSrcElts, TLO.DAG, Depth + 1);
       SDValue NewRHS = SimplifyMultipleUseDemandedVectorElts(
-          RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
+          RHS, DemandedSrcElts, DoNotPoisonSrcElts, TLO.DAG, Depth + 1);
       if (NewLHS || NewRHS) {
         NewLHS = NewLHS ? NewLHS : LHS;
         NewRHS = NewRHS ? NewRHS : RHS;
@@ -43476,7 +43478,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
     // Aggressively peek through ops to get at the demanded elts.
     if (!DemandedElts.isAllOnes())
       if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
-              Src, DemandedElts | DoNotPoisonEltMask, TLO.DAG, Depth + 1))
+              Src, DemandedElts, DoNotPoisonEltMask, TLO.DAG, Depth + 1))
         return TLO.CombineTo(
             Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
     break;
@@ -43723,9 +43725,9 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
     // TODO - we should do this for all target/faux shuffles ops.
     if (!DemandedElts.isAllOnes()) {
       SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(
-          N0, DemandedLHS | DoNotPoisonLHS, TLO.DAG, Depth + 1);
+          N0, DemandedLHS, DoNotPoisonLHS, TLO.DAG, Depth + 1);
       SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(
-          N1, DemandedRHS | DoNotPoisonRHS, TLO.DAG, Depth + 1);
+          N1, DemandedRHS, DoNotPoisonRHS, TLO.DAG, Depth + 1);
       if (NewN0 || NewN1) {
         NewN0 = NewN0 ? NewN0 : N0;
         NewN1 = NewN1 ? NewN1 : N1;
@@ -43763,9 +43765,9 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
     // TODO: Handle repeated operands.
     if (N0 != N1 && !DemandedElts.isAllOnes()) {
       SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(
-          N0, DemandedLHS | DoNotPoisonLHS, TLO.DAG, Depth + 1);
+          N0, DemandedLHS, DoNotPoisonLHS, TLO.DAG, Depth + 1);
       SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(
-          N1, DemandedRHS | DoNotPoisonRHS, TLO.DAG, Depth + 1);
+          N1, DemandedRHS, DoNotPoisonRHS, TLO.DAG, Depth + 1);
       if (NewN0 || NewN1) {
         NewN0 = NewN0 ? NewN0 : N0;
         NewN1 = NewN1 ? NewN1 : N1;
@@ -43863,14 +43865,15 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
       break;
     APInt SrcUndef, SrcZero;
     APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
+    // VBROADCAST only uses element zero. Allow poison in other elements.
     APInt DoNotPoisonSrcElts = APInt::getZero(SrcVT.getVectorNumElements());
     if (SimplifyDemandedVectorElts(Src, SrcElts, DoNotPoisonSrcElts, SrcUndef, SrcZero, TLO,
                                    Depth + 1))
       return true;
     // Aggressively peek through src to get at the demanded elt.
     // TODO - we should do this for all target/faux shuffles ops.
     if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
-            Src, SrcElts, TLO.DAG, Depth + 1))
+            Src, SrcElts, DoNotPoisonSrcElts, TLO.DAG, Depth + 1))
       return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
     break;
   }
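The recurring pattern in the X86 hunks above: where the code previously OR-ed DoNotPoisonEltMask into the demanded-elements mask, it now scales and passes the two masks separately. Roughly, condensed from the diff (variable names simplified):

// Before: elements that only needed to stay non-poison were reported as
// demanded, which blocked some simplifications.
//   APInt DemandedSrcElts =
//       APIntOps::ScaleBitMask(DemandedElts | DoNotPoisonEltMask, NumSrcElts);
//   SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
//       Src, DemandedSrcElts, TLO.DAG, Depth + 1);
// After: the callee sees which elements are truly demanded and which merely
// must not be turned into poison.
APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
APInt DoNotPoisonSrcElts = APIntOps::ScaleBitMask(DoNotPoisonEltMask, NumSrcElts);
SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
    Src, DemandedSrcElts, DoNotPoisonSrcElts, TLO.DAG, Depth + 1);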

llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll

Lines changed: 9 additions & 10 deletions
@@ -101,12 +101,13 @@ define i8 @test_v3i8(<3 x i8> %a) nounwind {
 define i8 @test_v9i8(<9 x i8> %a) nounwind {
 ; CHECK-LABEL: test_v9i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.2d, #0xffffffffffffff00
+; CHECK-NEXT:    movi v1.2d, #0xffffff00ffffff00
+; CHECK-NEXT:    fmov x8, d0
 ; CHECK-NEXT:    orr v1.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
 ; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    and x8, x8, x8, lsr #32
+; CHECK-NEXT:    fmov x9, d0
+; CHECK-NEXT:    and x8, x9, x8, lsr #32
 ; CHECK-NEXT:    and x8, x8, x8, lsr #16
 ; CHECK-NEXT:    lsr x9, x8, #8
 ; CHECK-NEXT:    and w0, w8, w9
@@ -118,14 +119,12 @@ define i8 @test_v9i8(<9 x i8> %a) nounwind {
 define i32 @test_v3i32(<3 x i32> %a) nounwind {
 ; CHECK-LABEL: test_v3i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov v1.16b, v0.16b
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    mov v1.s[3], w8
-; CHECK-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    lsr x9, x8, #32
-; CHECK-NEXT:    and w0, w8, w9
+; CHECK-NEXT:    lsr x8, x8, #32
+; CHECK-NEXT:    and v1.8b, v0.8b, v1.8b
+; CHECK-NEXT:    fmov x9, d1
+; CHECK-NEXT:    and w0, w9, w8
 ; CHECK-NEXT:    ret
   %b = call i32 @llvm.vector.reduce.and.v3i32(<3 x i32> %a)
   ret i32 %b

llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v2bf16.ll

Lines changed: 15 additions & 30 deletions
@@ -985,9 +985,8 @@ define void @v_shuffle_v3bf16_v2bf16__u_1_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def v1
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    global_store_short_d16_hi v0, v1, s[16:17] offset:4
 ; GFX900-NEXT:    global_store_dword v0, v1, s[16:17]
-; GFX900-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX900-NEXT:    global_store_short v0, v1, s[16:17] offset:4
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -998,9 +997,8 @@ define void @v_shuffle_v3bf16_v2bf16__u_1_1(ptr addrspace(1) inreg %ptr) {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v1
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    global_store_short_d16_hi v0, v1, s[16:17] offset:4
 ; GFX90A-NEXT:    global_store_dword v0, v1, s[16:17]
-; GFX90A-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX90A-NEXT:    global_store_short v0, v1, s[16:17] offset:4
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1011,9 +1009,8 @@ define void @v_shuffle_v3bf16_v2bf16__u_1_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v1
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_short_d16_hi v0, v1, s[0:1] offset:4
 ; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX942-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX942-NEXT:    global_store_short v0, v1, s[0:1] offset:4
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x bfloat> asm "; def $0", "=v"()
@@ -1030,9 +1027,8 @@ define void @v_shuffle_v3bf16_v2bf16__0_1_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def v1
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    global_store_short_d16_hi v0, v1, s[16:17] offset:4
 ; GFX900-NEXT:    global_store_dword v0, v1, s[16:17]
-; GFX900-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX900-NEXT:    global_store_short v0, v1, s[16:17] offset:4
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1043,9 +1039,8 @@ define void @v_shuffle_v3bf16_v2bf16__0_1_1(ptr addrspace(1) inreg %ptr) {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v1
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    global_store_short_d16_hi v0, v1, s[16:17] offset:4
 ; GFX90A-NEXT:    global_store_dword v0, v1, s[16:17]
-; GFX90A-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX90A-NEXT:    global_store_short v0, v1, s[16:17] offset:4
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1056,9 +1051,8 @@ define void @v_shuffle_v3bf16_v2bf16__0_1_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v1
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_short_d16_hi v0, v1, s[0:1] offset:4
 ; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX942-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX942-NEXT:    global_store_short v0, v1, s[0:1] offset:4
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x bfloat> asm "; def $0", "=v"()
@@ -1126,9 +1120,8 @@ define void @v_shuffle_v3bf16_v2bf16__2_1_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def v1
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    global_store_short_d16_hi v0, v1, s[16:17] offset:4
 ; GFX900-NEXT:    global_store_dword v0, v1, s[16:17]
-; GFX900-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX900-NEXT:    global_store_short v0, v1, s[16:17] offset:4
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1139,9 +1132,8 @@ define void @v_shuffle_v3bf16_v2bf16__2_1_1(ptr addrspace(1) inreg %ptr) {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v1
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    global_store_short_d16_hi v0, v1, s[16:17] offset:4
 ; GFX90A-NEXT:    global_store_dword v0, v1, s[16:17]
-; GFX90A-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX90A-NEXT:    global_store_short v0, v1, s[16:17] offset:4
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1152,9 +1144,8 @@ define void @v_shuffle_v3bf16_v2bf16__2_1_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v1
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_short_d16_hi v0, v1, s[0:1] offset:4
 ; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX942-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX942-NEXT:    global_store_short v0, v1, s[0:1] offset:4
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x bfloat> asm "; def $0", "=v"()
@@ -1713,9 +1704,8 @@ define void @v_shuffle_v3bf16_v2bf16__u_3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def v1
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    global_store_short_d16_hi v0, v1, s[16:17] offset:4
 ; GFX900-NEXT:    global_store_dword v0, v1, s[16:17]
-; GFX900-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX900-NEXT:    global_store_short v0, v1, s[16:17] offset:4
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1726,9 +1716,8 @@ define void @v_shuffle_v3bf16_v2bf16__u_3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v1
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    global_store_short_d16_hi v0, v1, s[16:17] offset:4
 ; GFX90A-NEXT:    global_store_dword v0, v1, s[16:17]
-; GFX90A-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX90A-NEXT:    global_store_short v0, v1, s[16:17] offset:4
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1739,9 +1728,8 @@ define void @v_shuffle_v3bf16_v2bf16__u_3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v1
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_short_d16_hi v0, v1, s[0:1] offset:4
 ; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX942-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX942-NEXT:    global_store_short v0, v1, s[0:1] offset:4
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x bfloat> asm "; def $0", "=v"()
@@ -1882,9 +1870,8 @@ define void @v_shuffle_v3bf16_v2bf16__2_3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def v1
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    global_store_short_d16_hi v0, v1, s[16:17] offset:4
 ; GFX900-NEXT:    global_store_dword v0, v1, s[16:17]
-; GFX900-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX900-NEXT:    global_store_short v0, v1, s[16:17] offset:4
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1895,9 +1882,8 @@ define void @v_shuffle_v3bf16_v2bf16__2_3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v1
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    global_store_short_d16_hi v0, v1, s[16:17] offset:4
 ; GFX90A-NEXT:    global_store_dword v0, v1, s[16:17]
-; GFX90A-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX90A-NEXT:    global_store_short v0, v1, s[16:17] offset:4
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1908,9 +1894,8 @@ define void @v_shuffle_v3bf16_v2bf16__2_3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v1
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_short_d16_hi v0, v1, s[0:1] offset:4
 ; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX942-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX942-NEXT:    global_store_short v0, v1, s[0:1] offset:4
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x bfloat> asm "; def $0", "=v"()
