Skip to content

Commit 962bd62

Browse files
committed
waitcnt patch
1 parent 693146d commit 962bd62

15 files changed

+97
-433
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -586,6 +586,12 @@ def FeatureRealTrue16Insts : SubtargetFeature<"real-true16",
586586
"Use true 16-bit registers"
587587
>;
588588

589+
// Subtarget feature "d16-hw-bug": when enabled, sets the subtarget field
// Enable16bitD16HWBug to "true". Per its description string, it marks targets
// where a 16-bit D16 access interferes with the other half of the register in
// true16 mode.
def Feature16bitD16HWBug : SubtargetFeature<"d16-hw-bug",
590+
"Enable16bitD16HWBug",
591+
"true",
592+
"D16 for 16 bit data type interfere the other half in true16 mode"
593+
>;
594+
589595
def FeatureBF16TransInsts : SubtargetFeature<"bf16-trans-insts",
590596
"HasBF16TransInsts",
591597
"true",
@@ -1934,7 +1940,9 @@ def FeatureISAVersion11_Common : FeatureSet<
19341940
FeaturePackedTID,
19351941
FeatureVcmpxPermlaneHazard,
19361942
FeatureMemoryAtomicFAddF32DenormalSupport,
1937-
FeatureRealTrue16Insts]>;
1943+
FeatureRealTrue16Insts,
1944+
Feature16bitD16HWBug,
1945+
]>;
19381946

19391947
// There are few workarounds that need to be
19401948
// added to all targets. This pessimizes codegen
@@ -2570,6 +2578,12 @@ def UseFakeTrue16Insts : True16PredicateClass<"Subtarget->hasTrue16BitInsts() &&
25702578
// FIXME When we default to RealTrue16 instead of Fake, change the line as follows.
25712579
// AssemblerPredicate<(all_of FeatureTrue16BitInsts, (not FeatureRealTrue16Insts))>;
25722580

2581+
// Predicate that holds when Subtarget->has16bitD16HWBug() returns true.
// The assembler-side predicate requires FeatureTrue16BitInsts,
// FeatureRealTrue16Insts and Feature16bitD16HWBug to all be set.
def Has16bitD16HWBug: Predicate<"Subtarget->has16bitD16HWBug()">,
2582+
AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts, Feature16bitD16HWBug)>;
2583+
// Predicate that holds when the subtarget uses real true16 instructions and
// Subtarget->has16bitD16HWBug() returns false. The assembler-side predicate
// requires FeatureTrue16BitInsts and FeatureRealTrue16Insts to be set and
// Feature16bitD16HWBug to be unset.
def NotHas16bitD16HWBug: Predicate<"Subtarget->useRealTrue16Insts() && "
2584+
"!Subtarget->has16bitD16HWBug()">,
2585+
AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts, (not Feature16bitD16HWBug))>;
2586+
25732587
def HasBF16TransInsts : Predicate<"Subtarget->hasBF16TransInsts()">,
25742588
AssemblerPredicate<(all_of FeatureBF16TransInsts)>;
25752589

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,10 @@ bool AMDGPUSubtarget::useRealTrue16Insts() const {
3838
return hasTrue16BitInsts() && EnableRealTrue16Insts;
3939
}
4040

41+
// Returns true only when all three conditions hold: the subtarget has true16
// instructions, real true16 instructions are in use, and the
// Enable16bitD16HWBug flag is set.
// NOTE(review): useRealTrue16Insts() in this file already checks
// hasTrue16BitInsts(), so the first term looks redundant -- confirm before
// simplifying.
bool AMDGPUSubtarget::has16bitD16HWBug() const {
42+
return hasTrue16BitInsts() && useRealTrue16Insts() && Enable16bitD16HWBug;
43+
}
44+
4145
// Returns the maximum per-workgroup LDS allocation size (in bytes) that still
4246
// allows the given function to achieve an occupancy of NWaves waves per
4347
// SIMD / EU, taking into account only the function's *maximum* workgroup size.

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ class AMDGPUSubtarget {
5959
bool HasCvtPkF16F32Inst = false;
6060
bool HasF32ToF16BF16ConversionSRInsts = false;
6161
bool EnableRealTrue16Insts = false;
62+
bool Enable16bitD16HWBug = false;
6263
bool HasBF16TransInsts = false;
6364
bool HasBF16ConversionInsts = false;
6465
bool HasBF16PackedInsts = false;
@@ -224,6 +225,8 @@ class AMDGPUSubtarget {
224225
// supported and the support for fake True16 instructions is removed.
225226
bool useRealTrue16Insts() const;
226227

228+
bool has16bitD16HWBug() const;
229+
227230
bool hasBF16TransInsts() const { return HasBF16TransInsts; }
228231

229232
bool hasBF16ConversionInsts() const {

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -845,6 +845,14 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
845845
assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
846846
assert(Size % 16 == 0);
847847
Result.second = Result.first + (Size / 16);
848+
849+
if (Size == 16 && Context->ST->has16bitD16HWBug()) {
850+
// also update the other half since lo16/hi16 interfere with each other
851+
if (AMDGPU::isHi16Reg(MCReg, *TRI))
852+
Result.first -= 1;
853+
else
854+
Result.second += 1;
855+
}
848856
} else if (TRI->isSGPRReg(*MRI, Op.getReg()) && RegIdx < SQ_MAX_PGM_SGPRS) {
849857
// SGPRs including VCC, TTMPs and EXEC but excluding read-only scalar
850858
// sources like SRC_PRIVATE_BASE.

llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll

Lines changed: 23 additions & 255 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll

Lines changed: 8 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -5033,6 +5033,7 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
50335033
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l
50345034
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l
50355035
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l
5036+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
50365037
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v17.l
50375038
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v14.l
50385039
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l
@@ -5059,15 +5060,10 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
50595060
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v29.l
50605061
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
50615062
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
5062-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
50635063
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v33.l
5064-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
50655064
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.h
5066-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
50675065
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.l
5068-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
50695066
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v34.h
5070-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
50715067
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v35.l
50725068
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36
50735069
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -11993,6 +11989,7 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
1199311989
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l
1199411990
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l
1199511991
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l
11992+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
1199611993
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v17.l
1199711994
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v14.l
1199811995
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l
@@ -12019,15 +12016,10 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
1201912016
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v29.l
1202012017
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
1202112018
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
12022-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
1202312019
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v33.l
12024-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
1202512020
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.h
12026-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
1202712021
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.l
12028-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
1202912022
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v34.h
12030-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
1203112023
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v35.l
1203212024
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36
1203312025
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -18559,6 +18551,7 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
1855918551
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:20
1856018552
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:12
1856118553
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:4
18554+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
1856218555
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v29.l
1856318556
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v27.l
1856418557
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v25.l
@@ -18596,13 +18589,9 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
1859618589
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v35.h
1859718590
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
1859818591
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
18599-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
1860018592
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v33.h
18601-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
1860218593
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l
18603-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
1860418594
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.h
18605-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
1860618595
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l
1860718596
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
1860818597
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.l, 8, v36.l
@@ -18701,10 +18690,9 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
1870118690
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
1870218691
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2
1870318692
; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true
18704-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
18693+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
1870518694
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.h, 3
1870618695
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v31.h, 3
18707-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
1870818696
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.l, 3
1870918697
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.l, 3
1871018698
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.l, 3
@@ -24640,6 +24628,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
2464024628
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:20
2464124629
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:12
2464224630
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:4
24631+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
2464324632
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v29.l
2464424633
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v27.l
2464524634
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v25.l
@@ -24677,13 +24666,9 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
2467724666
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v35.h
2467824667
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
2467924668
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
24680-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
2468124669
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v33.h
24682-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
2468324670
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l
24684-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
2468524671
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.h
24686-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
2468724672
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l
2468824673
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
2468924674
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.l, 8, v36.l
@@ -24782,10 +24767,9 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
2478224767
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
2478324768
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB62_2
2478424769
; GFX11-TRUE16-NEXT: .LBB62_4: ; %cmp.true
24785-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
24770+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
2478624771
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.h, 3
2478724772
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v31.h, 3
24788-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
2478924773
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.l, 3
2479024774
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.l, 3
2479124775
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.l, 3
@@ -28760,6 +28744,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
2876028744
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20
2876128745
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12
2876228746
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4
28747+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
2876328748
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v29.l
2876428749
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l
2876528750
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v25.l
@@ -28792,15 +28777,10 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
2879228777
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.h
2879328778
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
2879428779
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
28795-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
2879628780
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l
28797-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
2879828781
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.h
28799-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
2880028782
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l
28801-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
2880228783
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v37.h
28803-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
2880428784
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v38.l
2880528785
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49
2880628786
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -32871,6 +32851,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
3287132851
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20
3287232852
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12
3287332853
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4
32854+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
3287432855
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v29.l
3287532856
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l
3287632857
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v25.l
@@ -32903,15 +32884,10 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
3290332884
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.h
3290432885
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
3290532886
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
32906-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
3290732887
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l
32908-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
3290932888
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.h
32910-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
3291132889
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l
32912-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
3291332890
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v37.h
32914-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
3291532891
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v38.l
3291632892
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49
3291732893
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0

0 commit comments

Comments
 (0)