Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -586,6 +586,12 @@ def FeatureRealTrue16Insts : SubtargetFeature<"real-true16",
"Use true 16-bit registers"
>;

def Feature16bitD16HWBug : SubtargetFeature<"d16-hw-bug",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd prefer to find a more descriptive name for the feature. The symptom is that for waitcnt insertion purposes you need to treat D16 loads as if they write to a full 32-bit VGPR, right? So maybe something like "D16Writes32BitVgpr" or "D16LoadsWriteFullVgpr"?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I personally like "D16Writes32BitVgpr". Updated

"Enable16bitD16HWBug",
"true",
"D16 for 16 bit data type interfere the other half in true16 mode"
>;

def FeatureBF16TransInsts : SubtargetFeature<"bf16-trans-insts",
"HasBF16TransInsts",
"true",
Expand Down Expand Up @@ -1934,7 +1940,9 @@ def FeatureISAVersion11_Common : FeatureSet<
FeaturePackedTID,
FeatureVcmpxPermlaneHazard,
FeatureMemoryAtomicFAddF32DenormalSupport,
FeatureRealTrue16Insts]>;
FeatureRealTrue16Insts,
Feature16bitD16HWBug,
]>;

// There are few workarounds that need to be
// added to all targets. This pessimizes codegen
Expand Down Expand Up @@ -2570,6 +2578,12 @@ def UseFakeTrue16Insts : True16PredicateClass<"Subtarget->hasTrue16BitInsts() &&
// FIXME When we default to RealTrue16 instead of Fake, change the line as follows.
// AssemblerPredicate<(all_of FeatureTrue16BitInsts, (not FeatureRealTrue16Insts))>;

// Predicate for subtargets affected by the "d16-hw-bug" feature.
// The C++ condition is Subtarget->has16bitD16HWBug(); the assembler condition
// requires FeatureTrue16BitInsts, FeatureRealTrue16Insts and
// Feature16bitD16HWBug to all be set.
def Has16bitD16HWBug: Predicate<"Subtarget->has16bitD16HWBug()">,
AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts, Feature16bitD16HWBug)>;
// Counterpart of Has16bitD16HWBug restricted to real True16 mode: the C++
// condition holds when real True16 instructions are in use and
// has16bitD16HWBug() is false. The assembler condition requires
// FeatureTrue16BitInsts and FeatureRealTrue16Insts with Feature16bitD16HWBug
// cleared. Note this is not a plain negation: it does not hold when real
// True16 instructions are not in use.
def NotHas16bitD16HWBug: Predicate<"Subtarget->useRealTrue16Insts() && "
"!Subtarget->has16bitD16HWBug()">,
AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts, (not Feature16bitD16HWBug))>;

def HasBF16TransInsts : Predicate<"Subtarget->hasBF16TransInsts()">,
AssemblerPredicate<(all_of FeatureBF16TransInsts)>;

Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ bool AMDGPUSubtarget::useRealTrue16Insts() const {
return hasTrue16BitInsts() && EnableRealTrue16Insts;
}

// Returns true when the subtarget uses real True16 instructions and the
// "d16-hw-bug" feature (Enable16bitD16HWBug) is enabled.
//
// useRealTrue16Insts() already requires hasTrue16BitInsts(), so that check is
// not repeated here.
bool AMDGPUSubtarget::has16bitD16HWBug() const {
  return useRealTrue16Insts() && Enable16bitD16HWBug;
}

// Returns the maximum per-workgroup LDS allocation size (in bytes) that still
// allows the given function to achieve an occupancy of NWaves waves per
// SIMD / EU, taking into account only the function's *maximum* workgroup size.
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ class AMDGPUSubtarget {
bool HasCvtPkF16F32Inst = false;
bool HasF32ToF16BF16ConversionSRInsts = false;
bool EnableRealTrue16Insts = false;
bool Enable16bitD16HWBug = false;
bool HasBF16TransInsts = false;
bool HasBF16ConversionInsts = false;
bool HasBF16PackedInsts = false;
Expand Down Expand Up @@ -224,6 +225,8 @@ class AMDGPUSubtarget {
// supported and the support for fake True16 instructions is removed.
bool useRealTrue16Insts() const;

// Return true if real True16 instructions are in use and the "d16-hw-bug"
// subtarget feature is enabled. Per the feature description, a 16-bit D16
// access then interferes with the other half of the register.
bool has16bitD16HWBug() const;

bool hasBF16TransInsts() const { return HasBF16TransInsts; }

bool hasBF16ConversionInsts() const {
Expand Down
8 changes: 8 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -845,6 +845,14 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
assert(Size % 16 == 0);
Result.second = Result.first + (Size / 16);

if (Size == 16 && Context->ST->has16bitD16HWBug()) {
// also update the other half since lo16/hi16 interfere with each other
if (AMDGPU::isHi16Reg(MCReg, *TRI))
Result.first -= 1;
else
Result.second += 1;
}
} else if (TRI->isSGPRReg(*MRI, Op.getReg()) && RegIdx < SQ_MAX_PGM_SGPRS) {
// SGPRs including VCC, TTMPs and EXEC but excluding read-only scalar
// sources like SRC_PRIVATE_BASE.
Expand Down
278 changes: 23 additions & 255 deletions llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll

Large diffs are not rendered by default.

40 changes: 8 additions & 32 deletions llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
Original file line number Diff line number Diff line change
Expand Up @@ -5033,6 +5033,7 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v17.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v14.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l
Expand All @@ -5059,15 +5060,10 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v29.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v33.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v34.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v35.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
Expand Down Expand Up @@ -11993,6 +11989,7 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v17.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v14.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l
Expand All @@ -12019,15 +12016,10 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v29.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v33.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v34.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v35.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
Expand Down Expand Up @@ -18559,6 +18551,7 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:20
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v25.l
Expand Down Expand Up @@ -18596,13 +18589,9 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v35.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v33.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.l, 8, v36.l
Expand Down Expand Up @@ -18701,10 +18690,9 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2
; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v31.h, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.l, 3
Expand Down Expand Up @@ -24640,6 +24628,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:20
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v25.l
Expand Down Expand Up @@ -24677,13 +24666,9 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v35.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v33.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.l, 8, v36.l
Expand Down Expand Up @@ -24782,10 +24767,9 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB62_2
; GFX11-TRUE16-NEXT: .LBB62_4: ; %cmp.true
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v31.h, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.l, 3
Expand Down Expand Up @@ -28760,6 +28744,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v25.l
Expand Down Expand Up @@ -28792,15 +28777,10 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v37.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v38.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
Expand Down Expand Up @@ -32871,6 +32851,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v25.l
Expand Down Expand Up @@ -32903,15 +32884,10 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v37.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v38.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
Expand Down
Loading
Loading