[AMDGPU] Use true16 loads with +real-true16 and sram-ecc #161256
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: Stanislav Mekhanoshin (rampitec)

Changes

Fixes: SC1-6072

Patch is 260.92 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/161256.diff

12 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index eaa1870f4be28..53b8c314d2ac4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -2595,6 +2595,10 @@ def UseFakeTrue16Insts : True16PredicateClass<"Subtarget->hasTrue16BitInsts() &&
// FIXME When we default to RealTrue16 instead of Fake, change the line as follows.
// AssemblerPredicate<(all_of FeatureTrue16BitInsts, (not FeatureRealTrue16Insts))>;
+def Use32BitLoadStoreWithTrue16Insts : True16PredicateClass<"Subtarget->useRealTrue16Insts() && "
+ "!Subtarget->d16PreservesUnusedBits()">,
+ AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureSRAMECC)>;
+
def HasD16Writes32BitVgpr: Predicate<"Subtarget->hasD16Writes32BitVgpr()">,
AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts, FeatureD16Writes32BitVgpr)>;
def NotHasD16Writes32BitVgpr: Predicate<"!Subtarget->hasD16Writes32BitVgpr()">,
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index f2e432fa8d7f5..830443380b443 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -969,7 +969,7 @@ multiclass DSReadPat_t16<DS_Pseudo inst, ValueType vt, string frag> {
}
let OtherPredicates = [NotLDSRequiresM0Init] in {
- foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+ foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts, Use32BitLoadStoreWithTrue16Insts] in
let True16Predicate = p in {
def : DSReadPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>;
}
@@ -1050,7 +1050,7 @@ multiclass DSWritePat_t16 <DS_Pseudo inst, ValueType vt, string frag> {
}
let OtherPredicates = [NotLDSRequiresM0Init] in {
- foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+ foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts, Use32BitLoadStoreWithTrue16Insts] in
let True16Predicate = p in {
def : DSWritePat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>;
}
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 9f33bac4c56ea..80999a5a5cc84 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1982,7 +1982,7 @@ defm : FlatLoadPats <FLAT_LOAD_SSHORT, sextloadi16_flat, i32>;
defm : FlatLoadPats <FLAT_LOAD_SSHORT, atomic_load_sext_16_flat, i32>;
defm : FlatLoadPats <FLAT_LOAD_DWORDX3, load_flat, v3i32>;
-foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts, Use32BitLoadStoreWithTrue16Insts] in
let True16Predicate = p in {
defm : FlatLoadPats <FLAT_LOAD_UBYTE, extloadi8_flat, i16>;
defm : FlatLoadPats <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>;
@@ -2127,7 +2127,7 @@ defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, extloadi16_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, zextloadi16_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_SSHORT, sextloadi16_global, i32>;
-foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts, Use32BitLoadStoreWithTrue16Insts] in
let True16Predicate = p in {
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, extloadi8_global, i16>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, zextloadi8_global, i16>;
@@ -2187,7 +2187,7 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, truncstorei8_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, truncstorei16_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX3, store_global, v3i32>;
-foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts, Use32BitLoadStoreWithTrue16Insts] in
let OtherPredicates = [HasFlatGlobalInsts], True16Predicate = p in {
defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, truncstorei8_global, i16>;
defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, store_global, i16>;
@@ -2356,7 +2356,7 @@ defm : ScratchFLATLoadPats <SCRATCH_LOAD_USHORT, extloadi16_private, i32>;
defm : ScratchFLATLoadPats <SCRATCH_LOAD_USHORT, zextloadi16_private, i32>;
defm : ScratchFLATLoadPats <SCRATCH_LOAD_SSHORT, sextloadi16_private, i32>;
-foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts, Use32BitLoadStoreWithTrue16Insts] in
let True16Predicate = p in {
defm : ScratchFLATLoadPats <SCRATCH_LOAD_UBYTE, extloadi8_private, i16>;
defm : ScratchFLATLoadPats <SCRATCH_LOAD_UBYTE, zextloadi8_private, i16>;
@@ -2366,7 +2366,7 @@ defm : ScratchFLATStorePats <SCRATCH_STORE_SHORT, store_private, i16>;
defm : ScratchFLATStorePats <SCRATCH_STORE_BYTE, truncstorei8_private, i16>;
}
-let True16Predicate = UseRealTrue16Insts in {
+let OtherPredicates = [D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in {
defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_UBYTE_D16", extloadi8_private, i16>;
defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_UBYTE_D16", zextloadi8_private, i16>;
defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_SBYTE_D16", sextloadi8_private, i16>;
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll
index aaedb85c8353c..e67d7fdba58d3 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll
@@ -3,6 +3,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-FAKE16 %s
define i8 @atomic_load_monotonic_i8(ptr addrspace(3) %ptr) {
; CI-LABEL: atomic_load_monotonic_i8:
@@ -33,6 +35,14 @@ define i8 @atomic_load_monotonic_i8(ptr addrspace(3) %ptr) {
; GFX11-FAKE16-NEXT: ds_load_u8 v0, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: atomic_load_monotonic_i8:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: ds_load_u8 v0, v0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%load = load atomic i8, ptr addrspace(3) %ptr monotonic, align 1
ret i8 %load
}
@@ -66,6 +76,14 @@ define i8 @atomic_load_monotonic_i8_offset(ptr addrspace(3) %ptr) {
; GFX11-FAKE16-NEXT: ds_load_u8 v0, v0 offset:16
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: atomic_load_monotonic_i8_offset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: ds_load_u8 v0, v0 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%gep = getelementptr inbounds i8, ptr addrspace(3) %ptr, i8 16
%load = load atomic i8, ptr addrspace(3) %gep monotonic, align 1
ret i8 %load
@@ -100,6 +118,14 @@ define i16 @atomic_load_monotonic_i16(ptr addrspace(3) %ptr) {
; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: atomic_load_monotonic_i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: ds_load_u16 v0, v0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%load = load atomic i16, ptr addrspace(3) %ptr monotonic, align 2
ret i16 %load
}
@@ -133,6 +159,14 @@ define i16 @atomic_load_monotonic_i16_offset(ptr addrspace(3) %ptr) {
; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0 offset:32
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: atomic_load_monotonic_i16_offset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: ds_load_u16 v0, v0 offset:32
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%gep = getelementptr inbounds i16, ptr addrspace(3) %ptr, i16 16
%load = load atomic i16, ptr addrspace(3) %gep monotonic, align 2
ret i16 %load
@@ -160,6 +194,14 @@ define i32 @atomic_load_monotonic_i32(ptr addrspace(3) %ptr) {
; GFX11-NEXT: ds_load_b32 v0, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: atomic_load_monotonic_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: ds_load_b32 v0, v0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%load = load atomic i32, ptr addrspace(3) %ptr monotonic, align 4
ret i32 %load
}
@@ -186,6 +228,14 @@ define i32 @atomic_load_monotonic_i32_offset(ptr addrspace(3) %ptr) {
; GFX11-NEXT: ds_load_b32 v0, v0 offset:64
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: atomic_load_monotonic_i32_offset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: ds_load_b32 v0, v0 offset:64
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%gep = getelementptr inbounds i32, ptr addrspace(3) %ptr, i32 16
%load = load atomic i32, ptr addrspace(3) %gep monotonic, align 4
ret i32 %load
@@ -213,6 +263,14 @@ define i64 @atomic_load_monotonic_i64(ptr addrspace(3) %ptr) {
; GFX11-NEXT: ds_load_b64 v[0:1], v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: atomic_load_monotonic_i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: ds_load_b64 v[0:1], v0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%load = load atomic i64, ptr addrspace(3) %ptr monotonic, align 8
ret i64 %load
}
@@ -239,6 +297,14 @@ define i64 @atomic_load_monotonic_i64_offset(ptr addrspace(3) %ptr) {
; GFX11-NEXT: ds_load_b64 v[0:1], v0 offset:128
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: atomic_load_monotonic_i64_offset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: ds_load_b64 v[0:1], v0 offset:128
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%gep = getelementptr inbounds i64, ptr addrspace(3) %ptr, i32 16
%load = load atomic i64, ptr addrspace(3) %gep monotonic, align 8
ret i64 %load
@@ -266,6 +332,14 @@ define float @atomic_load_monotonic_f32_offset(ptr addrspace(3) %ptr) {
; GFX11-NEXT: ds_load_b32 v0, v0 offset:64
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: atomic_load_monotonic_f32_offset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: ds_load_b32 v0, v0 offset:64
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%gep = getelementptr inbounds float, ptr addrspace(3) %ptr, i32 16
%load = load atomic float, ptr addrspace(3) %gep monotonic, align 4
ret float %load
@@ -293,6 +367,14 @@ define double @atomic_load_monotonic_f64_offset(ptr addrspace(3) %ptr) {
; GFX11-NEXT: ds_load_b64 v[0:1], v0 offset:128
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: atomic_load_monotonic_f64_offset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: ds_load_b64 v[0:1], v0 offset:128
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%gep = getelementptr inbounds double, ptr addrspace(3) %ptr, i32 16
%load = load atomic double, ptr addrspace(3) %gep monotonic, align 8
ret double %load
@@ -320,6 +402,14 @@ define ptr @atomic_load_monotonic_p0i8_offset(ptr addrspace(3) %ptr) {
; GFX11-NEXT: ds_load_b64 v[0:1], v0 offset:128
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: atomic_load_monotonic_p0i8_offset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: ds_load_b64 v[0:1], v0 offset:128
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%gep = getelementptr inbounds ptr, ptr addrspace(3) %ptr, i32 16
%load = load atomic ptr, ptr addrspace(3) %gep monotonic, align 8
ret ptr %load
@@ -347,6 +437,14 @@ define ptr addrspace(3) @atomic_load_monotonic_p3i8_offset(ptr addrspace(3) %ptr
; GFX11-NEXT: ds_load_b32 v0, v0 offset:64
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: atomic_load_monotonic_p3i8_offset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: ds_load_b32 v0, v0 offset:64
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%gep = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %ptr, i32 16
%load = load atomic ptr addrspace(3), ptr addrspace(3) %gep monotonic, align 4
ret ptr addrspace(3) %load
@@ -381,6 +479,14 @@ define i16 @atomic_load_monotonic_f16(ptr addrspace(3) %ptr) {
; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: atomic_load_monotonic_f16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: ds_load_u16 v0, v0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%load = load atomic half, ptr addrspace(3) %ptr monotonic, align 2
%ret = bitcast half %load to i16
ret i16 %ret
@@ -415,6 +521,14 @@ define i16 @atomic_load_monotonic_f16_offset(ptr addrspace(3) %ptr) {
; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0 offset:32
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: atomic_load_monotonic_f16_offset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: ds_load_u16 v0, v0 offset:32
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%gep = getelementptr inbounds half, ptr addrspace(3) %ptr, i32 16
%load = load atomic half, ptr addrspace(3) %gep monotonic, align 2
%ret = bitcast half %load to i16
@@ -450,6 +564,14 @@ define i16 @atomic_load_monotonic_bf16(ptr addrspace(3) %ptr) {
; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: atomic_load_monotonic_bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: ds_load_u16 v0, v0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%load = load atomic bfloat, ptr addrspace(3) %ptr monotonic, align 2
%ret = bitcast bfloat %load to i16
ret i16 %ret
@@ -484,6 +606,14 @@ define i16 @atomic_load_monotonic_bf16_offset(ptr addrspace(3) %ptr) {
; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0 offset:32
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: atomic_load_monotonic_bf16_offset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: ds_load_u16 v0, v0 offset:32
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%gep = getelementptr inbounds bfloat, ptr addrspace(3) %ptr, i32 16
%load = load atomic bfloat, ptr addrspace(3) %gep monotonic, align 2
%ret = bitcast bfloat %load to i16
@@ -491,3 +621,5 @@ define i16 @atomic_load_monotonic_bf16_offset(ptr addrspace(3) %ptr) {
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GCN: {{.*}}
+; GFX1250-FAKE16: {{.*}}
+; GFX1250-TRUE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll
index c2bb4f004e001..83e30773615aa 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll
@@ -3,6 +3,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-FAKE16 %s
define void @atomic_store_monotonic_i8(ptr addrspace(3) %ptr, i8 %val) {
; CI-LABEL: atomic_store_monotonic_i8:
@@ -41,6 +43,28 @@ define void @atomic_store_monotonic_i8(ptr addrspace(3) %ptr, i8 %val) {
; GFX11-FAKE16-NEXT: ds_store_b8 v0, v2
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-TRUE16-LABEL: atomic_store_monotonic_i8:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_add_nc_u16 v2.l, v2.l, 2
+; GFX1250-TRUE16-NEXT: ds_store_b8 v0, v1
+; GFX1250-TRUE16-NEXT: ds_store_b8 v0, v2
+; GFX1250-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: atomic_store_monotonic_i8:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_add_nc_u16 v2, v1, 2
+; GFX1250-FAKE16-NEXT: ds_store_b8 v0, v1
+; GFX1250-FAKE16-NEXT: ds_store_b8 v0, v2
+; GFX1250-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
%val1 = add i8 %val, 2
store atomic i8 %val, ptr addrspace(3) %ptr monotonic, align 1
store atomic i8 %val1, ptr addrspace(3) %ptr monotonic, align 1
@@ -84,6 +108,28 @@ define void @atomic_store_monotonic_offset_i8(ptr addrspace(3) %ptr, i8 %val) {
; GFX11-FAKE16-NEXT: ds_store_b8 v0, v2 offset:16
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-TRUE16-LABEL: atomic_store_monotonic_offset_i8:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_add_nc_u16 v2.l, v2.l, 2
+; GFX1250-TRUE16-NEXT: ds_store_b8 v0, v1 offset:8
+; GFX1250-TRUE16-NEXT: ds_store_b8 v0, v2 offset:16
+; GFX1250-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: atomic_store_monotonic_offset_i8:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_add_nc_u16 v2, v1, 2
+; GFX1250-FAKE16-NEXT: ds_store_b8 v0, v1 offset:8
+; GFX1250-FAKE16-NEXT: ds_store_b8 v0, v2 offset:16
+; GFX1250-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX1250-FAKE16-...
[truncated]
Force-pushed d5cebb0 to 9e0fea6
Force-pushed 9e0fea6 to 4bc156f
I think there are two issues:
Indeed. Interesting that it does not trigger errors anywhere.
I think this does not trigger an issue by itself, but it confuses a following pass that uses the register class for optimizations, causing a functional error later in the flow. This usually shows up in the CTS tests.
Force-pushed 4bc156f to 23d3ee2
Done
This patch does not address spilling; that is separate. Although I am also not sure what the best way is there. Maybe really force spilling of the whole 32-bit superreg, but that will likely mess up the liveness of the other half.
ping |
llvm/lib/Target/AMDGPU/AMDGPU.td (Outdated)

def UseLoadTrue16WithSramECC : True16PredicateClass<"Subtarget->useRealTrue16Insts() && "
                                                    "!Subtarget->d16PreservesUnusedBits()">,
  AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureSRAMECC)>;
It's not clear to me that this AssemblerPredicate expression matches the C++ expression above. But, if this predicate is only used for patterns (not for instruction definitions) then I guess you don't need the AssemblerPredicate part anyway?
d16PreservesUnusedBits() is essentially 'not sram-ecc'. But I have removed the AssemblerPredicate; it is indeed unused.
Force-pushed 23d3ee2 to 865a9a1
llvm/lib/Target/AMDGPU/AMDGPU.td (Outdated)

// FIXME When we default to RealTrue16 instead of Fake, change the line as follows.
// AssemblerPredicate<(all_of FeatureTrue16BitInsts, (not FeatureRealTrue16Insts))>;

def UseLoadTrue16WithSramECC : True16PredicateClass<"Subtarget->useRealTrue16Insts() && "
We could use some more description here and in the commit message of what behavior is achieved. Also, it is confusing that the commit is titled 'Use fake16 load/store...' but the predicate is called 'UseLoadTrue16...'. I might just drop 'Load' here and call it 'UseTrue16WithSramECC'. In a comment you can state the behavior: that it serves to avoid d16 loads.
Done. Predicate renamed, description updated.
When sram-ecc is enabled, 16-bit loads clobber the full 32-bit VGPR; a load into just a 16-bit VGPR is not possible. Do a 16-bit extending load and extract a 16-bit subreg in this situation. Also fixes the lack of 16-bit store patterns with this combination. Fixes: SC1-6072
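As an illustration, a minimal hand-written IR sketch (not taken from the patch's test suite; the function name is made up and the check lines are assumptions based on the GFX1250 tests added in this PR, where 16-bit loads write a full 32-bit VGPR):

; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck %s
; With +real-true16 on a subtarget where sram-ecc disables D16 loads, an i16
; load is expected to select a load writing a whole 32-bit VGPR, with the
; 16-bit value then used as its lo16 subreg, rather than a _d16 load into v0.l.
define i16 @load_lds_i16(ptr addrspace(3) %ptr) {
  %v = load i16, ptr addrspace(3) %ptr, align 2
  ret i16 %v
}
; CHECK-LABEL: load_lds_i16:
; CHECK: ds_load_u16 v{{[0-9]+}}, v{{[0-9]+}}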
Force-pushed 865a9a1 to 7023284
; GISEL-GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
; GISEL-GFX1250-TRUE16-NEXT: v_med3_num_f16 v2.l, v2.l, v3.l, v4.l
-; GISEL-GFX1250-TRUE16-NEXT: flat_store_b16 v[0:1], v2
+; GISEL-GFX1250-TRUE16-NEXT: global_store_b16 v[0:1], v2, off
It looks functionally equivalent, but is the switch from flat to global expected?
Not just expected but desired; it is fixed by this patch. When the t16 patterns were added, stores were also put under that D16-preserves-unused-bits condition for no reason.
This is what I meant by "Also fixes lack of 16-bit store patterns with this combination" in the PR description.
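For the store side, a similar minimal hand-written sketch (again illustrative, not from the patch; the function name is made up and the check line is an assumption based on the global_store_b16 output quoted above):

; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck %s
; A 16-bit store to a global pointer is expected to select global_store_b16
; directly, now that the 16-bit store patterns are no longer gated on the
; D16-preserves-unused-bits condition.
define void @store_global_f16(ptr addrspace(1) %out, half %v) {
  store half %v, ptr addrspace(1) %out, align 2
  ret void
}
; CHECK-LABEL: store_global_f16:
; CHECK: global_store_b16 v[0:1], v{{[0-9]+}}, off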
LGTM