From 8039d1856749ec713eb9374c394af3855f2bd7b0 Mon Sep 17 00:00:00 2001 From: guochen2 Date: Tue, 28 Oct 2025 14:35:17 -0400 Subject: [PATCH] set D16 HW fix for gfx12 --- llvm/lib/Target/AMDGPU/AMDGPU.td | 2 + llvm/test/CodeGen/AMDGPU/spillv16.ll | 528 ++++++++++++++++++++++----- 2 files changed, 445 insertions(+), 85 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 54d94b1f8682e..0b61adf409948 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -2069,6 +2069,7 @@ def FeatureISAVersion12 : FeatureSet< FeatureMemoryAtomicFAddF32DenormalSupport, FeatureBVHDualAndBVH8Insts, FeatureWaitsBeforeSystemScopeStores, + FeatureD16Writes32BitVgpr ]>; def FeatureISAVersion12_50 : FeatureSet< @@ -2143,6 +2144,7 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureSupportsXNACK, FeatureXNACK, FeatureClusters, + FeatureD16Writes32BitVgpr, ]>; def FeatureISAVersion12_51 : FeatureSet< diff --git a/llvm/test/CodeGen/AMDGPU/spillv16.ll b/llvm/test/CodeGen/AMDGPU/spillv16.ll index 2d54ac8283a3a..16a7bf9bc91dd 100644 --- a/llvm/test/CodeGen/AMDGPU/spillv16.ll +++ b/llvm/test/CodeGen/AMDGPU/spillv16.ll @@ -1,8 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GCN,GCN-TRUE16 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GCN,GCN-FAKE16 -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GFX1250,GFX1250-TRUE16 -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GFX1250,GFX1250-FAKE16 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16,+d16-write-vgpr32 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GFX1250-TRUE16-D16W32 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GFX1250-FAKE16 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16,-d16-write-vgpr32 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GFX1250-TRUE16-D16W16 + define void @spill_i16_alu() { ; GCN-TRUE16-LABEL: spill_i16_alu: @@ -35,23 +37,23 @@ define void @spill_i16_alu() { ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-TRUE16-LABEL: spill_i16_alu: -; GFX1250-TRUE16: ; %bb.0: ; %entry -; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-TRUE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS -; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1250-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l -; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 -; GFX1250-TRUE16-NEXT: ;;#ASMSTART -; GFX1250-TRUE16-NEXT: ;;#ASMEND -; GFX1250-TRUE16-NEXT: scratch_load_u16 v1, off, s32 offset:2 th:TH_LOAD_LU ; 2-byte Folded Reload -; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l -; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS -; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-TRUE16-D16W32-LABEL: spill_i16_alu: +; GFX1250-TRUE16-D16W32: ; %bb.0: ; %entry +; GFX1250-TRUE16-D16W32-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W32-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l +; GFX1250-TRUE16-D16W32-NEXT: scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill +; GFX1250-TRUE16-D16W32-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: ;;#ASMSTART +; GFX1250-TRUE16-D16W32-NEXT: ;;#ASMEND +; GFX1250-TRUE16-D16W32-NEXT: scratch_load_u16 v1, off, s32 offset:2 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX1250-TRUE16-D16W32-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX1250-TRUE16-D16W32-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W32-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: s_set_pc_i64 s[30:31] ; ; GFX1250-FAKE16-LABEL: spill_i16_alu: ; GFX1250-FAKE16: ; %bb.0: ; %entry @@ -69,6 +71,41 @@ define void @spill_i16_alu() { ; GFX1250-FAKE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-TRUE16-D16W16-LABEL: spill_i16_alu: +; GFX1250-TRUE16-D16W16: ; %bb.0: ; %entry +; GFX1250-TRUE16-D16W16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l +; GFX1250-TRUE16-D16W16-NEXT: scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill +; GFX1250-TRUE16-D16W16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: ;;#ASMSTART +; GFX1250-TRUE16-D16W16-NEXT: ;;#ASMEND +; GFX1250-TRUE16-D16W16-NEXT: scratch_load_u16 v1, off, s32 offset:2 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX1250-TRUE16-D16W16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX1250-TRUE16-D16W16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-TRUE16-LABEL: spill_i16_alu: +; GFX1250-TRUE16: ; %bb.0: ; %entry +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l +; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: ;;#ASMSTART +; GFX1250-TRUE16-NEXT: ;;#ASMEND +; GFX1250-TRUE16-NEXT: scratch_load_u16 v1, off, s32 offset:2 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] entry: %alloca = alloca i16, i32 1, align 4, addrspace(5) @@ -126,28 +163,28 @@ define void @spill_i16_alu_two_vals() { ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-TRUE16-LABEL: spill_i16_alu_two_vals: -; GFX1250-TRUE16: ; %bb.0: ; %entry -; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-TRUE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS -; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1250-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l -; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:6 ; 2-byte Folded Spill -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 -; GFX1250-TRUE16-NEXT: ;;#ASMSTART -; GFX1250-TRUE16-NEXT: ;;#ASMEND -; GFX1250-TRUE16-NEXT: scratch_load_u16 v0, off, s32 offset:4 scope:SCOPE_SYS -; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1250-TRUE16-NEXT: scratch_load_u16 v1, off, s32 offset:6 th:TH_LOAD_LU ; 2-byte Folded Reload -; GFX1250-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l -; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l -; GFX1250-TRUE16-NEXT: scratch_store_d16_hi_b16 off, v0, s32 scope:SCOPE_SYS -; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:4 scope:SCOPE_SYS -; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-TRUE16-D16W32-LABEL: spill_i16_alu_two_vals: +; GFX1250-TRUE16-D16W32: ; %bb.0: ; %entry +; GFX1250-TRUE16-D16W32-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W32-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l +; GFX1250-TRUE16-D16W32-NEXT: scratch_store_b16 off, v0, s32 offset:6 ; 2-byte Folded Spill +; GFX1250-TRUE16-D16W32-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: ;;#ASMSTART +; GFX1250-TRUE16-D16W32-NEXT: ;;#ASMEND +; GFX1250-TRUE16-D16W32-NEXT: scratch_load_u16 v0, off, s32 offset:4 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W32-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: scratch_load_u16 v1, off, s32 offset:6 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX1250-TRUE16-D16W32-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l +; GFX1250-TRUE16-D16W32-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX1250-TRUE16-D16W32-NEXT: scratch_store_d16_hi_b16 off, v0, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W32-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: scratch_store_b16 off, v0, s32 offset:4 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W32-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: s_set_pc_i64 s[30:31] ; ; GFX1250-FAKE16-LABEL: spill_i16_alu_two_vals: ; GFX1250-FAKE16: ; %bb.0: ; %entry @@ -170,6 +207,51 @@ define void @spill_i16_alu_two_vals() { ; GFX1250-FAKE16-NEXT: scratch_store_b16 off, v0, s32 offset:4 scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-TRUE16-D16W16-LABEL: spill_i16_alu_two_vals: +; GFX1250-TRUE16-D16W16: ; %bb.0: ; %entry +; GFX1250-TRUE16-D16W16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l +; GFX1250-TRUE16-D16W16-NEXT: scratch_store_b16 off, v0, s32 offset:6 ; 2-byte Folded Spill +; GFX1250-TRUE16-D16W16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: ;;#ASMSTART +; GFX1250-TRUE16-D16W16-NEXT: ;;#ASMEND +; GFX1250-TRUE16-D16W16-NEXT: scratch_load_u16 v0, off, s32 offset:4 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: scratch_load_u16 v1, off, s32 offset:6 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX1250-TRUE16-D16W16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l +; GFX1250-TRUE16-D16W16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX1250-TRUE16-D16W16-NEXT: scratch_store_d16_hi_b16 off, v0, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: scratch_store_b16 off, v0, s32 offset:4 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-TRUE16-LABEL: spill_i16_alu_two_vals: +; GFX1250-TRUE16: ; %bb.0: ; %entry +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l +; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:6 ; 2-byte Folded Spill +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: ;;#ASMSTART +; GFX1250-TRUE16-NEXT: ;;#ASMEND +; GFX1250-TRUE16-NEXT: scratch_load_u16 v0, off, s32 offset:4 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: scratch_load_u16 v1, off, s32 offset:6 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX1250-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX1250-TRUE16-NEXT: scratch_store_d16_hi_b16 off, v0, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:4 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] entry: %alloca = alloca i16, i32 1, align 4, addrspace(5) %alloca2 = alloca i16, i32 1, align 4, addrspace(5) @@ -223,6 +305,53 @@ define void @spill_i16() { ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] ; +; GFX1250-TRUE16-D16W32-LABEL: spill_i16: +; GFX1250-TRUE16-D16W32: ; %bb.0: ; %entry +; GFX1250-TRUE16-D16W32-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W32-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill +; GFX1250-TRUE16-D16W32-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: ;;#ASMSTART +; GFX1250-TRUE16-D16W32-NEXT: ;;#ASMEND +; GFX1250-TRUE16-D16W32-NEXT: scratch_load_b32 v0, off, s32 offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-TRUE16-D16W32-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W32-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: spill_i16: +; GFX1250-FAKE16: ; %bb.0: ; %entry +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: ;;#ASMSTART +; GFX1250-FAKE16-NEXT: ;;#ASMEND +; GFX1250-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-TRUE16-D16W16-LABEL: spill_i16: +; GFX1250-TRUE16-D16W16: ; %bb.0: ; %entry +; GFX1250-TRUE16-D16W16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill +; GFX1250-TRUE16-D16W16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: ;;#ASMSTART +; GFX1250-TRUE16-D16W16-NEXT: ;;#ASMEND +; GFX1250-TRUE16-D16W16-NEXT: scratch_load_b32 v0, off, s32 offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-TRUE16-D16W16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-LABEL: spill_i16: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -282,6 +411,53 @@ define void @spill_half() { ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] ; +; GFX1250-TRUE16-D16W32-LABEL: spill_half: +; GFX1250-TRUE16-D16W32: ; %bb.0: ; %entry +; GFX1250-TRUE16-D16W32-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W32-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill +; GFX1250-TRUE16-D16W32-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: ;;#ASMSTART +; GFX1250-TRUE16-D16W32-NEXT: ;;#ASMEND +; GFX1250-TRUE16-D16W32-NEXT: scratch_load_b32 v0, off, s32 offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-TRUE16-D16W32-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W32-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: spill_half: +; GFX1250-FAKE16: ; %bb.0: ; %entry +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: ;;#ASMSTART +; GFX1250-FAKE16-NEXT: ;;#ASMEND +; GFX1250-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-TRUE16-D16W16-LABEL: spill_half: +; GFX1250-TRUE16-D16W16: ; %bb.0: ; %entry +; GFX1250-TRUE16-D16W16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill +; GFX1250-TRUE16-D16W16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: ;;#ASMSTART +; GFX1250-TRUE16-D16W16-NEXT: ;;#ASMEND +; GFX1250-TRUE16-D16W16-NEXT: scratch_load_b32 v0, off, s32 offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-TRUE16-D16W16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-LABEL: spill_half: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -341,6 +517,53 @@ define void @spill_i16_from_v2i16() { ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] ; +; GFX1250-TRUE16-D16W32-LABEL: spill_i16_from_v2i16: +; GFX1250-TRUE16-D16W32: ; %bb.0: ; %entry +; GFX1250-TRUE16-D16W32-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: scratch_load_u16 v0, off, s32 offset:2 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W32-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill +; GFX1250-TRUE16-D16W32-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: ;;#ASMSTART +; GFX1250-TRUE16-D16W32-NEXT: ;;#ASMEND +; GFX1250-TRUE16-D16W32-NEXT: scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-TRUE16-D16W32-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W32-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: spill_i16_from_v2i16: +; GFX1250-FAKE16: ; %bb.0: ; %entry +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:2 scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: ;;#ASMSTART +; GFX1250-FAKE16-NEXT: ;;#ASMEND +; GFX1250-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-TRUE16-D16W16-LABEL: spill_i16_from_v2i16: +; GFX1250-TRUE16-D16W16: ; %bb.0: ; %entry +; GFX1250-TRUE16-D16W16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: scratch_load_u16 v0, off, s32 offset:2 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill +; GFX1250-TRUE16-D16W16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: ;;#ASMSTART +; GFX1250-TRUE16-D16W16-NEXT: ;;#ASMEND +; GFX1250-TRUE16-D16W16-NEXT: scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-TRUE16-D16W16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-LABEL: spill_i16_from_v2i16: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -414,29 +637,29 @@ define void @spill_2xi16_from_v2i16() { ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-TRUE16-LABEL: spill_2xi16_from_v2i16: -; GFX1250-TRUE16: ; %bb.0: ; %entry -; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-TRUE16-NEXT: scratch_load_u16 v0, off, s32 offset:2 scope:SCOPE_SYS -; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1250-TRUE16-NEXT: s_clause 0x1 -; GFX1250-TRUE16-NEXT: scratch_store_b32 off, v0, s32 offset:12 -; GFX1250-TRUE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS -; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1250-TRUE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 -; GFX1250-TRUE16-NEXT: ;;#ASMSTART -; GFX1250-TRUE16-NEXT: ;;#ASMEND -; GFX1250-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload -; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS -; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX1250-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload -; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS -; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-TRUE16-D16W32-LABEL: spill_2xi16_from_v2i16: +; GFX1250-TRUE16-D16W32: ; %bb.0: ; %entry +; GFX1250-TRUE16-D16W32-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: scratch_load_u16 v0, off, s32 offset:2 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W32-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: s_clause 0x1 ; 4-byte Folded Spill +; GFX1250-TRUE16-D16W32-NEXT: scratch_store_b32 off, v0, s32 offset:12 +; GFX1250-TRUE16-D16W32-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W32-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill +; GFX1250-TRUE16-D16W32-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: ;;#ASMSTART +; GFX1250-TRUE16-D16W32-NEXT: ;;#ASMEND +; GFX1250-TRUE16-D16W32-NEXT: scratch_load_b32 v0, off, s32 offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-TRUE16-D16W32-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W32-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-TRUE16-D16W32-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W32-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: s_set_pc_i64 s[30:31] ; ; GFX1250-FAKE16-LABEL: spill_2xi16_from_v2i16: ; GFX1250-FAKE16: ; %bb.0: ; %entry @@ -444,7 +667,7 @@ define void @spill_2xi16_from_v2i16() { ; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:2 scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1250-FAKE16-NEXT: s_clause 0x1 +; GFX1250-FAKE16-NEXT: s_clause 0x1 ; 4-byte Folded Spill ; GFX1250-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; GFX1250-FAKE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 @@ -461,6 +684,53 @@ define void @spill_2xi16_from_v2i16() { ; GFX1250-FAKE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-TRUE16-D16W16-LABEL: spill_2xi16_from_v2i16: +; GFX1250-TRUE16-D16W16: ; %bb.0: ; %entry +; GFX1250-TRUE16-D16W16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: scratch_load_u16 v0, off, s32 offset:2 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: s_clause 0x1 ; 4-byte Folded Spill +; GFX1250-TRUE16-D16W16-NEXT: scratch_store_b32 off, v0, s32 offset:12 +; GFX1250-TRUE16-D16W16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill +; GFX1250-TRUE16-D16W16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: ;;#ASMSTART +; GFX1250-TRUE16-D16W16-NEXT: ;;#ASMEND +; GFX1250-TRUE16-D16W16-NEXT: scratch_load_b32 v0, off, s32 offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-TRUE16-D16W16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-TRUE16-D16W16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-TRUE16-LABEL: spill_2xi16_from_v2i16: +; GFX1250-TRUE16: ; %bb.0: ; %entry +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: scratch_load_u16 v0, off, s32 offset:2 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: s_clause 0x1 +; GFX1250-TRUE16-NEXT: scratch_store_b32 off, v0, s32 offset:12 +; GFX1250-TRUE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: ;;#ASMSTART +; GFX1250-TRUE16-NEXT: ;;#ASMEND +; GFX1250-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] entry: %alloca = alloca <2 x i16>, i32 2, align 1, addrspace(5) @@ -520,26 +790,26 @@ define void @spill_2xi16_from_v2i16_one_free_reg() { ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-TRUE16-LABEL: spill_2xi16_from_v2i16_one_free_reg: -; GFX1250-TRUE16: ; %bb.0: ; %entry -; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-TRUE16-NEXT: scratch_load_u16 v7, off, s32 offset:2 scope:SCOPE_SYS -; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1250-TRUE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS -; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1250-TRUE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 -; GFX1250-TRUE16-NEXT: ;;#ASMSTART -; GFX1250-TRUE16-NEXT: ;;#ASMEND -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.l -; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS -; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX1250-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload -; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS -; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-TRUE16-D16W32-LABEL: spill_2xi16_from_v2i16_one_free_reg: +; GFX1250-TRUE16-D16W32: ; %bb.0: ; %entry +; GFX1250-TRUE16-D16W32-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: scratch_load_u16 v7, off, s32 offset:2 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W32-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W32-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill +; GFX1250-TRUE16-D16W32-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: ;;#ASMSTART +; GFX1250-TRUE16-D16W32-NEXT: ;;#ASMEND +; GFX1250-TRUE16-D16W32-NEXT: v_mov_b16_e32 v0.l, v7.l +; GFX1250-TRUE16-D16W32-NEXT: scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W32-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-TRUE16-D16W32-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W32-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: s_set_pc_i64 s[30:31] ; ; GFX1250-FAKE16-LABEL: spill_2xi16_from_v2i16_one_free_reg: ; GFX1250-FAKE16: ; %bb.0: ; %entry @@ -560,6 +830,47 @@ define void @spill_2xi16_from_v2i16_one_free_reg() { ; GFX1250-FAKE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-TRUE16-D16W16-LABEL: spill_2xi16_from_v2i16_one_free_reg: +; GFX1250-TRUE16-D16W16: ; %bb.0: ; %entry +; GFX1250-TRUE16-D16W16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: scratch_load_u16 v7, off, s32 offset:2 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill +; GFX1250-TRUE16-D16W16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: ;;#ASMSTART +; GFX1250-TRUE16-D16W16-NEXT: ;;#ASMEND +; GFX1250-TRUE16-D16W16-NEXT: v_mov_b16_e32 v0.l, v7.l +; GFX1250-TRUE16-D16W16-NEXT: scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-TRUE16-D16W16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-TRUE16-LABEL: spill_2xi16_from_v2i16_one_free_reg: +; GFX1250-TRUE16: ; %bb.0: ; %entry +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: scratch_load_u16 v7, off, s32 offset:2 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: ;;#ASMSTART +; GFX1250-TRUE16-NEXT: ;;#ASMEND +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.l +; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] entry: %alloca = alloca <2 x i16>, i32 2, align 1, addrspace(5) @@ -595,6 +906,53 @@ define void @spill_v2i16() { ; GCN-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; +; GFX1250-TRUE16-D16W32-LABEL: spill_v2i16: +; GFX1250-TRUE16-D16W32: ; %bb.0: ; %entry +; GFX1250-TRUE16-D16W32-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: scratch_load_b32 v0, off, s32 offset:4 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W32-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill +; GFX1250-TRUE16-D16W32-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: ;;#ASMSTART +; GFX1250-TRUE16-D16W32-NEXT: ;;#ASMEND +; GFX1250-TRUE16-D16W32-NEXT: scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-TRUE16-D16W32-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: scratch_store_b32 off, v0, s32 offset:4 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W32-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-D16W32-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: spill_v2i16: +; GFX1250-FAKE16: ; %bb.0: ; %entry +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:4 scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: ;;#ASMSTART +; GFX1250-FAKE16-NEXT: ;;#ASMEND +; GFX1250-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:4 scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-TRUE16-D16W16-LABEL: spill_v2i16: +; GFX1250-TRUE16-D16W16: ; %bb.0: ; %entry +; GFX1250-TRUE16-D16W16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: scratch_load_b32 v0, off, s32 offset:4 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill +; GFX1250-TRUE16-D16W16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: ;;#ASMSTART +; GFX1250-TRUE16-D16W16-NEXT: ;;#ASMEND +; GFX1250-TRUE16-D16W16-NEXT: scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-TRUE16-D16W16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: scratch_store_b32 off, v0, s32 offset:4 scope:SCOPE_SYS +; GFX1250-TRUE16-D16W16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-D16W16-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-LABEL: spill_v2i16: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0