Skip to content

Conversation

@broxigarchen
Copy link
Contributor

@broxigarchen broxigarchen commented Feb 27, 2025

true16 pattern for fshr.

GlobalIsel will be enabled latter when merge_value selection is supported in true16 mode

@broxigarchen broxigarchen marked this pull request as ready for review February 27, 2025 17:23
@llvmbot
Copy link
Member

llvmbot commented Feb 27, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Brox Chen (broxigarchen)

Changes

true16 pattern for fshr.

GlobalIsel will be enabled latter when merge_value selection is supported in true16 mode


Patch is 62.64 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/129085.diff

3 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+27)
  • (modified) llvm/test/CodeGen/AMDGPU/bswap.ll (+9-19)
  • (modified) llvm/test/CodeGen/AMDGPU/fshr.ll (+912-177)
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index e8daa4b542e02..f528338b02f82 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2511,6 +2511,33 @@ def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
                           (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
 } // end True16Predicate = NotHasTrue16BitInsts
 
+let OtherPredicates = [UseRealTrue16Insts] in {
+def : GCNPat <
+  (rotr i32:$src0, i32:$src1),
+  (V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0,
+                          /* src1_modifiers */ 0, $src0,
+                          /* src2_modifiers */ 0,
+                          (EXTRACT_SUBREG $src1, lo16),
+                          /* clamp */ 0, /* op_sel */ 0)
+>;
+
+def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
+          (V_ALIGNBIT_B32_t16_e64 0, /* src0_modifiers */
+                          (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
+                          0, /* src1_modifiers */
+                          (i32 (EXTRACT_SUBREG (i64 $src0), sub0)),
+                          0, /* src2_modifiers */
+                          (i16 (EXTRACT_SUBREG VGPR_32:$src1, lo16)),
+                          /* clamp */ 0, /* op_sel */ 0)>;
+
+def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
+          (V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0,
+                          /* src1_modifiers */ 0, $src1,
+                          /* src2_modifiers */ 0,
+                          (EXTRACT_SUBREG VGPR_32:$src2, lo16),
+                          /* clamp */ 0, /* op_sel */ 0)>;
+} // end OtherPredicates = [UseRealTrue16Insts]
+
 let True16Predicate = UseFakeTrue16Insts in {
 def : GCNPat <
   (rotr i32:$src0, i32:$src1),
diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll
index b98ca1ff55f59..a95a1aba0c914 100644
--- a/llvm/test/CodeGen/AMDGPU/bswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/bswap.ll
@@ -737,25 +737,15 @@ define i64 @v_bswap_i48(i64 %src) {
 ; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-REAL16-LABEL: v_bswap_i48:
-; GFX11-REAL16:       ; %bb.0:
-; GFX11-REAL16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-REAL16-NEXT:    v_perm_b32 v2, 0, v0, 0x10203
-; GFX11-REAL16-NEXT:    v_perm_b32 v1, 0, v1, 0x10203
-; GFX11-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-REAL16-NEXT:    v_lshrrev_b64 v[0:1], 16, v[1:2]
-; GFX11-REAL16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-REAL16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: v_bswap_i48:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v2, 0, v0, 0x10203
-; GFX11-FAKE16-NEXT:    v_perm_b32 v0, 0, v1, 0x10203
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-FAKE16-NEXT:    v_alignbit_b32 v0, v2, v0, 16
-; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: v_bswap_i48:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v2, 0, v0, 0x10203
+; GFX11-NEXT:    v_perm_b32 v0, 0, v1, 0x10203
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-NEXT:    v_alignbit_b32 v0, v2, v0, 16
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %trunc = trunc i64 %src to i48
   %bswap = call i48 @llvm.bswap.i48(i48 %trunc)
   %zext = zext i48 %bswap to i64
diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index 5ca81ce9f9e07..c3d82e38d1985 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -4,7 +4,10 @@
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,GFX9
 ; RUN: llc < %s -mtriple=r600 -mcpu=redwood  -verify-machineinstrs | FileCheck %s --check-prefix=R600
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX10
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX11
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX11,GFX11-TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX12,GFX12-TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16
 
 declare i32 @llvm.fshr.i32(i32, i32, i32)
 declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)
@@ -81,17 +84,55 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z
 ; GFX10-NEXT:    global_store_dword v1, v0, s[6:7]
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: fshr_i32:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_alignbit_b32 v0, s0, s1, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: fshr_i32:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_alignbit_b32 v0, s0, s1, v0.l
+; GFX11-TRUE16-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: fshr_i32:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v0, s0, s1, v0
+; GFX11-FAKE16-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX11-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: fshr_i32:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_clause 0x1
+; GFX12-TRUE16-NEXT:    s_load_b96 s[0:2], s[4:5], 0x2c
+; GFX12-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s2
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_alignbit_b32 v0, s0, s1, v0.l
+; GFX12-TRUE16-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: fshr_i32:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_clause 0x1
+; GFX12-FAKE16-NEXT:    s_load_b96 s[0:2], s[4:5], 0x2c
+; GFX12-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_alignbit_b32 v0, s0, s1, v0
+; GFX12-FAKE16-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX12-FAKE16-NEXT:    s_endpgm
 entry:
   %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
   store i32 %0, ptr addrspace(1) %in
@@ -162,6 +203,15 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ; GFX11-NEXT:    v_alignbit_b32 v1, s2, s3, 7
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: fshr_i32_imm:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_alignbit_b32 v1, s2, s3, 7
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 7)
   store i32 %0, ptr addrspace(1) %in
@@ -248,20 +298,67 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
 ; GFX10-NEXT:    global_store_dwordx2 v3, v[0:1], s[8:9]
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: fshr_v2i32:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7
-; GFX11-NEXT:    v_mov_b32_e32 v2, s6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_alignbit_b32 v1, s1, s3, v0
-; GFX11-NEXT:    v_alignbit_b32 v0, s0, s2, v2
-; GFX11-NEXT:    global_store_b64 v3, v[0:1], s[4:5]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: fshr_v2i32:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_clause 0x2
+; GFX11-TRUE16-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s7
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, s6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_alignbit_b32 v1, s1, s3, v0.l
+; GFX11-TRUE16-NEXT:    v_alignbit_b32 v0, s0, s2, v0.h
+; GFX11-TRUE16-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: fshr_v2i32:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_clause 0x2
+; GFX11-FAKE16-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, s6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v1, s1, s3, v0
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v0, s0, s2, v2
+; GFX11-FAKE16-NEXT:    global_store_b64 v3, v[0:1], s[4:5]
+; GFX11-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: fshr_v2i32:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_clause 0x2
+; GFX12-TRUE16-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX12-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s7
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, s6
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_alignbit_b32 v1, s1, s3, v0.l
+; GFX12-TRUE16-NEXT:    v_alignbit_b32 v0, s0, s2, v0.h
+; GFX12-TRUE16-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: fshr_v2i32:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_clause 0x2
+; GFX12-FAKE16-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX12-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v2, s6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_alignbit_b32 v1, s1, s3, v0
+; GFX12-FAKE16-NEXT:    v_alignbit_b32 v0, s0, s2, v2
+; GFX12-FAKE16-NEXT:    global_store_b64 v3, v[0:1], s[4:5]
+; GFX12-FAKE16-NEXT:    s_endpgm
 entry:
   %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
   store <2 x i32> %0, ptr addrspace(1) %in
@@ -347,6 +444,18 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
 ; GFX11-NEXT:    v_alignbit_b32 v0, s0, s2, 7
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: fshr_v2i32_imm:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_alignbit_b32 v1, s1, s3, 9
+; GFX12-NEXT:    v_alignbit_b32 v0, s0, s2, 7
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-NEXT:    s_endpgm
 entry:
   %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
   store <2 x i32> %0, ptr addrspace(1) %in
@@ -459,24 +568,85 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
 ; GFX10-NEXT:    global_store_dwordx4 v6, v[0:3], s[6:7]
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: fshr_v4i32:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x54
-; GFX11-NEXT:    s_load_b256 s[8:15], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v6, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT:    v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_alignbit_b32 v3, s11, s15, v0
-; GFX11-NEXT:    v_alignbit_b32 v2, s10, s14, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_alignbit_b32 v1, s9, s13, v4
-; GFX11-NEXT:    v_alignbit_b32 v0, s8, s12, v5
-; GFX11-NEXT:    global_store_b128 v6, v[0:3], s[4:5]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: fshr_v4i32:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_clause 0x2
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x54
+; GFX11-TRUE16-NEXT:    s_load_b256 s[8:15], s[4:5], 0x34
+; GFX11-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v5, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, s2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, s1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_alignbit_b32 v3, s11, s15, v0.l
+; GFX11-TRUE16-NEXT:    v_alignbit_b32 v2, s10, s14, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_alignbit_b32 v1, s9, s13, v1.l
+; GFX11-TRUE16-NEXT:    v_alignbit_b32 v0, s8, s12, v4.l
+; GFX11-TRUE16-NEXT:    global_store_b128 v5, v[0:3], s[4:5]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: fshr_v4i32:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_clause 0x2
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x54
+; GFX11-FAKE16-NEXT:    s_load_b256 s[8:15], s[4:5], 0x34
+; GFX11-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v6, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v3, s11, s15, v0
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v2, s10, s14, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v1, s9, s13, v4
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v0, s8, s12, v5
+; GFX11-FAKE16-NEXT:    global_store_b128 v6, v[0:3], s[4:5]
+; GFX11-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: fshr_v4i32:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_clause 0x2
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x54
+; GFX12-TRUE16-NEXT:    s_load_b256 s[8:15], s[4:5], 0x34
+; GFX12-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v5, 0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s3
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, s2
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.l, s1
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_alignbit_b32 v3, s11, s15, v0.l
+; GFX12-TRUE16-NEXT:    v_alignbit_b32 v2, s10, s14, v0.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_alignbit_b32 v1, s9, s13, v1.l
+; GFX12-TRUE16-NEXT:    v_alignbit_b32 v0, s8, s12, v4.l
+; GFX12-TRUE16-NEXT:    global_store_b128 v5, v[0:3], s[4:5]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: fshr_v4i32:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_clause 0x2
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x54
+; GFX12-FAKE16-NEXT:    s_load_b256 s[8:15], s[4:5], 0x34
+; GFX12-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, 0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_alignbit_b32 v3, s11, s15, v0
+; GFX12-FAKE16-NEXT:    v_alignbit_b32 v2, s10, s14, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_alignbit_b32 v1, s9, s13, v4
+; GFX12-FAKE16-NEXT:    v_alignbit_b32 v0, s8, s12, v5
+; GFX12-FAKE16-NEXT:    global_store_b128 v6, v[0:3], s[4:5]
+; GFX12-FAKE16-NEXT:    s_endpgm
 entry:
   %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z)
   store <4 x i32> %0, ptr addrspace(1) %in
@@ -580,6 +750,20 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
 ; GFX11-NEXT:    v_alignbit_b32 v0, s8, s12, 1
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: fshr_v4i32_imm:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b256 s[8:15], s[4:5], 0x34
+; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v4, 0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_alignbit_b32 v3, s11, s15, 1
+; GFX12-NEXT:    v_alignbit_b32 v2, s10, s14, 9
+; GFX12-NEXT:    v_alignbit_b32 v1, s9, s13, 7
+; GFX12-NEXT:    v_alignbit_b32 v0, s8, s12, 1
+; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 7, i32 9, i32 33>)
   store <4 x i32> %0, ptr addrspace(1) %in
@@ -604,11 +788,37 @@ define i32 @v_fshr_i32(i32 %src0, i32 %src1, i32 %src2) {
 ; GFX10-NEXT:    v_alignbit_b32 v0, v0, v1, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_fshr_i32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_alignbit_b32 v0, v0, v1, v2
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_fshr_i32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_alignbit_b32 v0...
[truncated]

Copy link
Contributor

@Sisyph Sisyph left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

@Sisyph
Copy link
Contributor

Sisyph commented Mar 4, 2025

This may need a followup based on #129159 @jrbyrnes

@broxigarchen broxigarchen force-pushed the main-valign-codegen branch from 561caa5 to a777577 Compare March 4, 2025 17:09
@broxigarchen
Copy link
Contributor Author

rebased

@broxigarchen
Copy link
Contributor Author

This may need a followup based on #129159 @jrbyrnes

I guess I will merge this first and then create a follow up later

@broxigarchen broxigarchen merged commit 5cc033b into llvm:main Mar 4, 2025
11 checks passed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants