llvm · broxigarchen · Jan 17, 2025 · Jan 13, 2025 · arsenm · Jan 14, 2025
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3030,6 +3030,8 @@ def : GCNPat <
 
 // Magic number: 1 | (0 << 8) | (12 << 16) | (12 << 24)
 // The 12s emit 0s.
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in {
 def : GCNPat <
   (i16 (bswap i16:$a)),
   (V_PERM_B32_e64  (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001)))
@@ -3039,6 +3041,19 @@ def : GCNPat <
   (i32 (zext (bswap i16:$a))),
   (V_PERM_B32_e64  (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001)))
 >;
+}
+
+let True16Predicate = UseRealTrue16Insts in {
 let SubtargetPredicate = isGFX8Plus, AddedComplexity = 1 in { 
 let SubtargetPredicate = isGFX8Plus, AddedComplexity = 1 in { 
+def : GCNPat <
+  (i16 (bswap i16:$a)),
+  (EXTRACT_SUBREG (V_PERM_B32_e64  (i32 0), (COPY VGPR_16:$a), (S_MOV_B32 (i32 0x0c0c0001))), lo16)
+>;
+
+def : GCNPat <
+  (i32 (zext (bswap i16:$a))),
 %bswap = call i16 @llvm.bswap.i16(i16 %src) 
 %zext = zext i16 %bswap to i32 
 %bswap = call i16 @llvm.bswap.i16(i16 %src) 
 %zext = zext i16 %bswap to i32 
+  (V_PERM_B32_e64  (i32 0), (COPY VGPR_16:$a), (S_MOV_B32 (i32 0x0c0c0001)))
+>;
+}
 
 // Magic number: 1 | (0 << 8) | (3 << 16) | (2 << 24)
 def : GCNPat <

diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll
@@ -1,7 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=amdgcn-- -verify-machineinstrs | FileCheck %s --check-prefix=SI
 ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI
-; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=GFX11
+; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11,GFX11-REAL16
+; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11,GFX11-FAKE16
 
 declare i16 @llvm.bswap.i16(i16) nounwind readnone
 declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>) nounwind readnone
@@ -490,13 +491,21 @@ define float @missing_truncate_promote_bswap(i32 %arg) {
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: missing_truncate_promote_bswap:
-; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v0, 0, v0, 0xc0c0001
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-REAL16-LABEL: missing_truncate_promote_bswap:
+; GFX11-REAL16:       ; %bb.0: ; %bb
+; GFX11-REAL16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-REAL16-NEXT:    v_perm_b32 v0, 0, v0, 0xc0c0001
+; GFX11-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-REAL16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-REAL16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: missing_truncate_promote_bswap:
+; GFX11-FAKE16:       ; %bb.0: ; %bb
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, 0, v0, 0xc0c0001
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %tmp = trunc i32 %arg to i16
   %tmp1 = call i16 @llvm.bswap.i16(i16 %tmp)