Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -3030,6 +3030,8 @@ def : GCNPat <

// Magic number: 1 | (0 << 8) | (12 << 16) | (12 << 24)
// The 12s emit 0s.
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
let True16Predicate = p in {
def : GCNPat <
(i16 (bswap i16:$a)),
(V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001)))
Expand All @@ -3039,6 +3041,19 @@ def : GCNPat <
(i32 (zext (bswap i16:$a))),
(V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001)))
>;
}

let True16Predicate = UseRealTrue16Insts in {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I assume UseRealTrue16Insts implies has perm

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi Matt. Yes UseRealTrue16Insts implies GFX11Plus

These patterns are also guarded by

let SubtargetPredicate = isGFX8Plus, AddedComplexity = 1 in {
.

def : GCNPat <
(i16 (bswap i16:$a)),
(EXTRACT_SUBREG (V_PERM_B32_e64 (i32 0), (COPY VGPR_16:$a), (S_MOV_B32 (i32 0x0c0c0001))), lo16)
>;

def : GCNPat <
(i32 (zext (bswap i16:$a))),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should this handle zext or any ext

Copy link
Contributor Author

@broxigarchen broxigarchen Jan 14, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi Matt. From the pattern this is zero filling so it should be zext.

This is following the existing pattern here https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/AMDGPU/SIInstructions.td#L3039

I don't know if we need to handle sext. It seems rare that if we need to handle a byte swap and at the mean time do a sext

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't mean sext, I mean anyext

Copy link
Contributor Author

@broxigarchen broxigarchen Jan 16, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see. I think we can have both zext and anyext here like https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/AMDGPU/SIInstructions.td#L2355-L2356

But if there is only one then it should be using zext. The reason would be
zext -> anyext cost nothing, but anyext -> zext will add an addtional and operation to zero out top bits.

i.e. for example

%bswap = call i16 @llvm.bswap.i16(i16 %src)
%zext = zext i16 %bswap to i32
I think this one it will introduce an addtional and operation if we replace zext to anyext

(V_PERM_B32_e64 (i32 0), (COPY VGPR_16:$a), (S_MOV_B32 (i32 0x0c0c0001)))
>;
}

// Magic number: 1 | (0 << 8) | (3 << 16) | (2 << 24)
def : GCNPat <
Expand Down
25 changes: 17 additions & 8 deletions llvm/test/CodeGen/AMDGPU/bswap.ll
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=amdgcn-- -verify-machineinstrs | FileCheck %s --check-prefix=SI
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=GFX11
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11,GFX11-REAL16
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11,GFX11-FAKE16

declare i16 @llvm.bswap.i16(i16) nounwind readnone
declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>) nounwind readnone
Expand Down Expand Up @@ -490,13 +491,21 @@ define float @missing_truncate_promote_bswap(i32 %arg) {
; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: missing_truncate_promote_bswap:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_perm_b32 v0, 0, v0, 0xc0c0001
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX11-REAL16-LABEL: missing_truncate_promote_bswap:
; GFX11-REAL16: ; %bb.0: ; %bb
; GFX11-REAL16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-REAL16-NEXT: v_perm_b32 v0, 0, v0, 0xc0c0001
; GFX11-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
; GFX11-REAL16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: missing_truncate_promote_bswap:
; GFX11-FAKE16: ; %bb.0: ; %bb
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_perm_b32 v0, 0, v0, 0xc0c0001
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
bb:
%tmp = trunc i32 %arg to i16
%tmp1 = call i16 @llvm.bswap.i16(i16 %tmp)
Expand Down
Loading