Skip to content

Commit 222ff18

Browse files
authored
[AMDGPU][True16][CodeGen] Update codegen pattern for v_med3_f16 (#121992)
Add the true16 codegen pattern for v_med3_f16, selected when real true16 instructions are in use.
1 parent 3ba46dd commit 222ff18

File tree

2 files changed

+147
-35
lines changed

2 files changed

+147
-35
lines changed

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3670,6 +3670,8 @@ defm : FPMed3Pat<f32, V_MED3_F32_e64>;
36703670
// f16 med3 selection patterns. Everything in this block is gated on
// HasMed3_16; FPMed3Pat is then instantiated once per True16Predicate, each
// time with a different V_MED3_F16 opcode variant.
let SubtargetPredicate = HasMed3_16 in {
36713671
// NotHasTrue16BitInsts: use the V_MED3_F16_e64 opcode.
let True16Predicate = NotHasTrue16BitInsts in
36723672
defm : FPMed3Pat<f16, V_MED3_F16_e64>;
3673+
// UseRealTrue16Insts: use the _t16 opcode (the case added by this change).
let True16Predicate = UseRealTrue16Insts in
3674+
defm : FPMed3Pat<f16, V_MED3_F16_t16_e64>;
36733675
// UseFakeTrue16Insts: use the _fake16 opcode.
let True16Predicate = UseFakeTrue16Insts in
36743676
defm : FPMed3Pat<f16, V_MED3_F16_fake16_e64>;
36753677
}

llvm/test/CodeGen/AMDGPU/fmed3.ll

Lines changed: 145 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,10 @@
55
; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-GISEL %s
66
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-SDAG %s
77
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-GISEL %s
8-
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-SDAG %s
9-
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-GISEL %s
8+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
9+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
10+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
11+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
1012

1113
define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
1214
; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f32:
@@ -7531,19 +7533,61 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o
75317533
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
75327534
; GFX9-NEXT: s_endpgm
75337535
;
7534-
; GFX11-LABEL: v_test_nnan_input_fmed3_r_i_i_f16:
7535-
; GFX11: ; %bb.0:
7536-
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
7537-
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
7538-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
7539-
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
7540-
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
7541-
; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
7542-
; GFX11-NEXT: s_waitcnt vmcnt(0)
7543-
; GFX11-NEXT: v_add_f16_e32 v1, 1.0, v1
7544-
; GFX11-NEXT: v_med3_f16 v1, v1, 2.0, 4.0
7545-
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
7546-
; GFX11-NEXT: s_endpgm
7536+
; GFX11-SDAG-FAKE16-LABEL: v_test_nnan_input_fmed3_r_i_i_f16:
7537+
; GFX11-SDAG-FAKE16: ; %bb.0:
7538+
; GFX11-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
7539+
; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
7540+
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
7541+
; GFX11-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
7542+
; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
7543+
; GFX11-SDAG-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
7544+
; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
7545+
; GFX11-SDAG-FAKE16-NEXT: v_add_f16_e32 v1, 1.0, v1
7546+
; GFX11-SDAG-FAKE16-NEXT: v_med3_f16 v1, v1, 2.0, 4.0
7547+
; GFX11-SDAG-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
7548+
; GFX11-SDAG-FAKE16-NEXT: s_endpgm
7549+
;
7550+
; GFX11-GISEL-FAKE16-LABEL: v_test_nnan_input_fmed3_r_i_i_f16:
7551+
; GFX11-GISEL-FAKE16: ; %bb.0:
7552+
; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
7553+
; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
7554+
; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
7555+
; GFX11-GISEL-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
7556+
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
7557+
; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
7558+
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0)
7559+
; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v1, 1.0, v1
7560+
; GFX11-GISEL-FAKE16-NEXT: v_med3_f16 v1, v1, 2.0, 4.0
7561+
; GFX11-GISEL-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
7562+
; GFX11-GISEL-FAKE16-NEXT: s_endpgm
7563+
;
7564+
; GFX11-SDAG-TRUE16-LABEL: v_test_nnan_input_fmed3_r_i_i_f16:
7565+
; GFX11-SDAG-TRUE16: ; %bb.0:
7566+
; GFX11-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
7567+
; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
7568+
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
7569+
; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
7570+
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
7571+
; GFX11-SDAG-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3]
7572+
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
7573+
; GFX11-SDAG-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l
7574+
; GFX11-SDAG-TRUE16-NEXT: v_med3_f16 v0.l, v0.l, 2.0, 4.0
7575+
; GFX11-SDAG-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
7576+
; GFX11-SDAG-TRUE16-NEXT: s_endpgm
7577+
;
7578+
; GFX11-GISEL-TRUE16-LABEL: v_test_nnan_input_fmed3_r_i_i_f16:
7579+
; GFX11-GISEL-TRUE16: ; %bb.0:
7580+
; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
7581+
; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
7582+
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
7583+
; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
7584+
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
7585+
; GFX11-GISEL-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3]
7586+
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
7587+
; GFX11-GISEL-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l
7588+
; GFX11-GISEL-TRUE16-NEXT: v_med3_f16 v0.l, v0.l, 2.0, 4.0
7589+
; GFX11-GISEL-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
7590+
; GFX11-GISEL-TRUE16-NEXT: s_endpgm
75477591
%tid = call i32 @llvm.amdgcn.workitem.id.x()
75487592
%gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid
75497593
%outgep = getelementptr half, ptr addrspace(1) %out, i32 %tid
@@ -7723,26 +7767,92 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt
77237767
; GFX9-NEXT: global_store_short v0, v1, s[8:9]
77247768
; GFX9-NEXT: s_endpgm
77257769
;
7726-
; GFX11-LABEL: v_nnan_inputs_med3_f16_pat0:
7727-
; GFX11: ; %bb.0:
7728-
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
7729-
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
7730-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
7731-
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
7732-
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
7733-
; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
7734-
; GFX11-NEXT: s_waitcnt vmcnt(0)
7735-
; GFX11-NEXT: global_load_u16 v2, v0, s[4:5] glc dlc
7736-
; GFX11-NEXT: s_waitcnt vmcnt(0)
7737-
; GFX11-NEXT: global_load_u16 v3, v0, s[6:7] glc dlc
7738-
; GFX11-NEXT: s_waitcnt vmcnt(0)
7739-
; GFX11-NEXT: v_add_f16_e32 v1, 1.0, v1
7740-
; GFX11-NEXT: v_add_f16_e32 v2, 2.0, v2
7741-
; GFX11-NEXT: v_add_f16_e32 v3, 4.0, v3
7742-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
7743-
; GFX11-NEXT: v_med3_f16 v1, v1, v2, v3
7744-
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
7745-
; GFX11-NEXT: s_endpgm
7770+
; GFX11-SDAG-FAKE16-LABEL: v_nnan_inputs_med3_f16_pat0:
7771+
; GFX11-SDAG-FAKE16: ; %bb.0:
7772+
; GFX11-SDAG-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
7773+
; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
7774+
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
7775+
; GFX11-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
7776+
; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
7777+
; GFX11-SDAG-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
7778+
; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
7779+
; GFX11-SDAG-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] glc dlc
7780+
; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
7781+
; GFX11-SDAG-FAKE16-NEXT: global_load_u16 v3, v0, s[6:7] glc dlc
7782+
; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
7783+
; GFX11-SDAG-FAKE16-NEXT: v_add_f16_e32 v1, 1.0, v1
7784+
; GFX11-SDAG-FAKE16-NEXT: v_add_f16_e32 v2, 2.0, v2
7785+
; GFX11-SDAG-FAKE16-NEXT: v_add_f16_e32 v3, 4.0, v3
7786+
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
7787+
; GFX11-SDAG-FAKE16-NEXT: v_med3_f16 v1, v1, v2, v3
7788+
; GFX11-SDAG-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
7789+
; GFX11-SDAG-FAKE16-NEXT: s_endpgm
7790+
;
7791+
; GFX11-GISEL-FAKE16-LABEL: v_nnan_inputs_med3_f16_pat0:
7792+
; GFX11-GISEL-FAKE16: ; %bb.0:
7793+
; GFX11-GISEL-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
7794+
; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
7795+
; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
7796+
; GFX11-GISEL-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
7797+
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
7798+
; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
7799+
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0)
7800+
; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] glc dlc
7801+
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0)
7802+
; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v3, v0, s[6:7] glc dlc
7803+
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0)
7804+
; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v1, 1.0, v1
7805+
; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v2, 2.0, v2
7806+
; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v3, 4.0, v3
7807+
; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
7808+
; GFX11-GISEL-FAKE16-NEXT: v_med3_f16 v1, v1, v2, v3
7809+
; GFX11-GISEL-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
7810+
; GFX11-GISEL-FAKE16-NEXT: s_endpgm
7811+
;
7812+
; GFX11-SDAG-TRUE16-LABEL: v_nnan_inputs_med3_f16_pat0:
7813+
; GFX11-SDAG-TRUE16: ; %bb.0:
7814+
; GFX11-SDAG-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
7815+
; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
7816+
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
7817+
; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
7818+
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
7819+
; GFX11-SDAG-TRUE16-NEXT: global_load_u16 v0, v2, s[2:3] glc dlc
7820+
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
7821+
; GFX11-SDAG-TRUE16-NEXT: global_load_u16 v1, v2, s[4:5] glc dlc
7822+
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
7823+
; GFX11-SDAG-TRUE16-NEXT: global_load_u16 v3, v2, s[6:7] glc dlc
7824+
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
7825+
; GFX11-SDAG-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l
7826+
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
7827+
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l
7828+
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
7829+
; GFX11-SDAG-TRUE16-NEXT: v_add_f16_e32 v0.h, 2.0, v0.h
7830+
; GFX11-SDAG-TRUE16-NEXT: v_add_f16_e32 v1.l, 4.0, v1.l
7831+
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
7832+
; GFX11-SDAG-TRUE16-NEXT: v_med3_f16 v0.l, v0.l, v0.h, v1.l
7833+
; GFX11-SDAG-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
7834+
; GFX11-SDAG-TRUE16-NEXT: s_endpgm
7835+
;
7836+
; GFX11-GISEL-TRUE16-LABEL: v_nnan_inputs_med3_f16_pat0:
7837+
; GFX11-GISEL-TRUE16: ; %bb.0:
7838+
; GFX11-GISEL-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
7839+
; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
7840+
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
7841+
; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
7842+
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
7843+
; GFX11-GISEL-TRUE16-NEXT: global_load_u16 v0, v2, s[2:3] glc dlc
7844+
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
7845+
; GFX11-GISEL-TRUE16-NEXT: global_load_u16 v1, v2, s[4:5] glc dlc
7846+
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
7847+
; GFX11-GISEL-TRUE16-NEXT: global_load_u16 v3, v2, s[6:7] glc dlc
7848+
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
7849+
; GFX11-GISEL-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l
7850+
; GFX11-GISEL-TRUE16-NEXT: v_add_f16_e32 v0.h, 2.0, v1.l
7851+
; GFX11-GISEL-TRUE16-NEXT: v_add_f16_e32 v1.l, 4.0, v3.l
7852+
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
7853+
; GFX11-GISEL-TRUE16-NEXT: v_med3_f16 v0.l, v0.l, v0.h, v1.l
7854+
; GFX11-GISEL-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
7855+
; GFX11-GISEL-TRUE16-NEXT: s_endpgm
77467856
%tid = call i32 @llvm.amdgcn.workitem.id.x()
77477857
%gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid
77487858
%gep1 = getelementptr half, ptr addrspace(1) %bptr, i32 %tid

0 commit comments

Comments
 (0)