Skip to content

Commit 1ebcb75

Browse files
committed
[AMDGPU] Fix FMA Combine
Update the check in the FMA combine to test for dot10-insts instead of dot7-insts. The target of the combine, v_dot2_f32_f16, is available only if the dot10-insts target feature is enabled. The issue probably dates back to the change that split dot10-insts out of dot7-insts. As far as I can see, this does not affect any current targets, but a future target that has dot7-insts but not dot10-insts would crash ("cannot select") on the input IR in the test.
1 parent 54c6a59 commit 1ebcb75

File tree

2 files changed

+71
-1
lines changed

2 files changed

+71
-1
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14696,7 +14696,7 @@ SDValue SITargetLowering::performFMACombine(SDNode *N,
1469614696
EVT VT = N->getValueType(0);
1469714697
SDLoc SL(N);
1469814698

14699-
if (!Subtarget->hasDot7Insts() || VT != MVT::f32)
14699+
if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
1470014700
return SDValue();
1470114701

1470214702
// FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr="+dot7-insts,-dot10-insts,-dot5-insts" < %s | FileCheck %s -check-prefix=DISABLED
3+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr="+dot7-insts,+dot10-insts,-dot5-insts" < %s | FileCheck %s -check-prefix=ENABLED
4+
5+
; Test that FMACombine is disabled for a target without dot10-insts, and enabled for a target with dot10-insts.
6+
7+
define amdgpu_kernel void @func() {
8+
; DISABLED-LABEL: func:
9+
; DISABLED: ; %bb.0: ; %bb
10+
; DISABLED-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
11+
; DISABLED-NEXT: v_mov_b32_e32 v2, 0
12+
; DISABLED-NEXT: s_mov_b32 vcc_lo, exec_lo
13+
; DISABLED-NEXT: .LBB0_1: ; %main
14+
; DISABLED-NEXT: ; =>This Inner Loop Header: Depth=1
15+
; DISABLED-NEXT: ds_load_b128 v[3:6], v1
16+
; DISABLED-NEXT: ds_store_b32 v1, v0
17+
; DISABLED-NEXT: s_waitcnt lgkmcnt(1)
18+
; DISABLED-NEXT: v_fma_mix_f32 v2, v4, v3, v2 op_sel_hi:[1,1,0]
19+
; DISABLED-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
20+
; DISABLED-NEXT: v_fma_mix_f32 v2, v4, v3, v2 op_sel:[1,1,0] op_sel_hi:[1,1,0]
21+
; DISABLED-NEXT: v_add_f32_e32 v2, 0, v2
22+
; DISABLED-NEXT: s_cbranch_vccnz .LBB0_1
23+
; DISABLED-NEXT: ; %bb.2: ; %DummyReturnBlock
24+
; DISABLED-NEXT: s_endpgm
25+
;
26+
; ENABLED-LABEL: func:
27+
; ENABLED: ; %bb.0: ; %bb
28+
; ENABLED-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
29+
; ENABLED-NEXT: v_mov_b32_e32 v2, 0
30+
; ENABLED-NEXT: s_mov_b32 vcc_lo, exec_lo
31+
; ENABLED-NEXT: .LBB0_1: ; %main
32+
; ENABLED-NEXT: ; =>This Inner Loop Header: Depth=1
33+
; ENABLED-NEXT: ds_load_b128 v[3:6], v1
34+
; ENABLED-NEXT: ds_store_b32 v1, v0
35+
; ENABLED-NEXT: s_waitcnt lgkmcnt(1)
36+
; ENABLED-NEXT: v_dot2_f32_f16 v2, v4, v3, v2
37+
; ENABLED-NEXT: s_delay_alu instid0(VALU_DEP_1)
38+
; ENABLED-NEXT: v_add_f32_e32 v2, 0, v2
39+
; ENABLED-NEXT: s_cbranch_vccnz .LBB0_1
40+
; ENABLED-NEXT: ; %bb.2: ; %DummyReturnBlock
41+
; ENABLED-NEXT: s_endpgm
42+
bb:
43+
br label %main
44+
45+
main: ; preds = %main, %bb
46+
%.sroa.3 = phi float [ 0.000000e+00, %bb ], [ %i17, %main ]
47+
%.sroa.4 = phi float [ 0.000000e+00, %bb ], [ %.sroa.3, %main ]
48+
%i = load <8 x half>, ptr addrspace(3) null, align 16
49+
%i1 = bitcast <8 x half> %i to i128
50+
%i2 = trunc i128 %i1 to i32
51+
%i3 = bitcast i32 %i2 to <2 x half>
52+
%.sroa.1 = extractelement <2 x half> %i3, i64 0
53+
%.sroa.2 = extractelement <2 x half> %i3, i64 1
54+
%i4 = shufflevector <8 x half> %i, <8 x half> zeroinitializer, <2 x i32> <i32 2, i32 3>
55+
%i5 = fpext <2 x half> %i4 to <2 x float>
56+
%i6 = fpext half %.sroa.1 to float
57+
%i7 = fpext half %.sroa.2 to float
58+
%i8 = extractelement <2 x float> %i5, i64 0
59+
%i9 = fmul contract float %i8, %i6
60+
%i10 = fadd contract float %.sroa.3, %i9
61+
%i11 = extractelement <2 x float> %i5, i64 1
62+
%i12 = fmul contract float %i11, %i7
63+
%i13 = fadd contract float %i12, %i10
64+
%i14 = insertelement <2 x float> zeroinitializer, float %i13, i64 0
65+
%i15 = insertelement <2 x float> %i14, float %.sroa.4, i64 1
66+
%i16 = fadd <2 x float> %i15, zeroinitializer
67+
store <2 x half> zeroinitializer, ptr addrspace(3) null, align 4
68+
%i17 = extractelement <2 x float> %i16, i64 0
69+
br label %main
70+
}

0 commit comments

Comments
 (0)