|
| 1 | +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 |
| 2 | +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr="+dot7-insts,-dot10-insts,-dot5-insts" < %s | FileCheck %s -check-prefix=DISABLED |
| 3 | +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr="+dot7-insts,+dot10-insts,-dot5-insts" < %s | FileCheck %s -check-prefix=ENABLED |
| 4 | + |
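| 5 | +; Test that the FMA combine that folds a chain of two f16 multiply-adds into v_dot2_f32_f16 is disabled for a target without dot10-insts (two v_fma_mix_f32 are emitted instead), and enabled for a target with dot10-insts. |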
| 6 | + |
| 7 | +define amdgpu_kernel void @func() { ; kernel whose loop chains two contract f16->f32 multiply-adds into one accumulator; this is the pattern whose selection differs between the two RUN lines |
| 8 | +; DISABLED-LABEL: func: |
| 9 | +; DISABLED: ; %bb.0: ; %bb |
| 10 | +; DISABLED-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0 |
| 11 | +; DISABLED-NEXT: v_mov_b32_e32 v2, 0 |
| 12 | +; DISABLED-NEXT: s_mov_b32 vcc_lo, exec_lo |
| 13 | +; DISABLED-NEXT: .LBB0_1: ; %main |
| 14 | +; DISABLED-NEXT: ; =>This Inner Loop Header: Depth=1 |
| 15 | +; DISABLED-NEXT: ds_load_b128 v[3:6], v1 |
| 16 | +; DISABLED-NEXT: ds_store_b32 v1, v0 |
| 17 | +; DISABLED-NEXT: s_waitcnt lgkmcnt(1) |
| 18 | +; DISABLED-NEXT: v_fma_mix_f32 v2, v4, v3, v2 op_sel_hi:[1,1,0] |
| 19 | +; DISABLED-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| 20 | +; DISABLED-NEXT: v_fma_mix_f32 v2, v4, v3, v2 op_sel:[1,1,0] op_sel_hi:[1,1,0] |
| 21 | +; DISABLED-NEXT: v_add_f32_e32 v2, 0, v2 |
| 22 | +; DISABLED-NEXT: s_cbranch_vccnz .LBB0_1 |
| 23 | +; DISABLED-NEXT: ; %bb.2: ; %DummyReturnBlock |
| 24 | +; DISABLED-NEXT: s_endpgm |
| 25 | +; |
| 26 | +; ENABLED-LABEL: func: |
| 27 | +; ENABLED: ; %bb.0: ; %bb |
| 28 | +; ENABLED-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0 |
| 29 | +; ENABLED-NEXT: v_mov_b32_e32 v2, 0 |
| 30 | +; ENABLED-NEXT: s_mov_b32 vcc_lo, exec_lo |
| 31 | +; ENABLED-NEXT: .LBB0_1: ; %main |
| 32 | +; ENABLED-NEXT: ; =>This Inner Loop Header: Depth=1 |
| 33 | +; ENABLED-NEXT: ds_load_b128 v[3:6], v1 |
| 34 | +; ENABLED-NEXT: ds_store_b32 v1, v0 |
| 35 | +; ENABLED-NEXT: s_waitcnt lgkmcnt(1) |
| 36 | +; ENABLED-NEXT: v_dot2_f32_f16 v2, v4, v3, v2 |
| 37 | +; ENABLED-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| 38 | +; ENABLED-NEXT: v_add_f32_e32 v2, 0, v2 |
| 39 | +; ENABLED-NEXT: s_cbranch_vccnz .LBB0_1 |
| 40 | +; ENABLED-NEXT: ; %bb.2: ; %DummyReturnBlock |
| 41 | +; ENABLED-NEXT: s_endpgm |
| 42 | +bb: |
| 43 | + br label %main |
| 44 | + |
| 45 | +main: ; preds = %main, %bb |
| 46 | + %.sroa.3 = phi float [ 0.000000e+00, %bb ], [ %i17, %main ] ; loop-carried accumulator: 0.0 on entry, %i17 from the previous iteration |
| 47 | + %.sroa.4 = phi float [ 0.000000e+00, %bb ], [ %.sroa.3, %main ] ; 0.0 on entry, then the accumulator value from one iteration earlier |
| 48 | + %i = load <8 x half>, ptr addrspace(3) null, align 16 ; 128-bit load of eight halves from addrspace(3) address null |
| 49 | + %i1 = bitcast <8 x half> %i to i128 ; reinterpret the whole vector as a single integer |
| 50 | + %i2 = trunc i128 %i1 to i32 ; keep only the low 32 bits |
| 51 | + %i3 = bitcast i32 %i2 to <2 x half> ; view those 32 bits as a pair of halves |
| 52 | + %.sroa.1 = extractelement <2 x half> %i3, i64 0 ; first half of the pair |
| 53 | + %.sroa.2 = extractelement <2 x half> %i3, i64 1 ; second half of the pair |
| 54 | + %i4 = shufflevector <8 x half> %i, <8 x half> zeroinitializer, <2 x i32> <i32 2, i32 3> ; select elements 2 and 3 of the loaded vector |
| 55 | + %i5 = fpext <2 x half> %i4 to <2 x float> ; widen elements 2 and 3 to f32 |
| 56 | + %i6 = fpext half %.sroa.1 to float ; widen the first half of %i3 to f32 |
| 57 | + %i7 = fpext half %.sroa.2 to float ; widen the second half of %i3 to f32 |
| 58 | + %i8 = extractelement <2 x float> %i5, i64 0 |
| 59 | + %i9 = fmul contract float %i8, %i6 ; first product; the contract flag permits fusing with the following fadd |
| 60 | + %i10 = fadd contract float %.sroa.3, %i9 ; accumulator + first product |
| 61 | + %i11 = extractelement <2 x float> %i5, i64 1 |
| 62 | + %i12 = fmul contract float %i11, %i7 ; second product |
| 63 | + %i13 = fadd contract float %i12, %i10 ; second product + (accumulator + first product): end of the two-term multiply-add chain |
| 64 | + %i14 = insertelement <2 x float> zeroinitializer, float %i13, i64 0 ; lane 0 = chained sum |
| 65 | + %i15 = insertelement <2 x float> %i14, float %.sroa.4, i64 1 ; lane 1 = %.sroa.4 |
| 66 | + %i16 = fadd <2 x float> %i15, zeroinitializer ; vector add of zero applied to both lanes |
| 67 | + store <2 x half> zeroinitializer, ptr addrspace(3) null, align 4 ; write two zero halves back to the same addrspace(3) null address |
| 68 | + %i17 = extractelement <2 x float> %i16, i64 0 ; lane 0 feeds the %.sroa.3 phi on the next iteration |
| 69 | + br label %main ; unconditional back-edge: the IR loop has no exit |
| 70 | +} |
0 commit comments