From 6a78b1798110176467ee924f77ae0331ad05d6a3 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Mon, 17 Mar 2025 11:34:58 -0400 Subject: [PATCH] [NFC][AMDGPU] Auto generate check lines for `llvm/test/CodeGen/AMDGPU/packed-fp32.ll` --- llvm/test/CodeGen/AMDGPU/packed-fp32.ll | 2003 ++++++++++++++++++++--- 1 file changed, 1810 insertions(+), 193 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll index 2004e1eb061bf..28a995e74f7ab 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll @@ -1,13 +1,34 @@ -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX900 %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,PACKED-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,PACKED-GISEL %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,PACKED-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,PACKED-GISEL %s - -; GCN-LABEL: {{^}}fadd_v2_vv: -; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; PACKED: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX900 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL %s + define amdgpu_kernel void @fadd_v2_vv(ptr addrspace(1) %a) { +; GFX900-LABEL: fadd_v2_vv: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX900-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-LABEL: fadd_v2_vv: +; PACKED: ; %bb.0: +; PACKED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[0:1] +; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -16,10 +37,30 @@ define amdgpu_kernel void @fadd_v2_vv(ptr addrspace(1) %a) { ret void } -; GCN-LABEL: {{^}}fadd_v2_vs: -; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; PACKED: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fadd_v2_vs(ptr addrspace(1) %a, <2 x float> %x) { +; GFX900-LABEL: fadd_v2_vs: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v1, s3, v1 +; GFX900-NEXT: v_add_f32_e32 v0, s2, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-LABEL: fadd_v2_vs: +; PACKED: ; %bb.0: +; PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; PACKED-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] +; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -28,10 +69,49 @@ define amdgpu_kernel void @fadd_v2_vs(ptr addrspace(1) %a, <2 x float> %x) { ret void } -; GCN-LABEL: {{^}}fadd_v4_vs: -; GFX900-COUNT-4: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; PACKED-COUNT-2: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fadd_v4_vs(ptr addrspace(1) %a, <4 x float> %x) { +; GFX900-LABEL: fadd_v4_vs: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v3, s3, v3 +; GFX900-NEXT: v_add_f32_e32 v2, s2, v2 +; GFX900-NEXT: v_add_f32_e32 v1, s1, v1 +; GFX900-NEXT: v_add_f32_e32 v0, s0, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fadd_v4_vs: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[2:3] +; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[0:1] +; PACKED-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fadd_v4_vs: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; PACKED-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[0:1] +; PACKED-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[2:3] +; PACKED-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i32 %id %load = load <4 x float>, ptr addrspace(1) %gep, align 16 @@ -40,10 +120,163 @@ define amdgpu_kernel void @fadd_v4_vs(ptr addrspace(1) %a, <4 x float> %x) { ret void } -; GCN-LABEL: {{^}}fadd_v32_vs: -; GFX900-COUNT-32: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; PACKED-COUNT-16: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { +; GFX900-LABEL: fadd_v32_vs: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16 +; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1] +; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48 +; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32 +; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80 +; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64 +; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 +; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 +; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 +; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v4, s43, v4 +; GFX900-NEXT: v_add_f32_e32 v3, s42, v3 +; GFX900-NEXT: v_add_f32_e32 v2, s41, v2 +; GFX900-NEXT: v_add_f32_e32 v1, s40, v1 +; GFX900-NEXT: s_waitcnt vmcnt(6) +; GFX900-NEXT: v_add_f32_e32 v8, s39, v8 +; GFX900-NEXT: v_add_f32_e32 v7, s38, v7 +; GFX900-NEXT: v_add_f32_e32 v6, s37, v6 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v32, s19, v32 +; GFX900-NEXT: v_add_f32_e32 v31, s18, v31 +; GFX900-NEXT: v_add_f32_e32 v30, s17, v30 +; GFX900-NEXT: v_add_f32_e32 v29, s16, v29 +; GFX900-NEXT: v_add_f32_e32 v5, s36, v5 +; GFX900-NEXT: v_add_f32_e32 v12, s51, v12 +; GFX900-NEXT: v_add_f32_e32 v11, s50, v11 +; GFX900-NEXT: v_add_f32_e32 v10, s49, v10 +; GFX900-NEXT: v_add_f32_e32 v9, s48, v9 +; GFX900-NEXT: v_add_f32_e32 v16, s47, v16 +; GFX900-NEXT: v_add_f32_e32 v15, s46, v15 +; GFX900-NEXT: v_add_f32_e32 v14, s45, v14 +; GFX900-NEXT: v_add_f32_e32 v13, s44, v13 +; GFX900-NEXT: v_add_f32_e32 v20, s15, v20 +; GFX900-NEXT: v_add_f32_e32 v19, s14, v19 +; GFX900-NEXT: v_add_f32_e32 v18, s13, v18 +; GFX900-NEXT: v_add_f32_e32 v17, s12, v17 +; GFX900-NEXT: v_add_f32_e32 v24, s11, v24 +; GFX900-NEXT: v_add_f32_e32 v23, s10, v23 +; GFX900-NEXT: v_add_f32_e32 v22, s9, v22 +; GFX900-NEXT: v_add_f32_e32 v21, s8, v21 +; GFX900-NEXT: v_add_f32_e32 v28, s23, v28 +; GFX900-NEXT: v_add_f32_e32 v27, s22, v27 +; GFX900-NEXT: v_add_f32_e32 v26, s21, v26 +; GFX900-NEXT: v_add_f32_e32 v25, s20, v25 +; GFX900-NEXT: global_store_dwordx4 v0, v[29:32], s[0:1] offset:96 +; GFX900-NEXT: global_store_dwordx4 v0, v[25:28], s[0:1] offset:112 +; GFX900-NEXT: global_store_dwordx4 v0, v[21:24], s[0:1] offset:64 +; GFX900-NEXT: global_store_dwordx4 v0, v[17:20], s[0:1] offset:80 +; GFX900-NEXT: global_store_dwordx4 v0, v[13:16], s[0:1] offset:32 +; GFX900-NEXT: global_store_dwordx4 v0, v[9:12], s[0:1] offset:48 +; GFX900-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1] +; GFX900-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] offset:16 +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fadd_v32_vs: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16 +; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] +; PACKED-SDAG-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:48 +; PACKED-SDAG-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:32 +; PACKED-SDAG-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 +; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64 +; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112 +; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[40:41] +; PACKED-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[42:43] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6) +; PACKED-SDAG-NEXT: v_pk_add_f32 v[6:7], v[6:7], s[38:39] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(5) +; PACKED-SDAG-NEXT: v_pk_add_f32 v[8:9], v[8:9], s[48:49] +; PACKED-SDAG-NEXT: v_pk_add_f32 v[10:11], v[10:11], s[50:51] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(4) +; PACKED-SDAG-NEXT: v_pk_add_f32 v[16:17], v[16:17], s[44:45] +; PACKED-SDAG-NEXT: v_pk_add_f32 v[18:19], v[18:19], s[46:47] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_add_f32 v[28:29], v[28:29], s[16:17] +; PACKED-SDAG-NEXT: v_pk_add_f32 v[30:31], v[30:31], s[18:19] +; PACKED-SDAG-NEXT: v_pk_add_f32 v[20:21], v[20:21], s[12:13] +; PACKED-SDAG-NEXT: v_pk_add_f32 v[22:23], v[22:23], s[14:15] +; PACKED-SDAG-NEXT: v_pk_add_f32 v[14:15], v[14:15], s[10:11] +; PACKED-SDAG-NEXT: v_pk_add_f32 v[24:25], v[24:25], s[20:21] +; PACKED-SDAG-NEXT: v_pk_add_f32 v[26:27], v[26:27], s[22:23] +; PACKED-SDAG-NEXT: v_pk_add_f32 v[4:5], v[4:5], s[36:37] +; PACKED-SDAG-NEXT: v_pk_add_f32 v[12:13], v[12:13], s[8:9] +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:96 +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:112 +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:64 +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:48 +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16 +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fadd_v32_vs: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] +; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 +; PACKED-GISEL-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 +; PACKED-GISEL-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 +; PACKED-GISEL-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64 +; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 +; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 +; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[36:37] +; PACKED-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[38:39] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6) +; PACKED-GISEL-NEXT: v_pk_add_f32 v[4:5], v[4:5], s[40:41] +; PACKED-GISEL-NEXT: v_pk_add_f32 v[6:7], v[6:7], s[42:43] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(5) +; PACKED-GISEL-NEXT: v_pk_add_f32 v[8:9], v[8:9], s[44:45] +; PACKED-GISEL-NEXT: v_pk_add_f32 v[10:11], v[10:11], s[46:47] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(4) +; PACKED-GISEL-NEXT: v_pk_add_f32 v[12:13], v[12:13], s[48:49] +; PACKED-GISEL-NEXT: v_pk_add_f32 v[14:15], v[14:15], s[50:51] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(3) +; PACKED-GISEL-NEXT: v_pk_add_f32 v[16:17], v[16:17], s[8:9] +; PACKED-GISEL-NEXT: v_pk_add_f32 v[18:19], v[18:19], s[10:11] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(2) +; PACKED-GISEL-NEXT: v_pk_add_f32 v[20:21], v[20:21], s[12:13] +; PACKED-GISEL-NEXT: v_pk_add_f32 v[22:23], v[22:23], s[14:15] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(1) +; PACKED-GISEL-NEXT: v_pk_add_f32 v[24:25], v[24:25], s[16:17] +; PACKED-GISEL-NEXT: v_pk_add_f32 v[26:27], v[26:27], s[18:19] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_add_f32 v[28:29], v[28:29], s[20:21] +; PACKED-GISEL-NEXT: v_pk_add_f32 v[30:31], v[30:31], s[22:23] +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id %load = load <32 x float>, ptr addrspace(1) %gep, align 128 @@ -53,13 +286,45 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { } ; FIXME: GISel does not use op_sel for splat constants. - -; GCN-LABEL: {{^}}fadd_v2_v_imm: -; PACKED: s_mov_b32 s[[K:[0-9]+]], 0x42c80000 -; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, 0x42c80000, v{{[0-9]+}} -; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K]]:{{[0-9:]+}}] op_sel_hi:[1,0]{{$}} -; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K]]:{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fadd_v2_v_imm(ptr addrspace(1) %a) { +; GFX900-LABEL: fadd_v2_v_imm: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v1, 0x42c80000, v1 +; GFX900-NEXT: v_add_f32_e32 v0, 0x42c80000, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fadd_v2_v_imm: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-SDAG-NEXT: s_mov_b32 s2, 0x42c80000 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0] +; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fadd_v2_v_imm: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-GISEL-NEXT: s_mov_b32 s2, 0x42c80000 +; PACKED-GISEL-NEXT: s_mov_b32 s3, s2 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] +; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -68,11 +333,43 @@ define amdgpu_kernel void @fadd_v2_v_imm(ptr addrspace(1) %a) { ret void } -; GCN-LABEL: {{^}}fadd_v2_v_v_splat: -; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v0 -; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1] op_sel_hi:[1,0]{{$}} -; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1]{{$}} define amdgpu_kernel void @fadd_v2_v_v_splat(ptr addrspace(1) %a) { +; GFX900-LABEL: fadd_v2_v_v_splat: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v2, v2, v0 +; GFX900-NEXT: v_add_f32_e32 v1, v1, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fadd_v2_v_v_splat: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[2:3], v[0:1] op_sel_hi:[1,0] +; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fadd_v2_v_v_splat: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; PACKED-GISEL-NEXT: v_mov_b32_e32 v1, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[2:3], v[0:1] +; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -84,11 +381,42 @@ define amdgpu_kernel void @fadd_v2_v_v_splat(ptr addrspace(1) %a) { ret void } -; GCN-LABEL: {{^}}fadd_v2_v_lit_splat: -; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} -; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 1.0 op_sel_hi:[1,0]{{$}} -; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 1.0{{$}} define amdgpu_kernel void @fadd_v2_v_lit_splat(ptr addrspace(1) %a) { +; GFX900-LABEL: fadd_v2_v_lit_splat: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX900-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fadd_v2_v_lit_splat: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], 1.0 op_sel_hi:[1,0] +; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fadd_v2_v_lit_splat: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], 1.0 +; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -97,12 +425,31 @@ define amdgpu_kernel void @fadd_v2_v_lit_splat(ptr addrspace(1) %a) { ret void } -; GCN-LABEL: {{^}}fadd_v2_v_lit_hi0: -; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} -; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} -; PACKED-DAG: s_mov_b64 [[K:s\[[0-9:]+\]]], 0x3f800000 -; PACKED: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], [[K]] define amdgpu_kernel void @fadd_v2_v_lit_hi0(ptr addrspace(1) %a) { +; GFX900-LABEL: fadd_v2_v_lit_hi0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v1, 0, v1 +; GFX900-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-LABEL: fadd_v2_v_lit_hi0: +; PACKED: ; %bb.0: +; PACKED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-NEXT: s_mov_b64 s[2:3], 0x3f800000 +; PACKED-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] +; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -111,13 +458,32 @@ define amdgpu_kernel void @fadd_v2_v_lit_hi0(ptr addrspace(1) %a) { ret void } -; GCN-LABEL: {{^}}fadd_v2_v_lit_lo0: -; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} -; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} -; PACKED-DAG: s_mov_b32 s[[LO:[0-9]+]], 0 -; PACKED-DAG: s_mov_b32 s[[HI:[0-9]+]], 1.0 -; PACKED: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[LO]]:[[HI]]]{{$}} define amdgpu_kernel void @fadd_v2_v_lit_lo0(ptr addrspace(1) %a) { +; GFX900-LABEL: fadd_v2_v_lit_lo0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX900-NEXT: v_add_f32_e32 v0, 0, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-LABEL: fadd_v2_v_lit_lo0: +; PACKED: ; %bb.0: +; PACKED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-NEXT: s_mov_b32 s2, 0 +; PACKED-NEXT: s_mov_b32 s3, 1.0 +; PACKED-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] +; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -126,13 +492,32 @@ define amdgpu_kernel void @fadd_v2_v_lit_lo0(ptr addrspace(1) %a) { ret void } -; GCN-LABEL: {{^}}fadd_v2_v_unfoldable_lit: -; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} -; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} -; PACKED-DAG: s_mov_b32 s{{[0-9]+}}, 1.0 -; PACKED-DAG: s_mov_b32 s{{[0-9]+}}, 2.0 -; PACKED: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fadd_v2_v_unfoldable_lit(ptr addrspace(1) %a) { +; GFX900-LABEL: fadd_v2_v_unfoldable_lit: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v1, 2.0, v1 +; GFX900-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-LABEL: fadd_v2_v_unfoldable_lit: +; PACKED: ; %bb.0: +; PACKED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-NEXT: s_mov_b32 s2, 1.0 +; PACKED-NEXT: s_mov_b32 s3, 2.0 +; PACKED-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] +; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -142,12 +527,47 @@ define amdgpu_kernel void @fadd_v2_v_unfoldable_lit(ptr addrspace(1) %a) { } ; FIXME: Fold fneg into v_pk_add_f32 with Global ISel. - -; GCN-LABEL: {{^}}fadd_v2_v_fneg: -; GFX900-COUNT-2: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}} -; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fadd_v2_v_fneg(ptr addrspace(1) %a, float %x) { +; GFX900-LABEL: fadd_v2_v_fneg: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_subrev_f32_e32 v1, s2, v1 +; GFX900-NEXT: v_subrev_f32_e32 v0, s2, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fadd_v2_v_fneg: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1] +; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fadd_v2_v_fneg: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; PACKED-GISEL-NEXT: v_max_f32_e64 v2, -s2, -s2 +; PACKED-GISEL-NEXT: v_mov_b32_e32 v3, v2 +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] +; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -159,12 +579,47 @@ define amdgpu_kernel void @fadd_v2_v_fneg(ptr addrspace(1) %a, float %x) { ret void } -; GCN-LABEL: {{^}}fadd_v2_v_fneg_lo: -; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; GFX900-DAG: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0] neg_lo:[0,1]{{$}} -; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fadd_v2_v_fneg_lo(ptr addrspace(1) %a, float %x) { +; GFX900-LABEL: fadd_v2_v_fneg_lo: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v1, s2, v1 +; GFX900-NEXT: v_subrev_f32_e32 v0, s2, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fadd_v2_v_fneg_lo: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0] neg_lo:[0,1] +; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fadd_v2_v_fneg_lo: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; PACKED-GISEL-NEXT: v_mov_b32_e32 v3, s2 +; PACKED-GISEL-NEXT: v_max_f32_e64 v2, -s2, -s2 +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] +; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -176,12 +631,47 @@ define amdgpu_kernel void @fadd_v2_v_fneg_lo(ptr addrspace(1) %a, float %x) { ret void } -; GCN-LABEL: {{^}}fadd_v2_v_fneg_hi: -; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; GFX900-DAG: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0] neg_hi:[0,1]{{$}} -; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fadd_v2_v_fneg_hi(ptr addrspace(1) %a, float %x) { +; GFX900-LABEL: fadd_v2_v_fneg_hi: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_subrev_f32_e32 v1, s2, v1 +; GFX900-NEXT: v_add_f32_e32 v0, s2, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fadd_v2_v_fneg_hi: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0] neg_hi:[0,1] +; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fadd_v2_v_fneg_hi: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; PACKED-GISEL-NEXT: v_max_f32_e64 v3, -s2, -s2 +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] +; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -193,12 +683,44 @@ define amdgpu_kernel void @fadd_v2_v_fneg_hi(ptr addrspace(1) %a, float %x) { ret void } -; GCN-LABEL: {{^}}fadd_v2_v_fneg_lo2: -; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; GFX900-DAG: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] neg_lo:[0,1]{{$}} -; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fadd_v2_v_fneg_lo2(ptr addrspace(1) %a, float %x, float %y) { +; GFX900-LABEL: fadd_v2_v_fneg_lo2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v1, s3, v1 +; GFX900-NEXT: v_subrev_f32_e32 v0, s2, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fadd_v2_v_fneg_lo2: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] neg_lo:[0,1] +; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fadd_v2_v_fneg_lo2: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; PACKED-GISEL-NEXT: v_max_f32_e64 v2, -s2, -s2 +; PACKED-GISEL-NEXT: v_mov_b32_e32 v3, s3 +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] +; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -210,12 +732,44 @@ define amdgpu_kernel void @fadd_v2_v_fneg_lo2(ptr addrspace(1) %a, float %x, flo ret void } -; GCN-LABEL: {{^}}fadd_v2_v_fneg_hi2: -; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; GFX900-DAG: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel:[0,1] op_sel_hi:[1,0] neg_hi:[0,1]{{$}} -; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fadd_v2_v_fneg_hi2(ptr addrspace(1) %a, float %x, float %y) { +; GFX900-LABEL: fadd_v2_v_fneg_hi2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_subrev_f32_e32 v1, s2, v1 +; GFX900-NEXT: v_add_f32_e32 v0, s3, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fadd_v2_v_fneg_hi2: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel:[0,1] op_sel_hi:[1,0] neg_hi:[0,1] +; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fadd_v2_v_fneg_hi2: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; PACKED-GISEL-NEXT: v_max_f32_e64 v3, -s2, -s2 +; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, s3 +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] +; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -227,10 +781,30 @@ define amdgpu_kernel void @fadd_v2_v_fneg_hi2(ptr addrspace(1) %a, float %x, flo ret void } -; GCN-LABEL: {{^}}fmul_v2_vv: -; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; PACKED: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] define amdgpu_kernel void @fmul_v2_vv(ptr addrspace(1) %a) { +; GFX900-LABEL: fmul_v2_vv: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mul_f32_e32 v1, v1, v1 +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-LABEL: fmul_v2_vv: +; PACKED: ; %bb.0: +; PACKED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[0:1] +; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -239,10 +813,30 @@ define amdgpu_kernel void @fmul_v2_vv(ptr addrspace(1) %a) { ret void } -; GCN-LABEL: {{^}}fmul_v2_vs: -; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; PACKED: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fmul_v2_vs(ptr addrspace(1) %a, <2 x float> %x) { +; GFX900-LABEL: fmul_v2_vs: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mul_f32_e32 v1, s3, v1 +; GFX900-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-LABEL: fmul_v2_vs: +; PACKED: ; %bb.0: +; PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; PACKED-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3] +; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -251,10 +845,49 @@ define amdgpu_kernel void @fmul_v2_vs(ptr addrspace(1) %a, <2 x float> %x) { ret void } -; GCN-LABEL: {{^}}fmul_v4_vs: -; GFX900-COUNT-4: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; PACKED-COUNT-2: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fmul_v4_vs(ptr addrspace(1) %a, <4 x float> %x) { +; GFX900-LABEL: fmul_v4_vs: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mul_f32_e32 v3, s3, v3 +; GFX900-NEXT: v_mul_f32_e32 v2, s2, v2 +; GFX900-NEXT: v_mul_f32_e32 v1, s1, v1 +; GFX900-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fmul_v4_vs: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[2:3] +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[0:1] +; PACKED-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fmul_v4_vs: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; PACKED-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[0:1] +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[2:3] +; PACKED-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i32 %id %load = load <4 x float>, ptr addrspace(1) %gep, align 16 @@ -263,10 +896,163 @@ define amdgpu_kernel void @fmul_v4_vs(ptr addrspace(1) %a, <4 x float> %x) { ret void } -; GCN-LABEL: {{^}}fmul_v32_vs: -; GFX900-COUNT-32: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; PACKED-COUNT-16: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { +; GFX900-LABEL: fmul_v32_vs: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16 +; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1] +; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48 +; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32 +; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80 +; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64 +; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 +; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 +; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 +; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; GFX900-NEXT: v_mul_f32_e32 v4, s43, v4 +; GFX900-NEXT: v_mul_f32_e32 v3, s42, v3 +; GFX900-NEXT: v_mul_f32_e32 v2, s41, v2 +; GFX900-NEXT: v_mul_f32_e32 v1, s40, v1 +; GFX900-NEXT: s_waitcnt vmcnt(6) +; GFX900-NEXT: v_mul_f32_e32 v8, s39, v8 +; GFX900-NEXT: v_mul_f32_e32 v7, s38, v7 +; GFX900-NEXT: v_mul_f32_e32 v6, s37, v6 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mul_f32_e32 v32, s19, v32 +; GFX900-NEXT: v_mul_f32_e32 v31, s18, v31 +; GFX900-NEXT: v_mul_f32_e32 v30, s17, v30 +; GFX900-NEXT: v_mul_f32_e32 v29, s16, v29 +; GFX900-NEXT: v_mul_f32_e32 v5, s36, v5 +; GFX900-NEXT: v_mul_f32_e32 v12, s51, v12 +; GFX900-NEXT: v_mul_f32_e32 v11, s50, v11 +; GFX900-NEXT: v_mul_f32_e32 v10, s49, v10 +; GFX900-NEXT: v_mul_f32_e32 v9, s48, v9 +; GFX900-NEXT: v_mul_f32_e32 v16, s47, v16 +; GFX900-NEXT: v_mul_f32_e32 v15, s46, v15 +; GFX900-NEXT: v_mul_f32_e32 v14, s45, v14 +; GFX900-NEXT: v_mul_f32_e32 v13, s44, v13 +; GFX900-NEXT: v_mul_f32_e32 v20, s15, v20 +; GFX900-NEXT: v_mul_f32_e32 v19, s14, v19 +; GFX900-NEXT: v_mul_f32_e32 v18, s13, v18 +; GFX900-NEXT: v_mul_f32_e32 v17, s12, v17 +; GFX900-NEXT: v_mul_f32_e32 v24, s11, v24 +; GFX900-NEXT: v_mul_f32_e32 v23, s10, v23 +; GFX900-NEXT: v_mul_f32_e32 v22, s9, v22 +; GFX900-NEXT: v_mul_f32_e32 v21, s8, v21 +; GFX900-NEXT: v_mul_f32_e32 v28, s23, v28 +; GFX900-NEXT: v_mul_f32_e32 v27, s22, v27 +; GFX900-NEXT: v_mul_f32_e32 v26, s21, v26 +; GFX900-NEXT: v_mul_f32_e32 v25, s20, v25 +; GFX900-NEXT: global_store_dwordx4 v0, v[29:32], s[0:1] offset:96 +; GFX900-NEXT: global_store_dwordx4 v0, v[25:28], s[0:1] offset:112 +; GFX900-NEXT: global_store_dwordx4 v0, v[21:24], s[0:1] offset:64 +; GFX900-NEXT: global_store_dwordx4 v0, v[17:20], s[0:1] offset:80 +; GFX900-NEXT: global_store_dwordx4 v0, v[13:16], s[0:1] offset:32 +; GFX900-NEXT: global_store_dwordx4 v0, v[9:12], s[0:1] offset:48 +; GFX900-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1] +; GFX900-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] offset:16 +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fmul_v32_vs: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16 +; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] +; PACKED-SDAG-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:48 +; PACKED-SDAG-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:32 +; PACKED-SDAG-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 +; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64 +; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112 +; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[40:41] +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[42:43] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6) +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[6:7], v[6:7], s[38:39] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(5) +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[8:9], v[8:9], s[48:49] +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[10:11], v[10:11], s[50:51] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(4) +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[16:17], v[16:17], s[44:45] +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[18:19], v[18:19], s[46:47] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[28:29], v[28:29], s[16:17] +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[30:31], v[30:31], s[18:19] +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[20:21], v[20:21], s[12:13] +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[22:23], v[22:23], s[14:15] +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[14:15], v[14:15], s[10:11] +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[24:25], v[24:25], s[20:21] +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[26:27], v[26:27], s[22:23] +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[4:5], v[4:5], s[36:37] +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[12:13], v[12:13], s[8:9] +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:96 +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:112 +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:64 +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:48 +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16 +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fmul_v32_vs: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] +; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 +; PACKED-GISEL-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 +; PACKED-GISEL-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 +; PACKED-GISEL-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64 +; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 +; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 +; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[36:37] +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[38:39] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6) +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[4:5], v[4:5], s[40:41] +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[6:7], v[6:7], s[42:43] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(5) +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[8:9], v[8:9], s[44:45] +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[10:11], v[10:11], s[46:47] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(4) +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[12:13], v[12:13], s[48:49] +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[14:15], v[14:15], s[50:51] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(3) +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[16:17], v[16:17], s[8:9] +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[18:19], v[18:19], s[10:11] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(2) +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[20:21], v[20:21], s[12:13] +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[22:23], v[22:23], s[14:15] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(1) +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[24:25], v[24:25], s[16:17] +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[26:27], v[26:27], s[18:19] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[28:29], v[28:29], s[20:21] +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[30:31], v[30:31], s[22:23] +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id %load = load <32 x float>, ptr addrspace(1) %gep, align 128 @@ -275,12 +1061,45 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ret void } -; GCN-LABEL: {{^}}fmul_v2_v_imm: -; PACKED: s_mov_b32 s[[K:[0-9]+]], 0x42c80000 -; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, 0x42c80000, v{{[0-9]+}} -; PACKED-SDAG: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K]]:{{[0-9:]+}}] op_sel_hi:[1,0]{{$}} -; PACKED-GISEL: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K]]:{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fmul_v2_v_imm(ptr addrspace(1) %a) { +; GFX900-LABEL: fmul_v2_v_imm: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mul_f32_e32 v1, 0x42c80000, v1 +; GFX900-NEXT: v_mul_f32_e32 v0, 0x42c80000, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fmul_v2_v_imm: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-SDAG-NEXT: s_mov_b32 s2, 0x42c80000 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0] +; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fmul_v2_v_imm: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-GISEL-NEXT: s_mov_b32 s2, 0x42c80000 +; PACKED-GISEL-NEXT: s_mov_b32 s3, s2 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3] +; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -289,11 +1108,43 @@ define amdgpu_kernel void @fmul_v2_v_imm(ptr addrspace(1) %a) { ret void } -; GCN-LABEL: {{^}}fmul_v2_v_v_splat: -; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v0 -; PACKED-SDAG: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1] op_sel_hi:[1,0]{{$}} -; PACKED-GISEL: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1]{{$}} define amdgpu_kernel void @fmul_v2_v_v_splat(ptr addrspace(1) %a) { +; GFX900-LABEL: fmul_v2_v_v_splat: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mul_f32_e32 v2, v2, v0 +; GFX900-NEXT: v_mul_f32_e32 v1, v1, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fmul_v2_v_v_splat: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[2:3], v[0:1] op_sel_hi:[1,0] +; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fmul_v2_v_v_splat: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; PACKED-GISEL-NEXT: v_mov_b32_e32 v1, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[2:3], v[0:1] +; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -305,11 +1156,42 @@ define amdgpu_kernel void @fmul_v2_v_v_splat(ptr addrspace(1) %a) { ret void } -; GCN-LABEL: {{^}}fmul_v2_v_lit_splat: -; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} -; PACKED-SDAG: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0 op_sel_hi:[1,0]{{$}} -; PACKED-GISEL: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0{{$}} define amdgpu_kernel void @fmul_v2_v_lit_splat(ptr addrspace(1) %a) { +; GFX900-LABEL: fmul_v2_v_lit_splat: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mul_f32_e32 v1, 4.0, v1 +; GFX900-NEXT: v_mul_f32_e32 v0, 4.0, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fmul_v2_v_lit_splat: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], 4.0 op_sel_hi:[1,0] +; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fmul_v2_v_lit_splat: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], 4.0 +; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -318,13 +1200,32 @@ define amdgpu_kernel void @fmul_v2_v_lit_splat(ptr addrspace(1) %a) { ret void } -; GCN-LABEL: {{^}}fmul_v2_v_unfoldable_lit: -; GFX900-DAG: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} -; GFX900-DAG: v_mul_f32_e32 v{{[0-9]+}}, 0x40400000, v{{[0-9]+}} -; PACKED-DAG: s_mov_b32 s{{[0-9]+}}, 4.0 -; PACKED-DAG: s_mov_b32 s{{[0-9]+}}, 0x40400000 -; PACKED: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fmul_v2_v_unfoldable_lit(ptr addrspace(1) %a) { +; GFX900-LABEL: fmul_v2_v_unfoldable_lit: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mul_f32_e32 v1, 0x40400000, v1 +; GFX900-NEXT: v_mul_f32_e32 v0, 4.0, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-LABEL: fmul_v2_v_unfoldable_lit: +; PACKED: ; %bb.0: +; PACKED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-NEXT: s_mov_b32 s2, 4.0 +; PACKED-NEXT: s_mov_b32 s3, 0x40400000 +; PACKED-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3] +; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -333,11 +1234,47 @@ define amdgpu_kernel void @fmul_v2_v_unfoldable_lit(ptr addrspace(1) %a) { ret void } -; GCN-LABEL: {{^}}fmul_v2_v_fneg: -; GFX900-COUNT-2: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -s{{[0-9]+}} -; PACKED-SDAG: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}} -; PACKED-GISEL: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fmul_v2_v_fneg(ptr addrspace(1) %a, float %x) { +; GFX900-LABEL: fmul_v2_v_fneg: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mul_f32_e64 v1, v1, -s2 +; GFX900-NEXT: v_mul_f32_e64 v0, v0, -s2 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fmul_v2_v_fneg: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1] +; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fmul_v2_v_fneg: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; PACKED-GISEL-NEXT: v_max_f32_e64 v2, -s2, -s2 +; PACKED-GISEL-NEXT: v_mov_b32_e32 v3, v2 +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3] +; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -349,10 +1286,30 @@ define amdgpu_kernel void @fmul_v2_v_fneg(ptr addrspace(1) %a, float %x) { ret void } -; GCN-LABEL: {{^}}fma_v2_vv: -; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; PACKED: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] define amdgpu_kernel void @fma_v2_vv(ptr addrspace(1) %a) { +; GFX900-LABEL: fma_v2_vv: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_fma_f32 v1, v1, v1, v1 +; GFX900-NEXT: v_fma_f32 v0, v0, v0, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-LABEL: fma_v2_vv: +; PACKED: ; %bb.0: +; PACKED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[0:1], v[0:1] +; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -361,10 +1318,30 @@ define amdgpu_kernel void @fma_v2_vv(ptr addrspace(1) %a) { ret void } -; GCN-LABEL: {{^}}fma_v2_vs: -; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -; PACKED: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fma_v2_vs(ptr addrspace(1) %a, <2 x float> %x) { +; GFX900-LABEL: fma_v2_vs: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_fma_f32 v1, v1, s3, s3 +; GFX900-NEXT: v_fma_f32 v0, v0, s2, s2 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-LABEL: fma_v2_vs: +; PACKED: ; %bb.0: +; PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; PACKED-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], s[2:3] +; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -373,10 +1350,49 @@ define amdgpu_kernel void @fma_v2_vs(ptr addrspace(1) %a, <2 x float> %x) { ret void } -; GCN-LABEL: {{^}}fma_v4_vs: -; GFX900-COUNT-4: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -; PACKED-COUNT-2: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fma_v4_vs(ptr addrspace(1) %a, <4 x float> %x) { +; GFX900-LABEL: fma_v4_vs: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_fma_f32 v3, v3, s3, s3 +; GFX900-NEXT: v_fma_f32 v2, v2, s2, s2 +; GFX900-NEXT: v_fma_f32 v1, v1, s1, s1 +; GFX900-NEXT: v_fma_f32 v0, v0, s0, s0 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fma_v4_vs: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[2:3], s[2:3] +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[0:1], s[0:1] +; PACKED-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fma_v4_vs: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; PACKED-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[0:1], s[0:1] +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[2:3], s[2:3] +; PACKED-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i32 %id %load = load <4 x float>, ptr addrspace(1) %gep, align 16 @@ -385,10 +1401,163 @@ define amdgpu_kernel void @fma_v4_vs(ptr addrspace(1) %a, <4 x float> %x) { ret void } -; GCN-LABEL: {{^}}fma_v32_vs: -; GFX900-COUNT-32: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -; PACKED-COUNT-16: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { +; GFX900-LABEL: fma_v32_vs: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16 +; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1] +; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48 +; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32 +; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80 +; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64 +; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 +; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 +; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 +; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; GFX900-NEXT: v_fma_f32 v4, v4, s43, s43 +; GFX900-NEXT: v_fma_f32 v3, v3, s42, s42 +; GFX900-NEXT: v_fma_f32 v2, v2, s41, s41 +; GFX900-NEXT: v_fma_f32 v1, v1, s40, s40 +; GFX900-NEXT: s_waitcnt vmcnt(6) +; GFX900-NEXT: v_fma_f32 v8, v8, s39, s39 +; GFX900-NEXT: v_fma_f32 v7, v7, s38, s38 +; GFX900-NEXT: v_fma_f32 v6, v6, s37, s37 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_fma_f32 v32, v32, s19, s19 +; GFX900-NEXT: v_fma_f32 v31, v31, s18, s18 +; GFX900-NEXT: v_fma_f32 v30, v30, s17, s17 +; GFX900-NEXT: v_fma_f32 v29, v29, s16, s16 +; GFX900-NEXT: v_fma_f32 v5, v5, s36, s36 +; GFX900-NEXT: v_fma_f32 v12, v12, s51, s51 +; GFX900-NEXT: v_fma_f32 v11, v11, s50, s50 +; GFX900-NEXT: v_fma_f32 v10, v10, s49, s49 +; GFX900-NEXT: v_fma_f32 v9, v9, s48, s48 +; GFX900-NEXT: v_fma_f32 v16, v16, s47, s47 +; GFX900-NEXT: v_fma_f32 v15, v15, s46, s46 +; GFX900-NEXT: v_fma_f32 v14, v14, s45, s45 +; GFX900-NEXT: v_fma_f32 v13, v13, s44, s44 +; GFX900-NEXT: v_fma_f32 v20, v20, s15, s15 +; GFX900-NEXT: v_fma_f32 v19, v19, s14, s14 +; GFX900-NEXT: v_fma_f32 v18, v18, s13, s13 +; GFX900-NEXT: v_fma_f32 v17, v17, s12, s12 +; GFX900-NEXT: v_fma_f32 v24, v24, s11, s11 +; GFX900-NEXT: v_fma_f32 v23, v23, s10, s10 +; GFX900-NEXT: v_fma_f32 v22, v22, s9, s9 +; GFX900-NEXT: v_fma_f32 v21, v21, s8, s8 +; GFX900-NEXT: v_fma_f32 v28, v28, s23, s23 +; GFX900-NEXT: v_fma_f32 v27, v27, s22, s22 +; GFX900-NEXT: v_fma_f32 v26, v26, s21, s21 +; GFX900-NEXT: v_fma_f32 v25, v25, s20, s20 +; GFX900-NEXT: global_store_dwordx4 v0, v[29:32], s[0:1] offset:96 +; GFX900-NEXT: global_store_dwordx4 v0, v[25:28], s[0:1] offset:112 +; GFX900-NEXT: global_store_dwordx4 v0, v[21:24], s[0:1] offset:64 +; GFX900-NEXT: global_store_dwordx4 v0, v[17:20], s[0:1] offset:80 +; GFX900-NEXT: global_store_dwordx4 v0, v[13:16], s[0:1] offset:32 +; GFX900-NEXT: global_store_dwordx4 v0, v[9:12], s[0:1] offset:48 +; GFX900-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1] +; GFX900-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] offset:16 +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fma_v32_vs: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16 +; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] +; PACKED-SDAG-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:48 +; PACKED-SDAG-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:32 +; PACKED-SDAG-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 +; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64 +; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112 +; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[40:41], s[40:41] +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[42:43], s[42:43] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6) +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[6:7], v[6:7], s[38:39], s[38:39] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(5) +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[8:9], v[8:9], s[48:49], s[48:49] +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[10:11], v[10:11], s[50:51], s[50:51] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(4) +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[16:17], v[16:17], s[44:45], s[44:45] +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[18:19], v[18:19], s[46:47], s[46:47] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[28:29], v[28:29], s[16:17], s[16:17] +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[30:31], v[30:31], s[18:19], s[18:19] +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[20:21], v[20:21], s[12:13], s[12:13] +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[22:23], v[22:23], s[14:15], s[14:15] +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[14:15], v[14:15], s[10:11], s[10:11] +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[24:25], v[24:25], s[20:21], s[20:21] +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[26:27], v[26:27], s[22:23], s[22:23] +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[4:5], v[4:5], s[36:37], s[36:37] +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[12:13], v[12:13], s[8:9], s[8:9] +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:96 +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:112 +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:64 +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:48 +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16 +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fma_v32_vs: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] +; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 +; PACKED-GISEL-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 +; PACKED-GISEL-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 +; PACKED-GISEL-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64 +; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 +; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 +; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[36:37], s[36:37] +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[38:39], s[38:39] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6) +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[4:5], v[4:5], s[40:41], s[40:41] +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[6:7], v[6:7], s[42:43], s[42:43] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(5) +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[8:9], v[8:9], s[44:45], s[44:45] +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[10:11], v[10:11], s[46:47], s[46:47] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(4) +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[12:13], v[12:13], s[48:49], s[48:49] +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[14:15], v[14:15], s[50:51], s[50:51] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(3) +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[16:17], v[16:17], s[8:9], s[8:9] +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[18:19], v[18:19], s[10:11], s[10:11] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(2) +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[20:21], v[20:21], s[12:13], s[12:13] +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[22:23], v[22:23], s[14:15], s[14:15] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(1) +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[24:25], v[24:25], s[16:17], s[16:17] +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[26:27], v[26:27], s[18:19], s[18:19] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[28:29], v[28:29], s[20:21], s[20:21] +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[30:31], v[30:31], s[22:23], s[22:23] +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id %load = load <32 x float>, ptr addrspace(1) %gep, align 128 @@ -397,14 +1566,34 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ret void } -; GCN-LABEL: {{^}}fma_v2_v_imm: -; GCN-DAG: s_mov_b32 s[[K1:[0-9]+]], 0x42c80000 -; GFX900-DAG: v_mov_b32_e32 v[[K2:[0-9]+]], 0x43480000 -; PACKED-SDAG-DAG: v_mov_b32_e32 v[[K2:[0-9]+]], 0x43480000 -; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s[[K1]], v[[K2]] -; PACKED-SDAG: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K1]]:{{[0-9:]+}}], v[[[K2]]:{{[0-9:]+}}] op_sel_hi:[1,0,0]{{$}} -; PACKED-GISEL: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K1]]:{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fma_v2_v_imm(ptr addrspace(1) %a) { +; GFX900-LABEL: fma_v2_v_imm: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_mov_b32 s2, 0x42c80000 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x43480000 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_fma_f32 v1, v1, s2, v3 +; GFX900-NEXT: v_fma_f32 v0, v0, s2, v3 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fma_v2_v_imm: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, 0x43480000 +; PACKED-SDAG-NEXT: s_mov_b32 s2, 0x42c80000 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3] op_sel_hi:[1,0,0] +; PACKED-SDAG-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -413,11 +1602,43 @@ define amdgpu_kernel void @fma_v2_v_imm(ptr addrspace(1) %a) { ret void } -; GCN-LABEL: {{^}}fma_v2_v_v_splat: -; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, v0, v0 -; PACKED-SDAG: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1], v[0:1] op_sel_hi:[1,0,0]{{$}} -; PACKED-GISEL: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1], v[0:1]{{$}} define amdgpu_kernel void @fma_v2_v_v_splat(ptr addrspace(1) %a) { +; GFX900-LABEL: fma_v2_v_v_splat: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_fma_f32 v2, v2, v0, v0 +; GFX900-NEXT: v_fma_f32 v1, v1, v0, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fma_v2_v_v_splat: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[2:3], v[0:1], v[0:1] op_sel_hi:[1,0,0] +; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fma_v2_v_v_splat: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; PACKED-GISEL-NEXT: v_mov_b32_e32 v1, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[2:3], v[0:1], v[0:1] +; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -429,11 +1650,42 @@ define amdgpu_kernel void @fma_v2_v_v_splat(ptr addrspace(1) %a) { ret void } -; GCN-LABEL: {{^}}fma_v2_v_lit_splat: -; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, 4.0, 1.0 -; PACKED-SDAG: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0, 1.0 op_sel_hi:[1,0,0]{{$}} -; PACKED-GISEL: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0, 1.0{{$}} define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) { +; GFX900-LABEL: fma_v2_v_lit_splat: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_fma_f32 v1, v1, 4.0, 1.0 +; GFX900-NEXT: v_fma_f32 v0, v0, 4.0, 1.0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fma_v2_v_lit_splat: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], 4.0, 1.0 op_sel_hi:[1,0,0] +; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fma_v2_v_lit_splat: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], 4.0, 1.0 +; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -442,15 +1694,35 @@ define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) { ret void } -; GCN-LABEL: {{^}}fma_v2_v_unfoldable_lit: -; GCN-DAG: s_mov_b32 s{{[0-9]+}}, 0x40400000 -; GFX900-DAG: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, 4.0, 1.0 -; GFX900-DAG: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, 2.0 -; PACKED-SDAG-DAG: s_mov_b32 s{{[0-9]+}}, 4.0 -; PACKED-SDAG-DAG: v_mov_b32_e32 v{{[0-9]+}}, 1.0 -; PACKED-SDAG-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0 -; PACKED: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fma_v2_v_unfoldable_lit(ptr addrspace(1) %a) { +; GFX900-LABEL: fma_v2_v_unfoldable_lit: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_mov_b32 s2, 0x40400000 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_fma_f32 v1, v1, s2, 2.0 +; GFX900-NEXT: v_fma_f32 v0, v0, 4.0, 1.0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fma_v2_v_unfoldable_lit: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; PACKED-SDAG-NEXT: s_mov_b32 s2, 4.0 +; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, 1.0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; PACKED-SDAG-NEXT: v_mov_b32_e32 v3, 2.0 +; PACKED-SDAG-NEXT: s_mov_b32 s3, 0x40400000 +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3] +; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -459,11 +1731,47 @@ define amdgpu_kernel void @fma_v2_v_unfoldable_lit(ptr addrspace(1) %a) { ret void } -; GCN-LABEL: {{^}}fma_v2_v_fneg: -; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -s{{[0-9]+}}, -s{{[0-9]+}} -; PACKED-SDAG: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0,0] neg_lo:[0,1,1] neg_hi:[0,1,1]{{$}} -; PACKED-GISEL: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fma_v2_v_fneg(ptr addrspace(1) %a, float %x) { +; GFX900-LABEL: fma_v2_v_fneg: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_fma_f32 v1, v1, -s2, -s2 +; GFX900-NEXT: v_fma_f32 v0, v0, -s2, -s2 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fma_v2_v_fneg: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], s[2:3] op_sel_hi:[1,0,0] neg_lo:[0,1,1] neg_hi:[0,1,1] +; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fma_v2_v_fneg: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; PACKED-GISEL-NEXT: v_max_f32_e64 v2, -s2, -s2 +; PACKED-GISEL-NEXT: v_mov_b32_e32 v3, v2 +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[2:3] +; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -475,11 +1783,51 @@ define amdgpu_kernel void @fma_v2_v_fneg(ptr addrspace(1) %a, float %x) { ret void } -; GCN-LABEL: {{^}}add_vector_neg_bitcast_scalar_lo: -; GFX900-COUNT-2: v_sub_f32_e32 -; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}} -; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) { +; GFX900-LABEL: add_vector_neg_bitcast_scalar_lo: +; GFX900: ; %bb.0: ; %bb +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, s2 +; GFX900-NEXT: v_mov_b32_e32 v2, s3 +; GFX900-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 +; GFX900-NEXT: ds_read_b32 v2, v2 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX900-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: add_vector_neg_bitcast_scalar_lo: +; PACKED-SDAG: ; %bb.0: ; %bb +; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; PACKED-SDAG-NEXT: v_mov_b32_e32 v3, 0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, s3 +; PACKED-SDAG-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 +; PACKED-SDAG-NEXT: ds_read_b32 v2, v2 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1] +; PACKED-SDAG-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: add_vector_neg_bitcast_scalar_lo: +; PACKED-GISEL: ; %bb.0: ; %bb +; PACKED-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, s3 +; PACKED-GISEL-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 +; PACKED-GISEL-NEXT: ds_read_b32 v2, v2 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: v_max_f32_e64 v2, -v2, -v2 +; PACKED-GISEL-NEXT: v_mov_b32_e32 v3, v2 +; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] +; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm bb: %vec0 = load volatile <2 x float>, ptr addrspace(3) %lds, align 4 %scalar0 = load volatile float, ptr addrspace(3) %arg2, align 4 @@ -493,11 +1841,59 @@ bb: ret void } -; GCN-LABEL: {{^}}fma_vector_vector_neg_scalar_lo_scalar_hi: -; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} -; PACKED-SDAG: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}} -; PACKED-GISEL: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) { +; GFX900-LABEL: fma_vector_vector_neg_scalar_lo_scalar_hi: +; GFX900: ; %bb.0: ; %bb +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, s2 +; GFX900-NEXT: v_mov_b32_e32 v4, s3 +; GFX900-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 +; GFX900-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3 +; GFX900-NEXT: ds_read_b32 v5, v4 +; GFX900-NEXT: ds_read_b32 v4, v4 offset:8 +; GFX900-NEXT: s_waitcnt lgkmcnt(1) +; GFX900-NEXT: v_fma_f32 v0, v0, v2, -v5 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: v_fma_f32 v1, v1, v3, -v4 +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fma_vector_vector_neg_scalar_lo_scalar_hi: +; PACKED-SDAG: ; %bb.0: ; %bb +; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; PACKED-SDAG-NEXT: v_mov_b32_e32 v6, 0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; PACKED-SDAG-NEXT: v_mov_b32_e32 v5, s3 +; PACKED-SDAG-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 +; PACKED-SDAG-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3 +; PACKED-SDAG-NEXT: ds_read_b32 v4, v5 +; PACKED-SDAG-NEXT: ds_read_b32 v5, v5 offset:8 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1] +; PACKED-SDAG-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fma_vector_vector_neg_scalar_lo_scalar_hi: +; PACKED-GISEL: ; %bb.0: ; %bb +; PACKED-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; PACKED-GISEL-NEXT: v_mov_b32_e32 v5, s3 +; PACKED-GISEL-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 +; PACKED-GISEL-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3 +; PACKED-GISEL-NEXT: ds_read_b32 v4, v5 +; PACKED-GISEL-NEXT: ds_read_b32 v5, v5 offset:8 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[4:5], 1.0, v[4:5] op_sel_hi:[0,1] +; PACKED-GISEL-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 +; PACKED-GISEL-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[4:5] +; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x float>, ptr addrspace(3) %lds, i32 1 %arg2.gep = getelementptr inbounds float, ptr addrspace(3) %arg2, i32 2 @@ -517,11 +1913,51 @@ bb: ret void } -; GCN-LABEL: {{^}}shuffle_add_f32: -; GFX900-COUNT-2: v_add_f32_e32 -; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel:[0,1] op_sel_hi:[1,0]{{$}} -; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @shuffle_add_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { +; GFX900-LABEL: shuffle_add_f32: +; GFX900: ; %bb.0: ; %bb +; GFX900-NEXT: s_load_dword s0, s[4:5], 0x2c +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, s0 +; GFX900-NEXT: ds_read_b64 v[0:1], v2 +; GFX900-NEXT: ds_read_b64 v[2:3], v2 offset:8 +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX900-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: shuffle_add_f32: +; PACKED-SDAG: ; %bb.0: ; %bb +; PACKED-SDAG-NEXT: s_load_dword s0, s[4:5], 0x2c +; PACKED-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; PACKED-SDAG-NEXT: ds_read_b64 v[0:1], v2 +; PACKED-SDAG-NEXT: ds_read_b64 v[2:3], v2 offset:8 +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] op_sel:[0,1] op_sel_hi:[1,0] +; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: shuffle_add_f32: +; PACKED-GISEL: ; %bb.0: ; %bb +; PACKED-GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; PACKED-GISEL-NEXT: ds_read_b64 v[0:1], v2 +; PACKED-GISEL-NEXT: ds_read_b64 v[2:3], v2 offset:8 +; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: v_mov_b32_e32 v4, v3 +; PACKED-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[4:5] +; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm bb: %vec0 = load volatile <2 x float>, ptr addrspace(3) %lds, align 8 %lds.gep1 = getelementptr inbounds <2 x float>, ptr addrspace(3) %lds, i32 1 @@ -532,11 +1968,61 @@ bb: ret void } -; GCN-LABEL: {{^}}shuffle_neg_add_f32: -; GFX900-COUNT-2: v_sub_f32_e32 -; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}} -; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @shuffle_neg_add_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { +; GFX900-LABEL: shuffle_neg_add_f32: +; GFX900: ; %bb.0: ; %bb +; GFX900-NEXT: s_load_dword s0, s[4:5], 0x2c +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, s0 +; GFX900-NEXT: ds_read_b64 v[0:1], v2 +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: ds_read_b32 v3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: ds_read_b64 v[2:3], v2 offset:8 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX900-NEXT: v_sub_f32_e32 v0, v0, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: shuffle_neg_add_f32: +; PACKED-SDAG: ; %bb.0: ; %bb +; PACKED-SDAG-NEXT: s_load_dword s0, s[4:5], 0x2c +; PACKED-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; PACKED-SDAG-NEXT: ds_read_b64 v[0:1], v2 +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: ds_read_b32 v3, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: ds_read_b64 v[2:3], v2 offset:8 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1] +; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: shuffle_neg_add_f32: +; PACKED-GISEL: ; %bb.0: ; %bb +; PACKED-GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; PACKED-GISEL-NEXT: ds_read_b64 v[0:1], v2 +; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: ds_read_b32 v3, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: ds_read_b64 v[2:3], v2 offset:8 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[2:3], 1.0, v[2:3] op_sel_hi:[0,1] +; PACKED-GISEL-NEXT: v_xor_b32_e32 v5, 0x80000000, v2 +; PACKED-GISEL-NEXT: v_xor_b32_e32 v4, 0x80000000, v3 +; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[4:5] +; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm bb: %vec0 = load volatile <2 x float>, ptr addrspace(3) %lds, align 8 %lds.gep1 = getelementptr inbounds <2 x float>, ptr addrspace(3) %lds, i32 1 @@ -549,16 +2035,26 @@ bb: ret void } -; GCN-LABEL: {{^}}fadd_fadd_fsub_0: -; GFX900: v_add_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, 0 -; GFX900: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} - -; PACKED-SDAG: v_add_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, 0 -; PACKED-SDAG: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} - -; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0{{$}} -; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0{{$}} define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) { +; GFX900-LABEL: fadd_fadd_fsub_0: +; GFX900: ; %bb.0: ; %bb +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: v_add_f32_e64 v0, s1, 0 +; GFX900-NEXT: v_add_f32_e32 v1, 0, v0 +; GFX900-NEXT: v_mov_b32_e32 v0, s0 +; GFX900-NEXT: flat_store_dwordx2 v[0:1], v[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fadd_fadd_fsub_0: +; PACKED-SDAG: ; %bb.0: ; %bb +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: v_add_f32_e64 v0, s1, 0 +; PACKED-SDAG-NEXT: v_add_f32_e32 v1, 0, v0 +; PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; PACKED-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[0:1] +; PACKED-SDAG-NEXT: s_endpgm bb: %i12 = fadd <2 x float> zeroinitializer, %arg %shift8 = shufflevector <2 x float> %i12, <2 x float> poison, <2 x i32> @@ -569,16 +2065,36 @@ bb: ret void } -; GCN-LABEL: {{^}}fadd_fadd_fsub: -; GFX900: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; GFX900: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} - -; PACKED-SDAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel_hi:[1,0]{{$}} - -; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} -; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, ptr addrspace(1) %ptr) { +; GFX900-LABEL: fadd_fadd_fsub: +; GFX900: ; %bb.0: ; %bb +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, s3 +; GFX900-NEXT: v_add_f32_e32 v0, s1, v0 +; GFX900-NEXT: v_mov_b32_e32 v1, s2 +; GFX900-NEXT: v_add_f32_e32 v3, s2, v0 +; GFX900-NEXT: v_sub_f32_e32 v0, s0, v1 +; GFX900-NEXT: v_subrev_f32_e32 v1, s3, v3 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fadd_fadd_fsub: +; PACKED-SDAG: ; %bb.0: ; %bb +; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; PACKED-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; PACKED-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s3 +; PACKED-SDAG-NEXT: v_add_f32_e32 v0, s1, v0 +; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], s[2:3], v[0:1] op_sel_hi:[1,0] +; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; PACKED-SDAG-NEXT: v_mov_b32_e32 v3, v0 +; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[2:3], s[2:3] neg_lo:[0,1] neg_hi:[0,1] +; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7] +; PACKED-SDAG-NEXT: s_endpgm bb: %i12 = fadd <2 x float> %arg, %arg1 %shift8 = shufflevector <2 x float> %i12, <2 x float> poison, <2 x i32> @@ -589,11 +2105,48 @@ bb: ret void } -; GCN-LABEL: {{^}}fadd_shuffle_v4: -; GFX900-COUNT-4: v_add_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; PACKED-SDAG-COUNT-2: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel_hi:[1,0]{{$}} -; PACKED-GISEL-COUNT-2: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fadd_shuffle_v4(ptr addrspace(1) %arg) { +; GFX900-LABEL: fadd_shuffle_v4: +; GFX900: ; %bb.0: ; %bb +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v3, v3, v0 +; GFX900-NEXT: v_add_f32_e32 v2, v2, v0 +; GFX900-NEXT: v_add_f32_e32 v1, v1, v0 +; GFX900-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fadd_shuffle_v4: +; PACKED-SDAG: ; %bb.0: ; %bb +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[0:1] op_sel_hi:[1,0] +; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[0:1] op_sel_hi:[1,0] +; PACKED-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fadd_shuffle_v4: +; PACKED-GISEL: ; %bb.0: ; %bb +; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v6, 4, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v6, s[0:1] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_mov_b32_e32 v4, v0 +; PACKED-GISEL-NEXT: v_mov_b32_e32 v5, v0 +; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[4:5] +; PACKED-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[4:5] +; PACKED-GISEL-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %arg, i32 %tid @@ -604,12 +2157,44 @@ bb: ret void } -; GCN-LABEL: {{^}}fneg_v2f32_vec: -; GFX900-COUNT-2: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}} -; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0 neg_lo:[1,1] neg_hi:[1,1]{{$}} -; PACKED-GISEL-COUNT-2: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}} -; PACKED-GISEL: v_pk_mul_f32 v[{{[0-9:]+}}], 1.0, v[{{[0-9:]+}}] op_sel_hi:[0,1]{{$}} define amdgpu_kernel void @fneg_v2f32_vec(ptr addrspace(1) %a) { +; GFX900-LABEL: fneg_v2f32_vec: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GFX900-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fneg_v2f32_vec: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], 0 neg_lo:[1,1] neg_hi:[1,1] +; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fneg_v2f32_vec: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; PACKED-GISEL-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], 1.0, v[0:1] op_sel_hi:[0,1] +; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -618,9 +2203,41 @@ define amdgpu_kernel void @fneg_v2f32_vec(ptr addrspace(1) %a) { ret void } -; GCN-LABEL: {{^}}fneg_v2f32_scalar: -; GCN-COUNT-2: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 define amdgpu_kernel void @fneg_v2f32_scalar(ptr addrspace(1) %a, <2 x float> %x) { +; GFX900-LABEL: fneg_v2f32_scalar: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_xor_b32 s3, s3, 0x80000000 +; GFX900-NEXT: s_xor_b32 s2, s2, 0x80000000 +; GFX900-NEXT: v_mov_b32_e32 v0, s2 +; GFX900-NEXT: v_mov_b32_e32 v1, s3 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fneg_v2f32_scalar: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: s_xor_b32 s3, s3, 0x80000000 +; PACKED-SDAG-NEXT: s_xor_b32 s2, s2, 0x80000000 +; PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; PACKED-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fneg_v2f32_scalar: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: s_xor_b32 s2, s2, 0x80000000 +; PACKED-GISEL-NEXT: s_xor_b32 s3, s3, 0x80000000 +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], 1.0, s[2:3] op_sel_hi:[0,1] +; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm %fneg = fsub <2 x float> , %x store <2 x float> %fneg, ptr addrspace(1) %a, align 8 ret void