diff --git a/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll b/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll index 2c2855c860ebb..840165d5a7e7a 100644 --- a/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll +++ b/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,FUNC,GFX7 %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,FUNC,GFX8 %s @@ -10,21 +11,70 @@ ; Instructions with B32, U32, and I32 in their name take 32-bit operands, while ; instructions with B64, U64, and I64 take 64-bit operands. -; FUNC-LABEL: {{^}}local_address_load: -; SI: v_mov_b32_e{{32|64}} [[PTR:v[0-9]]] -; SI: ds_read_b32 v{{[0-9]+}}, [[PTR]] define amdgpu_kernel void @local_address_load(ptr addrspace(1) %out, ptr addrspace(3) %in) { +; GFX7-LABEL: local_address_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0xb +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: local_address_load: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: ds_read_b32 v0, v0 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: s_endpgm entry: %0 = load i32, ptr addrspace(3) %in store i32 %0, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}local_address_gep: -; SI: s_add_i32 [[SPTR:s[0-9]]] -; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] -; SI: ds_read_b32 [[VPTR]] define amdgpu_kernel void @local_address_gep(ptr addrspace(1) %out, ptr addrspace(3) %in, i32 %offset) { +; GFX7-LABEL: local_address_gep: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b32 s3, s3, 2 +; GFX7-NEXT: s_add_i32 s2, s2, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: local_address_gep: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b32 s3, s3, 2 +; GFX8-NEXT: s_add_i32 s2, s2, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: ds_read_b32 v0, v0 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: s_endpgm entry: %0 = getelementptr i32, ptr addrspace(3) %in, i32 %offset %1 = load i32, ptr addrspace(3) %0 @@ -32,10 +82,34 @@ entry: ret void } -; FUNC-LABEL: {{^}}local_address_gep_const_offset: -; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}} -; SI: ds_read_b32 v{{[0-9]+}}, [[VPTR]] offset:4 define amdgpu_kernel void @local_address_gep_const_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) { +; GFX7-LABEL: local_address_gep_const_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0xb +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: ds_read_b32 v0, v0 offset:4 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: local_address_gep_const_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: ds_read_b32 v0, v0 offset:4 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: s_endpgm entry: %0 = getelementptr i32, ptr addrspace(3) %in, i32 1 %1 = load i32, ptr addrspace(3) %0 @@ -44,11 +118,36 @@ entry: } ; Offset too large, can't fold into 16-bit immediate offset. -; FUNC-LABEL: {{^}}local_address_gep_large_const_offset: -; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004 -; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] -; SI: ds_read_b32 [[VPTR]] define amdgpu_kernel void @local_address_gep_large_const_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) { +; GFX7-LABEL: local_address_gep_large_const_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0xb +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s2, s2, 0x10004 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: local_address_gep_large_const_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_i32 s2, s2, 0x10004 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: ds_read_b32 v0, v0 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: s_endpgm entry: %0 = getelementptr i32, ptr addrspace(3) %in, i32 16385 %1 = load i32, ptr addrspace(3) %0 @@ -56,24 +155,70 @@ entry: ret void } -; FUNC-LABEL: {{^}}null_32bit_lds_ptr: -; GFX7 v_cmp_ne_u32 -; GFX7: s_cselect_b32 -; GFX8: s_cmp_lg_u32 -; GFX8-NOT: v_cmp_ne_u32 -; GFX8: s_cselect_b32 define amdgpu_kernel void @null_32bit_lds_ptr(ptr addrspace(1) %out, ptr addrspace(3) %lds) nounwind { +; GFX7-LABEL: null_32bit_lds_ptr: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX7-NEXT: s_movk_i32 s4, 0x7b +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_cmp_lg_u32 s6, 0 +; GFX7-NEXT: s_cselect_b32 s4, s4, 0x1c8 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: null_32bit_lds_ptr: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: s_movk_i32 s4, 0x7b +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_cmp_lg_u32 s6, 0 +; GFX8-NEXT: s_cselect_b32 s4, s4, 0x1c8 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: s_endpgm %cmp = icmp ne ptr addrspace(3) %lds, null %x = select i1 %cmp, i32 123, i32 456 store i32 %x, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}mul_32bit_ptr: -; SI: s_mul_i32 -; SI-NEXT: s_add_i32 -; SI: ds_read_b32 define amdgpu_kernel void @mul_32bit_ptr(ptr addrspace(1) %out, ptr addrspace(3) %lds, i32 %tid) { +; GFX7-LABEL: mul_32bit_ptr: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mul_i32 s3, s3, 12 +; GFX7-NEXT: s_add_i32 s2, s2, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: mul_32bit_ptr: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mul_i32 s3, s3, 12 +; GFX8-NEXT: s_add_i32 s2, s2, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: ds_read_b32 v0, v0 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: s_endpgm %ptr = getelementptr [3 x float], ptr addrspace(3) %lds, i32 %tid, i32 0 %val = load float, ptr addrspace(3) %ptr store float %val, ptr addrspace(1) %out @@ -82,60 +227,156 @@ define amdgpu_kernel void @mul_32bit_ptr(ptr addrspace(1) %out, ptr addrspace(3) @g_lds = addrspace(3) global float poison, align 4 -; FUNC-LABEL: {{^}}infer_ptr_alignment_global_offset: -; SI: v_mov_b32_e32 [[PTR:v[0-9]+]], 0{{$}} -; SI: ds_read_b32 v{{[0-9]+}}, [[PTR]] define amdgpu_kernel void @infer_ptr_alignment_global_offset(ptr addrspace(1) %out, i32 %tid) { +; GFX7-LABEL: infer_ptr_alignment_global_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_mov_b32_e32 v0, 0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: infer_ptr_alignment_global_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: ds_read_b32 v0, v0 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: s_endpgm %val = load float, ptr addrspace(3) @g_lds store float %val, ptr addrspace(1) %out ret void } - @ptr = addrspace(3) global ptr addrspace(3) poison @dst = addrspace(3) global [16383 x i32] poison -; FUNC-LABEL: {{^}}global_ptr: -; SI: ds_write_b32 define amdgpu_kernel void @global_ptr() nounwind { +; SI-LABEL: global_ptr: +; SI: ; %bb.0: +; SI-NEXT: v_mov_b32_e32 v0, 64 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_write_b32 v1, v0 offset:65532 +; SI-NEXT: s_endpgm store ptr addrspace(3) getelementptr ([16383 x i32], ptr addrspace(3) @dst, i32 0, i32 16), ptr addrspace(3) @ptr ret void } -; FUNC-LABEL: {{^}}local_address_store: -; SI: ds_write_b32 define amdgpu_kernel void @local_address_store(ptr addrspace(3) %out, i32 %val) { +; GFX7-LABEL: local_address_store: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: local_address_store: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: ds_write_b32 v0, v1 +; GFX8-NEXT: s_endpgm store i32 %val, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_address_gep_store: -; SI: s_add_i32 [[SADDR:s[0-9]+]], -; SI: v_mov_b32_e32 [[ADDR:v[0-9]+]], [[SADDR]] -; SI: ds_write_b32 [[ADDR]], v{{[0-9]+}} define amdgpu_kernel void @local_address_gep_store(ptr addrspace(3) %out, i32, i32 %val, i32 %offset) { +; GFX7-LABEL: local_address_gep_store: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x9 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b32 s1, s1, 2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_i32 s0, s2, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: local_address_gep_store: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x24 +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b32 s1, s1, 2 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_add_i32 s0, s2, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: ds_write_b32 v1, v0 +; GFX8-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(3) %out, i32 %offset store i32 %val, ptr addrspace(3) %gep, align 4 ret void } -; FUNC-LABEL: {{^}}local_address_gep_const_offset_store: -; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}} -; SI: v_mov_b32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}} -; SI: ds_write_b32 [[VPTR]], [[VAL]] offset:4 define amdgpu_kernel void @local_address_gep_const_offset_store(ptr addrspace(3) %out, i32 %val) { +; GFX7-LABEL: local_address_gep_const_offset_store: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_write_b32 v0, v1 offset:4 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: local_address_gep_const_offset_store: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: ds_write_b32 v0, v1 offset:4 +; GFX8-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(3) %out, i32 1 store i32 %val, ptr addrspace(3) %gep, align 4 ret void } ; Offset too large, can't fold into 16-bit immediate offset. -; FUNC-LABEL: {{^}}local_address_gep_large_const_offset_store: -; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004 -; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] -; SI: ds_write_b32 [[VPTR]], v{{[0-9]+$}} define amdgpu_kernel void @local_address_gep_large_const_offset_store(ptr addrspace(3) %out, i32 %val) { +; GFX7-LABEL: local_address_gep_large_const_offset_store: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s0, s0, 0x10004 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: local_address_gep_large_const_offset_store: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_i32 s0, s0, 0x10004 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: ds_write_b32 v1, v0 +; GFX8-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(3) %out, i32 16385 store i32 %val, ptr addrspace(3) %gep, align 4 ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; FUNC: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll index f4d8ec180cf91..1f7bb761b55b6 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll @@ -1,11 +1,17 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s -; GCN-LABEL: {{^}}select_and1: -; GCN: s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, -; GCN: v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]] -; GCN-NOT: v_and_b32 -; GCN: store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @select_and1(ptr addrspace(1) %p, i32 %x, i32 %y) { +; GCN-LABEL: select_and1: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_gt_i32 s2, 10 +; GCN-NEXT: s_cselect_b32 s2, s3, 0 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: global_store_dword v0, v1, s[0:1] +; GCN-NEXT: s_endpgm %c = icmp slt i32 %x, 11 %s = select i1 %c, i32 0, i32 -1 %a = and i32 %y, %s @@ -13,12 +19,17 @@ define amdgpu_kernel void @select_and1(ptr addrspace(1) %p, i32 %x, i32 %y) { ret void } -; GCN-LABEL: {{^}}select_and2: -; GCN: s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, -; GCN: v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]] -; GCN-NOT: v_and_b32 -; GCN: store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @select_and2(ptr addrspace(1) %p, i32 %x, i32 %y) { +; GCN-LABEL: select_and2: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_gt_i32 s2, 10 +; GCN-NEXT: s_cselect_b32 s2, s3, 0 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: global_store_dword v0, v1, s[0:1] +; GCN-NEXT: s_endpgm %c = icmp slt i32 %x, 11 %s = select i1 %c, i32 0, i32 -1 %a = and i32 %s, %y @@ -26,12 +37,17 @@ define amdgpu_kernel void @select_and2(ptr addrspace(1) %p, i32 %x, i32 %y) { ret void } -; GCN-LABEL: {{^}}select_and3: -; GCN: s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, -; GCN: v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]] -; GCN-NOT: v_and_b32 -; GCN: store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @select_and3(ptr addrspace(1) %p, i32 %x, i32 %y) { +; GCN-LABEL: select_and3: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lt_i32 s2, 11 +; GCN-NEXT: s_cselect_b32 s2, s3, 0 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: global_store_dword v0, v1, s[0:1] +; GCN-NEXT: s_endpgm %c = icmp slt i32 %x, 11 %s = select i1 %c, i32 -1, i32 0 %a = and i32 %y, %s @@ -39,18 +55,25 @@ define amdgpu_kernel void @select_and3(ptr addrspace(1) %p, i32 %x, i32 %y) { ret void } -; GCN-LABEL: {{^}}select_and_v4: -; GCN: s_cselect_b32 s[[SEL0:[0-9]+]], s{{[0-9]+}}, 0 -; GCN: s_cselect_b32 s[[SEL1:[0-9]+]], s{{[0-9]+}}, 0 -; GCN: s_cselect_b32 s[[SEL2:[0-9]+]], s{{[0-9]+}}, 0 -; GCN: s_cselect_b32 s[[SEL3:[0-9]+]], s{{[0-9]+}}, 0 -; GCN: v_mov_b32_e32 v[[V0:[0-9]+]], s[[SEL3]] -; GCN: v_mov_b32_e32 v[[V1:[0-9]+]], s[[SEL2]] -; GCN: v_mov_b32_e32 v[[V2:[0-9]+]], s[[SEL1]] -; GCN: v_mov_b32_e32 v[[V3:[0-9]+]], s[[SEL0]] -; GCN-NOT: v_and_b32 -; GCN: global_store_dwordx4 v{{[0-9]+}}, v[[[V0]]:[[V3]]] define amdgpu_kernel void @select_and_v4(ptr addrspace(1) %p, i32 %x, <4 x i32> %y) { +; GCN-LABEL: select_and_v4: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s8, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_gt_i32 s8, 10 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 +; GCN-NEXT: s_cselect_b32 s1, s1, 0 +; GCN-NEXT: s_cselect_b32 s0, s0, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GCN-NEXT: s_endpgm %c = icmp slt i32 %x, 11 %s = select i1 %c, <4 x i32> zeroinitializer, <4 x i32> %a = and <4 x i32> %s, %y @@ -58,12 +81,17 @@ define amdgpu_kernel void @select_and_v4(ptr addrspace(1) %p, i32 %x, <4 x i32> ret void } -; GCN-LABEL: {{^}}select_or1: -; GCN: s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, -; GCN: v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]] -; GCN-NOT: v_or_b32 -; GCN: store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @select_or1(ptr addrspace(1) %p, i32 %x, i32 %y) { +; GCN-LABEL: select_or1: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lt_i32 s2, 11 +; GCN-NEXT: s_cselect_b32 s2, s3, -1 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: global_store_dword v0, v1, s[0:1] +; GCN-NEXT: s_endpgm %c = icmp slt i32 %x, 11 %s = select i1 %c, i32 0, i32 -1 %a = or i32 %y, %s @@ -71,12 +99,17 @@ define amdgpu_kernel void @select_or1(ptr addrspace(1) %p, i32 %x, i32 %y) { ret void } -; GCN-LABEL: {{^}}select_or2: -; GCN: s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, -; GCN: v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]] -; GCN-NOT: v_or_b32 -; GCN: store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @select_or2(ptr addrspace(1) %p, i32 %x, i32 %y) { +; GCN-LABEL: select_or2: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lt_i32 s2, 11 +; GCN-NEXT: s_cselect_b32 s2, s3, -1 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: global_store_dword v0, v1, s[0:1] +; GCN-NEXT: s_endpgm %c = icmp slt i32 %x, 11 %s = select i1 %c, i32 0, i32 -1 %a = or i32 %s, %y @@ -84,12 +117,17 @@ define amdgpu_kernel void @select_or2(ptr addrspace(1) %p, i32 %x, i32 %y) { ret void } -; GCN-LABEL: {{^}}select_or3: -; GCN: s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, -; GCN: v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]] -; GCN-NOT: v_or_b32 -; GCN: store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @select_or3(ptr addrspace(1) %p, i32 %x, i32 %y) { +; GCN-LABEL: select_or3: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_gt_i32 s2, 10 +; GCN-NEXT: s_cselect_b32 s2, s3, -1 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: global_store_dword v0, v1, s[0:1] +; GCN-NEXT: s_endpgm %c = icmp slt i32 %x, 11 %s = select i1 %c, i32 -1, i32 0 %a = or i32 %y, %s @@ -97,18 +135,25 @@ define amdgpu_kernel void @select_or3(ptr addrspace(1) %p, i32 %x, i32 %y) { ret void } -; GCN-LABEL: {{^}}select_or_v4: -; GCN: s_cselect_b32 s[[SEL0:[0-9]+]], s{{[0-9]+}}, -1 -; GCN: s_cselect_b32 s[[SEL1:[0-9]+]], s{{[0-9]+}}, -1 -; GCN: s_cselect_b32 s[[SEL2:[0-9]+]], s{{[0-9]+}}, -1 -; GCN: s_cselect_b32 s[[SEL3:[0-9]+]], s{{[0-9]+}}, -1 -; GCN-NOT: v_or_b32 -; GCN: v_mov_b32_e32 v[[V0:[0-9]+]], s[[SEL3]] -; GCN: v_mov_b32_e32 v[[V1:[0-9]+]], s[[SEL2]] -; GCN: v_mov_b32_e32 v[[V2:[0-9]+]], s[[SEL1]] -; GCN: v_mov_b32_e32 v[[V3:[0-9]+]], s[[SEL0]] -; GCN: global_store_dwordx4 v{{[0-9]+}}, v[[[V0]]:[[V3]]] define amdgpu_kernel void @select_or_v4(ptr addrspace(1) %p, i32 %x, <4 x i32> %y) { +; GCN-LABEL: select_or_v4: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s8, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lt_i32 s8, 11 +; GCN-NEXT: s_cselect_b32 s3, s3, -1 +; GCN-NEXT: s_cselect_b32 s2, s2, -1 +; GCN-NEXT: s_cselect_b32 s1, s1, -1 +; GCN-NEXT: s_cselect_b32 s0, s0, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GCN-NEXT: s_endpgm %c = icmp slt i32 %x, 11 %s = select i1 %c, <4 x i32> zeroinitializer, <4 x i32> %a = or <4 x i32> %s, %y @@ -116,192 +161,360 @@ define amdgpu_kernel void @select_or_v4(ptr addrspace(1) %p, i32 %x, <4 x i32> % ret void } -; GCN-LABEL: {{^}}sel_constants_sub_constant_sel_constants: -; GCN: s_cselect_b32 s{{[0-9]+}}, 9, 2 define amdgpu_kernel void @sel_constants_sub_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) { +; GCN-LABEL: sel_constants_sub_constant_sel_constants: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_bitcmp1_b32 s2, 0 +; GCN-NEXT: s_cselect_b32 s2, 9, 2 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: global_store_dword v0, v1, s[0:1] +; GCN-NEXT: s_endpgm %sel = select i1 %cond, i32 -4, i32 3 %bo = sub i32 5, %sel store i32 %bo, ptr addrspace(1) %p, align 4 ret void } -; GCN-LABEL: {{^}}sel_constants_sub_constant_sel_constants_i16: -; GCN: s_cselect_b32 s{{[0-9]+}}, 9, 2 define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_i16(ptr addrspace(1) %p, i1 %cond) { +; GCN-LABEL: sel_constants_sub_constant_sel_constants_i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_bitcmp1_b32 s2, 0 +; GCN-NEXT: s_cselect_b32 s2, 9, 2 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: global_store_short v0, v1, s[0:1] +; GCN-NEXT: s_endpgm %sel = select i1 %cond, i16 -4, i16 3 %bo = sub i16 5, %sel store i16 %bo, ptr addrspace(1) %p, align 2 ret void } -; GCN-LABEL: {{^}}sel_constants_sub_constant_sel_constants_i16_neg: -; GCN: s_cselect_b32 s[[SGPR:[0-9]+]], s[[SGPR]], 0xf449 define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_i16_neg(ptr addrspace(1) %p, i1 %cond) { +; GCN-LABEL: sel_constants_sub_constant_sel_constants_i16_neg: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_bitcmp1_b32 s2, 0 +; GCN-NEXT: s_mov_b32 s2, 0xfffd +; GCN-NEXT: s_cselect_b32 s2, s2, 0xf449 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: global_store_short v0, v1, s[0:1] +; GCN-NEXT: s_endpgm %sel = select i1 %cond, i16 4, i16 3000 %bo = sub i16 1, %sel store i16 %bo, ptr addrspace(1) %p, align 2 ret void } -; GCN-LABEL: {{^}}sel_constants_sub_constant_sel_constants_v2i16: -; GCN-DAG: s_mov_b32 [[T:s[0-9]+]], 0x50009 -; GCN: s_cselect_b32 s{{[0-9]+}}, [[T]], 0x60002 define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_v2i16(ptr addrspace(1) %p, i1 %cond) { +; GCN-LABEL: sel_constants_sub_constant_sel_constants_v2i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_bitcmp1_b32 s2, 0 +; GCN-NEXT: s_mov_b32 s2, 0x50009 +; GCN-NEXT: s_cselect_b32 s2, s2, 0x60002 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: global_store_dword v0, v1, s[0:1] +; GCN-NEXT: s_endpgm %sel = select i1 %cond, <2 x i16> , <2 x i16> %bo = sub <2 x i16> , %sel store <2 x i16> %bo, ptr addrspace(1) %p, align 4 ret void } -; GCN-LABEL: {{^}}sel_constants_sub_constant_sel_constants_v4i32: -; GCN: s_cselect_b32 s[[SEL0:[0-9]+]], 7, 14 -; GCN: s_cselect_b32 s[[SEL1:[0-9]+]], 6, 10 -; GCN: s_cselect_b32 s[[SEL2:[0-9]+]], 5, 6 -; GCN: s_cselect_b32 s[[SEL3:[0-9]+]], 9, 2 -; GCN: v_mov_b32_e32 v[[V0:[0-9]+]], s[[SEL3]] -; GCN: v_mov_b32_e32 v[[V1:[0-9]+]], s[[SEL2]] -; GCN: v_mov_b32_e32 v[[V2:[0-9]+]], s[[SEL1]] -; GCN: v_mov_b32_e32 v[[V3:[0-9]+]], s[[SEL0]] -; GCN: global_store_dwordx4 v{{[0-9]+}}, v[[[V0]]:[[V3]]] define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_v4i32(ptr addrspace(1) %p, i1 %cond) { +; GCN-LABEL: sel_constants_sub_constant_sel_constants_v4i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_bitcmp1_b32 s2, 0 +; GCN-NEXT: s_cselect_b32 s2, 7, 14 +; GCN-NEXT: s_cselect_b32 s3, 6, 10 +; GCN-NEXT: s_cselect_b32 s4, 5, 6 +; GCN-NEXT: s_cselect_b32 s5, 9, 2 +; GCN-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: v_mov_b32_e32 v3, s2 +; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GCN-NEXT: s_endpgm %sel = select i1 %cond, <4 x i32> , <4 x i32> %bo = sub <4 x i32> , %sel store <4 x i32> %bo, ptr addrspace(1) %p, align 32 ret void } -; GCN-LABEL: {{^}}sdiv_constant_sel_constants_i64: -; GCN: s_cselect_b32 s{{[0-9]+}}, 0, 5 define amdgpu_kernel void @sdiv_constant_sel_constants_i64(ptr addrspace(1) %p, i1 %cond) { +; GCN-LABEL: sdiv_constant_sel_constants_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_bitcmp1_b32 s2, 0 +; GCN-NEXT: s_cselect_b32 s2, 0, 5 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GCN-NEXT: s_endpgm %sel = select i1 %cond, i64 121, i64 23 %bo = sdiv i64 120, %sel store i64 %bo, ptr addrspace(1) %p, align 8 ret void } -; GCN-LABEL: {{^}}sdiv_constant_sel_constants_i32: -; GCN: s_cselect_b32 s{{[0-9]+}}, 26, 8 define amdgpu_kernel void @sdiv_constant_sel_constants_i32(ptr addrspace(1) %p, i1 %cond) { +; GCN-LABEL: sdiv_constant_sel_constants_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_bitcmp1_b32 s2, 0 +; GCN-NEXT: s_cselect_b32 s2, 26, 8 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: global_store_dword v0, v1, s[0:1] +; GCN-NEXT: s_endpgm %sel = select i1 %cond, i32 7, i32 23 %bo = sdiv i32 184, %sel store i32 %bo, ptr addrspace(1) %p, align 8 ret void } -; GCN-LABEL: {{^}}udiv_constant_sel_constants_i64: -; GCN: s_cselect_b32 s{{[0-9]+}}, 0, 5 define amdgpu_kernel void @udiv_constant_sel_constants_i64(ptr addrspace(1) %p, i1 %cond) { +; GCN-LABEL: udiv_constant_sel_constants_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_bitcmp1_b32 s2, 0 +; GCN-NEXT: s_cselect_b32 s2, 0, 5 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GCN-NEXT: s_endpgm %sel = select i1 %cond, i64 -4, i64 23 %bo = udiv i64 120, %sel store i64 %bo, ptr addrspace(1) %p, align 8 ret void } -; GCN-LABEL: {{^}}srem_constant_sel_constants: -; GCN: s_cselect_b32 s{{[0-9]+}}, 33, 3 define amdgpu_kernel void @srem_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) { +; GCN-LABEL: srem_constant_sel_constants: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_bitcmp1_b32 s2, 0 +; GCN-NEXT: s_cselect_b32 s2, 33, 3 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GCN-NEXT: s_endpgm %sel = select i1 %cond, i64 34, i64 15 %bo = srem i64 33, %sel store i64 %bo, ptr addrspace(1) %p, align 8 ret void } -; GCN-LABEL: {{^}}urem_constant_sel_constants: -; GCN: s_cselect_b32 s{{[0-9]+}}, 33, 3 define amdgpu_kernel void @urem_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) { +; GCN-LABEL: urem_constant_sel_constants: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_bitcmp1_b32 s2, 0 +; GCN-NEXT: s_cselect_b32 s2, 33, 3 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GCN-NEXT: s_endpgm %sel = select i1 %cond, i64 34, i64 15 %bo = urem i64 33, %sel store i64 %bo, ptr addrspace(1) %p, align 8 ret void } -; GCN-LABEL: {{^}}shl_constant_sel_constants: -; GCN: s_cselect_b32 s{{[0-9]+}}, 4, 8 define amdgpu_kernel void @shl_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) { +; GCN-LABEL: shl_constant_sel_constants: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_bitcmp1_b32 s2, 0 +; GCN-NEXT: s_cselect_b32 s2, 4, 8 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: global_store_dword v0, v1, s[0:1] +; GCN-NEXT: s_endpgm %sel = select i1 %cond, i32 2, i32 3 %bo = shl i32 1, %sel store i32 %bo, ptr addrspace(1) %p, align 4 ret void } -; GCN-LABEL: {{^}}lshr_constant_sel_constants: -; GCN: s_cselect_b32 s{{[0-9]+}}, 16, 8 define amdgpu_kernel void @lshr_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) { +; GCN-LABEL: lshr_constant_sel_constants: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_bitcmp1_b32 s2, 0 +; GCN-NEXT: s_cselect_b32 s2, 16, 8 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: global_store_dword v0, v1, s[0:1] +; GCN-NEXT: s_endpgm %sel = select i1 %cond, i32 2, i32 3 %bo = lshr i32 64, %sel store i32 %bo, ptr addrspace(1) %p, align 4 ret void } -; GCN-LABEL: {{^}}ashr_constant_sel_constants: -; GCN: s_cselect_b32 s{{[0-9]+}}, 32, 16 define amdgpu_kernel void @ashr_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) { +; GCN-LABEL: ashr_constant_sel_constants: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_bitcmp1_b32 s2, 0 +; GCN-NEXT: s_cselect_b32 s2, 32, 16 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: global_store_dword v0, v1, s[0:1] +; GCN-NEXT: s_endpgm %sel = select i1 %cond, i32 2, i32 3 %bo = ashr i32 128, %sel store i32 %bo, ptr addrspace(1) %p, align 4 ret void } -; GCN-LABEL: {{^}}fsub_constant_sel_constants: -; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, -4.0, 1.0, define amdgpu_kernel void @fsub_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) { +; GCN-LABEL: fsub_constant_sel_constants: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_bitcmp1_b32 s2, 0 +; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v1, -4.0, 1.0, s[2:3] +; GCN-NEXT: global_store_dword v0, v1, s[0:1] +; GCN-NEXT: s_endpgm %sel = select i1 %cond, float -2.0, float 3.0 %bo = fsub float -1.0, %sel store float %bo, ptr addrspace(1) %p, align 4 ret void } -; GCN-LABEL: {{^}}fsub_constant_sel_constants_f16: ; TODO: it shall be possible to fold constants with OpSel -; GCN-DAG: v_mov_b32_e32 [[T:v[0-9]+]], 0x3c00 -; GCN-DAG: v_mov_b32_e32 [[F:v[0-9]+]], 0xc400 -; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, [[F]], [[T]], define amdgpu_kernel void @fsub_constant_sel_constants_f16(ptr addrspace(1) %p, i1 %cond) { +; GCN-LABEL: fsub_constant_sel_constants_f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v1, 0xc400 +; GCN-NEXT: v_mov_b32_e32 v2, 0x3c00 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_bitcmp1_b32 s2, 0 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: global_store_short v0, v1, s[0:1] +; GCN-NEXT: s_endpgm %sel = select i1 %cond, half -2.0, half 3.0 %bo = fsub half -1.0, %sel store half %bo, ptr addrspace(1) %p, align 2 ret void } -; GCN-LABEL: {{^}}fsub_constant_sel_constants_v2f16: -; GCN: s_cselect_b32 s{{[0-9]+}}, 0x45003c00, -2.0 define amdgpu_kernel void @fsub_constant_sel_constants_v2f16(ptr addrspace(1) %p, i1 %cond) { +; GCN-LABEL: fsub_constant_sel_constants_v2f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_bitcmp1_b32 s2, 0 +; GCN-NEXT: s_cselect_b32 s2, 0x45003c00, -2.0 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: global_store_dword v0, v1, s[0:1] +; GCN-NEXT: s_endpgm %sel = select i1 %cond, <2 x half> , <2 x half> %bo = fsub <2 x half> , %sel store <2 x half> %bo, ptr addrspace(1) %p, align 4 ret void } -; GCN-LABEL: {{^}}fsub_constant_sel_constants_v4f32: -; GCN: s_mov_b32 [[T0:s[0-9]+]], 0x41500000 -; GCN: s_cselect_b32 s[[SEL0:[0-9]+]], [[T0]], 0x40c00000 -; GCN: s_cselect_b32 s[[SEL1:[0-9]+]], 0x41100000, 4.0 -; GCN: s_cselect_b32 s[[SEL2:[0-9]+]], 0x40a00000, 2.0 -; GCN: s_cselect_b32 s[[SEL3:[0-9]+]], 1.0, 0 -; GCN: v_mov_b32_e32 v[[V0:[0-9]+]], s[[SEL3]] -; GCN: v_mov_b32_e32 v[[V1:[0-9]+]], s[[SEL2]] -; GCN: v_mov_b32_e32 v[[V2:[0-9]+]], s[[SEL1]] -; GCN: v_mov_b32_e32 v[[V3:[0-9]+]], s[[SEL0]] -; GCN: global_store_dwordx4 v{{[0-9]+}}, v[[[V0]]:[[V3]]] define amdgpu_kernel void @fsub_constant_sel_constants_v4f32(ptr addrspace(1) %p, i1 %cond) { +; GCN-LABEL: fsub_constant_sel_constants_v4f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: s_mov_b32 s3, 0x41500000 +; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_bitcmp1_b32 s2, 0 +; GCN-NEXT: s_cselect_b32 s2, s3, 0x40c00000 +; GCN-NEXT: s_cselect_b32 s3, 0x41100000, 4.0 +; GCN-NEXT: s_cselect_b32 s4, 0x40a00000, 2.0 +; GCN-NEXT: s_cselect_b32 s5, 1.0, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: v_mov_b32_e32 v3, s2 +; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GCN-NEXT: s_endpgm %sel = select i1 %cond, <4 x float> , <4 x float> %bo = fsub <4 x float> , %sel store <4 x float> %bo, ptr addrspace(1) %p, align 32 ret void } -; GCN-LABEL: {{^}}fdiv_constant_sel_constants: -; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 4.0, -2.0, define amdgpu_kernel void @fdiv_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) { +; GCN-LABEL: fdiv_constant_sel_constants: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_bitcmp1_b32 s2, 0 +; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v1, 4.0, -2.0, s[2:3] +; GCN-NEXT: global_store_dword v0, v1, s[0:1] +; GCN-NEXT: s_endpgm %sel = select i1 %cond, float -4.0, float 2.0 %bo = fdiv float 8.0, %sel store float %bo, ptr addrspace(1) %p, align 4 ret void } -; GCN-LABEL: {{^}}frem_constant_sel_constants: -; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 2.0, 1.0, define amdgpu_kernel void @frem_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) { +; GCN-LABEL: frem_constant_sel_constants: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_bitcmp1_b32 s2, 0 +; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v1, 2.0, 1.0, s[2:3] +; GCN-NEXT: global_store_dword v0, v1, s[0:1] +; GCN-NEXT: s_endpgm %sel = select i1 %cond, float -4.0, float 3.0 %bo = frem float 5.0, %sel store float %bo, ptr addrspace(1) %p, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll index 6b42c4e72d64a..ae88eade32228 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI %s declare i1 @llvm.amdgcn.class.f32(float, i32) #1 @@ -6,30 +7,40 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 declare float @llvm.fabs.f32(float) #1 declare double @llvm.fabs.f64(double) #1 -; SI-LABEL: {{^}}test_class_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c -; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm define amdgpu_kernel void @test_class_f32(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 { +; SI-LABEL: test_class_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s6, s[4:5], 0x1c +; SI-NEXT: s_load_dword s7, s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_cmp_class_f32_e32 vcc, s7, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 %b) #1 %sext = sext i1 %result to i32 store i32 %sext, ptr addrspace(1) %out, align 4 ret void } -; SI-LABEL: {{^}}test_class_fabs_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c -; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm define amdgpu_kernel void @test_class_fabs_f32(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 { +; SI-LABEL: test_class_fabs_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s6, s[4:5], 0x1c +; SI-NEXT: s_load_dword s7, s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_cmp_class_f32_e64 s[4:5], |s7|, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %a.fabs = call float @llvm.fabs.f32(float %a) #1 %result = call i1 @llvm.amdgcn.class.f32(float %a.fabs, i32 %b) #1 %sext = sext i1 %result to i32 @@ -37,15 +48,20 @@ define amdgpu_kernel void @test_class_fabs_f32(ptr addrspace(1) %out, [8 x i32], ret void } -; SI-LABEL: {{^}}test_class_fneg_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c -; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm define amdgpu_kernel void @test_class_fneg_f32(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 { +; SI-LABEL: test_class_fneg_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s6, s[4:5], 0x1c +; SI-NEXT: s_load_dword s7, s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_cmp_class_f32_e64 s[4:5], -s7, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %a.fneg = fsub float -0.0, %a %result = call i1 @llvm.amdgcn.class.f32(float %a.fneg, i32 %b) #1 %sext = sext i1 %result to i32 @@ -53,15 +69,20 @@ define amdgpu_kernel void @test_class_fneg_f32(ptr addrspace(1) %out, [8 x i32], ret void } -; SI-LABEL: {{^}}test_class_fneg_fabs_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c -; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm define amdgpu_kernel void @test_class_fneg_fabs_f32(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 { +; SI-LABEL: test_class_fneg_fabs_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s6, s[4:5], 0x1c +; SI-NEXT: s_load_dword s7, s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_cmp_class_f32_e64 s[4:5], -|s7|, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %a.fabs = call float @llvm.fabs.f32(float %a) #1 %a.fneg.fabs = fsub float -0.0, %a.fabs %result = call i1 @llvm.amdgcn.class.f32(float %a.fneg.fabs, i32 %b) #1 @@ -70,26 +91,36 @@ define amdgpu_kernel void @test_class_fneg_fabs_f32(ptr addrspace(1) %out, [8 x ret void } -; SI-LABEL: {{^}}test_class_1_f32: -; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], [[SA]], 1{{$}} -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]] -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm define amdgpu_kernel void @test_class_1_f32(ptr addrspace(1) %out, float %a) #0 { +; SI-LABEL: test_class_1_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cmp_class_f32_e64 s[4:5], s6, 1 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 1) #1 %sext = sext i1 %result to i32 store i32 %sext, ptr addrspace(1) %out, align 4 ret void } -; SI-LABEL: {{^}}test_class_64_f32: -; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], [[SA]], 64{{$}} -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]] -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm define amdgpu_kernel void @test_class_64_f32(ptr addrspace(1) %out, float %a) #0 { +; SI-LABEL: test_class_64_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cmp_class_f32_e64 s[4:5], s6, 64 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 64) #1 %sext = sext i1 %result to i32 store i32 %sext, ptr addrspace(1) %out, align 4 @@ -97,42 +128,62 @@ define amdgpu_kernel void @test_class_64_f32(ptr addrspace(1) %out, float %a) #0 } ; Set all 10 bits of mask -; SI-LABEL: {{^}}test_class_full_mask_f32: -; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x3ff{{$}} -; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[MASK]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm define amdgpu_kernel void @test_class_full_mask_f32(ptr addrspace(1) %out, float %a) #0 { +; SI-LABEL: test_class_full_mask_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s4, s[4:5], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3ff +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 1023) #1 %sext = sext i1 %result to i32 store i32 %sext, ptr addrspace(1) %out, align 4 ret void } -; SI-LABEL: {{^}}test_class_9bit_mask_f32: -; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} -; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[MASK]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm define amdgpu_kernel void @test_class_9bit_mask_f32(ptr addrspace(1) %out, float %a) #0 { +; SI-LABEL: test_class_9bit_mask_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s4, s[4:5], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x1ff +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 511) #1 %sext = sext i1 %result to i32 store i32 %sext, ptr addrspace(1) %out, align 4 ret void } -; SI-LABEL: {{^}}v_test_class_full_mask_f32: -; SI-DAG: buffer_load_dword [[VA:v[0-9]+]] -; SI-DAG: s_movk_i32 [[MASK:s[0-9]+]], 0x1ff{{$}} -; SI: v_cmp_class_f32_e64 s[{{[0-9]}}:{{[0-9]}}], [[VA]], [[MASK]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, s[{{[0-9]}}:{{[0-9]}}] -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm define amdgpu_kernel void @v_test_class_full_mask_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: v_test_class_full_mask_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_movk_i32 s4, 0x1ff +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v2, s4 +; SI-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -144,13 +195,23 @@ define amdgpu_kernel void @v_test_class_full_mask_f32(ptr addrspace(1) %out, ptr ret void } -; SI-LABEL: {{^}}test_class_inline_imm_constant_dynamic_mask_f32: -; SI-DAG: buffer_load_dword [[VB:v[0-9]+]] -; SI: v_cmp_class_f32_e32 vcc, 1.0, [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: test_class_inline_imm_constant_dynamic_mask_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_class_f32_e32 vcc, 1.0, v2 +; SI-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -163,14 +224,24 @@ define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f32(ptr a } ; FIXME: Why isn't this using a literal constant operand? -; SI-LABEL: {{^}}test_class_lit_constant_dynamic_mask_f32: -; SI-DAG: buffer_load_dword [[VB:v[0-9]+]] -; SI-DAG: s_mov_b32 [[VK:s[0-9]+]], 0x44800000 -; SI: v_cmp_class_f32_e32 vcc, [[VK]], [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: test_class_lit_constant_dynamic_mask_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b32 s4, 0x44800000 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v2 +; SI-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -182,30 +253,40 @@ define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f32(ptr addrspac ret void } -; SI-LABEL: {{^}}test_class_f64: -; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d -; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[VB]] -; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm define amdgpu_kernel void @test_class_f64(ptr addrspace(1) %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 { +; SI-LABEL: test_class_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s8, s[4:5], 0x1d +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_cmp_class_f64_e32 vcc, s[6:7], v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 %b) #1 %sext = sext i1 %result to i32 store i32 %sext, ptr addrspace(1) %out, align 4 ret void } -; SI-LABEL: {{^}}test_class_fabs_f64: -; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d -; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]] -; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm define amdgpu_kernel void @test_class_fabs_f64(ptr addrspace(1) %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 { +; SI-LABEL: test_class_fabs_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s8, s[4:5], 0x1d +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_cmp_class_f64_e64 s[4:5], |s[6:7]|, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %a.fabs = call double @llvm.fabs.f64(double %a) #1 %result = call i1 @llvm.amdgcn.class.f64(double %a.fabs, i32 %b) #1 %sext = sext i1 %result to i32 @@ -213,15 +294,20 @@ define amdgpu_kernel void @test_class_fabs_f64(ptr addrspace(1) %out, [8 x i32], ret void } -; SI-LABEL: {{^}}test_class_fneg_f64: -; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d -; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]] -; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm define amdgpu_kernel void @test_class_fneg_f64(ptr addrspace(1) %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 { +; SI-LABEL: test_class_fneg_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s8, s[4:5], 0x1d +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_cmp_class_f64_e64 s[4:5], -s[6:7], v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %a.fneg = fsub double -0.0, %a %result = call i1 @llvm.amdgcn.class.f64(double %a.fneg, i32 %b) #1 %sext = sext i1 %result to i32 @@ -229,15 +315,20 @@ define amdgpu_kernel void @test_class_fneg_f64(ptr addrspace(1) %out, [8 x i32], ret void } -; SI-LABEL: {{^}}test_class_fneg_fabs_f64: -; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d -; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]] -; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm define amdgpu_kernel void @test_class_fneg_fabs_f64(ptr addrspace(1) %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 { +; SI-LABEL: test_class_fneg_fabs_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s8, s[4:5], 0x1d +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_cmp_class_f64_e64 s[4:5], -|s[6:7]|, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %a.fabs = call double @llvm.fabs.f64(double %a) #1 %a.fneg.fabs = fsub double -0.0, %a.fabs %result = call i1 @llvm.amdgcn.class.f64(double %a.fneg.fabs, i32 %b) #1 @@ -246,20 +337,38 @@ define amdgpu_kernel void @test_class_fneg_fabs_f64(ptr addrspace(1) %out, [8 x ret void } -; SI-LABEL: {{^}}test_class_1_f64: -; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 1{{$}} -; SI: s_endpgm define amdgpu_kernel void @test_class_1_f64(ptr addrspace(1) %out, double %a) #0 { +; SI-LABEL: test_class_1_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_cmp_class_f64_e64 s[0:1], s[2:3], 1 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 1) #1 %sext = sext i1 %result to i32 store i32 %sext, ptr addrspace(1) %out, align 4 ret void } -; SI-LABEL: {{^}}test_class_64_f64: -; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 64{{$}} -; SI: s_endpgm define amdgpu_kernel void @test_class_64_f64(ptr addrspace(1) %out, double %a) #0 { +; SI-LABEL: test_class_64_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_cmp_class_f64_e64 s[0:1], s[2:3], 64 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 64) #1 %sext = sext i1 %result to i32 store i32 %sext, ptr addrspace(1) %out, align 4 @@ -267,30 +376,45 @@ define amdgpu_kernel void @test_class_64_f64(ptr addrspace(1) %out, double %a) # } ; Set all 9 bits of mask -; SI-LABEL: {{^}}test_class_full_mask_f64: -; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 -; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} -; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[MASK]] -; SI-NOT: vcc -; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm define amdgpu_kernel void @test_class_full_mask_f64(ptr addrspace(1) %out, [8 x i32], double %a) #0 { +; SI-LABEL: test_class_full_mask_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x13 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x1ff +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cmp_class_f64_e32 vcc, s[4:5], v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 511) #1 %sext = sext i1 %result to i32 store i32 %sext, ptr addrspace(1) %out, align 4 ret void } -; SI-LABEL: {{^}}v_test_class_full_mask_f64: -; SI-DAG: buffer_load_dwordx2 [[VA:v\[[0-9]+:[0-9]+\]]] -; SI-DAG: s_movk_i32 [[MASK:s[0-9]+]], 0x1ff{{$}} -; SI: v_cmp_class_f64_e64 s[{{[0-9]}}:{{[0-9]}}], [[VA]], [[MASK]] -; SI-NOT: vcc -; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, s[{{[0-9]}}:{{[0-9]}}] -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm define amdgpu_kernel void @v_test_class_full_mask_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: v_test_class_full_mask_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_movk_i32 s4, 0x1ff +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], s4 +; SI-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -302,11 +426,23 @@ define amdgpu_kernel void @v_test_class_full_mask_f64(ptr addrspace(1) %out, ptr ret void } -; SI-LABEL: {{^}}test_class_inline_imm_constant_dynamic_mask_f64: -; XSI: v_cmp_class_f64_e32 vcc, 1.0, -; SI: v_cmp_class_f64_e32 vcc, -; SI: s_endpgm define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: test_class_inline_imm_constant_dynamic_mask_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_class_f64_e32 vcc, 1.0, v2 +; SI-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -318,10 +454,25 @@ define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f64(ptr a ret void } -; SI-LABEL: {{^}}test_class_lit_constant_dynamic_mask_f64: -; SI: v_cmp_class_f64_e32 vcc, s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} -; SI: s_endpgm define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: test_class_lit_constant_dynamic_mask_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b32 s4, 0 +; SI-NEXT: s_mov_b32 s5, 0x40900000 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_class_f64_e32 vcc, s[4:5], v2 +; SI-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -333,12 +484,26 @@ define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f64(ptr addrspac ret void } -; SI-LABEL: {{^}}test_fold_or_class_f32_0: -; SI-NOT: v_cmp_class -; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 3{{$}} -; SI-NOT: v_cmp_class -; SI: s_endpgm define amdgpu_kernel void @test_fold_or_class_f32_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: test_fold_or_class_f32_0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, 3 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -353,12 +518,26 @@ define amdgpu_kernel void @test_fold_or_class_f32_0(ptr addrspace(1) %out, ptr a ret void } -; SI-LABEL: {{^}}test_fold_or3_class_f32_0: -; SI-NOT: v_cmp_class -; SI: v_cmp_class_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}} -; SI-NOT: v_cmp_class -; SI: s_endpgm define amdgpu_kernel void @test_fold_or3_class_f32_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: test_fold_or3_class_f32_0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, 7 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -375,13 +554,27 @@ define amdgpu_kernel void @test_fold_or3_class_f32_0(ptr addrspace(1) %out, ptr ret void } -; SI-LABEL: {{^}}test_fold_or_all_tests_class_f32_0: -; SI-NOT: v_cmp_class -; SI: s_movk_i32 [[MASK:s[0-9]+]], 0x3ff{{$}} -; SI: v_cmp_class_f32_e64 s[0:1], v{{[0-9]+}}, [[MASK]]{{$}} -; SI-NOT: v_cmp_class -; SI: s_endpgm define amdgpu_kernel void @test_fold_or_all_tests_class_f32_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: test_fold_or_all_tests_class_f32_0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_movk_i32 s2, 0x3ff +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, s2 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -411,12 +604,26 @@ define amdgpu_kernel void @test_fold_or_all_tests_class_f32_0(ptr addrspace(1) % ret void } -; SI-LABEL: {{^}}test_fold_or_class_f32_1: -; SI-NOT: v_cmp_class -; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 12{{$}} -; SI-NOT: v_cmp_class -; SI: s_endpgm define amdgpu_kernel void @test_fold_or_class_f32_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: test_fold_or_class_f32_1: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, 12 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -431,12 +638,26 @@ define amdgpu_kernel void @test_fold_or_class_f32_1(ptr addrspace(1) %out, ptr a ret void } -; SI-LABEL: {{^}}test_fold_or_class_f32_2: -; SI-NOT: v_cmp_class -; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}} -; SI-NOT: v_cmp_class -; SI: s_endpgm define amdgpu_kernel void @test_fold_or_class_f32_2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: test_fold_or_class_f32_2: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, 7 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -451,12 +672,29 @@ define amdgpu_kernel void @test_fold_or_class_f32_2(ptr addrspace(1) %out, ptr a ret void } -; SI-LABEL: {{^}}test_no_fold_or_class_f32_0: -; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 4{{$}} -; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 8{{$}} -; SI: s_or_b64 -; SI: s_endpgm define amdgpu_kernel void @test_no_fold_or_class_f32_0(ptr addrspace(1) %out, ptr addrspace(1) %in, float %b) #0 { +; SI-LABEL: test_no_fold_or_class_f32_0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dword s12, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_cmp_class_f32_e64 s[0:1], s12, 8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_class_f32_e64 s[2:3], v0, 4 +; SI-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -471,72 +709,94 @@ define amdgpu_kernel void @test_no_fold_or_class_f32_0(ptr addrspace(1) %out, pt ret void } -; SI-LABEL: {{^}}test_class_0_f32: -; SI-NOT: v_cmp_class -; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm define amdgpu_kernel void @test_class_0_f32(ptr addrspace(1) %out, float %a) #0 { +; SI-LABEL: test_class_0_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 0) #1 %sext = sext i1 %result to i32 store i32 %sext, ptr addrspace(1) %out, align 4 ret void } -; SI-LABEL: {{^}}test_class_0_f64: -; SI-NOT: v_cmp_class -; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm define amdgpu_kernel void @test_class_0_f64(ptr addrspace(1) %out, double %a) #0 { +; SI-LABEL: test_class_0_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 0) #1 %sext = sext i1 %result to i32 store i32 %sext, ptr addrspace(1) %out, align 4 ret void } -; SI-LABEL: {{^}}test_class_undef_f32: -; SI-NOT: v_cmp_class -; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0 -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm define amdgpu_kernel void @test_class_undef_f32(ptr addrspace(1) %out, float %a, i32 %b) #0 { +; SI-LABEL: test_class_undef_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %result = call i1 @llvm.amdgcn.class.f32(float poison, i32 %b) #1 %sext = sext i1 %result to i32 store i32 %sext, ptr addrspace(1) %out, align 4 ret void } -; SI-LABEL: {{^}}test_fold_and_ord: -; SI: s_waitcnt -; SI-NEXT: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v0, 32{{$}} -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, [[COND]] -; SI-NEXT: s_setpc_b64 define i1 @test_fold_and_ord(float %a) { +; SI-LABEL: test_fold_and_ord: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v0, 32 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] %class = call i1 @llvm.amdgcn.class.f32(float %a, i32 35) #1 %ord = fcmp ord float %a, %a %and = and i1 %ord, %class ret i1 %and } -; SI-LABEL: {{^}}test_fold_and_unord: -; SI: s_waitcnt -; SI-NEXT: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v0, 3{{$}} -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, [[COND]] -; SI-NEXT: s_setpc_b64 define i1 @test_fold_and_unord(float %a) { +; SI-LABEL: test_fold_and_unord: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v0, 3 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] %class = call i1 @llvm.amdgcn.class.f32(float %a, i32 35) #1 %ord = fcmp uno float %a, %a %and = and i1 %ord, %class ret i1 %and } -; SI-LABEL: {{^}}test_fold_and_ord_multi_use: -; SI: v_cmp_class -; SI-NOT: v_cmp_class -; SI: v_cmp_o -; SI: s_and_b64 define i1 @test_fold_and_ord_multi_use(float %a) { +; SI-LABEL: test_fold_and_ord_multi_use: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v0, 35 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: buffer_store_byte v1, off, s[4:7], 0 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] %class = call i1 @llvm.amdgcn.class.f32(float %a, i32 35) #1 store volatile i1 %class, ptr addrspace(1) poison %ord = fcmp ord float %a, %a diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll index 8732c77778b01..67ae05eb6f0b8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,NOLIT-SRCC,GFX908,GFX908_A %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,LIT-SRCC,GFX908,GFX908_A %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A,GFX908_A,GFX90A_42 %s @@ -18,57 +19,392 @@ declare <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32, i32, <16 x i32>, i32, i3 declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32) declare i32 @llvm.amdgcn.workitem.id.x() -; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32: -; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 -; GCN-DAG: s_load_dwordx16 -; GCN-DAG: s_load_dwordx16 -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX90A_42-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}} -; GFX908_A: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GFX942: v_mfma_f32_32x32x1_2b_f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GFX908-COUNT-4: v_accvgpr_read_b32 -; GFX908-COUNT-2: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] -; GFX908-COUNT-4: v_accvgpr_read_b32 -; GFX908-COUNT-2: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] -; GFX908-COUNT-4: v_accvgpr_read_b32 -; GFX908-COUNT-2: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] -; GFX908-COUNT-4: v_accvgpr_read_b32 -; GFX908-COUNT-2: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] -; GFX90A-NOT: v_accvgpr_read_b32 -; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { +; NOLIT-SRCC-LABEL: test_mfma_f32_32x32x1f32: +; NOLIT-SRCC: ; %bb.0: ; %bb +; NOLIT-SRCC-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 +; NOLIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s16 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s17 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s18 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s21 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s22 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s23 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s24 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s25 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s26 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s27 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s28 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s29 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s30 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s31 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a16, v2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s1 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s3 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a17, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a18, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a19, v2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s4 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s5 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s6 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a20, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a21, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a22, v2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s7 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s8 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s9 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s19 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a23, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a24, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a25, v2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s10 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s11 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s12 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v5, s20 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v3 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a26, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s13 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s14 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s15 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v5 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 2.0 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3 +; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 +; NOLIT-SRCC-NEXT: s_endpgm +; +; LIT-SRCC-LABEL: test_mfma_f32_32x32x1f32: +; LIT-SRCC: ; %bb.0: ; %bb +; LIT-SRCC-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 +; LIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s16 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s17 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s18 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s21 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s22 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s23 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s24 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s25 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s26 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s27 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s28 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s29 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s30 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s31 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a16, v2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s1 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s3 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a17, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a18, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a19, v2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s4 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s5 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s6 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a20, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a21, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a22, v2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s7 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s8 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s9 +; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s19 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a23, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a24, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a25, v2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s10 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s11 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s12 +; LIT-SRCC-NEXT: v_mov_b32_e32 v5, s20 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v3 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a26, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s13 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s14 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s15 +; LIT-SRCC-NEXT: v_mov_b32_e32 v3, 1.0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v5 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 2.0 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3 +; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 +; LIT-SRCC-NEXT: s_endpgm +; +; GFX90A-LABEL: test_mfma_f32_32x32x1f32: +; GFX90A: ; %bb.0: ; %bb +; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 +; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, s16 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, s17 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, s18 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, s19 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, s20 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, s21 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, s22 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, s23 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, s24 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, s25 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, s26 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, s27 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, s28 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, s29 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, s30 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, s31 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, s2 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, s3 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, s4 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, s5 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, s6 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, s7 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, s8 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, s9 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, s10 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, s11 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, s12 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, s13 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, s14 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15 +; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 +; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 +; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64 +; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80 +; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35] +; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16 +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: test_mfma_f32_32x32x1f32: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v2, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 +; GFX942-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_accvgpr_write_b32 a0, s16 +; GFX942-NEXT: v_accvgpr_write_b32 a1, s17 +; GFX942-NEXT: v_accvgpr_write_b32 a2, s18 +; GFX942-NEXT: v_accvgpr_write_b32 a3, s19 +; GFX942-NEXT: v_accvgpr_write_b32 a4, s20 +; GFX942-NEXT: v_accvgpr_write_b32 a5, s21 +; GFX942-NEXT: v_accvgpr_write_b32 a6, s22 +; GFX942-NEXT: v_accvgpr_write_b32 a7, s23 +; GFX942-NEXT: v_accvgpr_write_b32 a8, s24 +; GFX942-NEXT: v_accvgpr_write_b32 a9, s25 +; GFX942-NEXT: v_accvgpr_write_b32 a10, s26 +; GFX942-NEXT: v_accvgpr_write_b32 a11, s27 +; GFX942-NEXT: v_accvgpr_write_b32 a12, s28 +; GFX942-NEXT: v_accvgpr_write_b32 a13, s29 +; GFX942-NEXT: v_accvgpr_write_b32 a14, s30 +; GFX942-NEXT: v_accvgpr_write_b32 a15, s31 +; GFX942-NEXT: v_accvgpr_write_b32 a16, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a17, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a18, s2 +; GFX942-NEXT: v_accvgpr_write_b32 a19, s3 +; GFX942-NEXT: v_accvgpr_write_b32 a20, s4 +; GFX942-NEXT: v_accvgpr_write_b32 a21, s5 +; GFX942-NEXT: v_accvgpr_write_b32 a22, s6 +; GFX942-NEXT: v_accvgpr_write_b32 a23, s7 +; GFX942-NEXT: v_accvgpr_write_b32 a24, s8 +; GFX942-NEXT: v_accvgpr_write_b32 a25, s9 +; GFX942-NEXT: v_accvgpr_write_b32 a26, s10 +; GFX942-NEXT: v_accvgpr_write_b32 a27, s11 +; GFX942-NEXT: v_accvgpr_write_b32 a28, s12 +; GFX942-NEXT: v_accvgpr_write_b32 a29, s13 +; GFX942-NEXT: v_accvgpr_write_b32 a30, s14 +; GFX942-NEXT: v_accvgpr_write_b32 a31, s15 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 +; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 +; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64 +; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80 +; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32 +; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48 +; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35] +; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16 +; GFX942-NEXT: s_endpgm bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3) @@ -76,19 +412,212 @@ bb: ret void } -; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32: -; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 -; GCN-DAG: s_load_dwordx16 -; GFX908-DAG-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX90A_42-COUNT-16:v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}} -; GFX908_A: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GFX942: v_mfma_f32_16x16x1_4b_f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GFX908-COUNT: v_accvgpr_read_b32 -; GFX908-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] -; GFX90A-NOT: v_accvgpr_read_b32 -; GFX90A-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { +; NOLIT-SRCC-LABEL: test_mfma_f32_16x16x1f32: +; NOLIT-SRCC: ; %bb.0: ; %bb +; NOLIT-SRCC-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v12, 0 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s1 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v13 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s3 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v13 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s4 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s5 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s6 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v13 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s7 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s8 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s9 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v13 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s10 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s11 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s12 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v13 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s13 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s14 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s15 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v13 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 +; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:48 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:32 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; NOLIT-SRCC-NEXT: s_endpgm +; +; LIT-SRCC-LABEL: test_mfma_f32_16x16x1f32: +; LIT-SRCC: ; %bb.0: ; %bb +; LIT-SRCC-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v12, 0 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s1 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v13 +; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s3 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v13 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s4 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s5 +; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s6 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v13 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s7 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s8 +; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s9 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v13 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s10 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s11 +; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s12 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v13 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s13 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s14 +; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s15 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v13 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 +; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 +; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:48 +; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:32 +; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; LIT-SRCC-NEXT: s_endpgm +; +; GFX90A-LABEL: test_mfma_f32_16x16x1f32: +; GFX90A: ; %bb.0: ; %bb +; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, s8 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, s9 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, s10 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, s11 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, s12 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, s13 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, s14 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: test_mfma_f32_16x16x1f32: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-NEXT: v_accvgpr_write_b32 a4, s4 +; GFX942-NEXT: v_accvgpr_write_b32 a5, s5 +; GFX942-NEXT: v_accvgpr_write_b32 a6, s6 +; GFX942-NEXT: v_accvgpr_write_b32 a7, s7 +; GFX942-NEXT: v_accvgpr_write_b32 a8, s8 +; GFX942-NEXT: v_accvgpr_write_b32 a9, s9 +; GFX942-NEXT: v_accvgpr_write_b32 a10, s10 +; GFX942-NEXT: v_accvgpr_write_b32 a11, s11 +; GFX942-NEXT: v_accvgpr_write_b32 a12, s12 +; GFX942-NEXT: v_accvgpr_write_b32 a13, s13 +; GFX942-NEXT: v_accvgpr_write_b32 a14, s14 +; GFX942-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 +; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 +; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GFX942-NEXT: s_endpgm bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %in.1, i32 1, i32 2, i32 3) @@ -96,19 +625,100 @@ bb: ret void } -; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32: -; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 -; GCN: s_load_dwordx4 -; GFX908-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX90A_42-COUNT-4:v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}} -; GFX908_A: v_mfma_f32_4x4x1f32 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GFX942: v_mfma_f32_4x4x1_16b_f32 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GFX908-COUNT-4: v_accvgpr_read_b32 -; GFX908: global_store_dwordx4 -; GFX90A-NOT: v_accvgpr_read_b32 -; GFX90A: global_store_dwordx4 {{v[0-9]+}}, [[RES]] define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 { +; NOLIT-SRCC-LABEL: test_mfma_f32_4x4x1f32: +; NOLIT-SRCC: ; %bb.0: ; %bb +; NOLIT-SRCC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v5, s0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s1 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v5 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v5, s3 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v3 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v5 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3 +; NOLIT-SRCC-NEXT: s_nop 3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; NOLIT-SRCC-NEXT: s_endpgm +; +; LIT-SRCC-LABEL: test_mfma_f32_4x4x1f32: +; LIT-SRCC: ; %bb.0: ; %bb +; LIT-SRCC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: v_mov_b32_e32 v5, s0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s1 +; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v5 +; LIT-SRCC-NEXT: v_mov_b32_e32 v5, s3 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v3 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v5 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3 +; LIT-SRCC-NEXT: s_nop 3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; LIT-SRCC-NEXT: s_endpgm +; +; GFX90A-LABEL: test_mfma_f32_4x4x1f32: +; GFX90A: ; %bb.0: ; %bb +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: s_nop 4 +; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: test_mfma_f32_4x4x1f32: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v2, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: s_nop 3 +; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] +; GFX942-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %in.1, i32 1, i32 2, i32 3) @@ -116,19 +726,214 @@ bb: ret void } -; GCN-LABEL: {{^}}test_mfma_f32_32x32x2f32: -; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 -; GCN-DAG: s_load_dwordx16 -; GFX908-DAG-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX90A_42-COUNT-16:v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}} -; GFX908_A: v_mfma_f32_32x32x2f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GFX942: v_mfma_f32_32x32x2_f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GFX908-COUNT-16: v_accvgpr_read_b32 -; GFX908-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] -; GFX90A-NOT: v_accvgpr_read_b32 -; GFX90A-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 { +; NOLIT-SRCC-LABEL: test_mfma_f32_32x32x2f32: +; NOLIT-SRCC: ; %bb.0: ; %bb +; NOLIT-SRCC-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s1 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v17 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s3 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v17 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s4 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s5 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s6 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v17 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s7 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s8 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s9 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v17 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s10 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s11 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s12 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v17 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s13 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s14 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s15 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v17 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 +; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; NOLIT-SRCC-NEXT: s_endpgm +; +; LIT-SRCC-LABEL: test_mfma_f32_32x32x2f32: +; LIT-SRCC: ; %bb.0: ; %bb +; LIT-SRCC-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s1 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v17 +; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s3 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v17 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s4 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s5 +; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s6 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v17 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s7 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s8 +; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s9 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v17 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s10 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s11 +; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s12 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v17 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s13 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s14 +; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s15 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v17 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 +; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; LIT-SRCC-NEXT: s_endpgm +; +; GFX90A-LABEL: test_mfma_f32_32x32x2f32: +; GFX90A: ; %bb.0: ; %bb +; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, s8 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, s9 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, s10 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, s11 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, s12 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, s13 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, s14 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: test_mfma_f32_32x32x2f32: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-NEXT: v_accvgpr_write_b32 a4, s4 +; GFX942-NEXT: v_accvgpr_write_b32 a5, s5 +; GFX942-NEXT: v_accvgpr_write_b32 a6, s6 +; GFX942-NEXT: v_accvgpr_write_b32 a7, s7 +; GFX942-NEXT: v_accvgpr_write_b32 a8, s8 +; GFX942-NEXT: v_accvgpr_write_b32 a9, s9 +; GFX942-NEXT: v_accvgpr_write_b32 a10, s10 +; GFX942-NEXT: v_accvgpr_write_b32 a11, s11 +; GFX942-NEXT: v_accvgpr_write_b32 a12, s12 +; GFX942-NEXT: v_accvgpr_write_b32 a13, s13 +; GFX942-NEXT: v_accvgpr_write_b32 a14, s14 +; GFX942-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mfma_f32_32x32x2_f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 +; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 +; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GFX942-NEXT: s_endpgm bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float 1.0, float 2.0, <16 x float> %in.1, i32 1, i32 2, i32 3) @@ -136,19 +941,104 @@ bb: ret void } -; GCN-LABEL: {{^}}test_mfma_f32_16x16x4f32: -; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 -; GCN: s_load_dwordx4 -; GFX908-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX90A_42-COUNT-4:v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}} -; GFX908_A: v_mfma_f32_16x16x4f32 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GFX942: v_mfma_f32_16x16x4_f32 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GFX908-COUNT-4: v_accvgpr_read_b32 -; GFX908: global_store_dwordx4 -; GFX90A-NOT: v_accvgpr_read_b32 -; GFX90A: global_store_dwordx4 {{v[0-9]+}}, [[RES]], define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 { +; NOLIT-SRCC-LABEL: test_mfma_f32_16x16x4f32: +; NOLIT-SRCC: ; %bb.0: ; %bb +; NOLIT-SRCC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v5, s0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s1 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v5 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v5, s3 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v3 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v5 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x4f32 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3 +; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; NOLIT-SRCC-NEXT: s_endpgm +; +; LIT-SRCC-LABEL: test_mfma_f32_16x16x4f32: +; LIT-SRCC: ; %bb.0: ; %bb +; LIT-SRCC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: v_mov_b32_e32 v5, s0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s1 +; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v5 +; LIT-SRCC-NEXT: v_mov_b32_e32 v5, s3 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v3 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v5 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_mfma_f32_16x16x4f32 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3 +; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; LIT-SRCC-NEXT: s_endpgm +; +; GFX90A-LABEL: test_mfma_f32_16x16x4f32: +; GFX90A: ; %bb.0: ; %bb +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: v_mfma_f32_16x16x4f32 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: test_mfma_f32_16x16x4f32: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v2, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mfma_f32_16x16x4_f32 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] +; GFX942-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float 1.0, float 2.0, <4 x float> %in.1, i32 1, i32 2, i32 3) @@ -156,18 +1046,408 @@ bb: ret void } -; GCN-LABEL: {{^}}test_mfma_f32_32x32x4f16: -; GCN-DAG: s_load_dwordx16 -; GCN-DAG: s_load_dwordx16 -; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX90A_42-COUNT-32:v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}} -; GFX908_A: v_mfma_f32_32x32x4f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GFX942: v_mfma_f32_32x32x4_2b_f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GFX908-COUNT-32: v_accvgpr_read_b32 -; GFX908: global_store_dwordx4 -; GFX90A-NOT: v_accvgpr_read_b32 -; GFX90A-COUNT-8: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}], define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr addrspace(1) %c) #0 { +; NOLIT-SRCC-LABEL: test_mfma_f32_32x32x4f16: +; NOLIT-SRCC: ; %bb.0: ; %bb +; NOLIT-SRCC-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: s_load_dwordx16 s[16:31], s[36:37], 0x0 +; NOLIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[36:37], 0x40 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s16 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s17 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s18 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s22 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s24 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s23 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s25 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v1 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s26 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s27 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s28 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v1 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s29 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s30 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s31 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v1 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a16, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a17, v1 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s3 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s4 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a18, v2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s5 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a19, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a20, v1 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s6 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s7 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a21, v2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s8 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a22, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a23, v1 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s9 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s10 +; NOLIT-SRCC-NEXT: s_load_dwordx4 s[0:3], s[38:39], 0x0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s19 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a24, v2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s11 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a25, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a26, v1 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s12 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s13 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v5, s20 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v6, s21 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v3 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v1 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s14 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s15 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v5 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v6 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v3 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s3 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3 +; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:96 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:112 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:64 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:80 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:32 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:48 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:16 +; NOLIT-SRCC-NEXT: s_endpgm +; +; LIT-SRCC-LABEL: test_mfma_f32_32x32x4f16: +; LIT-SRCC: ; %bb.0: ; %bb +; LIT-SRCC-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 +; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: s_load_dwordx16 s[16:31], s[36:37], 0x0 +; LIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[36:37], 0x40 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s16 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s17 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s18 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s22 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s24 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s23 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s25 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v1 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s26 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s27 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s28 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v1 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s29 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s30 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s31 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v1 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a16, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a17, v1 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s3 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s4 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a18, v2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s5 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a19, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a20, v1 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s6 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s7 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a21, v2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s8 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a22, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a23, v1 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s9 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s10 +; LIT-SRCC-NEXT: s_load_dwordx4 s[0:3], s[38:39], 0x0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s19 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a24, v2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s11 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a25, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a26, v1 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s12 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s13 +; LIT-SRCC-NEXT: v_mov_b32_e32 v5, s20 +; LIT-SRCC-NEXT: v_mov_b32_e32 v6, s21 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v3 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v1 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s14 +; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s15 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v5 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v6 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v3 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s3 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3 +; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:96 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:112 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:64 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:80 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:32 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:48 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:16 +; LIT-SRCC-NEXT: s_endpgm +; +; GFX90A-LABEL: test_mfma_f32_32x32x4f16: +; GFX90A: ; %bb.0: ; %bb +; GFX90A-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[36:37], 0x40 +; GFX90A-NEXT: s_load_dwordx16 s[16:31], s[36:37], 0x0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a16, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, s2 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, s3 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[38:39], 0x0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, s16 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, s17 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, s18 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, s19 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, s20 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, s21 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, s22 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, s23 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, s24 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, s25 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, s26 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, s27 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, s28 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, s29 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, s30 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, s31 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, s4 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, s5 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, s6 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, s7 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, s8 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, s9 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, s10 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, s11 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, s12 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, s13 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, s14 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15 +; GFX90A-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: global_store_dwordx4 v4, a[24:27], s[36:37] offset:96 +; GFX90A-NEXT: global_store_dwordx4 v4, a[28:31], s[36:37] offset:112 +; GFX90A-NEXT: global_store_dwordx4 v4, a[16:19], s[36:37] offset:64 +; GFX90A-NEXT: global_store_dwordx4 v4, a[20:23], s[36:37] offset:80 +; GFX90A-NEXT: global_store_dwordx4 v4, a[8:11], s[36:37] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v4, a[12:15], s[36:37] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v4, a[0:3], s[36:37] +; GFX90A-NEXT: global_store_dwordx4 v4, a[4:7], s[36:37] offset:16 +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: test_mfma_f32_32x32x4f16: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_load_dwordx16 s[0:15], s[36:37], 0x40 +; GFX942-NEXT: s_load_dwordx16 s[16:31], s[36:37], 0x0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_accvgpr_write_b32 a16, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a17, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a18, s2 +; GFX942-NEXT: v_accvgpr_write_b32 a19, s3 +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[38:39], 0x0 +; GFX942-NEXT: v_accvgpr_write_b32 a0, s16 +; GFX942-NEXT: v_accvgpr_write_b32 a1, s17 +; GFX942-NEXT: v_accvgpr_write_b32 a2, s18 +; GFX942-NEXT: v_accvgpr_write_b32 a3, s19 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a4, s20 +; GFX942-NEXT: v_accvgpr_write_b32 a5, s21 +; GFX942-NEXT: v_accvgpr_write_b32 a6, s22 +; GFX942-NEXT: v_accvgpr_write_b32 a7, s23 +; GFX942-NEXT: v_accvgpr_write_b32 a8, s24 +; GFX942-NEXT: v_accvgpr_write_b32 a9, s25 +; GFX942-NEXT: v_accvgpr_write_b32 a10, s26 +; GFX942-NEXT: v_accvgpr_write_b32 a11, s27 +; GFX942-NEXT: v_accvgpr_write_b32 a12, s28 +; GFX942-NEXT: v_accvgpr_write_b32 a13, s29 +; GFX942-NEXT: v_accvgpr_write_b32 a14, s30 +; GFX942-NEXT: v_accvgpr_write_b32 a15, s31 +; GFX942-NEXT: v_accvgpr_write_b32 a20, s4 +; GFX942-NEXT: v_accvgpr_write_b32 a21, s5 +; GFX942-NEXT: v_accvgpr_write_b32 a22, s6 +; GFX942-NEXT: v_accvgpr_write_b32 a23, s7 +; GFX942-NEXT: v_accvgpr_write_b32 a24, s8 +; GFX942-NEXT: v_accvgpr_write_b32 a25, s9 +; GFX942-NEXT: v_accvgpr_write_b32 a26, s10 +; GFX942-NEXT: v_accvgpr_write_b32 a27, s11 +; GFX942-NEXT: v_accvgpr_write_b32 a28, s12 +; GFX942-NEXT: v_accvgpr_write_b32 a29, s13 +; GFX942-NEXT: v_accvgpr_write_b32 a30, s14 +; GFX942-NEXT: v_accvgpr_write_b32 a31, s15 +; GFX942-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NEXT: v_mov_b32_e32 v3, s3 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 2 +; GFX942-NEXT: global_store_dwordx4 v4, a[24:27], s[36:37] offset:96 +; GFX942-NEXT: global_store_dwordx4 v4, a[28:31], s[36:37] offset:112 +; GFX942-NEXT: global_store_dwordx4 v4, a[16:19], s[36:37] offset:64 +; GFX942-NEXT: global_store_dwordx4 v4, a[20:23], s[36:37] offset:80 +; GFX942-NEXT: global_store_dwordx4 v4, a[8:11], s[36:37] offset:32 +; GFX942-NEXT: global_store_dwordx4 v4, a[12:15], s[36:37] offset:48 +; GFX942-NEXT: global_store_dwordx4 v4, a[0:3], s[36:37] +; GFX942-NEXT: global_store_dwordx4 v4, a[4:7], s[36:37] offset:16 +; GFX942-NEXT: s_endpgm bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg %c.1 = load <4 x half>, ptr addrspace(1) %c @@ -178,17 +1458,224 @@ bb: ret void } -; GCN-LABEL: {{^}}test_mfma_f32_16x16x4f16: -; GCN: s_load_dwordx16 -; GFX908-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX90A_42-COUNT-16:v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}} -; GFX908_A: v_mfma_f32_16x16x4f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GFX942: v_mfma_f32_16x16x4_4b_f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GFX908-COUNT-16: v_accvgpr_read_b32 -; GFX908: global_store_dwordx4 -; GFX90A-NOT: v_accvgpr_read_b32 -; GFX90A-COUNT-4: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}], define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr addrspace(1) %c) #0 { +; NOLIT-SRCC-LABEL: test_mfma_f32_16x16x4f16: +; NOLIT-SRCC: ; %bb.0: ; %bb +; NOLIT-SRCC-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v12, 0 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: s_load_dwordx4 s[20:23], s[18:19], 0x0 +; NOLIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s20 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s1 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, s2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v13 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s3 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v3 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v4 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v13 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s4 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, s5 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s6 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v3 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v4 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v13 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s7 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, s8 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s9 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v3 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v4 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v13 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s10 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, s11 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s12 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s21 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v3 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v4 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v13 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s13 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, s14 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s15 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v3 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v4 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v13 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s22 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s23 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 +; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:48 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:32 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; NOLIT-SRCC-NEXT: s_endpgm +; +; LIT-SRCC-LABEL: test_mfma_f32_16x16x4f16: +; LIT-SRCC: ; %bb.0: ; %bb +; LIT-SRCC-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 +; LIT-SRCC-NEXT: v_mov_b32_e32 v12, 0 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: s_load_dwordx4 s[20:23], s[18:19], 0x0 +; LIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s20 +; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s1 +; LIT-SRCC-NEXT: v_mov_b32_e32 v4, s2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v13 +; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s3 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v3 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v4 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v13 +; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s4 +; LIT-SRCC-NEXT: v_mov_b32_e32 v4, s5 +; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s6 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v3 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v4 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v13 +; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s7 +; LIT-SRCC-NEXT: v_mov_b32_e32 v4, s8 +; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s9 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v3 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v4 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v13 +; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s10 +; LIT-SRCC-NEXT: v_mov_b32_e32 v4, s11 +; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s12 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s21 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v3 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v4 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v13 +; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s13 +; LIT-SRCC-NEXT: v_mov_b32_e32 v4, s14 +; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s15 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v3 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v4 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v13 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s22 +; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s23 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 +; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 +; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:48 +; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:32 +; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; LIT-SRCC-NEXT: s_endpgm +; +; GFX90A-LABEL: test_mfma_f32_16x16x4f16: +; GFX90A: ; %bb.0: ; %bb +; GFX90A-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_load_dwordx4 s[20:23], s[18:19], 0x0 +; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s20 +; GFX90A-NEXT: v_mov_b32_e32 v1, s21 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX90A-NEXT: v_mov_b32_e32 v2, s22 +; GFX90A-NEXT: v_mov_b32_e32 v3, s23 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, s8 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, s9 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, s10 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, s11 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, s12 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, s13 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, s14 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: test_mfma_f32_16x16x4f16: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_load_dwordx4 s[20:23], s[18:19], 0x0 +; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, s20 +; GFX942-NEXT: v_mov_b32_e32 v1, s21 +; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-NEXT: v_mov_b32_e32 v2, s22 +; GFX942-NEXT: v_mov_b32_e32 v3, s23 +; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-NEXT: v_accvgpr_write_b32 a4, s4 +; GFX942-NEXT: v_accvgpr_write_b32 a5, s5 +; GFX942-NEXT: v_accvgpr_write_b32 a6, s6 +; GFX942-NEXT: v_accvgpr_write_b32 a7, s7 +; GFX942-NEXT: v_accvgpr_write_b32 a8, s8 +; GFX942-NEXT: v_accvgpr_write_b32 a9, s9 +; GFX942-NEXT: v_accvgpr_write_b32 a10, s10 +; GFX942-NEXT: v_accvgpr_write_b32 a11, s11 +; GFX942-NEXT: v_accvgpr_write_b32 a12, s12 +; GFX942-NEXT: v_accvgpr_write_b32 a13, s13 +; GFX942-NEXT: v_accvgpr_write_b32 a14, s14 +; GFX942-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mfma_f32_16x16x4_4b_f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 +; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 +; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GFX942-NEXT: s_endpgm bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %c.1 = load <4 x half>, ptr addrspace(1) %c @@ -199,18 +1686,112 @@ bb: ret void } -; GCN-LABEL: {{^}}test_mfma_f32_4x4x4f16: -; GCN: s_load_dwordx4 -; GCN: s_load_dwordx4 -; GFX908-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX90A_42-COUNT-4:v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}} -; GFX908_A: v_mfma_f32_4x4x4f16 [[RES:a\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GFX942: v_mfma_f32_4x4x4_16b_f16 [[RES:a\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GFX908-COUNT-4: v_accvgpr_read_b32 -; GFX908: global_store_dwordx4 -; GFX90A-NOT: v_accvgpr_read_b32 -; GFX90A: global_store_dwordx4 {{v[0-9]+}}, [[RES]], define amdgpu_kernel void @test_mfma_f32_4x4x4f16(ptr addrspace(1) %arg, ptr addrspace(1) %c) #0 { +; NOLIT-SRCC-LABEL: test_mfma_f32_4x4x4f16: +; NOLIT-SRCC: ; %bb.0: ; %bb +; NOLIT-SRCC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; NOLIT-SRCC-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s4 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v5, s8 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s5 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v6, s9 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v5 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v7, s10 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v5, s11 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v6 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v7 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v5 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s6 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s7 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 +; NOLIT-SRCC-NEXT: s_nop 3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; NOLIT-SRCC-NEXT: s_endpgm +; +; LIT-SRCC-LABEL: test_mfma_f32_4x4x4f16: +; LIT-SRCC: ; %bb.0: ; %bb +; LIT-SRCC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; LIT-SRCC-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s4 +; LIT-SRCC-NEXT: v_mov_b32_e32 v5, s8 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s5 +; LIT-SRCC-NEXT: v_mov_b32_e32 v6, s9 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v5 +; LIT-SRCC-NEXT: v_mov_b32_e32 v7, s10 +; LIT-SRCC-NEXT: v_mov_b32_e32 v5, s11 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v6 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v7 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v5 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s6 +; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s7 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 +; LIT-SRCC-NEXT: s_nop 3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; LIT-SRCC-NEXT: s_endpgm +; +; GFX90A-LABEL: test_mfma_f32_4x4x4f16: +; GFX90A: ; %bb.0: ; %bb +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, s8 +; GFX90A-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NEXT: v_mov_b32_e32 v3, s7 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, s9 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, s10 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, s11 +; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: s_nop 4 +; GFX90A-NEXT: global_store_dwordx4 v4, a[0:3], s[0:1] +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: test_mfma_f32_4x4x4f16: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX942-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, s4 +; GFX942-NEXT: v_mov_b32_e32 v1, s5 +; GFX942-NEXT: v_accvgpr_write_b32 a0, s8 +; GFX942-NEXT: v_mov_b32_e32 v2, s6 +; GFX942-NEXT: v_mov_b32_e32 v3, s7 +; GFX942-NEXT: v_accvgpr_write_b32 a1, s9 +; GFX942-NEXT: v_accvgpr_write_b32 a2, s10 +; GFX942-NEXT: v_accvgpr_write_b32 a3, s11 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mfma_f32_4x4x4_16b_f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: s_nop 4 +; GFX942-NEXT: global_store_dwordx4 v4, a[0:3], s[0:1] +; GFX942-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %c.1 = load <4 x half>, ptr addrspace(1) %c @@ -221,19 +1802,225 @@ bb: ret void } -; GCN-LABEL: {{^}}test_mfma_f32_32x32x8f16: -; GCN: s_load_dwordx16 -; GCN: s_waitcnt lgkmcnt(0) -; GFX908_A: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} -; GFX908-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX90A_42-COUNT-16:v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}} -; GFX908_A: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GFX942: v_mfma_f32_32x32x8_f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GFX908-COUNT-16: v_accvgpr_read_b32 -; GFX908: global_store_dwordx4 -; GFX90A-NOT: v_accvgpr_read_b32 -; GFX90A-COUNT-4: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}], define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr addrspace(1) %c) #0 { +; NOLIT-SRCC-LABEL: test_mfma_f32_32x32x8f16: +; NOLIT-SRCC: ; %bb.0: ; %bb +; NOLIT-SRCC-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: s_load_dwordx4 s[20:23], s[18:19], 0x0 +; NOLIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s20 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s1 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, s2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v17 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s3 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v3 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v4 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v17 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s4 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, s5 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s6 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v3 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v4 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v17 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s7 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, s8 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s9 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v3 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v4 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v17 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s10 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, s11 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s12 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s21 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v3 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v4 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v17 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s13 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, s14 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s15 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v3 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v4 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v17 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s22 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s23 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 +; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; NOLIT-SRCC-NEXT: s_endpgm +; +; LIT-SRCC-LABEL: test_mfma_f32_32x32x8f16: +; LIT-SRCC: ; %bb.0: ; %bb +; LIT-SRCC-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 +; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: s_load_dwordx4 s[20:23], s[18:19], 0x0 +; LIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s20 +; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s1 +; LIT-SRCC-NEXT: v_mov_b32_e32 v4, s2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v17 +; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s3 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v3 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v4 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v17 +; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s4 +; LIT-SRCC-NEXT: v_mov_b32_e32 v4, s5 +; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s6 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v3 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v4 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v17 +; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s7 +; LIT-SRCC-NEXT: v_mov_b32_e32 v4, s8 +; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s9 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v3 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v4 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v17 +; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s10 +; LIT-SRCC-NEXT: v_mov_b32_e32 v4, s11 +; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s12 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s21 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v3 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v4 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v17 +; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s13 +; LIT-SRCC-NEXT: v_mov_b32_e32 v4, s14 +; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s15 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v3 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v4 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v17 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s22 +; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s23 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 +; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; LIT-SRCC-NEXT: s_endpgm +; +; GFX90A-LABEL: test_mfma_f32_32x32x8f16: +; GFX90A: ; %bb.0: ; %bb +; GFX90A-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_load_dwordx4 s[20:23], s[18:19], 0x0 +; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s20 +; GFX90A-NEXT: v_mov_b32_e32 v1, s21 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX90A-NEXT: v_mov_b32_e32 v2, s22 +; GFX90A-NEXT: v_mov_b32_e32 v3, s23 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, s8 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, s9 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, s10 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, s11 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, s12 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, s13 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, s14 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: test_mfma_f32_32x32x8f16: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_load_dwordx4 s[20:23], s[18:19], 0x0 +; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, s20 +; GFX942-NEXT: v_mov_b32_e32 v1, s21 +; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-NEXT: v_mov_b32_e32 v2, s22 +; GFX942-NEXT: v_mov_b32_e32 v3, s23 +; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-NEXT: v_accvgpr_write_b32 a4, s4 +; GFX942-NEXT: v_accvgpr_write_b32 a5, s5 +; GFX942-NEXT: v_accvgpr_write_b32 a6, s6 +; GFX942-NEXT: v_accvgpr_write_b32 a7, s7 +; GFX942-NEXT: v_accvgpr_write_b32 a8, s8 +; GFX942-NEXT: v_accvgpr_write_b32 a9, s9 +; GFX942-NEXT: v_accvgpr_write_b32 a10, s10 +; GFX942-NEXT: v_accvgpr_write_b32 a11, s11 +; GFX942-NEXT: v_accvgpr_write_b32 a12, s12 +; GFX942-NEXT: v_accvgpr_write_b32 a13, s13 +; GFX942-NEXT: v_accvgpr_write_b32 a14, s14 +; GFX942-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mfma_f32_32x32x8_f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 +; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 +; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GFX942-NEXT: s_endpgm bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %c.1 = load <4 x half>, ptr addrspace(1) %c @@ -244,18 +2031,115 @@ bb: ret void } -; GCN-LABEL: {{^}}test_mfma_f32_16x16x16f16: -; GCN: s_load_dwordx4 -; GCN: s_load_dwordx4 -; GFX908-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX90A_42-COUNT-4:v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}} -; GFX908_A: v_mfma_f32_16x16x16f16 [[RES:a\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GFX942: v_mfma_f32_16x16x16_f16 [[RES:a\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GFX908-COUNT-4: v_accvgpr_read_b32 -; GFX908: global_store_dwordx4 -; GFX90A-NOT: v_accvgpr_read_b32 -; GFX90A: global_store_dwordx4 {{v[0-9]+}}, [[RES]], define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr addrspace(1) %c) #0 { +; NOLIT-SRCC-LABEL: test_mfma_f32_16x16x16f16: +; NOLIT-SRCC: ; %bb.0: ; %bb +; NOLIT-SRCC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; NOLIT-SRCC-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s4 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v5, s8 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s5 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v6, s9 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v5 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v7, s10 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v5, s11 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v6 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v7 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v5 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s6 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s7 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 +; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; NOLIT-SRCC-NEXT: s_endpgm +; +; LIT-SRCC-LABEL: test_mfma_f32_16x16x16f16: +; LIT-SRCC: ; %bb.0: ; %bb +; LIT-SRCC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; LIT-SRCC-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s4 +; LIT-SRCC-NEXT: v_mov_b32_e32 v5, s8 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s5 +; LIT-SRCC-NEXT: v_mov_b32_e32 v6, s9 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v5 +; LIT-SRCC-NEXT: v_mov_b32_e32 v7, s10 +; LIT-SRCC-NEXT: v_mov_b32_e32 v5, s11 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v6 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v7 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v5 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s6 +; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s7 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 +; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; LIT-SRCC-NEXT: s_endpgm +; +; GFX90A-LABEL: test_mfma_f32_16x16x16f16: +; GFX90A: ; %bb.0: ; %bb +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, s8 +; GFX90A-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NEXT: v_mov_b32_e32 v3, s7 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, s9 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, s10 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, s11 +; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: global_store_dwordx4 v4, a[0:3], s[0:1] +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: test_mfma_f32_16x16x16f16: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX942-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, s4 +; GFX942-NEXT: v_mov_b32_e32 v1, s5 +; GFX942-NEXT: v_accvgpr_write_b32 a0, s8 +; GFX942-NEXT: v_mov_b32_e32 v2, s6 +; GFX942-NEXT: v_mov_b32_e32 v3, s7 +; GFX942-NEXT: v_accvgpr_write_b32 a1, s9 +; GFX942-NEXT: v_accvgpr_write_b32 a2, s10 +; GFX942-NEXT: v_accvgpr_write_b32 a3, s11 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: s_nop 6 +; GFX942-NEXT: global_store_dwordx4 v4, a[0:3], s[0:1] +; GFX942-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %c.1 = load <4 x half>, ptr addrspace(1) %c @@ -266,51 +2150,364 @@ bb: ret void } -; GCN-LABEL: {{^}}test_mfma_i32_32x32x4i8: -; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 -; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 -; GCN-DAG: s_load_dwordx16 -; GCN-DAG: s_load_dwordx16 -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX90A_42-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}} -; GFX908_A: v_mfma_i32_32x32x4i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GFX942: v_mfma_i32_32x32x4_2b_i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GFX908-COUNT-32: v_accvgpr_read_b32 -; GFX908: global_store_dwordx4 -; GFX90A-NOT: v_accvgpr_read_b32 -; GFX90A-COUNT-8: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}], define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { +; NOLIT-SRCC-LABEL: test_mfma_i32_32x32x4i8: +; NOLIT-SRCC: ; %bb.0: ; %bb +; NOLIT-SRCC-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 +; NOLIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s16 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s17 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s18 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s21 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s22 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s23 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s24 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s25 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s26 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s27 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s28 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s29 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s30 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s31 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a16, v2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s1 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s3 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a17, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a18, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a19, v2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s4 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s5 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s6 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a20, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a21, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a22, v2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s7 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s8 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s9 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s19 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a23, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a24, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a25, v2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s10 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s11 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s12 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, s20 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v3 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a26, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s13 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s14 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s15 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, 1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v4 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 2 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: v_mfma_i32_32x32x4i8 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3 +; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a27 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a26 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a25 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a24 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:96 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a31 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a30 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a29 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a28 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a3 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:112 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a19 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a18 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a17 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a16 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a0 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:64 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a15 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a23 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a22 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a21 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a20 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a13 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:80 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a12 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a11 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a8 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:32 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[34:35] offset:48 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[34:35] +; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[34:35] offset:16 +; NOLIT-SRCC-NEXT: s_endpgm +; +; LIT-SRCC-LABEL: test_mfma_i32_32x32x4i8: +; LIT-SRCC: ; %bb.0: ; %bb +; LIT-SRCC-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 +; LIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s16 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s17 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s18 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s21 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s22 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s23 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s24 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s25 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s26 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s27 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s28 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s29 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s30 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s31 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a16, v2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s1 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s3 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a17, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a18, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a19, v2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s4 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s5 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s6 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a20, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a21, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a22, v2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s7 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s8 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s9 +; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s19 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a23, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a24, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a25, v2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s10 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s11 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s12 +; LIT-SRCC-NEXT: v_mov_b32_e32 v4, s20 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v3 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a26, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s13 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s14 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s15 +; LIT-SRCC-NEXT: v_mov_b32_e32 v3, 1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v4 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 2 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: v_mfma_i32_32x32x4i8 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3 +; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a27 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a26 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a25 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a24 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:96 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a31 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a30 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a29 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a28 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a3 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:112 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a19 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a18 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a17 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a16 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a0 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:64 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a15 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a23 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a22 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a21 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a20 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a13 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:80 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a12 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a11 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a8 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:32 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[34:35] offset:48 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[34:35] +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[34:35] offset:16 +; LIT-SRCC-NEXT: s_endpgm +; +; GFX90A-LABEL: test_mfma_i32_32x32x4i8: +; GFX90A: ; %bb.0: ; %bb +; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v1, 1 +; GFX90A-NEXT: v_mov_b32_e32 v2, 2 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 +; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, s16 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, s17 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, s18 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, s19 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, s20 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, s21 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, s22 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, s23 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, s24 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, s25 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, s26 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, s27 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, s28 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, s29 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, s30 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, s31 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, s2 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, s3 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, s4 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, s5 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, s6 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, s7 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, s8 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, s9 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, s10 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, s11 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, s12 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, s13 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, s14 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15 +; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: v_mfma_i32_32x32x4i8 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 +; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 +; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64 +; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80 +; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35] +; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16 +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: test_mfma_i32_32x32x4i8: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-NEXT: v_mov_b32_e32 v2, 2 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 +; GFX942-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_accvgpr_write_b32 a0, s16 +; GFX942-NEXT: v_accvgpr_write_b32 a1, s17 +; GFX942-NEXT: v_accvgpr_write_b32 a2, s18 +; GFX942-NEXT: v_accvgpr_write_b32 a3, s19 +; GFX942-NEXT: v_accvgpr_write_b32 a4, s20 +; GFX942-NEXT: v_accvgpr_write_b32 a5, s21 +; GFX942-NEXT: v_accvgpr_write_b32 a6, s22 +; GFX942-NEXT: v_accvgpr_write_b32 a7, s23 +; GFX942-NEXT: v_accvgpr_write_b32 a8, s24 +; GFX942-NEXT: v_accvgpr_write_b32 a9, s25 +; GFX942-NEXT: v_accvgpr_write_b32 a10, s26 +; GFX942-NEXT: v_accvgpr_write_b32 a11, s27 +; GFX942-NEXT: v_accvgpr_write_b32 a12, s28 +; GFX942-NEXT: v_accvgpr_write_b32 a13, s29 +; GFX942-NEXT: v_accvgpr_write_b32 a14, s30 +; GFX942-NEXT: v_accvgpr_write_b32 a15, s31 +; GFX942-NEXT: v_accvgpr_write_b32 a16, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a17, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a18, s2 +; GFX942-NEXT: v_accvgpr_write_b32 a19, s3 +; GFX942-NEXT: v_accvgpr_write_b32 a20, s4 +; GFX942-NEXT: v_accvgpr_write_b32 a21, s5 +; GFX942-NEXT: v_accvgpr_write_b32 a22, s6 +; GFX942-NEXT: v_accvgpr_write_b32 a23, s7 +; GFX942-NEXT: v_accvgpr_write_b32 a24, s8 +; GFX942-NEXT: v_accvgpr_write_b32 a25, s9 +; GFX942-NEXT: v_accvgpr_write_b32 a26, s10 +; GFX942-NEXT: v_accvgpr_write_b32 a27, s11 +; GFX942-NEXT: v_accvgpr_write_b32 a28, s12 +; GFX942-NEXT: v_accvgpr_write_b32 a29, s13 +; GFX942-NEXT: v_accvgpr_write_b32 a30, s14 +; GFX942-NEXT: v_accvgpr_write_b32 a31, s15 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mfma_i32_32x32x4_2b_i8 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 2 +; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 +; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 +; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64 +; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80 +; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32 +; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48 +; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35] +; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16 +; GFX942-NEXT: s_endpgm bb: %in.1 = load <32 x i32>, ptr addrspace(1) %arg %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32 1, i32 2, <32 x i32> %in.1, i32 1, i32 2, i32 3) @@ -318,19 +2515,212 @@ bb: ret void } -; GCN-LABEL: {{^}}test_mfma_i32_16x16x4i8: -; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 -; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 -; GCN-DAG: s_load_dwordx16 -; GFX908-DAG-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX90A_42-COUNT-16:v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}} -; GFX908_A: v_mfma_i32_16x16x4i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GFX942: v_mfma_i32_16x16x4_4b_i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GFX908-COUNT-16: v_accvgpr_read_b32 -; GFX908: global_store_dwordx4 -; GFX90A-NOT: v_accvgpr_read_b32 -; GFX90A-COUNT-4: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}], define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 { +; NOLIT-SRCC-LABEL: test_mfma_i32_16x16x4i8: +; NOLIT-SRCC: ; %bb.0: ; %bb +; NOLIT-SRCC-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s1 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v17 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s3 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v17 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s4 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s5 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s6 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v17 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s7 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s8 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s9 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v17 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s10 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s11 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s12 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v17 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s13 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s14 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s15 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v17 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 +; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a12 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a7 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a4 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a11 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a8 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; NOLIT-SRCC-NEXT: s_endpgm +; +; LIT-SRCC-LABEL: test_mfma_i32_16x16x4i8: +; LIT-SRCC: ; %bb.0: ; %bb +; LIT-SRCC-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1 +; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s1 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v17 +; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s3 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v17 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s4 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s5 +; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s6 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v17 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s7 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s8 +; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s9 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v17 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s10 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s11 +; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s12 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v17 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s13 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s14 +; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s15 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v17 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 +; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a12 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a7 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a4 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a11 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a8 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; LIT-SRCC-NEXT: s_endpgm +; +; GFX90A-LABEL: test_mfma_i32_16x16x4i8: +; GFX90A: ; %bb.0: ; %bb +; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v0, 1 +; GFX90A-NEXT: v_mov_b32_e32 v1, 2 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, s8 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, s9 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, s10 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, s11 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, s12 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, s13 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, s14 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: test_mfma_i32_16x16x4i8: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v0, 1 +; GFX942-NEXT: v_mov_b32_e32 v1, 2 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-NEXT: v_accvgpr_write_b32 a4, s4 +; GFX942-NEXT: v_accvgpr_write_b32 a5, s5 +; GFX942-NEXT: v_accvgpr_write_b32 a6, s6 +; GFX942-NEXT: v_accvgpr_write_b32 a7, s7 +; GFX942-NEXT: v_accvgpr_write_b32 a8, s8 +; GFX942-NEXT: v_accvgpr_write_b32 a9, s9 +; GFX942-NEXT: v_accvgpr_write_b32 a10, s10 +; GFX942-NEXT: v_accvgpr_write_b32 a11, s11 +; GFX942-NEXT: v_accvgpr_write_b32 a12, s12 +; GFX942-NEXT: v_accvgpr_write_b32 a13, s13 +; GFX942-NEXT: v_accvgpr_write_b32 a14, s14 +; GFX942-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mfma_i32_16x16x4_4b_i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 +; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 +; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GFX942-NEXT: s_endpgm bb: %in.1 = load <16 x i32>, ptr addrspace(1) %arg %mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 1, i32 2, <16 x i32> %in.1, i32 1, i32 2, i32 3) @@ -338,19 +2728,100 @@ bb: ret void } -; GCN-LABEL: {{^}}test_mfma_i32_4x4x4i8: -; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 -; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 -; GCN: s_load_dwordx4 -; GFX908-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX90A_42-COUNT-4:v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}} -; GFX908_A: v_mfma_i32_4x4x4i8 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GFX942: v_mfma_i32_4x4x4_16b_i8 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GFX908-COUNT-4: v_accvgpr_read_b32 -; GFX908: global_store_dwordx4 -; GFX90A-NOT: v_accvgpr_read_b32 -; GFX90A: global_store_dwordx4 {{v[0-9]+}}, [[RES]], define amdgpu_kernel void @test_mfma_i32_4x4x4i8(ptr addrspace(1) %arg) #0 { +; NOLIT-SRCC-LABEL: test_mfma_i32_4x4x4i8: +; NOLIT-SRCC: ; %bb.0: ; %bb +; NOLIT-SRCC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v5, s0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s1 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v5 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v5, s3 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v3 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v5 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3 +; NOLIT-SRCC-NEXT: s_nop 3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; NOLIT-SRCC-NEXT: s_endpgm +; +; LIT-SRCC-LABEL: test_mfma_i32_4x4x4i8: +; LIT-SRCC: ; %bb.0: ; %bb +; LIT-SRCC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: v_mov_b32_e32 v5, s0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s1 +; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v5 +; LIT-SRCC-NEXT: v_mov_b32_e32 v5, s3 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v3 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v5 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3 +; LIT-SRCC-NEXT: s_nop 3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; LIT-SRCC-NEXT: s_endpgm +; +; GFX90A-LABEL: test_mfma_i32_4x4x4i8: +; GFX90A: ; %bb.0: ; %bb +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v0, 1 +; GFX90A-NEXT: v_mov_b32_e32 v2, 2 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: s_nop 4 +; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: test_mfma_i32_4x4x4i8: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v0, 1 +; GFX942-NEXT: v_mov_b32_e32 v2, 2 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mfma_i32_4x4x4_16b_i8 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: s_nop 4 +; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] +; GFX942-NEXT: s_endpgm bb: %in.1 = load <4 x i32>, ptr addrspace(1) %arg %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %in.1, i32 1, i32 2, i32 3) @@ -358,12 +2829,396 @@ bb: ret void } -; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_forward_acc: -; GFX908_A: v_mfma_f32_32x32x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}] -; GFX908_A-NEXT: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]] -; GFX942: v_mfma_f32_32x32x1_2b_f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}] -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]] define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1) %arg) #0 { +; NOLIT-SRCC-LABEL: test_mfma_f32_32x32x1f32_forward_acc: +; NOLIT-SRCC: ; %bb.0: ; %bb +; NOLIT-SRCC-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 +; NOLIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s16 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s17 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s18 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s22 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s23 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v1 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s24 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s25 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s26 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s27 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s28 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s29 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s30 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s31 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a16, v2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s1 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s3 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a17, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a18, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a19, v2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s4 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s5 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s6 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a20, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a21, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a22, v2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s7 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s8 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s9 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s19 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a23, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a24, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a25, v2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s10 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s11 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s12 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, s20 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v5, s21 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v3 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a26, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s13 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s14 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s15 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v4 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v5 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 2.0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] +; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] +; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 +; NOLIT-SRCC-NEXT: s_endpgm +; +; LIT-SRCC-LABEL: test_mfma_f32_32x32x1f32_forward_acc: +; LIT-SRCC: ; %bb.0: ; %bb +; LIT-SRCC-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 +; LIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s16 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s17 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s18 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s22 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s23 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v1 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s24 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s25 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s26 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s27 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s28 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s29 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s30 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s31 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a16, v2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s1 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s3 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a17, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a18, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a19, v2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s4 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s5 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s6 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a20, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a21, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a22, v2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s7 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s8 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s9 +; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s19 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a23, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a24, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a25, v2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s10 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s11 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s12 +; LIT-SRCC-NEXT: v_mov_b32_e32 v4, s20 +; LIT-SRCC-NEXT: v_mov_b32_e32 v5, s21 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v3 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a26, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s13 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s14 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s15 +; LIT-SRCC-NEXT: v_mov_b32_e32 v3, 1.0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v4 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v5 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 2.0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] +; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] +; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 +; LIT-SRCC-NEXT: s_endpgm +; +; GFX90A-LABEL: test_mfma_f32_32x32x1f32_forward_acc: +; GFX90A: ; %bb.0: ; %bb +; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 +; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, s16 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, s17 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, s18 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, s19 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, s20 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, s21 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, s22 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, s23 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, s24 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, s25 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, s26 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, s27 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, s28 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, s29 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, s30 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, s31 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, s2 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, s3 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, s4 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, s5 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, s6 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, s7 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, s8 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, s9 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, s10 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, s11 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, s12 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, s13 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, s14 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15 +; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 +; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 +; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64 +; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80 +; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35] +; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16 +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: test_mfma_f32_32x32x1f32_forward_acc: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 +; GFX942-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_accvgpr_write_b32 a0, s16 +; GFX942-NEXT: v_accvgpr_write_b32 a1, s17 +; GFX942-NEXT: v_accvgpr_write_b32 a2, s18 +; GFX942-NEXT: v_accvgpr_write_b32 a3, s19 +; GFX942-NEXT: v_accvgpr_write_b32 a4, s20 +; GFX942-NEXT: v_accvgpr_write_b32 a5, s21 +; GFX942-NEXT: v_accvgpr_write_b32 a6, s22 +; GFX942-NEXT: v_accvgpr_write_b32 a7, s23 +; GFX942-NEXT: v_accvgpr_write_b32 a8, s24 +; GFX942-NEXT: v_accvgpr_write_b32 a9, s25 +; GFX942-NEXT: v_accvgpr_write_b32 a10, s26 +; GFX942-NEXT: v_accvgpr_write_b32 a11, s27 +; GFX942-NEXT: v_accvgpr_write_b32 a12, s28 +; GFX942-NEXT: v_accvgpr_write_b32 a13, s29 +; GFX942-NEXT: v_accvgpr_write_b32 a14, s30 +; GFX942-NEXT: v_accvgpr_write_b32 a15, s31 +; GFX942-NEXT: v_accvgpr_write_b32 a16, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a17, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a18, s2 +; GFX942-NEXT: v_accvgpr_write_b32 a19, s3 +; GFX942-NEXT: v_accvgpr_write_b32 a20, s4 +; GFX942-NEXT: v_accvgpr_write_b32 a21, s5 +; GFX942-NEXT: v_accvgpr_write_b32 a22, s6 +; GFX942-NEXT: v_accvgpr_write_b32 a23, s7 +; GFX942-NEXT: v_accvgpr_write_b32 a24, s8 +; GFX942-NEXT: v_accvgpr_write_b32 a25, s9 +; GFX942-NEXT: v_accvgpr_write_b32 a26, s10 +; GFX942-NEXT: v_accvgpr_write_b32 a27, s11 +; GFX942-NEXT: v_accvgpr_write_b32 a28, s12 +; GFX942-NEXT: v_accvgpr_write_b32 a29, s13 +; GFX942-NEXT: v_accvgpr_write_b32 a30, s14 +; GFX942-NEXT: v_accvgpr_write_b32 a31, s15 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 +; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 +; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64 +; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80 +; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32 +; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48 +; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35] +; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16 +; GFX942-NEXT: s_endpgm bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0) @@ -372,12 +3227,214 @@ bb: ret void } -; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32_forward_acc: -; GFX908_A: v_mfma_f32_16x16x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}] -; GFX908_A-NEXT: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]] -; GFX942: v_mfma_f32_16x16x1_4b_f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}] -; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]] define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1) %arg) #0 { +; NOLIT-SRCC-LABEL: test_mfma_f32_16x16x1f32_forward_acc: +; NOLIT-SRCC: ; %bb.0: ; %bb +; NOLIT-SRCC-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s1 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v17 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s3 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v17 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s4 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s5 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s6 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v17 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s7 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s8 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s9 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v17 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s10 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s11 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s12 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v17 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s13 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s14 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s15 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v17 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] +; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] +; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; NOLIT-SRCC-NEXT: s_endpgm +; +; LIT-SRCC-LABEL: test_mfma_f32_16x16x1f32_forward_acc: +; LIT-SRCC: ; %bb.0: ; %bb +; LIT-SRCC-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s1 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v17 +; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s3 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v17 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s4 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s5 +; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s6 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v17 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s7 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s8 +; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s9 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v17 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s10 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s11 +; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s12 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v17 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s13 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s14 +; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s15 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v17 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] +; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] +; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; LIT-SRCC-NEXT: s_endpgm +; +; GFX90A-LABEL: test_mfma_f32_16x16x1f32_forward_acc: +; GFX90A: ; %bb.0: ; %bb +; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, s8 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, s9 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, s10 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, s11 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, s12 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, s13 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, s14 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] +; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: test_mfma_f32_16x16x1f32_forward_acc: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-NEXT: v_accvgpr_write_b32 a4, s4 +; GFX942-NEXT: v_accvgpr_write_b32 a5, s5 +; GFX942-NEXT: v_accvgpr_write_b32 a6, s6 +; GFX942-NEXT: v_accvgpr_write_b32 a7, s7 +; GFX942-NEXT: v_accvgpr_write_b32 a8, s8 +; GFX942-NEXT: v_accvgpr_write_b32 a9, s9 +; GFX942-NEXT: v_accvgpr_write_b32 a10, s10 +; GFX942-NEXT: v_accvgpr_write_b32 a11, s11 +; GFX942-NEXT: v_accvgpr_write_b32 a12, s12 +; GFX942-NEXT: v_accvgpr_write_b32 a13, s13 +; GFX942-NEXT: v_accvgpr_write_b32 a14, s14 +; GFX942-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15] +; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15] +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 +; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 +; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GFX942-NEXT: s_endpgm bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -386,13 +3443,105 @@ bb: ret void } -; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_forward_acc: -; GFX908_A: v_mfma_f32_4x4x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}] -; GFX908_A-NEXT: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]] -; GFX942: v_mfma_f32_4x4x1_16b_f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}] -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]] define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(ptr addrspace(1) %arg) #0 { +; NOLIT-SRCC-LABEL: test_mfma_f32_4x4x1f32_forward_acc: +; NOLIT-SRCC: ; %bb.0: ; %bb +; NOLIT-SRCC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v5, s0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s1 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v5 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v5, s3 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v3 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v5 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] +; NOLIT-SRCC-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] +; NOLIT-SRCC-NEXT: s_nop 3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; NOLIT-SRCC-NEXT: s_endpgm +; +; LIT-SRCC-LABEL: test_mfma_f32_4x4x1f32_forward_acc: +; LIT-SRCC: ; %bb.0: ; %bb +; LIT-SRCC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: v_mov_b32_e32 v5, s0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s1 +; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v5 +; LIT-SRCC-NEXT: v_mov_b32_e32 v5, s3 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v3 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v5 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] +; LIT-SRCC-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] +; LIT-SRCC-NEXT: s_nop 3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; LIT-SRCC-NEXT: s_endpgm +; +; GFX90A-LABEL: test_mfma_f32_4x4x1f32_forward_acc: +; GFX90A: ; %bb.0: ; %bb +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] +; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] +; GFX90A-NEXT: s_nop 4 +; GFX90A-NEXT: global_store_dwordx4 v2, a[0:3], s[6:7] +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: test_mfma_f32_4x4x1f32_forward_acc: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v1, a[0:3] +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v1, a[0:3] +; GFX942-NEXT: s_nop 3 +; GFX942-NEXT: global_store_dwordx4 v2, a[0:3], s[6:7] +; GFX942-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %in.1, i32 0, i32 0, i32 0) @@ -401,215 +3550,1211 @@ bb: ret void } -; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_imm_splat: -; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 -; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 -; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 -; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 -; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 -; NOLIT-SRCC: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9:]+}}] -; LIT-SRCC: v_mfma_f32_4x4x1f32 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], 1.0 -; GFX90A: v_mfma_f32_4x4x1f32 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], 1.0 -; GFX942: v_mfma_f32_4x4x1_16b_f32 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], 1.0 -; GFX908-COUNT-4: v_accvgpr_read_b32 -; GFX908: global_store_dwordx4 -; GFX90A-NOT: v_accvgpr_read_b32 -; GFX90A: global_store_dwordx4 {{v[0-9]+}}, [[RES]], define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm_splat(ptr addrspace(1) %arg) #0 { +; NOLIT-SRCC-LABEL: test_mfma_f32_4x4x1f32_imm_splat: +; NOLIT-SRCC: ; %bb.0: ; %bb +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 1.0 +; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] +; NOLIT-SRCC-NEXT: s_nop 3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; NOLIT-SRCC-NEXT: s_endpgm +; +; LIT-SRCC-LABEL: test_mfma_f32_4x4x1f32_imm_splat: +; LIT-SRCC: ; %bb.0: ; %bb +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, 1.0 +; LIT-SRCC-NEXT: s_nop 3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; LIT-SRCC-NEXT: s_endpgm +; +; GFX90A-LABEL: test_mfma_f32_4x4x1f32_imm_splat: +; GFX90A: ; %bb.0: ; %bb +; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v2, 1.0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_nop 3 +; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[0:1] +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: test_mfma_f32_4x4x1f32_imm_splat: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v2, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v2, 1.0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_nop 2 +; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[0:1] +; GFX942-NEXT: s_endpgm bb: %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> , i32 0, i32 0, i32 0) store <4 x float> %mai.1, ptr addrspace(1) %arg ret void } -; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32_imm_splat: -; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 -; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 -; NOLIT-SRCC: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9:]+}}] -; LIT-SRCC: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 1.0 -; GFX90A: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 1.0 -; GFX942: v_mfma_f32_16x16x1_4b_f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 1.0 -; GFX908-COUNT-16: v_accvgpr_read_b32 -; GFX908: global_store_dwordx4 -; GFX90A-NOT: v_accvgpr_read_b32 -; GFX90A-COUNT-4: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}], define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) %arg) #0 { +; NOLIT-SRCC-LABEL: test_mfma_f32_16x16x1f32_imm_splat: +; NOLIT-SRCC: ; %bb.0: ; %bb +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, 1.0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 +; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] +; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; NOLIT-SRCC-NEXT: s_endpgm +; +; LIT-SRCC-LABEL: test_mfma_f32_16x16x1f32_imm_splat: +; LIT-SRCC: ; %bb.0: ; %bb +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 +; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; LIT-SRCC-NEXT: v_mov_b32_e32 v8, 0 +; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, 1.0 +; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 +; LIT-SRCC-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:32 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; LIT-SRCC-NEXT: s_endpgm +; +; GFX90A-LABEL: test_mfma_f32_16x16x1f32_imm_splat: +; GFX90A: ; %bb.0: ; %bb +; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: test_mfma_f32_16x16x1f32_imm_splat: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: s_endpgm bb: %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> , i32 0, i32 0, i32 0) store <16 x float> %mai.1, ptr addrspace(1) %arg ret void } -; GCN-LABEL: {{^}}test_mfma_f32_32x32x8f16_imm_splat: -; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 0x40004000 -; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 0x3c003c00 -; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 -; NOLIT-SRCC: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9:]+}}] -; LIT-SRCC: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], 1.0 -; GFX90A: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], 1.0 -; GFX942: v_mfma_f32_32x32x8_f16 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], 1.0 -; GFX908-COUNT-16: v_accvgpr_read_b32 -; GFX908: global_store_dwordx4 -; GFX90A-NOT: v_accvgpr_read_b32 -; GFX90A-COUNT-4: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}], define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) %arg) #0 { +; NOLIT-SRCC-LABEL: test_mfma_f32_32x32x8f16_imm_splat: +; NOLIT-SRCC: ; %bb.0: ; %bb +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 0x3c003c00 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, 1.0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, 0x40004000 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, v2 +; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15] +; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; NOLIT-SRCC-NEXT: s_endpgm +; +; LIT-SRCC-LABEL: test_mfma_f32_32x32x8f16_imm_splat: +; LIT-SRCC: ; %bb.0: ; %bb +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 0x3c003c00 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, v0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, 0x40004000 +; LIT-SRCC-NEXT: v_mov_b32_e32 v3, v2 +; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; LIT-SRCC-NEXT: v_mov_b32_e32 v13, 0 +; LIT-SRCC-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], 1.0 +; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a3 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: global_store_dwordx4 v13, v[0:3], s[0:1] offset:48 +; LIT-SRCC-NEXT: global_store_dwordx4 v13, v[4:7], s[0:1] offset:32 +; LIT-SRCC-NEXT: global_store_dwordx4 v13, v[8:11], s[0:1] offset:16 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a0 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v13, v[9:12], s[0:1] +; LIT-SRCC-NEXT: s_endpgm +; +; GFX90A-LABEL: test_mfma_f32_32x32x8f16_imm_splat: +; GFX90A: ; %bb.0: ; %bb +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x3c003c00 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x40004000 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: test_mfma_f32_32x32x8f16_imm_splat: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: v_mov_b32_e32 v0, 0x3c003c00 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0x40004000 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mfma_f32_32x32x8_f16 a[0:15], v[0:1], v[2:3], 1.0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: s_endpgm bb: %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> , <4 x half> , <16 x float> , i32 0, i32 0, i32 0) store <16 x float> %mai.1, ptr addrspace(1) %arg ret void } -; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_imm_splat: -; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 -; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; NOLIT-SRCC: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9:]+}}] -; LIT-SRCC: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 0 -; GFX90A: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 0 -; GFX942: v_mfma_f32_32x32x1_2b_f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 0 -; GFX908-COUNT-32: v_accvgpr_read_b32 -; GFX908: global_store_dwordx4 -; GFX90A-NOT: v_accvgpr_read_b32 -; GFX90A-COUNT-8: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}], define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) %arg) #0 { +; NOLIT-SRCC-LABEL: test_mfma_f32_32x32x1f32_imm_splat: +; NOLIT-SRCC: ; %bb.0: ; %bb +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a16, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a17, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a18, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a19, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a20, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a21, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a22, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a23, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a24, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a25, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a26, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a27, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a28, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a29, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a30, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a31, 0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 +; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; NOLIT-SRCC-NEXT: s_endpgm +; +; LIT-SRCC-LABEL: test_mfma_f32_32x32x1f32_imm_splat: +; LIT-SRCC: ; %bb.0: ; %bb +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 +; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; LIT-SRCC-NEXT: v_mov_b32_e32 v14, 0 +; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0 +; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a27 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a26 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a25 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a24 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a23 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a22 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a21 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a20 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:112 +; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:96 +; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:80 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a19 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a18 +; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:48 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a17 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a16 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:32 +; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] offset:64 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:16 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] +; LIT-SRCC-NEXT: s_endpgm +; +; GFX90A-LABEL: test_mfma_f32_32x32x1f32_imm_splat: +; GFX90A: ; %bb.0: ; %bb +; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 +; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 +; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 +; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 +; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: test_mfma_f32_32x32x1f32_imm_splat: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 +; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 +; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 +; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 +; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: s_endpgm bb: %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> , i32 0, i32 0, i32 0) store <32 x float> %mai.1, ptr addrspace(1) %arg ret void } -; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_imm: -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 2.0 -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 -; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} -; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} -; GFX908_A: v_mfma_f32_4x4x1f32 [[RES:a\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}] -; GFX942: v_mfma_f32_4x4x1_16b_f32 [[RES:a\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}] -; GFX908-COUNT-4: v_accvgpr_read_b32 -; GFX908: global_store_dwordx4 -; GFX90A-NOT: v_accvgpr_read_b32 -; GFX90A: global_store_dwordx4 {{v[0-9]+}}, [[RES]], define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm(ptr addrspace(1) %arg) #0 { +; NOLIT-SRCC-LABEL: test_mfma_f32_4x4x1f32_imm: +; NOLIT-SRCC: ; %bb.0: ; %bb +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 2.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 1.0 +; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] +; NOLIT-SRCC-NEXT: s_nop 3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; NOLIT-SRCC-NEXT: s_endpgm +; +; LIT-SRCC-LABEL: test_mfma_f32_4x4x1f32_imm: +; LIT-SRCC: ; %bb.0: ; %bb +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 1.0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 2.0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 1.0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 1.0 +; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] +; LIT-SRCC-NEXT: s_nop 3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; LIT-SRCC-NEXT: s_endpgm +; +; GFX90A-LABEL: test_mfma_f32_4x4x1f32_imm: +; GFX90A: ; %bb.0: ; %bb +; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, 2.0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v1, v2, a[0:3] +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_nop 3 +; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: test_mfma_f32_4x4x1f32_imm: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: v_accvgpr_write_b32 a0, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v2, 2.0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, 2.0 +; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v1, v2, a[0:3] +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_nop 2 +; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: s_endpgm bb: %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> , i32 0, i32 0, i32 0) store <4 x float> %mai.1, ptr addrspace(1) %arg ret void } -; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32_imm: -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 2.0 -; GFX908-COUNT-14: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 -; GFX90A-COUNT-14: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} -; GFX908_A: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}] -; GFX942: v_mfma_f32_16x16x1_4b_f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}] -; GFX908-COUNT-16: v_accvgpr_read_b32 -; GFX908: global_store_dwordx4 -; GFX90A-NOT: v_accvgpr_read_b32 -; GFX90A-COUNT-4: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}], define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) #0 { +; NOLIT-SRCC-LABEL: test_mfma_f32_16x16x1f32_imm: +; NOLIT-SRCC: ; %bb.0: ; %bb +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, 2.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, 1.0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 +; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] +; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; NOLIT-SRCC-NEXT: s_endpgm +; +; LIT-SRCC-LABEL: test_mfma_f32_16x16x1f32_imm: +; LIT-SRCC: ; %bb.0: ; %bb +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 1.0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a15, 2.0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 1.0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 1.0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 1.0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, 1.0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, 1.0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, 1.0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, 1.0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a8, 1.0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a9, 1.0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a10, 1.0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a11, 1.0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a12, 1.0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a13, 1.0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a14, 1.0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 +; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] +; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; LIT-SRCC-NEXT: s_endpgm +; +; GFX90A-LABEL: test_mfma_f32_16x16x1f32_imm: +; GFX90A: ; %bb.0: ; %bb +; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, 2.0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a8, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a9, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a10, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a11, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a12, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a13, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a14, a0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15] +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: test_mfma_f32_16x16x1f32_imm: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: v_accvgpr_write_b32 a0, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v2, 2.0 +; GFX942-NEXT: v_accvgpr_write_b32 a15, 2.0 +; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a5, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a7, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a8, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a9, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a10, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a11, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a12, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a13, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a14, a0 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v1, v2, a[0:15] +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: s_endpgm bb: %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> , i32 0, i32 0, i32 0) store <16 x float> %mai.1, ptr addrspace(1) %arg ret void } -; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_imm: -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} -; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} -; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} -; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} -; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} -; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} -; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} -; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} -; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} -; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} -; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} -; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} -; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} -; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} -; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} -; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} -; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} -; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} -; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} -; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} -; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} -; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} -; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} -; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} -; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} -; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} -; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} -; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} -; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} -; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} -; GFX908_A: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}] -; GFX942: v_mfma_f32_32x32x1_2b_f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}] -; GFX908-COUNT-32: v_accvgpr_read_b32 -; GFX908: global_store_dwordx4 -; GFX90A-NOT: v_accvgpr_read_b32 -; GFX90A-COUNT-8: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}], define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #0 { +; NOLIT-SRCC-LABEL: test_mfma_f32_32x32x1f32_imm: +; NOLIT-SRCC: ; %bb.0: ; %bb +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a16, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a17, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a18, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a19, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a20, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a21, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a22, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a23, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a24, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a25, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a26, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a27, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a28, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a29, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a30, 0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a31, 0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 +; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; NOLIT-SRCC-NEXT: s_endpgm +; +; LIT-SRCC-LABEL: test_mfma_f32_32x32x1f32_imm: +; LIT-SRCC: ; %bb.0: ; %bb +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 1.0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, 0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, 0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, 0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, 0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a8, 0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a9, 0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a10, 0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a11, 0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a12, 0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a13, 0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a14, 0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a15, 0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a16, 0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a17, 0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a18, 0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a19, 0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a20, 0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a21, 0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a22, 0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a23, 0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a24, 0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a25, 0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a26, 0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a27, 0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a28, 0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a29, 0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a30, 0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a31, 0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 +; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; LIT-SRCC-NEXT: s_endpgm +; +; GFX90A-LABEL: test_mfma_f32_32x32x1f32_imm: +; GFX90A: ; %bb.0: ; %bb +; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, 1.0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a1 +; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a1 +; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a1 +; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a1 +; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a1 +; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a1 +; GFX90A-NEXT: v_accvgpr_mov_b32 a8, a1 +; GFX90A-NEXT: v_accvgpr_mov_b32 a9, a1 +; GFX90A-NEXT: v_accvgpr_mov_b32 a10, a1 +; GFX90A-NEXT: v_accvgpr_mov_b32 a11, a1 +; GFX90A-NEXT: v_accvgpr_mov_b32 a12, a1 +; GFX90A-NEXT: v_accvgpr_mov_b32 a13, a1 +; GFX90A-NEXT: v_accvgpr_mov_b32 a14, a1 +; GFX90A-NEXT: v_accvgpr_mov_b32 a15, a1 +; GFX90A-NEXT: v_accvgpr_mov_b32 a16, a1 +; GFX90A-NEXT: v_accvgpr_mov_b32 a17, a1 +; GFX90A-NEXT: v_accvgpr_mov_b32 a18, a1 +; GFX90A-NEXT: v_accvgpr_mov_b32 a19, a1 +; GFX90A-NEXT: v_accvgpr_mov_b32 a20, a1 +; GFX90A-NEXT: v_accvgpr_mov_b32 a21, a1 +; GFX90A-NEXT: v_accvgpr_mov_b32 a22, a1 +; GFX90A-NEXT: v_accvgpr_mov_b32 a23, a1 +; GFX90A-NEXT: v_accvgpr_mov_b32 a24, a1 +; GFX90A-NEXT: v_accvgpr_mov_b32 a25, a1 +; GFX90A-NEXT: v_accvgpr_mov_b32 a26, a1 +; GFX90A-NEXT: v_accvgpr_mov_b32 a27, a1 +; GFX90A-NEXT: v_accvgpr_mov_b32 a28, a1 +; GFX90A-NEXT: v_accvgpr_mov_b32 a29, a1 +; GFX90A-NEXT: v_accvgpr_mov_b32 a30, a1 +; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a1 +; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 +; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 +; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 +; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 +; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: test_mfma_f32_32x32x1f32_imm: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a0, 1.0 +; GFX942-NEXT: v_accvgpr_mov_b32 a2, a1 +; GFX942-NEXT: v_accvgpr_mov_b32 a3, a1 +; GFX942-NEXT: v_accvgpr_mov_b32 a4, a1 +; GFX942-NEXT: v_accvgpr_mov_b32 a5, a1 +; GFX942-NEXT: v_accvgpr_mov_b32 a6, a1 +; GFX942-NEXT: v_accvgpr_mov_b32 a7, a1 +; GFX942-NEXT: v_accvgpr_mov_b32 a8, a1 +; GFX942-NEXT: v_accvgpr_mov_b32 a9, a1 +; GFX942-NEXT: v_accvgpr_mov_b32 a10, a1 +; GFX942-NEXT: v_accvgpr_mov_b32 a11, a1 +; GFX942-NEXT: v_accvgpr_mov_b32 a12, a1 +; GFX942-NEXT: v_accvgpr_mov_b32 a13, a1 +; GFX942-NEXT: v_accvgpr_mov_b32 a14, a1 +; GFX942-NEXT: v_accvgpr_mov_b32 a15, a1 +; GFX942-NEXT: v_accvgpr_mov_b32 a16, a1 +; GFX942-NEXT: v_accvgpr_mov_b32 a17, a1 +; GFX942-NEXT: v_accvgpr_mov_b32 a18, a1 +; GFX942-NEXT: v_accvgpr_mov_b32 a19, a1 +; GFX942-NEXT: v_accvgpr_mov_b32 a20, a1 +; GFX942-NEXT: v_accvgpr_mov_b32 a21, a1 +; GFX942-NEXT: v_accvgpr_mov_b32 a22, a1 +; GFX942-NEXT: v_accvgpr_mov_b32 a23, a1 +; GFX942-NEXT: v_accvgpr_mov_b32 a24, a1 +; GFX942-NEXT: v_accvgpr_mov_b32 a25, a1 +; GFX942-NEXT: v_accvgpr_mov_b32 a26, a1 +; GFX942-NEXT: v_accvgpr_mov_b32 a27, a1 +; GFX942-NEXT: v_accvgpr_mov_b32 a28, a1 +; GFX942-NEXT: v_accvgpr_mov_b32 a29, a1 +; GFX942-NEXT: v_accvgpr_mov_b32 a30, a1 +; GFX942-NEXT: v_accvgpr_mov_b32 a31, a1 +; GFX942-NEXT: v_mov_b32_e32 v2, 2.0 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 +; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 +; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 +; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 +; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: s_endpgm bb: %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> , i32 0, i32 0, i32 0) store <32 x float> %mai.1, ptr addrspace(1) %arg ret void } -; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_lit_splat: -; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x42f60000 -; GCN: v_accvgpr_write_b32 [[TTMPA:a[0-9]+]], [[TMP]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] -; GFX90A: v_accvgpr_mov_b32 a{{[0-9]+}}, [[TTMPA]] -; GFX90A: v_accvgpr_mov_b32 a{{[0-9]+}}, [[TTMPA]] -; GFX90A: v_accvgpr_mov_b32 a{{[0-9]+}}, [[TTMPA]] -; GFX908_A: v_mfma_f32_4x4x1f32 [[RES:a\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}] -; GFX942: v_mfma_f32_4x4x1_16b_f32 [[RES:a\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}] -; GFX908-COUNT-4: v_accvgpr_read_b32 -; GFX908: global_store_dwordx4 -; GFX90A-NOT: v_accvgpr_read_b32 -; GFX90A: global_store_dwordx4 {{v[0-9]+}}, [[RES]] define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat(ptr addrspace(1) %arg, i64 %idx) #0 { +; NOLIT-SRCC-LABEL: test_mfma_f32_4x4x1f32_lit_splat: +; NOLIT-SRCC: ; %bb.0: ; %bb +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 0x42f60000 +; NOLIT-SRCC-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v1 +; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] +; NOLIT-SRCC-NEXT: s_nop 3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; NOLIT-SRCC-NEXT: s_endpgm +; +; LIT-SRCC-LABEL: test_mfma_f32_4x4x1f32_lit_splat: +; LIT-SRCC: ; %bb.0: ; %bb +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 0x42f60000 +; LIT-SRCC-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v1 +; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] +; LIT-SRCC-NEXT: s_nop 3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; LIT-SRCC-NEXT: s_endpgm +; +; GFX90A-LABEL: test_mfma_f32_4x4x1f32_lit_splat: +; GFX90A: ; %bb.0: ; %bb +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x42f60000 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0 +; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v1, v2, a[0:3] +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_nop 3 +; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: test_mfma_f32_4x4x1f32_lit_splat: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: v_mov_b32_e32 v1, 0x42f60000 +; GFX942-NEXT: v_accvgpr_write_b32 a0, v1 +; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0 +; GFX942-NEXT: v_mov_b32_e32 v2, 2.0 +; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v1, v2, a[0:3] +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_nop 2 +; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: s_endpgm bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %arg, i32 %tid @@ -619,20 +4764,88 @@ bb: ret void } -; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_lit_splat_bad_code: -; GCN: v_mov_b32_e32 [[TMP0:v[0-9]+]], 0x42f60000 -; GCN: v_accvgpr_write_b32 [[AGPR:a[0-9]+]], [[TMP0]] -; GFX90A_42-COUNT-3: v_accvgpr_mov_b32 a{{[0-9]+}}, [[AGPR]] -; GFX908-NEXT: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP0]] -; GFX908-NEXT: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP0]] -; GFX908-NEXT: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP0]] -; GCN: s_nop 0 -; GFX908_A: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}] -; GFX942: v_mfma_f32_4x4x1_16b_f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}] -; GFX908-COUNT-4: v_accvgpr_read_b32 -; GFX908: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}], s[{{[0-9:]+}}] -; GFX90A_42: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}], s[{{[0-9:]+}}] define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat_bad_code(ptr addrspace(1) %arg) #0 { +; NOLIT-SRCC-LABEL: test_mfma_f32_4x4x1f32_lit_splat_bad_code: +; NOLIT-SRCC: ; %bb.0: ; %bb +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 0x42f60000 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 1.0 +; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 2.0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v1, v0, a[0:3] +; NOLIT-SRCC-NEXT: s_nop 3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; NOLIT-SRCC-NEXT: s_endpgm +; +; LIT-SRCC-LABEL: test_mfma_f32_4x4x1f32_lit_splat_bad_code: +; LIT-SRCC: ; %bb.0: ; %bb +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 0x42f60000 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 1.0 +; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 2.0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v1, v0, a[0:3] +; LIT-SRCC-NEXT: s_nop 3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; LIT-SRCC-NEXT: s_endpgm +; +; GFX90A-LABEL: test_mfma_f32_4x4x1f32_lit_splat_bad_code: +; GFX90A: ; %bb.0: ; %bb +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x42f60000 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v1, v2, a[0:3] +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_nop 3 +; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: test_mfma_f32_4x4x1f32_lit_splat_bad_code: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: v_mov_b32_e32 v1, 0x42f60000 +; GFX942-NEXT: v_accvgpr_write_b32 a0, v1 +; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0 +; GFX942-NEXT: v_mov_b32_e32 v2, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v1, v2, a[0:3] +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_nop 2 +; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: s_endpgm bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %arg, i32 %tid @@ -642,22 +4855,260 @@ bb: ret void } -; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_vecarg: -; GFX90A_42-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 -; GFX90A_42-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 -; GCN-COUNT-8: global_load_dwordx4 -; GFX908-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX90A_42-NOT: v_accvgpr_write -; GFX908-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 -; GFX908-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 -; GFX908: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GFX90A: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GFX942: v_mfma_f32_32x32x1_2b_f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GFX908: v_accvgpr_read_b32 -; GFX908-COUNT-8: global_store_dwordx4 -; GFX90A_42-NOT: v_accvgpr_read_b32 -; GFX90A_42-COUNT-5: global_store_dwordx4 v{{[0-9:]+}}, a[{{[0-9:]+}}], s[{{[0-9:]+}}] define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg) #0 { +; NOLIT-SRCC-LABEL: test_mfma_f32_32x32x1f32_vecarg: +; NOLIT-SRCC: ; %bb.0: ; %bb +; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; NOLIT-SRCC-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; NOLIT-SRCC-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 +; NOLIT-SRCC-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 +; NOLIT-SRCC-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 +; NOLIT-SRCC-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64 +; NOLIT-SRCC-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 +; NOLIT-SRCC-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 +; NOLIT-SRCC-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 +; NOLIT-SRCC-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] +; NOLIT-SRCC-NEXT: s_waitcnt vmcnt(0) +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v3 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v4 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v5 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v6 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v7 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v8 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v9 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v10 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v11 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v12 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v13 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v14 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v15 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a16, v16 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a17, v17 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a18, v18 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a19, v19 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a20, v20 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a21, v21 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a22, v22 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a23, v23 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a24, v24 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a25, v25 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a26, v26 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v27 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v28 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v29 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v30 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v31 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] cbsz:1 abid:2 blgp:3 +; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a31 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a30 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a29 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a28 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a19 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a18 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a17 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a16 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a23 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a22 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a21 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a20 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v19, a11 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v18, a10 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v17, a9 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v16, a8 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v23, a15 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v22, a14 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v21, a13 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v20, a12 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v27, a3 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v26, a2 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v25, a1 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v24, a0 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96 +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:48 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] +; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16 +; NOLIT-SRCC-NEXT: s_endpgm +; +; LIT-SRCC-LABEL: test_mfma_f32_32x32x1f32_vecarg: +; LIT-SRCC: ; %bb.0: ; %bb +; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; LIT-SRCC-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) +; LIT-SRCC-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 +; LIT-SRCC-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 +; LIT-SRCC-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 +; LIT-SRCC-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64 +; LIT-SRCC-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 +; LIT-SRCC-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 +; LIT-SRCC-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 +; LIT-SRCC-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] +; LIT-SRCC-NEXT: s_waitcnt vmcnt(0) +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v3 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v4 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v5 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v6 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v7 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v8 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v9 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v10 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v11 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v12 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v13 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v14 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v15 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a16, v16 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a17, v17 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a18, v18 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a19, v19 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a20, v20 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a21, v21 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a22, v22 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a23, v23 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a24, v24 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a25, v25 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a26, v26 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v27 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v28 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v29 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v30 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v31 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] cbsz:1 abid:2 blgp:3 +; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a31 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a30 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a29 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a28 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a19 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a18 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a17 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a16 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a23 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a22 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a21 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a20 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v19, a11 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v18, a10 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v17, a9 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v16, a8 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v23, a15 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v22, a14 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v21, a13 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v20, a12 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v27, a3 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v26, a2 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v25, a1 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v24, a0 +; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96 +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112 +; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64 +; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80 +; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 +; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:48 +; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] +; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16 +; LIT-SRCC-NEXT: s_endpgm +; +; GFX90A-LABEL: test_mfma_f32_32x32x1f32_vecarg: +; GFX90A: ; %bb.0: ; %bb +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112 +; GFX90A-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96 +; GFX90A-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80 +; GFX90A-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64 +; GFX90A-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48 +; GFX90A-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32 +; GFX90A-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16 +; GFX90A-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 +; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 +; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 +; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 +; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: test_mfma_f32_32x32x1f32_vecarg: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v2, 2.0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112 +; GFX942-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96 +; GFX942-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80 +; GFX942-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64 +; GFX942-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48 +; GFX942-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32 +; GFX942-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16 +; GFX942-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 +; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 +; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 +; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 +; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GFX942-NEXT: s_endpgm bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %arg, i32 %tid @@ -668,3 +5119,8 @@ bb: } attributes #0 = { "amdgpu-flat-work-group-size"="1,256" } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} +; GFX908: {{.*}} +; GFX908_A: {{.*}} +; GFX90A_42: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll index 2f440d5230b98..44415657b6336 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll @@ -1,44 +1,230 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX8-OPT,GCN-OPT %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -O0 -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX8-NOOPT %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10,GCN-OPT %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-enable-vopd=0 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11,GCN-OPT %s -; GCN-LABEL: {{^}}dpp_test: -; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}} -; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}} -; GFX8-OPT: s_mov -; GFX8-OPT: s_mov -; GFX8-NOOPT: s_nop 1 -; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) { +; GFX8-OPT-LABEL: dpp_test: +; GFX8-OPT: ; %bb.0: +; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-OPT-NEXT: s_mov_b32 s6, -1 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-OPT-NEXT: s_mov_b32 s4, s0 +; GFX8-OPT-NEXT: s_mov_b32 s5, s1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: dpp_test: +; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 +; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 +; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: dpp_test: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dpp_test: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_endpgm %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 false) #0 store i32 %tmp0, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}dpp_test_bc: -; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}} -; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}} -; GFX8-OPT: s_mov -; GFX8-OPT: s_mov -; GFX8-NOOPT: s_nop 1 -; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[2,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1{{$}} define amdgpu_kernel void @dpp_test_bc(ptr addrspace(1) %out, i32 %in1, i32 %in2) { +; GFX8-OPT-LABEL: dpp_test_bc: +; GFX8-OPT: ; %bb.0: +; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-OPT-NEXT: s_mov_b32 s6, -1 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-OPT-NEXT: s_mov_b32 s4, s0 +; GFX8-OPT-NEXT: s_mov_b32 s5, s1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[2,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1 +; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: dpp_test_bc: +; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 +; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 +; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[2,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1 +; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: dpp_test_bc: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[2,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dpp_test_bc: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[2,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_endpgm %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 2, i32 1, i32 1, i1 true) #0 store i32 %tmp0, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}dpp_test1: -; GFX10,GFX11: v_add_nc_u32_e32 [[REG:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} -; GFX8-OPT: v_add_u32_e32 [[REG:v[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}} -; GFX8-NOOPT: v_add_u32_e64 [[REG:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{[0-9]+}} -; GFX8-NOOPT: v_mov_b32_e32 v{{[0-9]+}}, 0 -; GFX8: s_nop 1 -; GFX8-NEXT: v_mov_b32_dpp {{v[0-9]+}}, [[REG]] quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf @0 = internal unnamed_addr addrspace(3) global [448 x i32] poison, align 4 define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr { +; GFX8-OPT-LABEL: dpp_test1: +; GFX8-OPT: ; %bb.0: ; %bb +; GFX8-OPT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX8-OPT-NEXT: s_mov_b32 m0, -1 +; GFX8-OPT-NEXT: ds_read_b32 v1, v0 +; GFX8-OPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-OPT-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: s_barrier +; GFX8-OPT-NEXT: v_add_u32_e32 v1, vcc, v1, v1 +; GFX8-OPT-NEXT: s_nop 1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf +; GFX8-OPT-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-OPT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-OPT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-OPT-NEXT: flat_store_dword v[0:1], v2 +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: dpp_test1: +; GFX8-NOOPT: ; %bb.0: ; %bb +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX8-NOOPT-NEXT: s_mov_b32 s0, 2 +; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v3, s0, v0 +; GFX8-NOOPT-NEXT: s_mov_b32 m0, -1 +; GFX8-NOOPT-NEXT: ds_read_b32 v0, v3 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_barrier +; GFX8-NOOPT-NEXT: v_add_u32_e64 v1, s[0:1], v0, v0 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf +; GFX8-NOOPT-NEXT: v_add_u32_e64 v2, s[0:1], v0, v1 +; GFX8-NOOPT-NEXT: s_mov_b32 s0, 0 +; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NOOPT-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NOOPT-NEXT: s_mov_b32 s0, s2 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s3 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NOOPT-NEXT: v_add_u32_e64 v0, s[0:1], s0, v0 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NOOPT-NEXT: v_addc_u32_e64 v3, s[0:1], v1, v3, s[0:1] +; GFX8-NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NOOPT-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: dpp_test1: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: ds_read_b32 v1, v0 +; GFX10-NEXT: s_barrier +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v1 +; GFX10-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf +; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v1 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: flat_store_dword v[0:1], v2 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dpp_test1: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: ds_load_b32 v1, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_barrier +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v1 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = zext i32 %tmp to i64 @@ -55,13 +241,95 @@ bb: ret void } -; GCN-LABEL: {{^}}update_dppi64_test: -; GCN: load_{{dwordx2|b64}} v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]] -; GCN-OPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} -; GCN-OPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} -; GCN-NOOPT: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} -; GCN-NOOPT: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i64 %in2) { +; GFX8-OPT-LABEL: update_dppi64_test: +; GFX8-OPT: ; %bb.0: +; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-OPT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-OPT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-OPT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-OPT-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-OPT-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-OPT-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-OPT-NEXT: s_waitcnt vmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-OPT-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: update_dppi64_test: +; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3 +; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s2, v0 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, 0 +; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr2 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s4 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NOOPT-NEXT: s_mov_b32 s4, s5 +; GFX8-NOOPT-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec +; GFX8-NOOPT-NEXT: v_add_u32_e64 v0, s[2:3], s2, v0 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NOOPT-NEXT: v_addc_u32_e64 v2, s[2:3], v1, v2, s[2:3] +; GFX8-NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, v2 +; GFX8-NOOPT-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NOOPT-NEXT: s_waitcnt vmcnt(0) +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s1 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0 +; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0 +; GFX8-NOOPT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NOOPT-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: update_dppi64_test: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: update_dppi64_test: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1] +; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1] +; GFX11-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id %load = load i64, ptr addrspace(1) %gep @@ -70,13 +338,95 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i ret void } -; GCN-LABEL: {{^}}update_dppf64_test: -; GCN: load_{{dwordx2|b64}} v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]] -; GCN-OPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} -; GCN-OPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} -; GCN-NOOPT: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} -; GCN-NOOPT: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1, double %in2) { +; GFX8-OPT-LABEL: update_dppf64_test: +; GFX8-OPT: ; %bb.0: +; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-OPT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-OPT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-OPT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-OPT-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-OPT-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-OPT-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-OPT-NEXT: s_waitcnt vmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-OPT-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: update_dppf64_test: +; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3 +; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s2, v0 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, 0 +; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr2 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s4 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NOOPT-NEXT: s_mov_b32 s4, s5 +; GFX8-NOOPT-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec +; GFX8-NOOPT-NEXT: v_add_u32_e64 v0, s[2:3], s2, v0 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NOOPT-NEXT: v_addc_u32_e64 v2, s[2:3], v1, v2, s[2:3] +; GFX8-NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, v2 +; GFX8-NOOPT-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NOOPT-NEXT: s_waitcnt vmcnt(0) +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s1 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0 +; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0 +; GFX8-NOOPT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NOOPT-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: update_dppf64_test: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: update_dppf64_test: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1] +; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1] +; GFX11-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds double, ptr addrspace(1) %arg, i32 %id %load = load double, ptr addrspace(1) %gep @@ -85,13 +435,95 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1 ret void } -; GCN-LABEL: {{^}}update_dppv2i32_test: -; GCN: load_{{dwordx2|b64}} v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]] -; GCN-OPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} -; GCN-OPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} -; GCN-NOOPT: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} -; GCN-NOOPT: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> %in1, <2 x i32> %in2) { +; GFX8-OPT-LABEL: update_dppv2i32_test: +; GFX8-OPT: ; %bb.0: +; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-OPT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-OPT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-OPT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-OPT-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-OPT-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-OPT-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-OPT-NEXT: s_waitcnt vmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-OPT-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: update_dppv2i32_test: +; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3 +; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s2, v0 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, 0 +; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr2 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s4 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NOOPT-NEXT: s_mov_b32 s4, s5 +; GFX8-NOOPT-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec +; GFX8-NOOPT-NEXT: v_add_u32_e64 v0, s[2:3], s2, v0 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NOOPT-NEXT: v_addc_u32_e64 v2, s[2:3], v1, v2, s[2:3] +; GFX8-NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, v2 +; GFX8-NOOPT-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NOOPT-NEXT: s_waitcnt vmcnt(0) +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s1 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0 +; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0 +; GFX8-NOOPT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NOOPT-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: update_dppv2i32_test: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: update_dppv2i32_test: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1] +; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1] +; GFX11-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x i32>, ptr addrspace(1) %arg, i32 %id %load = load <2 x i32>, ptr addrspace(1) %gep @@ -100,13 +532,95 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> ret void } -; GCN-LABEL: {{^}}update_dppv2f32_test: -; GCN: load_{{dwordx2|b64}} v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]] -; GCN-OPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} -; GCN-OPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} -; GCN-NOOPT: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} -; GCN-NOOPT: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x float> %in1, <2 x float> %in2) { +; GFX8-OPT-LABEL: update_dppv2f32_test: +; GFX8-OPT: ; %bb.0: +; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-OPT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-OPT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-OPT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-OPT-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-OPT-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-OPT-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-OPT-NEXT: s_waitcnt vmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-OPT-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: update_dppv2f32_test: +; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3 +; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s2, v0 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, 0 +; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr2 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s4 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NOOPT-NEXT: s_mov_b32 s4, s5 +; GFX8-NOOPT-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec +; GFX8-NOOPT-NEXT: v_add_u32_e64 v0, s[2:3], s2, v0 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NOOPT-NEXT: v_addc_u32_e64 v2, s[2:3], v1, v2, s[2:3] +; GFX8-NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, v2 +; GFX8-NOOPT-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NOOPT-NEXT: s_waitcnt vmcnt(0) +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s1 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0 +; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0 +; GFX8-NOOPT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NOOPT-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: update_dppv2f32_test: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: update_dppv2f32_test: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1] +; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1] +; GFX11-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %arg, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep @@ -115,13 +629,95 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa ret void } -; GCN-LABEL: {{^}}update_dpp_p0_test: -; GCN: load_{{dwordx2|b64}} v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]] -; GCN-OPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} -; GCN-OPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} -; GCN-NOOPT: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} -; GCN-NOOPT: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, ptr %in2) { +; GFX8-OPT-LABEL: update_dpp_p0_test: +; GFX8-OPT: ; %bb.0: +; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-OPT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-OPT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-OPT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-OPT-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-OPT-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-OPT-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-OPT-NEXT: s_waitcnt vmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-OPT-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: update_dpp_p0_test: +; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3 +; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s2, v0 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, 0 +; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr2 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s4 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NOOPT-NEXT: s_mov_b32 s4, s5 +; GFX8-NOOPT-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec +; GFX8-NOOPT-NEXT: v_add_u32_e64 v0, s[2:3], s2, v0 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NOOPT-NEXT: v_addc_u32_e64 v2, s[2:3], v1, v2, s[2:3] +; GFX8-NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, v2 +; GFX8-NOOPT-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NOOPT-NEXT: s_waitcnt vmcnt(0) +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s1 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0 +; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0 +; GFX8-NOOPT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NOOPT-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: update_dpp_p0_test: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: update_dpp_p0_test: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1] +; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1] +; GFX11-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds ptr, ptr addrspace(1) %arg, i32 %id %load = load ptr, ptr addrspace(1) %gep @@ -130,10 +726,65 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p ret void } -; GCN-LABEL: {{^}}update_dpp_p3_test: -; GCN: {{load|read}}_{{dword|b32}} v[[SRC:[0-9]+]] -; GCN: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspace(3) %in1, ptr %in2) { +; GFX8-OPT-LABEL: update_dpp_p3_test: +; GFX8-OPT: ; %bb.0: +; GFX8-OPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-OPT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX8-OPT-NEXT: s_mov_b32 m0, -1 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-OPT-NEXT: ds_read_b32 v1, v0 +; GFX8-OPT-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: s_nop 0 +; GFX8-OPT-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-OPT-NEXT: ds_write_b32 v0, v2 +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: update_dpp_p3_test: +; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_load_dword s1, s[4:5], 0x24 +; GFX8-NOOPT-NEXT: s_load_dword s0, s[4:5], 0x28 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, 2 +; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v0, s2, v0 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: v_add_u32_e64 v0, s[2:3], s1, v0 +; GFX8-NOOPT-NEXT: s_mov_b32 m0, -1 +; GFX8-NOOPT-NEXT: ds_read_b32 v2, v0 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_nop 0 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v1, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-NOOPT-NEXT: s_mov_b32 m0, -1 +; GFX8-NOOPT-NEXT: ds_write_b32 v0, v1 +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: update_dpp_p3_test: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s0 +; GFX10-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-NEXT: ds_read_b32 v1, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX10-NEXT: ds_write_b32 v0, v2 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: update_dpp_p3_test: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, s0 +; GFX11-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-NEXT: ds_load_b32 v1, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX11-NEXT: ds_store_b32 v0, v2 +; GFX11-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %arg, i32 %id %load = load ptr addrspace(3), ptr addrspace(3) %gep @@ -142,10 +793,80 @@ define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspa ret void } -; GCN-LABEL: {{^}}update_dpp_p5_test: -; GCN: {{load|read}}_{{dword|b32}} v[[SRC:[0-9]+]] -; GCN: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} define amdgpu_kernel void @update_dpp_p5_test(ptr addrspace(5) %arg, ptr addrspace(5) %in1, ptr %in2) { +; GFX8-OPT-LABEL: update_dpp_p5_test: +; GFX8-OPT: ; %bb.0: +; GFX8-OPT-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GFX8-OPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-OPT-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GFX8-OPT-NEXT: s_mov_b32 s90, -1 +; GFX8-OPT-NEXT: s_mov_b32 s91, 0xe80000 +; GFX8-OPT-NEXT: s_add_u32 s88, s88, s11 +; GFX8-OPT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX8-OPT-NEXT: s_addc_u32 s89, s89, 0 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-OPT-NEXT: buffer_load_dword v1, v0, s[88:91], 0 offen +; GFX8-OPT-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-OPT-NEXT: s_waitcnt vmcnt(0) +; GFX8-OPT-NEXT: s_nop 0 +; GFX8-OPT-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-OPT-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: update_dpp_p5_test: +; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GFX8-NOOPT-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GFX8-NOOPT-NEXT: s_mov_b32 s90, -1 +; GFX8-NOOPT-NEXT: s_mov_b32 s91, 0xe80000 +; GFX8-NOOPT-NEXT: s_add_u32 s88, s88, s11 +; GFX8-NOOPT-NEXT: s_addc_u32 s89, s89, 0 +; GFX8-NOOPT-NEXT: s_load_dword s1, s[4:5], 0x24 +; GFX8-NOOPT-NEXT: s_load_dword s0, s[4:5], 0x28 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, 2 +; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v0, s2, v0 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: v_add_u32_e64 v1, s[2:3], s1, v0 +; GFX8-NOOPT-NEXT: buffer_load_dword v2, v1, s[88:91], 0 offen +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NOOPT-NEXT: s_waitcnt vmcnt(0) +; GFX8-NOOPT-NEXT: s_nop 0 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-NOOPT-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: update_dpp_p5_test: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-NEXT: s_mov_b32 s14, -1 +; GFX10-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-NEXT: s_add_u32 s12, s12, s11 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s0 +; GFX10-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-NEXT: buffer_load_dword v1, v0, s[12:15], 0 offen +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX10-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: update_dpp_p5_test: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, s0 +; GFX11-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-NEXT: scratch_load_b32 v1, v0, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX11-NEXT: scratch_store_b32 v0, v2, off +; GFX11-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds ptr addrspace(5), ptr addrspace(5) %arg, i32 %id %load = load ptr addrspace(5), ptr addrspace(5) %gep @@ -154,18 +875,97 @@ define amdgpu_kernel void @update_dpp_p5_test(ptr addrspace(5) %arg, ptr addrspa ret void } -; GCN-LABEL: {{^}}update_dppi64_imm_old_test: -; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_LO:[0-9]+]], 0x3afaedd9 -; GFX8-OPT-DAG,GFX10-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], 0x7047 -; GFX11-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], 0x7047 -; GFX8-NOOPT-DAG: s_mov_b32 s[[SOLD_LO:[0-9]+]], 0x3afaedd9 -; GFX8-NOOPT-DAG: s_mov_b32 s[[SOLD_HI:[0-9]+]], 0x7047 -; GCN-DAG: load_{{dwordx2|b64}} v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]] -; GCN-OPT-DAG: v_mov_b32_dpp v[[OLD_LO]], v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} -; GFX8-OPT-DAG,GFX10-DAG,GFX11-DAG: v_mov_b32_dpp v[[OLD_HI]], v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} -; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} -; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} define amdgpu_kernel void @update_dppi64_imm_old_test(ptr addrspace(1) %arg, i64 %in2) { +; GFX8-OPT-LABEL: update_dppi64_imm_old_test: +; GFX8-OPT: ; %bb.0: +; GFX8-OPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-OPT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX8-OPT-NEXT: v_mov_b32_e32 v5, 0x7047 +; GFX8-OPT-NEXT: v_mov_b32_e32 v4, 0x3afaedd9 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-OPT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-OPT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-OPT-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-OPT-NEXT: s_waitcnt vmcnt(0) +; GFX8-OPT-NEXT: s_nop 0 +; GFX8-OPT-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-OPT-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: update_dppi64_imm_old_test: +; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX8-NOOPT-NEXT: s_mov_b32 s0, 3 +; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s0, v0 +; GFX8-NOOPT-NEXT: s_mov_b32 s0, 0 +; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s0, s2 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s3 +; GFX8-NOOPT-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec +; GFX8-NOOPT-NEXT: v_add_u32_e64 v0, s[0:1], s0, v0 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NOOPT-NEXT: v_addc_u32_e64 v2, s[0:1], v1, v2, s[0:1] +; GFX8-NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, v2 +; GFX8-NOOPT-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NOOPT-NEXT: s_mov_b32 s2, 0x7047 +; GFX8-NOOPT-NEXT: s_mov_b32 s0, 0x3afaedd9 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GFX8-NOOPT-NEXT: s_mov_b32 s1, s2 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s1 +; GFX8-NOOPT-NEXT: s_waitcnt vmcnt(0) +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0 +; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0 +; GFX8-NOOPT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NOOPT-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: update_dppi64_imm_old_test: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, 0x7047 +; GFX10-NEXT: v_mov_b32_e32 v2, 0x3afaedd9 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: update_dppi64_imm_old_test: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: v_mov_b32_e32 v3, 0x7047 +; GFX11-NEXT: v_mov_b32_e32 v2, 0x3afaedd9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1] +; GFX11-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id %load = load i64, ptr addrspace(1) %gep @@ -174,18 +974,97 @@ define amdgpu_kernel void @update_dppi64_imm_old_test(ptr addrspace(1) %arg, i64 ret void } -; GCN-LABEL: {{^}}update_dppf64_imm_old_test: -; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_LO:[0-9]+]], 0x6b8564a -; GFX8-OPT-DAG,GFX10-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], 0x405edce1 -; GFX11-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], 0x405edce1 -; GFX8-NOOPT-DAG: s_mov_b32 s[[SOLD_LO:[0-9]+]], 0x6b8564a -; GFX8-NOOPT-DAG: s_mov_b32 s[[SOLD_HI:[0-9]+]], 0x405edce1 -; GCN-DAG: load_{{dwordx2|b64}} v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]] -; GCN-OPT-DAG: v_mov_b32_dpp v[[OLD_LO]], v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} -; GFX8-OPT-DAG,GFX10-DAG,GFX11-DAG: v_mov_b32_dpp v[[OLD_HI]], v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} -; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} -; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} define amdgpu_kernel void @update_dppf64_imm_old_test(ptr addrspace(1) %arg, double %in2) { +; GFX8-OPT-LABEL: update_dppf64_imm_old_test: +; GFX8-OPT: ; %bb.0: +; GFX8-OPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-OPT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX8-OPT-NEXT: v_mov_b32_e32 v5, 0x405edce1 +; GFX8-OPT-NEXT: v_mov_b32_e32 v4, 0x6b8564a +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-OPT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-OPT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-OPT-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-OPT-NEXT: s_waitcnt vmcnt(0) +; GFX8-OPT-NEXT: s_nop 0 +; GFX8-OPT-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-OPT-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: update_dppf64_imm_old_test: +; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX8-NOOPT-NEXT: s_mov_b32 s0, 3 +; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s0, v0 +; GFX8-NOOPT-NEXT: s_mov_b32 s0, 0 +; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s0, s2 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s3 +; GFX8-NOOPT-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec +; GFX8-NOOPT-NEXT: v_add_u32_e64 v0, s[0:1], s0, v0 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NOOPT-NEXT: v_addc_u32_e64 v2, s[0:1], v1, v2, s[0:1] +; GFX8-NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, v2 +; GFX8-NOOPT-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NOOPT-NEXT: s_mov_b32 s2, 0x405edce1 +; GFX8-NOOPT-NEXT: s_mov_b32 s0, 0x6b8564a +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GFX8-NOOPT-NEXT: s_mov_b32 s1, s2 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s1 +; GFX8-NOOPT-NEXT: s_waitcnt vmcnt(0) +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0 +; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0 +; GFX8-NOOPT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NOOPT-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: update_dppf64_imm_old_test: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, 0x405edce1 +; GFX10-NEXT: v_mov_b32_e32 v2, 0x6b8564a +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: update_dppf64_imm_old_test: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: v_mov_b32_e32 v3, 0x405edce1 +; GFX11-NEXT: v_mov_b32_e32 v2, 0x6b8564a +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1] +; GFX11-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id %load = load double, ptr addrspace(1) %gep @@ -194,383 +1073,1964 @@ define amdgpu_kernel void @update_dppf64_imm_old_test(ptr addrspace(1) %arg, dou ret void } -; GCN-LABEL: {{^}}update_dppi64_imm_src_test: -; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_LO:[0-9]+]], 0x3afaedd9 -; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], 0x7047 -; GFX8-NOOPT-DAG: s_mov_b32 s[[SOLD_LO:[0-9]+]], 0x3afaedd9 -; GFX8-NOOPT-DAG: s_mov_b32 s[[SOLD_HI:[0-9]+]], 0x7047 -; GCN-OPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[OLD_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} -; GCN-OPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[OLD_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} -; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} -; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} define amdgpu_kernel void @update_dppi64_imm_src_test(ptr addrspace(1) %out, i64 %in1) { +; GFX8-OPT-LABEL: update_dppi64_imm_src_test: +; GFX8-OPT: ; %bb.0: +; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-OPT-NEXT: v_mov_b32_e32 v0, 0x7047 +; GFX8-OPT-NEXT: v_mov_b32_e32 v2, 0x3afaedd9 +; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-OPT-NEXT: s_mov_b32 s6, -1 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-OPT-NEXT: s_mov_b32 s4, s0 +; GFX8-OPT-NEXT: s_mov_b32 s5, s1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v1, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-OPT-NEXT: s_nop 1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-OPT-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: update_dppi64_imm_src_test: +; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2c +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; GFX8-NOOPT-NEXT: s_mov_b32 s4, 0xf000 +; GFX8-NOOPT-NEXT: s_mov_b32 s5, -1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s5 +; GFX8-NOOPT-NEXT: s_mov_b32 s3, s4 +; GFX8-NOOPT-NEXT: s_mov_b32 s8, 0x7047 +; GFX8-NOOPT-NEXT: s_mov_b32 s4, 0x3afaedd9 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX8-NOOPT-NEXT: s_mov_b32 s5, s8 +; GFX8-NOOPT-NEXT: s_mov_b32 s8, s5 +; GFX8-NOOPT-NEXT: s_mov_b32 s9, s7 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, s9 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX8-NOOPT-NEXT: s_mov_b32 s5, s6 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr4 +; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr4 +; GFX8-NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, v2 +; GFX8-NOOPT-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: update_dppi64_imm_src_test: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v2, 0x7047 +; GFX10-NEXT: v_mov_b32_e32 v3, 0x3afaedd9 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_mov_b32_dpp v1, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX10-NEXT: v_mov_b32_dpp v0, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: update_dppi64_imm_src_test: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0x7047 +; GFX11-NEXT: v_mov_b32_e32 v3, 0x3afaedd9 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mov_b32_dpp v1, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX11-NEXT: v_mov_b32_dpp v0, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_endpgm %tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 123451234512345, i32 1, i32 1, i32 1, i1 false) #0 store i64 %tmp0, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}update_dppf64_imm_src_test: -; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_LO:[0-9]+]], 0x6b8564a -; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], 0x405edce1 -; GFX8-NOOPT-DAG: s_mov_b32 s[[SOLD_LO:[0-9]+]], 0x6b8564a -; GFX8-NOOPT-DAG: s_mov_b32 s[[SOLD_HI:[0-9]+]], 0x405edce1 -; GCN-OPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[OLD_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} -; GCN-OPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[OLD_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} -; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} -; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} define amdgpu_kernel void @update_dppf64_imm_src_test(ptr addrspace(1) %out, double %in1) { +; GFX8-OPT-LABEL: update_dppf64_imm_src_test: +; GFX8-OPT: ; %bb.0: +; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-OPT-NEXT: v_mov_b32_e32 v0, 0x405edce1 +; GFX8-OPT-NEXT: v_mov_b32_e32 v2, 0x6b8564a +; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-OPT-NEXT: s_mov_b32 s6, -1 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-OPT-NEXT: s_mov_b32 s4, s0 +; GFX8-OPT-NEXT: s_mov_b32 s5, s1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v1, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-OPT-NEXT: s_nop 1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-OPT-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: update_dppf64_imm_src_test: +; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2c +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; GFX8-NOOPT-NEXT: s_mov_b32 s4, 0xf000 +; GFX8-NOOPT-NEXT: s_mov_b32 s5, -1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s5 +; GFX8-NOOPT-NEXT: s_mov_b32 s3, s4 +; GFX8-NOOPT-NEXT: s_mov_b32 s8, 0x405edce1 +; GFX8-NOOPT-NEXT: s_mov_b32 s4, 0x6b8564a +; GFX8-NOOPT-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX8-NOOPT-NEXT: s_mov_b32 s5, s8 +; GFX8-NOOPT-NEXT: s_mov_b32 s8, s5 +; GFX8-NOOPT-NEXT: s_mov_b32 s9, s7 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, s9 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX8-NOOPT-NEXT: s_mov_b32 s5, s6 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr4 +; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr4 +; GFX8-NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, v2 +; GFX8-NOOPT-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: update_dppf64_imm_src_test: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v2, 0x405edce1 +; GFX10-NEXT: v_mov_b32_e32 v3, 0x6b8564a +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_mov_b32_dpp v1, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX10-NEXT: v_mov_b32_dpp v0, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: update_dppf64_imm_src_test: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0x405edce1 +; GFX11-NEXT: v_mov_b32_e32 v3, 0x6b8564a +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mov_b32_dpp v1, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX11-NEXT: v_mov_b32_dpp v0, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_endpgm %tmp0 = call double @llvm.amdgcn.update.dpp.f64(double %in1, double 123.451234512345, i32 1, i32 1, i32 1, i1 false) #0 store double %tmp0, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}dpp_test_f32: -; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}} -; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}} -; GFX8-OPT: s_mov -; GFX8-OPT: s_mov -; GFX8-NOOPT: s_nop 1 -; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} define amdgpu_kernel void @dpp_test_f32(ptr addrspace(1) %out, float %in1, float %in2) { +; GFX8-OPT-LABEL: dpp_test_f32: +; GFX8-OPT: ; %bb.0: +; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-OPT-NEXT: s_mov_b32 s6, -1 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-OPT-NEXT: s_mov_b32 s4, s0 +; GFX8-OPT-NEXT: s_mov_b32 s5, s1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: dpp_test_f32: +; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 +; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 +; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: dpp_test_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dpp_test_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_endpgm %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 1, i32 1, i32 1, i1 false) store float %tmp0, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}dpp_test_f32_imm_comb1: -; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}} -; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}} -; GFX8-OPT: s_mov -; GFX8-OPT: s_mov -; GFX8-NOOPT: s_nop 1 -; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0{{$}} define amdgpu_kernel void @dpp_test_f32_imm_comb1(ptr addrspace(1) %out, float %in1, float %in2) { +; GFX8-OPT-LABEL: dpp_test_f32_imm_comb1: +; GFX8-OPT: ; %bb.0: +; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-OPT-NEXT: s_mov_b32 s6, -1 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-OPT-NEXT: s_mov_b32 s4, s0 +; GFX8-OPT-NEXT: s_mov_b32 s5, s1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0 +; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: dpp_test_f32_imm_comb1: +; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 +; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 +; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0 +; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: dpp_test_f32_imm_comb1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dpp_test_f32_imm_comb1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_endpgm %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 0, i32 0, i32 0, i1 false) store float %tmp0, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}dpp_test_f32_imm_comb2: -; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}} -; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}} -; GFX8-OPT: s_mov -; GFX8-OPT: s_mov -; GFX8-NOOPT: s_nop 1 -; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3{{$}} define amdgpu_kernel void @dpp_test_f32_imm_comb2(ptr addrspace(1) %out, float %in1, float %in2) { +; GFX8-OPT-LABEL: dpp_test_f32_imm_comb2: +; GFX8-OPT: ; %bb.0: +; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-OPT-NEXT: s_mov_b32 s6, -1 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-OPT-NEXT: s_mov_b32 s4, s0 +; GFX8-OPT-NEXT: s_mov_b32 s5, s1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3 +; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: dpp_test_f32_imm_comb2: +; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 +; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 +; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3 +; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: dpp_test_f32_imm_comb2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dpp_test_f32_imm_comb2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_endpgm %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 3, i32 3, i32 3, i1 false) store float %tmp0, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}dpp_test_f32_imm_comb3: -; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}} -; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}} -; GFX8-OPT: s_mov -; GFX8-OPT: s_mov -; GFX8-NOOPT: s_nop 1 -; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1{{$}} define amdgpu_kernel void @dpp_test_f32_imm_comb3(ptr addrspace(1) %out, float %in1, float %in2) { +; GFX8-OPT-LABEL: dpp_test_f32_imm_comb3: +; GFX8-OPT: ; %bb.0: +; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-OPT-NEXT: s_mov_b32 s6, -1 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-OPT-NEXT: s_mov_b32 s4, s0 +; GFX8-OPT-NEXT: s_mov_b32 s5, s1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1 +; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: dpp_test_f32_imm_comb3: +; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 +; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 +; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1 +; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: dpp_test_f32_imm_comb3: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dpp_test_f32_imm_comb3: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_endpgm %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 1, i32 2, i32 3, i1 true) store float %tmp0, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}dpp_test_f32_imm_comb4: -; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}} -; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}} -; GFX8-OPT: s_mov -; GFX8-OPT: s_mov -; GFX8-NOOPT: s_nop 1 -; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1{{$}} define amdgpu_kernel void @dpp_test_f32_imm_comb4(ptr addrspace(1) %out, float %in1, float %in2) { +; GFX8-OPT-LABEL: dpp_test_f32_imm_comb4: +; GFX8-OPT: ; %bb.0: +; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-OPT-NEXT: s_mov_b32 s6, -1 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-OPT-NEXT: s_mov_b32 s4, s0 +; GFX8-OPT-NEXT: s_mov_b32 s5, s1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1 +; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: dpp_test_f32_imm_comb4: +; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 +; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 +; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1 +; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: dpp_test_f32_imm_comb4: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dpp_test_f32_imm_comb4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_endpgm %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 4, i32 3, i32 2, i1 true) store float %tmp0, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}dpp_test_f32_imm_comb5: -; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}} -; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}} -; GFX8-OPT: s_mov -; GFX8-OPT: s_mov -; GFX8-NOOPT: s_nop 1 -; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1{{$}} define amdgpu_kernel void @dpp_test_f32_imm_comb5(ptr addrspace(1) %out, float %in1, float %in2) { +; GFX8-OPT-LABEL: dpp_test_f32_imm_comb5: +; GFX8-OPT: ; %bb.0: +; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-OPT-NEXT: s_mov_b32 s6, -1 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-OPT-NEXT: s_mov_b32 s4, s0 +; GFX8-OPT-NEXT: s_mov_b32 s5, s1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1 +; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: dpp_test_f32_imm_comb5: +; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 +; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 +; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1 +; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: dpp_test_f32_imm_comb5: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dpp_test_f32_imm_comb5: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_endpgm %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 63, i32 14, i32 13, i1 true) store float %tmp0, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}dpp_test_f32_imm_comb6: -; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}} -; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}} -; GFX8-OPT: s_mov -; GFX8-OPT: s_mov -; GFX8-NOOPT: s_nop 1 -; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}} define amdgpu_kernel void @dpp_test_f32_imm_comb6(ptr addrspace(1) %out, float %in1, float %in2) { +; GFX8-OPT-LABEL: dpp_test_f32_imm_comb6: +; GFX8-OPT: ; %bb.0: +; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-OPT-NEXT: s_mov_b32 s6, -1 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-OPT-NEXT: s_mov_b32 s4, s0 +; GFX8-OPT-NEXT: s_mov_b32 s5, s1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: dpp_test_f32_imm_comb6: +; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 +; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 +; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: dpp_test_f32_imm_comb6: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dpp_test_f32_imm_comb6: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_endpgm %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 63, i32 15, i32 15, i1 true) store float %tmp0, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}dpp_test_f32_imm_comb7: -; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}} -; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}} -; GFX8-OPT: s_mov -; GFX8-OPT: s_mov -; GFX8-NOOPT: s_nop 1 -; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1{{$}} define amdgpu_kernel void @dpp_test_f32_imm_comb7(ptr addrspace(1) %out, float %in1, float %in2) { +; GFX8-OPT-LABEL: dpp_test_f32_imm_comb7: +; GFX8-OPT: ; %bb.0: +; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-OPT-NEXT: s_mov_b32 s6, -1 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-OPT-NEXT: s_mov_b32 s4, s0 +; GFX8-OPT-NEXT: s_mov_b32 s5, s1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1 +; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: dpp_test_f32_imm_comb7: +; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 +; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 +; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1 +; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: dpp_test_f32_imm_comb7: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dpp_test_f32_imm_comb7: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_endpgm %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 64, i32 0, i32 0, i1 true) store float %tmp0, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}dpp_test_f32_imm_comb8: -; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}} -; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}} -; GFX8-OPT: s_mov -; GFX8-OPT: s_mov -; GFX8-NOOPT: s_nop 1 -; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1{{$}} define amdgpu_kernel void @dpp_test_f32_imm_comb8(ptr addrspace(1) %out, float %in1, float %in2) { +; GFX8-OPT-LABEL: dpp_test_f32_imm_comb8: +; GFX8-OPT: ; %bb.0: +; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-OPT-NEXT: s_mov_b32 s6, -1 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-OPT-NEXT: s_mov_b32 s4, s0 +; GFX8-OPT-NEXT: s_mov_b32 s5, s1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1 +; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: dpp_test_f32_imm_comb8: +; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 +; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 +; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1 +; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: dpp_test_f32_imm_comb8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dpp_test_f32_imm_comb8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_endpgm %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 31, i32 15, i32 0, i1 true) store float %tmp0, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}dpp_test_v2i16: -; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}} -; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}} -; GFX8-OPT: s_mov -; GFX8-OPT: s_mov -; GFX8-NOOPT: s_nop 1 -; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} define amdgpu_kernel void @dpp_test_v2i16(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) { +; GFX8-OPT-LABEL: dpp_test_v2i16: +; GFX8-OPT: ; %bb.0: +; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-OPT-NEXT: s_mov_b32 s6, -1 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-OPT-NEXT: s_mov_b32 s4, s0 +; GFX8-OPT-NEXT: s_mov_b32 s5, s1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: dpp_test_v2i16: +; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 +; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 +; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: dpp_test_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dpp_test_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_endpgm %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 1, i32 1, i32 1, i1 false) store <2 x i16> %tmp0, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}dpp_test_v2i16_imm_comb1: -; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}} -; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}} -; GFX8-OPT: s_mov -; GFX8-OPT: s_mov -; GFX8-NOOPT: s_nop 1 -; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0{{$}} define amdgpu_kernel void @dpp_test_v2i16_imm_comb1(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) { +; GFX8-OPT-LABEL: dpp_test_v2i16_imm_comb1: +; GFX8-OPT: ; %bb.0: +; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-OPT-NEXT: s_mov_b32 s6, -1 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-OPT-NEXT: s_mov_b32 s4, s0 +; GFX8-OPT-NEXT: s_mov_b32 s5, s1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0 +; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: dpp_test_v2i16_imm_comb1: +; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 +; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 +; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0 +; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: dpp_test_v2i16_imm_comb1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dpp_test_v2i16_imm_comb1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_endpgm %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 0, i32 0, i32 0, i1 false) store <2 x i16> %tmp0, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}dpp_test_v2i16_imm_comb2: -; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}} -; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}} -; GFX8-OPT: s_mov -; GFX8-OPT: s_mov -; GFX8-NOOPT: s_nop 1 -; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3{{$}} define amdgpu_kernel void @dpp_test_v2i16_imm_comb2(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) { +; GFX8-OPT-LABEL: dpp_test_v2i16_imm_comb2: +; GFX8-OPT: ; %bb.0: +; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-OPT-NEXT: s_mov_b32 s6, -1 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-OPT-NEXT: s_mov_b32 s4, s0 +; GFX8-OPT-NEXT: s_mov_b32 s5, s1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3 +; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: dpp_test_v2i16_imm_comb2: +; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 +; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 +; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3 +; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: dpp_test_v2i16_imm_comb2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dpp_test_v2i16_imm_comb2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_endpgm %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 3, i32 3, i32 3, i1 false) store <2 x i16> %tmp0, ptr addrspace(1) %out ret void } - ; GCN-LABEL: {{^}}dpp_test_v2i16_imm_comb3: -; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}} -; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}} -; GFX8-OPT: s_mov -; GFX8-OPT: s_mov -; GFX8-NOOPT: s_nop 1 -; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1{{$}} define amdgpu_kernel void @dpp_test_v2i16_imm_comb3(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) { +; GFX8-OPT-LABEL: dpp_test_v2i16_imm_comb3: +; GFX8-OPT: ; %bb.0: +; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-OPT-NEXT: s_mov_b32 s6, -1 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-OPT-NEXT: s_mov_b32 s4, s0 +; GFX8-OPT-NEXT: s_mov_b32 s5, s1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1 +; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: dpp_test_v2i16_imm_comb3: +; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 +; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 +; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1 +; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: dpp_test_v2i16_imm_comb3: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dpp_test_v2i16_imm_comb3: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_endpgm %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 1, i32 2, i32 3, i1 true) store <2 x i16> %tmp0, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}dpp_test_v2i16_imm_comb4: -; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}} -; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}} -; GFX8-OPT: s_mov -; GFX8-OPT: s_mov -; GFX8-NOOPT: s_nop 1 -; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1{{$}} define amdgpu_kernel void @dpp_test_v2i16_imm_comb4(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) { +; GFX8-OPT-LABEL: dpp_test_v2i16_imm_comb4: +; GFX8-OPT: ; %bb.0: +; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-OPT-NEXT: s_mov_b32 s6, -1 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-OPT-NEXT: s_mov_b32 s4, s0 +; GFX8-OPT-NEXT: s_mov_b32 s5, s1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1 +; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: dpp_test_v2i16_imm_comb4: +; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 +; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 +; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1 +; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: dpp_test_v2i16_imm_comb4: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dpp_test_v2i16_imm_comb4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_endpgm %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 4, i32 3, i32 2, i1 true) store <2 x i16> %tmp0, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}dpp_test_v2i16_imm_comb5: -; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}} -; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}} -; GFX8-OPT: s_mov -; GFX8-OPT: s_mov -; GFX8-NOOPT: s_nop 1 -; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1{{$}} define amdgpu_kernel void @dpp_test_v2i16_imm_comb5(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) { +; GFX8-OPT-LABEL: dpp_test_v2i16_imm_comb5: +; GFX8-OPT: ; %bb.0: +; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-OPT-NEXT: s_mov_b32 s6, -1 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-OPT-NEXT: s_mov_b32 s4, s0 +; GFX8-OPT-NEXT: s_mov_b32 s5, s1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1 +; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: dpp_test_v2i16_imm_comb5: +; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 +; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 +; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1 +; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: dpp_test_v2i16_imm_comb5: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dpp_test_v2i16_imm_comb5: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_endpgm %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 63, i32 14, i32 13, i1 true) store <2 x i16> %tmp0, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}dpp_test_v2i16_imm_comb6: -; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}} -; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}} -; GFX8-OPT: s_mov -; GFX8-OPT: s_mov -; GFX8-NOOPT: s_nop 1 -; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}} define amdgpu_kernel void @dpp_test_v2i16_imm_comb6(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) { +; GFX8-OPT-LABEL: dpp_test_v2i16_imm_comb6: +; GFX8-OPT: ; %bb.0: +; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-OPT-NEXT: s_mov_b32 s6, -1 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-OPT-NEXT: s_mov_b32 s4, s0 +; GFX8-OPT-NEXT: s_mov_b32 s5, s1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: dpp_test_v2i16_imm_comb6: +; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 +; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 +; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: dpp_test_v2i16_imm_comb6: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dpp_test_v2i16_imm_comb6: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_endpgm %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 63, i32 15, i32 15, i1 true) store <2 x i16> %tmp0, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}dpp_test_v2i16_imm_comb7: -; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}} -; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}} -; GFX8-OPT: s_mov -; GFX8-OPT: s_mov -; GFX8-NOOPT: s_nop 1 -; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1{{$}} define amdgpu_kernel void @dpp_test_v2i16_imm_comb7(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) { +; GFX8-OPT-LABEL: dpp_test_v2i16_imm_comb7: +; GFX8-OPT: ; %bb.0: +; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-OPT-NEXT: s_mov_b32 s6, -1 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-OPT-NEXT: s_mov_b32 s4, s0 +; GFX8-OPT-NEXT: s_mov_b32 s5, s1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1 +; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: dpp_test_v2i16_imm_comb7: +; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 +; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 +; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1 +; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: dpp_test_v2i16_imm_comb7: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dpp_test_v2i16_imm_comb7: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_endpgm %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 64, i32 0, i32 0, i1 true) store <2 x i16> %tmp0, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}dpp_test_v2i16_imm_comb8: -; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}} -; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}} -; GFX8-OPT: s_mov -; GFX8-OPT: s_mov -; GFX8-NOOPT: s_nop 1 -; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1{{$}} define amdgpu_kernel void @dpp_test_v2i16_imm_comb8(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) { +; GFX8-OPT-LABEL: dpp_test_v2i16_imm_comb8: +; GFX8-OPT: ; %bb.0: +; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-OPT-NEXT: s_mov_b32 s6, -1 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-OPT-NEXT: s_mov_b32 s4, s0 +; GFX8-OPT-NEXT: s_mov_b32 s5, s1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1 +; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: dpp_test_v2i16_imm_comb8: +; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 +; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 +; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1 +; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: dpp_test_v2i16_imm_comb8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dpp_test_v2i16_imm_comb8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_endpgm %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 31, i32 15, i32 0, i1 true) store <2 x i16> %tmp0, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}dpp_test_v2f16: -; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}} -; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}} -; GFX8-OPT: s_mov -; GFX8-OPT: s_mov -; GFX8-NOOPT: s_nop 1 -; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} define amdgpu_kernel void @dpp_test_v2f16(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) { +; GFX8-OPT-LABEL: dpp_test_v2f16: +; GFX8-OPT: ; %bb.0: +; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-OPT-NEXT: s_mov_b32 s6, -1 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-OPT-NEXT: s_mov_b32 s4, s0 +; GFX8-OPT-NEXT: s_mov_b32 s5, s1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: dpp_test_v2f16: +; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 +; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 +; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: dpp_test_v2f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dpp_test_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_endpgm %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 1, i32 1, i32 1, i1 false) store <2 x half> %tmp0, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}dpp_test_v2f16_imm_comb1: -; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}} -; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}} -; GFX8-OPT: s_mov -; GFX8-OPT: s_mov -; GFX8-NOOPT: s_nop 1 -; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0{{$}} define amdgpu_kernel void @dpp_test_v2f16_imm_comb1(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) { +; GFX8-OPT-LABEL: dpp_test_v2f16_imm_comb1: +; GFX8-OPT: ; %bb.0: +; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-OPT-NEXT: s_mov_b32 s6, -1 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-OPT-NEXT: s_mov_b32 s4, s0 +; GFX8-OPT-NEXT: s_mov_b32 s5, s1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0 +; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: dpp_test_v2f16_imm_comb1: +; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 +; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 +; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0 +; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: dpp_test_v2f16_imm_comb1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dpp_test_v2f16_imm_comb1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_endpgm %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 0, i32 0, i32 0, i1 false) store <2 x half> %tmp0, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}dpp_test_v2f16_imm_comb2: -; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}} -; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}} -; GFX8-OPT: s_mov -; GFX8-OPT: s_mov -; GFX8-NOOPT: s_nop 1 -; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3{{$}} define amdgpu_kernel void @dpp_test_v2f16_imm_comb2(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) { +; GFX8-OPT-LABEL: dpp_test_v2f16_imm_comb2: +; GFX8-OPT: ; %bb.0: +; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-OPT-NEXT: s_mov_b32 s6, -1 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-OPT-NEXT: s_mov_b32 s4, s0 +; GFX8-OPT-NEXT: s_mov_b32 s5, s1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3 +; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: dpp_test_v2f16_imm_comb2: +; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 +; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 +; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3 +; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: dpp_test_v2f16_imm_comb2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dpp_test_v2f16_imm_comb2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_endpgm %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 3, i32 3, i32 3, i1 false) store <2 x half> %tmp0, ptr addrspace(1) %out ret void } - ; GCN-LABEL: {{^}}dpp_test_v2f16_imm_comb3: -; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}} -; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}} -; GFX8-OPT: s_mov -; GFX8-OPT: s_mov -; GFX8-NOOPT: s_nop 1 -; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1{{$}} define amdgpu_kernel void @dpp_test_v2f16_imm_comb3(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) { +; GFX8-OPT-LABEL: dpp_test_v2f16_imm_comb3: +; GFX8-OPT: ; %bb.0: +; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-OPT-NEXT: s_mov_b32 s6, -1 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-OPT-NEXT: s_mov_b32 s4, s0 +; GFX8-OPT-NEXT: s_mov_b32 s5, s1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1 +; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: dpp_test_v2f16_imm_comb3: +; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 +; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 +; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1 +; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: dpp_test_v2f16_imm_comb3: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dpp_test_v2f16_imm_comb3: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_endpgm %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 1, i32 2, i32 3, i1 true) store <2 x half> %tmp0, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}dpp_test_v2f16_imm_comb4: -; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}} -; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}} -; GFX8-OPT: s_mov -; GFX8-OPT: s_mov -; GFX8-NOOPT: s_nop 1 -; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1{{$}} define amdgpu_kernel void @dpp_test_v2f16_imm_comb4(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) { +; GFX8-OPT-LABEL: dpp_test_v2f16_imm_comb4: +; GFX8-OPT: ; %bb.0: +; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-OPT-NEXT: s_mov_b32 s6, -1 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-OPT-NEXT: s_mov_b32 s4, s0 +; GFX8-OPT-NEXT: s_mov_b32 s5, s1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1 +; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: dpp_test_v2f16_imm_comb4: +; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 +; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 +; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1 +; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: dpp_test_v2f16_imm_comb4: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dpp_test_v2f16_imm_comb4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_endpgm %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 4, i32 3, i32 2, i1 true) store <2 x half> %tmp0, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}dpp_test_v2f16_imm_comb5: -; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}} -; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}} -; GFX8-OPT: s_mov -; GFX8-OPT: s_mov -; GFX8-NOOPT: s_nop 1 -; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1{{$}} define amdgpu_kernel void @dpp_test_v2f16_imm_comb5(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) { +; GFX8-OPT-LABEL: dpp_test_v2f16_imm_comb5: +; GFX8-OPT: ; %bb.0: +; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-OPT-NEXT: s_mov_b32 s6, -1 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-OPT-NEXT: s_mov_b32 s4, s0 +; GFX8-OPT-NEXT: s_mov_b32 s5, s1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1 +; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: dpp_test_v2f16_imm_comb5: +; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 +; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 +; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1 +; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: dpp_test_v2f16_imm_comb5: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dpp_test_v2f16_imm_comb5: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_endpgm %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 63, i32 14, i32 13, i1 true) store <2 x half> %tmp0, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}dpp_test_v2f16_imm_comb6: -; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}} -; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}} -; GFX8-OPT: s_mov -; GFX8-OPT: s_mov -; GFX8-NOOPT: s_nop 1 -; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}} define amdgpu_kernel void @dpp_test_v2f16_imm_comb6(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) { +; GFX8-OPT-LABEL: dpp_test_v2f16_imm_comb6: +; GFX8-OPT: ; %bb.0: +; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-OPT-NEXT: s_mov_b32 s6, -1 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-OPT-NEXT: s_mov_b32 s4, s0 +; GFX8-OPT-NEXT: s_mov_b32 s5, s1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: dpp_test_v2f16_imm_comb6: +; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 +; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 +; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: dpp_test_v2f16_imm_comb6: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dpp_test_v2f16_imm_comb6: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_endpgm %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 63, i32 15, i32 15, i1 true) store <2 x half> %tmp0, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}dpp_test_v2f16_imm_comb7: -; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}} -; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}} -; GFX8-OPT: s_mov -; GFX8-OPT: s_mov -; GFX8-NOOPT: s_nop 1 -; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1{{$}} define amdgpu_kernel void @dpp_test_v2f16_imm_comb7(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) { +; GFX8-OPT-LABEL: dpp_test_v2f16_imm_comb7: +; GFX8-OPT: ; %bb.0: +; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-OPT-NEXT: s_mov_b32 s6, -1 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-OPT-NEXT: s_mov_b32 s4, s0 +; GFX8-OPT-NEXT: s_mov_b32 s5, s1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1 +; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: dpp_test_v2f16_imm_comb7: +; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 +; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 +; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1 +; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: dpp_test_v2f16_imm_comb7: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dpp_test_v2f16_imm_comb7: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_endpgm %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 64, i32 0, i32 0, i1 true) store <2 x half> %tmp0, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}dpp_test_v2f16_imm_comb8: -; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}} -; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}} -; GFX8-OPT: s_mov -; GFX8-OPT: s_mov -; GFX8-NOOPT: s_nop 1 -; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1{{$}} define amdgpu_kernel void @dpp_test_v2f16_imm_comb8(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) { +; GFX8-OPT-LABEL: dpp_test_v2f16_imm_comb8: +; GFX8-OPT: ; %bb.0: +; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-OPT-NEXT: s_mov_b32 s6, -1 +; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-OPT-NEXT: s_mov_b32 s4, s0 +; GFX8-OPT-NEXT: s_mov_b32 s5, s1 +; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1 +; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-OPT-NEXT: s_endpgm +; +; GFX8-NOOPT-LABEL: dpp_test_v2f16_imm_comb8: +; GFX8-NOOPT: ; %bb.0: +; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c +; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 +; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 +; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 +; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 +; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 +; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NOOPT-NEXT: s_nop 1 +; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1 +; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NOOPT-NEXT: s_endpgm +; +; GFX10-LABEL: dpp_test_v2f16_imm_comb8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dpp_test_v2f16_imm_comb8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_endpgm %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 31, i32 15, i32 0, i1 true) store <2 x half> %tmp0, ptr addrspace(1) %out ret void @@ -585,3 +3045,7 @@ declare float @llvm.amdgcn.update.dpp.f32(float, float, i32, i32, i32, i1) #0 declare i64 @llvm.amdgcn.update.dpp.i64(i64, i64, i32, i32, i32, i1) #0 attributes #0 = { nounwind readnone convergent } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} +; GCN-OPT: {{.*}} +; GFX8: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll index 87d16385f7b26..544941b7fa0da 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SIVI,VIGFX9 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,VIGFX9 %s @@ -6,19 +7,96 @@ declare half @llvm.fma.f16(half %a, half %b, half %c) declare <2 x half> @llvm.fma.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) declare <4 x half> @llvm.fma.v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) -; GCN-LABEL: {{^}}fma_f16 -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; GCN: buffer_load_ushort v[[C_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] -; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], v[[C_F32:[0-9]]] -; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VIGFX9: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]] -; GCN: buffer_store_short v[[R_F16]] -; GCN: s_endpgm define amdgpu_kernel void @fma_f16( +; SI-LABEL: fma_f16: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_fma_f32 v0, v0, v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fma_f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_fma_f16 v0, v0, v1, v2 +; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fma_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s6, s2 +; GFX9-NEXT: s_mov_b32 s7, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, s10 +; GFX9-NEXT: s_mov_b32 s5, s11 +; GFX9-NEXT: s_mov_b32 s16, s12 +; GFX9-NEXT: s_mov_b32 s17, s13 +; GFX9-NEXT: s_mov_b32 s18, s2 +; GFX9-NEXT: s_mov_b32 s19, s3 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: s_mov_b32 s14, s2 +; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; GFX9-NEXT: buffer_load_ushort v2, off, s[12:15], 0 +; GFX9-NEXT: s_mov_b32 s0, s8 +; GFX9-NEXT: s_mov_b32 s1, s9 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -31,20 +109,79 @@ define amdgpu_kernel void @fma_f16( ret void } -; GCN-LABEL: {{^}}fma_f16_imm_a -; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; GCN: buffer_load_ushort v[[C_F16:[0-9]+]] - -; SI: s_mov_b32 s[[A_F32:[0-9]+]], 0x40400000{{$}} -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] -; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[B_F32:[0-9]]], s[[A_F32:[0-9]]], v[[C_F32:[0-9]]] -; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VIGFX9: s_movk_i32 s[[A_F16:[0-9]+]], 0x4200{{$}} -; VIGFX9: v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], s[[A_F16]], v[[C_F16]] -; GCN: buffer_store_short v[[R_F16]] -; GCN: s_endpgm define amdgpu_kernel void @fma_f16_imm_a( +; SI-LABEL: fma_f16_imm_a: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s2, 0x40400000 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_fma_f32 v0, v0, s2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fma_f16_imm_a: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s14, s6 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_movk_i32 s0, 0x4200 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_fma_f16 v0, v0, s0, v1 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fma_f16_imm_a: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s12, s2 +; GFX9-NEXT: s_mov_b32 s13, s3 +; GFX9-NEXT: s_mov_b32 s15, s7 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_movk_i32 s0, 0x4200 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_fma_f16 v0, v0, s0, v1 +; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %b, ptr addrspace(1) %c) { @@ -55,19 +192,79 @@ define amdgpu_kernel void @fma_f16_imm_a( ret void } -; GCN-LABEL: {{^}}fma_f16_imm_b -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; GCN: buffer_load_ushort v[[C_F16:[0-9]+]] -; SI: s_mov_b32 s[[B_F32:[0-9]+]], 0x40400000{{$}} -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] -; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], s[[B_F32:[0-9]]], v[[C_F32:[0-9]]] -; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VIGFX9: s_movk_i32 s[[B_F16:[0-9]+]], 0x4200{{$}} -; VIGFX9: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], s[[B_F16]], v[[C_F16]] -; GCN: buffer_store_short v[[R_F16]] -; GCN: s_endpgm define amdgpu_kernel void @fma_f16_imm_b( +; SI-LABEL: fma_f16_imm_b: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s2, 0x40400000 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_fma_f32 v0, v0, s2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fma_f16_imm_b: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s14, s6 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_movk_i32 s0, 0x4200 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_fma_f16 v0, v0, s0, v1 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fma_f16_imm_b: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s12, s2 +; GFX9-NEXT: s_mov_b32 s13, s3 +; GFX9-NEXT: s_mov_b32 s15, s7 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_movk_i32 s0, 0x4200 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_fma_f16 v0, v0, s0, v1 +; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %c) { @@ -78,19 +275,79 @@ define amdgpu_kernel void @fma_f16_imm_b( ret void } -; GCN-LABEL: {{^}}fma_f16_imm_c -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; SI: s_mov_b32 s[[C_F32:[0-9]+]], 0x40400000{{$}} -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], s[[C_F32:[0-9]]] -; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VIGFX9: s_movk_i32 s[[C_F16:[0-9]+]], 0x4200{{$}} -; VIGFX9: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], s[[C_F16]] -; GCN: buffer_store_short v[[R_F16]] -; GCN: s_endpgm define amdgpu_kernel void @fma_f16_imm_c( +; SI-LABEL: fma_f16_imm_c: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s2, 0x40400000 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_fma_f32 v0, v0, v1, s2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fma_f16_imm_c: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s14, s6 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_movk_i32 s0, 0x4200 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_fma_f16 v0, v0, v1, s0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fma_f16_imm_c: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s12, s2 +; GFX9-NEXT: s_mov_b32 s13, s3 +; GFX9-NEXT: s_mov_b32 s15, s7 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_movk_i32 s0, 0x4200 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_fma_f16 v0, v0, v1, s0 +; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -101,42 +358,114 @@ define amdgpu_kernel void @fma_f16_imm_c( ret void } -; GCN-LABEL: {{^}}fma_v2f16 -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]] - -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]] - -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]] - - -; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32_0]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32_1]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] - -; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; VI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] -; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[C_F16_1]], v[[B_F16_1]], v[[A_F16_1]] -; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[C_V2_F16]], v[[B_V2_F16]], v[[A_V2_F16]] - -; GFX9: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]] - -; SIVI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN-NOT: and -; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] -; GCN: buffer_store_dword v[[R_V2_F16]] -; GCN: s_endpgm define amdgpu_kernel void @fma_v2f16( +; SI-LABEL: fma_v2f16: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_fma_f32 v0, v0, v4, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_fma_f32 v1, v3, v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fma_v2f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; VI-NEXT: buffer_load_dword v2, off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; VI-NEXT: v_fma_f16 v3, v5, v4, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_fma_f16 v0, v2, v1, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v3 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fma_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s6, s2 +; GFX9-NEXT: s_mov_b32 s7, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, s10 +; GFX9-NEXT: s_mov_b32 s5, s11 +; GFX9-NEXT: s_mov_b32 s16, s12 +; GFX9-NEXT: s_mov_b32 s17, s13 +; GFX9-NEXT: s_mov_b32 s18, s2 +; GFX9-NEXT: s_mov_b32 s19, s3 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: s_mov_b32 s14, s2 +; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[12:15], 0 +; GFX9-NEXT: s_mov_b32 s0, s8 +; GFX9-NEXT: s_mov_b32 s1, s9 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_fma_f16 v0, v0, v1, v2 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -149,41 +478,93 @@ define amdgpu_kernel void @fma_v2f16( ret void } -; GCN-LABEL: {{^}}fma_v2f16_imm_a: -; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]] -; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]] - - -; VIGFX9: buffer_load_dword v[[C_V2_F16:[0-9]+]] -; VIGFX9: buffer_load_dword v[[B_V2_F16:[0-9]+]] - - -; SI: s_mov_b32 s[[A_F32:[0-9]+]], 0x40400000{{$}} -; VIGFX9: s_movk_i32 s[[A_F16:[0-9]+]], 0x4200{{$}} -; SIVI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SIVI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] - -; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] - -; SI: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], s[[A_F32]], v[[C_F32_1]] -; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], s[[A_F32]], v[[C_F32_0]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] - -; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], s[[A_F16]], v[[C_F16_1]] -; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], s[[A_F16]], v[[C_V2_F16]] - -; GFX9: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[C_V2_F16]], s[[A_F16]], v[[B_V2_F16]] - -; SIVI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN-NOT: and -; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] -; GCN: buffer_store_dword v[[R_V2_F16]] -; GCN: s_endpgm define amdgpu_kernel void @fma_v2f16_imm_a( +; SI-LABEL: fma_v2f16_imm_a: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s2, 0x40400000 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_fma_f32 v2, v3, s2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_fma_f32 v0, v1, s2, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fma_v2f16_imm_a: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s14, s6 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: s_movk_i32 s2, 0x4200 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_fma_f16 v2, v3, s2, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_fma_f16 v0, v1, s2, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fma_v2f16_imm_a: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s12, s2 +; GFX9-NEXT: s_mov_b32 s13, s3 +; GFX9-NEXT: s_mov_b32 s15, s7 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; GFX9-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_movk_i32 s0, 0x4200 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_fma_f16 v0, v0, s0, v1 op_sel_hi:[1,0,1] +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %b, ptr addrspace(1) %c) { @@ -194,42 +575,93 @@ define amdgpu_kernel void @fma_v2f16_imm_a( ret void } -; GCN-LABEL: {{^}}fma_v2f16_imm_b: -; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]] -; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]] - -; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]] -; VIGFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GFX9: buffer_load_dword v[[C_V2_F16:[0-9]+]] - -; SI: s_mov_b32 s[[B_F32:[0-9]+]], 0x40400000{{$}} -; VIGFX9: s_movk_i32 s[[B_F16:[0-9]+]], 0x4200{{$}} - -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]] -; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] - -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]] -; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], s[[B_F32]], v[[C_F32_0]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], s[[B_F32]], v[[C_F32_1]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] - -; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] -; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], s[[B_F16]], v[[C_V2_F16]] -; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], s[[B_F16]], v[[C_F16_1]] - -; GFX9: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], s[[B_F16]], v[[C_V2_F16]] - -; SIVI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN-NOT: and -; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] -; GCN: buffer_store_dword v[[R_V2_F16]] -; GCN: s_endpgm define amdgpu_kernel void @fma_v2f16_imm_b( +; SI-LABEL: fma_v2f16_imm_b: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s2, 0x40400000 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_fma_f32 v2, v3, s2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_fma_f32 v0, v1, s2, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fma_v2f16_imm_b: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s14, s6 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: s_movk_i32 s2, 0x4200 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_fma_f16 v2, v3, s2, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_fma_f16 v0, v1, s2, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fma_v2f16_imm_b: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s12, s2 +; GFX9-NEXT: s_mov_b32 s13, s3 +; GFX9-NEXT: s_mov_b32 s15, s7 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; GFX9-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_movk_i32 s0, 0x4200 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_fma_f16 v0, v0, s0, v1 op_sel_hi:[1,0,1] +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %c) { @@ -240,46 +672,93 @@ define amdgpu_kernel void @fma_v2f16_imm_b( ret void } -; GCN-LABEL: {{^}}fma_v2f16_imm_c: -; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]] - -; GFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; VIGFX9: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]] - -; SI: s_mov_b32 s[[C_F32:[0-9]+]], 0x40400000{{$}} -; VIGFX9: s_movk_i32 s[[C_F16:[0-9]+]], 0x4200{{$}} - -; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] - -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] - -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] - -; SI: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], s[[C_F32]] -; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], s[[C_F32]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN-NOT: and -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] - -; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; VI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], s[[C_F16]] -; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], s[[C_F16]] -; GCN-NOT: and -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] - -; GFX9: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], s[[C_F16]] - -; GCN: buffer_store_dword v[[R_V2_F16]] -; GCN: s_endpgm define amdgpu_kernel void @fma_v2f16_imm_c( +; SI-LABEL: fma_v2f16_imm_c: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s2, 0x40400000 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_fma_f32 v2, v3, v2, s2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_fma_f32 v0, v1, v0, s2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fma_v2f16_imm_c: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s14, s6 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: s_movk_i32 s2, 0x4200 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_fma_f16 v2, v3, v2, s2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_fma_f16 v0, v1, v0, s2 +; VI-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fma_v2f16_imm_c: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s12, s2 +; GFX9-NEXT: s_mov_b32 s13, s3 +; GFX9-NEXT: s_mov_b32 s15, s7 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; GFX9-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_movk_i32 s0, 0x4200 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_fma_f16 v0, v0, v1, s0 op_sel_hi:[1,1,0] +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -290,65 +769,137 @@ define amdgpu_kernel void @fma_v2f16_imm_c( ret void } -; GCN-LABEL: {{^}}fma_v4f16 -; GCN: buffer_load_dwordx2 v[[[A_V4_F16_LO:[0-9]+]]:[[A_V4_F16_HI:[0-9]+]]] -; GCN: buffer_load_dwordx2 v[[[B_V4_F16_LO:[0-9]+]]:[[B_V4_F16_HI:[0-9]+]]] -; GCN: buffer_load_dwordx2 v[[[C_V4_F16_LO:[0-9]+]]:[[C_V4_F16_HI:[0-9]+]]] - -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V4_F16_LO]] -; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_0:[0-9]+]], 16, v[[A_V4_F16_LO]] -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_V4_F16_HI]] -; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_2:[0-9]+]], 16, v[[A_V4_F16_HI]] -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V4_F16_LO]] -; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V4_F16_LO]] -; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_0:[0-9]+]], 16, v[[B_V4_F16_LO]] -; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_0:[0-9]+]], 16, v[[C_V4_F16_LO]] -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_V4_F16_HI]] -; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_V4_F16_HI]] -; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V4_F16_HI]] -; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V4_F16_HI]] -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_2:[0-9]+]], v[[A_V4_F16_LO]] -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_3:[0-9]+]], v[[A_V4_F16_HI]] -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_2:[0-9]+]], v[[B_V4_F16_LO]] -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_3:[0-9]+]], v[[B_V4_F16_HI]] -; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_2:[0-9]+]], v[[C_V4_F16_LO]] -; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_3:[0-9]+]], v[[C_V4_F16_HI]] - -; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32_0]] -; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32_1]] -; SI-DAG: v_fma_f32 v[[R_F32_2:[0-9]+]], v[[A_F32_2]], v[[B_F32_2]], v[[C_F32_2]] -; SI-DAG: v_fma_f32 v[[R_F32_3:[0-9]+]], v[[A_F32_3]], v[[B_F32_3]], v[[C_F32_3]] - -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_2:[0-9]+]], v[[R_F32_2]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_3:[0-9]+]], v[[R_F32_3]] - -; SI-DAG: v_lshlrev_b32_e32 v[[R1_F16_0:[0-9]]], 16, v[[R_F16_2]] -; SI-DAG: v_lshlrev_b32_e32 v[[R1_F16_1:[0-9]]], 16, v[[R_F16_3]] - -; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_0:[0-9]+]], 16, v[[A_V4_F16_LO]] -; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V4_F16_HI]] -; VI-DAG: v_lshrrev_b32_e32 v[[B_F16_0:[0-9]+]], 16, v[[B_V4_F16_LO]] -; VI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V4_F16_HI]] -; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_0:[0-9]+]], 16, v[[C_V4_F16_LO]] -; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V4_F16_HI]] - -; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[C_V4_F16_LO]], v[[B_V4_F16_LO]], v[[A_V4_F16_LO]] -; VI-DAG: v_fma_f16 v[[R1_F16_0:[0-9]+]], v[[C_F16_0]], v[[B_F16_0]], v[[A_F16_0]] -; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[C_V4_F16_HI]], v[[B_V4_F16_HI]], v[[A_V4_F16_HI]] -; VI-DAG: v_fma_f16 v[[R1_F16_1:[0-9]+]], v[[C_F16_1]], v[[B_F16_1]], v[[A_F16_1]] - -; SIVI-DAG: v_or_b32_e32 v[[R_V4_F16_LO:[0-9]+]], v[[R_F16_0]], v[[R1_F16_0]] -; SIVI-DAG: v_or_b32_e32 v[[R_V4_F16_HI:[0-9]+]], v[[R_F16_1]], v[[R1_F16_1]] - -; GFX9-DAG: v_pk_fma_f16 v[[R_V4_F16_LO:[0-9]+]], v[[A_V4_F16_LO]], v[[B_V4_F16_LO]], v[[C_V4_F16_LO]] -; GFX9-DAG: v_pk_fma_f16 v[[R_V4_F16_HI:[0-9]+]], v[[A_V4_F16_HI]], v[[B_V4_F16_HI]], v[[C_V4_F16_HI]] - -; GCN: buffer_store_dwordx2 v[[[R_V4_F16_LO]]:[[R_V4_F16_HI]]] -; GCN: s_endpgm - define amdgpu_kernel void @fma_v4f16( +; SI-LABEL: fma_v4f16: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 +; SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_fma_f32 v7, v7, v9, v11 +; SI-NEXT: v_fma_f32 v6, v6, v8, v10 +; SI-NEXT: v_fma_f32 v1, v1, v3, v5 +; SI-NEXT: v_fma_f32 v0, v0, v2, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fma_v4f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s16, s8 +; VI-NEXT: s_mov_b32 s17, s9 +; VI-NEXT: s_mov_b32 s8, s10 +; VI-NEXT: s_mov_b32 s9, s11 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s18, s2 +; VI-NEXT: s_mov_b32 s19, s3 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 +; VI-NEXT: buffer_load_dwordx2 v[4:5], off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; VI-NEXT: v_fma_f16 v1, v5, v3, v1 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; VI-NEXT: v_fma_f16 v0, v4, v2, v0 +; VI-NEXT: v_fma_f16 v2, v8, v7, v6 +; VI-NEXT: v_fma_f16 v3, v9, v5, v3 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v3 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fma_v4f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s6, s2 +; GFX9-NEXT: s_mov_b32 s7, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, s10 +; GFX9-NEXT: s_mov_b32 s5, s11 +; GFX9-NEXT: s_mov_b32 s16, s12 +; GFX9-NEXT: s_mov_b32 s17, s13 +; GFX9-NEXT: s_mov_b32 s18, s2 +; GFX9-NEXT: s_mov_b32 s19, s3 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: s_mov_b32 s14, s2 +; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 +; GFX9-NEXT: buffer_load_dwordx2 v[4:5], off, s[12:15], 0 +; GFX9-NEXT: s_mov_b32 s0, s8 +; GFX9-NEXT: s_mov_b32 s1, s9 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v5 +; GFX9-NEXT: v_pk_fma_f16 v0, v0, v2, v4 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -360,3 +911,7 @@ define amdgpu_kernel void @fma_v4f16( store <4 x half> %r.val, ptr addrspace(1) %r ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} +; SIVI: {{.*}} +; VIGFX9: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/load-global-f32.ll b/llvm/test/CodeGen/AMDGPU/load-global-f32.ll index ca24b78f62c2e..c119ef274bb04 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-f32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-f32.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck --check-prefixes=GCN-NOHSA,FUNC,SI-NOHSA %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amdhsa -mcpu=kaveri < %s | FileCheck --check-prefixes=GCN-HSA,FUNC,GCNX3-HSA %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCN-NOHSA,FUNC,GCNX3-NOHSA %s @@ -5,162 +6,766 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefixes=R600,FUNC %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=cayman < %s | FileCheck --check-prefixes=R600,FUNC %s -; FUNC-LABEL: {{^}}global_load_f32: -; GCN-NOHSA: buffer_load_dword v{{[0-9]+}} -; GCN-HSA: flat_load_dword - -; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0 define amdgpu_kernel void @global_load_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-NOHSA-LABEL: global_load_f32: +; SI-NOHSA: ; %bb.0: ; %entry +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 +; SI-NOHSA-NEXT: s_mov_b32 s6, -1 +; SI-NOHSA-NEXT: s_mov_b32 s10, s6 +; SI-NOHSA-NEXT: s_mov_b32 s11, s7 +; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0) +; SI-NOHSA-NEXT: s_mov_b32 s8, s2 +; SI-NOHSA-NEXT: s_mov_b32 s9, s3 +; SI-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NOHSA-NEXT: s_mov_b32 s4, s0 +; SI-NOHSA-NEXT: s_mov_b32 s5, s1 +; SI-NOHSA-NEXT: s_waitcnt vmcnt(0) +; SI-NOHSA-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NOHSA-NEXT: s_endpgm +; +; GCN-HSA-LABEL: global_load_f32: +; GCN-HSA: ; %bb.0: ; %entry +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_load_dword v2, v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 +; GCN-HSA-NEXT: s_waitcnt vmcnt(0) +; GCN-HSA-NEXT: flat_store_dword v[0:1], v2 +; GCN-HSA-NEXT: s_endpgm +; +; GCNX3-NOHSA-LABEL: global_load_f32: +; GCNX3-NOHSA: ; %bb.0: ; %entry +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 +; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 +; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 +; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7 +; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0) +; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2 +; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3 +; GCNX3-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0 +; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1 +; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0) +; GCNX3-NOHSA-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCNX3-NOHSA-NEXT: s_endpgm entry: %tmp0 = load float, ptr addrspace(1) %in store float %tmp0, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}global_load_v2f32: -; GCN-NOHSA: buffer_load_dwordx2 -; GCN-HSA: flat_load_dwordx2 - -; R600: VTX_READ_64 define amdgpu_kernel void @global_load_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-NOHSA-LABEL: global_load_v2f32: +; SI-NOHSA: ; %bb.0: ; %entry +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 +; SI-NOHSA-NEXT: s_mov_b32 s6, -1 +; SI-NOHSA-NEXT: s_mov_b32 s10, s6 +; SI-NOHSA-NEXT: s_mov_b32 s11, s7 +; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0) +; SI-NOHSA-NEXT: s_mov_b32 s8, s2 +; SI-NOHSA-NEXT: s_mov_b32 s9, s3 +; SI-NOHSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NOHSA-NEXT: s_mov_b32 s4, s0 +; SI-NOHSA-NEXT: s_mov_b32 s5, s1 +; SI-NOHSA-NEXT: s_waitcnt vmcnt(0) +; SI-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NOHSA-NEXT: s_endpgm +; +; GCN-HSA-LABEL: global_load_v2f32: +; GCN-HSA: ; %bb.0: ; %entry +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 +; GCN-HSA-NEXT: s_waitcnt vmcnt(0) +; GCN-HSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN-HSA-NEXT: s_endpgm +; +; GCNX3-NOHSA-LABEL: global_load_v2f32: +; GCNX3-NOHSA: ; %bb.0: ; %entry +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 +; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 +; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 +; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7 +; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0) +; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2 +; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0 +; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1 +; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0) +; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCNX3-NOHSA-NEXT: s_endpgm entry: %tmp0 = load <2 x float>, ptr addrspace(1) %in store <2 x float> %tmp0, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}global_load_v3f32: -; SI-NOHSA: buffer_load_dwordx4 -; GCNX3-NOHSA: buffer_load_dwordx3 -; GCNX3-HSA: flat_load_dwordx3 - -; R600: VTX_READ_128 define amdgpu_kernel void @global_load_v3f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-NOHSA-LABEL: global_load_v3f32: +; SI-NOHSA: ; %bb.0: ; %entry +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 +; SI-NOHSA-NEXT: s_mov_b32 s6, -1 +; SI-NOHSA-NEXT: s_mov_b32 s10, s6 +; SI-NOHSA-NEXT: s_mov_b32 s11, s7 +; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0) +; SI-NOHSA-NEXT: s_mov_b32 s8, s2 +; SI-NOHSA-NEXT: s_mov_b32 s9, s3 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NOHSA-NEXT: s_mov_b32 s4, s0 +; SI-NOHSA-NEXT: s_mov_b32 s5, s1 +; SI-NOHSA-NEXT: s_waitcnt vmcnt(0) +; SI-NOHSA-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8 +; SI-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NOHSA-NEXT: s_endpgm +; +; GCN-HSA-LABEL: global_load_v3f32: +; GCN-HSA: ; %bb.0: ; %entry +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_load_dwordx3 v[0:2], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s1 +; GCN-HSA-NEXT: s_waitcnt vmcnt(0) +; GCN-HSA-NEXT: flat_store_dwordx3 v[3:4], v[0:2] +; GCN-HSA-NEXT: s_endpgm +; +; GCNX3-NOHSA-LABEL: global_load_v3f32: +; GCNX3-NOHSA: ; %bb.0: ; %entry +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 +; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 +; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 +; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7 +; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0) +; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2 +; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0 +; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0 +; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1 +; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0) +; GCNX3-NOHSA-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0 +; GCNX3-NOHSA-NEXT: s_endpgm entry: %tmp0 = load <3 x float>, ptr addrspace(1) %in store <3 x float> %tmp0, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}global_load_v4f32: -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-HSA: flat_load_dwordx4 - -; R600: VTX_READ_128 define amdgpu_kernel void @global_load_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-NOHSA-LABEL: global_load_v4f32: +; SI-NOHSA: ; %bb.0: ; %entry +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 +; SI-NOHSA-NEXT: s_mov_b32 s6, -1 +; SI-NOHSA-NEXT: s_mov_b32 s10, s6 +; SI-NOHSA-NEXT: s_mov_b32 s11, s7 +; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0) +; SI-NOHSA-NEXT: s_mov_b32 s8, s2 +; SI-NOHSA-NEXT: s_mov_b32 s9, s3 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NOHSA-NEXT: s_mov_b32 s4, s0 +; SI-NOHSA-NEXT: s_mov_b32 s5, s1 +; SI-NOHSA-NEXT: s_waitcnt vmcnt(0) +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NOHSA-NEXT: s_endpgm +; +; GCN-HSA-LABEL: global_load_v4f32: +; GCN-HSA: ; %bb.0: ; %entry +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: s_waitcnt vmcnt(0) +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_endpgm +; +; GCNX3-NOHSA-LABEL: global_load_v4f32: +; GCNX3-NOHSA: ; %bb.0: ; %entry +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 +; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 +; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 +; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7 +; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0) +; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2 +; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0 +; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1 +; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0) +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCNX3-NOHSA-NEXT: s_endpgm entry: %tmp0 = load <4 x float>, ptr addrspace(1) %in store <4 x float> %tmp0, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}global_load_v8f32: -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-HSA: flat_load_dwordx4 -; GCN-HSA: flat_load_dwordx4 - -; R600: VTX_READ_128 -; R600: VTX_READ_128 define amdgpu_kernel void @global_load_v8f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-NOHSA-LABEL: global_load_v8f32: +; SI-NOHSA: ; %bb.0: ; %entry +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 +; SI-NOHSA-NEXT: s_mov_b32 s6, -1 +; SI-NOHSA-NEXT: s_mov_b32 s10, s6 +; SI-NOHSA-NEXT: s_mov_b32 s11, s7 +; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0) +; SI-NOHSA-NEXT: s_mov_b32 s8, s2 +; SI-NOHSA-NEXT: s_mov_b32 s9, s3 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 +; SI-NOHSA-NEXT: s_mov_b32 s4, s0 +; SI-NOHSA-NEXT: s_mov_b32 s5, s1 +; SI-NOHSA-NEXT: s_waitcnt vmcnt(1) +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16 +; SI-NOHSA-NEXT: s_waitcnt vmcnt(1) +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; SI-NOHSA-NEXT: s_endpgm +; +; GCN-HSA-LABEL: global_load_v8f32: +; GCN-HSA: ; %bb.0: ; %entry +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 +; GCN-HSA-NEXT: s_waitcnt vmcnt(1) +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN-HSA-NEXT: s_waitcnt vmcnt(1) +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GCN-HSA-NEXT: s_endpgm +; +; GCNX3-NOHSA-LABEL: global_load_v8f32: +; GCNX3-NOHSA: ; %bb.0: ; %entry +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 +; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 +; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 +; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7 +; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0) +; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2 +; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 +; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0 +; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1 +; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1) +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16 +; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1) +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GCNX3-NOHSA-NEXT: s_endpgm entry: %tmp0 = load <8 x float>, ptr addrspace(1) %in store <8 x float> %tmp0, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}global_load_v9f32: -; GCN-NOHSA: buffer_load_dword -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-HSA: flat_load_dwordx4 -; GCN-HSA: flat_load_dword -; GCN-HSA: flat_load_dwordx4 - -; R600: VTX_READ_128 -; R600: VTX_READ_32 -; R600: VTX_READ_128 define amdgpu_kernel void @global_load_v9f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-NOHSA-LABEL: global_load_v9f32: +; SI-NOHSA: ; %bb.0: ; %entry +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 +; SI-NOHSA-NEXT: s_mov_b32 s6, -1 +; SI-NOHSA-NEXT: s_mov_b32 s10, s6 +; SI-NOHSA-NEXT: s_mov_b32 s11, s7 +; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0) +; SI-NOHSA-NEXT: s_mov_b32 s8, s2 +; SI-NOHSA-NEXT: s_mov_b32 s9, s3 +; SI-NOHSA-NEXT: buffer_load_dword v8, off, s[8:11], 0 offset:32 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; SI-NOHSA-NEXT: s_mov_b32 s4, s0 +; SI-NOHSA-NEXT: s_mov_b32 s5, s1 +; SI-NOHSA-NEXT: s_waitcnt vmcnt(2) +; SI-NOHSA-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:32 +; SI-NOHSA-NEXT: s_waitcnt vmcnt(2) +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NOHSA-NEXT: s_waitcnt vmcnt(2) +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; SI-NOHSA-NEXT: s_endpgm +; +; GCN-HSA-LABEL: global_load_v9f32: +; GCN-HSA: ; %bb.0: ; %entry +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: s_add_u32 s2, s2, 32 +; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s2 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: flat_load_dword v14, v[6:7] +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 +; GCN-HSA-NEXT: s_waitcnt vmcnt(2) +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN-HSA-NEXT: s_waitcnt vmcnt(2) +; GCN-HSA-NEXT: flat_store_dword v[10:11], v14 +; GCN-HSA-NEXT: s_waitcnt vmcnt(2) +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] +; GCN-HSA-NEXT: s_endpgm +; +; GCNX3-NOHSA-LABEL: global_load_v9f32: +; GCNX3-NOHSA: ; %bb.0: ; %entry +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 +; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 +; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 +; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7 +; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0) +; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2 +; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3 +; GCNX3-NOHSA-NEXT: buffer_load_dword v8, off, s[8:11], 0 offset:32 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0 +; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1 +; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2) +; GCNX3-NOHSA-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:32 +; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2) +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2) +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; GCNX3-NOHSA-NEXT: s_endpgm entry: %tmp0 = load <9 x float>, ptr addrspace(1) %in store <9 x float> %tmp0, ptr addrspace(1) %out ret void } - -; FUNC-LABEL: {{^}}global_load_v10f32: -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-NOHSA: buffer_load_dwordx2 -; GCN-HSA: flat_load_dwordx4 -; GCN-HSA: flat_load_dwordx4 -; GCN-HSA: flat_load_dwordx2 - -; R600: VTX_READ_128 -; R600: VTX_READ_128 -; R600: VTX_READ_128 define amdgpu_kernel void @global_load_v10f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-NOHSA-LABEL: global_load_v10f32: +; SI-NOHSA: ; %bb.0: ; %entry +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 +; SI-NOHSA-NEXT: s_mov_b32 s6, -1 +; SI-NOHSA-NEXT: s_mov_b32 s10, s6 +; SI-NOHSA-NEXT: s_mov_b32 s11, s7 +; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0) +; SI-NOHSA-NEXT: s_mov_b32 s8, s2 +; SI-NOHSA-NEXT: s_mov_b32 s9, s3 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; SI-NOHSA-NEXT: buffer_load_dwordx2 v[8:9], off, s[8:11], 0 offset:32 +; SI-NOHSA-NEXT: s_mov_b32 s4, s0 +; SI-NOHSA-NEXT: s_mov_b32 s5, s1 +; SI-NOHSA-NEXT: s_waitcnt vmcnt(2) +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NOHSA-NEXT: s_waitcnt vmcnt(2) +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; SI-NOHSA-NEXT: s_waitcnt vmcnt(2) +; SI-NOHSA-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 offset:32 +; SI-NOHSA-NEXT: s_endpgm +; +; GCN-HSA-LABEL: global_load_v10f32: +; GCN-HSA: ; %bb.0: ; %entry +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GCN-HSA-NEXT: flat_load_dwordx2 v[8:9], v[8:9] +; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s0 +; GCN-HSA-NEXT: s_waitcnt vmcnt(2) +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] +; GCN-HSA-NEXT: s_waitcnt vmcnt(2) +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] +; GCN-HSA-NEXT: s_waitcnt vmcnt(2) +; GCN-HSA-NEXT: flat_store_dwordx2 v[14:15], v[8:9] +; GCN-HSA-NEXT: s_endpgm +; +; GCNX3-NOHSA-LABEL: global_load_v10f32: +; GCNX3-NOHSA: ; %bb.0: ; %entry +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 +; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 +; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 +; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7 +; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0) +; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2 +; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx2 v[8:9], off, s[8:11], 0 offset:32 +; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0 +; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1 +; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2) +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2) +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2) +; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 offset:32 +; GCNX3-NOHSA-NEXT: s_endpgm entry: %tmp0 = load <10 x float>, ptr addrspace(1) %in store <10 x float> %tmp0, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}global_load_v11f32: -; SI-NOHSA: buffer_load_dwordx4 -; SI-NOHSA: buffer_load_dwordx4 -; SI-NOHSA: buffer_load_dwordx4 -; GCNX3-NOHSA: buffer_load_dwordx4 -; GCNX3-NOHSA: buffer_load_dwordx4 -; GCNX3-NOHSA: buffer_load_dwordx3 -; GCN-HSA: flat_load_dwordx4 -; GCN-HSA: flat_load_dwordx4 -; GCN-HSA: flat_load_dwordx3 - -; R600: VTX_READ_128 -; R600: VTX_READ_128 -; R600: VTX_READ_128 define amdgpu_kernel void @global_load_v11f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-NOHSA-LABEL: global_load_v11f32: +; SI-NOHSA: ; %bb.0: ; %entry +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 +; SI-NOHSA-NEXT: s_mov_b32 s6, -1 +; SI-NOHSA-NEXT: s_mov_b32 s10, s6 +; SI-NOHSA-NEXT: s_mov_b32 s11, s7 +; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0) +; SI-NOHSA-NEXT: s_mov_b32 s8, s2 +; SI-NOHSA-NEXT: s_mov_b32 s9, s3 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32 +; SI-NOHSA-NEXT: s_mov_b32 s4, s0 +; SI-NOHSA-NEXT: s_mov_b32 s5, s1 +; SI-NOHSA-NEXT: s_waitcnt vmcnt(0) +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[3:6], off, s[8:11], 0 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0 offset:16 +; SI-NOHSA-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:40 +; SI-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:32 +; SI-NOHSA-NEXT: s_waitcnt vmcnt(3) +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0 +; SI-NOHSA-NEXT: s_waitcnt vmcnt(3) +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[7:10], off, s[4:7], 0 offset:16 +; SI-NOHSA-NEXT: s_endpgm +; +; GCN-HSA-LABEL: global_load_v11f32: +; GCN-HSA: ; %bb.0: ; %entry +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GCN-HSA-NEXT: flat_load_dwordx3 v[8:10], v[8:9] +; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s1 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s0 +; GCN-HSA-NEXT: s_waitcnt vmcnt(2) +; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[0:3] +; GCN-HSA-NEXT: s_waitcnt vmcnt(2) +; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[4:7] +; GCN-HSA-NEXT: s_waitcnt vmcnt(2) +; GCN-HSA-NEXT: flat_store_dwordx3 v[15:16], v[8:10] +; GCN-HSA-NEXT: s_endpgm +; +; GCNX3-NOHSA-LABEL: global_load_v11f32: +; GCNX3-NOHSA: ; %bb.0: ; %entry +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 +; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 +; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 +; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7 +; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0) +; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2 +; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx3 v[8:10], off, s[8:11], 0 offset:32 +; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0 +; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1 +; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2) +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2) +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2) +; GCNX3-NOHSA-NEXT: buffer_store_dwordx3 v[8:10], off, s[4:7], 0 offset:32 +; GCNX3-NOHSA-NEXT: s_endpgm entry: %tmp0 = load <11 x float>, ptr addrspace(1) %in store <11 x float> %tmp0, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}global_load_v12f32: -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-HSA: flat_load_dwordx4 -; GCN-HSA: flat_load_dwordx4 -; GCN-HSA: flat_load_dwordx4 - -; R600: VTX_READ_128 -; R600: VTX_READ_128 -; R600: VTX_READ_128 define amdgpu_kernel void @global_load_v12f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-NOHSA-LABEL: global_load_v12f32: +; SI-NOHSA: ; %bb.0: ; %entry +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 +; SI-NOHSA-NEXT: s_mov_b32 s6, -1 +; SI-NOHSA-NEXT: s_mov_b32 s10, s6 +; SI-NOHSA-NEXT: s_mov_b32 s11, s7 +; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0) +; SI-NOHSA-NEXT: s_mov_b32 s8, s2 +; SI-NOHSA-NEXT: s_mov_b32 s9, s3 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 +; SI-NOHSA-NEXT: s_mov_b32 s4, s0 +; SI-NOHSA-NEXT: s_mov_b32 s5, s1 +; SI-NOHSA-NEXT: s_waitcnt vmcnt(2) +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NOHSA-NEXT: s_waitcnt vmcnt(2) +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; SI-NOHSA-NEXT: s_waitcnt vmcnt(2) +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; SI-NOHSA-NEXT: s_endpgm +; +; GCN-HSA-LABEL: global_load_v12f32: +; GCN-HSA: ; %bb.0: ; %entry +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 +; GCN-HSA-NEXT: s_waitcnt vmcnt(2) +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3] +; GCN-HSA-NEXT: s_waitcnt vmcnt(2) +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] +; GCN-HSA-NEXT: s_waitcnt vmcnt(2) +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; GCN-HSA-NEXT: s_endpgm +; +; GCNX3-NOHSA-LABEL: global_load_v12f32: +; GCNX3-NOHSA: ; %bb.0: ; %entry +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 +; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 +; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 +; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7 +; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0) +; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2 +; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 +; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0 +; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1 +; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2) +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2) +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2) +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; GCNX3-NOHSA-NEXT: s_endpgm entry: %tmp0 = load <12 x float>, ptr addrspace(1) %in store <12 x float> %tmp0, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}global_load_v16f32: -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-NOHSA: buffer_load_dwordx4 - -; GCN-HSA: flat_load_dwordx4 -; GCN-HSA: flat_load_dwordx4 -; GCN-HSA: flat_load_dwordx4 -; GCN-HSA: flat_load_dwordx4 - -; R600: VTX_READ_128 -; R600: VTX_READ_128 -; R600: VTX_READ_128 -; R600: VTX_READ_128 define amdgpu_kernel void @global_load_v16f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-NOHSA-LABEL: global_load_v16f32: +; SI-NOHSA: ; %bb.0: ; %entry +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 +; SI-NOHSA-NEXT: s_mov_b32 s6, -1 +; SI-NOHSA-NEXT: s_mov_b32 s10, s6 +; SI-NOHSA-NEXT: s_mov_b32 s11, s7 +; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0) +; SI-NOHSA-NEXT: s_mov_b32 s4, s0 +; SI-NOHSA-NEXT: s_mov_b32 s5, s1 +; SI-NOHSA-NEXT: s_mov_b32 s8, s2 +; SI-NOHSA-NEXT: s_mov_b32 s9, s3 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:16 +; SI-NOHSA-NEXT: s_waitcnt vmcnt(3) +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:32 +; SI-NOHSA-NEXT: s_waitcnt vmcnt(3) +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:48 +; SI-NOHSA-NEXT: s_waitcnt vmcnt(3) +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; SI-NOHSA-NEXT: s_waitcnt vmcnt(3) +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:16 +; SI-NOHSA-NEXT: s_endpgm +; +; GCN-HSA-LABEL: global_load_v16f32: +; GCN-HSA: ; %bb.0: ; %entry +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 +; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCN-HSA-NEXT: s_add_u32 s6, s2, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s2, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s4 +; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] +; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s5 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s4 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: s_waitcnt vmcnt(3) +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] +; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 +; GCN-HSA-NEXT: s_waitcnt vmcnt(2) +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GCN-HSA-NEXT: s_waitcnt vmcnt(3) +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[12:15] +; GCN-HSA-NEXT: s_endpgm +; +; GCNX3-NOHSA-LABEL: global_load_v16f32: +; GCNX3-NOHSA: ; %bb.0: ; %entry +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 +; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 +; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 +; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7 +; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0) +; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2 +; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:16 +; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0 +; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1 +; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3) +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:32 +; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3) +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:48 +; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3) +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3) +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:16 +; GCNX3-NOHSA-NEXT: s_endpgm entry: %tmp0 = load <16 x float>, ptr addrspace(1) %in store <16 x float> %tmp0, ptr addrspace(1) %out @@ -168,3 +773,8 @@ entry: } attributes #0 = { nounwind } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; FUNC: {{.*}} +; GCN-NOHSA: {{.*}} +; GCNX3-HSA: {{.*}} +; R600: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll b/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll index 6de015c6de79b..87083d64fd01d 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll @@ -1,15 +1,29 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; FIXME: Fails with -enable-var-scope ; Make sure 64-bit BFE pattern does a 32-bit BFE on the relevant half. ; Extract the high bit of the low half -; GCN-LABEL: {{^}}v_uextract_bit_31_i64: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]] -; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; GCN: buffer_store_dwordx2 v[[[SHIFT]]:[[ZERO]]] define amdgpu_kernel void @v_uextract_bit_31_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: v_uextract_bit_31_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_ashr_i32 s3, s2, 31 +; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s10, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; GCN-NEXT: s_mov_b64 s[6:7], s[10:11] +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v2, 31, v2 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_endpgm %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x @@ -21,13 +35,24 @@ define amdgpu_kernel void @v_uextract_bit_31_i64(ptr addrspace(1) %out, ptr addr } ; Extract the high bit of the high half -; GCN-LABEL: {{^}}v_uextract_bit_63_i64: -; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]] -; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]] -; GCN: buffer_store_dwordx2 v[[[SHIFT]]:[[ZERO1]]] define amdgpu_kernel void @v_uextract_bit_63_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: v_uextract_bit_63_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[8:9], s[2:3] +; GCN-NEXT: s_mov_b64 s[10:11], s[6:7] +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4 +; GCN-NEXT: s_mov_b64 s[4:5], s[0:1] +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v2, 31, v2 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_endpgm %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x @@ -38,12 +63,25 @@ define amdgpu_kernel void @v_uextract_bit_63_i64(ptr addrspace(1) %out, ptr addr ret void } -; GCN-LABEL: {{^}}v_uextract_bit_1_i64: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 1 -; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO]]] define amdgpu_kernel void @v_uextract_bit_1_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: v_uextract_bit_1_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_ashr_i32 s3, s2, 31 +; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s10, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; GCN-NEXT: s_mov_b64 s[6:7], s[10:11] +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_bfe_u32 v2, v2, 1, 1 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_endpgm %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x @@ -54,12 +92,25 @@ define amdgpu_kernel void @v_uextract_bit_1_i64(ptr addrspace(1) %out, ptr addrs ret void } -; GCN-LABEL: {{^}}v_uextract_bit_20_i64: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 20, 1 -; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO]]] define amdgpu_kernel void @v_uextract_bit_20_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: v_uextract_bit_20_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_ashr_i32 s3, s2, 31 +; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s10, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; GCN-NEXT: s_mov_b64 s[6:7], s[10:11] +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_bfe_u32 v2, v2, 20, 1 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_endpgm %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x @@ -70,13 +121,24 @@ define amdgpu_kernel void @v_uextract_bit_20_i64(ptr addrspace(1) %out, ptr addr ret void } -; GCN-LABEL: {{^}}v_uextract_bit_32_i64: -; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 1, [[VAL]] -; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]{{$}} -; GCN: buffer_store_dwordx2 v[[[AND]]:[[ZERO1]]] define amdgpu_kernel void @v_uextract_bit_32_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: v_uextract_bit_32_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[8:9], s[2:3] +; GCN-NEXT: s_mov_b64 s[10:11], s[6:7] +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4 +; GCN-NEXT: s_mov_b64 s[4:5], s[0:1] +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v2, 1, v2 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_endpgm %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x @@ -87,13 +149,24 @@ define amdgpu_kernel void @v_uextract_bit_32_i64(ptr addrspace(1) %out, ptr addr ret void } -; GCN-LABEL: {{^}}v_uextract_bit_33_i64: -; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 1{{$}} -; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]] -; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO1]]] define amdgpu_kernel void @v_uextract_bit_33_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: v_uextract_bit_33_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[8:9], s[2:3] +; GCN-NEXT: s_mov_b64 s[10:11], s[6:7] +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4 +; GCN-NEXT: s_mov_b64 s[4:5], s[0:1] +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_bfe_u32 v2, v2, 1, 1 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_endpgm %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x @@ -104,12 +177,25 @@ define amdgpu_kernel void @v_uextract_bit_33_i64(ptr addrspace(1) %out, ptr addr ret void } -; GCN-LABEL: {{^}}v_uextract_bit_20_21_i64: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 20, 2 -; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO]]] define amdgpu_kernel void @v_uextract_bit_20_21_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: v_uextract_bit_20_21_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_ashr_i32 s3, s2, 31 +; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s10, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; GCN-NEXT: s_mov_b64 s[6:7], s[10:11] +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_bfe_u32 v2, v2, 20, 2 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_endpgm %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x @@ -120,12 +206,25 @@ define amdgpu_kernel void @v_uextract_bit_20_21_i64(ptr addrspace(1) %out, ptr a ret void } -; GCN-LABEL: {{^}}v_uextract_bit_1_30_i64: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 30 -; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO]]] define amdgpu_kernel void @v_uextract_bit_1_30_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: v_uextract_bit_1_30_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_ashr_i32 s3, s2, 31 +; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s10, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; GCN-NEXT: s_mov_b64 s[6:7], s[10:11] +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_bfe_u32 v2, v2, 1, 30 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_endpgm %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x @@ -136,12 +235,25 @@ define amdgpu_kernel void @v_uextract_bit_1_30_i64(ptr addrspace(1) %out, ptr ad ret void } -; GCN-LABEL: {{^}}v_uextract_bit_1_31_i64: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 1, [[VAL]] -; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; GCN: buffer_store_dwordx2 v[[[SHIFT]]:[[ZERO]]] define amdgpu_kernel void @v_uextract_bit_1_31_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: v_uextract_bit_1_31_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_ashr_i32 s3, s2, 31 +; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s10, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; GCN-NEXT: s_mov_b64 s[6:7], s[10:11] +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_endpgm %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x @@ -154,13 +266,26 @@ define amdgpu_kernel void @v_uextract_bit_1_31_i64(ptr addrspace(1) %out, ptr ad ; Spans the dword boundary, so requires full shift. ; Truncated after the shift, so only low shift result is used. -; GCN-LABEL: {{^}}v_uextract_bit_31_32_i64: -; GCN: buffer_load_dwordx2 v[[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]] -; GCN: v_alignbit_b32 v[[SHRLO:[0-9]+]], v[[VALHI]], v[[VALLO]], 31 -; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 3, v[[SHRLO]]{{$}} -; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; GCN: buffer_store_dwordx2 v[[[AND]]:[[ZERO]]] define amdgpu_kernel void @v_uextract_bit_31_32_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: v_uextract_bit_31_32_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_ashr_i32 s3, s2, 31 +; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s10, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 +; GCN-NEXT: s_mov_b64 s[6:7], s[10:11] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 31 +; GCN-NEXT: v_and_b32_e32 v2, 3, v2 +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_endpgm %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x @@ -171,13 +296,24 @@ define amdgpu_kernel void @v_uextract_bit_31_32_i64(ptr addrspace(1) %out, ptr a ret void } -; GCN-LABEL: {{^}}v_uextract_bit_32_33_i64: -; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 2 -; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]] -; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO1]]] define amdgpu_kernel void @v_uextract_bit_32_33_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: v_uextract_bit_32_33_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[8:9], s[2:3] +; GCN-NEXT: s_mov_b64 s[10:11], s[6:7] +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4 +; GCN-NEXT: s_mov_b64 s[4:5], s[0:1] +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_bfe_u32 v2, v2, 1, 2 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_endpgm %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x @@ -188,14 +324,24 @@ define amdgpu_kernel void @v_uextract_bit_32_33_i64(ptr addrspace(1) %out, ptr a ret void } -; GCN-LABEL: {{^}}v_uextract_bit_30_60_i64: -; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; GCN: buffer_load_dwordx2 v[[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]] -; GCN: v_alignbit_b32 v[[SHRLO:[0-9]+]], v[[VALHI]], v[[VALLO]], 30 -; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 0x3fffffff, v[[SHRLO]]{{$}} -; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]] -; GCN: buffer_store_dwordx2 v[[[AND]]:[[ZERO1]]] define amdgpu_kernel void @v_uextract_bit_30_60_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: v_uextract_bit_30_60_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 30 +; GCN-NEXT: v_and_b32_e32 v2, 0x3fffffff, v2 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x @@ -206,13 +352,24 @@ define amdgpu_kernel void @v_uextract_bit_30_60_i64(ptr addrspace(1) %out, ptr a ret void } -; GCN-LABEL: {{^}}v_uextract_bit_33_63_i64: -; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 30 -; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]] -; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO1]]] define amdgpu_kernel void @v_uextract_bit_33_63_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: v_uextract_bit_33_63_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[8:9], s[2:3] +; GCN-NEXT: s_mov_b64 s[10:11], s[6:7] +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4 +; GCN-NEXT: s_mov_b64 s[4:5], s[0:1] +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_bfe_u32 v2, v2, 1, 30 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_endpgm %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x @@ -223,12 +380,25 @@ define amdgpu_kernel void @v_uextract_bit_33_63_i64(ptr addrspace(1) %out, ptr a ret void } -; GCN-LABEL: {{^}}v_uextract_bit_31_63_i64: -; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; GCN: buffer_load_dwordx2 v[[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]] -; GCN: v_alignbit_b32 v[[SHRLO:[0-9]+]], v[[VALHI]], v[[VALLO]], 31 -; GCN: buffer_store_dwordx2 v[[[SHRLO]]:[[ZERO]]] define amdgpu_kernel void @v_uextract_bit_31_63_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: v_uextract_bit_31_63_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s10, 0 +; GCN-NEXT: s_mov_b32 s11, s7 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[8:9], s[2:3] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v0, v3, v2, 31 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: s_endpgm %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x @@ -240,11 +410,23 @@ define amdgpu_kernel void @v_uextract_bit_31_63_i64(ptr addrspace(1) %out, ptr a } ; trunc applied before and mask -; GCN-LABEL: {{^}}v_uextract_bit_31_i64_trunc_i32: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]] -; GCN: buffer_store_dword v[[SHIFT]] define amdgpu_kernel void @v_uextract_bit_31_i64_trunc_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: v_uextract_bit_31_i64_trunc_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dword v3, v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v0, 31, v3 +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x @@ -256,11 +438,23 @@ define amdgpu_kernel void @v_uextract_bit_31_i64_trunc_i32(ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}v_uextract_bit_3_i64_trunc_i32: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN: v_bfe_u32 [[BFE:v[0-9]+]], [[VAL]], 3, 1{{$}} -; GCN: buffer_store_dword [[BFE]] define amdgpu_kernel void @v_uextract_bit_3_i64_trunc_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: v_uextract_bit_3_i64_trunc_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dword v3, v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_bfe_u32 v0, v3, 3, 1 +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x @@ -272,11 +466,24 @@ define amdgpu_kernel void @v_uextract_bit_3_i64_trunc_i32(ptr addrspace(1) %out, ret void } -; GCN-LABEL: {{^}}v_uextract_bit_33_i64_trunc_i32: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; GCN: v_bfe_u32 [[BFE:v[0-9]+]], [[VAL]], 1, 1{{$}} -; GCN: buffer_store_dword [[BFE]] define amdgpu_kernel void @v_uextract_bit_33_i64_trunc_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: v_uextract_bit_33_i64_trunc_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[8:9], s[2:3] +; GCN-NEXT: s_mov_b64 s[10:11], s[6:7] +; GCN-NEXT: buffer_load_dword v3, v[1:2], s[8:11], 0 addr64 offset:4 +; GCN-NEXT: s_mov_b64 s[4:5], s[0:1] +; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_bfe_u32 v0, v3, 1, 1 +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_endpgm %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x @@ -288,13 +495,24 @@ define amdgpu_kernel void @v_uextract_bit_33_i64_trunc_i32(ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}v_uextract_bit_31_32_i64_trunc_i32: -; GCN: buffer_load_dwordx2 v[[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]] -; GCN: v_alignbit_b32 v[[SHRLO:[0-9]+]], v[[VALHI]], v[[VALLO]], 31 -; GCN-NEXT: v_and_b32_e32 v[[SHRLO]], 3, v[[SHRLO]] -; GCN-NOT: v[[SHRLO]] -; GCN: buffer_store_dword v[[SHRLO]] define amdgpu_kernel void @v_uextract_bit_31_32_i64_trunc_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: v_uextract_bit_31_32_i64_trunc_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v0, v4, v3, 31 +; GCN-NEXT: v_and_b32_e32 v0, 3, v0 +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x @@ -306,16 +524,24 @@ define amdgpu_kernel void @v_uextract_bit_31_32_i64_trunc_i32(ptr addrspace(1) % ret void } -; GCN-LABEL: {{^}}and_not_mask_i64: -; GCN-DAG: buffer_load_dword v[[VAL:[0-9]+]] -; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; GCN-DAG: v_mov_b32_e32 v[[SHRHI:[0-9]+]], v[[ZERO]]{{$}} -; GCN: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 20, v[[VAL]] -; GCN-DAG: v_and_b32_e32 v[[SHRLO:[0-9]+]], 4, [[SHR]] -; GCN-NOT: v[[SHRLO]] -; GCN-NOT: v[[SHRHI]] -; GCN: buffer_store_dwordx2 v[[[SHRLO]]:[[SHRHI]]] define amdgpu_kernel void @and_not_mask_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: and_not_mask_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v2, 20, v2 +; GCN-NEXT: v_and_b32_e32 v2, 4, v2 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x @@ -328,15 +554,29 @@ define amdgpu_kernel void @and_not_mask_i64(ptr addrspace(1) %out, ptr addrspace ; The instruction count is the same with/without hasOneUse, but ; keeping the 32-bit and has a smaller encoding size than the bfe. - -; GCN-LABEL: {{^}}v_uextract_bit_27_29_multi_use_shift_i64: -; GCN-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] -; GCN-DAG: v_lshr_b64 v[[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]], [[VAL]], 27 -; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 3, v[[SHRLO]] -; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; GCN: buffer_store_dwordx2 v[[[SHRLO]]:[[SHRHI]]] -; GCN: buffer_store_dwordx2 v[[[AND]]:[[ZERO]]] define amdgpu_kernel void @v_uextract_bit_27_29_multi_use_shift_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: v_uextract_bit_27_29_multi_use_shift_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s10, 0 +; GCN-NEXT: s_mov_b32 s11, s7 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[8:9], s[2:3] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshr_b64 v[2:3], v[2:3], 27 +; GCN-NEXT: v_and_b32_e32 v0, 3, v2 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x @@ -348,15 +588,30 @@ define amdgpu_kernel void @v_uextract_bit_27_29_multi_use_shift_i64(ptr addrspac ret void } -; GCN-LABEL: {{^}}v_uextract_bit_34_37_multi_use_shift_i64: -; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; GCN-DAG: v_mov_b32_e32 v[[ZERO_SHR:[0-9]+]], 0{{$}} -; GCN: v_mov_b32_e32 v[[ZERO_BFE:[0-9]+]], v[[ZERO_SHR]] -; GCN-DAG: v_lshrrev_b32_e32 v[[SHR:[0-9]+]], 2, [[VAL]] -; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 2, 3 -; GCN-DAG: buffer_store_dwordx2 v[[[SHR]]:[[ZERO_SHR]]] -; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO_BFE]]] define amdgpu_kernel void @v_uextract_bit_34_37_multi_use_shift_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: v_uextract_bit_34_37_multi_use_shift_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_mov_b32 s10, 0 +; GCN-NEXT: s_mov_b32 s11, s7 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[8:9], s[2:3] +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v0, 2, v2 +; GCN-NEXT: v_bfe_u32 v2, v2, 2, 3 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x @@ -368,13 +623,32 @@ define amdgpu_kernel void @v_uextract_bit_34_37_multi_use_shift_i64(ptr addrspac ret void } -; GCN-LABEL: {{^}}v_uextract_bit_33_36_use_upper_half_shift_i64: -; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 3 -; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; GCN: buffer_store_dwordx2 v[[[BFE]]:{{[0-9]+\]}} -; GCN: buffer_store_dword v[[ZERO]] define amdgpu_kernel void @v_uextract_bit_33_36_use_upper_half_shift_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in) #1 { +; GCN-LABEL: v_uextract_bit_33_36_use_upper_half_shift_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_mov_b64 s[6:7], s[2:3] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v5, v[1:2], s[4:7], 0 addr64 offset:4 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: v_mov_b32_e32 v6, v2 +; GCN-NEXT: s_mov_b64 s[10:11], s[2:3] +; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v4, v2 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[0:1], s[4:5] +; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_bfe_u32 v5, v5, 1, 3 +; GCN-NEXT: buffer_store_dwordx2 v[5:6], v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, v[3:4], s[8:11], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x %out0.gep = getelementptr i64, ptr addrspace(1) %out0, i32 %id.x diff --git a/llvm/test/CodeGen/AMDGPU/trunc.ll b/llvm/test/CodeGen/AMDGPU/trunc.ll index db802732e987b..8d17a01cc4c9f 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc.ll @@ -1,51 +1,150 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=cypress < %s | FileCheck -enable-var-scope -check-prefix=EG %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=EG %s declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @trunc_i64_to_i32_store(ptr addrspace(1) %out, [8 x i32], i64 %in) { -; GCN-LABEL: {{^}}trunc_i64_to_i32_store: -; GCN: s_load_dword [[SLOAD:s[0-9]+]], s[4:5], -; GCN: v_mov_b32_e32 [[VLOAD:v[0-9]+]], [[SLOAD]] -; SI: buffer_store_dword [[VLOAD]] -; VI: flat_store_dword v[{{[0-9:]+}}], [[VLOAD]] - -; EG-LABEL: {{^}}trunc_i64_to_i32_store: -; EG: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 -; EG: LSHR -; EG-NEXT: 2( - +; SI-LABEL: trunc_i64_to_i32_store: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s6, s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: trunc_i64_to_i32_store: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x4c +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; EG-LABEL: trunc_i64_to_i32_store: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; EG-NEXT: MOV * T1.X, KC0[4].W, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %result = trunc i64 %in to i32 store i32 %result, ptr addrspace(1) %out, align 4 ret void } -; GCN-LABEL: {{^}}trunc_load_shl_i64: -; GCN-DAG: s_load_dwordx2 -; GCN-DAG: s_load_dword [[SREG:s[0-9]+]], -; GCN: s_lshl_b32 [[SHL:s[0-9]+]], [[SREG]], 2 -; GCN: v_mov_b32_e32 [[VSHL:v[0-9]+]], [[SHL]] -; SI: buffer_store_dword [[VSHL]] -; VI: flat_store_dword v[{{[0-9:]+}}], [[VSHL]] - define amdgpu_kernel void @trunc_load_shl_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { +; SI-LABEL: trunc_load_shl_i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s6, s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_lshl_b32 s4, s6, 2 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: trunc_load_shl_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshl_b32 s2, s2, 2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; EG-LABEL: trunc_load_shl_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; EG-NEXT: LSHL * T1.X, KC0[4].W, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %b = shl i64 %a, 2 %result = trunc i64 %b to i32 store i32 %result, ptr addrspace(1) %out, align 4 ret void } -; GCN-LABEL: {{^}}trunc_shl_i64: -; SI: s_load_dwordx2 s[[[LO_SREG:[0-9]+]]:{{[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd -; VI: s_load_dwordx2 s[[[LO_SREG:[0-9]+]]:{{[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34 -; GCN: s_lshl_b64 s[[[LO_SHL:[0-9]+]]:{{[0-9]+\]}}, s[[[LO_SREG]]:{{[0-9]+\]}}, 2 -; GCN: s_add_u32 s[[LO_SREG2:[0-9]+]], s[[LO_SHL]], -; GCN: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG2]] -; SI: buffer_store_dword v[[LO_VREG]], -; VI: flat_store_dword v[{{[0-9:]+}}], v[[LO_VREG]] -; GCN: v_mov_b32_e32 -; GCN: v_mov_b32_e32 define amdgpu_kernel void @trunc_shl_i64(ptr addrspace(1) %out2, ptr addrspace(1) %out, i64 %a) { +; SI-LABEL: trunc_shl_i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 +; SI-NEXT: s_add_u32 s8, s8, 0x3a8 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s0, s2 +; SI-NEXT: s_mov_b32 s1, s3 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: trunc_shl_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_lshl_b64 s[0:1], s[4:5], 2 +; VI-NEXT: s_add_u32 s0, s0, 0x3a8 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: flat_store_dword v[2:3], v4 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_endpgm +; +; EG-LABEL: trunc_shl_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XY, T0.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: LSHL * T0.W, KC0[2].W, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T1.W, KC0[3].X, KC0[2].W, literal.x, +; EG-NEXT: ADDC_UINT * T2.W, PV.W, literal.y, +; EG-NEXT: 30(4.203895e-44), 936(1.311615e-42) +; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; EG-NEXT: ADD_INT T1.Y, PV.W, PS, +; EG-NEXT: ADD_INT * T1.X, T0.W, literal.y, +; EG-NEXT: 2(2.802597e-45), 936(1.311615e-42) +; EG-NEXT: LSHR * T2.X, KC0[2].Z, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %aa = add i64 %a, 234 ; Prevent shrinking store. %b = shl i64 %aa, 2 %result = trunc i64 %b to i32 @@ -54,9 +153,55 @@ define amdgpu_kernel void @trunc_shl_i64(ptr addrspace(1) %out2, ptr addrspace(1 ret void } -; GCN-LABEL: {{^}}trunc_i32_to_i1: -; GCN: v_and_b32_e32 [[VREG:v[0-9]+]], 1, v{{[0-9]+}} define amdgpu_kernel void @trunc_i32_to_i1(ptr addrspace(1) %out, ptr addrspace(1) %ptr) { +; SI-LABEL: trunc_i32_to_i1: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: trunc_i32_to_i1: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_load_dword v2, v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v2, 1, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; EG-LABEL: trunc_i32_to_i1: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: AND_INT T0.X, T0.X, 1, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %a = load i32, ptr addrspace(1) %ptr, align 4 %trunc = trunc i32 %a to i1 %result = select i1 %trunc, i32 1, i32 0 @@ -64,9 +209,64 @@ define amdgpu_kernel void @trunc_i32_to_i1(ptr addrspace(1) %out, ptr addrspace( ret void } -; GCN-LABEL: {{^}}trunc_i8_to_i1: -; GCN: v_and_b32_e32 [[VREG:v[0-9]+]], 1, v{{[0-9]+}} define amdgpu_kernel void @trunc_i8_to_i1(ptr addrspace(1) %out, ptr addrspace(1) %ptr) { +; SI-LABEL: trunc_i8_to_i1: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 1, v0 +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: trunc_i8_to_i1: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_load_ubyte v2, v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v2, 1, v2 +; VI-NEXT: flat_store_byte v[0:1], v2 +; VI-NEXT: s_endpgm +; +; EG-LABEL: trunc_i8_to_i1: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x, +; EG-NEXT: AND_INT * T1.W, T0.X, 1, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: LSHL T0.X, T1.W, PV.W, +; EG-NEXT: LSHL * T0.W, literal.x, PV.W, +; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: MOV * T0.Z, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %a = load i8, ptr addrspace(1) %ptr, align 4 %trunc = trunc i8 %a to i1 %result = select i1 %trunc, i8 1, i8 0 @@ -74,43 +274,213 @@ define amdgpu_kernel void @trunc_i8_to_i1(ptr addrspace(1) %out, ptr addrspace(1 ret void } -; GCN-LABEL: {{^}}sgpr_trunc_i16_to_i1: -; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1 define amdgpu_kernel void @sgpr_trunc_i16_to_i1(ptr addrspace(1) %out, i16 %a) { +; SI-LABEL: sgpr_trunc_i16_to_i1: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_and_b32 s4, s6, 1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: sgpr_trunc_i16_to_i1: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_and_b32 s2, s2, 1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: s_endpgm +; +; EG-LABEL: sgpr_trunc_i16_to_i1: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, 0.0, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x, +; EG-NEXT: AND_INT * T1.W, T0.X, 1, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: LSHL T0.X, T1.W, PV.W, +; EG-NEXT: LSHL * T0.W, literal.x, PV.W, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: MOV * T0.Z, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %trunc = trunc i16 %a to i1 %result = select i1 %trunc, i16 1, i16 0 store i16 %result, ptr addrspace(1) %out, align 4 ret void } -; GCN-LABEL: {{^}}sgpr_trunc_i32_to_i1: -; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1 define amdgpu_kernel void @sgpr_trunc_i32_to_i1(ptr addrspace(1) %out, i32 %a) { +; SI-LABEL: sgpr_trunc_i32_to_i1: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_and_b32 s4, s6, 1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: sgpr_trunc_i32_to_i1: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_and_b32 s2, s2, 1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; EG-LABEL: sgpr_trunc_i32_to_i1: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; EG-NEXT: AND_INT * T1.X, KC0[2].Z, 1, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %trunc = trunc i32 %a to i1 %result = select i1 %trunc, i32 1, i32 0 store i32 %result, ptr addrspace(1) %out, align 4 ret void } -; GCN-LABEL: {{^}}s_trunc_i64_to_i1: -; SI: s_load_dwordx2 s[[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x13 -; VI: s_load_dwordx2 s[[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x4c -; GCN: s_bitcmp1_b32 s[[SLO]], 0 -; GCN: s_cselect_b32 {{s[0-9]+}}, 63, -12 define amdgpu_kernel void @s_trunc_i64_to_i1(ptr addrspace(1) %out, [8 x i32], i64 %x) { +; SI-LABEL: s_trunc_i64_to_i1: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bitcmp1_b32 s6, 0 +; SI-NEXT: s_cselect_b32 s4, 63, -12 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: s_trunc_i64_to_i1: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_bitcmp1_b32 s0, 0 +; VI-NEXT: s_cselect_b32 s0, 63, -12 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; EG-LABEL: s_trunc_i64_to_i1: +; EG: ; %bb.0: +; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: MOV T0.W, literal.x, +; EG-NEXT: AND_INT * T1.W, KC0[4].W, 1, +; EG-NEXT: 63(8.828180e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T0.X, PS, literal.x, PV.W, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: -12(nan), 2(2.802597e-45) %trunc = trunc i64 %x to i1 %sel = select i1 %trunc, i32 63, i32 -12 store i32 %sel, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}v_trunc_i64_to_i1: -; SI: buffer_load_dwordx2 v[[[VLO:[0-9]+]]:{{[0-9]+\]}} -; VI: flat_load_dwordx2 v[[[VLO:[0-9]+]]:{{[0-9]+\]}} -; GCN: v_and_b32_e32 [[MASKED:v[0-9]+]], 1, v[[VLO]] -; GCN: v_cmp_eq_u32_e32 vcc, 1, [[MASKED]] -; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, vcc define amdgpu_kernel void @v_trunc_i64_to_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; SI-LABEL: v_trunc_i64_to_i1: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; SI-NEXT: v_mov_b32_e32 v2, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 1, v3 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, -12, 63, vcc +; SI-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_trunc_i64_to_i1: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 +; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_and_b32_e32 v0, 1, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, -12, 63, vcc +; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: s_endpgm +; +; EG-LABEL: v_trunc_i64_to_i1: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 8, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: ADD_INT * T1.X, KC0[2].Z, PV.W, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: LSHL T0.Z, T0.X, literal.x, +; EG-NEXT: AND_INT T0.W, T1.X, 1, BS:VEC_120/SCL_212 +; EG-NEXT: MOV * T1.W, literal.y, +; EG-NEXT: 2(2.802597e-45), 63(8.828180e-44) +; EG-NEXT: CNDE_INT T0.X, PV.W, literal.x, PS, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, PV.Z, +; EG-NEXT: -12(nan), 0(0.000000e+00) +; EG-NEXT: LSHR * T1.X, PV.W, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -121,3 +491,5 @@ define amdgpu_kernel void @v_trunc_i64_to_i1(ptr addrspace(1) %out, ptr addrspac store i32 %sel, ptr addrspace(1) %out.gep ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}}