diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index eb2b702dcba18..ff1783305a07e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -338,6 +338,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
   setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
   setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
+  setTruncStoreAction(MVT::v6f32, MVT::v6f16, Expand);
   setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
   setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
   setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);
diff --git a/llvm/test/CodeGen/AMDGPU/fp_trunc_store_fp32_to_bf16.ll b/llvm/test/CodeGen/AMDGPU/fp_trunc_store_fp32_to_bf16.ll
index 2ccc0337b8aeb..3b1358106392a 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_trunc_store_fp32_to_bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_trunc_store_fp32_to_bf16.ll
@@ -119,6 +119,54 @@ entry:
   ret void
 }
 
+define void @v6(<6 x float> %num, ptr addrspace(1) %p) {
+; CHECK-LABEL: v6:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_bfe_u32 v8, v4, 16, 1
+; CHECK-NEXT:    s_movk_i32 s4, 0x7fff
+; CHECK-NEXT:    v_add3_u32 v8, v8, v4, s4
+; CHECK-NEXT:    v_or_b32_e32 v9, 0x400000, v4
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
+; CHECK-NEXT:    v_cndmask_b32_e32 v4, v8, v9, vcc
+; CHECK-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; CHECK-NEXT:    v_add3_u32 v8, v8, v5, s4
+; CHECK-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; CHECK-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
+; CHECK-NEXT:    s_mov_b32 s5, 0x7060302
+; CHECK-NEXT:    v_perm_b32 v4, v5, v4, s5
+; CHECK-NEXT:    v_bfe_u32 v5, v2, 16, 1
+; CHECK-NEXT:    v_add3_u32 v5, v5, v2, s4
+; CHECK-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, v5, v8, vcc
+; CHECK-NEXT:    v_bfe_u32 v5, v3, 16, 1
+; CHECK-NEXT:    v_add3_u32 v5, v5, v3, s4
+; CHECK-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, v5, v8, vcc
+; CHECK-NEXT:    v_perm_b32 v3, v3, v2, s5
+; CHECK-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; CHECK-NEXT:    v_add3_u32 v2, v2, v0, s4
+; CHECK-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc
+; CHECK-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; CHECK-NEXT:    v_add3_u32 v2, v2, v1, s4
+; CHECK-NEXT:    v_or_b32_e32 v5, 0x400000, v1
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v2, v5, vcc
+; CHECK-NEXT:    v_perm_b32 v2, v1, v0, s5
+; CHECK-NEXT:    global_store_dwordx3 v[6:7], v[2:4], off
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %conv = fptrunc <6 x float> %num to <6 x bfloat>
+  store <6 x bfloat> %conv, ptr addrspace(1) %p, align 16
+  ret void
+}
+
 define void @v8(<8 x float> %num, ptr addrspace(1) %p) {
 ; CHECK-LABEL: v8:
 ; CHECK:       ; %bb.0: ; %entry
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll b/llvm/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll
index c6b5ae48d6504..77d9d678b03af 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll
@@ -53,3 +53,17 @@ define amdgpu_kernel void @global_truncstore_v16f64_to_v16f16(ptr addrspace(1) %
   store <16 x half> %cvt, ptr addrspace(1) %out
   ret void
 }
+
+; GCN-LABEL: {{^}}global_truncstore_v6f64_to_v6f16:
+define void @global_truncstore_v6f64_to_v6f16(ptr addrspace(1) %ptr, <6 x double> %src) {
+  %trunc = fptrunc <6 x double> %src to <6 x half>
+  store <6 x half> %trunc, ptr addrspace(1) %ptr
+  ret void
+}
+
+; GCN-LABEL: {{^}}global_truncstore_v6f64_to_v6bf16:
+define void @global_truncstore_v6f64_to_v6bf16(ptr addrspace(1) %ptr, <6 x double> %src) {
+  %trunc = fptrunc <6 x double> %src to <6 x bfloat>
+  store <6 x bfloat> %trunc, ptr addrspace(1) %ptr
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-store.ll b/llvm/test/CodeGen/AMDGPU/trunc-store.ll
index 5f01db82ccd48..1d96921ec1287 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-store.ll
@@ -442,5 +442,130 @@ define void @truncstore_v6i32_to_v6i16(ptr addrspace(1) %out, <6 x i32> %val) {
   ret void
 }
 
+define void @global_fp_truncstore_v6f32_to_v6bf16(ptr addrspace(1) %ptr, <6 x float> %src) {
+; SI-LABEL: global_fp_truncstore_v6f32_to_v6bf16:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; SI-NEXT:    v_alignbit_b32 v3, v3, v2, 16
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v7
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    v_alignbit_b32 v4, v5, v4, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v6
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s4, s6
+; SI-NEXT:    s_mov_b32 s5, s6
+; SI-NEXT:    v_alignbit_b32 v2, v2, v5, 16
+; SI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 offset:8
+; SI-NEXT:    buffer_store_dwordx2 v[3:4], v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: global_fp_truncstore_v6f32_to_v6bf16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; VI-NEXT:    v_add_u32_e32 v9, vcc, v9, v6
+; VI-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT:    v_or_b32_e32 v8, 0x400000, v6
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; VI-NEXT:    v_cndmask_b32_e32 v6, v9, v8, vcc
+; VI-NEXT:    v_bfe_u32 v9, v7, 16, 1
+; VI-NEXT:    s_movk_i32 s4, 0x7fff
+; VI-NEXT:    v_add_u32_e32 v9, vcc, v9, v7
+; VI-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
+; VI-NEXT:    v_or_b32_e32 v8, 0x400000, v7
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT:    v_cndmask_b32_e32 v7, v9, v8, vcc
+; VI-NEXT:    v_bfe_u32 v8, v4, 16, 1
+; VI-NEXT:    v_add_u32_e32 v8, vcc, v8, v4
+; VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; VI-NEXT:    v_add_u32_e32 v8, vcc, s4, v8
+; VI-NEXT:    v_alignbit_b32 v6, v7, v6, 16
+; VI-NEXT:    v_or_b32_e32 v7, 0x400000, v4
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
+; VI-NEXT:    v_cndmask_b32_e32 v4, v8, v7, vcc
+; VI-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; VI-NEXT:    v_add_u32_e32 v8, vcc, v8, v5
+; VI-NEXT:    v_add_u32_e32 v8, vcc, s4, v8
+; VI-NEXT:    v_or_b32_e32 v7, 0x400000, v5
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT:    v_cndmask_b32_e32 v5, v8, v7, vcc
+; VI-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; VI-NEXT:    v_add_u32_e32 v7, vcc, v7, v2
+; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; VI-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT:    v_alignbit_b32 v5, v5, v4, 16
+; VI-NEXT:    v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT:    v_cndmask_b32_e32 v2, v7, v4, vcc
+; VI-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; VI-NEXT:    v_add_u32_e32 v7, vcc, v7, v3
+; VI-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT:    v_or_b32_e32 v4, 0x400000, v3
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT:    v_cndmask_b32_e32 v3, v7, v4, vcc
+; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT:    v_alignbit_b32 v4, v3, v2, 16
+; VI-NEXT:    flat_store_dwordx3 v[0:1], v[4:6]
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %trunc = fptrunc <6 x float> %src to <6 x bfloat>
+  store <6 x bfloat> %trunc, ptr addrspace(1) %ptr
+  ret void
+}
+
+
+define void @global_fp_truncstore_v6f32_to_v6f16(ptr addrspace(1) %ptr, <6 x float> %src) {
+; SI-LABEL: global_fp_truncstore_v6f32_to_v6f16:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_or_b32_e32 v4, v4, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v7
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    v_or_b32_e32 v3, v2, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s4, s6
+; SI-NEXT:    s_mov_b32 s5, s6
+; SI-NEXT:    v_or_b32_e32 v2, v6, v2
+; SI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 offset:8
+; SI-NEXT:    buffer_store_dwordx2 v[3:4], v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: global_fp_truncstore_v6f32_to_v6f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_cvt_f16_f32_sdwa v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; VI-NEXT:    v_cvt_f16_f32_sdwa v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI-NEXT:    v_cvt_f16_f32_e32 v8, v4
+; VI-NEXT:    v_cvt_f16_f32_sdwa v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; VI-NEXT:    v_or_b32_e32 v4, v6, v7
+; VI-NEXT:    v_or_b32_e32 v3, v8, v5
+; VI-NEXT:    v_or_b32_e32 v2, v2, v9
+; VI-NEXT:    flat_store_dwordx3 v[0:1], v[2:4]
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %trunc = fptrunc <6 x float> %src to <6 x half>
+  store <6 x half> %trunc, ptr addrspace(1) %ptr
+  ret void
+}
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GCN: {{.*}}