-
Notifications
You must be signed in to change notification settings - Fork 15.4k
[LLVM] Add intrinsics for v_cvt_pk_norm_{i16,u16}_f16 #135631
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
|
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-ir Author: Acim Maravic (Acim-Maravic) Changes: Added builtin and intrinsic for v_cvt_pk_norm_i16_f16 and v_cvt_pk_norm_u16_f16. Full diff: https://github.com/llvm/llvm-project/pull/135631.diff 8 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 39fef9e4601f8..0f3789d282304 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -259,6 +259,9 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2bf16, "V2sV2s*3V2s", "t", "atom
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2f16, "V2hV2h*3V2h", "t", "atomic-ds-pk-add-16-insts")
TARGET_BUILTIN(__builtin_amdgcn_global_load_lds, "vv*1v*3IUiIiIUi", "t", "vmem-to-lds-load-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_norm_i16_f16, "V2sxx", "nc", "gfx9-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_norm_u16_f16, "V2Usxx", "nc", "gfx9-insts")
+
//===----------------------------------------------------------------------===//
// Deep learning builtins.
//===----------------------------------------------------------------------===//
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx9.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx9.cl
index 87f2da20a21a6..06417a693d303 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx9.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx9.cl
@@ -5,6 +5,8 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
typedef unsigned int uint;
typedef unsigned long ulong;
+typedef short __attribute__((ext_vector_type(2))) short2;
+typedef unsigned short __attribute__((ext_vector_type(2))) ushort2;
// CHECK-LABEL: @test_fmed3_f16
// CHECK: call half @llvm.amdgcn.fmed3.f16(half %a, half %b, half %c)
@@ -26,3 +28,17 @@ void test_groupstaticsize(global uint* out)
{
*out = __builtin_amdgcn_groupstaticsize();
}
+
+// CHECK-LABEL: define dso_local void @test_cvt_pk_norm_i16_f16(
+// CHECK: call <2 x i16> @llvm.amdgcn.cvt.pk.norm.i16.f16(half %src0, half %src1)
+void test_cvt_pk_norm_i16_f16(global short2* out, half src0, half src1)
+{
+ *out = __builtin_amdgcn_cvt_pk_norm_i16_f16(src0, src1);
+}
+
+// CHECK-LABEL: define dso_local void @test_cvt_pk_norm_u16_f16(
+// CHECK: call <2 x i16> @llvm.amdgcn.cvt.pk.norm.u16.f16(half %src0, half %src1)
+void test_cvt_pk_norm_u16_f16(global ushort2* out, half src0, half src1)
+{
+ *out = __builtin_amdgcn_cvt_pk_norm_u16_f16(src0, src1);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 217e43fcce4fd..60904e9202238 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2644,6 +2644,18 @@ def int_amdgcn_global_load_lds : AMDGPUGlobalLoadLDS;
def int_amdgcn_pops_exiting_wave_id :
DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrHasSideEffects]>;
+def int_amdgcn_cvt_pk_norm_i16_f16 :
+ ClangBuiltin<"__builtin_amdgcn_cvt_pk_norm_i16_f16">,
+ DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_half_ty, llvm_half_ty],
+ [IntrNoMem, IntrSpeculatable]
+>;
+
+def int_amdgcn_cvt_pk_norm_u16_f16 :
+ ClangBuiltin<"__builtin_amdgcn_cvt_pk_norm_u16_f16">,
+ DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_half_ty, llvm_half_ty],
+ [IntrNoMem, IntrSpeculatable]
+>;
+
//===----------------------------------------------------------------------===//
// GFX10 Intrinsics
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 1d0e81db5a5db..8d35721d6df8f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4669,6 +4669,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8:
+ case Intrinsic::amdgcn_cvt_pk_norm_i16_f16:
+ case Intrinsic::amdgcn_cvt_pk_norm_u16_f16:
return getDefaultMappingVOP(MI);
case Intrinsic::amdgcn_log:
case Intrinsic::amdgcn_exp2:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 23a7f508dcda2..f7322d0a85434 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2856,6 +2856,7 @@ def VOP_V32BF16_V6I32_F32 : VOPProfile <[v32bf16, v6i32, f32, untyped]>;
def VOP_V6I32_V32F16_F32 : VOPProfile<[v6i32, v32f16, f32, untyped]>;
def VOP_V6I32_V32BF16_F32 : VOPProfile<[v6i32, v32bf16, f32, untyped]>;
def VOP_V6I32_V16F32_V16F32_F32 : VOPProfile<[v6i32, v16f32, v16f32, f32]>;
+def VOP_V2I16_F16_F16 : VOPProfile<[v2i16, f16, f16, untyped]>;
def VOP_V2F16_I32_F32 : VOPProfile<[v2f16, i32, f32, untyped]>;
def VOP_V2I16_F32_F32_F32 : VOPProfile<[v2i16, f32, f32, f32]>;
def VOP_V2I16_V2F16_F32 : VOPProfile<[v2i16, v2f16, f32, untyped]>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 8686a85620a17..1f33827a31914 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -666,8 +666,11 @@ let isCommutable = 1 in {
defm V_MAD_I32_I16 : VOP3Inst_t16 <"v_mad_i32_i16", VOP_I32_I16_I16_I32>;
} // End isCommutable = 1
-defm V_CVT_PKNORM_I16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_i16_f16", VOP_B32_F16_F16>;
-defm V_CVT_PKNORM_U16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_u16_f16", VOP_B32_F16_F16>;
+defm V_CVT_PKNORM_I16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_i16_f16", VOP_V2I16_F16_F16, int_amdgcn_cvt_pk_norm_i16_f16>;
+defm V_CVT_PKNORM_U16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_u16_f16", VOP_V2I16_F16_F16, int_amdgcn_cvt_pk_norm_u16_f16>;
+// multiclass VOP3Inst<string OpName, VOPProfile P, SDPatternOperator node = null_frag> {
+
+// multiclass VOP3Inst_t16<string OpName, VOPProfile P, SDPatternOperator node = null_frag, SDPatternOperator node_t16 = node>
defm V_PACK_B32_F16 : VOP3Inst_t16 <"v_pack_b32_f16", VOP_B32_F16_F16>;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.norm.i16.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.norm.i16.f16.ll
new file mode 100644
index 0000000000000..ce91b3fc65b70
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.norm.i16.f16.ll
@@ -0,0 +1,124 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9-GISEL %s
+
+define amdgpu_kernel void @cvt_pk_norm_i16_f16_sgpr(ptr addrspace(1) %out, half %a, half %b) {
+; GFX9-SDAG-LABEL: cvt_pk_norm_i16_f16_sgpr:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: s_lshr_b32 s3, s2, 16
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-SDAG-NEXT: v_cvt_pknorm_i16_f16 v1, s2, v1
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: s_endpgm
+;
+; GFX9-GISEL-LABEL: cvt_pk_norm_i16_f16_sgpr:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_lshr_b32 s3, s2, 16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-GISEL-NEXT: v_cvt_pknorm_i16_f16 v0, s2, v0
+; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT: s_endpgm
+ %r = call <2 x i16> @llvm.amdgcn.cvt.pk.norm.i16.f16(half %a, half %b)
+ %res = bitcast <2 x i16> %r to i32
+ store i32 %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @cvt_pk_norm_i16_f16_const(ptr addrspace(1) %out) {
+; GFX9-SDAG-LABEL: cvt_pk_norm_i16_f16_const:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-SDAG-NEXT: v_cvt_pknorm_i16_f16 v1, 1.0, 2.0
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: s_endpgm
+;
+; GFX9-GISEL-LABEL: cvt_pk_norm_i16_f16_const:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-GISEL-NEXT: v_cvt_pknorm_i16_f16 v0, 1.0, 2.0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT: s_endpgm
+ %r = call <2 x i16> @llvm.amdgcn.cvt.pk.norm.i16.f16(half 1.0, half 2.0)
+ %res = bitcast <2 x i16> %r to i32
+ store i32 %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @cvt_pk_norm_i16_f16_undef(ptr addrspace(1) %out) {
+; GFX9-SDAG-LABEL: cvt_pk_norm_i16_f16_undef:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_cvt_pknorm_i16_f16 v1, s0, v0
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: s_endpgm
+;
+; GFX9-GISEL-LABEL: cvt_pk_norm_i16_f16_undef:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_cvt_pknorm_i16_f16 v0, s0, s0
+; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT: s_endpgm
+ %r = call <2 x i16> @llvm.amdgcn.cvt.pk.norm.i16.f16(half undef, half undef)
+ %res = bitcast <2 x i16> %r to i32
+ store i32 %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @cvt_pk_norm_i16_f16_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) {
+; GFX9-SDAG-LABEL: cvt_pk_norm_i16_f16_vgpr:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: global_load_ushort v2, v1, s[2:3] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_ushort v3, v1, s[6:7] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: v_cvt_pknorm_i16_f16 v1, v2, v3
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: s_endpgm
+;
+; GFX9-GISEL-LABEL: cvt_pk_norm_i16_f16_vgpr:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: global_load_ushort v2, v1, s[2:3] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_ushort v3, v1, s[6:7] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: v_cvt_pknorm_i16_f16 v1, v2, v3
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds half, ptr addrspace(1) %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds half, ptr addrspace(1) %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+ %a = load volatile half, ptr addrspace(1) %a.gep
+ %b = load volatile half, ptr addrspace(1) %b.gep
+ %r = call <2 x i16> @llvm.amdgcn.cvt.pk.norm.i16.f16(half %a, half %b)
+ %res = bitcast <2 x i16> %r to i32
+ store i32 %res, ptr addrspace(1) %out.gep
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.norm.u16.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.norm.u16.f16.ll
new file mode 100644
index 0000000000000..276c82b8d2cf8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.norm.u16.f16.ll
@@ -0,0 +1,124 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9-GISEL %s
+
+define amdgpu_kernel void @cvt_pk_norm_u16_f16_sgpr(ptr addrspace(1) %out, half %a, half %b) {
+; GFX9-SDAG-LABEL: cvt_pk_norm_u16_f16_sgpr:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: s_lshr_b32 s3, s2, 16
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-SDAG-NEXT: v_cvt_pknorm_u16_f16 v1, s2, v1
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: s_endpgm
+;
+; GFX9-GISEL-LABEL: cvt_pk_norm_u16_f16_sgpr:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_lshr_b32 s3, s2, 16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-GISEL-NEXT: v_cvt_pknorm_u16_f16 v0, s2, v0
+; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT: s_endpgm
+ %r = call <2 x i16> @llvm.amdgcn.cvt.pk.norm.u16.f16(half %a, half %b)
+ %res = bitcast <2 x i16> %r to i32
+ store i32 %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @cvt_pk_norm_u16_f16_const(ptr addrspace(1) %out) {
+; GFX9-SDAG-LABEL: cvt_pk_norm_u16_f16_const:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-SDAG-NEXT: v_cvt_pknorm_u16_f16 v1, 1.0, 2.0
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: s_endpgm
+;
+; GFX9-GISEL-LABEL: cvt_pk_norm_u16_f16_const:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-GISEL-NEXT: v_cvt_pknorm_u16_f16 v0, 1.0, 2.0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT: s_endpgm
+ %r = call <2 x i16> @llvm.amdgcn.cvt.pk.norm.u16.f16(half 1.0, half 2.0)
+ %res = bitcast <2 x i16> %r to i32
+ store i32 %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @cvt_pk_norm_u16_f16_undef(ptr addrspace(1) %out) {
+; GFX9-SDAG-LABEL: cvt_pk_norm_u16_f16_undef:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_cvt_pknorm_u16_f16 v1, s0, v0
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: s_endpgm
+;
+; GFX9-GISEL-LABEL: cvt_pk_norm_u16_f16_undef:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_cvt_pknorm_u16_f16 v0, s0, s0
+; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT: s_endpgm
+ %r = call <2 x i16> @llvm.amdgcn.cvt.pk.norm.u16.f16(half undef, half undef)
+ %res = bitcast <2 x i16> %r to i32
+ store i32 %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @cvt_pk_norm_u16_f16_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) {
+; GFX9-SDAG-LABEL: cvt_pk_norm_u16_f16_vgpr:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: global_load_ushort v2, v1, s[2:3] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_load_ushort v3, v1, s[6:7] glc
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: v_cvt_pknorm_u16_f16 v1, v2, v3
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: s_endpgm
+;
+; GFX9-GISEL-LABEL: cvt_pk_norm_u16_f16_vgpr:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: global_load_ushort v2, v1, s[2:3] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: global_load_ushort v3, v1, s[6:7] glc
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: v_cvt_pknorm_u16_f16 v1, v2, v3
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds half, ptr addrspace(1) %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds half, ptr addrspace(1) %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+ %a = load volatile half, ptr addrspace(1) %a.gep
+ %b = load volatile half, ptr addrspace(1) %b.gep
+ %r = call <2 x i16> @llvm.amdgcn.cvt.pk.norm.u16.f16(half %a, half %b)
+ %res = bitcast <2 x i16> %r to i32
+ store i32 %res, ptr addrspace(1) %out.gep
+ ret void
+}
|
|
Check code formatting job is failing in a weird way. I can't work out what the issue is. |
I have checked locally, and it seems that the whole AMDGPURegisterBankInfo.cpp is not clang-formatted... But my change did not introduce any new formatting issues... How should I proceed? |
Added builtin and intrinsic for v_cvt_pk_norm_i16_f16 and v_cvt_pk_norm_u16_f16
86976f2 to
df4b39e
Compare
You can test this locally with the following command: git diff -U0 --pickaxe-regex -S '([^a-zA-Z0-9#_-]undef[^a-zA-Z0-9_-]|UndefValue::get)' 'HEAD~1' HEAD llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.norm.i16.f16.ll llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.norm.u16.f16.ll llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp The following files introduce new uses of undef:
Undef is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields `undef`. In tests, avoid using `undef`. For example, this is considered a bad practice: define void @fn() {
...
br i1 undef, ...
}Please use the following instead: define void @fn(i1 %cond) {
...
br i1 %cond, ...
}Please refer to the Undefined Behavior Manual for more information. |
|
Looks like the undefs are causing some issues. |
arsenm
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There seem to be scalar versions of these instructions, but no matching intrinsics? I would hope we would have both (and combines to form the vector case from the scalar uses, and breaking the vector case into scalar for partially undef)
| ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9-SDAG %s | ||
| ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9-GISEL %s |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9-SDAG %s | |
| ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9-GISEL %s | |
| ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9-SDAG %s | |
| ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9-GISEL %s |
| // CHECK: call <2 x i16> @llvm.amdgcn.cvt.pk.norm.u16.f16(half %src0, half %src1) | ||
| void test_cvt_pk_norm_u16_f16(global ushort2* out, half src0, half src1) | ||
| { | ||
| *out = __builtin_amdgcn_cvt_pk_norm_u16_f16(src0, src1); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Missing error test on unsupported targets
These cases should just use poison (but this is also more relevant to future instcombine handling of these) |
Added builtin and intrinsic for v_cvt_pk_norm_i16_f16 and v_cvt_pk_norm_u16_f16