Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions clang/include/clang/Basic/BuiltinsAMDGPU.def
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,9 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2bf16, "V2sV2s*3V2s", "t", "atom
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2f16, "V2hV2h*3V2h", "t", "atomic-ds-pk-add-16-insts")
TARGET_BUILTIN(__builtin_amdgcn_global_load_lds, "vv*1v*3IUiIiIUi", "t", "vmem-to-lds-load-insts")

// Builtins for the cvt_pk_norm f16 conversions. Each takes two half operands
// and returns a 2-element vector of short (i16 variant) or unsigned short
// (u16 variant). Both are gated on the "gfx9-insts" target feature.
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_norm_i16_f16, "V2sxx", "nc", "gfx9-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_norm_u16_f16, "V2Usxx", "nc", "gfx9-insts")

//===----------------------------------------------------------------------===//
// Deep learning builtins.
//===----------------------------------------------------------------------===//
Expand Down
16 changes: 16 additions & 0 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-gfx9.cl
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
typedef unsigned int uint;
typedef unsigned long ulong;
typedef short __attribute__((ext_vector_type(2))) short2;
typedef unsigned short __attribute__((ext_vector_type(2))) ushort2;

// CHECK-LABEL: @test_fmed3_f16
// CHECK: call half @llvm.amdgcn.fmed3.f16(half %a, half %b, half %c)
Expand All @@ -26,3 +28,17 @@ void test_groupstaticsize(global uint* out)
{
*out = __builtin_amdgcn_groupstaticsize();
}

// Verifies that __builtin_amdgcn_cvt_pk_norm_i16_f16 is emitted as a call to
// the llvm.amdgcn.cvt.pk.norm.i16.f16 intrinsic with both half operands
// passed through unchanged; the <2 x i16> result is stored to *out.
// CHECK-LABEL: define dso_local void @test_cvt_pk_norm_i16_f16(
// CHECK: call <2 x i16> @llvm.amdgcn.cvt.pk.norm.i16.f16(half %src0, half %src1)
void test_cvt_pk_norm_i16_f16(global short2* out, half src0, half src1)
{
*out = __builtin_amdgcn_cvt_pk_norm_i16_f16(src0, src1);
}

// CHECK-LABEL: define dso_local void @test_cvt_pk_norm_u16_f16(
// CHECK: call <2 x i16> @llvm.amdgcn.cvt.pk.norm.u16.f16(half %src0, half %src1)
void test_cvt_pk_norm_u16_f16(global ushort2* out, half src0, half src1)
{
*out = __builtin_amdgcn_cvt_pk_norm_u16_f16(src0, src1);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is missing an error test for targets that do not support these builtins.

}
12 changes: 12 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -2644,6 +2644,18 @@ def int_amdgcn_global_load_lds : AMDGPUGlobalLoadLDS;
def int_amdgcn_pops_exiting_wave_id :
DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrHasSideEffects]>;

// Intrinsic behind __builtin_amdgcn_cvt_pk_norm_i16_f16: takes two half
// operands and returns <2 x i16>. Declared IntrNoMem and IntrSpeculatable.
def int_amdgcn_cvt_pk_norm_i16_f16 :
ClangBuiltin<"__builtin_amdgcn_cvt_pk_norm_i16_f16">,
DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_half_ty, llvm_half_ty],
[IntrNoMem, IntrSpeculatable]
>;

// Intrinsic behind __builtin_amdgcn_cvt_pk_norm_u16_f16: takes two half
// operands and returns <2 x i16>. Declared IntrNoMem and IntrSpeculatable.
def int_amdgcn_cvt_pk_norm_u16_f16 :
ClangBuiltin<"__builtin_amdgcn_cvt_pk_norm_u16_f16">,
DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_half_ty, llvm_half_ty],
[IntrNoMem, IntrSpeculatable]
>;

//===----------------------------------------------------------------------===//
// GFX10 Intrinsics
//===----------------------------------------------------------------------===//
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4669,6 +4669,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8:
case Intrinsic::amdgcn_cvt_pk_norm_i16_f16:
case Intrinsic::amdgcn_cvt_pk_norm_u16_f16:
return getDefaultMappingVOP(MI);
case Intrinsic::amdgcn_log:
case Intrinsic::amdgcn_exp2:
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -2856,6 +2856,7 @@ def VOP_V32BF16_V6I32_F32 : VOPProfile <[v32bf16, v6i32, f32, untyped]>;
def VOP_V6I32_V32F16_F32 : VOPProfile<[v6i32, v32f16, f32, untyped]>;
def VOP_V6I32_V32BF16_F32 : VOPProfile<[v6i32, v32bf16, f32, untyped]>;
def VOP_V6I32_V16F32_V16F32_F32 : VOPProfile<[v6i32, v16f32, v16f32, f32]>;
def VOP_V2I16_F16_F16 : VOPProfile<[v2i16, f16, f16, untyped]>;
def VOP_V2F16_I32_F32 : VOPProfile<[v2f16, i32, f32, untyped]>;
def VOP_V2I16_F32_F32_F32 : VOPProfile<[v2i16, f32, f32, f32]>;
def VOP_V2I16_V2F16_F32 : VOPProfile<[v2i16, v2f16, f32, untyped]>;
Expand Down
7 changes: 5 additions & 2 deletions llvm/lib/Target/AMDGPU/VOP3Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -666,8 +666,11 @@ let isCommutable = 1 in {
defm V_MAD_I32_I16 : VOP3Inst_t16 <"v_mad_i32_i16", VOP_I32_I16_I16_I32>;
} // End isCommutable = 1

defm V_CVT_PKNORM_I16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_i16_f16", VOP_B32_F16_F16>;
defm V_CVT_PKNORM_U16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_u16_f16", VOP_B32_F16_F16>;
// The intrinsic is passed as the pattern-node argument of VOP3Inst_t16 (see
// the signatures noted below), and the profile's result type is v2i16 to
// match the intrinsic's <2 x i16> return type.
defm V_CVT_PKNORM_I16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_i16_f16", VOP_V2I16_F16_F16, int_amdgcn_cvt_pk_norm_i16_f16>;
defm V_CVT_PKNORM_U16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_u16_f16", VOP_V2I16_F16_F16, int_amdgcn_cvt_pk_norm_u16_f16>;
// Development note, multiclass signatures for reference:
// multiclass VOP3Inst<string OpName, VOPProfile P, SDPatternOperator node = null_frag>

// multiclass VOP3Inst_t16<string OpName, VOPProfile P, SDPatternOperator node = null_frag, SDPatternOperator node_t16 = node>

defm V_PACK_B32_F16 : VOP3Inst_t16 <"v_pack_b32_f16", VOP_B32_F16_F16>;

Expand Down
124 changes: 124 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.norm.i16.f16.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9-GISEL %s
Comment on lines +2 to +3
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9-GISEL %s


; Both operands are kernel arguments. The checks expect them to arrive in
; SGPRs, with the second operand copied into a VGPR (v_mov_b32) before
; v_cvt_pknorm_i16_f16; the result is bitcast to i32 and stored to %out.
define amdgpu_kernel void @cvt_pk_norm_i16_f16_sgpr(ptr addrspace(1) %out, half %a, half %b) {
; GFX9-SDAG-LABEL: cvt_pk_norm_i16_f16_sgpr:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-SDAG-NEXT: s_lshr_b32 s3, s2, 16
; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX9-SDAG-NEXT: v_cvt_pknorm_i16_f16 v1, s2, v1
; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: cvt_pk_norm_i16_f16_sgpr:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: s_lshr_b32 s3, s2, 16
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s3
; GFX9-GISEL-NEXT: v_cvt_pknorm_i16_f16 v0, s2, v0
; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
%r = call <2 x i16> @llvm.amdgcn.cvt.pk.norm.i16.f16(half %a, half %b)
%res = bitcast <2 x i16> %r to i32
store i32 %res, ptr addrspace(1) %out
ret void
}

; Both operands are the constants 1.0 and 2.0. The checks expect them to
; appear directly as operands of v_cvt_pknorm_i16_f16, with no separate
; materialization.
define amdgpu_kernel void @cvt_pk_norm_i16_f16_const(ptr addrspace(1) %out) {
; GFX9-SDAG-LABEL: cvt_pk_norm_i16_f16_const:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX9-SDAG-NEXT: v_cvt_pknorm_i16_f16 v1, 1.0, 2.0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: cvt_pk_norm_i16_f16_const:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-GISEL-NEXT: v_cvt_pknorm_i16_f16 v0, 1.0, 2.0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
%r = call <2 x i16> @llvm.amdgcn.cvt.pk.norm.i16.f16(half 1.0, half 2.0)
%res = bitcast <2 x i16> %r to i32
store i32 %res, ptr addrspace(1) %out
ret void
}

; Both operands are undef; the instruction is still expected to be emitted.
; NOTE(review): the source registers in the checks are presumably whatever
; each selector happens to pick for undef inputs -- confirm they are not
; meant to be significant.
define amdgpu_kernel void @cvt_pk_norm_i16_f16_undef(ptr addrspace(1) %out) {
; GFX9-SDAG-LABEL: cvt_pk_norm_i16_f16_undef:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-SDAG-NEXT: v_cvt_pknorm_i16_f16 v1, s0, v0
; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: cvt_pk_norm_i16_f16_undef:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: v_cvt_pknorm_i16_f16 v0, s0, s0
; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
%r = call <2 x i16> @llvm.amdgcn.cvt.pk.norm.i16.f16(half undef, half undef)
%res = bitcast <2 x i16> %r to i32
store i32 %res, ptr addrspace(1) %out
ret void
}

; Both operands are loaded with volatile loads from addresses indexed by
; workitem.id.x, so the checks expect VGPR operands; the result is stored to
; the %out element at the same index.
define amdgpu_kernel void @cvt_pk_norm_i16_f16_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) {
; GFX9-SDAG-LABEL: cvt_pk_norm_i16_f16_vgpr:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-SDAG-NEXT: global_load_ushort v2, v1, s[2:3] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: global_load_ushort v3, v1, s[6:7] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_cvt_pknorm_i16_f16 v1, v2, v3
; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: cvt_pk_norm_i16_f16_vgpr:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: global_load_ushort v2, v1, s[2:3] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: global_load_ushort v3, v1, s[6:7] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_cvt_pknorm_i16_f16 v1, v2, v3
; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds half, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds half, ptr addrspace(1) %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile half, ptr addrspace(1) %a.gep
%b = load volatile half, ptr addrspace(1) %b.gep
%r = call <2 x i16> @llvm.amdgcn.cvt.pk.norm.i16.f16(half %a, half %b)
%res = bitcast <2 x i16> %r to i32
store i32 %res, ptr addrspace(1) %out.gep
ret void
}
124 changes: 124 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.norm.u16.f16.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9-GISEL %s

; Both operands are kernel arguments. The checks expect them to arrive in
; SGPRs, with the second operand copied into a VGPR (v_mov_b32) before
; v_cvt_pknorm_u16_f16; the result is bitcast to i32 and stored to %out.
define amdgpu_kernel void @cvt_pk_norm_u16_f16_sgpr(ptr addrspace(1) %out, half %a, half %b) {
; GFX9-SDAG-LABEL: cvt_pk_norm_u16_f16_sgpr:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-SDAG-NEXT: s_lshr_b32 s3, s2, 16
; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX9-SDAG-NEXT: v_cvt_pknorm_u16_f16 v1, s2, v1
; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: cvt_pk_norm_u16_f16_sgpr:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: s_lshr_b32 s3, s2, 16
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s3
; GFX9-GISEL-NEXT: v_cvt_pknorm_u16_f16 v0, s2, v0
; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
%r = call <2 x i16> @llvm.amdgcn.cvt.pk.norm.u16.f16(half %a, half %b)
%res = bitcast <2 x i16> %r to i32
store i32 %res, ptr addrspace(1) %out
ret void
}

; Both operands are the constants 1.0 and 2.0. The checks expect them to
; appear directly as operands of v_cvt_pknorm_u16_f16, with no separate
; materialization.
define amdgpu_kernel void @cvt_pk_norm_u16_f16_const(ptr addrspace(1) %out) {
; GFX9-SDAG-LABEL: cvt_pk_norm_u16_f16_const:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX9-SDAG-NEXT: v_cvt_pknorm_u16_f16 v1, 1.0, 2.0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: cvt_pk_norm_u16_f16_const:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-GISEL-NEXT: v_cvt_pknorm_u16_f16 v0, 1.0, 2.0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
%r = call <2 x i16> @llvm.amdgcn.cvt.pk.norm.u16.f16(half 1.0, half 2.0)
%res = bitcast <2 x i16> %r to i32
store i32 %res, ptr addrspace(1) %out
ret void
}

; Both operands are undef; the instruction is still expected to be emitted.
; NOTE(review): the source registers in the checks are presumably whatever
; each selector happens to pick for undef inputs -- confirm they are not
; meant to be significant.
define amdgpu_kernel void @cvt_pk_norm_u16_f16_undef(ptr addrspace(1) %out) {
; GFX9-SDAG-LABEL: cvt_pk_norm_u16_f16_undef:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-SDAG-NEXT: v_cvt_pknorm_u16_f16 v1, s0, v0
; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: cvt_pk_norm_u16_f16_undef:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: v_cvt_pknorm_u16_f16 v0, s0, s0
; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
%r = call <2 x i16> @llvm.amdgcn.cvt.pk.norm.u16.f16(half undef, half undef)
%res = bitcast <2 x i16> %r to i32
store i32 %res, ptr addrspace(1) %out
ret void
}

; Both operands are loaded with volatile loads from addresses indexed by
; workitem.id.x, so the checks expect VGPR operands; the result is stored to
; the %out element at the same index.
define amdgpu_kernel void @cvt_pk_norm_u16_f16_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) {
; GFX9-SDAG-LABEL: cvt_pk_norm_u16_f16_vgpr:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-SDAG-NEXT: global_load_ushort v2, v1, s[2:3] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: global_load_ushort v3, v1, s[6:7] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_cvt_pknorm_u16_f16 v1, v2, v3
; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: cvt_pk_norm_u16_f16_vgpr:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: global_load_ushort v2, v1, s[2:3] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: global_load_ushort v3, v1, s[6:7] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_cvt_pknorm_u16_f16 v1, v2, v3
; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds half, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds half, ptr addrspace(1) %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile half, ptr addrspace(1) %a.gep
%b = load volatile half, ptr addrspace(1) %b.gep
%r = call <2 x i16> @llvm.amdgcn.cvt.pk.norm.u16.f16(half %a, half %b)
%res = bitcast <2 x i16> %r to i32
store i32 %res, ptr addrspace(1) %out.gep
ret void
}
Loading