Skip to content

Commit 2c50e4c

Browse files
shiltian, rampitec, jayfoad
authored
[AMDGPU] Add support for v_sat_pk4_i4_[i8,u8] on gfx1250 (llvm#149528)
Co-authored-by: Mekhanoshin, Stanislav <[email protected]>
Co-authored-by: Foad, Jay <[email protected]>
1 parent e11d28f commit 2c50e4c

25 files changed

+831
-0
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -684,6 +684,8 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_fp8, "hiIi", "nc", "gfx1250-insts")
684684
TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_bf8, "hiIi", "nc", "gfx1250-insts")
685685
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_fp8, "V2hs", "nc", "gfx1250-insts")
686686
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_bf8, "V2hs", "nc", "gfx1250-insts")
687+
TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
688+
TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_u4_u8, "UsUi", "nc", "gfx1250-insts")
687689

688690
// GFX1250 WMMA builtins
689691
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x4_f32, "V8fIbV2fIbV2fIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
66

77
typedef unsigned int uint;
8+
typedef unsigned short int ushort;
89
typedef unsigned int __attribute__((ext_vector_type(2))) uint2;
910
typedef half __attribute__((ext_vector_type(2))) half2;
1011

@@ -369,6 +370,30 @@ void test_cvt_pk_f16_bf8(global half2* out, short a)
369370
out[0] = __builtin_amdgcn_cvt_pk_f16_bf8(a);
370371
}
371372

373+
// CHECK-LABEL: @test_sat_pk4_i4_i8(
374+
// CHECK-NEXT: entry:
375+
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
376+
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
377+
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
378+
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
379+
// CHECK-NEXT: store ptr [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
380+
// CHECK-NEXT: store i32 [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4
381+
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
382+
// CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.amdgcn.sat.pk4.i4.i8(i32 [[TMP0]])
383+
// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
384+
// CHECK-NEXT: store i16 [[TMP1]], ptr [[TMP2]], align 2
385+
// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
386+
// CHECK-NEXT: [[TMP4:%.*]] = call i16 @llvm.amdgcn.sat.pk4.u4.u8(i32 [[TMP3]])
387+
// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
388+
// CHECK-NEXT: store i16 [[TMP4]], ptr [[TMP5]], align 2
389+
// CHECK-NEXT: ret void
390+
//
391+
// Codegen test covering both sat_pk4 builtins: each is called on the same
// unsigned int source and its unsigned short result is stored through `out`
// (the second store overwrites the first; only the emitted IR is checked).
void test_sat_pk4_i4_i8(ushort *out, uint src)
392+
{
393+
*out = __builtin_amdgcn_sat_pk4_i4_i8(src);
394+
*out = __builtin_amdgcn_sat_pk4_u4_u8(src);
395+
}
396+
372397
// CHECK-LABEL: @test_permlane16_swap(
373398
// CHECK-NEXT: entry:
374399
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3572,6 +3572,12 @@ def int_amdgcn_cvt_f16_bf8 : ClangBuiltin<"__builtin_amdgcn_cvt_f16_bf8">,
35723572
[llvm_i32_ty, llvm_i32_ty],
35733573
[IntrNoMem, ImmArg<ArgIndex<1>>]>;
35743574

3575+
// Intrinsic bound to __builtin_amdgcn_sat_pk4_i4_i8: one i32 operand, i16
// result, marked IntrNoMem and IntrSpeculatable.
// NOTE(review): presumably saturates four packed i8 values to i4 — confirm against the ISA doc.
def int_amdgcn_sat_pk4_i4_i8 : ClangBuiltin<"__builtin_amdgcn_sat_pk4_i4_i8">,
3576+
DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>;
3577+
3578+
// Intrinsic bound to __builtin_amdgcn_sat_pk4_u4_u8: one i32 operand, i16
// result, marked IntrNoMem and IntrSpeculatable.
// NOTE(review): presumably saturates four packed u8 values to u4 — confirm against the ISA doc.
def int_amdgcn_sat_pk4_u4_u8 : ClangBuiltin<"__builtin_amdgcn_sat_pk4_u4_u8">,
3579+
DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>;
3580+
35753581
//===----------------------------------------------------------------------===//
35763582
// Special Intrinsics for backend internal use only. No frontend
35773583
// should emit calls to these.

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4558,6 +4558,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
45584558
case Intrinsic::amdgcn_cvt_pk_u16:
45594559
case Intrinsic::amdgcn_cvt_pk_f16_fp8:
45604560
case Intrinsic::amdgcn_cvt_pk_f16_bf8:
4561+
case Intrinsic::amdgcn_sat_pk4_i4_i8:
4562+
case Intrinsic::amdgcn_sat_pk4_u4_u8:
45614563
case Intrinsic::amdgcn_fmed3:
45624564
case Intrinsic::amdgcn_cubeid:
45634565
case Intrinsic::amdgcn_cubema:

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2850,6 +2850,7 @@ def VOP_F16_I16 : VOPProfile <[f16, i16, untyped, untyped]>;
28502850
def VOP_I16_F16 : VOPProfile <[i16, f16, untyped, untyped]>;
28512851
def VOP_I16_I16 : VOPProfile <[i16, i16, untyped, untyped]>;
28522852
def VOP_BF16_BF16 : VOPProfile<[bf16, bf16, untyped, untyped]>;
2853+
def VOP1_I16_I32 : VOPProfile<[i16, i32, untyped, untyped]>;
28532854

28542855
def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>;
28552856
def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>;

llvm/lib/Target/AMDGPU/VOP1Instructions.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -803,6 +803,9 @@ let SubtargetPredicate = isGFX1250Plus in {
803803
def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f16_fp8, V_CVT_F16_FP8_fake16_e64, 1>;
804804
def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f16_bf8, V_CVT_F16_BF8_fake16_e64, 1>;
805805
}
806+
807+
defm V_SAT_PK4_I4_I8 : VOP1Inst_t16<"v_sat_pk4_i4_i8", VOP1_I16_I32, int_amdgcn_sat_pk4_i4_i8>;
808+
defm V_SAT_PK4_U4_U8 : VOP1Inst_t16<"v_sat_pk4_u4_u8", VOP1_I16_I32, int_amdgcn_sat_pk4_u4_u8>;
806809
} // End SubtargetPredicate = isGFX1250Plus
807810

808811
let SubtargetPredicate = isGFX10Plus in {
@@ -1158,6 +1161,8 @@ defm V_PERMLANE16_SWAP_B32 : VOP1_Real_OpSelIsDPP_gfx1250<0x049>;
11581161
defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x04a>;
11591162
defm V_PRNG_B32 : VOP1_Real_FULL<GFX1250Gen, 0x04b>;
11601163
defm V_CVT_F32_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16", "V_CVT_F32_BF16_gfx1250">;
1164+
defm V_SAT_PK4_I4_I8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x073>;
1165+
defm V_SAT_PK4_U4_U8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x074>;
11611166
defm V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x075>;
11621167
defm V_CVT_PK_F16_BF8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x076>;
11631168
defm V_CVT_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x077>;

0 commit comments

Comments
 (0)