Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions clang/include/clang/Basic/BuiltinsAMDGPU.def
Original file line number Diff line number Diff line change
Expand Up @@ -716,6 +716,15 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_bf16_fp4, "V8yUiUiIUi", "nc", "gfx
TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_fp8, "V8fV2UiUiIUi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_bf8, "V8fV2UiUiIUi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_fp4, "V8fUiUiIUi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp8_bf16, "V2UiV8yf", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_bf16, "V2UiV8yf", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp8_f16, "V2UiV8hf", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_f16, "V2UiV8hf", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp8_f32, "V2UiV8ff", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_f32, "V2UiV8ff", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f32, "UiV8ff", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f16, "UiV8hf", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16, "UiV8yf", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", "fp8e5m3-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", "fp8e5m3-insts")
TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
Expand Down
94 changes: 94 additions & 0 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
Original file line number Diff line number Diff line change
Expand Up @@ -674,6 +674,100 @@ void test_cvt_scale_pk(global half8 *outh8, global bfloat8 *outy8, uint2 src2,
*outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp4(src1, scale, 7);
}

// CHECK-LABEL: @test_cvt_scalef32_pk(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT2_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRCBF8_ADDR:%.*]] = alloca <8 x bfloat>, align 16, addrspace(5)
// CHECK-NEXT: [[SRCH8_ADDR:%.*]] = alloca <8 x half>, align 16, addrspace(5)
// CHECK-NEXT: [[SRCF8_ADDR:%.*]] = alloca <8 x float>, align 32, addrspace(5)
// CHECK-NEXT: [[OUT3_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRCBF16_ADDR:%.*]] = alloca <16 x bfloat>, align 32, addrspace(5)
// CHECK-NEXT: [[SRCH16_ADDR:%.*]] = alloca <16 x half>, align 32, addrspace(5)
// CHECK-NEXT: [[SRCF16_ADDR:%.*]] = alloca <16 x float>, align 64, addrspace(5)
// CHECK-NEXT: [[OUT1_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[OUT2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT2_ADDR]] to ptr
// CHECK-NEXT: [[SRCBF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCBF8_ADDR]] to ptr
// CHECK-NEXT: [[SRCH8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCH8_ADDR]] to ptr
// CHECK-NEXT: [[SRCF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCF8_ADDR]] to ptr
// CHECK-NEXT: [[OUT3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT3_ADDR]] to ptr
// CHECK-NEXT: [[SRCBF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCBF16_ADDR]] to ptr
// CHECK-NEXT: [[SRCH16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCH16_ADDR]] to ptr
// CHECK-NEXT: [[SRCF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCF16_ADDR]] to ptr
// CHECK-NEXT: [[OUT1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT1_ADDR]] to ptr
// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr
// CHECK-NEXT: store ptr addrspace(1) [[OUT2:%.*]], ptr [[OUT2_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <8 x bfloat> [[SRCBF8:%.*]], ptr [[SRCBF8_ADDR_ASCAST]], align 16
// CHECK-NEXT: store <8 x half> [[SRCH8:%.*]], ptr [[SRCH8_ADDR_ASCAST]], align 16
// CHECK-NEXT: store <8 x float> [[SRCF8:%.*]], ptr [[SRCF8_ADDR_ASCAST]], align 32
// CHECK-NEXT: store ptr addrspace(1) [[OUT3:%.*]], ptr [[OUT3_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <16 x bfloat> [[SRCBF16:%.*]], ptr [[SRCBF16_ADDR_ASCAST]], align 32
// CHECK-NEXT: store <16 x half> [[SRCH16:%.*]], ptr [[SRCH16_ADDR_ASCAST]], align 32
// CHECK-NEXT: store <16 x float> [[SRCF16:%.*]], ptr [[SRCF16_ADDR_ASCAST]], align 64
// CHECK-NEXT: store ptr addrspace(1) [[OUT1:%.*]], ptr [[OUT1_ADDR_ASCAST]], align 8
// CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <8 x bfloat>, ptr [[SRCBF8_ADDR_ASCAST]], align 16
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.bf16(<8 x bfloat> [[TMP0]], float [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 8
// CHECK-NEXT: [[TMP4:%.*]] = load <8 x bfloat>, ptr [[SRCBF8_ADDR_ASCAST]], align 16
// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.bf16(<8 x bfloat> [[TMP4]], float [[TMP5]])
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8
// CHECK-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr [[SRCH8_ADDR_ASCAST]], align 16
// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.f16(<8 x half> [[TMP8]], float [[TMP9]])
// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <2 x i32> [[TMP10]], ptr addrspace(1) [[TMP11]], align 8
// CHECK-NEXT: [[TMP12:%.*]] = load <8 x half>, ptr [[SRCH8_ADDR_ASCAST]], align 16
// CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP14:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.f16(<8 x half> [[TMP12]], float [[TMP13]])
// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <2 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 8
// CHECK-NEXT: [[TMP16:%.*]] = load <8 x float>, ptr [[SRCF8_ADDR_ASCAST]], align 32
// CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP18:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.f32(<8 x float> [[TMP16]], float [[TMP17]])
// CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <2 x i32> [[TMP18]], ptr addrspace(1) [[TMP19]], align 8
// CHECK-NEXT: [[TMP20:%.*]] = load <8 x float>, ptr [[SRCF8_ADDR_ASCAST]], align 32
// CHECK-NEXT: [[TMP21:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP22:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.f32(<8 x float> [[TMP20]], float [[TMP21]])
// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <2 x i32> [[TMP22]], ptr addrspace(1) [[TMP23]], align 8
// CHECK-NEXT: [[TMP24:%.*]] = load <8 x float>, ptr [[SRCF8_ADDR_ASCAST]], align 32
// CHECK-NEXT: [[TMP25:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.f32(<8 x float> [[TMP24]], float [[TMP25]])
// CHECK-NEXT: [[TMP27:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[TMP26]], ptr addrspace(1) [[TMP27]], align 4
// CHECK-NEXT: [[TMP28:%.*]] = load <8 x half>, ptr [[SRCH8_ADDR_ASCAST]], align 16
// CHECK-NEXT: [[TMP29:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP30:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.f16(<8 x half> [[TMP28]], float [[TMP29]])
// CHECK-NEXT: [[TMP31:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[TMP30]], ptr addrspace(1) [[TMP31]], align 4
// CHECK-NEXT: [[TMP32:%.*]] = load <8 x bfloat>, ptr [[SRCBF8_ADDR_ASCAST]], align 16
// CHECK-NEXT: [[TMP33:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP34:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.bf16(<8 x bfloat> [[TMP32]], float [[TMP33]])
// CHECK-NEXT: [[TMP35:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[TMP34]], ptr addrspace(1) [[TMP35]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 srcbf8, half8 srch8, float8 srcf8,
global uint3 *out3, bfloat16 srcbf16, half16 srch16, float16 srcf16,
global uint *out1, float scale)
{
*out2 = __builtin_amdgcn_cvt_scalef32_pk8_fp8_bf16(srcbf8, scale);
*out2 = __builtin_amdgcn_cvt_scalef32_pk8_bf8_bf16(srcbf8, scale);
*out2 = __builtin_amdgcn_cvt_scalef32_pk8_fp8_f16(srch8, scale);
*out2 = __builtin_amdgcn_cvt_scalef32_pk8_bf8_f16(srch8, scale);
*out2 = __builtin_amdgcn_cvt_scalef32_pk8_fp8_f32(srcf8, scale);
*out2 = __builtin_amdgcn_cvt_scalef32_pk8_bf8_f32(srcf8, scale);
*out1 = __builtin_amdgcn_cvt_scalef32_pk8_fp4_f32(srcf8, scale);
*out1 = __builtin_amdgcn_cvt_scalef32_pk8_fp4_f16(srch8, scale);
*out1 = __builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16(srcbf8, scale);
}

// CHECK-LABEL: @test_sat_pk4_i4_i8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
Expand Down
14 changes: 12 additions & 2 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -662,10 +662,17 @@ class AMDGPUCvtScaleF32ToFP6BF6Intrinsic<LLVMType DstTy, LLVMType Src0Ty, LLVMTy

def int_amdgcn_cvt_scalef32_pk32_fp6_f16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_pk32_fp6_f16">;
def int_amdgcn_cvt_scalef32_pk32_bf6_f16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_pk32_bf6_f16">;
def int_amdgcn_cvt_scalef32_pk8_fp8_bf16 : AMDGPUCvtScaleF32Intrinsic<llvm_v2i32_ty, llvm_v8bf16_ty, "cvt_scalef32_pk8_fp8_bf16">;
def int_amdgcn_cvt_scalef32_pk8_bf8_bf16 : AMDGPUCvtScaleF32Intrinsic<llvm_v2i32_ty, llvm_v8bf16_ty, "cvt_scalef32_pk8_bf8_bf16">;
def int_amdgcn_cvt_scalef32_pk32_fp6_bf16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32bf16_ty, "cvt_scalef32_pk32_fp6_bf16">;
def int_amdgcn_cvt_scalef32_pk32_bf6_bf16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32bf16_ty, "cvt_scalef32_pk32_bf6_bf16">;
def int_amdgcn_cvt_scalef32_2xpk16_fp6_f32 : AMDGPUCvtScaleF32ToFP6BF6Intrinsic<llvm_v6i32_ty, llvm_v16f32_ty, llvm_v16f32_ty, "cvt_scalef32_2xpk16_fp6_f32">;
def int_amdgcn_cvt_scalef32_2xpk16_bf6_f32 : AMDGPUCvtScaleF32ToFP6BF6Intrinsic<llvm_v6i32_ty, llvm_v16f32_ty, llvm_v16f32_ty, "cvt_scalef32_2xpk16_bf6_f32">;
def int_amdgcn_cvt_scalef32_pk8_fp8_f16 : AMDGPUCvtScaleF32Intrinsic<llvm_v2i32_ty, llvm_v8f16_ty, "cvt_scalef32_pk8_fp8_f16">;
def int_amdgcn_cvt_scalef32_pk8_bf8_f16 : AMDGPUCvtScaleF32Intrinsic<llvm_v2i32_ty, llvm_v8f16_ty, "cvt_scalef32_pk8_bf8_f16">;
def int_amdgcn_cvt_scalef32_pk8_fp8_f32 : AMDGPUCvtScaleF32Intrinsic<llvm_v2i32_ty, llvm_v8f32_ty, "cvt_scalef32_pk8_fp8_f32">;
def int_amdgcn_cvt_scalef32_pk8_bf8_f32 : AMDGPUCvtScaleF32Intrinsic<llvm_v2i32_ty, llvm_v8f32_ty, "cvt_scalef32_pk8_bf8_f32">;
def int_amdgcn_cvt_scalef32_pk8_fp4_f32 : AMDGPUCvtScaleF32Intrinsic<llvm_i32_ty, llvm_v8f32_ty, "cvt_scalef32_pk8_fp4_f32">;
def int_amdgcn_cvt_scalef32_pk8_fp4_f16 : AMDGPUCvtScaleF32Intrinsic<llvm_i32_ty, llvm_v8f16_ty, "cvt_scalef32_pk8_fp4_f16">;
def int_amdgcn_cvt_scalef32_pk8_fp4_bf16 : AMDGPUCvtScaleF32Intrinsic<llvm_i32_ty, llvm_v8bf16_ty, "cvt_scalef32_pk8_fp4_bf16">;

def int_amdgcn_cvt_scalef32_sr_pk32_bf6_bf16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32bf16_ty, "cvt_scalef32_sr_pk32_bf6_bf16">;
def int_amdgcn_cvt_scalef32_sr_pk32_bf6_f16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_sr_pk32_bf6_f16">;
Expand All @@ -674,6 +681,9 @@ def int_amdgcn_cvt_scalef32_sr_pk32_fp6_bf16 : AMDGPUCvtScaleF32SRIntrinsic<llvm
def int_amdgcn_cvt_scalef32_sr_pk32_fp6_f16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_sr_pk32_fp6_f16">;
def int_amdgcn_cvt_scalef32_sr_pk32_fp6_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32f32_ty, "cvt_scalef32_sr_pk32_fp6_f32">;

def int_amdgcn_cvt_scalef32_2xpk16_fp6_f32 : AMDGPUCvtScaleF32ToFP6BF6Intrinsic<llvm_v6i32_ty, llvm_v16f32_ty, llvm_v16f32_ty, "cvt_scalef32_2xpk16_fp6_f32">;
def int_amdgcn_cvt_scalef32_2xpk16_bf6_f32 : AMDGPUCvtScaleF32ToFP6BF6Intrinsic<llvm_v6i32_ty, llvm_v16f32_ty, llvm_v16f32_ty, "cvt_scalef32_2xpk16_bf6_f32">;

class AMDGPUCvtScaleFP4FP8BF8ToF1632Intrinsic<LLVMType DstTy, string name> : DefaultAttrsIntrinsic<
[DstTy],
[llvm_i32_ty, // src
Expand Down
9 changes: 9 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4603,6 +4603,15 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_cvt_scale_pk8_f32_fp8:
case Intrinsic::amdgcn_cvt_scale_pk8_f32_bf8:
case Intrinsic::amdgcn_cvt_scale_pk8_f32_fp4:
case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_bf16:
case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_bf16:
case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_f16:
case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_f16:
case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_f32:
case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_f32:
case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_f32:
case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_f16:
case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_bf16:
case Intrinsic::amdgcn_sat_pk4_i4_i8:
case Intrinsic::amdgcn_sat_pk4_u4_u8:
case Intrinsic::amdgcn_fmed3:
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -2955,6 +2955,12 @@ def VOP_V8BF16_I32_I32 : VOPProfile<[v8bf16, i32, i32, untyped]>;
def VOP_V16F32_V3I32_I32 : VOPProfile<[v16f32, v3i32, i32, untyped]>;
def VOP_V8F32_V2I32_I32 : VOPProfile<[v8f32, v2i32, i32, untyped]>;
def VOP_V8F32_I32_I32 : VOPProfile<[v8f32, i32, i32, untyped]>;
def VOP_V2I32_V8BF16_F32 : VOPProfile<[v2i32, v8bf16, f32, untyped]>;
def VOP_V2I32_V8F16_F32 : VOPProfile<[v2i32, v8f16, f32, untyped]>;
def VOP_V2I32_V8F32_F32 : VOPProfile<[v2i32, v8f32, f32, untyped]>;
def VOP_I32_V8F32_F32 : VOPProfile<[i32, v8f32, f32, untyped]>;
def VOP_I32_V8F16_F32 : VOPProfile<[i32, v8f16, f32, untyped]>;
def VOP_I32_V8BF16_F32 : VOPProfile<[i32, v8bf16, f32, untyped]>;
def VOP_I32_F32_I32_F32 : VOPProfile<[i32, f32, i32, f32]>;

def VOP_V6I32_V32BF16_I32_F32 : VOPProfile<[v6i32, v32bf16, i32, f32]>;
Expand Down
23 changes: 23 additions & 0 deletions llvm/lib/Target/AMDGPU/VOP3Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1778,6 +1778,20 @@ let SubtargetPredicate = isGFX1250Plus in {
defm V_CVT_SCALE_PK8_F32_FP4 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f32_fp4", VOP_V8F32_I32_I32, int_amdgcn_cvt_scale_pk8_f32_fp4>;
} // End ReadsModeReg = 0

let Constraints = "@earlyclobber $vdst" in {
let WaveSizePredicate = isWave32 in {
defm V_CVT_SCALEF32_PK8_FP8_BF16 : VOP3Inst<"v_cvt_scalef32_pk8_fp8_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8BF16_F32>, int_amdgcn_cvt_scalef32_pk8_fp8_bf16>;
defm V_CVT_SCALEF32_PK8_BF8_BF16 : VOP3Inst<"v_cvt_scalef32_pk8_bf8_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8BF16_F32>, int_amdgcn_cvt_scalef32_pk8_bf8_bf16>;
defm V_CVT_SCALEF32_PK8_FP8_F16 : VOP3Inst<"v_cvt_scalef32_pk8_fp8_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F16_F32>, int_amdgcn_cvt_scalef32_pk8_fp8_f16>;
defm V_CVT_SCALEF32_PK8_BF8_F16 : VOP3Inst<"v_cvt_scalef32_pk8_bf8_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F16_F32>, int_amdgcn_cvt_scalef32_pk8_bf8_f16>;
defm V_CVT_SCALEF32_PK8_FP8_F32 : VOP3Inst<"v_cvt_scalef32_pk8_fp8_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F32_F32>, int_amdgcn_cvt_scalef32_pk8_fp8_f32>;
defm V_CVT_SCALEF32_PK8_BF8_F32 : VOP3Inst<"v_cvt_scalef32_pk8_bf8_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F32_F32>, int_amdgcn_cvt_scalef32_pk8_bf8_f32>;
defm V_CVT_SCALEF32_PK8_FP4_F32 : VOP3Inst<"v_cvt_scalef32_pk8_fp4_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8F32_F32>, int_amdgcn_cvt_scalef32_pk8_fp4_f32>;
defm V_CVT_SCALEF32_PK8_FP4_F16 : VOP3Inst<"v_cvt_scalef32_pk8_fp4_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8F16_F32>, int_amdgcn_cvt_scalef32_pk8_fp4_f16>;
defm V_CVT_SCALEF32_PK8_FP4_BF16 : VOP3Inst<"v_cvt_scalef32_pk8_fp4_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8BF16_F32>, int_amdgcn_cvt_scalef32_pk8_fp4_bf16>;
} // End WaveSizePredicate = isWave32
} // End Constraints = "@earlyclobber $vdst"

let True16Predicate = UseRealTrue16Insts in {
def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_fp8_f16, V_CVT_SR_FP8_F16_t16_e64, f16>;
def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_bf8_f16, V_CVT_SR_BF8_F16_t16_e64, f16>;
Expand Down Expand Up @@ -2198,6 +2212,15 @@ defm V_CVT_SCALE_PK8_F32_FP8 : VOP3Only_ScaleSel_Real_gfx1250<0x2aa>;
defm V_CVT_SCALE_PK8_F16_BF8 : VOP3Only_ScaleSel_Real_gfx1250<0x2ab>;
defm V_CVT_SCALE_PK8_BF16_BF8 : VOP3Only_ScaleSel_Real_gfx1250<0x2ac>;
defm V_CVT_SCALE_PK8_F32_BF8 : VOP3Only_ScaleSel_Real_gfx1250<0x2ad>;
defm V_CVT_SCALEF32_PK8_FP4_F32 : VOP3Only_Real_Base_gfx1250<0x2b0>;
defm V_CVT_SCALEF32_PK8_FP4_F16 : VOP3Only_Real_Base_gfx1250<0x2b3>;
defm V_CVT_SCALEF32_PK8_FP8_BF16 : VOP3Only_Real_Base_gfx1250<0x2b4>;
defm V_CVT_SCALEF32_PK8_BF8_BF16 : VOP3Only_Real_Base_gfx1250<0x2b5>;
defm V_CVT_SCALEF32_PK8_FP4_BF16 : VOP3Only_Real_Base_gfx1250<0x2b8>;
defm V_CVT_SCALEF32_PK8_FP8_F32 : VOP3Only_Real_Base_gfx1250<0x2c3>;
defm V_CVT_SCALEF32_PK8_FP8_F16 : VOP3Only_Real_Base_gfx1250<0x2c4>;
defm V_CVT_SCALEF32_PK8_BF8_F32 : VOP3Only_Real_Base_gfx1250<0x2c5>;
defm V_CVT_SCALEF32_PK8_BF8_F16 : VOP3Only_Real_Base_gfx1250<0x2c6>;
defm V_CVT_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36d>;
defm V_CVT_SR_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36e>;
defm V_CVT_PK_F16_F32 : VOP3Only_Realtriple_gfx1250<0x36f>;
Expand Down
Loading