Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions clang/include/clang/Basic/BuiltinsAMDGPU.def
Original file line number Diff line number Diff line change
Expand Up @@ -745,6 +745,10 @@ TARGET_BUILTIN(__builtin_amdgcn_permlane_down, "iiii", "nc", "gfx1250-insts,wave
TARGET_BUILTIN(__builtin_amdgcn_permlane_xor, "iiii", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_permlane_idx_gen, "iii", "nc", "gfx1250-insts,wavefrontsize32")

// Packed-permute builtins, available only with the "tensor-cvt-lut-insts"
// target feature. Each takes two unsigned integer operands (32- or 64-bit,
// depending on the variant) followed by a 2 x uint vector, and returns a
// 2-, 3- or 4-element uint vector for the b4, b6 and b8 forms respectively.
TARGET_BUILTIN(__builtin_amdgcn_perm_pk16_b4_u4, "V2UiUiUiV2Ui", "nc", "tensor-cvt-lut-insts")
TARGET_BUILTIN(__builtin_amdgcn_perm_pk16_b6_u4, "V3UiUiULiV2Ui", "nc", "tensor-cvt-lut-insts")
TARGET_BUILTIN(__builtin_amdgcn_perm_pk16_b8_u4, "V4UiULiULiV2Ui", "nc", "tensor-cvt-lut-insts")

// GFX1250 WMMA builtins
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x4_f32, "V8fIbV2fIbV2fIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x32_bf16, "V8fIbV16yIbV16yIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
Expand Down
2 changes: 1 addition & 1 deletion clang/test/CodeGenOpenCL/amdgpu-features.cl
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@
// GFX1153: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1250: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32"
// GFX1250: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32"

// GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64"

Expand Down
55 changes: 55 additions & 0 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
Original file line number Diff line number Diff line change
Expand Up @@ -1070,6 +1070,61 @@ void test_permlane_idx_gen(global uint* out, uint src0, uint src1) {
*out = __builtin_amdgcn_permlane_idx_gen(src0, src1);
}

// CHECK-LABEL: @test_perm_pk(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[A32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[A64_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[B32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[B64_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[C_ADDR:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
// CHECK-NEXT: [[OUT2_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[OUT3_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[OUT4_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[A32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A32_ADDR]] to ptr
// CHECK-NEXT: [[A64_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A64_ADDR]] to ptr
// CHECK-NEXT: [[B32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B32_ADDR]] to ptr
// CHECK-NEXT: [[B64_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B64_ADDR]] to ptr
// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
// CHECK-NEXT: [[OUT2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT2_ADDR]] to ptr
// CHECK-NEXT: [[OUT3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT3_ADDR]] to ptr
// CHECK-NEXT: [[OUT4_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT4_ADDR]] to ptr
// CHECK-NEXT: store i32 [[A32:%.*]], ptr [[A32_ADDR_ASCAST]], align 4
// CHECK-NEXT: store i32 [[A64:%.*]], ptr [[A64_ADDR_ASCAST]], align 4
// CHECK-NEXT: store i32 [[B32:%.*]], ptr [[B32_ADDR_ASCAST]], align 4
// CHECK-NEXT: store i32 [[B64:%.*]], ptr [[B64_ADDR_ASCAST]], align 4
// CHECK-NEXT: store <2 x i32> [[C:%.*]], ptr [[C_ADDR_ASCAST]], align 8
// CHECK-NEXT: store ptr [[OUT2:%.*]], ptr [[OUT2_ADDR_ASCAST]], align 8
// CHECK-NEXT: store ptr [[OUT3:%.*]], ptr [[OUT3_ADDR_ASCAST]], align 8
// CHECK-NEXT: store ptr [[OUT4:%.*]], ptr [[OUT4_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A32_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B32_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[C_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.amdgcn.perm.pk16.b4.u4(i32 [[TMP0]], i32 [[TMP1]], <2 x i32> [[TMP2]])
// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT2_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <2 x i32> [[TMP3]], ptr [[TMP4]], align 8
// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[A32_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[B64_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[CONV:%.*]] = zext i32 [[TMP6]] to i64
// CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[C_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP8:%.*]] = call <3 x i32> @llvm.amdgcn.perm.pk16.b6.u4(i32 [[TMP5]], i64 [[CONV]], <2 x i32> [[TMP7]])
// CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[OUT3_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <3 x i32> [[TMP8]], ptr [[TMP9]], align 16
// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[A64_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[CONV1:%.*]] = zext i32 [[TMP10]] to i64
// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[B64_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[CONV2:%.*]] = zext i32 [[TMP11]] to i64
// CHECK-NEXT: [[TMP12:%.*]] = load <2 x i32>, ptr [[C_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.amdgcn.perm.pk16.b8.u4(i64 [[CONV1]], i64 [[CONV2]], <2 x i32> [[TMP12]])
// CHECK-NEXT: [[TMP14:%.*]] = load ptr, ptr [[OUT4_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <4 x i32> [[TMP13]], ptr [[TMP14]], align 16
// CHECK-NEXT: ret void
//
// Calls each perm_pk16 builtin once and stores the result, so the checks above
// can verify the mapping to the llvm.amdgcn.perm.pk16.{b4,b6,b8}.u4 intrinsics.
// NOTE(review): a64 and b64 are declared uint, so the 64-bit operands of the
// b6/b8 calls are produced by zero-extending 32-bit values (the CONV, CONV1 and
// CONV2 checks). Confirm whether ulong parameters were intended if the upper
// 32 bits are meant to be exercised.
void test_perm_pk(uint a32, uint a64, uint b32, uint b64, uint2 c, uint2 *out2, uint3 *out3, uint4 *out4) {
*out2 = __builtin_amdgcn_perm_pk16_b4_u4(a32, b32, c);
*out3 = __builtin_amdgcn_perm_pk16_b6_u4(a32, b64, c);
*out4 = __builtin_amdgcn_perm_pk16_b8_u4(a64, b64, c);
}

// CHECK-LABEL: @test_prefetch(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FPTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
Expand Down
12 changes: 12 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -3705,6 +3705,18 @@ def int_amdgcn_permlane_idx_gen : ClangBuiltin<"__builtin_amdgcn_permlane_idx_ge
[llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// Packed-permute intrinsics bound to the __builtin_amdgcn_perm_pk16_* Clang
// builtins. All three are IntrNoMem and IntrSpeculatable and take a <2 x i32>
// as their last operand; they differ in the widths of the first two operands
// and in the number of i32 elements returned.

// (i32, i32, <2 x i32>) -> <2 x i32>
def int_amdgcn_perm_pk16_b4_u4 : ClangBuiltin<"__builtin_amdgcn_perm_pk16_b4_u4">,
DefaultAttrsIntrinsic<[llvm_v2i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_v2i32_ty],
[IntrNoMem, IntrSpeculatable]>;

// (i32, i64, <2 x i32>) -> <3 x i32>
def int_amdgcn_perm_pk16_b6_u4 : ClangBuiltin<"__builtin_amdgcn_perm_pk16_b6_u4">,
DefaultAttrsIntrinsic<[llvm_v3i32_ty], [llvm_i32_ty, llvm_i64_ty, llvm_v2i32_ty],
[IntrNoMem, IntrSpeculatable]>;

// (i64, i64, <2 x i32>) -> <4 x i32>
def int_amdgcn_perm_pk16_b8_u4 : ClangBuiltin<"__builtin_amdgcn_perm_pk16_b8_u4">,
DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_i64_ty, llvm_i64_ty, llvm_v2i32_ty],
[IntrNoMem, IntrSpeculatable]>;

//===----------------------------------------------------------------------===//
// Special Intrinsics for backend internal use only. No frontend
// should emit calls to these.
Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -1160,6 +1160,12 @@ def FeatureTanhInsts : SubtargetFeature<"tanh-insts",
"Has v_tanh_f32/f16 instructions"
>;

// Subtarget feature exposed as "tensor-cvt-lut-insts"; it drives the
// HasTensorCvtLutInsts subtarget field and marks availability of the
// v_perm_pk16* instructions.
def FeatureTensorCvtLutInsts : SubtargetFeature<"tensor-cvt-lut-insts",
"HasTensorCvtLutInsts",
"true",
"Has v_perm_pk16* instructions"
>;

def FeatureTransposeLoadF4F6Insts : SubtargetFeature<"transpose-load-f4f6-insts",
"HasTransposeLoadF4F6Insts",
"true",
Expand Down Expand Up @@ -2030,6 +2036,7 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureDPPSrc1SGPR,
FeatureBitOp3Insts,
FeatureTanhInsts,
FeatureTensorCvtLutInsts,
FeatureTransposeLoadF4F6Insts,
FeatureBF16TransInsts,
FeatureBF16ConversionInsts,
Expand Down Expand Up @@ -2785,6 +2792,9 @@ def HasBitOp3Insts : Predicate<"Subtarget->hasBitOp3Insts()">,
def HasTanhInsts : Predicate<"Subtarget->hasTanhInsts()">,
AssemblerPredicate<(all_of FeatureTanhInsts)>;

// Predicate for instructions that need FeatureTensorCvtLutInsts: code
// generation checks Subtarget->hasTensorCvtLutInsts(), and the assembler
// requires the same feature to be enabled.
def HasTensorCvtLutInsts : Predicate<"Subtarget->hasTensorCvtLutInsts()">,
AssemblerPredicate<(all_of FeatureTensorCvtLutInsts)>;

def HasTransposeLoadF4F6Insts : Predicate<"Subtarget->hasTransposeLoadF4F6Insts()">,
AssemblerPredicate<(all_of FeatureTransposeLoadF4F6Insts)>;

Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4795,6 +4795,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8:
case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
case Intrinsic::amdgcn_perm_pk16_b4_u4:
case Intrinsic::amdgcn_perm_pk16_b6_u4:
case Intrinsic::amdgcn_perm_pk16_b8_u4:
return getDefaultMappingVOP(MI);
case Intrinsic::amdgcn_log:
case Intrinsic::amdgcn_exp2:
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool Has64BitLiterals = false;
bool HasBitOp3Insts = false;
bool HasTanhInsts = false;
bool HasTensorCvtLutInsts = false;
bool HasTransposeLoadF4F6Insts = false;
bool HasPrngInst = false;
bool HasBVHDualAndBVH8Insts = false;
Expand Down Expand Up @@ -1411,6 +1412,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,

bool hasTanhInsts() const { return HasTanhInsts; }

bool hasTensorCvtLutInsts() const { return HasTensorCvtLutInsts; }

bool hasAddPC64Inst() const { return GFX1250Insts; }

bool hasMinimum3Maximum3PKF16() const {
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -1774,6 +1774,7 @@ class getVALUDstForVT<ValueType VT, bit IsTrue16 = 0, bit IsVOP3Encoding = 0> {
!eq(VT.Size, 256) : VOPDstOperand<VReg_256>,
!eq(VT.Size, 192) : VOPDstOperand<VReg_192>,
!eq(VT.Size, 128) : VOPDstOperand<VReg_128>,
!eq(VT.Size, 96) : VOPDstOperand<VReg_96>,
!eq(VT.Size, 64) : VOPDstOperand<VReg_64>,
!eq(VT.Size, 32) : VOPDstOperand<VGPR_32>,
!eq(VT.Size, 16) : op16,
Expand Down Expand Up @@ -1924,6 +1925,7 @@ class getVOP3DPPSrcForVT<ValueType VT, bit IsFake16 = 1> {
!eq(VT, v2f16) : VCSrc_v2f16,
!eq(VT, v2bf16) : VCSrc_v2bf16,
!eq(VT, f32) : VCSrc_f32,
!eq(VT, v2i32) : VCSrc_v2b32,
1 : VCSrc_b32);
}

Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -1302,6 +1302,7 @@ def VCSrc_f64 : SrcRegOrImm9 <VS_64, "OPERAND_REG_INLINE_C_FP64">;
def VCSrc_v2b16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2INT16">;
def VCSrc_v2bf16: SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2BF16">;
def VCSrc_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2FP16">;
def VCSrc_v2b32 : SrcRegOrImm9 <VS_64, "OPERAND_REG_INLINE_C_V2INT32">;

// True 16 Operands
def VCSrcT_b16 : SrcRegOrImm9_t16 <"OPERAND_REG_INLINE_C_INT16">;
Expand Down
15 changes: 15 additions & 0 deletions llvm/lib/Target/AMDGPU/VOP3Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1726,6 +1726,12 @@ multiclass VOP3CvtScaleSelInst<string OpName, VOPProfile P, SDPatternOperator no
}
}

// VOP3 profiles used by the v_perm_pk16_* instructions. The value-type lists
// mirror the matching intrinsic signatures (result first, then the three
// sources). The VOP3 DPP extension and source modifiers are both disabled for
// these profiles.
let HasExtVOP3DPP = 0, HasModifiers = 0 in {
def VOP3_V2I32_I32_I32_V2I32 : VOP3_Profile<VOPProfile<[v2i32, i32, i32, v2i32]>>;
def VOP3_V3I32_I32_I64_V2I32 : VOP3_Profile<VOPProfile<[v3i32, i32, i64, v2i32]>>;
def VOP3_V4I32_I64_I64_V2I32 : VOP3_Profile<VOPProfile<[v4i32, i64, i64, v2i32]>>;
}

let Src0RC64 = VSrc_NoInline_v2f16 in {
def VOP3_CVT_PK_F8_F16_Profile : VOP3_Profile<VOP_I16_V2F16>;
def VOP3_CVT_PK_F8_F16_True16_Profile : VOP3_Profile_True16<VOP3_CVT_PK_F8_F16_Profile>;
Expand Down Expand Up @@ -1814,6 +1820,12 @@ let SubtargetPredicate = isGFX1250Plus in {
}
} // End SubtargetPredicate = isGFX1250Plus

// v_perm_pk16_{b4,b6,b8}_u4 pseudo instructions. Each is tied to its
// llvm.amdgcn.perm.pk16.* intrinsic and is only available when the subtarget
// has the tensor-cvt-lut-insts feature.
let SubtargetPredicate = HasTensorCvtLutInsts in {
defm V_PERM_PK16_B4_U4 : VOP3Inst<"v_perm_pk16_b4_u4", VOP3_V2I32_I32_I32_V2I32, int_amdgcn_perm_pk16_b4_u4>;
defm V_PERM_PK16_B6_U4 : VOP3Inst<"v_perm_pk16_b6_u4", VOP3_V3I32_I32_I64_V2I32, int_amdgcn_perm_pk16_b6_u4>;
defm V_PERM_PK16_B8_U4 : VOP3Inst<"v_perm_pk16_b8_u4", VOP3_V4I32_I64_I64_V2I32, int_amdgcn_perm_pk16_b8_u4>;
} // End SubtargetPredicate = HasTensorCvtLutInsts

class Cvt_Scale_Sr_F32ToBF16F16_Pat<SDPatternOperator node, VOP3_Pseudo inst, ValueType DstTy> : GCNPat<
(DstTy (node DstTy:$vdst_in, f32:$src0, i32:$src1, timm:$word_sel)),
(inst (DstSelToOpSelXForm $word_sel), $src0, 0, $src1, VGPR_32:$vdst_in)
Expand Down Expand Up @@ -2212,6 +2224,9 @@ let AssemblerPredicate = isGFX11Plus in {
}

// GFX1250 real encodings for v_perm_pk16_*. These are VOP3-only and their
// profiles disable the VOP3 DPP extension (HasExtVOP3DPP = 0), so the DPP
// remark below does not apply to them.
defm V_PERM_PK16_B4_U4 : VOP3Only_Real_Base_gfx1250<0x23f>;
defm V_PERM_PK16_B6_U4 : VOP3Only_Real_Base_gfx1250<0x242>;
defm V_PERM_PK16_B8_U4 : VOP3Only_Real_Base_gfx1250<0x243>;

// These instructions differ from GFX12 variant by supporting DPP:
defm V_LSHL_ADD_U64 : VOP3Only_Realtriple_gfx1250<0x252>;
defm V_ASHR_PK_I8_I32 : VOP3Only_Realtriple_gfx1250<0x290>;
defm V_ASHR_PK_U8_I32 : VOP3Only_Realtriple_gfx1250<0x291>;
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/TargetParser/TargetParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -444,6 +444,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
Features["bitop3-insts"] = true;
Features["prng-inst"] = true;
Features["tanh-insts"] = true;
Features["tensor-cvt-lut-insts"] = true;
Features["transpose-load-f4f6-insts"] = true;
Features["bf16-trans-insts"] = true;
Features["bf16-cvt-insts"] = true;
Expand Down
66 changes: 66 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.perm.pk.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s

declare <2 x i32> @llvm.amdgcn.perm.pk16.b4.u4(i32, i32, <2 x i32>)
declare <3 x i32> @llvm.amdgcn.perm.pk16.b6.u4(i32, i64, <2 x i32>)
declare <4 x i32> @llvm.amdgcn.perm.pk16.b8.u4(i64, i64, <2 x i32>)

; Selects llvm.amdgcn.perm.pk16.b4.u4 (i32, i32, <2 x i32>) and stores the
; <2 x i32> result. The shared GFX1250 prefix is used, so SelectionDAG and
; GlobalISel are expected to produce the same output.
define void @test_perm_pk16_b4_u4(i32 %a, i32 %b, <2 x i32> %c, ptr %out) {
; GFX1250-LABEL: test_perm_pk16_b4_u4:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_perm_pk16_b4_u4 v[0:1], v0, v1, v[2:3]
; GFX1250-NEXT: flat_store_b64 v[4:5], v[0:1] scope:SCOPE_SE
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = tail call <2 x i32> @llvm.amdgcn.perm.pk16.b4.u4(i32 %a, i32 %b, <2 x i32> %c)
store <2 x i32> %ret, ptr %out, align 8
ret void
}

; Selects llvm.amdgcn.perm.pk16.b6.u4 (i32, i64, <2 x i32>) and stores the
; <3 x i32> result. SelectionDAG and GlobalISel are checked under separate
; prefixes because the register copies ahead of the instruction differ.
define void @test_perm_pk16_b6_u4(i32 %a, i64 %b, <2 x i32> %c, ptr %out) {
; GFX1250-SDAG-LABEL: test_perm_pk16_b6_u4:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v9, v4
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v3, v2
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v6, v5
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_perm_pk16_b6_u4 v[0:2], v0, v[2:3], v[8:9]
; GFX1250-SDAG-NEXT: flat_store_b96 v[6:7], v[0:2] scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31]
;
; GFX1250-GISEL-LABEL: test_perm_pk16_b6_u4:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, v3 :: v_dual_mov_b32 v3, v4
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-GISEL-NEXT: v_perm_pk16_b6_u4 v[0:2], v0, v[8:9], v[2:3]
; GFX1250-GISEL-NEXT: flat_store_b96 v[4:5], v[0:2] scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31]
%ret = tail call <3 x i32> @llvm.amdgcn.perm.pk16.b6.u4(i32 %a, i64 %b, <2 x i32> %c)
store <3 x i32> %ret, ptr %out, align 16
ret void
}

; Selects llvm.amdgcn.perm.pk16.b8.u4 (i64, i64, <2 x i32>) and stores the
; <4 x i32> result. The shared GFX1250 prefix is used, so SelectionDAG and
; GlobalISel are expected to produce the same output.
define void @test_perm_pk16_b8_u4(i64 %a, i64 %b, <2 x i32> %c, ptr %out) {
; GFX1250-LABEL: test_perm_pk16_b8_u4:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_perm_pk16_b8_u4 v[0:3], v[0:1], v[2:3], v[4:5]
; GFX1250-NEXT: flat_store_b128 v[6:7], v[0:3] scope:SCOPE_SE
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = tail call <4 x i32> @llvm.amdgcn.perm.pk16.b8.u4(i64 %a, i64 %b, <2 x i32> %c)
store <4 x i32> %ret, ptr %out, align 16
ret void
}
Loading