Skip to content

Commit 0988510

Browse files
authored
[AMDGPU] gfx1250 v_perm_pk16_* instructions (#151773)
1 parent 8039486 commit 0988510

File tree

15 files changed, with 308 additions and 1 deletion.

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -745,6 +745,10 @@ TARGET_BUILTIN(__builtin_amdgcn_permlane_down, "iiii", "nc", "gfx1250-insts,wave
745745
TARGET_BUILTIN(__builtin_amdgcn_permlane_xor, "iiii", "nc", "gfx1250-insts,wavefrontsize32")
746746
TARGET_BUILTIN(__builtin_amdgcn_permlane_idx_gen, "iii", "nc", "gfx1250-insts,wavefrontsize32")
747747

748+
TARGET_BUILTIN(__builtin_amdgcn_perm_pk16_b4_u4, "V2UiUiUiV2Ui", "nc", "tensor-cvt-lut-insts")
749+
TARGET_BUILTIN(__builtin_amdgcn_perm_pk16_b6_u4, "V3UiUiULiV2Ui", "nc", "tensor-cvt-lut-insts")
750+
TARGET_BUILTIN(__builtin_amdgcn_perm_pk16_b8_u4, "V4UiULiULiV2Ui", "nc", "tensor-cvt-lut-insts")
751+
748752
// GFX1250 WMMA builtins
749753
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x4_f32, "V8fIbV2fIbV2fIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
750754
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x32_bf16, "V8fIbV16yIbV16yIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")

clang/test/CodeGenOpenCL/amdgpu-features.cl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@
108108
// GFX1153: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
109109
// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
110110
// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
111-
// GFX1250: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32"
111+
// GFX1250: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32"
112112

113113
// GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64"
114114

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1070,6 +1070,61 @@ void test_permlane_idx_gen(global uint* out, uint src0, uint src1) {
10701070
*out = __builtin_amdgcn_permlane_idx_gen(src0, src1);
10711071
}
10721072

1073+
// CHECK-LABEL: @test_perm_pk(
1074+
// CHECK-NEXT: entry:
1075+
// CHECK-NEXT: [[A32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1076+
// CHECK-NEXT: [[A64_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1077+
// CHECK-NEXT: [[B32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1078+
// CHECK-NEXT: [[B64_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1079+
// CHECK-NEXT: [[C_ADDR:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
1080+
// CHECK-NEXT: [[OUT2_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1081+
// CHECK-NEXT: [[OUT3_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1082+
// CHECK-NEXT: [[OUT4_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1083+
// CHECK-NEXT: [[A32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A32_ADDR]] to ptr
1084+
// CHECK-NEXT: [[A64_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A64_ADDR]] to ptr
1085+
// CHECK-NEXT: [[B32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B32_ADDR]] to ptr
1086+
// CHECK-NEXT: [[B64_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B64_ADDR]] to ptr
1087+
// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
1088+
// CHECK-NEXT: [[OUT2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT2_ADDR]] to ptr
1089+
// CHECK-NEXT: [[OUT3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT3_ADDR]] to ptr
1090+
// CHECK-NEXT: [[OUT4_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT4_ADDR]] to ptr
1091+
// CHECK-NEXT: store i32 [[A32:%.*]], ptr [[A32_ADDR_ASCAST]], align 4
1092+
// CHECK-NEXT: store i32 [[A64:%.*]], ptr [[A64_ADDR_ASCAST]], align 4
1093+
// CHECK-NEXT: store i32 [[B32:%.*]], ptr [[B32_ADDR_ASCAST]], align 4
1094+
// CHECK-NEXT: store i32 [[B64:%.*]], ptr [[B64_ADDR_ASCAST]], align 4
1095+
// CHECK-NEXT: store <2 x i32> [[C:%.*]], ptr [[C_ADDR_ASCAST]], align 8
1096+
// CHECK-NEXT: store ptr [[OUT2:%.*]], ptr [[OUT2_ADDR_ASCAST]], align 8
1097+
// CHECK-NEXT: store ptr [[OUT3:%.*]], ptr [[OUT3_ADDR_ASCAST]], align 8
1098+
// CHECK-NEXT: store ptr [[OUT4:%.*]], ptr [[OUT4_ADDR_ASCAST]], align 8
1099+
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A32_ADDR_ASCAST]], align 4
1100+
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B32_ADDR_ASCAST]], align 4
1101+
// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[C_ADDR_ASCAST]], align 8
1102+
// CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.amdgcn.perm.pk16.b4.u4(i32 [[TMP0]], i32 [[TMP1]], <2 x i32> [[TMP2]])
1103+
// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT2_ADDR_ASCAST]], align 8
1104+
// CHECK-NEXT: store <2 x i32> [[TMP3]], ptr [[TMP4]], align 8
1105+
// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[A32_ADDR_ASCAST]], align 4
1106+
// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[B64_ADDR_ASCAST]], align 4
1107+
// CHECK-NEXT: [[CONV:%.*]] = zext i32 [[TMP6]] to i64
1108+
// CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[C_ADDR_ASCAST]], align 8
1109+
// CHECK-NEXT: [[TMP8:%.*]] = call <3 x i32> @llvm.amdgcn.perm.pk16.b6.u4(i32 [[TMP5]], i64 [[CONV]], <2 x i32> [[TMP7]])
1110+
// CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[OUT3_ADDR_ASCAST]], align 8
1111+
// CHECK-NEXT: store <3 x i32> [[TMP8]], ptr [[TMP9]], align 16
1112+
// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[A64_ADDR_ASCAST]], align 4
1113+
// CHECK-NEXT: [[CONV1:%.*]] = zext i32 [[TMP10]] to i64
1114+
// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[B64_ADDR_ASCAST]], align 4
1115+
// CHECK-NEXT: [[CONV2:%.*]] = zext i32 [[TMP11]] to i64
1116+
// CHECK-NEXT: [[TMP12:%.*]] = load <2 x i32>, ptr [[C_ADDR_ASCAST]], align 8
1117+
// CHECK-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.amdgcn.perm.pk16.b8.u4(i64 [[CONV1]], i64 [[CONV2]], <2 x i32> [[TMP12]])
1118+
// CHECK-NEXT: [[TMP14:%.*]] = load ptr, ptr [[OUT4_ADDR_ASCAST]], align 8
1119+
// CHECK-NEXT: store <4 x i32> [[TMP13]], ptr [[TMP14]], align 16
1120+
// CHECK-NEXT: ret void
1121+
//
1122+
void test_perm_pk(uint a32, uint a64, uint b32, uint b64, uint2 c, uint2 *out2, uint3 *out3, uint4 *out4) {
1123+
// b4 variant: both data operands (a32, b32) are passed as 32-bit values; result is a uint2.
*out2 = __builtin_amdgcn_perm_pk16_b4_u4(a32, b32, c);
1124+
// b6 variant: b64 is declared uint, so it is zero-extended to i64 for the call
// (see the zext in the CHECK lines above); result is a uint3.
*out3 = __builtin_amdgcn_perm_pk16_b6_u4(a32, b64, c);
1125+
// b8 variant: a64 and b64 are both declared uint and zero-extended to i64; result is a uint4.
// NOTE(review): the *64 parameter names suggest ulong may have been intended — confirm.
*out4 = __builtin_amdgcn_perm_pk16_b8_u4(a64, b64, c);
1126+
}
1127+
10731128
// CHECK-LABEL: @test_prefetch(
10741129
// CHECK-NEXT: entry:
10751130
// CHECK-NEXT: [[FPTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3705,6 +3705,18 @@ def int_amdgcn_permlane_idx_gen : ClangBuiltin<"__builtin_amdgcn_permlane_idx_ge
37053705
[llvm_i32_ty, llvm_i32_ty],
37063706
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
37073707

3708+
// v_perm_pk16_* intrinsics. Each is exposed through the Clang builtin of the
// same name, is marked IntrNoMem and IntrSpeculatable, and takes a <2 x i32>
// as its third operand. The b4/b6/b8 variants differ in the widths of the
// first two operands and of the result.

// b4: (i32, i32, <2 x i32>) -> <2 x i32>
def int_amdgcn_perm_pk16_b4_u4 : ClangBuiltin<"__builtin_amdgcn_perm_pk16_b4_u4">,
3709+
DefaultAttrsIntrinsic<[llvm_v2i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_v2i32_ty],
3710+
[IntrNoMem, IntrSpeculatable]>;
3711+
3712+
// b6: (i32, i64, <2 x i32>) -> <3 x i32>
def int_amdgcn_perm_pk16_b6_u4 : ClangBuiltin<"__builtin_amdgcn_perm_pk16_b6_u4">,
3713+
DefaultAttrsIntrinsic<[llvm_v3i32_ty], [llvm_i32_ty, llvm_i64_ty, llvm_v2i32_ty],
3714+
[IntrNoMem, IntrSpeculatable]>;
3715+
3716+
// b8: (i64, i64, <2 x i32>) -> <4 x i32>
def int_amdgcn_perm_pk16_b8_u4 : ClangBuiltin<"__builtin_amdgcn_perm_pk16_b8_u4">,
3717+
DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_i64_ty, llvm_i64_ty, llvm_v2i32_ty],
3718+
[IntrNoMem, IntrSpeculatable]>;
3719+
37083720
//===----------------------------------------------------------------------===//
37093721
// Special Intrinsics for backend internal use only. No frontend
37103722
// should emit calls to these.

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1160,6 +1160,12 @@ def FeatureTanhInsts : SubtargetFeature<"tanh-insts",
11601160
"Has v_tanh_f32/f16 instructions"
11611161
>;
11621162

1163+
def FeatureTensorCvtLutInsts : SubtargetFeature<"tensor-cvt-lut-insts",
1164+
"HasTensorCvtLutInsts",
1165+
"true",
1166+
"Has v_perm_pk16* instructions"
1167+
>;
1168+
11631169
def FeatureTransposeLoadF4F6Insts : SubtargetFeature<"transpose-load-f4f6-insts",
11641170
"HasTransposeLoadF4F6Insts",
11651171
"true",
@@ -2030,6 +2036,7 @@ def FeatureISAVersion12_50 : FeatureSet<
20302036
FeatureDPPSrc1SGPR,
20312037
FeatureBitOp3Insts,
20322038
FeatureTanhInsts,
2039+
FeatureTensorCvtLutInsts,
20332040
FeatureTransposeLoadF4F6Insts,
20342041
FeatureBF16TransInsts,
20352042
FeatureBF16ConversionInsts,
@@ -2785,6 +2792,9 @@ def HasBitOp3Insts : Predicate<"Subtarget->hasBitOp3Insts()">,
27852792
def HasTanhInsts : Predicate<"Subtarget->hasTanhInsts()">,
27862793
AssemblerPredicate<(all_of FeatureTanhInsts)>;
27872794

2795+
def HasTensorCvtLutInsts : Predicate<"Subtarget->hasTensorCvtLutInsts()">,
2796+
AssemblerPredicate<(all_of FeatureTensorCvtLutInsts)>;
2797+
27882798
def HasTransposeLoadF4F6Insts : Predicate<"Subtarget->hasTransposeLoadF4F6Insts()">,
27892799
AssemblerPredicate<(all_of FeatureTransposeLoadF4F6Insts)>;
27902800

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4795,6 +4795,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
47954795
case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
47964796
case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8:
47974797
case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
4798+
case Intrinsic::amdgcn_perm_pk16_b4_u4:
4799+
case Intrinsic::amdgcn_perm_pk16_b6_u4:
4800+
case Intrinsic::amdgcn_perm_pk16_b8_u4:
47984801
return getDefaultMappingVOP(MI);
47994802
case Intrinsic::amdgcn_log:
48004803
case Intrinsic::amdgcn_exp2:

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
236236
bool Has64BitLiterals = false;
237237
bool HasBitOp3Insts = false;
238238
bool HasTanhInsts = false;
239+
bool HasTensorCvtLutInsts = false;
239240
bool HasTransposeLoadF4F6Insts = false;
240241
bool HasPrngInst = false;
241242
bool HasBVHDualAndBVH8Insts = false;
@@ -1411,6 +1412,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
14111412

14121413
bool hasTanhInsts() const { return HasTanhInsts; }
14131414

1415+
/// \returns the HasTensorCvtLutInsts subtarget flag.
bool hasTensorCvtLutInsts() const { return HasTensorCvtLutInsts; }
1416+
14141417
bool hasAddPC64Inst() const { return GFX1250Insts; }
14151418

14161419
bool hasMinimum3Maximum3PKF16() const {

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1774,6 +1774,7 @@ class getVALUDstForVT<ValueType VT, bit IsTrue16 = 0, bit IsVOP3Encoding = 0> {
17741774
!eq(VT.Size, 256) : VOPDstOperand<VReg_256>,
17751775
!eq(VT.Size, 192) : VOPDstOperand<VReg_192>,
17761776
!eq(VT.Size, 128) : VOPDstOperand<VReg_128>,
1777+
!eq(VT.Size, 96) : VOPDstOperand<VReg_96>,
17771778
!eq(VT.Size, 64) : VOPDstOperand<VReg_64>,
17781779
!eq(VT.Size, 32) : VOPDstOperand<VGPR_32>,
17791780
!eq(VT.Size, 16) : op16,
@@ -1924,6 +1925,7 @@ class getVOP3DPPSrcForVT<ValueType VT, bit IsFake16 = 1> {
19241925
!eq(VT, v2f16) : VCSrc_v2f16,
19251926
!eq(VT, v2bf16) : VCSrc_v2bf16,
19261927
!eq(VT, f32) : VCSrc_f32,
1928+
!eq(VT, v2i32) : VCSrc_v2b32,
19271929
1 : VCSrc_b32);
19281930
}
19291931

llvm/lib/Target/AMDGPU/SIRegisterInfo.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1302,6 +1302,7 @@ def VCSrc_f64 : SrcRegOrImm9 <VS_64, "OPERAND_REG_INLINE_C_FP64">;
13021302
def VCSrc_v2b16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2INT16">;
13031303
def VCSrc_v2bf16: SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2BF16">;
13041304
def VCSrc_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2FP16">;
1305+
def VCSrc_v2b32 : SrcRegOrImm9 <VS_64, "OPERAND_REG_INLINE_C_V2INT32">;
13051306

13061307
// True 16 Operands
13071308
def VCSrcT_b16 : SrcRegOrImm9_t16 <"OPERAND_REG_INLINE_C_INT16">;

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1726,6 +1726,12 @@ multiclass VOP3CvtScaleSelInst<string OpName, VOPProfile P, SDPatternOperator no
17261726
}
17271727
}
17281728

1729+
// VOP3 profiles whose value-type lists mirror the v_perm_pk16_* intrinsic
// signatures (result type first, then the three source types). The enclosing
// `let` clears HasExtVOP3DPP and HasModifiers for all three profiles.
let HasExtVOP3DPP = 0, HasModifiers = 0 in {
1730+
def VOP3_V2I32_I32_I32_V2I32 : VOP3_Profile<VOPProfile<[v2i32, i32, i32, v2i32]>>;
1731+
def VOP3_V3I32_I32_I64_V2I32 : VOP3_Profile<VOPProfile<[v3i32, i32, i64, v2i32]>>;
1732+
def VOP3_V4I32_I64_I64_V2I32 : VOP3_Profile<VOPProfile<[v4i32, i64, i64, v2i32]>>;
1733+
}
1734+
17291735
let Src0RC64 = VSrc_NoInline_v2f16 in {
17301736
def VOP3_CVT_PK_F8_F16_Profile : VOP3_Profile<VOP_I16_V2F16>;
17311737
def VOP3_CVT_PK_F8_F16_True16_Profile : VOP3_Profile_True16<VOP3_CVT_PK_F8_F16_Profile>;
@@ -1814,6 +1820,12 @@ let SubtargetPredicate = isGFX1250Plus in {
18141820
}
18151821
} // End SubtargetPredicate = isGFX1250Plus
18161822

1823+
let SubtargetPredicate = HasTensorCvtLutInsts in {
1824+
defm V_PERM_PK16_B4_U4 : VOP3Inst<"v_perm_pk16_b4_u4", VOP3_V2I32_I32_I32_V2I32, int_amdgcn_perm_pk16_b4_u4>;
1825+
defm V_PERM_PK16_B6_U4 : VOP3Inst<"v_perm_pk16_b6_u4", VOP3_V3I32_I32_I64_V2I32, int_amdgcn_perm_pk16_b6_u4>;
1826+
defm V_PERM_PK16_B8_U4 : VOP3Inst<"v_perm_pk16_b8_u4", VOP3_V4I32_I64_I64_V2I32, int_amdgcn_perm_pk16_b8_u4>;
1827+
} // End SubtargetPredicate = HasTensorCvtLutInsts
1828+
18171829
class Cvt_Scale_Sr_F32ToBF16F16_Pat<SDPatternOperator node, VOP3_Pseudo inst, ValueType DstTy> : GCNPat<
18181830
(DstTy (node DstTy:$vdst_in, f32:$src0, i32:$src1, timm:$word_sel)),
18191831
(inst (DstSelToOpSelXForm $word_sel), $src0, 0, $src1, VGPR_32:$vdst_in)
@@ -2212,6 +2224,9 @@ let AssemblerPredicate = isGFX11Plus in {
22122224
}
22132225

22142226
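// The VOP3Only_Realtriple entries below differ from the GFX12 variant by supporting DPP.
// The v_perm_pk16_* entries use VOP3Only_Real_Base; their profiles set HasExtVOP3DPP = 0.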
2227+
defm V_PERM_PK16_B4_U4 : VOP3Only_Real_Base_gfx1250<0x23f>;
2228+
defm V_PERM_PK16_B6_U4 : VOP3Only_Real_Base_gfx1250<0x242>;
2229+
defm V_PERM_PK16_B8_U4 : VOP3Only_Real_Base_gfx1250<0x243>;
22152230
defm V_LSHL_ADD_U64 : VOP3Only_Realtriple_gfx1250<0x252>;
22162231
defm V_ASHR_PK_I8_I32 : VOP3Only_Realtriple_gfx1250<0x290>;
22172232
defm V_ASHR_PK_U8_I32 : VOP3Only_Realtriple_gfx1250<0x291>;

0 commit comments

Comments
 (0)