-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[PowerPC] Add dense math half-precision floating-point outer-product accumulate to DMR instructions #133272
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-mc @llvm/pr-subscribers-backend-powerpc — Author: Maryam Moghadas (maryammo). Changes: This patch adds the following Dense Math Facility 16-bit half-precision floating-point calculation instructions: dmxvf16gerx2, dmxvf16gerx2pp, dmxvf16gerx2pn, dmxvf16gerx2np, dmxvf16gerx2nn, pmdmxvf16gerx2, pmdmxvf16gerx2pp, pmdmxvf16gerx2pn, pmdmxvf16gerx2np, pmdmxvf16gerx2nn, along with their corresponding intrinsics and tests. Patch is 34.41 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/133272.diff — 6 Files Affected:
diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
index b57102ef68f09..bcc3fc6f0fc13 100644
--- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td
+++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
@@ -1755,6 +1755,13 @@ let TargetPrefix = "ppc" in {
defm int_ppc_mma_pmdmxvbf16gerx2 :
PowerPC_MMA_DMR_Intrinsic<[llvm_v256i1_ty, llvm_v16i8_ty, llvm_i32_ty,
llvm_i32_ty, llvm_i32_ty]>;
+
+ // MMA+ Half-precision Outer Product Intrinsic Definitions.
+ defm int_ppc_mma_dmxvf16gerx2 :
+ PowerPC_MMA_DMR_Intrinsic<[llvm_v256i1_ty, llvm_v16i8_ty]>;
+ defm int_ppc_mma_pmdmxvf16gerx2 :
+ PowerPC_MMA_DMR_Intrinsic<[llvm_v256i1_ty, llvm_v16i8_ty, llvm_i32_ty,
+ llvm_i32_ty, llvm_i32_ty]>;
}
// XL Compat intrinsics.
diff --git a/llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td b/llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td
index 8ea0924f09b43..331649bddafb7 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td
@@ -207,7 +207,7 @@ multiclass DMR_BF16_UM_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
}
}
-multiclass DMR_UM_M284_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
+multiclass DMR_BF16_UM_M284_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
string asmstr> {
defm NAME : DMR_BF16_UM_XOEO<opcode, xo, IOL, asmbase, asmstr>;
let Predicates = [MMA, PrefixInstrs, IsISAFuture] in {
@@ -228,9 +228,30 @@ multiclass DMR_UM_M284_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
}
}
+multiclass DMR_F16_UM_M284_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
+ string asmstr> {
+ defm NAME : DMR_UM_XOEO<opcode, xo, IOL, asmbase, asmstr>;
+ let Predicates = [MMA, PrefixInstrs, IsISAFuture] in {
+ def PM#NAME :
+ MMIRR_XX3Form_X8Y4P2_XAp5B6<
+ opcode, !or(xo, 0x01), (outs dmr:$AT),
+ !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK)),
+ !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"),
+ IIC_VecFP, []>,
+ RegConstraint<"@earlyclobber $AT">;
+ def PM#NAME#PP :
+ MMIRR_XX3Form_X8Y4P2_XAp5B6<
+ opcode, xo, (outs dmr:$AT),
+ !con((ins dmr:$ATi), !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
+ !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
+ IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ }
+}
+
multiclass DMR_NEG_UM_M284_XOXORf939a0<bits<6> opcode, bits<8> xo, dag IOL,
string asmbase, string asmstr> {
- defm NAME : DMR_UM_M284_XOEO<opcode, xo, IOL, asmbase, asmstr>;
+ defm NAME : DMR_BF16_UM_M284_XOEO<opcode, xo, IOL, asmbase, asmstr>;
let Predicates = [MMA, IsISAFuture] in {
def PN : XX3Form_AT3_XAp5B6<
opcode, !xor(xo, 0xF9), (outs dmr:$AT), !con((ins dmr:$ATi), IOL),
@@ -270,6 +291,48 @@ multiclass DMR_NEG_UM_M284_XOXORf939a0<bits<6> opcode, bits<8> xo, dag IOL,
}
}
+multiclass DMR_NEG_UM_M284_XOXORd11188<bits<6> opcode, bits<8> xo, dag IOL,
+ string asmbase, string asmstr> {
+ defm NAME : DMR_F16_UM_M284_XOEO<opcode, xo, IOL, asmbase, asmstr>;
+ let Predicates = [MMA, IsISAFuture] in {
+ def PN : XX3Form_AT3_XAp5B6<
+ opcode, !xor(xo, 0xD1), (outs dmr:$AT), !con((ins dmr:$ATi), IOL),
+ !strconcat(asmbase#"pn ", asmstr), IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ def NP : XX3Form_AT3_XAp5B6<
+ opcode, !xor(xo, 0x11), (outs dmr:$AT), !con((ins dmr:$ATi), IOL),
+ !strconcat(asmbase#"np ", asmstr), IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ def NN : XX3Form_AT3_XAp5B6<
+ opcode, !xor(xo, 0x88), (outs dmr:$AT), !con((ins dmr:$ATi), IOL),
+ !strconcat(asmbase#"nn ", asmstr), IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ }
+ let Predicates = [MMA, PrefixInstrs, IsISAFuture] in {
+ def PM#NAME#PN :
+ MMIRR_XX3Form_X8Y4P2_XAp5B6<
+ opcode, !xor(xo, 0xD1), (outs dmr:$AT),
+ !con((ins dmr:$ATi), !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
+ !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK, $PMSK"),
+ IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ def PM#NAME#NP :
+ MMIRR_XX3Form_X8Y4P2_XAp5B6<
+ opcode, !xor(xo, 0x11), (outs dmr:$AT),
+ !con((ins dmr:$ATi), !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
+ !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK, $PMSK"),
+ IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ def PM#NAME#NN :
+ MMIRR_XX3Form_X8Y4P2_XAp5B6<
+ opcode, !xor(xo, 0x88), (outs dmr:$AT),
+ !con((ins dmr:$ATi), !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
+ !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK, $PMSK"),
+ IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ }
+}
+
let Predicates = [IsISAFuture] in {
def DMXXEXTFDMR512 : XX3Form_AT3_XABp5_P1<60, 226,
(outs vsrprc:$XAp, vsrprc:$XBp),
@@ -347,6 +410,11 @@ let Predicates = [MMA, PrefixInstrs, IsISAFuture] in {
defm DMXVBF16GERX2 : DMR_NEG_UM_M284_XOXORf939a0<59, 74, (ins vsrprc:$XAp, vsrc:$XB),
"dmxvbf16gerx2", "$AT, $XAp, $XB">;
+// DMXVF16GERX2, DMXVF16GERX2PP, DMXVF16GERX2PN, DMXVF16GERX2NP, DMXVF16GERX2NN
+// PMDMXVF16GERX2, PMDMXVF16GERX2PP, PMDMXVF16GERX2PN, PMDMXVF16GERX2NP, PMDMXVF16GERX2NN
+defm DMXVF16GERX2 : DMR_NEG_UM_M284_XOXORd11188<59, 66, (ins vsrprc:$XAp, vsrc:$XB),
+ "dmxvf16gerx2", "$AT, $XAp, $XB">;
+
// MMA+ Intrinsics
let Predicates = [MMA, IsISAFuture] in {
def : Pat<(v1024i1 (int_ppc_mma_dmxvi8gerx4 v256i1:$XAp, v16i8:$XB)),
@@ -371,6 +439,21 @@ let Predicates = [MMA, IsISAFuture] in {
def : Pat<(v1024i1 (int_ppc_mma_dmxvbf16gerx2nn v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)),
(DMXVBF16GERX2NN $ATi, $XAp, RCCp.BToVSRC)>;
+
+ def : Pat<(v1024i1 (int_ppc_mma_dmxvf16gerx2 v256i1:$XAp, v16i8:$XB)),
+ (DMXVF16GERX2 $XAp, RCCp.BToVSRC)>;
+
+ def : Pat<(v1024i1 (int_ppc_mma_dmxvf16gerx2pp v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)),
+ (DMXVF16GERX2PP $ATi, $XAp, RCCp.BToVSRC)>;
+
+ def : Pat<(v1024i1 (int_ppc_mma_dmxvf16gerx2pn v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)),
+ (DMXVF16GERX2PN $ATi, $XAp, RCCp.BToVSRC)>;
+
+ def : Pat<(v1024i1 (int_ppc_mma_dmxvf16gerx2np v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)),
+ (DMXVF16GERX2NP $ATi, $XAp, RCCp.BToVSRC)>;
+
+ def : Pat<(v1024i1 (int_ppc_mma_dmxvf16gerx2nn v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)),
+ (DMXVF16GERX2NN $ATi, $XAp, RCCp.BToVSRC)>;
}
let Predicates = [MMA, PrefixInstrs, IsISAFuture] in {
@@ -419,4 +502,33 @@ let Predicates = [MMA, PrefixInstrs, IsISAFuture] in {
Msk2Imm:$PMSK)),
(PMDMXVBF16GERX2NN $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+
+ def : Pat<(v1024i1 (int_ppc_mma_pmdmxvf16gerx2 v256i1:$XAp, v16i8:$XB, Msk8Imm:$XMSK,
+ Msk4Imm:$YMSK, Msk2Imm:$PMSK)),
+ (PMDMXVF16GERX2 $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
+ Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+
+ def : Pat<(v1024i1 (int_ppc_mma_pmdmxvf16gerx2pp v1024i1:$ATi, v256i1:$XAp, v16i8:$XB,
+ Msk8Imm:$XMSK, Msk4Imm:$YMSK,
+ Msk2Imm:$PMSK)),
+ (PMDMXVF16GERX2PP $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
+ Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+
+ def : Pat<(v1024i1 (int_ppc_mma_pmdmxvf16gerx2pn v1024i1:$ATi, v256i1:$XAp, v16i8:$XB,
+ Msk8Imm:$XMSK, Msk4Imm:$YMSK,
+ Msk2Imm:$PMSK)),
+ (PMDMXVF16GERX2PN $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
+ Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+
+ def : Pat<(v1024i1 (int_ppc_mma_pmdmxvf16gerx2np v1024i1:$ATi, v256i1:$XAp, v16i8:$XB,
+ Msk8Imm:$XMSK, Msk4Imm:$YMSK,
+ Msk2Imm:$PMSK)),
+ (PMDMXVF16GERX2NP $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
+ Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+
+ def : Pat<(v1024i1 (int_ppc_mma_pmdmxvf16gerx2nn v1024i1:$ATi, v256i1:$XAp, v16i8:$XB,
+ Msk8Imm:$XMSK, Msk4Imm:$YMSK,
+ Msk2Imm:$PMSK)),
+ (PMDMXVF16GERX2NN $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
+ Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
}
diff --git a/llvm/test/CodeGen/PowerPC/dmf-outer-product.ll b/llvm/test/CodeGen/PowerPC/dmf-outer-product.ll
index e3b43062f417c..9a02689002459 100644
--- a/llvm/test/CodeGen/PowerPC/dmf-outer-product.ll
+++ b/llvm/test/CodeGen/PowerPC/dmf-outer-product.ll
@@ -769,3 +769,486 @@ entry:
store <1024 x i1> %call, ptr %resp, align 64
ret void
}
+
+declare <1024 x i1> @llvm.ppc.mma.dmxvf16gerx2(<256 x i1>, <16 x i8>)
+define void @test_dmxvf16gerx2(ptr %vpp, ptr %vcp, ptr %resp) {
+; CHECK-LABEL: test_dmxvf16gerx2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lxv v2, 16(r3)
+; CHECK-NEXT: lxv vs0, 0(r4)
+; CHECK-NEXT: lxv v3, 0(r3)
+; CHECK-NEXT: dmxvf16gerx2 dmr0, vsp34, vs0
+; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT: stxvp vsp34, 96(r5)
+; CHECK-NEXT: stxvp vsp36, 64(r5)
+; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT: stxvp vsp34, 32(r5)
+; CHECK-NEXT: stxvp vsp36, 0(r5)
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: test_dmxvf16gerx2:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: lxv v2, 0(r3)
+; CHECK-BE-NEXT: lxv vs0, 0(r4)
+; CHECK-BE-NEXT: lxv v3, 16(r3)
+; CHECK-BE-NEXT: dmxvf16gerx2 dmr0, vsp34, vs0
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-BE-NEXT: stxvp vsp36, 96(r5)
+; CHECK-BE-NEXT: stxvp vsp34, 64(r5)
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-NEXT: stxvp vsp36, 32(r5)
+; CHECK-BE-NEXT: stxvp vsp34, 0(r5)
+; CHECK-BE-NEXT: blr
+entry:
+ %v1 = load <256 x i1>, ptr %vpp, align 32
+ %v2 = load <16 x i8>, ptr %vcp, align 32
+ %call = tail call <1024 x i1> @llvm.ppc.mma.dmxvf16gerx2(<256 x i1> %v1, <16 x i8> %v2)
+ store <1024 x i1> %call, ptr %resp, align 64
+ ret void
+}
+
+declare <1024 x i1> @llvm.ppc.mma.dmxvf16gerx2pp(<1024 x i1>, <256 x i1>, <16 x i8>)
+
+define void @test_dmxvf16gerx2pp(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
+; CHECK-LABEL: test_dmxvf16gerx2pp:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lxvp vsp34, 0(r3)
+; CHECK-NEXT: lxvp vsp36, 32(r3)
+; CHECK-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-NEXT: lxvp vsp34, 64(r3)
+; CHECK-NEXT: lxvp vsp36, 96(r3)
+; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-NEXT: lxv v2, 16(r4)
+; CHECK-NEXT: lxv vs0, 0(r5)
+; CHECK-NEXT: lxv v3, 0(r4)
+; CHECK-NEXT: dmxvf16gerx2pp dmr0, vsp34, vs0
+; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT: stxvp vsp34, 96(r6)
+; CHECK-NEXT: stxvp vsp36, 64(r6)
+; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT: stxvp vsp34, 32(r6)
+; CHECK-NEXT: stxvp vsp36, 0(r6)
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: test_dmxvf16gerx2pp:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: lxvp vsp34, 96(r3)
+; CHECK-BE-NEXT: lxvp vsp36, 64(r3)
+; CHECK-BE-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-BE-NEXT: lxvp vsp34, 32(r3)
+; CHECK-BE-NEXT: lxvp vsp36, 0(r3)
+; CHECK-BE-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-BE-NEXT: lxv v2, 0(r4)
+; CHECK-BE-NEXT: lxv vs0, 0(r5)
+; CHECK-BE-NEXT: lxv v3, 16(r4)
+; CHECK-BE-NEXT: dmxvf16gerx2pp dmr0, vsp34, vs0
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-BE-NEXT: stxvp vsp36, 96(r6)
+; CHECK-BE-NEXT: stxvp vsp34, 64(r6)
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-NEXT: stxvp vsp36, 32(r6)
+; CHECK-BE-NEXT: stxvp vsp34, 0(r6)
+; CHECK-BE-NEXT: blr
+entry:
+ %v.dmr = load <1024 x i1>, ptr %vop, align 64
+ %v1 = load <256 x i1>, ptr %vpp, align 32
+ %v2 = load <16 x i8>, ptr %vcp, align 32
+ %call = tail call <1024 x i1> @llvm.ppc.mma.dmxvf16gerx2pp(<1024 x i1> %v.dmr, <256 x i1> %v1, <16 x i8> %v2)
+ store <1024 x i1> %call, ptr %resp, align 64
+ ret void
+}
+
+declare <1024 x i1> @llvm.ppc.mma.dmxvf16gerx2pn(<1024 x i1>, <256 x i1>, <16 x i8>)
+
+define void @test_dmxvf16gerx2pn(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
+; CHECK-LABEL: test_dmxvf16gerx2pn:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lxvp vsp34, 0(r3)
+; CHECK-NEXT: lxvp vsp36, 32(r3)
+; CHECK-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-NEXT: lxvp vsp34, 64(r3)
+; CHECK-NEXT: lxvp vsp36, 96(r3)
+; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-NEXT: lxv v2, 16(r4)
+; CHECK-NEXT: lxv vs0, 0(r5)
+; CHECK-NEXT: lxv v3, 0(r4)
+; CHECK-NEXT: dmxvf16gerx2pn dmr0, vsp34, vs0
+; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT: stxvp vsp34, 96(r6)
+; CHECK-NEXT: stxvp vsp36, 64(r6)
+; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT: stxvp vsp34, 32(r6)
+; CHECK-NEXT: stxvp vsp36, 0(r6)
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: test_dmxvf16gerx2pn:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: lxvp vsp34, 96(r3)
+; CHECK-BE-NEXT: lxvp vsp36, 64(r3)
+; CHECK-BE-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-BE-NEXT: lxvp vsp34, 32(r3)
+; CHECK-BE-NEXT: lxvp vsp36, 0(r3)
+; CHECK-BE-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-BE-NEXT: lxv v2, 0(r4)
+; CHECK-BE-NEXT: lxv vs0, 0(r5)
+; CHECK-BE-NEXT: lxv v3, 16(r4)
+; CHECK-BE-NEXT: dmxvf16gerx2pn dmr0, vsp34, vs0
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-BE-NEXT: stxvp vsp36, 96(r6)
+; CHECK-BE-NEXT: stxvp vsp34, 64(r6)
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-NEXT: stxvp vsp36, 32(r6)
+; CHECK-BE-NEXT: stxvp vsp34, 0(r6)
+; CHECK-BE-NEXT: blr
+entry:
+ %v.dmr = load <1024 x i1>, ptr %vop, align 64
+ %v1 = load <256 x i1>, ptr %vpp, align 32
+ %v2 = load <16 x i8>, ptr %vcp, align 32
+ %call = tail call <1024 x i1> @llvm.ppc.mma.dmxvf16gerx2pn(<1024 x i1> %v.dmr, <256 x i1> %v1, <16 x i8> %v2)
+ store <1024 x i1> %call, ptr %resp, align 64
+ ret void
+}
+
+declare <1024 x i1> @llvm.ppc.mma.dmxvf16gerx2np(<1024 x i1>, <256 x i1>, <16 x i8>)
+
+define void @test_dmxvf16gerx2np(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
+; CHECK-LABEL: test_dmxvf16gerx2np:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lxvp vsp34, 0(r3)
+; CHECK-NEXT: lxvp vsp36, 32(r3)
+; CHECK-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-NEXT: lxvp vsp34, 64(r3)
+; CHECK-NEXT: lxvp vsp36, 96(r3)
+; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-NEXT: lxv v2, 16(r4)
+; CHECK-NEXT: lxv vs0, 0(r5)
+; CHECK-NEXT: lxv v3, 0(r4)
+; CHECK-NEXT: dmxvf16gerx2np dmr0, vsp34, vs0
+; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT: stxvp vsp34, 96(r6)
+; CHECK-NEXT: stxvp vsp36, 64(r6)
+; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT: stxvp vsp34, 32(r6)
+; CHECK-NEXT: stxvp vsp36, 0(r6)
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: test_dmxvf16gerx2np:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: lxvp vsp34, 96(r3)
+; CHECK-BE-NEXT: lxvp vsp36, 64(r3)
+; CHECK-BE-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-BE-NEXT: lxvp vsp34, 32(r3)
+; CHECK-BE-NEXT: lxvp vsp36, 0(r3)
+; CHECK-BE-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-BE-NEXT: lxv v2, 0(r4)
+; CHECK-BE-NEXT: lxv vs0, 0(r5)
+; CHECK-BE-NEXT: lxv v3, 16(r4)
+; CHECK-BE-NEXT: dmxvf16gerx2np dmr0, vsp34, vs0
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-BE-NEXT: stxvp vsp36, 96(r6)
+; CHECK-BE-NEXT: stxvp vsp34, 64(r6)
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-NEXT: stxvp vsp36, 32(r6)
+; CHECK-BE-NEXT: stxvp vsp34, 0(r6)
+; CHECK-BE-NEXT: blr
+entry:
+ %v.dmr = load <1024 x i1>, ptr %vop, align 64
+ %v1 = load <256 x i1>, ptr %vpp, align 32
+ %v2 = load <16 x i8>, ptr %vcp, align 32
+ %call = tail call <1024 x i1> @llvm.ppc.mma.dmxvf16gerx2np(<1024 x i1> %v.dmr, <256 x i1> %v1, <16 x i8> %v2)
+ store <1024 x i1> %call, ptr %resp, align 64
+ ret void
+}
+
+declare <1024 x i1> @llvm.ppc.mma.dmxvf16gerx2nn(<1024 x i1>, <256 x i1>, <16 x i8>)
+
+define void @test_dmxvf16gerx2nn(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
+; CHECK-LABEL: test_dmxvf16gerx2nn:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lxvp vsp34, 0(r3)
+; CHECK-NEXT: lxvp vsp36, 32(r3)
+; CHECK-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-NEXT: lxvp vsp34, 64(r3)
+; CHECK-NEXT: lxvp vsp36, 96(r3)
+; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-NEXT: lxv v2, 16(r4)
+; CHECK-NEXT: lxv vs0, 0(r5)
+; CHECK-NEXT: lxv v3, 0(r4)
+; CHECK-NEXT: dmxvf16gerx2nn dmr0, vsp34, vs0
+; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT: stxvp vsp34, 96(r6)
+; CHECK-NEXT: stxvp vsp36, 64(r6)
+; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT: stxvp vsp34, 32(r6)
+; CHECK-NEXT: stxvp vsp36, 0(r6)
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: test_dmxvf16gerx2nn:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: lxvp vsp34, 96(r3)
+; CHECK-BE-NEXT: lxvp vsp36, 64(r3)
+; CHECK-BE-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-BE-NEXT: lxvp vsp34, 32(r3)
+; CHECK-BE-NEXT: lxvp vsp36, 0(r3)
+; CHECK-BE-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-BE-NEXT: lxv v2, 0(r4)
+; CHECK-BE-NEXT: lxv vs0, 0(r5)
+; CHECK-BE-NEXT: lxv v3, 16(r4)
+; CHECK-BE-NEXT: dmxvf16gerx2nn dmr0, vsp34, vs0
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-BE-NEXT: stxvp vsp36, 96(r6)
+; CHECK-BE-NEXT: stxvp vsp34, 64(r6)
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-NEXT: stxvp vsp36, 32(r6)
+; CHECK-BE-NEXT: stxvp vsp34, 0(r6)
+; CHECK-BE-NEXT: blr
+entry:
+ %v.dmr = load <1024 x i1>, ptr %vop, align 64
+ %v1 = load <256 x i1>, ptr %vpp, align 32
+ %v2 = load <16 x i8>, ptr %vcp, align 32
+ %call = tail call <1024 x i1> @llvm.ppc.mma.dmxvf16gerx2nn(<1024 x i1> %v.dmr, <256 x i1> %v1, <16 x i8> %v2)
+ store <1024 x i1> %call, ptr %resp, align 64
+ ret void
+}
+
+declare <1024 x i1> @llvm.ppc.mma.pmdmxvf16gerx2(<256 x i1>, <16 x i8>, i32, i32, i32)
+
+define void @test_pmdmxvf16gerx2(ptr %vpp, ptr %vcp, ptr %resp) {
+; CHECK-LABEL: test_pmdmxvf16gerx2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lxv v2, 16(r3)
+; CHECK-NEXT: lxv vs0, 0(r4)
+; CHECK-NEXT: lxv v3, 0(r3)
+; CHECK-NEXT: pmdmxvf16gerx2 dmr0, vsp34, vs0, 33, 5, 2
+; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT: stxvp vsp34, 96(r5)
+; CHECK-NEXT: stxvp vsp36, 64(r5)
+; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT: stxvp vsp34, 32(r5)
+; CHECK-NEXT: stxvp vsp36, 0(r5)
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: test_pmdmxvf16gerx2:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: lxv v2, 0(r3)
+; CHECK-BE-NEXT: lxv vs0, 0(r4)
+; CHECK-BE-NEXT: lxv v3, 16(r3)
+; CHECK-BE-NEXT: pmdmxvf16gerx2 dmr0, vsp34, vs0, 33, 5, 2
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-BE-NEXT: stxvp vsp36, 96(r5)
+; CHECK-BE-NEXT: stxvp vsp34, 64(r5)
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-NEXT: stxvp vsp36, 32(r5)
+; CHECK-BE-NEXT: stxvp vsp34, 0(r5)
+; CHECK-BE-NEXT: blr
+entry:
+ %v1 = load <256 x i1>, ptr %vpp, align 32
+ ...
[truncated]
|
aa2fd47 to
3faec5c
Compare
…accumulate to DMR instructions — This patch adds the following Dense Math Facility 16-bit half-precision floating-point calculation instructions: dmxvf16gerx2, dmxvf16gerx2pp, dmxvf16gerx2pn, dmxvf16gerx2np, dmxvf16gerx2nn, pmdmxvf16gerx2, pmdmxvf16gerx2pp, pmdmxvf16gerx2pn, pmdmxvf16gerx2np, pmdmxvf16gerx2nn, along with their corresponding intrinsics and tests.
109f521 to
8b7aeb7
Compare
RolandF77
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
…accumulate to DMR instructions (llvm#133272) — This patch adds the following Dense Math Facility 16-bit half-precision floating-point calculation instructions: dmxvf16gerx2, dmxvf16gerx2pp, dmxvf16gerx2pn, dmxvf16gerx2np, dmxvf16gerx2nn, pmdmxvf16gerx2, pmdmxvf16gerx2pp, pmdmxvf16gerx2pn, pmdmxvf16gerx2np, pmdmxvf16gerx2nn, along with their corresponding intrinsics and tests.
…accumulate to DMR instructions (llvm#133272) — This patch adds the following Dense Math Facility 16-bit half-precision floating-point calculation instructions: dmxvf16gerx2, dmxvf16gerx2pp, dmxvf16gerx2pn, dmxvf16gerx2np, dmxvf16gerx2nn, pmdmxvf16gerx2, pmdmxvf16gerx2pp, pmdmxvf16gerx2pn, pmdmxvf16gerx2np, pmdmxvf16gerx2nn, along with their corresponding intrinsics and tests.
This patch adds the following Dense Math Facility 16-bit half-precision floating-point calculation instructions: dmxvf16gerx2, dmxvf16gerx2pp, dmxvf16gerx2pn, dmxvf16gerx2np, dmxvf16gerx2nn, pmdmxvf16gerx2, pmdmxvf16gerx2pp, pmdmxvf16gerx2pn, pmdmxvf16gerx2np, pmdmxvf16gerx2nn, along with their corresponding intrinsics and tests.