Skip to content

Commit d1cbe6e

Browse files
authored
[PowerPC] Add DMF builtins for build and disassemble (#153097)
Add support for the PPC Dense Math builtins mma_build_dmr and mma_disassemble_dmr.
1 parent 613ec4c commit d1cbe6e

File tree

8 files changed

+154
-10
lines changed

8 files changed

+154
-10
lines changed

clang/include/clang/Basic/BuiltinsPPC.def

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1100,6 +1100,10 @@ UNALIASED_CUSTOM_BUILTIN(mma_dmmr, "vW1024*W1024*", false,
11001100
"mma,isa-future-instructions")
11011101
UNALIASED_CUSTOM_BUILTIN(mma_dmxor, "vW1024*W1024*", true,
11021102
"mma,isa-future-instructions")
1103+
UNALIASED_CUSTOM_BUILTIN(mma_disassemble_dmr, "vv*W1024*", false,
1104+
"mma,isa-future-instructions")
1105+
UNALIASED_CUSTOM_BUILTIN(mma_build_dmr, "vW1024*VVVVVVVV", false,
1106+
"mma,isa-future-instructions")
11031107

11041108
// MMA builtins with positive/negative multiply/accumulate.
11051109
UNALIASED_CUSTOM_MMA_BUILTIN(mma_xvf16ger2, "vW512*VV",

clang/lib/CodeGen/TargetBuiltins/PPC.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1152,10 +1152,13 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
11521152
CallOps.push_back(Acc);
11531153
}
11541154
if (BuiltinID == PPC::BI__builtin_mma_dmmr ||
1155-
BuiltinID == PPC::BI__builtin_mma_dmxor) {
1155+
BuiltinID == PPC::BI__builtin_mma_dmxor ||
1156+
BuiltinID == PPC::BI__builtin_mma_disassemble_dmr) {
11561157
Address Addr = EmitPointerWithAlignment(E->getArg(1));
11571158
Ops[1] = Builder.CreateLoad(Addr);
11581159
}
1160+
if (BuiltinID == PPC::BI__builtin_mma_disassemble_dmr)
1161+
return Builder.CreateAlignedStore(Ops[1], Ops[0], MaybeAlign());
11591162
for (unsigned i=1; i<Ops.size(); i++)
11601163
CallOps.push_back(Ops[i]);
11611164
llvm::Function *F = CGM.getIntrinsic(ID);

clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -93,18 +93,36 @@ void test_pmdmxvi8gerx4spp(unsigned char *vdmrp, unsigned char *vpp, vector unsi
9393
*((__dmr1024 *)resp) = vdmr;
9494
}
9595

96-
// CHECK-LABEL: @test_dmf_basic
97-
// CHECK-NEXT: entry:
98-
// CHECK-NEXT: [[TMP0:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmsetdmrz()
99-
// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmmr(<1024 x i1> [[TMP0]])
100-
// CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr %res1, align 128
101-
// CHECK-NEXT: [[TMP2:%.*]] = load <1024 x i1>, ptr %res2, align 128
102-
// CHECK-NEXT: [[TMP3:%.*]] = load <1024 x i1>, ptr %p, align 128
103-
// CHECK-NEXT: [[TMP4:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxor(<1024 x i1> [[TMP2]], <1024 x i1> [[TMP3]])
104-
// CHECK-NEXT: store <1024 x i1> [[TMP4]], ptr %res2, align 128
96+
// CHECK-LABEL: @test_dmf_basic(
97+
// CHECK-NEXT: entry:
98+
// CHECK-NEXT: [[TMP0:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmsetdmrz()
99+
// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmmr(<1024 x i1> [[TMP0]])
100+
// CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RES1:%.*]], align 128
101+
// CHECK-NEXT: [[TMP2:%.*]] = load <1024 x i1>, ptr [[RES2:%.*]], align 128
102+
// CHECK-NEXT: [[TMP3:%.*]] = load <1024 x i1>, ptr [[P:%.*]], align 128
103+
// CHECK-NEXT: [[TMP4:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxor(<1024 x i1> [[TMP2]], <1024 x i1> [[TMP3]])
104+
// CHECK-NEXT: store <1024 x i1> [[TMP4]], ptr [[RES2]], align 128
105+
// CHECK-NEXT: ret void
106+
//
105107
void test_dmf_basic(char *p, char *res1, char *res2) {
106108
__dmr1024 x[2];
107109
__builtin_mma_dmsetdmrz(&x[0]);
108110
__builtin_mma_dmmr((__dmr1024*)res1, &x[0]);
109111
__builtin_mma_dmxor((__dmr1024*)res2, (__dmr1024*)p);
110112
}
113+
114+
// CHECK-LABEL: @test_dmf_basic2(
115+
// CHECK-NEXT: entry:
116+
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[V:%.*]], align 16, !tbaa [[TBAA8:![0-9]+]]
117+
// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.build.dmr(<16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]])
118+
// CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RES2:%.*]], align 128
119+
// CHECK-NEXT: [[TMP2:%.*]] = load <1024 x i1>, ptr [[P1:%.*]], align 128
120+
// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RES1:%.*]], align 128
121+
// CHECK-NEXT: ret void
122+
//
123+
void test_dmf_basic2(char *p1, char *res1, char *res2,
124+
vector unsigned char *v) {
125+
vector unsigned char vv = *v;
126+
__builtin_mma_build_dmr((__dmr1024*)res2, vv, vv, vv, vv, vv, vv, vv, vv);
127+
__builtin_mma_disassemble_dmr(res1, (__dmr1024*)p1);
128+
}

clang/test/CodeGen/PowerPC/ppc-dmf-mma-builtin-err.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ void test_mma(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc)
1616
__builtin_mma_dmsetdmrz(&vdmr);
1717
__builtin_mma_dmmr(&vdmr, (__dmr1024*)vpp);
1818
__builtin_mma_dmxor(&vdmr, (__dmr1024*)vpp);
19+
__builtin_mma_build_dmr(&vdmr, vc, vc, vc, vc, vc, vc, vc, vc);
20+
__builtin_mma_disassemble_dmr(vdmrp, &vdmr);
1921

2022
// CHECK: error: '__builtin_mma_dmxvi8gerx4' needs target feature mma,paired-vector-memops
2123
// CHECK: error: '__builtin_mma_pmdmxvi8gerx4' needs target feature mma,paired-vector-memops
@@ -26,4 +28,6 @@ void test_mma(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc)
2628
// CHECK: error: '__builtin_mma_dmsetdmrz' needs target feature mma,isa-future-instructions
2729
// CHECK: error: '__builtin_mma_dmmr' needs target feature mma,isa-future-instructions
2830
// CHECK: error: '__builtin_mma_dmxor' needs target feature mma,isa-future-instructions
31+
// CHECK: error: '__builtin_mma_build_dmr' needs target feature mma,isa-future-instructions
32+
// CHECK: error: '__builtin_mma_disassemble_dmr' needs target feature mma,isa-future-instructions
2933
}

llvm/include/llvm/IR/IntrinsicsPowerPC.td

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1708,6 +1708,16 @@ let TargetPrefix = "ppc" in {
17081708
DefaultAttrsIntrinsic<[llvm_v1024i1_ty], [llvm_v1024i1_ty, llvm_v256i1_ty,
17091709
llvm_i32_ty], [IntrNoMem]>;
17101710

1711+
def int_ppc_mma_disassemble_dmr :
1712+
DefaultAttrsIntrinsic<[], [llvm_ptr_ty, llvm_v1024i1_ty],
1713+
[IntrWriteMem, IntrArgMemOnly]>;
1714+
1715+
def int_ppc_mma_build_dmr :
1716+
DefaultAttrsIntrinsic<[llvm_v1024i1_ty], [llvm_v16i8_ty, llvm_v16i8_ty,
1717+
llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty,
1718+
llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty],
1719+
[IntrNoMem]>;
1720+
17111721
// MMA Reduced-Precision: Outer Product Intrinsic Definitions.
17121722
defm int_ppc_mma_xvi4ger8 :
17131723
PowerPC_MMA_ACC_PP_Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty]>;

llvm/lib/Target/PowerPC/PPCISelLowering.cpp

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11274,6 +11274,24 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
1127411274
return DAG.getMergeValues(RetOps, dl);
1127511275
}
1127611276

11277+
case Intrinsic::ppc_mma_build_dmr: {
11278+
SmallVector<SDValue, 8> Pairs;
11279+
SmallVector<SDValue, 8> Chains;
11280+
for (int i = 1; i < 9; i += 2) {
11281+
SDValue Hi = Op.getOperand(i);
11282+
SDValue Lo = Op.getOperand(i + 1);
11283+
if (Hi->getOpcode() == ISD::LOAD)
11284+
Chains.push_back(Hi.getValue(1));
11285+
if (Lo->getOpcode() == ISD::LOAD)
11286+
Chains.push_back(Lo.getValue(1));
11287+
Pairs.push_back(
11288+
DAG.getNode(PPCISD::PAIR_BUILD, dl, MVT::v256i1, {Hi, Lo}));
11289+
}
11290+
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
11291+
SDValue Value = DMFInsert1024(Pairs, SDLoc(Op), DAG);
11292+
return DAG.getMergeValues({Value, TF}, dl);
11293+
}
11294+
1127711295
case Intrinsic::ppc_mma_dmxxextfdmr512: {
1127811296
assert(Subtarget.isISAFuture() && "dmxxextfdmr512 requires ISA Future");
1127911297
auto *Idx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
@@ -11610,6 +11628,10 @@ SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
1161011628
Op.getOperand(0)),
1161111629
0);
1161211630
}
11631+
case Intrinsic::ppc_mma_disassemble_dmr: {
11632+
return DAG.getStore(DAG.getEntryNode(), DL, Op.getOperand(ArgStart + 2),
11633+
Op.getOperand(ArgStart + 1), MachinePointerInfo());
11634+
}
1161311635
default:
1161411636
break;
1161511637
}
@@ -12099,6 +12121,24 @@ SDValue PPCTargetLowering::LowerDMFVectorLoad(SDValue Op,
1209912121
return DAG.getMergeValues({DmrPValue, TF}, dl);
1210012122
}
1210112123

12124+
/// Assemble a single v1024i1 value from four pair operands.
///
/// \param Pairs Operands to combine; only Pairs[0] through Pairs[3] are read,
///              so the vector must hold at least four entries. The visible
///              caller (the ppc_mma_build_dmr lowering) passes v256i1
///              PPCISD::PAIR_BUILD nodes.
/// \param dl    Debug location attached to every node created here.
/// \param DAG   SelectionDAG in which the machine nodes are created.
/// \return Result 0 of a REG_SEQUENCE machine node of type v1024i1.
SDValue PPCTargetLowering::DMFInsert1024(const SmallVectorImpl<SDValue> &Pairs,
12125+
const SDLoc &dl,
12126+
SelectionDAG &DAG) const {
12127+
// Pairs[0] and Pairs[1] are combined by DMXXINSTDMR512 into a v512i1 value
// that is later placed at the sub_wacc_lo sub-register index.
SDValue Lo(DAG.getMachineNode(PPC::DMXXINSTDMR512, dl, MVT::v512i1, Pairs[0],
12128+
Pairs[1]),
12129+
0);
12130+
SDValue LoSub = DAG.getTargetConstant(PPC::sub_wacc_lo, dl, MVT::i32);
12131+
// Pairs[2] and Pairs[3] are combined by DMXXINSTDMR512_HI into a v512i1 value
// that is later placed at the sub_wacc_hi sub-register index.
SDValue Hi(DAG.getMachineNode(PPC::DMXXINSTDMR512_HI, dl, MVT::v512i1,
12132+
Pairs[2], Pairs[3]),
12133+
0);
12134+
SDValue HiSub = DAG.getTargetConstant(PPC::sub_wacc_hi, dl, MVT::i32);
12135+
// Register class operand for the REG_SEQUENCE below.
SDValue RC = DAG.getTargetConstant(PPC::DMRRCRegClassID, dl, MVT::i32);
12136+
12137+
// Join the two 512-bit halves; operands are the register class followed by
// (value, sub-register index) pairs.
return SDValue(DAG.getMachineNode(PPC::REG_SEQUENCE, dl, MVT::v1024i1,
12138+
{RC, Lo, LoSub, Hi, HiSub}),
12139+
0);
12140+
}
12141+
1210212142
SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
1210312143
SelectionDAG &DAG) const {
1210412144
SDLoc dl(Op);

llvm/lib/Target/PowerPC/PPCISelLowering.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1345,6 +1345,8 @@ namespace llvm {
13451345
SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const;
13461346
SDValue LowerDMFVectorLoad(SDValue Op, SelectionDAG &DAG) const;
13471347
SDValue LowerDMFVectorStore(SDValue Op, SelectionDAG &DAG) const;
1348+
SDValue DMFInsert1024(const SmallVectorImpl<SDValue> &Pairs,
1349+
const SDLoc &dl, SelectionDAG &DAG) const;
13481350

13491351
SDValue LowerCallResult(SDValue Chain, SDValue InGlue,
13501352
CallingConv::ID CallConv, bool isVarArg,

llvm/test/CodeGen/PowerPC/dmr-enable.ll

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -367,6 +367,69 @@ entry:
367367
ret void
368368
}
369369

370+
define void @tbuild(ptr %p1, ptr %p2, ptr %res1, ptr %res2, ptr %v) {
371+
; CHECK-LABEL: tbuild:
372+
; CHECK: # %bb.0: # %entry
373+
; CHECK-NEXT: lxv v3, 0(r7)
374+
; CHECK-NEXT: vmr v2, v3
375+
; CHECK-NEXT: dmxxinstdmr512 wacc_hi0, vsp34, vsp34, 1
376+
; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0
377+
; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
378+
; CHECK-NEXT: stxvp vsp34, 96(r6)
379+
; CHECK-NEXT: stxvp vsp36, 64(r6)
380+
; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
381+
; CHECK-NEXT: stxvp vsp34, 32(r6)
382+
; CHECK-NEXT: stxvp vsp36, 0(r6)
383+
; CHECK-NEXT: lxvp vsp34, 0(r3)
384+
; CHECK-NEXT: lxvp vsp36, 32(r3)
385+
; CHECK-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
386+
; CHECK-NEXT: lxvp vsp34, 64(r3)
387+
; CHECK-NEXT: lxvp vsp36, 96(r3)
388+
; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0
389+
; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
390+
; CHECK-NEXT: stxvp vsp34, 96(r5)
391+
; CHECK-NEXT: stxvp vsp36, 64(r5)
392+
; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
393+
; CHECK-NEXT: stxvp vsp34, 32(r5)
394+
; CHECK-NEXT: stxvp vsp36, 0(r5)
395+
; CHECK-NEXT: blr
396+
;
397+
; CHECK-BE-LABEL: tbuild:
398+
; CHECK-BE: # %bb.0: # %entry
399+
; CHECK-BE-NEXT: lxv v3, 0(r7)
400+
; CHECK-BE-NEXT: vmr v2, v3
401+
; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi0, vsp34, vsp34, 1
402+
; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0
403+
; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
404+
; CHECK-BE-NEXT: stxvp vsp36, 96(r6)
405+
; CHECK-BE-NEXT: stxvp vsp34, 64(r6)
406+
; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
407+
; CHECK-BE-NEXT: stxvp vsp36, 32(r6)
408+
; CHECK-BE-NEXT: stxvp vsp34, 0(r6)
409+
; CHECK-BE-NEXT: lxvp vsp34, 96(r3)
410+
; CHECK-BE-NEXT: lxvp vsp36, 64(r3)
411+
; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
412+
; CHECK-BE-NEXT: lxvp vsp34, 32(r3)
413+
; CHECK-BE-NEXT: lxvp vsp36, 0(r3)
414+
; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0
415+
; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
416+
; CHECK-BE-NEXT: stxvp vsp36, 96(r5)
417+
; CHECK-BE-NEXT: stxvp vsp34, 64(r5)
418+
; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
419+
; CHECK-BE-NEXT: stxvp vsp36, 32(r5)
420+
; CHECK-BE-NEXT: stxvp vsp34, 0(r5)
421+
; CHECK-BE-NEXT: blr
422+
entry:
423+
%0 = load <16 x i8>, ptr %v, align 16
424+
%1 = tail call <1024 x i1> @llvm.ppc.mma.build.dmr(<16 x i8> %0, <16 x i8> %0, <16 x i8> %0, <16 x i8> %0, <16 x i8> %0, <16 x i8> %0, <16 x i8> %0, <16 x i8> %0)
425+
store <1024 x i1> %1, ptr %res2, align 128
426+
%2 = load <1024 x i1>, ptr %p1, align 128
427+
tail call void @llvm.ppc.mma.disassemble.dmr(ptr %res1, <1024 x i1> %2)
428+
ret void
429+
}
430+
431+
declare <1024 x i1> @llvm.ppc.mma.build.dmr(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
432+
declare void @llvm.ppc.mma.disassemble.dmr(ptr, <1024 x i1>)
370433
declare <1024 x i1> @llvm.ppc.mma.dmsetdmrz()
371434
declare <1024 x i1> @llvm.ppc.mma.dmmr(<1024 x i1>)
372435
declare <1024 x i1> @llvm.ppc.mma.dmxor(<1024 x i1>, <1024 x i1>)

0 commit comments

Comments
 (0)