Skip to content

Commit 76790cf

Browse files
[AArch64][SME2] Add Multi-vector add vector intrinsics
Add the following intrinsic: ADD vectors. NOTE: These intrinsics are still in development and are subject to future changes. Reviewed By: david-arm. Differential Revision: https://reviews.llvm.org/D142455
1 parent 557ea98 commit 76790cf

File tree

3 files changed

+175
-0
lines changed

3 files changed

+175
-0
lines changed

llvm/include/llvm/IR/IntrinsicsAArch64.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3132,4 +3132,10 @@ let TargetPrefix = "aarch64" in {
31323132

31333133
def int_aarch64_sme_write_vg1x2 : SME2_ZA_ArrayVector_Write_VG2_Intrinsic;
31343134
def int_aarch64_sme_write_vg1x4 : SME2_ZA_ArrayVector_Write_VG4_Intrinsic;
3135+
3136+
//
3137+
// Multi-Single Vector add
3138+
//
3139+
def int_aarch64_sve_add_single_x2 : SME2_VG2_Multi_Single_Intrinsic;
3140+
def int_aarch64_sve_add_single_x4 : SME2_VG4_Multi_Single_Intrinsic;
31353141
}

llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5061,6 +5061,20 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
50615061
AArch64::FCLAMP_VG4_4Z4Z_D}))
50625062
SelectClamp(Node, 4, Op);
50635063
return;
5064+
case Intrinsic::aarch64_sve_add_single_x2:
5065+
if (auto Op = SelectOpcodeFromVT<SelectTypeKind::Int>(
5066+
Node->getValueType(0),
5067+
{AArch64::ADD_VG2_2ZZ_B, AArch64::ADD_VG2_2ZZ_H,
5068+
AArch64::ADD_VG2_2ZZ_S, AArch64::ADD_VG2_2ZZ_D}))
5069+
SelectDestructiveMultiIntrinsic(Node, 2, false, Op);
5070+
return;
5071+
case Intrinsic::aarch64_sve_add_single_x4:
5072+
if (auto Op = SelectOpcodeFromVT<SelectTypeKind::Int>(
5073+
Node->getValueType(0),
5074+
{AArch64::ADD_VG4_4ZZ_B, AArch64::ADD_VG4_4ZZ_H,
5075+
AArch64::ADD_VG4_4ZZ_S, AArch64::ADD_VG4_4ZZ_D}))
5076+
SelectDestructiveMultiIntrinsic(Node, 4, false, Op);
5077+
return;
50645078
}
50655079
break;
50665080
}

llvm/test/CodeGen/AArch64/sme2-intrinsics-add.ll

Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -370,6 +370,153 @@ define void @multi_vector_add_za_vg1x4_f64(i32 %slice, <vscale x 2 x double> %zn
370370
ret void
371371
}
372372

373+
;
374+
; ADD Vectors Multi-Single x2
375+
;
376+
377+
define { <vscale x 16 x i8>, <vscale x 16 x i8> } @multi_vec_add_single_x2_s8(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2, <vscale x 16 x i8> %zm) {
378+
; CHECK-LABEL: multi_vec_add_single_x2_s8:
379+
; CHECK: // %bb.0:
380+
; CHECK-NEXT: mov z5.d, z2.d
381+
; CHECK-NEXT: mov z4.d, z1.d
382+
; CHECK-NEXT: add { z4.b, z5.b }, { z4.b, z5.b }, z3.b
383+
; CHECK-NEXT: mov z0.d, z4.d
384+
; CHECK-NEXT: mov z1.d, z5.d
385+
; CHECK-NEXT: ret
386+
%res = call { <vscale x 16 x i8>, <vscale x 16 x i8> }
387+
@llvm.aarch64.sve.add.single.x2.nxv16i8(<vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2,
388+
<vscale x 16 x i8> %zm)
389+
ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res
390+
}
391+
392+
define { <vscale x 8 x i16>, <vscale x 8 x i16> } @multi_vec_add_single_x2_s16(<vscale x 8 x i16> %unused, <vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2, <vscale x 8 x i16> %zm) {
393+
; CHECK-LABEL: multi_vec_add_single_x2_s16:
394+
; CHECK: // %bb.0:
395+
; CHECK-NEXT: mov z5.d, z2.d
396+
; CHECK-NEXT: mov z4.d, z1.d
397+
; CHECK-NEXT: add { z4.h, z5.h }, { z4.h, z5.h }, z3.h
398+
; CHECK-NEXT: mov z0.d, z4.d
399+
; CHECK-NEXT: mov z1.d, z5.d
400+
; CHECK-NEXT: ret
401+
%res = call { <vscale x 8 x i16>, <vscale x 8 x i16> }
402+
@llvm.aarch64.sve.add.single.x2.nxv8i16(<vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2,
403+
<vscale x 8 x i16> %zm)
404+
ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res
405+
}
406+
407+
define { <vscale x 4 x i32>, <vscale x 4 x i32> } @multi_vec_add_single_x2_s32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2, <vscale x 4 x i32> %zm) {
408+
; CHECK-LABEL: multi_vec_add_single_x2_s32:
409+
; CHECK: // %bb.0:
410+
; CHECK-NEXT: mov z5.d, z2.d
411+
; CHECK-NEXT: mov z4.d, z1.d
412+
; CHECK-NEXT: add { z4.s, z5.s }, { z4.s, z5.s }, z3.s
413+
; CHECK-NEXT: mov z0.d, z4.d
414+
; CHECK-NEXT: mov z1.d, z5.d
415+
; CHECK-NEXT: ret
416+
%res = call { <vscale x 4 x i32>, <vscale x 4 x i32> }
417+
@llvm.aarch64.sve.add.single.x2.nxv4i32(<vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2,
418+
<vscale x 4 x i32> %zm)
419+
ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res
420+
}
421+
422+
define { <vscale x 2 x i64>, <vscale x 2 x i64> } @multi_vec_add_single_x2_s64(<vscale x 2 x i64> %unused, <vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2, <vscale x 2 x i64> %zm) {
423+
; CHECK-LABEL: multi_vec_add_single_x2_s64:
424+
; CHECK: // %bb.0:
425+
; CHECK-NEXT: mov z5.d, z2.d
426+
; CHECK-NEXT: mov z4.d, z1.d
427+
; CHECK-NEXT: add { z4.d, z5.d }, { z4.d, z5.d }, z3.d
428+
; CHECK-NEXT: mov z0.d, z4.d
429+
; CHECK-NEXT: mov z1.d, z5.d
430+
; CHECK-NEXT: ret
431+
%res = call { <vscale x 2 x i64>, <vscale x 2 x i64> }
432+
@llvm.aarch64.sve.add.single.x2.nxv2i64(<vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2,
433+
<vscale x 2 x i64> %zm)
434+
ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res
435+
}
436+
437+
;
438+
; ADD Vectors Multi-Single x4
439+
;
440+
441+
define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @multi_vec_add_single_x4_s8(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2, <vscale x 16 x i8> %zdn3, <vscale x 16 x i8> %zdn4, <vscale x 16 x i8> %zm) {
442+
; CHECK-LABEL: multi_vec_add_single_x4_s8:
443+
; CHECK: // %bb.0:
444+
; CHECK-NEXT: mov z27.d, z4.d
445+
; CHECK-NEXT: mov z26.d, z3.d
446+
; CHECK-NEXT: mov z25.d, z2.d
447+
; CHECK-NEXT: mov z24.d, z1.d
448+
; CHECK-NEXT: add { z24.b - z27.b }, { z24.b - z27.b }, z5.b
449+
; CHECK-NEXT: mov z0.d, z24.d
450+
; CHECK-NEXT: mov z1.d, z25.d
451+
; CHECK-NEXT: mov z2.d, z26.d
452+
; CHECK-NEXT: mov z3.d, z27.d
453+
; CHECK-NEXT: ret
454+
%res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> }
455+
@llvm.aarch64.sve.add.single.x4.nxv16i8(<vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2,
456+
<vscale x 16 x i8> %zdn3, <vscale x 16 x i8> %zdn4,
457+
<vscale x 16 x i8> %zm)
458+
ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
459+
}
460+
461+
define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @multi_vec_add_x4_single_s16(<vscale x 8 x i16> %unused, <vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2, <vscale x 8 x i16> %zdn3, <vscale x 8 x i16> %zdn4, <vscale x 8 x i16> %zm) {
462+
; CHECK-LABEL: multi_vec_add_x4_single_s16:
463+
; CHECK: // %bb.0:
464+
; CHECK-NEXT: mov z27.d, z4.d
465+
; CHECK-NEXT: mov z26.d, z3.d
466+
; CHECK-NEXT: mov z25.d, z2.d
467+
; CHECK-NEXT: mov z24.d, z1.d
468+
; CHECK-NEXT: add { z24.h - z27.h }, { z24.h - z27.h }, z5.h
469+
; CHECK-NEXT: mov z0.d, z24.d
470+
; CHECK-NEXT: mov z1.d, z25.d
471+
; CHECK-NEXT: mov z2.d, z26.d
472+
; CHECK-NEXT: mov z3.d, z27.d
473+
; CHECK-NEXT: ret
474+
%res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> }
475+
@llvm.aarch64.sve.add.single.x4.nxv8i16(<vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2,
476+
<vscale x 8 x i16> %zdn3, <vscale x 8 x i16> %zdn4,
477+
<vscale x 8 x i16> %zm)
478+
ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
479+
}
480+
481+
define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @multi_vec_add_x4_single_s32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2, <vscale x 4 x i32> %zdn3, <vscale x 4 x i32> %zdn4, <vscale x 4 x i32> %zm) {
482+
; CHECK-LABEL: multi_vec_add_x4_single_s32:
483+
; CHECK: // %bb.0:
484+
; CHECK-NEXT: mov z27.d, z4.d
485+
; CHECK-NEXT: mov z26.d, z3.d
486+
; CHECK-NEXT: mov z25.d, z2.d
487+
; CHECK-NEXT: mov z24.d, z1.d
488+
; CHECK-NEXT: add { z24.s - z27.s }, { z24.s - z27.s }, z5.s
489+
; CHECK-NEXT: mov z0.d, z24.d
490+
; CHECK-NEXT: mov z1.d, z25.d
491+
; CHECK-NEXT: mov z2.d, z26.d
492+
; CHECK-NEXT: mov z3.d, z27.d
493+
; CHECK-NEXT: ret
494+
%res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> }
495+
@llvm.aarch64.sve.add.single.x4.nxv4i32(<vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2,
496+
<vscale x 4 x i32> %zdn3, <vscale x 4 x i32> %zdn4,
497+
<vscale x 4 x i32> %zm)
498+
ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
499+
}
500+
501+
define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @multi_vec_add_x4_single_s64(<vscale x 2 x i64> %unused, <vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2, <vscale x 2 x i64> %zdn3, <vscale x 2 x i64> %zdn4, <vscale x 2 x i64> %zm) {
502+
; CHECK-LABEL: multi_vec_add_x4_single_s64:
503+
; CHECK: // %bb.0:
504+
; CHECK-NEXT: mov z27.d, z4.d
505+
; CHECK-NEXT: mov z26.d, z3.d
506+
; CHECK-NEXT: mov z25.d, z2.d
507+
; CHECK-NEXT: mov z24.d, z1.d
508+
; CHECK-NEXT: add { z24.d - z27.d }, { z24.d - z27.d }, z5.d
509+
; CHECK-NEXT: mov z0.d, z24.d
510+
; CHECK-NEXT: mov z1.d, z25.d
511+
; CHECK-NEXT: mov z2.d, z26.d
512+
; CHECK-NEXT: mov z3.d, z27.d
513+
; CHECK-NEXT: ret
514+
%res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> }
515+
@llvm.aarch64.sve.add.single.x4.nxv2i64(<vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2,
516+
<vscale x 2 x i64> %zdn3, <vscale x 2 x i64> %zdn4,
517+
<vscale x 2 x i64> %zm)
518+
ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
519+
}
373520
declare void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
374521
declare void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
375522
declare void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
@@ -386,3 +533,11 @@ declare void @llvm.aarch64.sme.add.za32.vg1x2.nxv4f32(i32, <vscale x 4 x float>,
386533
declare void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>)
387534
declare void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
388535
declare void @llvm.aarch64.sme.add.za64.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
536+
declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.add.single.x2.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
537+
declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.add.single.x2.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
538+
declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.add.single.x2.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
539+
declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.add.single.x2.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
540+
declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.add.single.x4.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
541+
declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.add.single.x4.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
542+
declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.add.single.x4.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
543+
declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.add.single.x4.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)

0 commit comments

Comments
 (0)