Skip to content

Commit 698c06b

Browse files
committed
Improve handling of half element types
- Add tests for half element types, and only sink operands when the subtarget has fullfp16
- Refactor the scalable test to use the target-features attribute, rather than -mattr on the RUN line
1 parent 4ab2719 commit 698c06b

File tree

2 files changed

+91
-4
lines changed

2 files changed

+91
-4
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5244,6 +5244,10 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
52445244
if (I->getType()->isScalableTy())
52455245
return false;
52465246

5247+
if (cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
5248+
!ST->hasFullFP16())
5249+
return false;
5250+
52475251
// Sink splats for index lane variants
52485252
if (isSplatShuffle(I->getOperand(0)))
52495253
Ops.push_back(&I->getOperandUse(0));

llvm/test/CodeGen/AArch64/sinksplat.ll

Lines changed: 87 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -o - %s | FileCheck %s
2+
; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s
33

44
define <4 x i32> @smull(<4 x i16> %x, ptr %y) {
55
; CHECK-LABEL: smull:
@@ -422,8 +422,91 @@ l2:
422422
ret <4 x i32> %r
423423
}
424424

425+
; We shouldn't sink the splat operand for half vectors without fullfp16.
426+
define <4 x half> @fmul_half(ptr %x, ptr %y) {
427+
; CHECK-LABEL: fmul_half:
428+
; CHECK: // %bb.0: // %entry
429+
; CHECK-NEXT: ld1r { v1.4h }, [x0]
430+
; CHECK-NEXT: movi d0, #0000000000000000
431+
; CHECK-NEXT: mov x8, xzr
432+
; CHECK-NEXT: fcvtl v1.4s, v1.4h
433+
; CHECK-NEXT: .LBB13_1: // %l1
434+
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
435+
; CHECK-NEXT: ldr d2, [x1, x8]
436+
; CHECK-NEXT: fcvtl v0.4s, v0.4h
437+
; CHECK-NEXT: add x8, x8, #8
438+
; CHECK-NEXT: cmp w8, #8
439+
; CHECK-NEXT: fcvtl v2.4s, v2.4h
440+
; CHECK-NEXT: fmul v2.4s, v2.4s, v1.4s
441+
; CHECK-NEXT: fcvtn v2.4h, v2.4s
442+
; CHECK-NEXT: fcvtl v2.4s, v2.4h
443+
; CHECK-NEXT: fadd v0.4s, v2.4s, v0.4s
444+
; CHECK-NEXT: fcvtn v0.4h, v0.4s
445+
; CHECK-NEXT: b.eq .LBB13_1
446+
; CHECK-NEXT: // %bb.2: // %l2
447+
; CHECK-NEXT: ret
448+
entry:
449+
%x.val = load half, ptr %x
450+
%x.ins = insertelement <4 x half> poison, half %x.val, i64 0
451+
%a = shufflevector <4 x half> %x.ins, <4 x half> undef, <4 x i32> zeroinitializer
452+
br label %l1
453+
454+
l1:
455+
%p = phi i32 [ 0, %entry ], [ %pa, %l1 ]
456+
%q = phi <4 x half> [ zeroinitializer, %entry ], [ %c, %l1 ]
457+
%idx.y = mul nuw nsw i32 %p, 4
458+
%ptr.y = getelementptr half, ptr %y, i32 %idx.y
459+
%l = load <4 x half>, ptr %ptr.y
460+
%b = fmul <4 x half> %l, %a
461+
%c = fadd <4 x half> %b, %q
462+
%pa = add i32 %p, 1
463+
%c1 = icmp eq i32 %p, 0
464+
br i1 %c1, label %l1, label %l2
465+
466+
l2:
467+
ret <4 x half> %c
468+
}
469+
470+
define <4 x half> @fmul_half_fullfp16(ptr %x, ptr %y) "target-features"="+fullfp16" {
471+
; CHECK-LABEL: fmul_half_fullfp16:
472+
; CHECK: // %bb.0: // %entry
473+
; CHECK-NEXT: movi d0, #0000000000000000
474+
; CHECK-NEXT: ldr h1, [x0]
475+
; CHECK-NEXT: mov x8, xzr
476+
; CHECK-NEXT: .LBB14_1: // %l1
477+
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
478+
; CHECK-NEXT: ldr d2, [x1, x8]
479+
; CHECK-NEXT: add x8, x8, #8
480+
; CHECK-NEXT: cmp w8, #8
481+
; CHECK-NEXT: fmul v2.4h, v2.4h, v1.h[0]
482+
; CHECK-NEXT: fadd v0.4h, v2.4h, v0.4h
483+
; CHECK-NEXT: b.eq .LBB14_1
484+
; CHECK-NEXT: // %bb.2: // %l2
485+
; CHECK-NEXT: ret
486+
entry:
487+
%x.val = load half, ptr %x
488+
%x.ins = insertelement <4 x half> poison, half %x.val, i64 0
489+
%a = shufflevector <4 x half> %x.ins, <4 x half> undef, <4 x i32> zeroinitializer
490+
br label %l1
491+
492+
l1:
493+
%p = phi i32 [ 0, %entry ], [ %pa, %l1 ]
494+
%q = phi <4 x half> [ zeroinitializer, %entry ], [ %c, %l1 ]
495+
%idx.y = mul nuw nsw i32 %p, 4
496+
%ptr.y = getelementptr half, ptr %y, i32 %idx.y
497+
%l = load <4 x half>, ptr %ptr.y
498+
%b = fmul <4 x half> %l, %a
499+
%c = fadd <4 x half> %b, %q
500+
%pa = add i32 %p, 1
501+
%c1 = icmp eq i32 %p, 0
502+
br i1 %c1, label %l1, label %l2
503+
504+
l2:
505+
ret <4 x half> %c
506+
}
507+
425508
; We shouldn't sink the splat operand for scalable vectors.
426-
define <vscale x 4 x float> @fmul_scalable(ptr %x, ptr %y) {
509+
define <vscale x 4 x float> @fmul_scalable(ptr %x, ptr %y) "target-features"="+sve" {
427510
; CHECK-LABEL: fmul_scalable:
428511
; CHECK: // %bb.0: // %entry
429512
; CHECK-NEXT: ptrue p0.s
@@ -433,14 +516,14 @@ define <vscale x 4 x float> @fmul_scalable(ptr %x, ptr %y) {
433516
; CHECK-NEXT: mov w9, #1 // =0x1
434517
; CHECK-NEXT: ld1rw { z1.s }, p0/z, [x0]
435518
; CHECK-NEXT: lsl x8, x8, #2
436-
; CHECK-NEXT: .LBB13_1: // %l1
519+
; CHECK-NEXT: .LBB15_1: // %l1
437520
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
438521
; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1]
439522
; CHECK-NEXT: subs w9, w9, #1
440523
; CHECK-NEXT: add x1, x1, x8
441524
; CHECK-NEXT: fmul z2.s, z2.s, z1.s
442525
; CHECK-NEXT: fadd z0.s, z2.s, z0.s
443-
; CHECK-NEXT: b.eq .LBB13_1
526+
; CHECK-NEXT: b.eq .LBB15_1
444527
; CHECK-NEXT: // %bb.2: // %l2
445528
; CHECK-NEXT: ret
446529
entry:

0 commit comments

Comments
 (0)