Skip to content

Commit 32d761b

Browse files
[AArch64][machine-scheduler][Neoverse-N2] fdiv is blocking (#119206)
For Neoverse-N2, mark FP divide and square root instructions as blocking their pipeline until complete. This matches the way that blocking integer divide instructions are marked.

From the Software Optimization Guide, section 3.14, Notes:

1. FP divide and square root operations are performed using an iterative algorithm and block subsequent similar operations to the same pipeline until complete.

---------

Co-authored-by: Cullen Rhodes <[email protected]>
1 parent ec58ad6 commit 32d761b

File tree

3 files changed

+60
-20
lines changed

3 files changed

+60
-20
lines changed

llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td

Lines changed: 47 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -512,6 +512,12 @@ def N2Write_8c_3L_4V : SchedWriteRes<[N2UnitL, N2UnitL, N2UnitL,
512512
let NumMicroOps = 7;
513513
}
514514

515+
def N2Write_7c_7V0 : SchedWriteRes<[N2UnitV0]> {
516+
let Latency = 7;
517+
let NumMicroOps = 7;
518+
let ReleaseAtCycles = [7];
519+
}
520+
515521
//===----------------------------------------------------------------------===//
516522
// Define generic 8 micro-op types
517523

@@ -547,6 +553,15 @@ def N2Write_9c_4L_4V : SchedWriteRes<[N2UnitL, N2UnitL, N2UnitL, N2UnitL,
547553
let NumMicroOps = 8;
548554
}
549555

556+
//===----------------------------------------------------------------------===//
557+
// Define generic 9 micro-op types
558+
559+
def N2Write_9c_9V0 : SchedWriteRes<[N2UnitV0]> {
560+
let Latency = 9;
561+
let NumMicroOps = 9;
562+
let ReleaseAtCycles = [9];
563+
}
564+
550565
//===----------------------------------------------------------------------===//
551566
// Define generic 10 micro-op types
552567

@@ -557,6 +572,12 @@ def N2Write_7c_5L01_5V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitL01,
557572
let NumMicroOps = 10;
558573
}
559574

575+
def N2Write_10c_10V0 : SchedWriteRes<[N2UnitV0]> {
576+
let Latency = 10;
577+
let NumMicroOps = 10;
578+
let ReleaseAtCycles = [10];
579+
}
580+
560581
//===----------------------------------------------------------------------===//
561582
// Define generic 12 micro-op types
562583

@@ -580,6 +601,21 @@ def N2Write_7c_5L01_5S_5V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitL01,
580601
let NumMicroOps = 15;
581602
}
582603

604+
def N2Write_15c_15V0 : SchedWriteRes<[N2UnitV0]> {
605+
let Latency = 15;
606+
let NumMicroOps = 15;
607+
let ReleaseAtCycles = [15];
608+
}
609+
610+
//===----------------------------------------------------------------------===//
611+
// Define generic 16 micro-op types
612+
613+
def N2Write_16c_16V0 : SchedWriteRes<[N2UnitV0]> {
614+
let Latency = 16;
615+
let NumMicroOps = 16;
616+
let ReleaseAtCycles = [16];
617+
}
618+
583619
//===----------------------------------------------------------------------===//
584620
// Define generic 18 micro-op types
585621

@@ -795,22 +831,26 @@ def : SchedAlias<WriteF, N2Write_2c_1V>;
795831
// FP compare
796832
def : SchedAlias<WriteFCmp, N2Write_2c_1V0>;
797833

834+
// FP divide and square root operations are performed using an iterative
835+
// algorithm and block subsequent similar operations to the same pipeline
836+
// until complete (Arm Neoverse N2 Software Optimization Guide, 3.14).
837+
798838
// FP divide, square root
799-
def : SchedAlias<WriteFDiv, N2Write_7c_1V0>;
839+
def : SchedAlias<WriteFDiv, N2Write_7c_7V0>;
800840

801841
// FP divide, H-form
802-
def : InstRW<[N2Write_7c_1V0], (instrs FDIVHrr)>;
842+
def : InstRW<[N2Write_7c_7V0], (instrs FDIVHrr)>;
803843
// FP divide, S-form
804-
def : InstRW<[N2Write_10c_1V0], (instrs FDIVSrr)>;
844+
def : InstRW<[N2Write_10c_10V0], (instrs FDIVSrr)>;
805845
// FP divide, D-form
806-
def : InstRW<[N2Write_15c_1V0], (instrs FDIVDrr)>;
846+
def : InstRW<[N2Write_15c_15V0], (instrs FDIVDrr)>;
807847

808848
// FP square root, H-form
809-
def : InstRW<[N2Write_7c_1V0], (instrs FSQRTHr)>;
849+
def : InstRW<[N2Write_7c_7V0], (instrs FSQRTHr)>;
810850
// FP square root, S-form
811-
def : InstRW<[N2Write_9c_1V0], (instrs FSQRTSr)>;
851+
def : InstRW<[N2Write_9c_9V0], (instrs FSQRTSr)>;
812852
// FP square root, D-form
813-
def : InstRW<[N2Write_16c_1V0], (instrs FSQRTDr)>;
853+
def : InstRW<[N2Write_16c_16V0], (instrs FSQRTDr)>;
814854

815855
// FP multiply
816856
def : WriteRes<WriteFMul, [N2UnitV]> { let Latency = 3; }

llvm/test/CodeGen/AArch64/machine-combiner.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -262,8 +262,8 @@ define half @reassociate_adds_half(half %x0, half %x1, half %x2, half %x3) {
262262
; CHECK-UNSAFE-LABEL: reassociate_adds_half:
263263
; CHECK-UNSAFE: // %bb.0:
264264
; CHECK-UNSAFE-NEXT: fdiv h0, h0, h1
265-
; CHECK-UNSAFE-NEXT: fadd h1, h3, h2
266-
; CHECK-UNSAFE-NEXT: fadd h0, h1, h0
265+
; CHECK-UNSAFE-NEXT: fadd h2, h3, h2
266+
; CHECK-UNSAFE-NEXT: fadd h0, h2, h0
267267
; CHECK-UNSAFE-NEXT: ret
268268
%t0 = fdiv half %x0, %x1
269269
%t1 = fadd half %x2, %t0
@@ -284,8 +284,8 @@ define half @reassociate_muls_half(half %x0, half %x1, half %x2, half %x3) {
284284
; CHECK-UNSAFE-LABEL: reassociate_muls_half:
285285
; CHECK-UNSAFE: // %bb.0:
286286
; CHECK-UNSAFE-NEXT: fdiv h0, h0, h1
287-
; CHECK-UNSAFE-NEXT: fmul h1, h3, h2
288-
; CHECK-UNSAFE-NEXT: fmul h0, h1, h0
287+
; CHECK-UNSAFE-NEXT: fmul h2, h3, h2
288+
; CHECK-UNSAFE-NEXT: fmul h0, h2, h0
289289
; CHECK-UNSAFE-NEXT: ret
290290
%t0 = fdiv half %x0, %x1
291291
%t1 = fmul half %x2, %t0

llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-basic-instructions.s

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1891,7 +1891,7 @@ drps
18911891
# CHECK-NEXT: 1 2 0.50 fmov s0, s1
18921892
# CHECK-NEXT: 1 2 0.50 fabs s2, s3
18931893
# CHECK-NEXT: 1 2 0.50 fneg s4, s5
1894-
# CHECK-NEXT: 1 9 1.00 fsqrt s6, s7
1894+
# CHECK-NEXT: 9 9 9.00 fsqrt s6, s7
18951895
# CHECK-NEXT: 1 3 1.00 fcvt d8, s9
18961896
# CHECK-NEXT: 1 3 1.00 fcvt h10, s11
18971897
# CHECK-NEXT: 1 3 1.00 frintn s12, s13
@@ -1904,7 +1904,7 @@ drps
19041904
# CHECK-NEXT: 1 2 0.50 fmov d0, d1
19051905
# CHECK-NEXT: 1 2 0.50 fabs d2, d3
19061906
# CHECK-NEXT: 1 2 0.50 fneg d4, d5
1907-
# CHECK-NEXT: 1 16 1.00 fsqrt d6, d7
1907+
# CHECK-NEXT: 16 16 16.00 fsqrt d6, d7
19081908
# CHECK-NEXT: 1 3 1.00 fcvt s8, d9
19091909
# CHECK-NEXT: 1 3 1.00 fcvt h10, d11
19101910
# CHECK-NEXT: 1 3 1.00 frintn d12, d13
@@ -1917,7 +1917,7 @@ drps
19171917
# CHECK-NEXT: 1 3 1.00 fcvt s26, h27
19181918
# CHECK-NEXT: 1 3 1.00 fcvt d28, h29
19191919
# CHECK-NEXT: 1 3 0.50 fmul s20, s19, s17
1920-
# CHECK-NEXT: 1 10 1.00 fdiv s1, s2, s3
1920+
# CHECK-NEXT: 10 10 10.00 fdiv s1, s2, s3
19211921
# CHECK-NEXT: 1 2 0.50 fadd s4, s5, s6
19221922
# CHECK-NEXT: 1 2 0.50 fsub s7, s8, s9
19231923
# CHECK-NEXT: 1 2 0.50 fmax s10, s11, s12
@@ -1926,7 +1926,7 @@ drps
19261926
# CHECK-NEXT: 1 2 0.50 fminnm s19, s20, s21
19271927
# CHECK-NEXT: 1 3 0.50 fnmul s22, s23, s2
19281928
# CHECK-NEXT: 1 3 0.50 fmul d20, d19, d17
1929-
# CHECK-NEXT: 1 15 1.00 fdiv d1, d2, d3
1929+
# CHECK-NEXT: 15 15 15.00 fdiv d1, d2, d3
19301930
# CHECK-NEXT: 1 2 0.50 fadd d4, d5, d6
19311931
# CHECK-NEXT: 1 2 0.50 fsub d7, d8, d9
19321932
# CHECK-NEXT: 1 2 0.50 fmax d10, d11, d12
@@ -2557,7 +2557,7 @@ drps
25572557

25582558
# CHECK: Resource pressure per iteration:
25592559
# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8]
2560-
# CHECK-NEXT: 11.00 11.00 33.00 33.00 87.33 151.33 151.33 517.00 251.00 162.50 162.50 169.50 85.50
2560+
# CHECK-NEXT: 11.00 11.00 33.00 33.00 87.33 151.33 151.33 517.00 251.00 162.50 162.50 215.50 85.50
25612561

25622562
# CHECK: Resource pressure by instruction:
25632563
# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] Instructions:
@@ -3075,7 +3075,7 @@ drps
30753075
# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 fmov s0, s1
30763076
# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 fabs s2, s3
30773077
# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 fneg s4, s5
3078-
# CHECK-NEXT: - - - - - - - - - - - 1.00 - fsqrt s6, s7
3078+
# CHECK-NEXT: - - - - - - - - - - - 9.00 - fsqrt s6, s7
30793079
# CHECK-NEXT: - - - - - - - - - - - 1.00 - fcvt d8, s9
30803080
# CHECK-NEXT: - - - - - - - - - - - 1.00 - fcvt h10, s11
30813081
# CHECK-NEXT: - - - - - - - - - - - 1.00 - frintn s12, s13
@@ -3088,7 +3088,7 @@ drps
30883088
# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 fmov d0, d1
30893089
# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 fabs d2, d3
30903090
# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 fneg d4, d5
3091-
# CHECK-NEXT: - - - - - - - - - - - 1.00 - fsqrt d6, d7
3091+
# CHECK-NEXT: - - - - - - - - - - - 16.00 - fsqrt d6, d7
30923092
# CHECK-NEXT: - - - - - - - - - - - 1.00 - fcvt s8, d9
30933093
# CHECK-NEXT: - - - - - - - - - - - 1.00 - fcvt h10, d11
30943094
# CHECK-NEXT: - - - - - - - - - - - 1.00 - frintn d12, d13
@@ -3101,7 +3101,7 @@ drps
31013101
# CHECK-NEXT: - - - - - - - - - - - 1.00 - fcvt s26, h27
31023102
# CHECK-NEXT: - - - - - - - - - - - 1.00 - fcvt d28, h29
31033103
# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 fmul s20, s19, s17
3104-
# CHECK-NEXT: - - - - - - - - - - - 1.00 - fdiv s1, s2, s3
3104+
# CHECK-NEXT: - - - - - - - - - - - 10.00 - fdiv s1, s2, s3
31053105
# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 fadd s4, s5, s6
31063106
# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 fsub s7, s8, s9
31073107
# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 fmax s10, s11, s12
@@ -3110,7 +3110,7 @@ drps
31103110
# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 fminnm s19, s20, s21
31113111
# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 fnmul s22, s23, s2
31123112
# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 fmul d20, d19, d17
3113-
# CHECK-NEXT: - - - - - - - - - - - 1.00 - fdiv d1, d2, d3
3113+
# CHECK-NEXT: - - - - - - - - - - - 15.00 - fdiv d1, d2, d3
31143114
# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 fadd d4, d5, d6
31153115
# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 fsub d7, d8, d9
31163116
# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 fmax d10, d11, d12

0 commit comments

Comments
 (0)