
Commit 9c0332d

Initial pass at "relaxed" multiply and add operations.
This commit adds the following to the RealFunctions protocol:

    static func _relaxedAdd(_:Self, _:Self) -> Self
    static func _relaxedMul(_:Self, _:Self) -> Self

These are equivalent to + and *, but have "relaxed semantics": specifically, they license the compiler to reassociate them and to form FMA nodes, which are both significant optimizations that can easily make many common loops 8-10x faster.

These transformations perturb results slightly, so they should not be enabled without care, but for most purposes the results of the relaxed operations are "just as good as" (and often better than) what the strict operations produce. The main thing to beware of is that the results are no longer portable: different compiler versions, targets, and optimization flags will produce different results.

They are underscored because they are not stable API. In particular:

- `RealFunctions` is not really the right protocol for these (and neither is `Real`). I need to do some thinking about where to attach them.
- Even if it were the right protocol, these are more like implementation hooks than the API I really want people to use (TBD).
- I like "relaxed" more than other commonly used idioms ("fast"), but I'm not sure it's the name I ultimately want.
1 parent 97c716b commit 9c0332d
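To illustrate the intended usage, here is a minimal sketch mirroring the helpers added in the tests below (the helper names are illustrative, not API introduced by this commit). A reduction written with `_relaxedAdd` gives the optimizer permission to reorder the additions, which is what allows the loop to be unrolled and vectorized:

import RealModule

// Strict summation: + may not be reassociated, so the elements must be
// added one at a time, in order.
func strictSum(_ array: [Float]) -> Float {
  array.reduce(0, +)
}

// Relaxed summation: the optimizer may reorder the additions and form FMAs,
// at the cost of results that are not bit-for-bit reproducible across
// compilers, targets, and optimization levels.
func relaxedSum(_ array: [Float]) -> Float {
  array.reduce(0, Float._relaxedAdd)
}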

9 files changed, +307 -6 lines changed


Sources/RealModule/Double+Real.swift

Lines changed: 10 additions & 0 deletions
@@ -224,4 +224,14 @@ extension Double: Real {
   public static func _mulAdd(_ a: Double, _ b: Double, _ c: Double) -> Double {
     _numerics_muladd(a, b, c)
   }
+
+  @_transparent
+  public static func _relaxedAdd(_ a: Double, _ b: Double) -> Double {
+    _numerics_relaxed_add(a, b)
+  }
+
+  @_transparent
+  public static func _relaxedMul(_ a: Double, _ b: Double) -> Double {
+    _numerics_relaxed_mul(a, b)
+  }
 }

Sources/RealModule/Float+Real.swift

Lines changed: 10 additions & 0 deletions
@@ -197,4 +197,14 @@ extension Float: Real {
   public static func _mulAdd(_ a: Float, _ b: Float, _ c: Float) -> Float {
     _numerics_muladdf(a, b, c)
   }
+
+  @_transparent
+  public static func _relaxedAdd(_ a: Float, _ b: Float) -> Float {
+    _numerics_relaxed_addf(a, b)
+  }
+
+  @_transparent
+  public static func _relaxedMul(_ a: Float, _ b: Float) -> Float {
+    _numerics_relaxed_mulf(a, b)
+  }
 }

Sources/RealModule/Float16+Real.swift

Lines changed: 15 additions & 0 deletions
@@ -172,6 +172,21 @@ extension Float16: Real {
     Float16(.logGamma(Float(x)))
   }
   #endif
+
+  @_transparent
+  public static func _mulAdd(_ a: Float16, _ b: Float16, _ c: Float16) -> Float16 {
+    _numerics_muladdf16(a, b, c)
+  }
+
+  @_transparent
+  public static func _relaxedAdd(_ a: Float16, _ b: Float16) -> Float16 {
+    _numerics_relaxed_addf16(a, b)
+  }
+
+  @_transparent
+  public static func _relaxedMul(_ a: Float16, _ b: Float16) -> Float16 {
+    _numerics_relaxed_mulf16(a, b)
+  }
 }

 #endif

Sources/RealModule/Float80+Real.swift

Lines changed: 10 additions & 0 deletions
@@ -165,5 +165,15 @@ extension Float80: Real {
     var dontCare: Int32 = 0
     return libm_lgammal(x, &dontCare)
   }
+
+  @_transparent
+  public static func _relaxedAdd(_ a: Float80, _ b: Float80) -> Float80 {
+    _numerics_relaxed_addl(a, b)
+  }
+
+  @_transparent
+  public static func _relaxedMul(_ a: Float80, _ b: Float80) -> Float80 {
+    _numerics_relaxed_mull(a, b)
+  }
 }
 #endif

Sources/RealModule/Real.swift

Lines changed: 10 additions & 0 deletions
@@ -88,6 +88,16 @@ extension Real {
     a*b + c
   }

+  @_transparent
+  public static func _relaxedAdd(_ a: Self, _ b: Self) -> Self {
+    a + b
+  }
+
+  @_transparent
+  public static func _relaxedMul(_ a: Self, _ b: Self) -> Self {
+    a * b
+  }
+
   @_transparent
   public static func sqrt(_ x: Self) -> Self {
     return x.squareRoot()
Sources/RealModule/RealFunctions.swift

Lines changed: 41 additions & 3 deletions
@@ -77,8 +77,46 @@ public protocol RealFunctions: ElementaryFunctions {
   static func signGamma(_ x: Self) -> FloatingPointSign
   #endif

-  /// a*b + c, computed _either_ with an FMA or with separate multiply and add.
-  ///
-  /// Whichever is faster should be chosen by the compiler statically.
+  /// a*b + c, computed _either_ with an FMA or with separate multiply and add,
+  /// whichever is fastest on the compilation target.
   static func _mulAdd(_ a: Self, _ b: Self, _ c: Self) -> Self
+
+  /// a + b, with the optimizer licensed to reassociate and form FMAs.
+  ///
+  /// Floating-point addition is not an associative operation, so the Swift
+  /// compiler does not have any flexibility in how it evaluates an expression
+  /// like:
+  /// ```
+  /// func sum(array: [Float]) -> Float {
+  ///   array.reduce(0, +)
+  /// }
+  /// ```
+  /// Using `_relaxedAdd` instead of `+` permits the compiler to reorder the
+  /// terms in the summation, which unlocks loop unrolling and vectorization.
+  /// In a benchmark, simply using `_relaxedAdd` provides about an 8x speedup
+  /// for release builds, without any unsafe flags or other optimizations.
+  /// Further improvement should be possible by improving LLVM optimizations
+  /// or adding attributes to license more aggressive unrolling and taking
+  /// advantage of vector ISA extensions for swift.
+  static func _relaxedAdd(_ a: Self, _ b: Self) -> Self
+
+  /// a * b, with the optimizer licensed to reassociate and form FMAs.
+  ///
+  /// Floating-point addition and multiplication are not associative operations,
+  /// so the Swift compiler does not have any flexibility in how it evaluates
+  /// an expression
+  /// like:
+  /// ```
+  /// func sumOfSquares(array: [Float]) -> Float {
+  ///   array.reduce(0) { $0 + $1*$1 }
+  /// }
+  /// ```
+  /// Using `_relaxedAdd` and `_relaxedMul` instead of `+` and `*` permits the
+  /// compiler to reorder the terms in the summation, which unlocks loop
+  /// unrolling and vectorization, and form fused multiply-adds, which allows
+  /// us to achieve twice the throughput on some hardware.
+  ///
+  /// If you want to license FMA formation, but _not_ reassociation (desirable
+  /// for some numerics tasks), use `_mulAdd(a, b, c)` instead.
+  static func _relaxedMul(_ a: Self, _ b: Self) -> Self
 }
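The closing note in the `_relaxedMul` documentation is worth underlining: when FMA formation is wanted but reassociation is not, `_mulAdd` is the tool. A minimal sketch of that case (the `evaluatePolynomial` helper is a hypothetical illustration, not part of this commit), using Horner's rule, where the evaluation order is deliberate and should not be reassociated, but each step is an a*b + c that may profitably be contracted into an FMA:

import RealModule

// Evaluates c[0] + c[1]*x + c[2]*x*x + ... by Horner's rule. The evaluation
// order matters for accuracy, so reassociation is undesirable; _mulAdd only
// licenses the compiler to contract each acc*x + c into a single FMA where
// that is faster.
func evaluatePolynomial(_ coefficients: [Double], at x: Double) -> Double {
  coefficients.reversed().reduce(0) { acc, c in
    Double._mulAdd(acc, x, c)
  }
}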

Sources/_NumericsShims/include/_NumericsShims.h

Lines changed: 69 additions & 3 deletions
@@ -382,18 +382,84 @@ HEADER_SHIM long double libm_lgammal(long double x, int *signp) {
 }
 #endif

-// MARK: - fast mul-add inlines
+// MARK: - math inlines with relaxed semantics to support optimization.
+
+/// a*b + c evaluated _either_ as two operations or fma, whichever is faster.
+HEADER_SHIM _Float16 _numerics_muladdf16(_Float16 a, _Float16 b, _Float16 c) {
+#pragma STDC FP_CONTRACT ON
+  return a*b + c;
+}
+
+/// a + b with the "allow reassociation" and "allow FMA formation" flags
+/// set in the IR.
+HEADER_SHIM _Float16 _numerics_relaxed_addf16(_Float16 a, _Float16 b) {
+#pragma clang fp reassociate(on) contract(fast)
+  return a + b;
+}
+
+/// a * b with the "allow reassociation" and "allow FMA formation" flags
+/// set in the IR.
+HEADER_SHIM _Float16 _numerics_relaxed_mulf16(_Float16 a, _Float16 b) {
+#pragma clang fp reassociate(on) contract(fast)
+  return a * b;
+}
+
 /// a*b + c evaluated _either_ as two operations or fma, whichever is faster.
 HEADER_SHIM float _numerics_muladdf(float a, float b, float c) {
 #pragma STDC FP_CONTRACT ON
   return a*b + c;
 }

+/// a + b with the "allow reassociation" and "allow FMA formation" flags
+/// set in the IR.
+HEADER_SHIM float _numerics_relaxed_addf(float a, float b) {
+#pragma clang fp reassociate(on) contract(fast)
+  return a + b;
+}
+
+/// a * b with the "allow reassociation" and "allow FMA formation" flags
+/// set in the IR.
+HEADER_SHIM float _numerics_relaxed_mulf(float a, float b) {
+#pragma clang fp reassociate(on) contract(fast)
+  return a * b;
+}
+
 /// a*b + c evaluated _either_ as two operations or fma, whichever is faster.
 HEADER_SHIM double _numerics_muladd(double a, double b, double c) {
 #pragma STDC FP_CONTRACT ON
   return a*b + c;
 }

-// No long-double muladd operation, because no one has built an FMA for it
-// (except for Itanium, which Swift doesn't support).
+/// a + b with the "allow reassociation" and "allow FMA formation" flags
+/// set in the IR.
+HEADER_SHIM double _numerics_relaxed_add(double a, double b) {
+#pragma clang fp reassociate(on) contract(fast)
+  return a + b;
+}
+
+/// a * b with the "allow reassociation" and "allow FMA formation" flags
+/// set in the IR.
+HEADER_SHIM double _numerics_relaxed_mul(double a, double b) {
+#pragma clang fp reassociate(on) contract(fast)
+  return a * b;
+}
+
+#if !defined _WIN32 && (defined __i386__ || defined __x86_64__)
+/// a + b with the "allow reassociation" and "allow FMA formation" flags
+/// set in the IR.
+HEADER_SHIM long double _numerics_relaxed_addl(long double a, long double b) {
+#pragma clang fp reassociate(on) contract(fast)
+  return a + b;
+}
+
+/// a * b with the "allow reassociation" and "allow FMA formation" flags
+/// set in the IR.
+HEADER_SHIM long double _numerics_relaxed_mull(long double a, long double b) {
+#pragma clang fp reassociate(on) contract(fast)
+  return a * b;
+}
+#endif
+
+HEADER_SHIM void _numerics_optimization_barrier(const void *pointer) {
+  __asm("": :"r" (pointer));
+}

Sources/_TestSupport/BlackHole.swift

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+//===--- BlackHole.swift --------------------------------------*- swift -*-===//
+//
+// This source file is part of the Swift Numerics open source project
+//
+// Copyright (c) 2021 Apple Inc. and the Swift Numerics project authors
+// Licensed under Apache License v2.0 with Runtime Library Exception
+//
+// See https://swift.org/LICENSE.txt for license information
+//
+//===----------------------------------------------------------------------===//
+
+import _NumericsShims
+
+@_transparent
+public func blackHole<T>(_ thing: T) {
+  withUnsafePointer(to: thing) {
+    _numerics_optimization_barrier($0)
+  }
+}
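`blackHole` (and the `_numerics_optimization_barrier` shim behind it) exists so that a benchmarked result cannot be thrown away as dead code. A minimal sketch of the pattern (the `timeSum` helper is hypothetical; the real harness is `benchmarkReduction` in the tests below):

import _TestSupport

// Without the final blackHole call, the optimizer could notice that `total`
// is never read and, in principle, delete the loop being measured.
func timeSum(_ data: [Float]) {
  var total: Float = 0
  for x in data {
    total += x
  }
  blackHole(total)
}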
RelaxedArithmeticTests.swift

Lines changed: 123 additions & 0 deletions
@@ -0,0 +1,123 @@
+//===--- RelaxedArithmeticTests.swift -------------------------*- swift -*-===//
+//
+// This source file is part of the Swift Numerics open source project
+//
+// Copyright (c) 2021 Apple Inc. and the Swift Numerics project authors
+// Licensed under Apache License v2.0 with Runtime Library Exception
+//
+// See https://swift.org/LICENSE.txt for license information
+//
+//===----------------------------------------------------------------------===//
+
+import XCTest
+import RealModule
+import _TestSupport
+#if canImport(Accelerate)
+import Accelerate
+#endif
+
+func strictSum<T: Real>(_ array: [T]) -> T {
+  array.reduce(0, +)
+}
+
+func relaxedSum<T: Real>(_ array: [T]) -> T {
+  array.reduce(0, T._relaxedAdd)
+}
+
+func strictSumOfSquares<T: Real>(_ array: [T]) -> T {
+  array.reduce(0) { $0 + $1*$1 }
+}
+
+func relaxedSumOfSquares<T: Real>(_ array: [T]) -> T {
+  array.reduce(0) { ._relaxedAdd($0, ._relaxedMul($1, $1)) }
+}
+
+// TODO: not a great harness, but making it better bumps up against the
+// limitations of what XCT measure { } lets us do easily. Good enough for now.
+@_transparent
+func benchmarkReduction<T: Real>(_ data: [T], _ reduction: ([T]) -> T) {
+  var accum: T = 0
+  for _ in 0 ..< 10_000 {
+    accum += reduction(data)
+  }
+  blackHole(accum)
+}
+
+final class RelaxedArithmeticTests: XCTestCase {
+
+  var floatData: [Float] = []
+
+  override func setUp() {
+    super.setUp()
+    floatData = (0 ..< 1024).map { _ in .random(in: .sqrt(1/2) ..< .sqrt(2)) }
+  }
+
+  func testStrictSumPerformance() {
+    measure { benchmarkReduction(floatData, strictSum) }
+  }
+
+  func testRelaxedSumPerformance() {
+    measure { benchmarkReduction(floatData, relaxedSum) }
+  }
+
+  #if canImport(Accelerate)
+  func testvDSPSumPerformance() {
+    measure { benchmarkReduction(floatData, vDSP.sum) }
+  }
+  #endif
+
+  func testStrictDotPerformance() {
+    measure { benchmarkReduction(floatData, strictSumOfSquares) }
+  }
+
+  func testRelaxedDotPerformance() {
+    measure { benchmarkReduction(floatData, relaxedSumOfSquares) }
+  }
+
+  #if canImport(Accelerate)
+  func testvDSPDotPerformance() {
+    measure { benchmarkReduction(floatData, vDSP.sumOfSquares) }
+  }
+  #endif
+
+  func testRelaxedArithmetic<T: FixedWidthFloatingPoint & Real>(_ type: T.Type) {
+    // Relaxed add is still an add; it's just permitted to reorder relative
+    // to other adds or form FMAs. So if we do one in isolation, it has to
+    // produce the same result as a normal addition.
+    let a = T.random(in: -1 ... 1)
+    let b = T.random(in: -1 ... 1)
+    XCTAssertEqual(a + b, ._relaxedAdd(a, b))
+    // Same is true for mul.
+    XCTAssertEqual(a * b, ._relaxedMul(a, b))
+    // add + mul must be either two operations or an FMA:
+    let unfused = a + 1.5 * b
+    let fused = a.addingProduct(1.5, b)
+    let relaxed = T._relaxedAdd(a, ._relaxedMul(1.5, b))
+    XCTAssert(relaxed == unfused || relaxed == fused)
+    // Summing all values in an array can be associated however we want, but
+    // has to satisfy the usual error bound of 0.5 * sum.ulp * numberOfElements.
+    // We don't have a golden reference, but we can compare two sums with twice
+    // the bound for a sanity check.
+    let array = (0 ..< 128).map { _ in T.random(in: 1 ..< 2) }
+    var ref = strictSum(array)
+    var tst = relaxedSum(array)
+    var bound = max(ref, tst).ulp * T(array.count)
+    XCTAssertLessThanOrEqual(abs(ref - tst), bound)
+    // Similarly for sum of squares ...
+    ref = strictSumOfSquares(array)
+    tst = relaxedSumOfSquares(array)
+    bound = 2 * max(ref, tst).ulp * T(array.count)
+    XCTAssertLessThanOrEqual(abs(ref - tst), bound)
+  }
+
+  func testRelaxedArithmetic() {
+    #if swift(>=5.4) && !((os(macOS) || targetEnvironment(macCatalyst)) && arch(x86_64))
+    testRelaxedArithmetic(Float16.self)
+    #endif
+    testRelaxedArithmetic(Float.self)
+    testRelaxedArithmetic(Double.self)
+    #if (arch(i386) || arch(x86_64)) && !os(Windows) && !os(Android)
+    testRelaxedArithmetic(Float80.self)
+    #endif
+  }
+}
