
Commit 9c0332d

Initial pass at "relaxed" multiply and add operations.
This commit adds the following to the RealFunctions protocol:

    static func _relaxedAdd(_:Self, _:Self) -> Self
    static func _relaxedMul(_:Self, _:Self) -> Self

These are equivalent to + and *, but have "relaxed semantics": specifically, they license the compiler to reassociate them and to form FMA nodes, which are both significant optimizations that can easily make many common loops 8-10x faster.

These transformations perturb results slightly, so they should not be enabled without care, but for most purposes the results of the relaxed operations are "just as good as" (and often better than) what the strict operations produce. The main thing to beware of is that the results are no longer portable: different compiler versions, targets, and optimization flags will produce different results.

They are underscored because they are not stable API. In particular:

- `RealFunctions` is not really the right protocol for these (and neither is `Real`). I need to do some thinking about where to attach them.
- Even if it were the right protocol, these are more like implementation hooks than the API I really want people to use (TBD).
- I like "relaxed" more than other commonly used idioms ("fast"), but I'm not sure it's the name I ultimately want.
1 parent 97c716b commit 9c0332d
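To illustrate the intended usage, here is a minimal sketch mirroring the helpers added in the tests below (the helper names are illustrative, not API introduced by this commit). A reduction written with `_relaxedAdd` gives the optimizer permission to reorder the additions, which is what allows the loop to be unrolled and vectorized:

import RealModule

// Strict summation: + may not be reassociated, so the elements must be
// added one at a time, in order.
func strictSum(_ array: [Float]) -> Float {
  array.reduce(0, +)
}

// Relaxed summation: the optimizer may reorder the additions and form FMAs,
// at the cost of results that are not bit-for-bit reproducible across
// compilers, targets, and optimization levels.
func relaxedSum(_ array: [Float]) -> Float {
  array.reduce(0, Float._relaxedAdd)
}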

9 files changed, +307 -6 lines changed


Sources/RealModule/Double+Real.swift

Lines changed: 10 additions & 0 deletions
@@ -224,4 +224,14 @@ extension Double: Real {
   public static func _mulAdd(_ a: Double, _ b: Double, _ c: Double) -> Double {
     _numerics_muladd(a, b, c)
   }
+
+  @_transparent
+  public static func _relaxedAdd(_ a: Double, _ b: Double) -> Double {
+    _numerics_relaxed_add(a, b)
+  }
+
+  @_transparent
+  public static func _relaxedMul(_ a: Double, _ b: Double) -> Double {
+    _numerics_relaxed_mul(a, b)
+  }
 }

Sources/RealModule/Float+Real.swift

Lines changed: 10 additions & 0 deletions
@@ -197,4 +197,14 @@ extension Float: Real {
   public static func _mulAdd(_ a: Float, _ b: Float, _ c: Float) -> Float {
     _numerics_muladdf(a, b, c)
   }
+
+  @_transparent
+  public static func _relaxedAdd(_ a: Float, _ b: Float) -> Float {
+    _numerics_relaxed_addf(a, b)
+  }
+
+  @_transparent
+  public static func _relaxedMul(_ a: Float, _ b: Float) -> Float {
+    _numerics_relaxed_mulf(a, b)
+  }
 }

Sources/RealModule/Float16+Real.swift

Lines changed: 15 additions & 0 deletions
@@ -172,6 +172,21 @@ extension Float16: Real {
     Float16(.logGamma(Float(x)))
   }
   #endif
+
+  @_transparent
+  public static func _mulAdd(_ a: Float16, _ b: Float16, _ c: Float16) -> Float16 {
+    _numerics_muladdf16(a, b, c)
+  }
+
+  @_transparent
+  public static func _relaxedAdd(_ a: Float16, _ b: Float16) -> Float16 {
+    _numerics_relaxed_addf16(a, b)
+  }
+
+  @_transparent
+  public static func _relaxedMul(_ a: Float16, _ b: Float16) -> Float16 {
+    _numerics_relaxed_mulf16(a, b)
+  }
 }

 #endif

Sources/RealModule/Float80+Real.swift

Lines changed: 10 additions & 0 deletions
@@ -165,5 +165,15 @@ extension Float80: Real {
     var dontCare: Int32 = 0
     return libm_lgammal(x, &dontCare)
   }
+
+  @_transparent
+  public static func _relaxedAdd(_ a: Float80, _ b: Float80) -> Float80 {
+    _numerics_relaxed_addl(a, b)
+  }
+
+  @_transparent
+  public static func _relaxedMul(_ a: Float80, _ b: Float80) -> Float80 {
+    _numerics_relaxed_mull(a, b)
+  }
 }
 #endif

Sources/RealModule/Real.swift

Lines changed: 10 additions & 0 deletions
@@ -88,6 +88,16 @@ extension Real {
     a*b + c
   }

+  @_transparent
+  public static func _relaxedAdd(_ a: Self, _ b: Self) -> Self {
+    a + b
+  }
+
+  @_transparent
+  public static func _relaxedMul(_ a: Self, _ b: Self) -> Self {
+    a * b
+  }
+
   @_transparent
   public static func sqrt(_ x: Self) -> Self {
     return x.squareRoot()
Sources/RealModule/RealFunctions.swift

Lines changed: 41 additions & 3 deletions
@@ -77,8 +77,46 @@ public protocol RealFunctions: ElementaryFunctions {
   static func signGamma(_ x: Self) -> FloatingPointSign
   #endif

-  /// a*b + c, computed _either_ with an FMA or with separate multiply and add.
-  ///
-  /// Whichever is faster should be chosen by the compiler statically.
+  /// a*b + c, computed _either_ with an FMA or with separate multiply and add,
+  /// whichever is fastest on the compilation target.
   static func _mulAdd(_ a: Self, _ b: Self, _ c: Self) -> Self
+
+  /// a + b, with the optimizer licensed to reassociate and form FMAs.
+  ///
+  /// Floating-point addition is not an associative operation, so the Swift
+  /// compiler does not have any flexibility in how it evaluates an expression
+  /// like:
+  /// ```
+  /// func sum(array: [Float]) -> Float {
+  ///   array.reduce(0, +)
+  /// }
+  /// ```
+  /// Using `_relaxedAdd` instead of `+` permits the compiler to reorder the
+  /// terms in the summation, which unlocks loop unrolling and vectorization.
+  /// In a benchmark, simply using `_relaxedAdd` provides about an 8x speedup
+  /// for release builds, without any unsafe flags or other optimizations.
+  /// Further improvement should be possible by improving LLVM optimizations
+  /// or adding attributes to license more aggressive unrolling and taking
+  /// advantage of vector ISA extensions for swift.
+  static func _relaxedAdd(_ a: Self, _ b: Self) -> Self
+
+  /// a * b, with the optimizer licensed to reassociate and form FMAs.
+  ///
+  /// Floating-point addition and multiplication are not associative operations,
+  /// so the Swift compiler does not have any flexibility in how it evaluates
+  /// an expression
+  /// like:
+  /// ```
+  /// func sumOfSquares(array: [Float]) -> Float {
+  ///   array.reduce(0) { $0 + $1*$1 }
+  /// }
+  /// ```
+  /// Using `_relaxedAdd` and `_relaxedMul` instead of `+` and `*` permits the
+  /// compiler to reorder the terms in the summation, which unlocks loop
+  /// unrolling and vectorization, and form fused multiply-adds, which allows
+  /// us to achieve twice the throughput on some hardware.
+  ///
+  /// If you want to license FMA formation, but _not_ reassociation (desirable
+  /// for some numerics tasks), use `_mulAdd(a, b, c)` instead.
+  static func _relaxedMul(_ a: Self, _ b: Self) -> Self
 }
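The closing note in the `_relaxedMul` documentation is worth underlining: when FMA formation is wanted but reassociation is not, `_mulAdd` is the tool. A minimal sketch of that case (the `evaluatePolynomial` helper is a hypothetical illustration, not part of this commit), using Horner's rule, where the evaluation order is deliberate and should not be reassociated, but each step is an a*b + c that may profitably be contracted into an FMA:

import RealModule

// Evaluates c[0] + c[1]*x + c[2]*x*x + ... by Horner's rule. The evaluation
// order matters for accuracy, so reassociation is undesirable; _mulAdd only
// licenses the compiler to contract each acc*x + c into a single FMA where
// that is faster.
func evaluatePolynomial(_ coefficients: [Double], at x: Double) -> Double {
  coefficients.reversed().reduce(0) { acc, c in
    Double._mulAdd(acc, x, c)
  }
}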

Sources/_NumericsShims/include/_NumericsShims.h

Lines changed: 69 additions & 3 deletions
@@ -382,18 +382,84 @@ HEADER_SHIM long double libm_lgammal(long double x, int *signp) {
 }
 #endif

-// MARK: - fast mul-add inlines
+// MARK: - math inlines with relaxed semantics to support optimization.
+
+/// a*b + c evaluated _either_ as two operations or fma, whichever is faster.
+HEADER_SHIM _Float16 _numerics_muladdf16(_Float16 a, _Float16 b, _Float16 c) {
+#pragma STDC FP_CONTRACT ON
+  return a*b + c;
+}
+
+/// a + b with the "allow reassociation" and "allow FMA formation" flags
+/// set in the IR.
+HEADER_SHIM _Float16 _numerics_relaxed_addf16(_Float16 a, _Float16 b) {
+#pragma clang fp reassociate(on) contract(fast)
+  return a + b;
+}
+
+/// a * b with the "allow reassociation" and "allow FMA formation" flags
+/// set in the IR.
+HEADER_SHIM _Float16 _numerics_relaxed_mulf16(_Float16 a, _Float16 b) {
+#pragma clang fp reassociate(on) contract(fast)
+  return a * b;
+}
+
 /// a*b + c evaluated _either_ as two operations or fma, whichever is faster.
 HEADER_SHIM float _numerics_muladdf(float a, float b, float c) {
 #pragma STDC FP_CONTRACT ON
   return a*b + c;
 }

+/// a + b with the "allow reassociation" and "allow FMA formation" flags
+/// set in the IR.
+HEADER_SHIM float _numerics_relaxed_addf(float a, float b) {
+#pragma clang fp reassociate(on) contract(fast)
+  return a + b;
+}
+
+/// a * b with the "allow reassociation" and "allow FMA formation" flags
+/// set in the IR.
+HEADER_SHIM float _numerics_relaxed_mulf(float a, float b) {
+#pragma clang fp reassociate(on) contract(fast)
+  return a * b;
+}
+
 /// a*b + c evaluated _either_ as two operations or fma, whichever is faster.
 HEADER_SHIM double _numerics_muladd(double a, double b, double c) {
 #pragma STDC FP_CONTRACT ON
   return a*b + c;
 }

-// No long-double muladd operation, because no one has built an FMA for it
-// (except for Itanium, which Swift doesn't support).
+/// a + b with the "allow reassociation" and "allow FMA formation" flags
+/// set in the IR.
+HEADER_SHIM double _numerics_relaxed_add(double a, double b) {
+#pragma clang fp reassociate(on) contract(fast)
+  return a + b;
+}
+
+/// a * b with the "allow reassociation" and "allow FMA formation" flags
+/// set in the IR.
+HEADER_SHIM double _numerics_relaxed_mul(double a, double b) {
+#pragma clang fp reassociate(on) contract(fast)
+  return a * b;
+}
+
+#if !defined _WIN32 && (defined __i386__ || defined __x86_64__)
+/// a + b with the "allow reassociation" and "allow FMA formation" flags
+/// set in the IR.
+HEADER_SHIM long double _numerics_relaxed_addl(long double a, long double b) {
+#pragma clang fp reassociate(on) contract(fast)
+  return a + b;
+}
+
+/// a * b with the "allow reassociation" and "allow FMA formation" flags
+/// set in the IR.
+HEADER_SHIM long double _numerics_relaxed_mull(long double a, long double b) {
+#pragma clang fp reassociate(on) contract(fast)
+  return a * b;
+}
+#endif
+
+HEADER_SHIM void _numerics_optimization_barrier(const void *pointer) {
+  __asm("": :"r" (pointer));
+}

Sources/_TestSupport/BlackHole.swift

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+//===--- BlackHole.swift --------------------------------------*- swift -*-===//
+//
+// This source file is part of the Swift Numerics open source project
+//
+// Copyright (c) 2021 Apple Inc. and the Swift Numerics project authors
+// Licensed under Apache License v2.0 with Runtime Library Exception
+//
+// See https://swift.org/LICENSE.txt for license information
+//
+//===----------------------------------------------------------------------===//
+
+import _NumericsShims
+
+@_transparent
+public func blackHole<T>(_ thing: T) {
+  withUnsafePointer(to: thing) {
+    _numerics_optimization_barrier($0)
+  }
+}
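`blackHole` (and the `_numerics_optimization_barrier` shim behind it) exists so that a benchmarked result cannot be thrown away as dead code. A minimal sketch of the pattern (the `timeSum` helper is hypothetical; the real harness is `benchmarkReduction` in the tests below):

import _TestSupport

// Without the final blackHole call, the optimizer could notice that `total`
// is never read and, in principle, delete the loop being measured.
func timeSum(_ data: [Float]) {
  var total: Float = 0
  for x in data {
    total += x
  }
  blackHole(total)
}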
RelaxedArithmeticTests.swift

Lines changed: 123 additions & 0 deletions
@@ -0,0 +1,123 @@
+//===--- RelaxedArithmeticTests.swift -------------------------*- swift -*-===//
+//
+// This source file is part of the Swift Numerics open source project
+//
+// Copyright (c) 2021 Apple Inc. and the Swift Numerics project authors
+// Licensed under Apache License v2.0 with Runtime Library Exception
+//
+// See https://swift.org/LICENSE.txt for license information
+//
+//===----------------------------------------------------------------------===//
+
+import XCTest
+import RealModule
+import _TestSupport
+#if canImport(Accelerate)
+import Accelerate
+#endif
+
+func strictSum<T: Real>(_ array: [T]) -> T {
+  array.reduce(0, +)
+}
+
+func relaxedSum<T: Real>(_ array: [T]) -> T {
+  array.reduce(0, T._relaxedAdd)
+}
+
+func strictSumOfSquares<T: Real>(_ array: [T]) -> T {
+  array.reduce(0) { $0 + $1*$1 }
+}
+
+func relaxedSumOfSquares<T: Real>(_ array: [T]) -> T {
+  array.reduce(0) { ._relaxedAdd($0, ._relaxedMul($1, $1)) }
+}
+
+// TODO: not a great harness, but making it better bumps up against the
+// limitations of what XCT measure { } lets us do easily. Good enough for now.
+@_transparent
+func benchmarkReduction<T: Real>(_ data: [T], _ reduction: ([T]) -> T) {
+  var accum: T = 0
+  for _ in 0 ..< 10_000 {
+    accum += reduction(data)
+  }
+  blackHole(accum)
+}
+
+final class RelaxedArithmeticTests: XCTestCase {
+
+  var floatData: [Float] = []
+
+  override func setUp() {
+    super.setUp()
+    floatData = (0 ..< 1024).map { _ in .random(in: .sqrt(1/2) ..< .sqrt(2)) }
+  }
+
+  func testStrictSumPerformance() {
+    measure { benchmarkReduction(floatData, strictSum) }
+  }
+
+  func testRelaxedSumPerformance() {
+    measure { benchmarkReduction(floatData, relaxedSum) }
+  }
+
+  #if canImport(Accelerate)
+  func testvDSPSumPerformance() {
+    measure { benchmarkReduction(floatData, vDSP.sum) }
+  }
+  #endif
+
+  func testStrictDotPerformance() {
+    measure { benchmarkReduction(floatData, strictSumOfSquares) }
+  }
+
+  func testRelaxedDotPerformance() {
+    measure { benchmarkReduction(floatData, relaxedSumOfSquares) }
+  }
+
+  #if canImport(Accelerate)
+  func testvDSPDotPerformance() {
+    measure { benchmarkReduction(floatData, vDSP.sumOfSquares) }
+  }
+  #endif
+
+  func testRelaxedArithmetic<T: FixedWidthFloatingPoint & Real>(_ type: T.Type) {
+    // Relaxed add is still an add; it's just permitted to reorder relative
+    // to other adds or form FMAs. So if we do one in isolation, it has to
+    // produce the same result as a normal addition.
+    let a = T.random(in: -1 ... 1)
+    let b = T.random(in: -1 ... 1)
+    XCTAssertEqual(a + b, ._relaxedAdd(a, b))
+    // Same is true for mul.
+    XCTAssertEqual(a * b, ._relaxedMul(a, b))
+    // add + mul must be either two operations or an FMA:
+    let unfused = a + 1.5 * b
+    let fused = a.addingProduct(1.5, b)
+    let relaxed = T._relaxedAdd(a, ._relaxedMul(1.5, b))
+    XCTAssert(relaxed == unfused || relaxed == fused)
+    // Summing all values in an array can be associated however we want, but
+    // has to satisfy the usual error bound of 0.5 * sum.ulp * numberOfElements.
+    // We don't have a golden reference, but we can compare two sums with twice
+    // the bound for a sanity check.
+    let array = (0 ..< 128).map { _ in T.random(in: 1 ..< 2) }
+    var ref = strictSum(array)
+    var tst = relaxedSum(array)
+    var bound = max(ref, tst).ulp * T(array.count)
+    XCTAssertLessThanOrEqual(abs(ref - tst), bound)
+    // Similarly for sum of squares ...
+    ref = strictSumOfSquares(array)
+    tst = relaxedSumOfSquares(array)
+    bound = 2 * max(ref, tst).ulp * T(array.count)
+    XCTAssertLessThanOrEqual(abs(ref - tst), bound)
+  }
+
+  func testRelaxedArithmetic() {
+    #if swift(>=5.4) && !((os(macOS) || targetEnvironment(macCatalyst)) && arch(x86_64))
+    testRelaxedArithmetic(Float16.self)
+    #endif
+    testRelaxedArithmetic(Float.self)
+    testRelaxedArithmetic(Double.self)
+    #if (arch(i386) || arch(x86_64)) && !os(Windows) && !os(Android)
+    testRelaxedArithmetic(Float80.self)
+    #endif
+  }
+}
