Merge pull request #214 from stephentyrone/rilakkuma

stephentyrone · web-flow · commit 93e5499e54c7 · 2023-04-26T09:14:35.000-04:00
Initial pass at "relaxed" multiply and add operations.
diff --git a/Package.swift b/Package.swift
@@ -53,7 +53,9 @@ let package = Package(
     .target(
       name: "_NumericsShims",
       exclude: excludedFilenames,
-      linkerSettings: [.linkedLibrary("m", .when(platforms: [.linux, .android]))]
+      linkerSettings: [
+        .linkedLibrary("m", .when(platforms: [.linux, .android]))
+      ]
     ),
     
     .target(
diff --git a/Sources/ComplexModule/Complex+AlgebraicField.swift b/Sources/ComplexModule/Complex+AlgebraicField.swift
@@ -123,4 +123,17 @@ extension Complex: AlgebraicField {
     }
     return nil
   }
+  
+  @_transparent
+  public static func _relaxedAdd(_ a: Self, _ b: Self) -> Self {
+    Complex(Relaxed.sum(a.x, b.x), Relaxed.sum(a.y, b.y))
+  }
+  
+  @_transparent
+  public static func _relaxedMul(_ a: Self, _ b: Self) -> Self {
+    Complex(
+      Relaxed.sum(Relaxed.product(a.x, b.x), -Relaxed.product(a.y, b.y)),
+      Relaxed.sum(Relaxed.product(a.x, b.y),  Relaxed.product(a.y, b.x))
+    )
+  }
 }
diff --git a/Sources/ComplexModule/Complex+ElementaryFunctions.swift b/Sources/ComplexModule/Complex+ElementaryFunctions.swift
@@ -119,7 +119,7 @@ extension Complex: ElementaryFunctions {
     }
     // Special cases out of the way, evaluate as discussed above.
     return Complex(
-      RealType._mulAdd(.cos(z.y), .expMinusOne(z.x), .cosMinusOne(z.y)),
+      Relaxed.multiplyAdd(.cos(z.y), .expMinusOne(z.x), .cosMinusOne(z.y)),
       .exp(z.x) * .sin(z.y)
     )
   }
@@ -300,7 +300,7 @@ extension Complex: ElementaryFunctions {
     // We are not trying for sub-ulp accuracy, just a good relative error
     // bound, so for our purposes it suffices to have log u dominate the
     // result:
-    if u >= 1 || u >= RealType._mulAdd(u,u,v*v) {
+    if u >= 1 || u >= Relaxed.multiplyAdd(u, u, v*v) {
       let r = v / u
       return Complex(.log(u) + .log(onePlus: r*r)/2, θ)
     }
diff --git a/Sources/RealModule/AlgebraicField.swift b/Sources/RealModule/AlgebraicField.swift
@@ -89,6 +89,12 @@ public protocol AlgebraicField: SignedNumeric {
   /// }
   /// ```
   var reciprocal: Self? { get }
+  
+  /// a + b, with the optimizer licensed to reassociate and form FMAs.
+  static func _relaxedAdd(_ a: Self, _ b: Self) -> Self
+  
+  /// a * b, with the optimizer licensed to reassociate and form FMAs.
+  static func _relaxedMul(_ a: Self, _ b: Self) -> Self
 }
 
 extension AlgebraicField {
@@ -99,12 +105,27 @@ extension AlgebraicField {
     return result
   }
   
-  /// Implementations should be *conservative* with the reciprocal property;
-  /// it is OK to return `nil` even in cases where a reciprocal could be
-  /// represented. For this reason, a default implementation that simply
-  /// always returns `nil` is correct, but conforming types should provide
-  /// a better implementation if possible.
+  // Implementations should be *conservative* with the reciprocal property;
+  // it is OK to return `nil` even in cases where a reciprocal could be
+  // represented. For this reason, a default implementation that simply
+  // always returns `nil` is correct, but conforming types should provide
+  // a better implementation if possible.
+  @_transparent
   public var reciprocal: Self? {
     return nil
   }
+  
+  // It's always OK to simply fall back on normal arithmetic, and for any
+  // field with exact arithmetic, this is the correct definition.
+  @_transparent
+  public static func _relaxedAdd(_ a: Self, _ b: Self) -> Self {
+    a + b
+  }
+  
+  // It's always OK to simply fall back on normal arithmetic, and for any
+  // field with exact arithmetic, this is the correct definition.
+  @_transparent
+  public static func _relaxedMul(_ a: Self, _ b: Self) -> Self {
+    a * b
+  }
 }
diff --git a/Sources/RealModule/CMakeLists.txt b/Sources/RealModule/CMakeLists.txt
@@ -17,7 +17,8 @@ add_library(RealModule
   Float16+Real.swift
   Float80+Real.swift
   Real.swift
-  RealFunctions.swift)
+  RealFunctions.swift
+  RelaxedArithmetic.swift)
 set_target_properties(RealModule PROPERTIES
   INTERFACE_INCLUDE_DIRECTORIES ${CMAKE_Swift_MODULE_DIRECTORY})
 target_link_libraries(RealModule PUBLIC
diff --git a/Sources/RealModule/Double+Real.swift b/Sources/RealModule/Double+Real.swift
@@ -221,7 +221,12 @@ extension Double: Real {
 #endif
   
   @_transparent
-  public static func _mulAdd(_ a: Double, _ b: Double, _ c: Double) -> Double {
-    _numerics_muladd(a, b, c)
+  public static func _relaxedAdd(_ a: Double, _ b: Double) -> Double {
+    _numerics_relaxed_add(a, b)
+  }
+  
+  @_transparent
+  public static func _relaxedMul(_ a: Double, _ b: Double) -> Double {
+    _numerics_relaxed_mul(a, b)
   }
 }
diff --git a/Sources/RealModule/Float+Real.swift b/Sources/RealModule/Float+Real.swift
@@ -194,7 +194,12 @@ extension Float: Real {
   #endif
   
   @_transparent
-  public static func _mulAdd(_ a: Float, _ b: Float, _ c: Float) -> Float {
-    _numerics_muladdf(a, b, c)
+  public static func _relaxedAdd(_ a: Float, _ b: Float) -> Float {
+    _numerics_relaxed_addf(a, b)
+  }
+  
+  @_transparent
+  public static func _relaxedMul(_ a: Float, _ b: Float) -> Float {
+    _numerics_relaxed_mulf(a, b)
   }
 }
diff --git a/Sources/RealModule/Float16+Real.swift b/Sources/RealModule/Float16+Real.swift
@@ -172,6 +172,20 @@ extension Float16: Real {
     Float16(.logGamma(Float(x)))
   }
   #endif
+  
+  // TODO: once clang stabilizes the calling conventions for _Float16 on Intel,
+  // we can re-enable these; presently the type is disabled on the target.
+  #if !(arch(i386) || arch(x86_64))
+  @_transparent
+  public static func _relaxedAdd(_ a: Float16, _ b: Float16) -> Float16 {
+    _numerics_relaxed_addf16(a, b)
+  }
+  
+  @_transparent
+  public static func _relaxedMul(_ a: Float16, _ b: Float16) -> Float16 {
+    _numerics_relaxed_mulf16(a, b)
+  }
+  #endif
 }
 
 #endif
diff --git a/Sources/RealModule/Float80+Real.swift b/Sources/RealModule/Float80+Real.swift
@@ -165,5 +165,15 @@ extension Float80: Real {
     var dontCare: Int32 = 0
     return libm_lgammal(x, &dontCare)
   }
+  
+  @_transparent
+  public static func _relaxedAdd(_ a: Float80, _ b: Float80) -> Float80 {
+    _numerics_relaxed_addl(a, b)
+  }
+  
+  @_transparent
+  public static func _relaxedMul(_ a: Float80, _ b: Float80) -> Float80 {
+    _numerics_relaxed_mull(a, b)
+  }
 }
 #endif
diff --git a/Sources/RealModule/RealFunctions.swift b/Sources/RealModule/RealFunctions.swift
@@ -76,9 +76,4 @@ public protocol RealFunctions: ElementaryFunctions {
   /// See also `gamma()` and `logGamma()`.
   static func signGamma(_ x: Self) -> FloatingPointSign
 #endif
-  
-  /// a*b + c, computed _either_ with an FMA or with separate multiply and add.
-  ///
-  /// Whichever is faster should be chosen by the compiler statically.
-  static func _mulAdd(_ a: Self, _ b: Self, _ c: Self) -> Self
 }
diff --git a/Sources/RealModule/RelaxedArithmetic.swift b/Sources/RealModule/RelaxedArithmetic.swift
@@ -0,0 +1,82 @@
+//===--- RelaxedArithmetic.swift ------------------------------*- swift -*-===//
+//
+// This source file is part of the Swift Numerics open source project
+//
+// Copyright (c) 2021 Apple Inc. and the Swift Numerics project authors
+// Licensed under Apache License v2.0 with Runtime Library Exception
+//
+// See https://swift.org/LICENSE.txt for license information
+//
+//===----------------------------------------------------------------------===//
+
+import _NumericsShims
+
+/// A namespace for "relaxed arithmetic" operations for types conforming to
+/// `AlgebraicField`.
+public enum Relaxed { }
+
+extension Relaxed {
+  /// a+b with the optimizer licensed to reassociate expressions and form FMAs.
+  ///
+  /// Floating-point addition is not an associative operation, so the Swift
+  /// compiler does not have any flexibility in how it evaluates an expression
+  /// like:
+  /// ```
+  /// func sum(array: [Float]) -> Float {
+  ///   array.reduce(0, +)
+  /// }
+  /// ```
+  /// Using `Relaxed.sum` instead of `+` permits the compiler to reorder the
+  /// terms in the summation, which unlocks loop unrolling and vectorization.
+  /// In a benchmark, simply using `Relaxed.sum` provides about an 8x speedup
+  /// for release builds, without any unsafe flags or other optimizations.
+  /// Further improvement should be possible by improving LLVM optimizations
+  /// or adding attributes to license more aggressive unrolling and taking
+  /// advantage of vector ISA extensions for Swift.
+  ///
+  /// If you want to compute `a-b` with relaxed semantics, use
+  /// `Relaxed.sum(a, -b)`.
+  ///
+  /// If a type or toolchain does not support reassociation for optimization
+  /// purposes, this operation decays to a normal addition; it is a license
+  /// for the compiler to optimize, not a guarantee that any change occurs.
+  @_transparent
+  public static func sum<T: AlgebraicField>(_ a: T, _ b: T) -> T {
+    T._relaxedAdd(a, b)
+  }
+  
+  /// a*b with the optimizer licensed to reassociate expressions and form FMAs.
+  ///
+  /// Floating-point addition and multiplication are not associative operations,
+  /// so the Swift compiler does not have any flexibility in how it evaluates
+  /// an expression like:
+  /// ```
+  /// func sumOfSquares(array: [Float]) -> Float {
+  ///   array.reduce(0) { $0 + $1*$1 }
+  /// }
+  /// ```
+  /// Using `Relaxed.sum` and `Relaxed.product` instead of `+` and `*` permits
+  /// the compiler to reorder the terms in the summation (unlocking loop
+  /// unrolling and vectorization) and to form fused multiply-adds, which can
+  /// double the throughput on some hardware.
+  ///
+  /// If a type or toolchain does not support reassociation for optimization
+  /// purposes, this operation decays to a normal multiplication; it is a
+  /// license for the compiler to optimize, not a guarantee that any change
+  /// occurs.
+  @_transparent
+  public static func product<T: AlgebraicField>(_ a: T, _ b: T) -> T {
+    T._relaxedMul(a, b)
+  }
+}
+
+extension Relaxed {
+  /// a*b + c, computed _either_ with an FMA or with separate multiply and add,
+  /// whichever is faster according to the optimizer's heuristics.
+  @_transparent
+  public static func multiplyAdd<T: AlgebraicField>(
+    _ a: T, _ b: T, _ c: T
+  ) -> T {
+    T._relaxedAdd(c, T._relaxedMul(a, b))
+  }
+}
diff --git a/Sources/_NumericsShims/include/_NumericsShims.h b/Sources/_NumericsShims/include/_NumericsShims.h
@@ -382,18 +382,71 @@ HEADER_SHIM long double libm_lgammal(long double x, int *signp) {
 }
 #endif
 
-// MARK: - fast mul-add inlines
-/// a*b + c evaluated _either_ as two operations or fma, whichever is faster.
-HEADER_SHIM float _numerics_muladdf(float a, float b, float c) {
-#pragma STDC FP_CONTRACT ON
-  return a*b + c;
+// MARK: - math inlines with relaxed semantics to support optimization.
+#define CLANG_RELAX_FP _Pragma("clang fp reassociate(on) contract(fast)")
+
+#if !(__i386__ || __x86_64__)
+/// a + b with the "allow reassociation" and "allow FMA formation" flags
+/// set in the IR.
+HEADER_SHIM _Float16 _numerics_relaxed_addf16(_Float16 a, _Float16 b) {
+  CLANG_RELAX_FP
+  return a + b;
+}
+
+/// a * b with the "allow reassociation" and "allow FMA formation" flags
+/// set in the IR.
+HEADER_SHIM _Float16 _numerics_relaxed_mulf16(_Float16 a, _Float16 b) {
+  CLANG_RELAX_FP
+  return a * b;
+}
+#endif
+
+/// a + b with the "allow reassociation" and "allow FMA formation" flags
+/// set in the IR.
+HEADER_SHIM float _numerics_relaxed_addf(float a, float b) {
+  CLANG_RELAX_FP
+  return a + b;
+}
+
+/// a * b with the "allow reassociation" and "allow FMA formation" flags
+/// set in the IR.
+HEADER_SHIM float _numerics_relaxed_mulf(float a, float b) {
+  CLANG_RELAX_FP
+  return a * b;
+}
+
+/// a + b with the "allow reassociation" and "allow FMA formation" flags
+/// set in the IR.
+HEADER_SHIM double _numerics_relaxed_add(double a, double b) {
+  CLANG_RELAX_FP
+  return a + b;
+}
+
+/// a * b with the "allow reassociation" and "allow FMA formation" flags
+/// set in the IR.
+HEADER_SHIM double _numerics_relaxed_mul(double a, double b) {
+  CLANG_RELAX_FP
+  return a * b;
 }
 
-/// a*b + c evaluated _either_ as two operations or fma, whichever is faster.
-HEADER_SHIM double _numerics_muladd(double a, double b, double c) {
-#pragma STDC FP_CONTRACT ON
-  return a*b + c;
+#if !defined _WIN32 && (defined __i386__ || defined __x86_64__)
+/// a + b with the "allow reassociation" and "allow FMA formation" flags
+/// set in the IR.
+HEADER_SHIM long double _numerics_relaxed_addl(long double a, long double b) {
+  CLANG_RELAX_FP
+  return a + b;
+}
+
+/// a * b with the "allow reassociation" and "allow FMA formation" flags
+/// set in the IR.
+HEADER_SHIM long double _numerics_relaxed_mull(long double a, long double b) {
+  CLANG_RELAX_FP
+  return a * b;
+}
+#endif
+
+HEADER_SHIM void _numerics_optimization_barrier(const void *pointer) {
+  __asm("": :"r" (pointer));
 }
 
-// No long-double muladd operation, because no one has built an FMA for it
-// (except for Itanium, which Swift doesn't support).
+#undef CLANG_RELAX_FP
diff --git a/Sources/_TestSupport/BlackHole.swift b/Sources/_TestSupport/BlackHole.swift
@@ -0,0 +1,19 @@
+//===--- BlackHole.swift --------------------------------------*- swift -*-===//
+//
+// This source file is part of the Swift Numerics open source project
+//
+// Copyright (c) 2021 Apple Inc. and the Swift Numerics project authors
+// Licensed under Apache License v2.0 with Runtime Library Exception
+//
+// See https://swift.org/LICENSE.txt for license information
+//
+//===----------------------------------------------------------------------===//
+
+import _NumericsShims
+
+@_transparent
+public func blackHole<T>(_ thing: T) {
+  withUnsafePointer(to: thing) {
+    _numerics_optimization_barrier($0)
+  }
+}
diff --git a/Sources/_TestSupport/CMakeLists.txt b/Sources/_TestSupport/CMakeLists.txt
@@ -8,6 +8,7 @@ See https://swift.org/LICENSE.txt for license information
 #]]
 
 add_library(_TestSupport
+  BlackHole.swift
   DoubleWidth.swift
   Error.swift
   Interval.swift
diff --git a/Tests/RealTests/CMakeLists.txt b/Tests/RealTests/CMakeLists.txt
@@ -9,8 +9,10 @@ See https://swift.org/LICENSE.txt for license information
 
 add_library(RealTests
   ApproximateEqualityTests.swift
+  AugmentedArithmeticTests.swift
   ElementaryFunctionChecks.swift
-  IntegerExponentTests.swift)
+  IntegerExponentTests.swift
+  RelaxedArithmeticTests.swift)
 target_compile_options(RealTests PRIVATE
   -enable-testing)
 target_link_libraries(RealTests PUBLIC
diff --git a/Tests/RealTests/RelaxedArithmeticTests.swift b/Tests/RealTests/RelaxedArithmeticTests.swift
diff --git a/Tests/WindowsMain.swift b/Tests/WindowsMain.swift

Original file line number	Diff line number	Diff line change
`@@ -123,4 +123,17 @@ extension Complex: AlgebraicField {`
`123`	`123`	`}`
`124`	`124`	`return nil`
`125`	`125`	`}`
	`126`	`+`
	`127`	`+ @_transparent`
	`128`	`+ public static func _relaxedAdd(_ a: Self, _ b: Self) -> Self {`
	`129`	`+ Complex(Relaxed.sum(a.x, b.x), Relaxed.sum(a.y, b.y))`
	`130`	`+ }`
	`131`	`+`
	`132`	`+ @_transparent`
	`133`	`+ public static func _relaxedMul(_ a: Self, _ b: Self) -> Self {`
	`134`	`+ Complex(`
	`135`	`+ Relaxed.sum(Relaxed.product(a.x, b.x), -Relaxed.product(a.y, b.y)),`
	`136`	`+ Relaxed.sum(Relaxed.product(a.x, b.y), Relaxed.product(a.y, b.x))`
	`137`	`+ )`
	`138`	`+ }`
`126`	`139`	`}`
Original file line number	Diff line number	Diff line change
`@@ -119,7 +119,7 @@ extension Complex: ElementaryFunctions {`
`119`	`119`	`}`
`120`	`120`	`// Special cases out of the way, evaluate as discussed above.`
`121`	`121`	`return Complex(`
`122`		`- RealType._mulAdd(.cos(z.y), .expMinusOne(z.x), .cosMinusOne(z.y)),`
	`122`	`+ Relaxed.multiplyAdd(.cos(z.y), .expMinusOne(z.x), .cosMinusOne(z.y)),`
`123`	`123`	`.exp(z.x) * .sin(z.y)`
`124`	`124`	`)`
`125`	`125`	`}`
`@@ -300,7 +300,7 @@ extension Complex: ElementaryFunctions {`
`300`	`300`	`// We are not trying for sub-ulp accuracy, just a good relative error`
`301`	`301`	`// bound, so for our purposes it suffices to have log u dominate the`
`302`	`302`	`// result:`
`303`		`- if u >= 1 \|\| u >= RealType._mulAdd(u,u,v*v) {`
	`303`	`+ if u >= 1 \|\| u >= Relaxed.multiplyAdd(u, u, v*v) {`
`304`	`304`	`let r = v / u`
`305`	`305`	`return Complex(.log(u) + .log(onePlus: r*r)/2, θ)`
`306`	`306`	`}`
Original file line number	Diff line number	Diff line change
`@@ -221,7 +221,12 @@ extension Double: Real {`
`221`	`221`	`#endif`
`222`	`222`
`223`	`223`	`@_transparent`
`224`		`- public static func _mulAdd(_ a: Double, _ b: Double, _ c: Double) -> Double {`
`225`		`- _numerics_muladd(a, b, c)`
	`224`	`+ public static func _relaxedAdd(_ a: Double, _ b: Double) -> Double {`
	`225`	`+ _numerics_relaxed_add(a, b)`
	`226`	`+ }`
	`227`	`+`
	`228`	`+ @_transparent`
	`229`	`+ public static func _relaxedMul(_ a: Double, _ b: Double) -> Double {`
	`230`	`+ _numerics_relaxed_mul(a, b)`
`226`	`231`	`}`
`227`	`232`	`}`
Original file line number	Diff line number	Diff line change
`@@ -194,7 +194,12 @@ extension Float: Real {`
`194`	`194`	`#endif`
`195`	`195`
`196`	`196`	`@_transparent`
`197`		`- public static func _mulAdd(_ a: Float, _ b: Float, _ c: Float) -> Float {`
`198`		`- _numerics_muladdf(a, b, c)`
	`197`	`+ public static func _relaxedAdd(_ a: Float, _ b: Float) -> Float {`
	`198`	`+ _numerics_relaxed_addf(a, b)`
	`199`	`+ }`
	`200`	`+`
	`201`	`+ @_transparent`
	`202`	`+ public static func _relaxedMul(_ a: Float, _ b: Float) -> Float {`
	`203`	`+ _numerics_relaxed_mulf(a, b)`
`199`	`204`	`}`
`200`	`205`	`}`