
Commit b30b3be

Non-Metal based perceptual image comparison (#666)
* Improve the speed of comparing memory buffers by using a workaround for a missed compiler optimization (illustrated in the sketch below)
* Remove Metal usage, since some virtualized environments don't support it; replaces the CoreImage operations that require Metal with CPU-based calculations
* Re-add the Metal-based image comparison: check for support before using it, and fall back to CPU computation if Metal is not supported
* Update the logic that determines whether a Metal kernel is supported on the device
* Use the maintainers' preferred method of using a while loop
1 parent 59b663f commit b30b3be
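
The first bullet refers to SR-6983 ([apple/swift#49531](https://github.com/apple/swift/issues/49531)): in unoptimized builds such as test targets, `for`-`in` iteration over a raw memory buffer is much slower than a hand-rolled `while` loop. A minimal sketch of that pattern is below; the `data`, `width`, `height`, and `rowBytes` parameters are hypothetical stand-ins for the fields of a vImage-style planar Float buffer, and this code is illustrative only, not part of the commit.

```swift
// Illustrative sketch only: count the pixels in a planar Float buffer whose value
// exceeds a threshold. The parameters mimic a vImage_Buffer's fields.
func countPixels(
  exceeding threshold: Float,
  data: UnsafeRawPointer,
  width: Int,
  height: Int,
  rowBytes: Int
) -> Int {
  let componentStride = MemoryLayout<Float>.stride
  var failingCount = 0
  // A manual `while` loop is used instead of `for row in 0..<height`: without compiler
  // optimizations (e.g. in test targets), range-based loops over memory buffers are
  // markedly slower; see SR-6983 / swift#49531.
  var row = 0
  while row < height {
    defer { row += 1 }
    let rowOffset = rowBytes * row
    var column = 0
    while column < width {
      defer { column += 1 }
      let value = data.load(fromByteOffset: rowOffset + column * componentStride, as: Float.self)
      if value > threshold { failingCount += 1 }
    }
  }
  return failingCount
}
```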

File tree

1 file changed: +129 −41 lines changed


Sources/SnapshotTesting/Snapshotting/UIImage.swift

Lines changed: 129 additions & 41 deletions
```diff
@@ -192,57 +192,132 @@
 #endif
 
 #if os(iOS) || os(tvOS) || os(macOS)
+import Accelerate.vImage
 import CoreImage.CIKernel
 import MetalPerformanceShaders
 
 @available(iOS 10.0, tvOS 10.0, macOS 10.13, *)
 func perceptuallyCompare(
   _ old: CIImage, _ new: CIImage, pixelPrecision: Float, perceptualPrecision: Float
 ) -> String? {
-  let deltaOutputImage = old.applyingFilter("CILabDeltaE", parameters: ["inputImage2": new])
-  let thresholdOutputImage: CIImage
-  do {
-    thresholdOutputImage = try ThresholdImageProcessorKernel.apply(
-      withExtent: new.extent,
-      inputs: [deltaOutputImage],
-      arguments: [
-        ThresholdImageProcessorKernel.inputThresholdKey: (1 - perceptualPrecision) * 100
-      ]
-    )
-  } catch {
-    return "Newly-taken snapshot's data could not be loaded. \(error)"
-  }
-  var averagePixel: Float = 0
+  // Calculate the deltaE values. Each pixel is a value between 0-100.
+  // 0 means no difference, 100 means completely opposite.
+  let deltaOutputImage = old.applyingLabDeltaE(new)
+  // Setting the working color space and output color space to NSNull disables color management. This is appropriate when the output
+  // of the operations is computational instead of an image intended to be displayed.
   let context = CIContext(options: [.workingColorSpace: NSNull(), .outputColorSpace: NSNull()])
-  context.render(
-    thresholdOutputImage.applyingFilter(
-      "CIAreaAverage", parameters: [kCIInputExtentKey: new.extent]),
-    toBitmap: &averagePixel,
-    rowBytes: MemoryLayout<Float>.size,
-    bounds: CGRect(x: 0, y: 0, width: 1, height: 1),
-    format: .Rf,
-    colorSpace: nil
-  )
-  let actualPixelPrecision = 1 - averagePixel
-  guard actualPixelPrecision < pixelPrecision else { return nil }
+  let deltaThreshold = (1 - perceptualPrecision) * 100
+  let actualPixelPrecision: Float
   var maximumDeltaE: Float = 0
-  context.render(
-    deltaOutputImage.applyingFilter("CIAreaMaximum", parameters: [kCIInputExtentKey: new.extent]),
-    toBitmap: &maximumDeltaE,
-    rowBytes: MemoryLayout<Float>.size,
-    bounds: CGRect(x: 0, y: 0, width: 1, height: 1),
-    format: .Rf,
-    colorSpace: nil
-  )
-  let actualPerceptualPrecision = 1 - maximumDeltaE / 100
-  if pixelPrecision < 1 {
-    return """
-      Actual image precision \(actualPixelPrecision) is less than required \(pixelPrecision)
-      Actual perceptual precision \(actualPerceptualPrecision) is less than required \(perceptualPrecision)
-      """
+
+  // Metal is supported by all iOS/tvOS devices (2013 models or later) and Macs (2012 models or later).
+  // Older devices do not support iOS/tvOS 13 and macOS 10.15, which are the minimum versions required by swift-snapshot-testing.
+  // However, some virtualized hardware does not have a GPU and therefore does not support Metal.
+  // In this case, macOS falls back to a CPU-based OpenGL ES renderer that silently fails when a Metal command is issued.
+  // We need to check for Metal device support and fall back to CPU-based vImage buffer iteration.
+  if ThresholdImageProcessorKernel.isSupported {
+    // Fast path - Metal processing
+    guard
+      let thresholdOutputImage = try? deltaOutputImage.applyingThreshold(deltaThreshold),
+      let averagePixel = thresholdOutputImage.applyingAreaAverage().renderSingleValue(in: context)
+    else {
+      return "Newly-taken snapshot's data could not be processed."
+    }
+    actualPixelPrecision = 1 - averagePixel
+    if actualPixelPrecision < pixelPrecision {
+      maximumDeltaE = deltaOutputImage.applyingAreaMaximum().renderSingleValue(in: context) ?? 0
+    }
   } else {
-    return
-      "Actual perceptual precision \(actualPerceptualPrecision) is less than required \(perceptualPrecision)"
+    // Slow path - CPU-based vImage buffer iteration
+    guard let buffer = deltaOutputImage.render(in: context) else {
+      return "Newly-taken snapshot could not be processed."
+    }
+    defer { buffer.free() }
+    var failingPixelCount: Int = 0
+    // rowBytes must be a multiple of 8, so vImage_Buffer pads the end of each row with bytes to meet that requirement.
+    // We must do 2D iteration of the vImage_Buffer in order to avoid loading the padding garbage bytes at the end of each row.
+    //
+    // NB: We are purposely using a verbose 'while' loop instead of a 'for in' loop. When the
+    // compiler doesn't have optimizations enabled, like in test targets, a `while` loop is
+    // significantly faster than a `for` loop for iterating through the elements of a memory
+    // buffer. Details can be found in [SR-6983](https://github.com/apple/swift/issues/49531)
+    let componentStride = MemoryLayout<Float>.stride
+    var line = 0
+    while line < buffer.height {
+      defer { line += 1 }
+      let lineOffset = buffer.rowBytes * line
+      var column = 0
+      while column < buffer.width {
+        defer { column += 1 }
+        let byteOffset = lineOffset + column * componentStride
+        let deltaE = buffer.data.load(fromByteOffset: byteOffset, as: Float.self)
+        if deltaE > deltaThreshold {
+          failingPixelCount += 1
+          if deltaE > maximumDeltaE {
+            maximumDeltaE = deltaE
+          }
+        }
+      }
+    }
+    let failingPixelPercent = Float(failingPixelCount) / Float(deltaOutputImage.extent.width * deltaOutputImage.extent.height)
+    actualPixelPrecision = 1 - failingPixelPercent
+  }
+
+  guard actualPixelPrecision < pixelPrecision else { return nil }
+  // The actual perceptual precision is the perceptual precision of the pixel with the highest DeltaE.
+  // DeltaE is on a 0-100 scale, so we need to divide by 100 to transform it to a percentage.
+  let minimumPerceptualPrecision = 1 - min(maximumDeltaE / 100, 1)
+  return """
+    The percentage of pixels that match \(actualPixelPrecision) is less than required \(pixelPrecision)
+    The lowest perceptual color precision \(minimumPerceptualPrecision) is less than required \(perceptualPrecision)
+    """
+}
+
+extension CIImage {
+  func applyingLabDeltaE(_ other: CIImage) -> CIImage {
+    applyingFilter("CILabDeltaE", parameters: ["inputImage2": other])
+  }
+
+  func applyingThreshold(_ threshold: Float) throws -> CIImage {
+    try ThresholdImageProcessorKernel.apply(
+      withExtent: extent,
+      inputs: [self],
+      arguments: [ThresholdImageProcessorKernel.inputThresholdKey: threshold]
+    )
+  }
+
+  func applyingAreaAverage() -> CIImage {
+    applyingFilter("CIAreaAverage", parameters: [kCIInputExtentKey: extent])
+  }
+
+  func applyingAreaMaximum() -> CIImage {
+    applyingFilter("CIAreaMaximum", parameters: [kCIInputExtentKey: extent])
+  }
+
+  func renderSingleValue(in context: CIContext) -> Float? {
+    guard let buffer = render(in: context) else { return nil }
+    defer { buffer.free() }
+    return buffer.data.load(fromByteOffset: 0, as: Float.self)
+  }
+
+  func render(in context: CIContext, format: CIFormat = CIFormat.Rh) -> vImage_Buffer? {
+    // Some hardware configurations (virtualized CPU renderers) do not support 32-bit float output formats,
+    // so use a compatible 16-bit float format and convert the output value to 32-bit floats.
+    guard var buffer16 = try? vImage_Buffer(width: Int(extent.width), height: Int(extent.height), bitsPerPixel: 16) else { return nil }
+    defer { buffer16.free() }
+    context.render(
+      self,
+      toBitmap: buffer16.data,
+      rowBytes: buffer16.rowBytes,
+      bounds: extent,
+      format: format,
+      colorSpace: nil
+    )
+    guard
+      var buffer32 = try? vImage_Buffer(width: Int(buffer16.width), height: Int(buffer16.height), bitsPerPixel: 32),
+      vImageConvert_Planar16FtoPlanarF(&buffer16, &buffer32, 0) == kvImageNoError
+    else { return nil }
+    return buffer32
   }
 }
```
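
To make the two precision parameters in the new code concrete, here is a small worked sketch of the arithmetic; the numbers are hypothetical examples chosen for illustration, not values from the commit.

```swift
// Hypothetical example values, purely illustrative.
let perceptualPrecision: Float = 0.98
let pixelPrecision: Float = 0.95

// A pixel "fails" when its Lab deltaE exceeds this threshold (deltaE is on a 0-100 scale).
let deltaThreshold = (1 - perceptualPrecision) * 100  // ≈ 2.0

// Suppose 300 of 100,000 pixels exceed that threshold.
let failingPixelCount: Float = 300
let totalPixelCount: Float = 100_000
let actualPixelPrecision = 1 - failingPixelCount / totalPixelCount  // 0.997

// The comparison passes when enough pixels stayed under the deltaE threshold,
// i.e. when actualPixelPrecision >= pixelPrecision.
let matches = actualPixelPrecision >= pixelPrecision  // true
```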

```diff
@@ -252,6 +327,19 @@
   static let inputThresholdKey = "thresholdValue"
   static let device = MTLCreateSystemDefaultDevice()
 
+  static var isSupported: Bool {
+    guard let device = device else {
+      return false
+    }
+    #if targetEnvironment(simulator)
+      guard #available(iOS 14.0, tvOS 14.0, *) else {
+        // The MPSSupportsMTLDevice method throws an exception on iOS/tvOS simulators < 14.0
+        return false
+      }
+    #endif
+    return MPSSupportsMTLDevice(device)
+  }
+
   override class func process(
     with inputs: [CIImageProcessorInput]?, arguments: [String: Any]?,
     output: CIImageProcessorOutput
```
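
For context, these are the knobs the library's image snapshot strategies expose to tests. A typical call site looks roughly like the sketch below; the test case and view are made up for illustration, and the exact `.image` parameter list should be checked against the swift-snapshot-testing release you are using.

```swift
import SnapshotTesting
import UIKit
import XCTest

// Hypothetical test case, shown only to illustrate how the precision parameters are passed.
final class ExampleSnapshotTests: XCTestCase {
  func testRedSquare() {
    let view = UIView(frame: CGRect(x: 0, y: 0, width: 100, height: 100))
    view.backgroundColor = .red
    // `precision` maps to pixelPrecision above; `perceptualPrecision` sets the deltaE tolerance:
    // 0.98 allows each pixel a Lab deltaE of up to 2, absorbing small GPU/CPU rendering differences.
    assertSnapshot(matching: view, as: .image(precision: 1, perceptualPrecision: 0.98))
  }
}
```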
