improvements and fixes

awxkee · awxkee · commit 162901068100 · 2023-10-24T22:07:09.000+08:00
diff --git a/JxlCoder.podspec b/JxlCoder.podspec
@@ -1,6 +1,6 @@
 Pod::Spec.new do |s|
     s.name             = 'JxlCoder'
-    s.version          = '1.2.5'
+    s.version          = '1.2.6'
     s.summary          = 'JXL coder for iOS and MacOS'
     s.description      = 'Provides support for JXL files in iOS and MacOS'
     s.homepage         = 'https://github.com/awxkee/jxl-coder-swift'
diff --git a/Sources/jxlc/JxlWorker.cpp b/Sources/jxlc/JxlWorker.cpp
@@ -234,7 +234,7 @@ bool DecodeBasicInfo(const uint8_t *jxl, size_t size, size_t *xsize, size_t *ysi
 bool EncodeJxlOneshot(const std::vector<uint8_t> &pixels, const uint32_t xsize,
                       const uint32_t ysize, std::vector<uint8_t> *compressed,
                       JxlPixelType colorspace, JxlCompressionOption compression_option,
-                      float compression_distance, int effort) {
+                      float compressionDistance, int effort) {
     auto enc = JxlEncoderMake(/*memory_manager=*/nullptr);
     auto runner = JxlThreadParallelRunnerMake(
             /*memory_manager=*/nullptr,
@@ -311,10 +311,18 @@ bool EncodeJxlOneshot(const std::vector<uint8_t> &pixels, const uint32_t xsize,
     }
 
     if (JXL_ENC_SUCCESS !=
-               JxlEncoderSetFrameDistance(frameSettings, compression_distance)) {
+               JxlEncoderSetFrameDistance(frameSettings, compressionDistance)) {
         return false;
     }
 
+    if (colorspace == rgba) {
+        if (JXL_ENC_SUCCESS !=
+                   JxlEncoderSetExtraChannelDistance(frameSettings, 0, compressionDistance)) {
+            return false;
+        }
+    }
+
+
     if (JxlEncoderFrameSettingsSetOption(frameSettings,
                                          JXL_ENC_FRAME_SETTING_EFFORT, effort) != JXL_ENC_SUCCESS) {
         return false;
diff --git a/Sources/jxlc/RgbaScaler.h b/Sources/jxlc/RgbaScaler.h
@@ -32,8 +32,8 @@
 #import "XScaler.hpp"
 
 typedef NS_ENUM(NSInteger, JxlIPixelFormat)  {
-    kU8 NS_SWIFT_NAME(Uniform8),
-    kF16 NS_SWIFT_NAME(Float16)
+    kU8 NS_SWIFT_NAME(uniform8),
+    kF16 NS_SWIFT_NAME(float16)
 };
 
 @interface RgbaScaler : NSObject
diff --git a/Sources/jxlc/RgbaScaler.mm b/Sources/jxlc/RgbaScaler.mm
@@ -33,9 +33,7 @@
 
 @implementation RgbaScaler
 
-//static bool API_AVAILABLE(macos(13.0), ios(16.0), watchos(9.0), tvos(16.0))
 static bool scaleF16iOS16(std::vector<uint8_t> &src, int components, int width, int height, int newWidth, int newHeight, XSampler sampler) {
-    //    if (components != 4) {
     std::vector<uint8_t> dst(components * sizeof(uint16_t) * newWidth * newHeight);
 
     scaleImageFloat16(reinterpret_cast<uint16_t*>(src.data()),
@@ -44,109 +42,9 @@ static bool scaleF16iOS16(std::vector<uint8_t> &src, int components, int width,
 
     src = dst;
     return true;
-    //    }
-    //
-    //    std::vector<uint8_t> dst(4 * sizeof(uint16_t) * newWidth * newHeight);
-    //
-    //    vImage_Buffer srcBuffer = {
-    //        .data = (void*)src.data(),
-    //        .width = static_cast<vImagePixelCount>(width),
-    //        .height = static_cast<vImagePixelCount>(height),
-    //        .rowBytes = width * 4 * sizeof(uint16_t)
-    //    };
-    //
-    //    vImage_Buffer dstBuffer = {
-    //        .data = dst.data(),
-    //        .width = static_cast<vImagePixelCount>(newWidth),
-    //        .height = static_cast<vImagePixelCount>(newHeight),
-    //        .rowBytes = newWidth * 4 * sizeof(uint16_t)
-    //    };
-    //
-    //    auto result = vImageScale_ARGB16F(&srcBuffer, &dstBuffer, nullptr, kvImageUseFP16Accumulator);
-    //    if (result != kvImageNoError) {
-    //        return false;
-    //    }
-    //    src = dst;
-    //    return true;
-}
-
-static bool scaleF16iOSPre16(std::vector<uint8_t> &src, int components, int width, int height, int newWidth, int newHeight, XSampler sampler) {
-
-    vImage_Buffer srcBuffer = {
-        .data = (void*)src.data(),
-        .width = static_cast<vImagePixelCount>(width * components),
-        .height = static_cast<vImagePixelCount>(height),
-        .rowBytes = width * components * sizeof(uint16_t)
-    };
-
-    vImage_Buffer dstBuffer = {
-        .data = src.data(),
-        .width = static_cast<vImagePixelCount>(width * components),
-        .height = static_cast<vImagePixelCount>(height),
-        .rowBytes = width * components * sizeof(uint16_t)
-    };
-    vImage_Error vEerror = vImageConvert_16Fto16U(&srcBuffer, &dstBuffer, kvImageNoFlags);
-    if (vEerror != kvImageNoError) {
-        return false;
-    }
-
-    if (components == 4) {
-
-        std::vector<uint8_t> dst(components * sizeof(uint16_t) * newWidth * newHeight);
-
-        vImage_Buffer srcBuffer = {
-            .data = (void*)src.data(),
-            .width = static_cast<vImagePixelCount>(width),
-            .height = static_cast<vImagePixelCount>(height),
-            .rowBytes = width * 4 * sizeof(uint16_t)
-        };
-
-        vImage_Buffer dstBuffer = {
-            .data = dst.data(),
-            .width = static_cast<vImagePixelCount>(newWidth),
-            .height = static_cast<vImagePixelCount>(newHeight),
-            .rowBytes = newWidth * 4 * sizeof(uint16_t)
-        };
-
-        auto result = vImageScale_ARGB16U(&srcBuffer, &dstBuffer, nullptr, kvImageNoFlags);
-        if (result != kvImageNoError) {
-            return false;
-        }
-        src = dst;
-    } else {
-        std::vector<uint8_t> dst(components * sizeof(uint16_t) * newWidth * newHeight);
-
-        scaleImageU16(reinterpret_cast<uint16_t*>(src.data()),
-                      components * sizeof(uint16_t) * width, width, height, reinterpret_cast<uint16_t*>(dst.data()),
-                      components * sizeof(uint16_t) * newWidth, newWidth, newHeight, components, 16, sampler);
-        src = dst;
-    }
-
-    {
-        vImage_Buffer srcBuffer = {
-            .data = (void*)src.data(),
-            .width = static_cast<vImagePixelCount>(newWidth * components),
-            .height = static_cast<vImagePixelCount>(newHeight),
-            .rowBytes = newWidth * components * sizeof(uint16_t)
-        };
-
-        vImage_Buffer dstBuffer = {
-            .data = (void*)src.data(),
-            .width = static_cast<vImagePixelCount>(newWidth * components),
-            .height = static_cast<vImagePixelCount>(newHeight),
-            .rowBytes = newWidth * components * sizeof(uint16_t)
-        };
-        const float scale = 1.0f / float((1 << 16) - 1);
-        vImage_Error vEerror = vImageConvert_16Uto16F(&srcBuffer, &dstBuffer, kvImageNoFlags);
-        if (vEerror != kvImageNoError) {
-            return false;
-        }
-    }
-    return true;
 }
 
 + (bool)scaleRGB8:(std::vector<uint8_t> &)src components:(int)components width:(int)width height:(int)height newWidth:(int)newWidth newHeight:(int)newHeight sampler:(XSampler)sampler {
-    //    if (components != 4) {
     std::vector<uint8_t> dst(components * sizeof(uint8_t) * newWidth * newHeight);
 
     scaleImageU8(reinterpret_cast<uint8_t*>(src.data()),
@@ -155,36 +53,10 @@ + (bool)scaleRGB8:(std::vector<uint8_t> &)src components:(int)components width:(
     src = dst;
 
     return true;
-    //    }
-    //
-    //    std::vector<uint8_t> dst(4 * sizeof(uint8_t) * newWidth * newHeight);
-    //
-    //    vImage_Buffer srcBuffer = {
-    //        .data = (void*)src.data(),
-    //        .width = static_cast<vImagePixelCount>(width),
-    //        .height = static_cast<vImagePixelCount>(height),
-    //        .rowBytes = width * 4 * sizeof(uint8_t)
-    //    };
-    //
-    //    vImage_Buffer dstBuffer = {
-    //        .data = dst.data(),
-    //        .width = static_cast<vImagePixelCount>(newWidth),
-    //        .height = static_cast<vImagePixelCount>(newHeight),
-    //        .rowBytes = newWidth * 4 * sizeof(uint8_t)
-    //    };
-    //
-    //    auto result = vImageScale_ARGB8888(&srcBuffer, &dstBuffer, nullptr, kvImageNoFlags);
-    //    if (result != kvImageNoError) {
-    //        return false;
-    //    }
-    //
-    //    src = dst;
-    //    return true;
 }
 
 +(bool) scaleData:(std::vector<uint8_t>&)src width:(int)width height:(int)height newWidth:(int)newWidth newHeight:(int)newHeight components:(int)components pixelFormat:(JxlIPixelFormat)pixelFormat sampler:(XSampler)sampler {
 
-    //Flipping not supported
     if (newWidth < 0 || newHeight < 0) {
         return false;
     }
@@ -194,14 +66,8 @@ +(bool) scaleData:(std::vector<uint8_t>&)src width:(int)width height:(int)height
             return [self scaleRGB8:src components:components width:width height:height newWidth:newWidth newHeight:newHeight sampler:sampler];
         } else if (pixelFormat == kF16) {
             return scaleF16iOS16(src, components, width, height, newWidth, newHeight, sampler);
-            //            if (@available(iOS 16.0, macOS 13.0, *)) {
-            //                return scaleF16iOS16(src, components, width, height, newWidth, newHeight, sampler);
-            //            } else {
-            //                return scaleF16iOSPre16(src, components, width, height, newWidth, newHeight, sampler);
-            //            }
         }
     } catch (const std::bad_alloc& e) {
-        // Memory allocation has failed
         return false;
     }
     return false;
diff --git a/Sources/jxlc/ScaleInterpolator.cpp b/Sources/jxlc/ScaleInterpolator.cpp
@@ -31,6 +31,10 @@
 using namespace half_float;
 using namespace std;
 
+#if defined(__clang__)
+#pragma clang fp contract(fast) exceptions(ignore) reassociate(on)
+#endif
+
 // P Found using maxima
 //
 // y(x) := 4 * x * (%pi-x) / (%pi^2) ;
@@ -206,7 +210,8 @@ inline T sinc(T x) {
 template <typename T>
 inline T LanczosWindow(T x, const T a) {
     if (abs(x) < a) {
-        return sinc(T(M_PI) * x) * sinc(T(M_PI) * x / a);
+        T rv = T(M_PI) * x;
+        return sinc(rv) * sinc(rv / a);
     }
     return T(0.0);
 }
diff --git a/Sources/jxlc/ScaleInterpolator.h b/Sources/jxlc/ScaleInterpolator.h
@@ -60,17 +60,68 @@ T CubicBSpline(T t);
 #if __arm64__
 #include <arm_neon.h>
 
-inline float32x4_t Cos(const float32x4_t d) {
+__attribute__((always_inline))
+static inline float32x4_t Cos(const float32x4_t d) {
 
-    constexpr float C0 = 0.99940307;
-    constexpr float C1 = -0.49558072;
-    constexpr float C2 = 0.03679168;
+    const float32x4_t C0 = vdupq_n_f32(0.99940307);
+    const float32x4_t C1 = vdupq_n_f32(-0.49558072);
+    const float32x4_t C2 = vdupq_n_f32(0.03679168);
     constexpr float C3 = -0.00434102;
     float32x4_t x2 = vmulq_f32(d, d);
-    return vmlaq_f32(vdupq_n_f32(C0), x2, vmlaq_f32(vdupq_n_f32(C1), x2, vmlaq_f32(vdupq_n_f32(C2), x2, vdupq_n_f32(C3))));
+    return vmlaq_f32(C0, x2, vmlaq_f32(C1, x2, vmlaq_n_f32(C2, x2, C3)));
 }
 
-inline float32x4_t CubicInterpolation(const float32x4_t d,
+__attribute__((always_inline))
+static inline float32x4_t FastSin(const float32x4_t v) {
+    constexpr float A = 4.0f/(M_PI*M_PI);
+    const float32x4_t P = vdupq_n_f32(0.1952403377008734f);
+    const float32x4_t Q = vdupq_n_f32(0.01915214119105392f);
+    const float32x4_t N_PI = vdupq_n_f32(M_PI);
+    // Per lane: y = A * v * (pi - v). NOTE(review): presumably meant for v in [0, pi] - confirm.
+    float32x4_t y = vmulq_f32(vmulq_n_f32(v, A), vsubq_f32(N_PI, v));
+    // Returned value per lane: y * ((1 - P - Q) + y * (P + y * Q)).
+    const float32x4_t fract = vsubq_f32(vsubq_f32(vdupq_n_f32(1.0f), P), Q);
+    return vmulq_f32(y, vmlaq_f32(fract, y, vmlaq_f32(P, y, Q)));
+}
+
+// Per-lane sinc: FastSin(v) * reciprocal-estimate(v); lanes where v == 0 yield 1.
+__attribute__((always_inline))
+static inline float32x4_t Sinc(const float32x4_t v) {
+    const float32x4_t zeros = vdupq_n_f32(0);
+    const float32x4_t ones = vdupq_n_f32(1);
+    uint32x4_t mask = vceqq_f32(v, zeros);
+    // Lanes equal to 0 are replaced by 1 so the reciprocal estimate stays finite.
+    float32x4_t x = vbslq_f32(mask, ones, v);
+    x = vmulq_f32(FastSin(x), vrecpeq_f32(x));
+    // sinc(0) == 1: lanes that were exactly 0 are forced to 1, others keep the quotient.
+    return vbslq_f32(mask, ones, x);
+}
+
+// Per-lane Lanczos kernel: Sinc(pi * v) * Sinc(pi * v / a) where |v| < a, otherwise 0.
+__attribute__((always_inline))
+static inline float32x4_t LanczosWindow(const float32x4_t v, const float a) {
+    const float32x4_t fullLength = vdupq_n_f32(a);
+    const float32x4_t invLength = vrecpeq_f32(fullLength);
+    uint32x4_t mask = vcltq_f32(vabsq_f32(v), fullLength);
+    float32x4_t rv = vmulq_n_f32(v, M_PI);
+    float32x4_t x = vmulq_f32(Sinc(rv), Sinc(vmulq_f32(rv, invLength)));
+    // vbslq takes the first operand where the mask is set: keep x inside the window.
+    return vbslq_f32(mask, x, vdupq_n_f32(0));
+}
+
+// Per-lane Hann window: Cos(pi * d / length)^2 scaled by 1 / length where |d| < length / 2, else 0.
+__attribute__((always_inline))
+static inline float32x4_t HannWindow(const float32x4_t d, const float length) {
+    const float32x4_t invLength = vrecpeq_f32(vdupq_n_f32(length));  // estimate of 1 / length
+    const float32x4_t halfLength = vdupq_n_f32(length / 2);
+    uint32x4_t mask = vcltq_f32(vabsq_f32(d), halfLength);
+    float32x4_t cx = Cos(vmulq_f32(vmulq_n_f32(d, M_PI), invLength));
+    cx = vmulq_f32(vmulq_f32(cx, cx), invLength);
+    return vbslq_f32(mask, cx, vdupq_n_f32(0));  // first operand is taken where the mask is set
+}
+
+__attribute__((always_inline))
+static inline float32x4_t CubicInterpolation(const float32x4_t d,
                                const float32x4_t p0, const float32x4_t p1, const float32x4_t p2, const float32x4_t p3,
                                const float C, const float B) {
 
@@ -92,17 +143,8 @@ inline float32x4_t CubicInterpolation(const float32x4_t d,
     return result;
 }
 
-inline float32x4_t HannWindow(const float32x4_t d, const float length) {
-    float32x4_t x = vabsq_f32(d);
-    uint32x4_t mask = vcltq_f32(x, vdupq_n_f32(length / 2));
-
-    x = Cos(vdivq_f32(vmulq_f32(vdupq_n_f32(M_PI), x), vdupq_n_f32(length)));
-    x = vmulq_n_f32(vmulq_f32(x, x), length / 2);
-    x = vbslq_f32(mask, vdupq_n_f32(0), x);
-    return x;
-}
-
-inline float32x4_t CatmullRom(const float32x4_t d,
+__attribute__((always_inline))
+static inline float32x4_t CatmullRom(const float32x4_t d,
                               const float32x4_t p0, const float32x4_t p1, const float32x4_t p2, const float32x4_t p3) {
 
     float32x4_t x = vabsq_f32(d);
@@ -119,7 +161,8 @@ inline float32x4_t CatmullRom(const float32x4_t d,
     return result;
 }
 
-inline float32x4_t SimpleCubic(const float32x4_t d,
+__attribute__((always_inline))
+static inline float32x4_t SimpleCubic(const float32x4_t d,
                                const float32x4_t p0, const float32x4_t p1, const float32x4_t p2, const float32x4_t p3) {
 
     float32x4_t duplet = vmulq_f32(d, d);
@@ -139,17 +182,20 @@ inline float32x4_t SimpleCubic(const float32x4_t d,
     return result;
 }
 
-inline float32x4_t MitchellNetravali(float32x4_t d,
+__attribute__((always_inline))
+static inline float32x4_t MitchellNetravali(float32x4_t d,
                               float32x4_t p0, const float32x4_t p1, const float32x4_t p2, const float32x4_t p3) {
     return CubicInterpolation(d, p0, p1, p2, p3, 1.0f/3.0f, 1.0f/3.0f);
 }
 
-inline float32x4_t CubicHermite(const float32x4_t d,
+__attribute__((always_inline))
+static inline float32x4_t CubicHermite(const float32x4_t d,
                                const float32x4_t p0, const float32x4_t p1, const float32x4_t p2, const float32x4_t p3) {
     return CubicInterpolation(d, p0, p1, p2, p3, 0.0f, 0.0f);
 }
 
-inline float32x4_t CubicBSpline(const float32x4_t d,
+__attribute__((always_inline))
+static inline float32x4_t CubicBSpline(const float32x4_t d,
                                const float32x4_t p0, const float32x4_t p1, const float32x4_t p2, const float32x4_t p3) {
     return CubicInterpolation(d, p0, p1, p2, p3, 0.0f, 1.0f);
 }
diff --git a/Sources/jxlc/XScaler.mm b/Sources/jxlc/XScaler.mm