Merge pull request #10 from MeijisIrlnd/syl/simd-filter

MeijisIrlnd · web-flow · commit 27b816b247e6 · 2025-05-10T02:11:00.000+01:00
SIMD Biquad
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -25,6 +25,7 @@ if (APPLE)
             "-fassociative-math"
             "-fno-math-errno"
             "-freciprocal-math"
+            "-ftree-vectorize"
     )
 else ()
     if (WIN32)
@@ -75,6 +76,15 @@ FetchContent_Declare(
 )
 FetchContent_MakeAvailable(concurrentqueue)
 
+# Disable xsimd unit tests
+# NOTE(review): BUILD_TESTS is a generic variable name and stays set for the add_subdirectory()
+# calls that follow — confirm nothing configured later reads it.
+set(BUILD_TESTS OFF)
+# GIT_TAG pins xsimd to an exact commit.
+# NOTE(review): GIT_SHALLOW is combined with a commit hash rather than a branch or tag name —
+# confirm the minimum supported CMake version handles that combination.
+FetchContent_Declare(xsimd
+        GIT_REPOSITORY https://github.com/xtensor-stack/xsimd.git
+        GIT_TAG fb250213fd8c2db857fa0db05eea9b09fb1fe764
+        GIT_SHALLOW ON
+)
+FetchContent_MakeAvailable(xsimd)
+
 add_subdirectory(source)
 add_subdirectory(include)
 add_subdirectory(tests)
@@ -94,6 +104,7 @@ target_link_libraries(marvin PUBLIC
         ${MARVIN_EXTRA_LINK_LIBS}
         readerwriterqueue
         concurrentqueue
+        xsimd
 )
 
 install(DIRECTORY "include/" # source directory
diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt
@@ -22,6 +22,7 @@ set(MARVIN_HEADERS
         ${CMAKE_CURRENT_SOURCE_DIR}/marvin/dsp/filters/biquad/marvin_BiquadCoefficients.h
         ${CMAKE_CURRENT_SOURCE_DIR}/marvin/dsp/filters/biquad/marvin_SmoothedBiquadCoefficients.h
         ${CMAKE_CURRENT_SOURCE_DIR}/marvin/dsp/filters/biquad/marvin_Biquad.h
+        ${CMAKE_CURRENT_SOURCE_DIR}/marvin/dsp/filters/biquad/marvin_SIMDBiquad.h
         ${CMAKE_CURRENT_SOURCE_DIR}/marvin/dsp/filters/biquad/marvin_RBJCoefficients.h
         ${CMAKE_CURRENT_SOURCE_DIR}/marvin/dsp/oscillators/marvin_Oscillator.h
         ${CMAKE_CURRENT_SOURCE_DIR}/marvin/library/marvin_Concepts.h
diff --git a/include/marvin/dsp/filters/biquad/marvin_Biquad.h b/include/marvin/dsp/filters/biquad/marvin_Biquad.h
@@ -73,7 +73,19 @@ namespace marvin::dsp::filters {
                 // ));
                 const auto [a0, a1, a2, b0, b1, b2] = m_coeffs[stage];
                 auto& delay = m_delays[stage];
+                const auto invB0 = static_cast<SampleType>(1.0) / b0;
+                const auto a0x0 = a0 * x;
+                const auto a1x1 = a1 * delay.x_z1;
+                const auto a2x2 = a2 * delay.x_z2;
+                const auto aSum = a0x0 + a1x1 + a2x2;
+                const auto b1y1 = b1 * delay.y_z1;
+                const auto b2y2 = b2 * delay.y_z2;
+                const auto aMinusB1 = aSum - b1y1;
+                const auto aMinusB2 = aMinusB1 - b2y2;
+                const auto yOther = invB0 * aMinusB2;
+
                 const auto y = static_cast<SampleType>(1.0) / b0 * ((a0 * x) + (a1 * delay.x_z1) + (a2 * delay.x_z2) - (b1 * delay.y_z1) - (b2 * delay.y_z2));
+                // assert(yOther == y);
                 delay(x, y);
                 x = y;
             }
diff --git a/include/marvin/dsp/filters/biquad/marvin_SIMDBiquad.h b/include/marvin/dsp/filters/biquad/marvin_SIMDBiquad.h
@@ -0,0 +1,162 @@
+//
+// Created by Syl Morrison on 03/05/2025.
+//
+
+#ifndef MARVIN_SIMDBIQUAD_H
+#define MARVIN_SIMDBIQUAD_H
+#include <cassert>
+#include <cstddef>
+#include <cstring>
+#include <span>
+#include <vector>
+#include <xsimd/xsimd.hpp>
+#include <marvin/dsp/filters/biquad/marvin_BiquadCoefficients.h>
+#include <marvin/library/marvin_Concepts.h>
+#include <marvin/math/marvin_VecOps.h>
+namespace marvin::dsp::filters {
+    /**
+     * \brief A SIMD optimised biquad, for running N biquads in parallel.
+     *
+     * Every filter has its own coefficients and its own two-sample input/output history, held in
+     * separate aligned buffers (one buffer per coefficient / history slot, N entries each).
+     * The first `m_vecSize` filters - the largest multiple of the native batch width that fits in N -
+     * are processed with xsimd batches; the remaining filters are processed one at a time with the
+     * same arithmetic.
+     *
+     * Coefficients are divided by b0 when they are set, so the per-sample update is
+     * `y = a0*x + a1*x1 + a2*x2 - b1*y1 - b2*y2`.
+     *
+     * From benchmarks, only gives a speedup in certain cases, and even in those cases, only ~100ns.
+     * That being said, a robust parallel structure for filters is arguably nicer than a std::array<filter, N>.
+     *
+     * @tparam SampleType float or double
+     * @tparam N The number of parallel biquads to process
+     */
+    template <marvin::FloatType SampleType, size_t N>
+    requires(N > 0)
+    class SIMDBiquad final {
+    public:
+        /**
+         * Constructor. Sizes every coefficient and state buffer to N entries, all zero.
+         */
+        SIMDBiquad() {
+            constexpr auto zero = static_cast<SampleType>(0.0);
+            for (auto* buffer : { &m_working, &m_a0, &m_a1, &m_a2, &m_b1, &m_b2, &m_x1, &m_x2, &m_y1, &m_y2 }) {
+                buffer->resize(N, zero);
+            }
+        }
+
+        /**
+         * Sets the coefficients for all filters to the ones passed to the `coeffs` arg
+         *
+         * @param coeffs A BiquadCoefficients<SampleType> containing the coeffs you want to set.
+         */
+        auto setCoeffs(BiquadCoefficients<SampleType> coeffs) noexcept -> void {
+            m_equalCoeffs = true;
+            // Divide by b0 once here so the per-sample update needs no division.
+            const auto a0 = coeffs.a0 / coeffs.b0;
+            const auto a1 = coeffs.a1 / coeffs.b0;
+            const auto a2 = coeffs.a2 / coeffs.b0;
+            const auto b1 = coeffs.b1 / coeffs.b0;
+            const auto b2 = coeffs.b2 / coeffs.b0;
+
+            const auto a0Batch = xsimd::broadcast(a0);
+            const auto a1Batch = xsimd::broadcast(a1);
+            const auto a2Batch = xsimd::broadcast(a2);
+            const auto b1Batch = xsimd::broadcast(b1);
+            const auto b2Batch = xsimd::broadcast(b2);
+            // Whole batches first...
+            for (size_t i = 0; i < m_vecSize; i += m_simdSize) {
+                a0Batch.store_aligned(&m_a0[i]);
+                a1Batch.store_aligned(&m_a1[i]);
+                a2Batch.store_aligned(&m_a2[i]);
+                b1Batch.store_aligned(&m_b1[i]);
+                b2Batch.store_aligned(&m_b2[i]);
+            }
+            // ...then the filters that do not fill a whole batch.
+            for (size_t i = m_vecSize; i < N; ++i) {
+                m_a0[i] = a0;
+                m_a1[i] = a1;
+                m_a2[i] = a2;
+                m_b1[i] = b1;
+                m_b2[i] = b2;
+            }
+        }
+
+        /**
+         * Sets the coefficients for a specific biquad
+         * @param index Which of the N filters to update. Must be less than N (checked with an assert).
+         * @param coeffs The coefficients to store for that filter; they are divided by b0 before being stored.
+         */
+        auto setCoeffs(size_t index, BiquadCoefficients<SampleType> coeffs) noexcept -> void {
+            assert(index < N && "SIMDBiquad::setCoeffs: index out of range");
+            m_equalCoeffs = false;
+            const auto [a0, a1, a2, b0, b1, b2] = coeffs;
+            m_a0[index] = a0 / b0;
+            m_a1[index] = a1 / b0;
+            m_a2[index] = a2 / b0;
+            m_b1[index] = b1 / b0;
+            m_b2[index] = b2 / b0;
+        }
+
+        /**
+         * Processes all samples in x through their respective biquads, and overwrites the values in x
+         * @param x An array-like containing N samples to be filtered.
+         */
+        auto operator()(std::span<SampleType, N> x) noexcept -> void {
+            constexpr static auto sizeBytes = sizeof(SampleType) * N;
+            // Stage the inputs in the aligned working buffer so the batch loop can use aligned loads.
+            std::memcpy(m_working.data(), x.data(), sizeBytes);
+            for (size_t i = 0; i < m_vecSize; i += m_simdSize) {
+                const auto a0 = xsimd::load_aligned(&m_a0[i]);
+                const auto x0 = xsimd::load_aligned(&m_working[i]);
+                const auto a0x0 = a0 * x0;
+                const auto a1 = xsimd::load_aligned(&m_a1[i]);
+                const auto x1 = xsimd::load_aligned(&m_x1[i]);
+                const auto a1x1 = a1 * x1;
+                const auto a2 = xsimd::load_aligned(&m_a2[i]);
+                const auto x2 = xsimd::load_aligned(&m_x2[i]);
+                const auto a2x2 = a2 * x2;
+                const auto b1 = xsimd::load_aligned(&m_b1[i]);
+                const auto y1 = xsimd::load_aligned(&m_y1[i]);
+                const auto b1y1 = b1 * y1;
+                const auto b2 = xsimd::load_aligned(&m_b2[i]);
+                const auto y2 = xsimd::load_aligned(&m_y2[i]);
+                const auto b2y2 = b2 * y2;
+                const auto res = a0x0 + a1x1 + a2x2 - b1y1 - b2y2;
+                // Shift the histories by one sample: x1 -> x2, x0 -> x1, y1 -> y2, result -> y1.
+                x1.store_aligned(&m_x2[i]);
+                x0.store_aligned(&m_x1[i]);
+                y1.store_aligned(&m_y2[i]);
+                res.store_aligned(&m_y1[i]);
+            }
+
+            // Scalar path for the filters that do not fill a whole batch.
+            for (size_t i = m_vecSize; i < N; ++i) {
+                const auto res = (m_a0[i] * m_working[i]) + (m_a1[i] * m_x1[i]) + (m_a2[i] * m_x2[i]) - (m_b1[i] * m_y1[i]) - (m_b2[i] * m_y2[i]);
+                m_x2[i] = m_x1[i];
+                m_x1[i] = m_working[i];
+                m_y2[i] = m_y1[i];
+                m_y1[i] = res;
+            }
+            // The newest outputs live in m_y1; hand them back through the caller's span.
+            std::memcpy(x.data(), m_y1.data(), sizeBytes);
+        }
+
+        /**
+         * Zeroes all internal state (except coefficients).
+         */
+        auto reset() noexcept -> void {
+            constexpr auto zero = static_cast<SampleType>(0.0);
+            // xsimd::broadcast deduces the batch element type from its argument. A bare `0.0` gives a
+            // double batch even when SampleType is float, which does not match the m_simdSize stride
+            // used below, so the zero is cast to SampleType first.
+            const auto zeroBatch = xsimd::broadcast(zero);
+            for (size_t i = 0; i < m_vecSize; i += m_simdSize) {
+                zeroBatch.store_aligned(&m_working[i]);
+                zeroBatch.store_aligned(&m_x1[i]);
+                zeroBatch.store_aligned(&m_x2[i]);
+                zeroBatch.store_aligned(&m_y1[i]);
+                zeroBatch.store_aligned(&m_y2[i]);
+            }
+            for (size_t i = m_vecSize; i < N; ++i) {
+                m_working[i] = zero;
+                m_x1[i] = zero;
+                m_x2[i] = zero;
+                m_y1[i] = zero;
+                m_y2[i] = zero;
+            }
+        }
+
+    private:
+        // Number of SampleType lanes in one native xsimd batch.
+        constexpr static auto m_simdSize = xsimd::simd_type<SampleType>::size;
+        // Count of leading filters that can be processed in whole batches; the rest use the scalar loops.
+        constexpr static auto m_vecSize = N - N % m_simdSize;
+        // True after the broadcast setCoeffs overload, false after the per-index overload.
+        // Only written by this class; nothing here reads it.
+        bool m_equalCoeffs{ false };
+        // Aligned copy of the most recent input samples.
+        std::vector<SampleType, xsimd::aligned_allocator<SampleType>> m_working;
+        // Per-filter coefficients, already divided by b0.
+        std::vector<SampleType, xsimd::aligned_allocator<SampleType>> m_a0, m_a1, m_a2, m_b1, m_b2;
+        // Per-filter histories: previous two inputs (x1, x2) and previous two outputs (y1, y2).
+        std::vector<SampleType, xsimd::aligned_allocator<SampleType>> m_x1, m_x2, m_y1, m_y2;
+    };
+
+
+} // namespace marvin::dsp::filters
+#endif // MARVIN_SIMDBIQUAD_H
diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
@@ -20,6 +20,7 @@ set(MARVIN_SOURCES
         ${CMAKE_CURRENT_SOURCE_DIR}/dsp/filters/marvin_APF.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/dsp/filters/marvin_LPF.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/dsp/filters/marvin_SVF.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/dsp/filters/biquad/marvin_SIMDBiquad.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/dsp/filters/biquad/marvin_Biquad.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/dsp/filters/biquad/marvin_BiquadCoefficients.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/dsp/filters/biquad/marvin_RBJCoefficients.cpp
diff --git a/source/dsp/filters/biquad/marvin_SIMDBiquad.cpp b/source/dsp/filters/biquad/marvin_SIMDBiquad.cpp
@@ -0,0 +1,4 @@
+//
+// Created by Syl Morrison on 10/05/2025.
+//
+#include <marvin/dsp/filters/biquad/marvin_SIMDBiquad.h>
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
@@ -11,6 +11,7 @@ set(MARVIN_TEST_SOURCE
         ${CMAKE_CURRENT_SOURCE_DIR}/dsp/filters/marvin_SVFTests.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/dsp/filters/biquad/marvin_BiquadTests.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/dsp/filters/biquad/marvin_SmoothedBiquadCoefficientsTests.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/dsp/filters/biquad/marvin_SIMDBiquadTests.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/library/marvin_ConceptsTests.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/library/marvin_PropagateConstTests.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/math/marvin_MathTests.cpp
@@ -23,7 +24,6 @@ set(MARVIN_TEST_SOURCE
         ${CMAKE_CURRENT_SOURCE_DIR}/utils/marvin_SmoothedValueTests.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/utils/marvin_RandomTests.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/utils/marvin_FIFOTests.cpp
-        # ${CMAKE_CURRENT_SOURCE_DIR}/utils/marvin_FormatReaderTests.cpp
         PARENT_SCOPE
 )
     
diff --git a/tests/dsp/filters/biquad/marvin_SIMDBiquadTests.cpp b/tests/dsp/filters/biquad/marvin_SIMDBiquadTests.cpp
@@ -0,0 +1,119 @@
+//
+// Created by Syl Morrison on 03/05/2025.
+//
+#include "catch2/benchmark/catch_benchmark.hpp"
+
+
+#include <iostream>
+#include <marvin/dsp/filters/biquad/marvin_Biquad.h>
+#include <marvin/dsp/filters/biquad/marvin_RBJCoefficients.h>
+#include <marvin/dsp/filters/biquad/marvin_SIMDBiquad.h>
+#include <marvin/dsp/oscillators/marvin_Oscillator.h>
+#include <fmt/core.h>
+#include <catch2/catch_test_macros.hpp>
+#include <catch2/matchers/catch_matchers_floating_point.hpp>
+namespace marvin::testing {
+    // Random device handed to the NoiseOscillator constructed in generateNoise().
+    static std::random_device s_rd;
+    /**
+     * Builds a unit impulse: `len` samples where the first is 1 and every other sample is 0.
+     *
+     * @tparam SampleType A floating point sample type (constrained by marvin::FloatType).
+     * @param len Number of samples to generate. A length of 0 yields an empty buffer.
+     * @return The impulse buffer.
+     */
+    template <marvin::FloatType SampleType>
+    std::vector<SampleType> generateImpulse(size_t len) {
+        std::vector<SampleType> impulse(len, static_cast<SampleType>(0.0));
+        // Guard the write so a zero-length request does not index past the end.
+        if (!impulse.empty()) {
+            impulse.front() = static_cast<SampleType>(1.0);
+        }
+        return impulse;
+    }
+    /**
+     * Fills a buffer of N samples by calling a NoiseOscillator<T> once per sample, in order.
+     *
+     * @tparam T A floating point sample type (constrained by FloatType).
+     * @tparam N Number of samples to generate.
+     * @return The generated samples.
+     */
+    template <FloatType T, size_t N>
+    std::vector<T> generateNoise() {
+        marvin::dsp::oscillators::NoiseOscillator<T> osc{ s_rd };
+        std::vector<T> vec(N, static_cast<T>(0.0));
+        // Range-for avoids comparing a signed index against the unsigned N.
+        for (auto& sample : vec) {
+            sample = osc();
+        }
+        return vec;
+    }
+
+    // Feeds a 100-sample unit impulse through a scalar Biquad and a one-lane SIMDBiquad configured
+    // with the same lowpass coefficients, and requires each pair of outputs to agree within the
+    // relative tolerance below.
+    TEST_CASE("Test parity with single biquad") {
+        constexpr static auto sampleRate{ 44100.0 };
+        constexpr static auto cutoff{ 200.0 };
+        constexpr static auto q{ 0.7070 };
+        marvin::dsp::filters::Biquad<double, 1> singleBiquad;
+        marvin::dsp::filters::SIMDBiquad<double, 1> simdBiquad;
+        auto lowpassCoeffs = dsp::filters::rbj::lowpass(sampleRate, cutoff, q);
+        singleBiquad.setCoeffs(0, lowpassCoeffs);
+        simdBiquad.setCoeffs(lowpassCoeffs);
+        auto impulse = generateImpulse<double>(100);
+        for (double sample : impulse) {
+            const auto singleFiltered = singleBiquad(sample);
+            // SIMDBiquad filters in place, so wrap the sample in a one-element array.
+            std::array<double, 1> simdInput{ sample };
+            simdBiquad(simdInput);
+            REQUIRE_THAT(simdInput[0], Catch::Matchers::WithinRel(singleFiltered, 0.1));
+        }
+    }
+
+    /**
+     * Returns a printable name for T, used to label benchmark runs.
+     *
+     * @tparam T The numeric type to name.
+     * @return "float" or "double" for those types, "unknown" for any other T.
+     */
+    template <NumericType T>
+    [[nodiscard]] std::string getTypeName() {
+        if constexpr (std::is_same_v<T, float>) {
+            return "float";
+        } else if constexpr (std::is_same_v<T, double>) {
+            return "double";
+        } else {
+            // Without this branch control would fall off the end of a non-void function.
+            return "unknown";
+        }
+    }
+
+    /**
+     * Benchmarks N independent scalar Biquads against a single SIMDBiquad<SampleType, N>.
+     *
+     * Both benchmarks recompute lowpass coefficients for every sample (the cutoff rises by the sample
+     * index), apply them to every filter, and then filter that sample. The input is NumSamples of
+     * generated noise; the SIMD variant receives each sample copied into all N lanes.
+     *
+     * @tparam SampleType A floating point sample type (constrained by marvin::FloatType).
+     * @tparam N Number of parallel filters.
+     * @tparam NumSamples Number of noise samples processed per benchmark iteration.
+     */
+    template <marvin::FloatType SampleType, size_t N, size_t NumSamples>
+    auto benchmarkSIMD() -> void {
+        constexpr static auto sampleRate{ 44100.0 };
+        constexpr static auto cutoff{ 200.0 };
+        constexpr static auto q{ 0.7070 };
+        const auto initialCoeffs = marvin::dsp::filters::rbj::lowpass<SampleType>(sampleRate, cutoff, q);
+        std::array<marvin::dsp::filters::Biquad<SampleType, 1>, N> normalFilters;
+        marvin::dsp::filters::SIMDBiquad<SampleType, N> simdFilters;
+        for (auto& f : normalFilters) {
+            f.setCoeffs(0, initialCoeffs);
+        }
+        simdFilters.setCoeffs(initialCoeffs);
+        auto impulse = generateNoise<SampleType, NumSamples>();
+        // Pre-build the SIMD inputs outside the timed regions: one N-wide frame per sample.
+        std::vector<std::array<SampleType, N>> simdInputs;
+        simdInputs.reserve(impulse.size());
+        for (const auto& x : impulse) {
+            std::array<SampleType, N> current;
+            current.fill(x);
+            simdInputs.emplace_back(current);
+        }
+        BENCHMARK(fmt::format("Biquad, N = {}, NSamples = {}, Type = {}", N, NumSamples, getTypeName<SampleType>())) {
+            for (size_t i = 0; i < impulse.size(); ++i) {
+                const auto sweptCoeffs = marvin::dsp::filters::rbj::lowpass<SampleType>(sampleRate, cutoff + static_cast<SampleType>(i), q);
+
+                for (auto& f : normalFilters) {
+                    f.setCoeffs(0, sweptCoeffs);
+                    [[maybe_unused]] const auto _ = f(impulse[i]);
+                }
+            }
+        };
+        BENCHMARK(fmt::format("SIMDBiquad<{}>, NSamples = {}, Type = {}", N, NumSamples, getTypeName<SampleType>())) {
+            for (size_t i = 0; i < impulse.size(); ++i) {
+                const auto sweptCoeffs = marvin::dsp::filters::rbj::lowpass<SampleType>(sampleRate, cutoff + static_cast<SampleType>(i), q);
+                simdFilters.setCoeffs(sweptCoeffs);
+                simdFilters(simdInputs[i]);
+            }
+        };
+    }
+
+    // Runs the float benchmark for every filter count from 2 to 16 inclusive, 32 samples each,
+    // in ascending order of filter count.
+    TEST_CASE("Benchmark Biquads") {
+        constexpr static size_t firstCount{ 2 };
+        constexpr static size_t lastCount{ 16 };
+        constexpr static size_t numSamples{ 32 };
+        // The comma fold expands to one benchmarkSIMD call per count, evaluated left to right.
+        []<size_t... Offsets>(std::index_sequence<Offsets...>) {
+            (benchmarkSIMD<float, firstCount + Offsets, numSamples>(), ...);
+        }(std::make_index_sequence<lastCount - firstCount + 1>{});
+    }
+
+} // namespace marvin::testing

-Original file line number
+Diff line change
@@ @@ -0,0 +1,4 @@ @@
 +//
 +// Created by Syl Morrison on 10/05/2025.
 +//
 +#include <marvin/dsp/filters/biquad/marvin_SIMDBiquad.h>