Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@ Session.vim
*.onnx
*.ort
*.config
/cppscripts/build
6 changes: 6 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
[submodule "cppscripts/dependencies/eigen"]
path = cppscripts/dependencies/eigen
url = https://gitlab.com/libeigen/eigen.git
[submodule "cppscripts/dependencies/libnyquist"]
path = cppscripts/dependencies/libnyquist
url = https://github.com/ddiakopoulos/libnyquist.git
15 changes: 15 additions & 0 deletions cppscripts/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Build orchestration for the C++ CLI. All builds are out-of-source CMake
# builds placed under build/ (which is gitignored).

# These targets are commands, not files: declare them phony so a stray file
# named e.g. "clean-all" cannot shadow them and silently skip the recipe.
.PHONY: default cli cli-debug clean-all clean-cli

default: cli

# Release build of the CLI (16-way parallel)
cli:
	cmake -S src_cli -B build/build-cli -DCMAKE_BUILD_TYPE=Release
	cmake --build build/build-cli -- -j16

# Debug build; note it reuses build/build-cli, so it overwrites the
# Release configuration rather than living side by side with it.
cli-debug:
	cmake -S src_cli -B build/build-cli -DCMAKE_BUILD_TYPE=Debug
	cmake --build build/build-cli -- -j16

# Remove every build tree
clean-all:
	rm -rf build

# Remove only the CLI build tree
clean-cli:
	rm -rf build/build-cli
1 change: 1 addition & 0 deletions cppscripts/dependencies/eigen
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you clarify the use case for this dependency? Looking at this example, it looks like the ONNX library already contains its own data type to manage tensors, which it can create from interleaved 32-bit float audio samples. What are the added benefits?

Submodule eigen added at 8e60d4
1 change: 1 addition & 0 deletions cppscripts/dependencies/libnyquist
Submodule libnyquist added at 767efd
132 changes: 132 additions & 0 deletions cppscripts/src/demucs.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
#ifndef MODEL_HPP
#define MODEL_HPP

#include "dsp.hpp"
#include "tensor.hpp"
#include <Eigen/Dense>
#include <array>
#include <functional>
#include <iostream>
#include <string>
#include <vector>
#include <onnxruntime/onnxruntime_cxx_api.h>

namespace demucsonnx
{
// Shared ONNX-Runtime allocator and run options; defined in exactly one
// translation unit and reused by every tensor/session call in this library.
extern Ort::AllocatorWithDefaultOptions allocator;
extern Ort::RunOptions run_options;

// Progress callback invoked during inference: (fraction_done, status_message).
using ProgressCallback = std::function<void(float, const std::string &)>;

// NOTE(review): neither constant is referenced elsewhere in this header —
// presumably the fixed frequency-branch / time-branch lengths baked into the
// exported ONNX model; confirm against the model export script.
const int FREQ_BRANCH_LEN = 336;
const int TIME_BRANCH_LEN_IN = 343980;

// Loaded Demucs ONNX model: the Ort session plus the environment and the
// input/output-name storage that must outlive every Run() call.
struct demucs_model {
    // unique_ptr so the struct can exist in an "empty" state before
    // load_model() populates it.
    std::unique_ptr<Ort::Session> sess;
    // Number of separated stems the model emits; 0 until a model is loaded.
    int nb_sources = 0;
    // Persistent ORT environment (error-level logging only); must outlive sess.
    Ort::Env env{ORT_LOGGING_LEVEL_ERROR, "demucs_onnx"};
    std::vector<std::string> input_names;  // Persistent input names
    std::vector<std::string> output_names; // Persistent output names

    // NOTE(review): presumably c_str() views into the two vectors above for
    // the Ort::Session::Run() API — they dangle if the vectors reallocate.
    std::vector<const char*> input_names_ptrs;
    std::vector<const char*> output_names_ptrs;

    // Default-constructed; load_model() fills in all fields.
    demucs_model() = default;
};

// Load an ONNX model from a raw byte buffer of n_bytes into `model`,
// creating the Ort::Session with `session_options`.
// Returns true on success, false on failure.
bool load_model(const char *model_data,
                int n_bytes,
                struct demucs_model &model,
                Ort::SessionOptions &session_options);

// Convenience overload: same as above with the model bytes in a vector.
bool load_model(const std::vector<char> &model_data,
                struct demucs_model &model,
                Ort::SessionOptions &session_options);

// Pre-sized scratch buffers for one inference segment: the padded mix, its
// complex spectrogram, and the pre-allocated ONNX input/output tensors with
// their shapes. Allocated once and reused across segments.
struct demucs_segment_buffers
{
    int segment_samples;        // raw segment length in samples
    int le;                     // number of FFT hops covering the segment (rounded up)
    int pad;                    // samples of padding prepended to the segment
    int pad_end;                // padding appended so the total is a whole number of hops
    int padded_segment_samples; // segment_samples + pad + pad_end
    int nb_stft_frames;         // STFT frames for the unpadded segment
    int nb_stft_bins;           // FFT_WINDOW_SIZE / 2 + 1

    Eigen::Tensor3dXf targets_out; // (sources, channels, samples) separated output
    Eigen::MatrixXf padded_mix;    // (channels, padded samples) input staging
    Eigen::Tensor3dXcf z;          // (channels, bins, frames+4) complex spectrogram
                                   // NOTE(review): +4 frames of slack — confirm why

    // ONNX tensor shapes. The frequency branch drops the Nyquist bin
    // (nb_stft_bins - 1) and packs real/imag as 2*channels.
    std::vector<int64_t> x_onnx_in_shape;
    std::vector<int64_t> xt_onnx_in_shape;

    std::vector<int64_t> x_onnx_out_shape;
    std::vector<int64_t> xt_onnx_out_shape;

    std::vector<Ort::Value> input_tensors;
    std::vector<Ort::Value> output_tensors;

    // Precompute the signal repadding to a whole number of hops so the time
    // and frequency branches stay aligned.
    demucs_segment_buffers(int nb_channels, int segment_samples, int nb_sources)
        : segment_samples(segment_samples),
          le(int(std::ceil((float)segment_samples / (float)FFT_HOP_SIZE))),
          pad(std::floor((float)FFT_HOP_SIZE / 2.0f) * 3),
          pad_end(pad + le * FFT_HOP_SIZE - segment_samples),
          padded_segment_samples(segment_samples + pad + pad_end),
          nb_stft_frames(segment_samples / demucsonnx::FFT_HOP_SIZE + 1),
          nb_stft_bins(demucsonnx::FFT_WINDOW_SIZE / 2 + 1),
          targets_out(nb_sources, nb_channels, segment_samples),
          padded_mix(nb_channels, padded_segment_samples),
          z(nb_channels, nb_stft_bins, nb_stft_frames+4),
          // complex-as-channels implies 2*nb_channels for real+imag
          x_onnx_in_shape({1, 2 * nb_channels, nb_stft_bins - 1, nb_stft_frames}),
          xt_onnx_in_shape({1, nb_channels, segment_samples}),
          x_onnx_out_shape({1, nb_sources, 2 * nb_channels, nb_stft_bins - 1, nb_stft_frames}),
          xt_onnx_out_shape({1, nb_sources, nb_channels, segment_samples})
    {
        // Pre-allocate the ONNX input tensor for the time branch (xt).
        // NOTE(review): the frequency-branch (x) input/output tensors are
        // commented out below — presumably the exported model computes its
        // own STFT and only consumes the waveform; confirm against the model.
        input_tensors.push_back(Ort::Value::CreateTensor<float>(
            demucsonnx::allocator,
            xt_onnx_in_shape.data(),
            xt_onnx_in_shape.size()));

        // input_tensors.push_back(Ort::Value::CreateTensor<float>(
        //     demucsonnx::allocator,
        //     x_onnx_in_shape.data(),
        //     x_onnx_in_shape.size()));

        // output_tensors.push_back(Ort::Value::CreateTensor<float>(
        //     demucsonnx::allocator,
        //     x_onnx_out_shape.data(),
        //     x_onnx_out_shape.size()));

        // Pre-allocate the ONNX output tensor for the time branch (xt_out).
        output_tensors.push_back(Ort::Value::CreateTensor<float>(
            demucsonnx::allocator,
            xt_onnx_out_shape.data(),
            xt_onnx_out_shape.size()));
    };
};

// Segmentation parameters for chunked inference (all values in seconds
// unless noted). constexpr so they are usable in constant expressions;
// the old comment claimed "8 seconds" but the actual chunk size is 7.8 s.
constexpr float SEGMENT_LEN_SECS = 7.8f;      // demucs chunk size (~7.8 s)
constexpr float SEGMENT_OVERLAP_SECS = 0.25f; // overlap between chunks
constexpr float MAX_SHIFT_SECS = 0.5f;        // max random shift applied per pass
constexpr float OVERLAP = 0.25f;              // overlap ratio between segments
constexpr float TRANSITION_POWER = 1.0f;      // weight-curve power for blending segments

// Run the full separation over `audio` (channels x samples), chunking it
// into overlapping segments. `cb` receives progress updates — presumably a
// fraction in [0, 1] plus a status message; confirm in the implementation.
// Returns a (sources, channels, samples) tensor of separated stems.
Eigen::Tensor3dXf demucs_inference(struct demucs_model &model,
                                   const Eigen::MatrixXf &audio,
                                   ProgressCallback cb);

// Run the ONNX session once on the segment currently staged in `buffers`.
void model_inference(struct demucs_model &model,
                     struct demucsonnx::demucs_segment_buffers &buffers);
} // namespace demucsonnx

#endif // MODEL_HPP
191 changes: 191 additions & 0 deletions cppscripts/src/dsp.cpp
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is this used for? I cannot see any reference to demucsonnx::stft or demucsonnx::istft. My understanding is that the stft and istft functions are already part of the model — is the intent to swap them out for an explicit implementation? If so, how does that work with GPU/NPU devices?

Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
#include "dsp.hpp"
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <memory>
#include <string>
#include <unsupported/Eigen/FFT>
#include <vector>

// Forward declarations of the per-channel workers (defined below): both
// operate in place on the scratch buffers inside stft_buf using the
// caller-supplied Eigen FFT plan.
void stft_inner(struct demucsonnx::stft_buffers &stft_buf,
                Eigen::FFT<float> &cfg);

void istft_inner(struct demucsonnx::stft_buffers &stft_buf,
                 Eigen::FFT<float> &cfg);

// Reflect padding: mirror `pad` samples from just inside each edge of the
// padded buffer back out into the pad regions at both ends (the same edge
// treatment as torch.stft's center padding).
void pad_signal(struct demucsonnx::stft_buffers &stft_buf)
{
    const int n_pad = stft_buf.pad;
    auto &wave = stft_buf.padded_waveform_mono_in;

    // Copy n_pad samples starting at src into scratch, flip them, then write
    // the flipped run to dst.
    auto mirror = [n_pad](auto src, auto &scratch, auto dst)
    {
        std::copy_n(src, n_pad, scratch.begin());
        std::reverse(scratch.begin(), scratch.end());
        std::copy_n(scratch.begin(), n_pad, dst);
    };

    // Front pad <- reflection of samples [pad, 2*pad).
    mirror(wave.begin() + n_pad, stft_buf.pad_start, wave.begin());
    // Back pad <- reflection of samples [size - 2*pad, size - pad).
    mirror(wave.end() - 2 * n_pad, stft_buf.pad_end, wave.end() - n_pad);
}

// Build the FFT plan shared by stft_inner and istft_inner.
Eigen::FFT<float> get_fft_cfg()
{
    Eigen::FFT<float> fft_plan;

    // Trade a little accuracy for speed. HalfSpectrum and Unscaled are
    // deliberately left at their defaults (full spectrum, scaled inverse).
    fft_plan.SetFlag(Eigen::FFT<float>::Speedy);

    return fft_plan;
}

// Forward STFT of `waveform` (channels x samples) into `spec`
// (channel, bin, frame), with reflect padding equivalent to
// torch.stft(center=True):
// https://pytorch.org/docs/stable/generated/torch.stft.html
// Generalized from a hard-coded 2-channel loop to waveform.rows() channels;
// behavior is unchanged for stereo input. spec must have at least
// waveform.rows() entries along its first dimension.
void demucsonnx::stft(
    struct stft_buffers &stft_buf,
    const Eigen::MatrixXf &waveform,
    Eigen::Tensor3dXcf &spec)
{
    // get the fft config
    Eigen::FFT<float> cfg = get_fft_cfg();

    /*****************************************/
    /*  operate on each channel sequentially */
    /*****************************************/

    const int nb_channels = static_cast<int>(waveform.rows());

    for (int channel = 0; channel < nb_channels; ++channel)
    {
        // Copy the (strided) Eigen row into a contiguous vector so the
        // raw-pointer copy below is valid.
        Eigen::VectorXf row_vec = waveform.row(channel);

        std::copy_n(row_vec.data(), row_vec.size(),
                    stft_buf.padded_waveform_mono_in.begin() + stft_buf.pad);

        // reflect-pads stft_buf.padded_waveform_mono_in in-place
        pad_signal(stft_buf);

        // forward FFT per frame; the spectrum lands in
        // stft_buf.complex_spec_mono (frame-major)
        stft_inner(stft_buf, cfg);

        // transpose the frame-major scratch into the (channel, bin, frame)
        // output tensor
        for (int i = 0; i < stft_buf.nb_bins; ++i)
        {
            for (int j = 0; j < stft_buf.nb_frames; ++j)
            {
                spec(channel, i, j) = stft_buf.complex_spec_mono[j][i];
            }
        }
    }
}

// Inverse STFT of `spec` (channel, bin, frame) into `waveform`
// (channels x samples), trimming the reflect padding added by stft().
// Generalized from a hard-coded 2-channel loop to waveform.rows() channels;
// behavior is unchanged for stereo input. spec must have at least
// waveform.rows() entries along its first dimension.
void demucsonnx::istft(
    struct stft_buffers &stft_buf,
    const Eigen::Tensor3dXcf &spec,
    Eigen::MatrixXf &waveform)
{
    // get the fft config
    Eigen::FFT<float> cfg = get_fft_cfg();

    /*****************************************/
    /*  operate on each channel sequentially */
    /*****************************************/

    const int nb_channels = static_cast<int>(waveform.rows());

    for (int channel = 0; channel < nb_channels; ++channel)
    {
        // stage this channel's spectrum into the frame-major scratch buffer
        for (int i = 0; i < stft_buf.nb_bins; ++i)
        {
            for (int j = 0; j < stft_buf.nb_frames; ++j)
            {
                stft_buf.complex_spec_mono[j][i] = spec(channel, i, j);
            }
        }

        // inverse FFT + overlap-add into padded_waveform_mono_out
        istft_inner(stft_buf, cfg);

        // Copy the reconstructed waveform back, skipping the first `pad`
        // samples; the mapped length trims FFT_WINDOW_SIZE samples of
        // padding overall.
        waveform.row(channel) = Eigen::Map<Eigen::MatrixXf>(
            stft_buf.padded_waveform_mono_out.data() + stft_buf.pad, 1,
            stft_buf.padded_waveform_mono_out.size() - FFT_WINDOW_SIZE);
    }
}

// Per-channel forward STFT worker: slides a window across the padded input
// with FFT_HOP_SIZE stride, applies the analysis window, runs the FFT, and
// scales each spectrum by 1/sqrt(FFT_WINDOW_SIZE).
void stft_inner(struct demucsonnx::stft_buffers &stft_buf,
                Eigen::FFT<float> &cfg)
{
    // Hoisted loop invariant: the original recomputed 1/sqrt(nfft) for
    // every bin of every frame.
    const float scale =
        1.0f / std::sqrt(float(demucsonnx::FFT_WINDOW_SIZE));

    // Use signed arithmetic for the loop bound: the original compared
    // against `size() - FFT_WINDOW_SIZE`, which wraps around (size_t
    // underflow) to a huge value if the padded buffer is ever shorter than
    // one window; this form simply runs zero frames in that case.
    const std::ptrdiff_t n_samples =
        static_cast<std::ptrdiff_t>(stft_buf.padded_waveform_mono_in.size());

    int frame_idx = 0;

    // Loop over the waveform with a stride of hop_size
    for (std::ptrdiff_t start = 0;
         start + demucsonnx::FFT_WINDOW_SIZE <= n_samples;
         start += demucsonnx::FFT_HOP_SIZE)
    {
        // Apply window and run FFT
        for (int i = 0; i < demucsonnx::FFT_WINDOW_SIZE; ++i)
        {
            stft_buf.windowed_waveform_mono[i] =
                stft_buf.padded_waveform_mono_in[start + i] *
                stft_buf.window[i];
        }
        cfg.fwd(stft_buf.complex_spec_mono[frame_idx],
                stft_buf.windowed_waveform_mono);

        // scale the spectrum by 1/sqrt(nfft) (matches torch.stft's
        // normalized=True convention)
        for (int i = 0; i < demucsonnx::FFT_WINDOW_SIZE / 2 + 1; ++i)
        {
            stft_buf.complex_spec_mono[frame_idx][i] *= scale;
        }
        frame_idx++;
    }
}

// Per-channel inverse STFT worker: undoes the forward 1/sqrt(nfft) scaling,
// runs the inverse FFT per frame, and overlap-adds the windowed frames into
// padded_waveform_mono_out, normalizing by the precomputed sum of squared
// windows (librosa-style window-sumsquare correction).
void istft_inner(struct demucsonnx::stft_buffers &stft_buf,
                 Eigen::FFT<float> &cfg)
{
    // clear the overlap-add accumulator
    std::fill(stft_buf.padded_waveform_mono_out.begin(),
              stft_buf.padded_waveform_mono_out.end(), 0.0f);

    // Hoisted loop invariant: the original recomputed sqrt(nfft) for every
    // bin of every frame.
    const float unscale = std::sqrt(float(demucsonnx::FFT_WINDOW_SIZE));

    // Loop over the input with a stride of (hop_size)
    for (int start = 0; start < stft_buf.nb_frames * demucsonnx::FFT_HOP_SIZE;
         start += demucsonnx::FFT_HOP_SIZE)
    {
        int frame_idx = start / demucsonnx::FFT_HOP_SIZE;

        // undo the forward 1/sqrt(nfft) scaling (mutates the staged
        // spectrum in place, as the original did)
        for (int i = 0; i < demucsonnx::FFT_WINDOW_SIZE / 2 + 1; ++i)
        {
            stft_buf.complex_spec_mono[frame_idx][i] *= unscale;
        }

        // Run iFFT
        cfg.inv(stft_buf.windowed_waveform_mono,
                stft_buf.complex_spec_mono[frame_idx]);

        // Apply the synthesis window and overlap-add into the output.
        // normalized_window[start+i] is the sum of squared window values:
        // https://github.com/librosa/librosa/blob/main/librosa/core/spectrum.py#L613
        // 1e-8f avoids division by zero. (A redundant `* 1.0f` factor was
        // removed; multiplying by 1.0f is an exact no-op.)
        for (int i = 0; i < demucsonnx::FFT_WINDOW_SIZE; ++i)
        {
            stft_buf.padded_waveform_mono_out[start + i] +=
                stft_buf.windowed_waveform_mono[i] * stft_buf.window[i] /
                float(demucsonnx::FFT_WINDOW_SIZE) /
                (stft_buf.normalized_window[start + i] + 1e-8f);
        }
    }
}
Loading
Loading