diff --git a/.gitignore b/.gitignore index a7411787..a6a24c66 100644 --- a/.gitignore +++ b/.gitignore @@ -17,3 +17,4 @@ Session.vim *.onnx *.ort *.config +/cppscripts/build diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..537d1dfc --- /dev/null +++ b/.gitmodules @@ -0,0 +1,6 @@ +[submodule "cppscripts/dependencies/eigen"] + path = cppscripts/dependencies/eigen + url = https://gitlab.com/libeigen/eigen.git +[submodule "cppscripts/dependencies/libnyquist"] + path = cppscripts/dependencies/libnyquist + url = https://github.com/ddiakopoulos/libnyquist.git diff --git a/cppscripts/Makefile b/cppscripts/Makefile new file mode 100644 index 00000000..717454ce --- /dev/null +++ b/cppscripts/Makefile @@ -0,0 +1,15 @@ +default: cli + +cli: + cmake -S src_cli -B build/build-cli -DCMAKE_BUILD_TYPE=Release + cmake --build build/build-cli -- -j16 + +cli-debug: + cmake -S src_cli -B build/build-cli -DCMAKE_BUILD_TYPE=Debug + cmake --build build/build-cli -- -j16 + +clean-all: + rm -rf build + +clean-cli: + rm -rf build/build-cli diff --git a/cppscripts/dependencies/eigen b/cppscripts/dependencies/eigen new file mode 160000 index 00000000..8e60d417 --- /dev/null +++ b/cppscripts/dependencies/eigen @@ -0,0 +1 @@ +Subproject commit 8e60d4173c6d8d5ee216c13ffd6142639be2bcc8 diff --git a/cppscripts/dependencies/libnyquist b/cppscripts/dependencies/libnyquist new file mode 160000 index 00000000..767efd97 --- /dev/null +++ b/cppscripts/dependencies/libnyquist @@ -0,0 +1 @@ +Subproject commit 767efd97cdd7a281d193296586e708490eb6e54f diff --git a/cppscripts/src/demucs.hpp b/cppscripts/src/demucs.hpp new file mode 100644 index 00000000..cb986ee9 --- /dev/null +++ b/cppscripts/src/demucs.hpp @@ -0,0 +1,132 @@ +#ifndef MODEL_HPP +#define MODEL_HPP + +#include "dsp.hpp" +#include "tensor.hpp" +#include +#include +#include +#include +#include +#include +#include + +namespace demucsonnx +{ +extern Ort::AllocatorWithDefaultOptions allocator; +extern Ort::RunOptions run_options; + +// Define a type for your callback function +using ProgressCallback = std::function; + +const int FREQ_BRANCH_LEN = 336; +const int TIME_BRANCH_LEN_IN = 343980; + +struct demucs_model { + std::unique_ptr sess; // Smart pointer to allow "empty" state + int nb_sources = 0; + Ort::Env env{ORT_LOGGING_LEVEL_ERROR, "demucs_onnx"}; // Persistent environment + std::vector input_names; // Persistent input names + std::vector output_names; // Persistent output names + + std::vector input_names_ptrs; + std::vector output_names_ptrs; + + // Constructor (optionally initialize here if needed) + demucs_model() = default; +}; + +bool load_model(const char *model_data, + int n_bytes, + struct demucs_model &model, + Ort::SessionOptions &session_options); + +bool load_model(const std::vector &model_data, + struct demucs_model &model, + Ort::SessionOptions &session_options); + +struct demucs_segment_buffers +{ + int segment_samples; + int le; + int pad; + int pad_end; + int padded_segment_samples; + int nb_stft_frames; + int nb_stft_bins; + + Eigen::Tensor3dXf targets_out; + Eigen::MatrixXf padded_mix; + Eigen::Tensor3dXcf z; + + std::vector x_onnx_in_shape; + std::vector xt_onnx_in_shape; + + std::vector x_onnx_out_shape; + std::vector xt_onnx_out_shape; + + std::vector input_tensors; + std::vector output_tensors; + + // constructor for demucs_segment_buffers that takes int parameters + + // let's do pesky precomputing of the signal repadding to 1/4 hop + // for time and frequency alignment + demucs_segment_buffers(int nb_channels, int segment_samples, int nb_sources) + : segment_samples(segment_samples), + le(int(std::ceil((float)segment_samples / (float)FFT_HOP_SIZE))), + pad(std::floor((float)FFT_HOP_SIZE / 2.0f) * 3), + pad_end(pad + le * FFT_HOP_SIZE - segment_samples), + padded_segment_samples(segment_samples + pad + pad_end), + nb_stft_frames(segment_samples / demucsonnx::FFT_HOP_SIZE + 1), + nb_stft_bins(demucsonnx::FFT_WINDOW_SIZE / 2 + 1), + targets_out(nb_sources, nb_channels, segment_samples), + padded_mix(nb_channels, padded_segment_samples), + z(nb_channels, nb_stft_bins, nb_stft_frames+4), + // complex-as-channels implies 2*nb_channels for real+imag + x_onnx_in_shape({1, 2 * nb_channels, nb_stft_bins - 1, nb_stft_frames}), + xt_onnx_in_shape({1, nb_channels, segment_samples}), + x_onnx_out_shape({1, nb_sources, 2 * nb_channels, nb_stft_bins - 1, nb_stft_frames}), + xt_onnx_out_shape({1, nb_sources, nb_channels, segment_samples}) + { + // precompute the input tensors + // inputs in form (xt, x) + input_tensors.push_back(Ort::Value::CreateTensor( + demucsonnx::allocator, + xt_onnx_in_shape.data(), + xt_onnx_in_shape.size())); + + // input_tensors.push_back(Ort::Value::CreateTensor( + // demucsonnx::allocator, + // x_onnx_in_shape.data(), + // x_onnx_in_shape.size())); + + // precompute the output tensors + // outputs in form (x_out, xt_out) + // output_tensors.push_back(Ort::Value::CreateTensor( + // demucsonnx::allocator, + // x_onnx_out_shape.data(), + // x_onnx_out_shape.size())); + + output_tensors.push_back(Ort::Value::CreateTensor( + demucsonnx::allocator, + xt_onnx_out_shape.data(), + xt_onnx_out_shape.size())); + }; +}; + +const float SEGMENT_LEN_SECS = 7.8; // 8 seconds, the demucs chunk size +const float SEGMENT_OVERLAP_SECS = 0.25; // 0.25 overlap +const float MAX_SHIFT_SECS = 0.5; // max shift +const float OVERLAP = 0.25; // overlap between segments +const float TRANSITION_POWER = 1.0; // transition between segments + +Eigen::Tensor3dXf demucs_inference(struct demucs_model &model, + const Eigen::MatrixXf &audio, + ProgressCallback cb); + +void model_inference(struct demucs_model &model, + struct demucsonnx::demucs_segment_buffers &buffers); +} // namespace demucsonnx + +#endif // MODEL_HPP diff --git a/cppscripts/src/dsp.cpp b/cppscripts/src/dsp.cpp new file mode 100644 index 00000000..85c2e71d --- /dev/null +++ b/cppscripts/src/dsp.cpp @@ -0,0 +1,191 @@ +#include "dsp.hpp" +#include +#include +#include +#include +#include +#include +#include +#include + +// forward declaration of inner stft +void stft_inner(struct demucsonnx::stft_buffers &stft_buf, + Eigen::FFT &cfg); + +void istft_inner(struct demucsonnx::stft_buffers &stft_buf, + Eigen::FFT &cfg); + +// reflect padding +void pad_signal(struct demucsonnx::stft_buffers &stft_buf) +{ + // copy from stft_buf.padded_waveform_mono_in+pad into stft_buf.pad_start, + // stft_buf.pad_end + std::copy_n(stft_buf.padded_waveform_mono_in.begin() + stft_buf.pad, + stft_buf.pad, stft_buf.pad_start.begin()); + std::copy_n(stft_buf.padded_waveform_mono_in.end() - 2 * stft_buf.pad, + stft_buf.pad, stft_buf.pad_end.begin()); + + std::reverse(stft_buf.pad_start.begin(), stft_buf.pad_start.end()); + std::reverse(stft_buf.pad_end.begin(), stft_buf.pad_end.end()); + + // copy stft_buf.pad_start into stft_buf.padded_waveform_mono_in + std::copy_n(stft_buf.pad_start.begin(), stft_buf.pad, + stft_buf.padded_waveform_mono_in.begin()); + + // copy stft_buf.pad_end into stft_buf.padded_waveform_mono_in + std::copy_n(stft_buf.pad_end.begin(), stft_buf.pad, + stft_buf.padded_waveform_mono_in.end() - stft_buf.pad); +} + +Eigen::FFT get_fft_cfg() +{ + Eigen::FFT cfg; + + cfg.SetFlag(Eigen::FFT::Speedy); + // cfg.SetFlag(Eigen::FFT::HalfSpectrum); + // cfg.SetFlag(Eigen::FFT::Unscaled); + + return cfg; +} + +void demucsonnx::stft( + struct stft_buffers &stft_buf, + const Eigen::MatrixXf &waveform, + Eigen::Tensor3dXcf &spec) +{ + // get the fft config + Eigen::FFT cfg = get_fft_cfg(); + + /*****************************************/ + /* operate on each channel sequentially */ + /*****************************************/ + + for (int channel = 0; channel < 2; ++channel) + { + Eigen::VectorXf row_vec = waveform.row(channel); + + std::copy_n(row_vec.data(), row_vec.size(), + stft_buf.padded_waveform_mono_in.begin() + stft_buf.pad); + + // apply padding equivalent to center padding with center=True + // in torch.stft: + // https://pytorch.org/docs/stable/generated/torch.stft.html + + // reflect pads stft_buf.padded_waveform_mono in-place + pad_signal(stft_buf); + + // does forward fft on stft_buf.padded_waveform_mono, stores spectrum in + // complex_spec_mono + stft_inner(stft_buf, cfg); + + for (int i = 0; i < stft_buf.nb_bins; ++i) + { + for (int j = 0; j < stft_buf.nb_frames; ++j) + { + spec(channel, i, j) = stft_buf.complex_spec_mono[j][i]; + } + } + } +} + +void demucsonnx::istft( + struct stft_buffers &stft_buf, + const Eigen::Tensor3dXcf &spec, + Eigen::MatrixXf &waveform) +{ + // get the fft config + Eigen::FFT cfg = get_fft_cfg(); + + /*****************************************/ + /* operate on each channel sequentially */ + /*****************************************/ + + for (int channel = 0; channel < 2; ++channel) + { + // Populate the nested vectors + for (int i = 0; i < stft_buf.nb_bins; ++i) + { + for (int j = 0; j < stft_buf.nb_frames; ++j) + { + stft_buf.complex_spec_mono[j][i] = spec(channel, i, j); + } + } + + // does inverse fft on stft_buf.complex_spec_mono, stores waveform in + // padded_waveform_mono + istft_inner(stft_buf, cfg); + + // copies waveform_mono into stft_buf.waveform past first pad samples + waveform.row(channel) = Eigen::Map( + stft_buf.padded_waveform_mono_out.data() + stft_buf.pad, 1, + stft_buf.padded_waveform_mono_out.size() - FFT_WINDOW_SIZE); + } +} + +void stft_inner(struct demucsonnx::stft_buffers &stft_buf, + Eigen::FFT &cfg) +{ + int frame_idx = 0; + + // Loop over the waveform with a stride of hop_size + for (std::size_t start = 0; + start <= + stft_buf.padded_waveform_mono_in.size() - demucsonnx::FFT_WINDOW_SIZE; + start += demucsonnx::FFT_HOP_SIZE) + { + // Apply window and run FFT + for (int i = 0; i < demucsonnx::FFT_WINDOW_SIZE; ++i) + { + stft_buf.windowed_waveform_mono[i] = + stft_buf.padded_waveform_mono_in[start + i] * + stft_buf.window[i]; + } + cfg.fwd(stft_buf.complex_spec_mono[frame_idx], + stft_buf.windowed_waveform_mono); + // now scale stft_buf.complex_spec_mono[frame_idx] by 1.0f / + // sqrt(float(FFT_WINDOW_SIZE))) + + for (int i = 0; i < demucsonnx::FFT_WINDOW_SIZE / 2 + 1; ++i) + { + stft_buf.complex_spec_mono[frame_idx][i] *= + 1.0f / sqrt(float(demucsonnx::FFT_WINDOW_SIZE)); + } + frame_idx++; + } +} + +void istft_inner(struct demucsonnx::stft_buffers &stft_buf, + Eigen::FFT &cfg) +{ + // clear padded_waveform_mono + std::fill(stft_buf.padded_waveform_mono_out.begin(), + stft_buf.padded_waveform_mono_out.end(), 0.0f); + + // Loop over the input with a stride of (hop_size) + for (int start = 0; start < stft_buf.nb_frames * demucsonnx::FFT_HOP_SIZE; + start += demucsonnx::FFT_HOP_SIZE) + { + int frame_idx = start / demucsonnx::FFT_HOP_SIZE; + // undo sqrt(nfft) scaling + for (int i = 0; i < demucsonnx::FFT_WINDOW_SIZE / 2 + 1; ++i) + { + stft_buf.complex_spec_mono[frame_idx][i] *= + sqrt(float(demucsonnx::FFT_WINDOW_SIZE)); + } + // Run iFFT + cfg.inv(stft_buf.windowed_waveform_mono, + stft_buf.complex_spec_mono[frame_idx]); + + // Apply window and add to output + for (int i = 0; i < demucsonnx::FFT_WINDOW_SIZE; ++i) + { + // x[start+i] is the sum of squared window values + // https://github.com/librosa/librosa/blob/main/librosa/core/spectrum.py#L613 + // 1e-8f is a small number to avoid division by zero + stft_buf.padded_waveform_mono_out[start + i] += + stft_buf.windowed_waveform_mono[i] * stft_buf.window[i] * 1.0f / + float(demucsonnx::FFT_WINDOW_SIZE) / + (stft_buf.normalized_window[start + i] + 1e-8f); + } + } +} diff --git a/cppscripts/src/dsp.hpp b/cppscripts/src/dsp.hpp new file mode 100644 index 00000000..ef99b61d --- /dev/null +++ b/cppscripts/src/dsp.hpp @@ -0,0 +1,109 @@ +#ifndef DSP_HPP +#define DSP_HPP + +#include +#include +#include +#include +#include +#include +#include + +namespace demucsonnx +{ + +const int SUPPORTED_SAMPLE_RATE = 44100; +const int FFT_WINDOW_SIZE = 4096; + +const int FFT_HOP_SIZE = 1024; // 25% hop i.e. 75% overlap + +struct stft_buffers +{ + int nb_frames; + int nb_bins; + int pad; + + const std::vector window; + const std::vector normalized_window; + + std::vector pad_start; + std::vector pad_end; + std::vector padded_waveform_mono_in; + std::vector padded_waveform_mono_out; + std::vector windowed_waveform_mono; + + std::vector>> complex_spec_mono; + + // constructor for stft_buffers that takes some parameters + // to hint at the sizes of the buffers + explicit stft_buffers(int n_samples) + : nb_frames(n_samples / FFT_HOP_SIZE + 1), + nb_bins(FFT_WINDOW_SIZE / 2 + 1), pad(FFT_WINDOW_SIZE / 2), + window(init_const_hann_window()), + normalized_window(init_const_normalized_window(window, nb_frames)), + pad_start(std::vector(pad)), pad_end(std::vector(pad)), + padded_waveform_mono_in( + std::vector(n_samples + FFT_WINDOW_SIZE)), + padded_waveform_mono_out( + std::vector(n_samples + FFT_WINDOW_SIZE)), + windowed_waveform_mono(std::vector(FFT_WINDOW_SIZE)), + complex_spec_mono(std::vector>>( + nb_frames, std::vector>(nb_bins))){}; + + static std::vector init_const_hann_window() + { + static constexpr float PI = 3.14159265359F; + std::vector window(FFT_WINDOW_SIZE); + + // create a periodic hann window + // by generating L+1 points and deleting the last one + auto floatN = (float)(FFT_WINDOW_SIZE + 1); + + for (std::size_t n = 0; n < FFT_WINDOW_SIZE; ++n) + { + window[n] = + 0.5F * (1.0F - cosf(2.0F * PI * (float)n / (floatN - 1))); + } + + return window; + } + + static std::vector + init_const_normalized_window(const std::vector &window, + int nb_frames) + { + float window_normalization_factor = + FFT_WINDOW_SIZE + FFT_HOP_SIZE * (nb_frames - 1); + std::vector normalized_window = + std::vector(window_normalization_factor, 0.0f); + + // Compute the window normalization factor + // using librosa window_sumsquare to compute the squared window + // https://github.com/librosa/librosa/blob/main/librosa/filters.py#L1545 + for (int i = 0; i < nb_frames; ++i) + { + auto sample = i * FFT_HOP_SIZE; + for (int j = sample; j < std::min((int)window_normalization_factor, + sample + FFT_WINDOW_SIZE); + ++j) + { + normalized_window[j] += window[j - sample] * window[j - sample]; + } + } + return normalized_window; + } +}; + +void stft( + struct stft_buffers &stft_buf, + const Eigen::MatrixXf &waveform, + Eigen::Tensor3dXcf &spec); + +void istft( + struct stft_buffers &stft_buf, + const Eigen::Tensor3dXcf &spec, + Eigen::MatrixXf &waveform); + +} // namespace demucsonnx + +#endif // DSP_HPP diff --git a/cppscripts/src/model_apply.cpp b/cppscripts/src/model_apply.cpp new file mode 100644 index 00000000..7dde466d --- /dev/null +++ b/cppscripts/src/model_apply.cpp @@ -0,0 +1,214 @@ +#include "demucs.hpp" +#include "dsp.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "demucs.hpp" + +// At global/class scope +static std::random_device rd; // Get entropy for seed +static std::mt19937 gen(rd()); // Mersenne Twister generator + +static void reflect_padding( + Eigen::MatrixXf &padded_mix, + int left_padding, + int right_padding, + int N) +{ + // Reflect from the first 'left_padding' samples of the original data + for (int i = 0; i < left_padding; ++i) + { + padded_mix.block(0, left_padding - 1 - i, 2, 1) = + padded_mix.block(0, left_padding + i, 2, 1); + } + + // Reflect from the last 'right_padding' samples of the original data + for (int i = 0; i < right_padding; ++i) + { + int last_elem = N + left_padding - 1 ; + padded_mix.block(0, last_elem + i + 1, 2, 1) = + padded_mix.block(0, last_elem - i, 2, 1); + } +} + +Eigen::Tensor3dXf demucsonnx::demucs_inference( + struct demucsonnx::demucs_model &model, + const Eigen::MatrixXf &audio, + demucsonnx::ProgressCallback cb) +{ + int length = audio.cols(); + int max_shift = static_cast(demucsonnx::MAX_SHIFT_SECS * demucsonnx::SUPPORTED_SAMPLE_RATE); + + // Calculate the overall mean and standard deviation + Eigen::VectorXf ref_mean_0 = audio.colwise().mean(); + float ref_mean = ref_mean_0.mean(); + float ref_std = std::sqrt((ref_mean_0.array() - ref_mean).square().sum() / + (ref_mean_0.size() - 1)); + + // Create padded_mix with symmetric zero padding + int padded_length = length + 2 * max_shift; + Eigen::MatrixXf padded_mix(2, padded_length); + padded_mix.setZero(); + + // Normalize and copy audio into padded_mix starting at column max_shift + padded_mix.block(0, max_shift, 2, length) = (audio.array() - ref_mean) / ref_std; + + // Generate random shift offset for time invariance + std::uniform_int_distribution<> dist(0, max_shift - 1); + int shift_offset = dist(gen); + std::cout << "shift offset is: " << shift_offset << std::endl; + + // Create a block view for shifted_audio to avoid extra allocation + int shifted_length = length + max_shift - shift_offset; + Eigen::Ref shifted_audio = padded_mix.block(0, shift_offset, 2, shifted_length); + + // --- Begin merged split_inference and segment_inference logic --- + + // Calculate segment size in samples + int segment_samples = static_cast(demucsonnx::SEGMENT_LEN_SECS * demucsonnx::SUPPORTED_SAMPLE_RATE); + int nb_out_sources = model.nb_sources; + + // Create reusable buffers with padded sizes + demucsonnx::demucs_segment_buffers buffers(2, segment_samples, nb_out_sources); + + // Calculate stride for overlapping segments + int stride_samples = static_cast((1.0f - demucsonnx::OVERLAP) * segment_samples); + + // Create an output tensor initialized to zero + Eigen::Tensor3dXf out(nb_out_sources, 2, shifted_length); + out.setZero(); + + // Create weight vector for overlapping segments + Eigen::VectorXf weight(segment_samples); + int half_segment = segment_samples / 2; + weight.head(half_segment) = Eigen::VectorXf::LinSpaced(half_segment, 1, half_segment); + weight.tail(half_segment) = weight.head(half_segment).reverse(); + weight /= weight.maxCoeff(); + weight = weight.array().pow(demucsonnx::TRANSITION_POWER); + + // Initialize sum_weight as Eigen::VectorXf + Eigen::VectorXf sum_weight(shifted_length); + sum_weight.setZero(); + + // Calculate total number of chunks + int total_chunks = static_cast(std::ceil(static_cast(shifted_length) / stride_samples)); + float increment_per_chunk = 1.0f / static_cast(total_chunks); + float inference_progress = 0.0f; + + // Preallocate a tensor for chunk_out to avoid repeated allocations + Eigen::Tensor3dXf chunk_out(nb_out_sources, 2, segment_samples); + chunk_out.setZero(); + + // Process each segment + for (int segment_offset = 0; segment_offset < shifted_length; segment_offset += stride_samples) + { + int chunk_length = std::min(segment_samples, shifted_length - segment_offset); + + // Create a block view for the current chunk + Eigen::Ref chunk = shifted_audio.block(0, segment_offset, 2, chunk_length); + + // first, symmetric padding with zeros to fit the smaller chunk + // into the bigger segment_samples + int symmetric_padding = buffers.padded_segment_samples - chunk_length; + + buffers.padded_mix.setZero(); + // copy chunk into padded_mix at position buffers.pad + symmetric_padding/2 + int symmetric_padding_start = symmetric_padding / 2; + buffers.padded_mix.block(0, buffers.pad + symmetric_padding_start, 2, chunk_length) = chunk; + + // then, reflect padding on the left and right for + // the stft boundary effects + reflect_padding(buffers.padded_mix, buffers.pad, buffers.pad_end, segment_samples); + + // Run model inference + demucsonnx::model_inference(model, buffers); + + // Update progress + cb(inference_progress + increment_per_chunk, "Segment inference complete"); + + // Copy from buffers.targets_out into chunk_out with center trimming + // Preallocate chunk_out and set to zero + chunk_out.setZero(); + + for (int i = 0; i < nb_out_sources; ++i) + { + for (int j = 0; j < 2; ++j) + { + for (int k = 0; k < chunk_length; ++k) + { + auto kidx = k + symmetric_padding_start; + kidx = std::min(kidx, int(buffers.targets_out.dimension(2)-1)); + // Undoing center_trim by offsetting with left_padding + chunk_out(i, j, k) = buffers.targets_out(i, j, kidx); + } + } + } + + // Accumulate the weighted chunk output using Eigen::Map for vectorization + for (int i = 0; i < nb_out_sources; ++i) + { + for (int j = 0; j < 2; ++j) + { + // Map the (i, j, chunk_length) slice of 'out' to an Eigen::ArrayXf + Eigen::Map out_slice(&out(i, j, segment_offset), chunk_length); + // Map the (i, j, chunk_length) slice of 'chunk_out' to an Eigen::ArrayXf + Eigen::Map chunk_out_slice(&chunk_out(i, j, 0), chunk_length); + // Accumulate the weighted chunk output + out_slice += weight.head(chunk_length).array() * chunk_out_slice; + } + } + + // Accumulate the weights using Eigen's segment operation + sum_weight.segment(segment_offset, chunk_length) += weight.head(chunk_length); + + inference_progress += increment_per_chunk; + } + + // Normalize the output by sum_weight using vectorized operations + for (int i = 0; i < nb_out_sources; ++i) + { + for (int j = 0; j < 2; ++j) + { + // Map the (i, j, shifted_length) slice of 'out' to an Eigen::ArrayXf + Eigen::Map out_slice(&out(i, j, 0), shifted_length); + // Perform element-wise division + out_slice /= sum_weight.array(); + } + } + + // Inverse the normalization directly on 'out' using vectorized operations + for (int i = 0; i < nb_out_sources; ++i) + { + for (int j = 0; j < 2; ++j) + { + // Map the (i, j, shifted_length) slice of 'out' to an Eigen::ArrayXf + Eigen::Map out_slice(&out(i, j, 0), shifted_length); + // Perform inverse normalization + out_slice = out_slice * ref_std + ref_mean; + } + } + + // Create a view (slice) of 'out' without allocating new memory + int trim_start = max_shift - shift_offset; + Eigen::array offset_array = {0, 0, trim_start}; + Eigen::array extent_array = {nb_out_sources, 2, length}; + auto result = out.slice(offset_array, extent_array); + + // Materialize the slice into a new tensor to return + Eigen::Tensor3dXf trimmed_waveform_outputs = result.eval(); + + return trimmed_waveform_outputs; +} diff --git a/cppscripts/src/model_inference.cpp b/cppscripts/src/model_inference.cpp new file mode 100644 index 00000000..819928c5 --- /dev/null +++ b/cppscripts/src/model_inference.cpp @@ -0,0 +1,135 @@ +#include "demucs.hpp" +#include "dsp.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "demucs.hpp" + +namespace demucsonnx { + Ort::AllocatorWithDefaultOptions allocator; + Ort::RunOptions run_options; +} + +// Core function to create and load model from in-memory data (byte array) +bool demucsonnx::load_model( + const char* model_data, + int n_bytes, + struct demucsonnx::demucs_model &model, + Ort::SessionOptions &session_options) +{ + const uint8_t* final_data = reinterpret_cast(model_data); + size_t final_size = n_bytes; + + model.sess = std::make_unique(model.env, final_data, final_size, session_options); + + std::vector input_name_allocs; + input_name_allocs.push_back(model.sess->GetInputNameAllocated(0, demucsonnx::allocator)); + + model.input_names.push_back(input_name_allocs[0].get()); // Store as std::string + + std::vector output_name_allocs; + output_name_allocs.push_back(model.sess->GetOutputNameAllocated(0, demucsonnx::allocator)); + + model.output_names.push_back(output_name_allocs[0].get()); + + for (const auto& name : model.input_names) { + model.input_names_ptrs.push_back(name.c_str()); + } + for (const auto& name : model.output_names) { + model.output_names_ptrs.push_back(name.c_str()); + } + + auto output0_shape = model.sess->GetOutputTypeInfo(0).GetTensorTypeAndShapeInfo().GetShape(); + + model.nb_sources = output0_shape[1]; + return true; +} + +// Overload for std::vector +bool demucsonnx::load_model( + const std::vector &model_data, + struct demucsonnx::demucs_model &model, + Ort::SessionOptions &session_options) +{ + return load_model( + model_data.data(), model_data.size(), model, session_options); +} + +void RunONNXInference( + struct demucsonnx::demucs_model &model, + struct demucsonnx::demucs_segment_buffers &buffers +) { + // Run the model + model.sess->Run( + demucsonnx::run_options, + model.input_names_ptrs.data(), + buffers.input_tensors.data(), + buffers.input_tensors.size(), + model.output_names_ptrs.data(), + buffers.output_tensors.data(), + model.output_names_ptrs.size() + ); +} + +// run core demucs inference using onnx +void demucsonnx::model_inference( + struct demucsonnx::demucs_model &model, + struct demucsonnx::demucs_segment_buffers &buffers) + // struct demucsonnx::stft_buffers &stft_buf) +{ + + // prepare time branch input by copying buffers.mix into input_tensors[0] + float *xt_onnx_data = buffers.input_tensors[0].GetTensorMutableData(); + + for (int i = 0; i < buffers.padded_mix.rows(); ++i) + { + for (int j = 0; j < buffers.segment_samples; ++j) + { + // calculate destination index, simple row major calculation + // given the onnx shape of (1, 2, segment_samples) + int dest_index = i * buffers.segment_samples + j; + xt_onnx_data[dest_index] = buffers.padded_mix(i, j + buffers.pad); + } + } + + // now we apply the core demucs inference + RunONNXInference(model, buffers); + + std::cout << "ONNX inference completed." << std::endl; + + int nb_out_sources = model.nb_sources; + + // nb_sources sources, 2 channels, N samples + std::vector xt_3d( + nb_out_sources, + Eigen::MatrixXf(2, buffers.segment_samples) + ); + + // Map output onnx tensors + float* xt_out_data = buffers.output_tensors[0].GetTensorMutableData(); + + for (int s = 0; s < nb_out_sources; ++s) + { // loop over 4 sources + for (int i = 0; i < 2; ++i) + { + for (int j = 0; j < buffers.segment_samples; ++j) + { + int index = s * 2 * buffers.segment_samples + i * buffers.segment_samples + j; + buffers.targets_out(s, i, j) = xt_out_data[index]; + } + } + } +} diff --git a/cppscripts/src/tensor.hpp b/cppscripts/src/tensor.hpp new file mode 100644 index 00000000..b3ec8bae --- /dev/null +++ b/cppscripts/src/tensor.hpp @@ -0,0 +1,24 @@ +#ifndef TENSOR_HPP +#define TENSOR_HPP + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace Eigen +{ +// define Tensor3dXf, Tensor3dXcf for spectrograms etc. +typedef Tensor Tensor5dXf; +typedef Tensor Tensor4dXf; +typedef Tensor Tensor3dXf; +typedef Tensor Tensor2dXf; +typedef Tensor Tensor1dXf; +typedef Tensor, 3, RowMajor> Tensor3dXcf; +} // namespace Eigen + +#endif // TENSOR_HPP diff --git a/cppscripts/src_cli/CMakeLists.txt b/cppscripts/src_cli/CMakeLists.txt new file mode 100644 index 00000000..759d0fcd --- /dev/null +++ b/cppscripts/src_cli/CMakeLists.txt @@ -0,0 +1,28 @@ +cmake_minimum_required(VERSION 3.0) + +project(demucs.onnx) +enable_testing() + +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) + +set(CMAKE_CXX_FLAGS "-Wall -Wextra") +set(CMAKE_CXX_FLAGS_DEBUG "-g -DEIGEN_INTERNAL_DEBUGGING") +set(CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -ffast-math -flto -fno-signed-zeros -fassociative-math -freciprocal-math -fno-math-errno -fno-rounding-math -funsafe-math-optimizations -fno-trapping-math -fno-rtti -DNDEBUG") + +# Set your onnxruntime installation path here +set(ONNXRUNTIME_HOME /opt/homebrew/Cellar/onnxruntime/1.21.0) +include_directories(${ONNXRUNTIME_HOME}/include) +link_directories(${ONNXRUNTIME_HOME}/lib) + +set(LIBNYQUIST_BUILD_EXAMPLE OFF CACHE BOOL "Disable libnyquist example") +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../dependencies/libnyquist ${CMAKE_CURRENT_BINARY_DIR}/libnyquist_build) + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../src) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../dependencies/eigen) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../dependencies/libnyquist/include) + +file(GLOB SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/../src/*.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../src_cli/*.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../onnx-models/model/*.ort.c") +add_executable(demucs ${SOURCES}) + +target_link_libraries(demucs onnxruntime libnyquist) diff --git a/cppscripts/src_cli/demucs.cpp b/cppscripts/src_cli/demucs.cpp new file mode 100644 index 00000000..51b5f500 --- /dev/null +++ b/cppscripts/src_cli/demucs.cpp @@ -0,0 +1,289 @@ +#include "demucs.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace nqr; + +// Overload for file path, calling one of the other overloads as needed +static demucsonnx::demucs_model load_model( + const std::string& htdemucs_model_path, + Ort::SessionOptions& session_options +) { + struct demucsonnx::demucs_model model; + + std::ifstream file(htdemucs_model_path, std::ios::binary | std::ios::ate); + if (!file) { + throw std::runtime_error("Failed to open model file: " + htdemucs_model_path); + } + + std::streamsize size = file.tellg(); + file.seekg(0, std::ios::beg); + + std::vector file_data(size); + if (!file.read(file_data.data(), size)) { + throw std::runtime_error("Failed to read model file."); + } + + bool success = demucsonnx::load_model(file_data, model, session_options); + if (!success) { + throw std::runtime_error("Failed to load model."); + } + + return model; +} + +static Eigen::MatrixXf load_audio_file(std::string filename) +{ + // load a wav file with libnyquist + std::shared_ptr fileData = std::make_shared(); + + NyquistIO loader; + + loader.Load(fileData.get(), filename); + + if (fileData->sampleRate != demucsonnx::SUPPORTED_SAMPLE_RATE) + { + std::cerr << "[ERROR] demucs.cpp only supports the following sample " + "rate (Hz): " + << demucsonnx::SUPPORTED_SAMPLE_RATE << std::endl; + exit(1); + } + + std::cout << "Input samples: " + << fileData->samples.size() / fileData->channelCount << std::endl; + std::cout << "Length in seconds: " << fileData->lengthSeconds << std::endl; + std::cout << "Number of channels: " << fileData->channelCount << std::endl; + + if (fileData->channelCount != 2 && fileData->channelCount != 1) + { + std::cerr << "[ERROR] demucs.cpp only supports mono and stereo audio" + << std::endl; + exit(1); + } + + // number of samples per channel + std::size_t N = fileData->samples.size() / fileData->channelCount; + + // create a struct to hold two float vectors for left and right channels + Eigen::MatrixXf ret(2, N); + + if (fileData->channelCount == 1) + { + // Mono case + for (std::size_t i = 0; i < N; ++i) + { + ret(0, i) = fileData->samples[i]; // left channel + ret(1, i) = fileData->samples[i]; // right channel + } + } + else + { + // Stereo case + for (std::size_t i = 0; i < N; ++i) + { + ret(0, i) = fileData->samples[2 * i]; // left channel + ret(1, i) = fileData->samples[2 * i + 1]; // right channel + } + } + + return ret; +} + +// write a function to write a StereoWaveform to a wav file +static void write_audio_file(const Eigen::MatrixXf &waveform, + std::string filename) +{ + // create a struct to hold the audio data + std::shared_ptr fileData = std::make_shared(); + + // set the sample rate + fileData->sampleRate = demucsonnx::SUPPORTED_SAMPLE_RATE; + + // set the number of channels + fileData->channelCount = 2; + + // set the number of samples + fileData->samples.resize(waveform.cols() * 2); + + // write the left channel + for (long int i = 0; i < waveform.cols(); ++i) + { + fileData->samples[2 * i] = waveform(0, i); + fileData->samples[2 * i + 1] = waveform(1, i); + } + + int encoderStatus = + encode_wav_to_disk({fileData->channelCount, PCM_FLT, DITHER_TRIANGLE}, + fileData.get(), filename); + std::cout << "Encoder Status: " << encoderStatus << std::endl; +} + +int main(int argc, const char **argv) +{ + + try { + // your existing main logic + + if (argc != 4) + { + std::cerr << "Usage: " << argv[0] << " " + << std::endl; + exit(1); + } + + std::cout << "demucs.onnx Main driver program" << std::endl; + std::string model_file = argv[1]; + + // load audio passed as argument + std::string wav_file = argv[2]; + + // output dir passed as argument + std::string out_dir = argv[3]; + + // Check if the output directory exists, and create it if not + std::filesystem::path output_dir_path(out_dir); + if (!std::filesystem::exists(output_dir_path)) + { + std::cerr << "Directory does not exist: " << out_dir << ". Creating it." + << std::endl; + if (!std::filesystem::create_directories(output_dir_path)) + { + std::cerr << "Error: Unable to create directory: " << out_dir + << std::endl; + return 1; + } + } + else if (!std::filesystem::is_directory(output_dir_path)) + { + std::cerr << "Error: " << out_dir << " exists but is not a directory!" + << std::endl; + return 1; + } + + Eigen::MatrixXf audio = load_audio_file(wav_file); + Eigen::Tensor3dXf out_targets; + + std::cout << "Running Demucs.onnx inference for: " << wav_file << std::endl; + + // set output precision to 3 decimal places + std::cout << std::fixed << std::setprecision(3); + + demucsonnx::ProgressCallback progressCallback = + [](float progress, const std::string &log_message) + { + std::cout << "(" << std::setw(3) << std::setfill(' ') + << progress * 100.0f << "%) " << log_message << std::endl; + }; + + // create Ort::SessionOptions + Ort::SessionOptions session_options; + + // max out threads and increase performance to the max on my beefy + // desktop CPU + session_options.SetExecutionMode(ExecutionMode::ORT_PARALLEL); + session_options.SetIntraOpNumThreads(16); + session_options.SetInterOpNumThreads(16); + + // General optimizations + session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL); + + struct demucsonnx::demucs_model model = load_model( + model_file, + session_options + ); + + // create 4 audio matrix same size, to hold output + Eigen::Tensor3dXf audio_targets = + demucsonnx::demucs_inference(model, audio, progressCallback); + + out_targets = audio_targets; + + int nb_out_sources = model.nb_sources; + + for (int target = 0; target < nb_out_sources; ++target) + { + // now write the 4 audio waveforms to files in the output dir + // using libnyquist + // join out_dir with "/target_0.wav" + // using std::filesystem::path; + + std::filesystem::path p = out_dir; + // make sure the directory exists + std::filesystem::create_directories(p); + + auto p_target = p / "target_0.wav"; + + // target 0,1,2,3 map to drums,bass,other,vocals + + std::string target_name; + + switch (target) + { + case 0: + target_name = "drums"; + break; + case 1: + target_name = "bass"; + break; + case 2: + target_name = "other"; + break; + case 3: + target_name = "vocals"; + break; + case 4: + target_name = "guitar"; + break; + case 5: + target_name = "piano"; + break; + default: + std::cerr << "Error: target " << target << " not supported" + << std::endl; + exit(1); + } + + // insert target_name into the path after the digit + // e.g. target_name_0_drums.wav + p_target.replace_filename("target_" + std::to_string(target) + "_" + + target_name + ".wav"); + + std::cout << "Writing wav file " << p_target << std::endl; + + Eigen::MatrixXf target_waveform(2, audio.cols()); + + // copy the input stereo wav file into all 4 targets + for (int channel = 0; channel < 2; ++channel) + { + for (int sample = 0; sample < audio.cols(); ++sample) + { + target_waveform(channel, sample) = + out_targets(target, channel, sample); + } + } + + write_audio_file(target_waveform, p_target); + } + + return 0; + } catch (const std::exception& e) { + std::cerr << "Caught exception: " << e.what() << std::endl; + return 1; + } + +} diff --git a/docs/onnx.md b/docs/onnx.md index ea6c84c4..39285239 100644 --- a/docs/onnx.md +++ b/docs/onnx.md @@ -2,14 +2,39 @@ ## Convert PyTorch model to ONNX and ORT -Convert Demucs PyTorch model to ONNX: +- Convert Demucs PyTorch model to ONNX: ```python python ./scripts/convert-pth-to-onnx.py ./onnx-models ``` -Optionally, convert ONNX model to ORT: +- Optionally, convert ONNX model to ORT: ```python python -m onnxruntime.tools.convert_onnx_models_to_ort ./onnx-models --enable_type_reduction ``` + +## Using C++ scripts + +- Install dependencies + +```git +git submodule update --init --recursive +``` + +- Set ONNXRuntime path in `./cppscripts/src_cli/CMakeLists.txt` + +- Compile the C++ code + +```bash +cd cppscripts +make cli +cd .. +``` + +- Run inference using the following command + +```bash +mkdir ./separated/htdemucs_cpp/ +./cppscripts/build/build-cli/demucs ./onnx-models/htdemucs.ort ./test.mp3 ./separated/htdemucs_cpp/ +```