-
Notifications
You must be signed in to change notification settings - Fork 5
[feat] Add Cpp scripts for inference #7
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,3 +17,4 @@ Session.vim | |
| *.onnx | ||
| *.ort | ||
| *.config | ||
| /cppscripts/build | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,6 @@ | ||
| [submodule "cppscripts/dependencies/eigen"] | ||
| path = cppscripts/dependencies/eigen | ||
| url = https://gitlab.com/libeigen/eigen.git | ||
| [submodule "cppscripts/dependencies/libnyquist"] | ||
| path = cppscripts/dependencies/libnyquist | ||
| url = https://github.com/ddiakopoulos/libnyquist.git |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,15 @@ | ||
# Build driver for the C++ CLI inference scripts.
# All targets are phony: none of them produce a file named after the
# target, so declare them to keep make from skipping a build if a file
# with the same name ever appears in this directory.
.PHONY: default cli cli-debug clean-all clean-cli

default: cli

# Release build of the CLI. Note: cli and cli-debug share the same build
# directory, so switching configurations forces a CMake reconfigure.
cli:
	cmake -S src_cli -B build/build-cli -DCMAKE_BUILD_TYPE=Release
	cmake --build build/build-cli -- -j16

# Debug build of the CLI (same build tree as `cli`, see note above).
cli-debug:
	cmake -S src_cli -B build/build-cli -DCMAKE_BUILD_TYPE=Debug
	cmake --build build/build-cli -- -j16

# Remove every generated build tree.
clean-all:
	rm -rf build

# Remove only the CLI build tree.
clean-cli:
	rm -rf build/build-cli
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,132 @@ | ||
| #ifndef MODEL_HPP | ||
| #define MODEL_HPP | ||
|
|
||
| #include "dsp.hpp" | ||
| #include "tensor.hpp" | ||
| #include <Eigen/Dense> | ||
| #include <array> | ||
| #include <functional> | ||
| #include <iostream> | ||
| #include <string> | ||
| #include <vector> | ||
| #include <onnxruntime/onnxruntime_cxx_api.h> | ||
|
|
||
| namespace demucsonnx | ||
| { | ||
| extern Ort::AllocatorWithDefaultOptions allocator; | ||
| extern Ort::RunOptions run_options; | ||
|
|
||
| // Define a type for your callback function | ||
| using ProgressCallback = std::function<void(float, const std::string &)>; | ||
|
|
||
| const int FREQ_BRANCH_LEN = 336; | ||
| const int TIME_BRANCH_LEN_IN = 343980; | ||
|
|
||
| struct demucs_model { | ||
| std::unique_ptr<Ort::Session> sess; // Smart pointer to allow "empty" state | ||
| int nb_sources = 0; | ||
| Ort::Env env{ORT_LOGGING_LEVEL_ERROR, "demucs_onnx"}; // Persistent environment | ||
| std::vector<std::string> input_names; // Persistent input names | ||
| std::vector<std::string> output_names; // Persistent output names | ||
|
|
||
| std::vector<const char*> input_names_ptrs; | ||
| std::vector<const char*> output_names_ptrs; | ||
|
|
||
| // Constructor (optionally initialize here if needed) | ||
| demucs_model() = default; | ||
| }; | ||
|
|
||
| bool load_model(const char *model_data, | ||
| int n_bytes, | ||
| struct demucs_model &model, | ||
| Ort::SessionOptions &session_options); | ||
|
|
||
| bool load_model(const std::vector<char> &model_data, | ||
| struct demucs_model &model, | ||
| Ort::SessionOptions &session_options); | ||
|
|
||
// Per-segment scratch state: precomputed padding geometry, the spectral
// buffer, and the preallocated onnxruntime input/output tensors reused
// for each inference segment.
struct demucs_segment_buffers
{
    // NOTE: members initialize in declaration order, and the constructor's
    // init list below relies on that (pad_end uses pad and le,
    // padded_segment_samples uses pad_end, ...) — do not reorder fields.
    int segment_samples;        // raw samples per segment
    int le;                     // segment length in hops, rounded up
    int pad;                    // front padding: 3 * floor(hop / 2)
    int pad_end;                // back padding: pad + round-up to whole hops
    int padded_segment_samples; // segment_samples + pad + pad_end
    int nb_stft_frames;         // STFT frames per segment
    int nb_stft_bins;           // onesided bins: nfft / 2 + 1

    Eigen::Tensor3dXf targets_out; // (sources, channels, samples) result
    Eigen::MatrixXf padded_mix;    // (channels, padded samples) input mix
    // (channels, bins, frames + 4) spectrum; the 4 extra frames are
    // presumably alignment padding — TODO confirm against model_inference.
    Eigen::Tensor3dXcf z;

    // ONNX tensor shapes: x = frequency branch, xt = time branch.
    std::vector<int64_t> x_onnx_in_shape;
    std::vector<int64_t> xt_onnx_in_shape;

    std::vector<int64_t> x_onnx_out_shape;
    std::vector<int64_t> xt_onnx_out_shape;

    // Preallocated Ort tensors, reused across Session::Run calls.
    std::vector<Ort::Value> input_tensors;
    std::vector<Ort::Value> output_tensors;

    // constructor for demucs_segment_buffers that takes int parameters

    // let's do pesky precomputing of the signal repadding to 1/4 hop
    // for time and frequency alignment
    demucs_segment_buffers(int nb_channels, int segment_samples, int nb_sources)
        : segment_samples(segment_samples),
          le(int(std::ceil((float)segment_samples / (float)FFT_HOP_SIZE))),
          pad(std::floor((float)FFT_HOP_SIZE / 2.0f) * 3),
          pad_end(pad + le * FFT_HOP_SIZE - segment_samples),
          padded_segment_samples(segment_samples + pad + pad_end),
          nb_stft_frames(segment_samples / demucsonnx::FFT_HOP_SIZE + 1),
          nb_stft_bins(demucsonnx::FFT_WINDOW_SIZE / 2 + 1),
          targets_out(nb_sources, nb_channels, segment_samples),
          padded_mix(nb_channels, padded_segment_samples),
          z(nb_channels, nb_stft_bins, nb_stft_frames + 4),
          // complex-as-channels implies 2*nb_channels for real+imag
          x_onnx_in_shape({1, 2 * nb_channels, nb_stft_bins - 1, nb_stft_frames}),
          xt_onnx_in_shape({1, nb_channels, segment_samples}),
          x_onnx_out_shape({1, nb_sources, 2 * nb_channels, nb_stft_bins - 1, nb_stft_frames}),
          xt_onnx_out_shape({1, nb_sources, nb_channels, segment_samples})
    {
        // precompute the input tensors
        // inputs in form (xt, x) — only the time-branch (xt) tensor is
        // allocated here; the frequency-branch (x) tensor stays commented
        // out below, so input_tensors holds a single entry.
        input_tensors.push_back(Ort::Value::CreateTensor<float>(
            demucsonnx::allocator,
            xt_onnx_in_shape.data(),
            xt_onnx_in_shape.size()));

        // input_tensors.push_back(Ort::Value::CreateTensor<float>(
        //     demucsonnx::allocator,
        //     x_onnx_in_shape.data(),
        //     x_onnx_in_shape.size()));

        // precompute the output tensors
        // outputs in form (x_out, xt_out) — the frequency-branch output is
        // likewise disabled, so output_tensors holds a single entry.
        // output_tensors.push_back(Ort::Value::CreateTensor<float>(
        //     demucsonnx::allocator,
        //     x_onnx_out_shape.data(),
        //     x_onnx_out_shape.size()));

        output_tensors.push_back(Ort::Value::CreateTensor<float>(
            demucsonnx::allocator,
            xt_onnx_out_shape.data(),
            xt_onnx_out_shape.size()));
    };
};
|
|
||
| const float SEGMENT_LEN_SECS = 7.8; // 8 seconds, the demucs chunk size | ||
| const float SEGMENT_OVERLAP_SECS = 0.25; // 0.25 overlap | ||
| const float MAX_SHIFT_SECS = 0.5; // max shift | ||
| const float OVERLAP = 0.25; // overlap between segments | ||
| const float TRANSITION_POWER = 1.0; // transition between segments | ||
|
|
||
| Eigen::Tensor3dXf demucs_inference(struct demucs_model &model, | ||
| const Eigen::MatrixXf &audio, | ||
| ProgressCallback cb); | ||
|
|
||
| void model_inference(struct demucs_model &model, | ||
| struct demucsonnx::demucs_segment_buffers &buffers); | ||
| } // namespace demucsonnx | ||
|
|
||
| #endif // MODEL_HPP |
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What is this used for? I cannot see any reference to |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,191 @@ | ||
| #include "dsp.hpp" | ||
| #include <algorithm> | ||
| #include <cmath> | ||
| #include <cstdlib> | ||
| #include <iostream> | ||
| #include <memory> | ||
| #include <string> | ||
| #include <unsupported/Eigen/FFT> | ||
| #include <vector> | ||
|
|
||
| // forward declaration of inner stft | ||
| void stft_inner(struct demucsonnx::stft_buffers &stft_buf, | ||
| Eigen::FFT<float> &cfg); | ||
|
|
||
| void istft_inner(struct demucsonnx::stft_buffers &stft_buf, | ||
| Eigen::FFT<float> &cfg); | ||
|
|
||
| // reflect padding | ||
| void pad_signal(struct demucsonnx::stft_buffers &stft_buf) | ||
| { | ||
| // copy from stft_buf.padded_waveform_mono_in+pad into stft_buf.pad_start, | ||
| // stft_buf.pad_end | ||
| std::copy_n(stft_buf.padded_waveform_mono_in.begin() + stft_buf.pad, | ||
| stft_buf.pad, stft_buf.pad_start.begin()); | ||
| std::copy_n(stft_buf.padded_waveform_mono_in.end() - 2 * stft_buf.pad, | ||
| stft_buf.pad, stft_buf.pad_end.begin()); | ||
|
|
||
| std::reverse(stft_buf.pad_start.begin(), stft_buf.pad_start.end()); | ||
| std::reverse(stft_buf.pad_end.begin(), stft_buf.pad_end.end()); | ||
|
|
||
| // copy stft_buf.pad_start into stft_buf.padded_waveform_mono_in | ||
| std::copy_n(stft_buf.pad_start.begin(), stft_buf.pad, | ||
| stft_buf.padded_waveform_mono_in.begin()); | ||
|
|
||
| // copy stft_buf.pad_end into stft_buf.padded_waveform_mono_in | ||
| std::copy_n(stft_buf.pad_end.begin(), stft_buf.pad, | ||
| stft_buf.padded_waveform_mono_in.end() - stft_buf.pad); | ||
| } | ||
|
|
||
| Eigen::FFT<float> get_fft_cfg() | ||
| { | ||
| Eigen::FFT<float> cfg; | ||
|
|
||
| cfg.SetFlag(Eigen::FFT<float>::Speedy); | ||
| // cfg.SetFlag(Eigen::FFT<float>::HalfSpectrum); | ||
| // cfg.SetFlag(Eigen::FFT<float>::Unscaled); | ||
|
|
||
| return cfg; | ||
| } | ||
|
|
||
| void demucsonnx::stft( | ||
| struct stft_buffers &stft_buf, | ||
| const Eigen::MatrixXf &waveform, | ||
| Eigen::Tensor3dXcf &spec) | ||
| { | ||
| // get the fft config | ||
| Eigen::FFT<float> cfg = get_fft_cfg(); | ||
|
|
||
| /*****************************************/ | ||
| /* operate on each channel sequentially */ | ||
| /*****************************************/ | ||
|
|
||
| for (int channel = 0; channel < 2; ++channel) | ||
| { | ||
| Eigen::VectorXf row_vec = waveform.row(channel); | ||
|
|
||
| std::copy_n(row_vec.data(), row_vec.size(), | ||
| stft_buf.padded_waveform_mono_in.begin() + stft_buf.pad); | ||
|
|
||
| // apply padding equivalent to center padding with center=True | ||
| // in torch.stft: | ||
| // https://pytorch.org/docs/stable/generated/torch.stft.html | ||
|
|
||
| // reflect pads stft_buf.padded_waveform_mono in-place | ||
| pad_signal(stft_buf); | ||
|
|
||
| // does forward fft on stft_buf.padded_waveform_mono, stores spectrum in | ||
| // complex_spec_mono | ||
| stft_inner(stft_buf, cfg); | ||
|
|
||
| for (int i = 0; i < stft_buf.nb_bins; ++i) | ||
| { | ||
| for (int j = 0; j < stft_buf.nb_frames; ++j) | ||
| { | ||
| spec(channel, i, j) = stft_buf.complex_spec_mono[j][i]; | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| void demucsonnx::istft( | ||
| struct stft_buffers &stft_buf, | ||
| const Eigen::Tensor3dXcf &spec, | ||
| Eigen::MatrixXf &waveform) | ||
| { | ||
| // get the fft config | ||
| Eigen::FFT<float> cfg = get_fft_cfg(); | ||
|
|
||
| /*****************************************/ | ||
| /* operate on each channel sequentially */ | ||
| /*****************************************/ | ||
|
|
||
| for (int channel = 0; channel < 2; ++channel) | ||
| { | ||
| // Populate the nested vectors | ||
| for (int i = 0; i < stft_buf.nb_bins; ++i) | ||
| { | ||
| for (int j = 0; j < stft_buf.nb_frames; ++j) | ||
| { | ||
| stft_buf.complex_spec_mono[j][i] = spec(channel, i, j); | ||
| } | ||
| } | ||
|
|
||
| // does inverse fft on stft_buf.complex_spec_mono, stores waveform in | ||
| // padded_waveform_mono | ||
| istft_inner(stft_buf, cfg); | ||
|
|
||
| // copies waveform_mono into stft_buf.waveform past first pad samples | ||
| waveform.row(channel) = Eigen::Map<Eigen::MatrixXf>( | ||
| stft_buf.padded_waveform_mono_out.data() + stft_buf.pad, 1, | ||
| stft_buf.padded_waveform_mono_out.size() - FFT_WINDOW_SIZE); | ||
| } | ||
| } | ||
|
|
||
| void stft_inner(struct demucsonnx::stft_buffers &stft_buf, | ||
| Eigen::FFT<float> &cfg) | ||
| { | ||
| int frame_idx = 0; | ||
|
|
||
| // Loop over the waveform with a stride of hop_size | ||
| for (std::size_t start = 0; | ||
| start <= | ||
| stft_buf.padded_waveform_mono_in.size() - demucsonnx::FFT_WINDOW_SIZE; | ||
| start += demucsonnx::FFT_HOP_SIZE) | ||
| { | ||
| // Apply window and run FFT | ||
| for (int i = 0; i < demucsonnx::FFT_WINDOW_SIZE; ++i) | ||
| { | ||
| stft_buf.windowed_waveform_mono[i] = | ||
| stft_buf.padded_waveform_mono_in[start + i] * | ||
| stft_buf.window[i]; | ||
| } | ||
| cfg.fwd(stft_buf.complex_spec_mono[frame_idx], | ||
| stft_buf.windowed_waveform_mono); | ||
| // now scale stft_buf.complex_spec_mono[frame_idx] by 1.0f / | ||
| // sqrt(float(FFT_WINDOW_SIZE))) | ||
|
|
||
| for (int i = 0; i < demucsonnx::FFT_WINDOW_SIZE / 2 + 1; ++i) | ||
| { | ||
| stft_buf.complex_spec_mono[frame_idx][i] *= | ||
| 1.0f / sqrt(float(demucsonnx::FFT_WINDOW_SIZE)); | ||
| } | ||
| frame_idx++; | ||
| } | ||
| } | ||
|
|
||
| void istft_inner(struct demucsonnx::stft_buffers &stft_buf, | ||
| Eigen::FFT<float> &cfg) | ||
| { | ||
| // clear padded_waveform_mono | ||
| std::fill(stft_buf.padded_waveform_mono_out.begin(), | ||
| stft_buf.padded_waveform_mono_out.end(), 0.0f); | ||
|
|
||
| // Loop over the input with a stride of (hop_size) | ||
| for (int start = 0; start < stft_buf.nb_frames * demucsonnx::FFT_HOP_SIZE; | ||
| start += demucsonnx::FFT_HOP_SIZE) | ||
| { | ||
| int frame_idx = start / demucsonnx::FFT_HOP_SIZE; | ||
| // undo sqrt(nfft) scaling | ||
| for (int i = 0; i < demucsonnx::FFT_WINDOW_SIZE / 2 + 1; ++i) | ||
| { | ||
| stft_buf.complex_spec_mono[frame_idx][i] *= | ||
| sqrt(float(demucsonnx::FFT_WINDOW_SIZE)); | ||
| } | ||
| // Run iFFT | ||
| cfg.inv(stft_buf.windowed_waveform_mono, | ||
| stft_buf.complex_spec_mono[frame_idx]); | ||
|
|
||
| // Apply window and add to output | ||
| for (int i = 0; i < demucsonnx::FFT_WINDOW_SIZE; ++i) | ||
| { | ||
| // x[start+i] is the sum of squared window values | ||
| // https://github.com/librosa/librosa/blob/main/librosa/core/spectrum.py#L613 | ||
| // 1e-8f is a small number to avoid division by zero | ||
| stft_buf.padded_waveform_mono_out[start + i] += | ||
| stft_buf.windowed_waveform_mono[i] * stft_buf.window[i] * 1.0f / | ||
| float(demucsonnx::FFT_WINDOW_SIZE) / | ||
| (stft_buf.normalized_window[start + i] + 1e-8f); | ||
| } | ||
| } | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you clarify the usecase for this dependency? Looking at this example, it looks like the ONNX library already contains its own data type to manage tensor, which can create from interleaved 32-bits float audio samples. What is the added benefits?