Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@ Session.vim
*.onnx
*.ort
*.config
/cppscripts/build
6 changes: 6 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
[submodule "cppscripts/dependencies/eigen"]
path = cppscripts/dependencies/eigen
url = https://gitlab.com/libeigen/eigen.git
[submodule "cppscripts/dependencies/libnyquist"]
path = cppscripts/dependencies/libnyquist
url = https://github.com/ddiakopoulos/libnyquist.git
15 changes: 15 additions & 0 deletions cppscripts/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Build orchestration for the C++ CLI. All builds are out-of-source CMake
# builds placed under build/ (which is gitignored).

# These targets are commands, not files: declare them phony so a stray file
# named e.g. "clean-all" cannot shadow them and silently skip the recipe.
.PHONY: default cli cli-debug clean-all clean-cli

default: cli

# Release build of the CLI (16-way parallel)
cli:
	cmake -S src_cli -B build/build-cli -DCMAKE_BUILD_TYPE=Release
	cmake --build build/build-cli -- -j16

# Debug build; note it reuses build/build-cli, so it overwrites the
# Release configuration rather than living side by side with it.
cli-debug:
	cmake -S src_cli -B build/build-cli -DCMAKE_BUILD_TYPE=Debug
	cmake --build build/build-cli -- -j16

# Remove every build tree
clean-all:
	rm -rf build

# Remove only the CLI build tree
clean-cli:
	rm -rf build/build-cli
1 change: 1 addition & 0 deletions cppscripts/dependencies/eigen
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you clarify the use case for this dependency? Looking at this example, it looks like the ONNX library already contains its own data type to manage tensors, which it can create from interleaved 32-bit float audio samples. What are the added benefits?

Submodule eigen added at 8e60d4
1 change: 1 addition & 0 deletions cppscripts/dependencies/libnyquist
Submodule libnyquist added at 767efd
132 changes: 132 additions & 0 deletions cppscripts/src/demucs.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
#ifndef MODEL_HPP
#define MODEL_HPP

#include "dsp.hpp"
#include "tensor.hpp"
#include <Eigen/Dense>
#include <array>
#include <functional>
#include <iostream>
#include <string>
#include <vector>
#include <onnxruntime/onnxruntime_cxx_api.h>

namespace demucsonnx
{
// Shared ONNX-Runtime allocator and run options; defined in exactly one
// translation unit and reused by every tensor/session call in this library.
extern Ort::AllocatorWithDefaultOptions allocator;
extern Ort::RunOptions run_options;

// Progress callback invoked during inference: (fraction_done, status_message).
using ProgressCallback = std::function<void(float, const std::string &)>;

// NOTE(review): neither constant is referenced elsewhere in this header —
// presumably the fixed frequency-branch / time-branch lengths baked into the
// exported ONNX model; confirm against the model export script.
const int FREQ_BRANCH_LEN = 336;
const int TIME_BRANCH_LEN_IN = 343980;

// Loaded Demucs ONNX model: the Ort session plus the environment and the
// input/output-name storage that must outlive every Run() call.
struct demucs_model {
    // unique_ptr so the struct can exist in an "empty" state before
    // load_model() populates it.
    std::unique_ptr<Ort::Session> sess;
    // Number of separated stems the model emits; 0 until a model is loaded.
    int nb_sources = 0;
    // Persistent ORT environment (error-level logging only); must outlive sess.
    Ort::Env env{ORT_LOGGING_LEVEL_ERROR, "demucs_onnx"};
    std::vector<std::string> input_names;  // Persistent input names
    std::vector<std::string> output_names; // Persistent output names

    // NOTE(review): presumably c_str() views into the two vectors above for
    // the Ort::Session::Run() API — they dangle if the vectors reallocate.
    std::vector<const char*> input_names_ptrs;
    std::vector<const char*> output_names_ptrs;

    // Default-constructed; load_model() fills in all fields.
    demucs_model() = default;
};

// Load an ONNX model from a raw byte buffer of n_bytes into `model`,
// creating the Ort::Session with `session_options`.
// Returns true on success, false on failure.
bool load_model(const char *model_data,
                int n_bytes,
                struct demucs_model &model,
                Ort::SessionOptions &session_options);

// Convenience overload: same as above with the model bytes in a vector.
bool load_model(const std::vector<char> &model_data,
                struct demucs_model &model,
                Ort::SessionOptions &session_options);

// Pre-sized scratch buffers for one inference segment: the padded mix, its
// complex spectrogram, and the pre-allocated ONNX input/output tensors with
// their shapes. Allocated once and reused across segments.
struct demucs_segment_buffers
{
    int segment_samples;        // raw segment length in samples
    int le;                     // number of FFT hops covering the segment (rounded up)
    int pad;                    // samples of padding prepended to the segment
    int pad_end;                // padding appended so the total is a whole number of hops
    int padded_segment_samples; // segment_samples + pad + pad_end
    int nb_stft_frames;         // STFT frames for the unpadded segment
    int nb_stft_bins;           // FFT_WINDOW_SIZE / 2 + 1

    Eigen::Tensor3dXf targets_out; // (sources, channels, samples) separated output
    Eigen::MatrixXf padded_mix;    // (channels, padded samples) input staging
    Eigen::Tensor3dXcf z;          // (channels, bins, frames+4) complex spectrogram
                                   // NOTE(review): +4 frames of slack — confirm why

    // ONNX tensor shapes. The frequency branch drops the Nyquist bin
    // (nb_stft_bins - 1) and packs real/imag as 2*channels.
    std::vector<int64_t> x_onnx_in_shape;
    std::vector<int64_t> xt_onnx_in_shape;

    std::vector<int64_t> x_onnx_out_shape;
    std::vector<int64_t> xt_onnx_out_shape;

    std::vector<Ort::Value> input_tensors;
    std::vector<Ort::Value> output_tensors;

    // Precompute the signal repadding to a whole number of hops so the time
    // and frequency branches stay aligned.
    demucs_segment_buffers(int nb_channels, int segment_samples, int nb_sources)
        : segment_samples(segment_samples),
          le(int(std::ceil((float)segment_samples / (float)FFT_HOP_SIZE))),
          pad(std::floor((float)FFT_HOP_SIZE / 2.0f) * 3),
          pad_end(pad + le * FFT_HOP_SIZE - segment_samples),
          padded_segment_samples(segment_samples + pad + pad_end),
          nb_stft_frames(segment_samples / demucsonnx::FFT_HOP_SIZE + 1),
          nb_stft_bins(demucsonnx::FFT_WINDOW_SIZE / 2 + 1),
          targets_out(nb_sources, nb_channels, segment_samples),
          padded_mix(nb_channels, padded_segment_samples),
          z(nb_channels, nb_stft_bins, nb_stft_frames+4),
          // complex-as-channels implies 2*nb_channels for real+imag
          x_onnx_in_shape({1, 2 * nb_channels, nb_stft_bins - 1, nb_stft_frames}),
          xt_onnx_in_shape({1, nb_channels, segment_samples}),
          x_onnx_out_shape({1, nb_sources, 2 * nb_channels, nb_stft_bins - 1, nb_stft_frames}),
          xt_onnx_out_shape({1, nb_sources, nb_channels, segment_samples})
    {
        // Pre-allocate the ONNX input tensor for the time branch (xt).
        // NOTE(review): the frequency-branch (x) input/output tensors are
        // commented out below — presumably the exported model computes its
        // own STFT and only consumes the waveform; confirm against the model.
        input_tensors.push_back(Ort::Value::CreateTensor<float>(
            demucsonnx::allocator,
            xt_onnx_in_shape.data(),
            xt_onnx_in_shape.size()));

        // input_tensors.push_back(Ort::Value::CreateTensor<float>(
        //     demucsonnx::allocator,
        //     x_onnx_in_shape.data(),
        //     x_onnx_in_shape.size()));

        // output_tensors.push_back(Ort::Value::CreateTensor<float>(
        //     demucsonnx::allocator,
        //     x_onnx_out_shape.data(),
        //     x_onnx_out_shape.size()));

        // Pre-allocate the ONNX output tensor for the time branch (xt_out).
        output_tensors.push_back(Ort::Value::CreateTensor<float>(
            demucsonnx::allocator,
            xt_onnx_out_shape.data(),
            xt_onnx_out_shape.size()));
    };
};

// Segmentation parameters for chunked inference (all values in seconds
// unless noted). constexpr so they are usable in constant expressions;
// the old comment claimed "8 seconds" but the actual chunk size is 7.8 s.
constexpr float SEGMENT_LEN_SECS = 7.8f;      // demucs chunk size (~7.8 s)
constexpr float SEGMENT_OVERLAP_SECS = 0.25f; // overlap between chunks
constexpr float MAX_SHIFT_SECS = 0.5f;        // max random shift applied per pass
constexpr float OVERLAP = 0.25f;              // overlap ratio between segments
constexpr float TRANSITION_POWER = 1.0f;      // weight-curve power for blending segments

// Run the full separation over `audio` (channels x samples), chunking it
// into overlapping segments. `cb` receives progress updates — presumably a
// fraction in [0, 1] plus a status message; confirm in the implementation.
// Returns a (sources, channels, samples) tensor of separated stems.
Eigen::Tensor3dXf demucs_inference(struct demucs_model &model,
                                   const Eigen::MatrixXf &audio,
                                   ProgressCallback cb);

// Run the ONNX session once on the segment currently staged in `buffers`.
void model_inference(struct demucs_model &model,
                     struct demucsonnx::demucs_segment_buffers &buffers);
} // namespace demucsonnx

#endif // MODEL_HPP
191 changes: 191 additions & 0 deletions cppscripts/src/dsp.cpp
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is this used for? I cannot see any reference to demucsonnx::stft or demucsonnx::istft. My understanding is that the stft and istft functions are already part of the model — is the intent to swap them out for an explicit implementation? If so, how does that work with GPU/NPU devices?

Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
#include "dsp.hpp"
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <memory>
#include <string>
#include <unsupported/Eigen/FFT>
#include <vector>

// Forward declarations of the per-channel workers (defined below): both
// operate in place on the scratch buffers inside stft_buf using the
// caller-supplied Eigen FFT plan.
void stft_inner(struct demucsonnx::stft_buffers &stft_buf,
                Eigen::FFT<float> &cfg);

void istft_inner(struct demucsonnx::stft_buffers &stft_buf,
                 Eigen::FFT<float> &cfg);

// Reflect padding: mirror `pad` samples from just inside each edge of the
// padded buffer back out into the pad regions at both ends (the same edge
// treatment as torch.stft's center padding).
void pad_signal(struct demucsonnx::stft_buffers &stft_buf)
{
    const int n_pad = stft_buf.pad;
    auto &wave = stft_buf.padded_waveform_mono_in;

    // Copy n_pad samples starting at src into scratch, flip them, then write
    // the flipped run to dst.
    auto mirror = [n_pad](auto src, auto &scratch, auto dst)
    {
        std::copy_n(src, n_pad, scratch.begin());
        std::reverse(scratch.begin(), scratch.end());
        std::copy_n(scratch.begin(), n_pad, dst);
    };

    // Front pad <- reflection of samples [pad, 2*pad).
    mirror(wave.begin() + n_pad, stft_buf.pad_start, wave.begin());
    // Back pad <- reflection of samples [size - 2*pad, size - pad).
    mirror(wave.end() - 2 * n_pad, stft_buf.pad_end, wave.end() - n_pad);
}

// Build the FFT plan shared by stft_inner and istft_inner.
Eigen::FFT<float> get_fft_cfg()
{
    Eigen::FFT<float> fft_plan;

    // Trade a little accuracy for speed. HalfSpectrum and Unscaled are
    // deliberately left at their defaults (full spectrum, scaled inverse).
    fft_plan.SetFlag(Eigen::FFT<float>::Speedy);

    return fft_plan;
}

// Forward STFT of `waveform` (channels x samples) into `spec`
// (channel, bin, frame), with reflect padding equivalent to
// torch.stft(center=True):
// https://pytorch.org/docs/stable/generated/torch.stft.html
// Generalized from a hard-coded 2-channel loop to waveform.rows() channels;
// behavior is unchanged for stereo input. spec must have at least
// waveform.rows() entries along its first dimension.
void demucsonnx::stft(
    struct stft_buffers &stft_buf,
    const Eigen::MatrixXf &waveform,
    Eigen::Tensor3dXcf &spec)
{
    // get the fft config
    Eigen::FFT<float> cfg = get_fft_cfg();

    /*****************************************/
    /*  operate on each channel sequentially */
    /*****************************************/

    const int nb_channels = static_cast<int>(waveform.rows());

    for (int channel = 0; channel < nb_channels; ++channel)
    {
        // Copy the (strided) Eigen row into a contiguous vector so the
        // raw-pointer copy below is valid.
        Eigen::VectorXf row_vec = waveform.row(channel);

        std::copy_n(row_vec.data(), row_vec.size(),
                    stft_buf.padded_waveform_mono_in.begin() + stft_buf.pad);

        // reflect-pads stft_buf.padded_waveform_mono_in in-place
        pad_signal(stft_buf);

        // forward FFT per frame; the spectrum lands in
        // stft_buf.complex_spec_mono (frame-major)
        stft_inner(stft_buf, cfg);

        // transpose the frame-major scratch into the (channel, bin, frame)
        // output tensor
        for (int i = 0; i < stft_buf.nb_bins; ++i)
        {
            for (int j = 0; j < stft_buf.nb_frames; ++j)
            {
                spec(channel, i, j) = stft_buf.complex_spec_mono[j][i];
            }
        }
    }
}

// Inverse STFT of `spec` (channel, bin, frame) into `waveform`
// (channels x samples), trimming the reflect padding added by stft().
// Generalized from a hard-coded 2-channel loop to waveform.rows() channels;
// behavior is unchanged for stereo input. spec must have at least
// waveform.rows() entries along its first dimension.
void demucsonnx::istft(
    struct stft_buffers &stft_buf,
    const Eigen::Tensor3dXcf &spec,
    Eigen::MatrixXf &waveform)
{
    // get the fft config
    Eigen::FFT<float> cfg = get_fft_cfg();

    /*****************************************/
    /*  operate on each channel sequentially */
    /*****************************************/

    const int nb_channels = static_cast<int>(waveform.rows());

    for (int channel = 0; channel < nb_channels; ++channel)
    {
        // stage this channel's spectrum into the frame-major scratch buffer
        for (int i = 0; i < stft_buf.nb_bins; ++i)
        {
            for (int j = 0; j < stft_buf.nb_frames; ++j)
            {
                stft_buf.complex_spec_mono[j][i] = spec(channel, i, j);
            }
        }

        // inverse FFT + overlap-add into padded_waveform_mono_out
        istft_inner(stft_buf, cfg);

        // Copy the reconstructed waveform back, skipping the first `pad`
        // samples; the mapped length trims FFT_WINDOW_SIZE samples of
        // padding overall.
        waveform.row(channel) = Eigen::Map<Eigen::MatrixXf>(
            stft_buf.padded_waveform_mono_out.data() + stft_buf.pad, 1,
            stft_buf.padded_waveform_mono_out.size() - FFT_WINDOW_SIZE);
    }
}

// Per-channel forward STFT worker: slides a window across the padded input
// with FFT_HOP_SIZE stride, applies the analysis window, runs the FFT, and
// scales each spectrum by 1/sqrt(FFT_WINDOW_SIZE).
void stft_inner(struct demucsonnx::stft_buffers &stft_buf,
                Eigen::FFT<float> &cfg)
{
    // Hoisted loop invariant: the original recomputed 1/sqrt(nfft) for
    // every bin of every frame.
    const float scale =
        1.0f / std::sqrt(float(demucsonnx::FFT_WINDOW_SIZE));

    // Use signed arithmetic for the loop bound: the original compared
    // against `size() - FFT_WINDOW_SIZE`, which wraps around (size_t
    // underflow) to a huge value if the padded buffer is ever shorter than
    // one window; this form simply runs zero frames in that case.
    const std::ptrdiff_t n_samples =
        static_cast<std::ptrdiff_t>(stft_buf.padded_waveform_mono_in.size());

    int frame_idx = 0;

    // Loop over the waveform with a stride of hop_size
    for (std::ptrdiff_t start = 0;
         start + demucsonnx::FFT_WINDOW_SIZE <= n_samples;
         start += demucsonnx::FFT_HOP_SIZE)
    {
        // Apply window and run FFT
        for (int i = 0; i < demucsonnx::FFT_WINDOW_SIZE; ++i)
        {
            stft_buf.windowed_waveform_mono[i] =
                stft_buf.padded_waveform_mono_in[start + i] *
                stft_buf.window[i];
        }
        cfg.fwd(stft_buf.complex_spec_mono[frame_idx],
                stft_buf.windowed_waveform_mono);

        // scale the spectrum by 1/sqrt(nfft) (matches torch.stft's
        // normalized=True convention)
        for (int i = 0; i < demucsonnx::FFT_WINDOW_SIZE / 2 + 1; ++i)
        {
            stft_buf.complex_spec_mono[frame_idx][i] *= scale;
        }
        frame_idx++;
    }
}

// Per-channel inverse STFT worker: undoes the forward 1/sqrt(nfft) scaling,
// runs the inverse FFT per frame, and overlap-adds the windowed frames into
// padded_waveform_mono_out, normalizing by the precomputed sum of squared
// windows (librosa-style window-sumsquare correction).
void istft_inner(struct demucsonnx::stft_buffers &stft_buf,
                 Eigen::FFT<float> &cfg)
{
    // clear the overlap-add accumulator
    std::fill(stft_buf.padded_waveform_mono_out.begin(),
              stft_buf.padded_waveform_mono_out.end(), 0.0f);

    // Hoisted loop invariant: the original recomputed sqrt(nfft) for every
    // bin of every frame.
    const float unscale = std::sqrt(float(demucsonnx::FFT_WINDOW_SIZE));

    // Loop over the input with a stride of (hop_size)
    for (int start = 0; start < stft_buf.nb_frames * demucsonnx::FFT_HOP_SIZE;
         start += demucsonnx::FFT_HOP_SIZE)
    {
        int frame_idx = start / demucsonnx::FFT_HOP_SIZE;

        // undo the forward 1/sqrt(nfft) scaling (mutates the staged
        // spectrum in place, as the original did)
        for (int i = 0; i < demucsonnx::FFT_WINDOW_SIZE / 2 + 1; ++i)
        {
            stft_buf.complex_spec_mono[frame_idx][i] *= unscale;
        }

        // Run iFFT
        cfg.inv(stft_buf.windowed_waveform_mono,
                stft_buf.complex_spec_mono[frame_idx]);

        // Apply the synthesis window and overlap-add into the output.
        // normalized_window[start+i] is the sum of squared window values:
        // https://github.com/librosa/librosa/blob/main/librosa/core/spectrum.py#L613
        // 1e-8f avoids division by zero. (A redundant `* 1.0f` factor was
        // removed; multiplying by 1.0f is an exact no-op.)
        for (int i = 0; i < demucsonnx::FFT_WINDOW_SIZE; ++i)
        {
            stft_buf.padded_waveform_mono_out[start + i] +=
                stft_buf.windowed_waveform_mono[i] * stft_buf.window[i] /
                float(demucsonnx::FFT_WINDOW_SIZE) /
                (stft_buf.normalized_window[start + i] + 1e-8f);
        }
    }
}
Loading
Loading