diff --git a/.gitignore b/.gitignore
index a7411787..a6a24c66 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,3 +17,4 @@ Session.vim
 *.onnx
 *.ort
 *.config
+/cppscripts/build
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 00000000..537d1dfc
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,6 @@
+[submodule "cppscripts/dependencies/eigen"]
+	path = cppscripts/dependencies/eigen
+	url = https://gitlab.com/libeigen/eigen.git
+[submodule "cppscripts/dependencies/libnyquist"]
+	path = cppscripts/dependencies/libnyquist
+	url = https://github.com/ddiakopoulos/libnyquist.git
diff --git a/cppscripts/Makefile b/cppscripts/Makefile
new file mode 100644
index 00000000..717454ce
--- /dev/null
+++ b/cppscripts/Makefile
@@ -0,0 +1,15 @@
+default: cli
+
+cli:
+	cmake -S src_cli -B build/build-cli -DCMAKE_BUILD_TYPE=Release
+	cmake --build build/build-cli -- -j16
+
+cli-debug:
+	cmake -S src_cli -B build/build-cli -DCMAKE_BUILD_TYPE=Debug
+	cmake --build build/build-cli -- -j16
+
+clean-all:
+	rm -rf build
+
+clean-cli:
+	rm -rf build/build-cli
diff --git a/cppscripts/dependencies/eigen b/cppscripts/dependencies/eigen
new file mode 160000
index 00000000..8e60d417
--- /dev/null
+++ b/cppscripts/dependencies/eigen
@@ -0,0 +1 @@
+Subproject commit 8e60d4173c6d8d5ee216c13ffd6142639be2bcc8
diff --git a/cppscripts/dependencies/libnyquist b/cppscripts/dependencies/libnyquist
new file mode 160000
index 00000000..767efd97
--- /dev/null
+++ b/cppscripts/dependencies/libnyquist
@@ -0,0 +1 @@
+Subproject commit 767efd97cdd7a281d193296586e708490eb6e54f
diff --git a/cppscripts/src/demucs.hpp b/cppscripts/src/demucs.hpp
new file mode 100644
index 00000000..cb986ee9
--- /dev/null
+++ b/cppscripts/src/demucs.hpp
@@ -0,0 +1,132 @@
+#ifndef MODEL_HPP
+#define MODEL_HPP
+
+#include "dsp.hpp"
+#include "tensor.hpp"
+#include <Eigen/Dense>
+#include <array>
+#include <functional>
+#include <iostream>
+#include <string>
+#include <vector>
+#include <onnxruntime/onnxruntime_cxx_api.h>
+
+namespace demucsonnx
+{
+extern Ort::AllocatorWithDefaultOptions allocator;
+extern Ort::RunOptions run_options;
+
+// Define a type for your callback function
+using ProgressCallback = std::function<void(float, const std::string &)>;
+
+const int FREQ_BRANCH_LEN = 336;
+const int TIME_BRANCH_LEN_IN = 343980;
+
+struct demucs_model {
+    std::unique_ptr<Ort::Session> sess;        // Smart pointer to allow "empty" state
+    int nb_sources = 0;
+    Ort::Env env{ORT_LOGGING_LEVEL_ERROR, "demucs_onnx"}; // Persistent environment
+    std::vector<std::string> input_names;      // Persistent input names
+    std::vector<std::string> output_names;     // Persistent output names
+
+    std::vector<const char*> input_names_ptrs;
+    std::vector<const char*> output_names_ptrs;
+
+    // Constructor (optionally initialize here if needed)
+    demucs_model() = default;
+};
+
+bool load_model(const char *model_data,
+                int n_bytes,
+                struct demucs_model &model,
+                Ort::SessionOptions &session_options);
+
+bool load_model(const std::vector<char> &model_data,
+                struct demucs_model &model,
+                Ort::SessionOptions &session_options);
+
+struct demucs_segment_buffers
+{
+    int segment_samples;
+    int le;
+    int pad;
+    int pad_end;
+    int padded_segment_samples;
+    int nb_stft_frames;
+    int nb_stft_bins;
+
+    Eigen::Tensor3dXf targets_out;
+    Eigen::MatrixXf padded_mix;
+    Eigen::Tensor3dXcf z;
+
+    std::vector<int64_t> x_onnx_in_shape;
+    std::vector<int64_t> xt_onnx_in_shape;
+
+    std::vector<int64_t> x_onnx_out_shape;
+    std::vector<int64_t> xt_onnx_out_shape;
+
+    std::vector<Ort::Value> input_tensors;
+    std::vector<Ort::Value> output_tensors;
+
+    // constructor for demucs_segment_buffers that takes int parameters
+
+    // let's do pesky precomputing of the signal repadding to 1/4 hop
+    // for time and frequency alignment
+    demucs_segment_buffers(int nb_channels, int segment_samples, int nb_sources)
+        : segment_samples(segment_samples),
+          le(int(std::ceil((float)segment_samples / (float)FFT_HOP_SIZE))),
+          pad(std::floor((float)FFT_HOP_SIZE / 2.0f) * 3),
+          pad_end(pad + le * FFT_HOP_SIZE - segment_samples),
+          padded_segment_samples(segment_samples + pad + pad_end),
+          nb_stft_frames(segment_samples / demucsonnx::FFT_HOP_SIZE + 1),
+          nb_stft_bins(demucsonnx::FFT_WINDOW_SIZE / 2 + 1),
+          targets_out(nb_sources, nb_channels, segment_samples),
+          padded_mix(nb_channels, padded_segment_samples),
+          z(nb_channels, nb_stft_bins, nb_stft_frames+4),
+          // complex-as-channels implies 2*nb_channels for real+imag
+          x_onnx_in_shape({1, 2 * nb_channels, nb_stft_bins - 1, nb_stft_frames}),
+          xt_onnx_in_shape({1, nb_channels, segment_samples}),
+          x_onnx_out_shape({1, nb_sources, 2 * nb_channels, nb_stft_bins - 1, nb_stft_frames}),
+          xt_onnx_out_shape({1, nb_sources, nb_channels, segment_samples})
+    {
+        // precompute the input tensors
+        // inputs in form (xt, x)
+        input_tensors.push_back(Ort::Value::CreateTensor<float>(
+            demucsonnx::allocator,
+            xt_onnx_in_shape.data(),
+            xt_onnx_in_shape.size()));
+
+        // input_tensors.push_back(Ort::Value::CreateTensor<float>(
+        //     demucsonnx::allocator,
+        //     x_onnx_in_shape.data(),
+        //     x_onnx_in_shape.size()));
+
+        // precompute the output tensors
+        // outputs in form (x_out, xt_out)
+        // output_tensors.push_back(Ort::Value::CreateTensor<float>(
+        //     demucsonnx::allocator,
+        //     x_onnx_out_shape.data(),
+        //     x_onnx_out_shape.size()));
+
+        output_tensors.push_back(Ort::Value::CreateTensor<float>(
+            demucsonnx::allocator,
+            xt_onnx_out_shape.data(),
+            xt_onnx_out_shape.size()));
+    };
+};
+
+const float SEGMENT_LEN_SECS = 7.8;      // 8 seconds, the demucs chunk size
+const float SEGMENT_OVERLAP_SECS = 0.25; // 0.25 overlap
+const float MAX_SHIFT_SECS = 0.5;        // max shift
+const float OVERLAP = 0.25;              // overlap between segments
+const float TRANSITION_POWER = 1.0;      // transition between segments
+
+Eigen::Tensor3dXf demucs_inference(struct demucs_model &model,
+                                   const Eigen::MatrixXf &audio,
+                                   ProgressCallback cb);
+
+void model_inference(struct demucs_model &model,
+                     struct demucsonnx::demucs_segment_buffers &buffers);
+} // namespace demucsonnx
+
+#endif // MODEL_HPP
diff --git a/cppscripts/src/dsp.cpp b/cppscripts/src/dsp.cpp
new file mode 100644
index 00000000..85c2e71d
--- /dev/null
+++ b/cppscripts/src/dsp.cpp
@@ -0,0 +1,191 @@
+#include "dsp.hpp"
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <unsupported/Eigen/FFT>
+#include <vector>
+
+// forward declaration of inner stft
+void stft_inner(struct demucsonnx::stft_buffers &stft_buf,
+                Eigen::FFT<float> &cfg);
+
+void istft_inner(struct demucsonnx::stft_buffers &stft_buf,
+                 Eigen::FFT<float> &cfg);
+
+// reflect padding
+void pad_signal(struct demucsonnx::stft_buffers &stft_buf)
+{
+    // copy from stft_buf.padded_waveform_mono_in+pad into stft_buf.pad_start,
+    // stft_buf.pad_end
+    std::copy_n(stft_buf.padded_waveform_mono_in.begin() + stft_buf.pad,
+                stft_buf.pad, stft_buf.pad_start.begin());
+    std::copy_n(stft_buf.padded_waveform_mono_in.end() - 2 * stft_buf.pad,
+                stft_buf.pad, stft_buf.pad_end.begin());
+
+    std::reverse(stft_buf.pad_start.begin(), stft_buf.pad_start.end());
+    std::reverse(stft_buf.pad_end.begin(), stft_buf.pad_end.end());
+
+    // copy stft_buf.pad_start into stft_buf.padded_waveform_mono_in
+    std::copy_n(stft_buf.pad_start.begin(), stft_buf.pad,
+                stft_buf.padded_waveform_mono_in.begin());
+
+    // copy stft_buf.pad_end into stft_buf.padded_waveform_mono_in
+    std::copy_n(stft_buf.pad_end.begin(), stft_buf.pad,
+                stft_buf.padded_waveform_mono_in.end() - stft_buf.pad);
+}
+
+Eigen::FFT<float> get_fft_cfg()
+{
+    Eigen::FFT<float> cfg;
+
+    cfg.SetFlag(Eigen::FFT<float>::Speedy);
+    // cfg.SetFlag(Eigen::FFT<float>::HalfSpectrum);
+    // cfg.SetFlag(Eigen::FFT<float>::Unscaled);
+
+    return cfg;
+}
+
+void demucsonnx::stft(
+    struct stft_buffers &stft_buf,
+    const Eigen::MatrixXf &waveform,
+    Eigen::Tensor3dXcf &spec)
+{
+    // get the fft config
+    Eigen::FFT<float> cfg = get_fft_cfg();
+
+    /*****************************************/
+    /*  operate on each channel sequentially */
+    /*****************************************/
+
+    for (int channel = 0; channel < 2; ++channel)
+    {
+        Eigen::VectorXf row_vec = waveform.row(channel);
+
+        std::copy_n(row_vec.data(), row_vec.size(),
+                    stft_buf.padded_waveform_mono_in.begin() + stft_buf.pad);
+
+        // apply padding equivalent to center padding with center=True
+        // in torch.stft:
+        // https://pytorch.org/docs/stable/generated/torch.stft.html
+
+        // reflect pads stft_buf.padded_waveform_mono in-place
+        pad_signal(stft_buf);
+
+        // does forward fft on stft_buf.padded_waveform_mono, stores spectrum in
+        // complex_spec_mono
+        stft_inner(stft_buf, cfg);
+
+        for (int i = 0; i < stft_buf.nb_bins; ++i)
+        {
+            for (int j = 0; j < stft_buf.nb_frames; ++j)
+            {
+                spec(channel, i, j) = stft_buf.complex_spec_mono[j][i];
+            }
+        }
+    }
+}
+
+void demucsonnx::istft(
+    struct stft_buffers &stft_buf,
+    const Eigen::Tensor3dXcf &spec,
+    Eigen::MatrixXf &waveform)
+{
+    // get the fft config
+    Eigen::FFT<float> cfg = get_fft_cfg();
+
+    /*****************************************/
+    /*  operate on each channel sequentially */
+    /*****************************************/
+
+    for (int channel = 0; channel < 2; ++channel)
+    {
+        // Populate the nested vectors
+        for (int i = 0; i < stft_buf.nb_bins; ++i)
+        {
+            for (int j = 0; j < stft_buf.nb_frames; ++j)
+            {
+                stft_buf.complex_spec_mono[j][i] = spec(channel, i, j);
+            }
+        }
+
+        // does inverse fft on stft_buf.complex_spec_mono, stores waveform in
+        // padded_waveform_mono
+        istft_inner(stft_buf, cfg);
+
+        // copies waveform_mono into stft_buf.waveform past first pad samples
+        waveform.row(channel) = Eigen::Map<Eigen::MatrixXf>(
+            stft_buf.padded_waveform_mono_out.data() + stft_buf.pad, 1,
+            stft_buf.padded_waveform_mono_out.size() - FFT_WINDOW_SIZE);
+    }
+}
+
+void stft_inner(struct demucsonnx::stft_buffers &stft_buf,
+                Eigen::FFT<float> &cfg)
+{
+    int frame_idx = 0;
+
+    // Loop over the waveform with a stride of hop_size
+    for (std::size_t start = 0;
+         start <=
+         stft_buf.padded_waveform_mono_in.size() - demucsonnx::FFT_WINDOW_SIZE;
+         start += demucsonnx::FFT_HOP_SIZE)
+    {
+        // Apply window and run FFT
+        for (int i = 0; i < demucsonnx::FFT_WINDOW_SIZE; ++i)
+        {
+            stft_buf.windowed_waveform_mono[i] =
+                stft_buf.padded_waveform_mono_in[start + i] *
+                stft_buf.window[i];
+        }
+        cfg.fwd(stft_buf.complex_spec_mono[frame_idx],
+                stft_buf.windowed_waveform_mono);
+        // now scale stft_buf.complex_spec_mono[frame_idx] by 1.0f /
+        // sqrt(float(FFT_WINDOW_SIZE)))
+
+        for (int i = 0; i < demucsonnx::FFT_WINDOW_SIZE / 2 + 1; ++i)
+        {
+            stft_buf.complex_spec_mono[frame_idx][i] *=
+                1.0f / sqrt(float(demucsonnx::FFT_WINDOW_SIZE));
+        }
+        frame_idx++;
+    }
+}
+
+void istft_inner(struct demucsonnx::stft_buffers &stft_buf,
+                 Eigen::FFT<float> &cfg)
+{
+    // clear padded_waveform_mono
+    std::fill(stft_buf.padded_waveform_mono_out.begin(),
+              stft_buf.padded_waveform_mono_out.end(), 0.0f);
+
+    // Loop over the input with a stride of (hop_size)
+    for (int start = 0; start < stft_buf.nb_frames * demucsonnx::FFT_HOP_SIZE;
+         start += demucsonnx::FFT_HOP_SIZE)
+    {
+        int frame_idx = start / demucsonnx::FFT_HOP_SIZE;
+        // undo sqrt(nfft) scaling
+        for (int i = 0; i < demucsonnx::FFT_WINDOW_SIZE / 2 + 1; ++i)
+        {
+            stft_buf.complex_spec_mono[frame_idx][i] *=
+                sqrt(float(demucsonnx::FFT_WINDOW_SIZE));
+        }
+        // Run iFFT
+        cfg.inv(stft_buf.windowed_waveform_mono,
+                stft_buf.complex_spec_mono[frame_idx]);
+
+        // Apply window and add to output
+        for (int i = 0; i < demucsonnx::FFT_WINDOW_SIZE; ++i)
+        {
+            // x[start+i] is the sum of squared window values
+            // https://github.com/librosa/librosa/blob/main/librosa/core/spectrum.py#L613
+            // 1e-8f is a small number to avoid division by zero
+            stft_buf.padded_waveform_mono_out[start + i] +=
+                stft_buf.windowed_waveform_mono[i] * stft_buf.window[i] * 1.0f /
+                float(demucsonnx::FFT_WINDOW_SIZE) /
+                (stft_buf.normalized_window[start + i] + 1e-8f);
+        }
+    }
+}
diff --git a/cppscripts/src/dsp.hpp b/cppscripts/src/dsp.hpp
new file mode 100644
index 00000000..ef99b61d
--- /dev/null
+++ b/cppscripts/src/dsp.hpp
@@ -0,0 +1,109 @@
+#ifndef DSP_HPP
+#define DSP_HPP
+
+#include <Eigen/Dense>
+#include <complex>
+#include <iostream>
+#include <string>
+#include <tensor.hpp>
+#include <unsupported/Eigen/FFT>
+#include <vector>
+
+namespace demucsonnx
+{
+
+const int SUPPORTED_SAMPLE_RATE = 44100;
+const int FFT_WINDOW_SIZE = 4096;
+
+const int FFT_HOP_SIZE = 1024; // 25% hop i.e. 75% overlap
+
+struct stft_buffers
+{
+    int nb_frames;
+    int nb_bins;
+    int pad;
+
+    const std::vector<float> window;
+    const std::vector<float> normalized_window;
+
+    std::vector<float> pad_start;
+    std::vector<float> pad_end;
+    std::vector<float> padded_waveform_mono_in;
+    std::vector<float> padded_waveform_mono_out;
+    std::vector<float> windowed_waveform_mono;
+
+    std::vector<std::vector<std::complex<float>>> complex_spec_mono;
+
+    // constructor for stft_buffers that takes some parameters
+    // to hint at the sizes of the buffers
+    explicit stft_buffers(int n_samples)
+        : nb_frames(n_samples / FFT_HOP_SIZE + 1),
+          nb_bins(FFT_WINDOW_SIZE / 2 + 1), pad(FFT_WINDOW_SIZE / 2),
+          window(init_const_hann_window()),
+          normalized_window(init_const_normalized_window(window, nb_frames)),
+          pad_start(std::vector<float>(pad)), pad_end(std::vector<float>(pad)),
+          padded_waveform_mono_in(
+              std::vector<float>(n_samples + FFT_WINDOW_SIZE)),
+          padded_waveform_mono_out(
+              std::vector<float>(n_samples + FFT_WINDOW_SIZE)),
+          windowed_waveform_mono(std::vector<float>(FFT_WINDOW_SIZE)),
+          complex_spec_mono(std::vector<std::vector<std::complex<float>>>(
+              nb_frames, std::vector<std::complex<float>>(nb_bins))){};
+
+    static std::vector<float> init_const_hann_window()
+    {
+        static constexpr float PI = 3.14159265359F;
+        std::vector<float> window(FFT_WINDOW_SIZE);
+
+        // create a periodic hann window
+        // by generating L+1 points and deleting the last one
+        auto floatN = (float)(FFT_WINDOW_SIZE + 1);
+
+        for (std::size_t n = 0; n < FFT_WINDOW_SIZE; ++n)
+        {
+            window[n] =
+                0.5F * (1.0F - cosf(2.0F * PI * (float)n / (floatN - 1)));
+        }
+
+        return window;
+    }
+
+    static std::vector<float>
+    init_const_normalized_window(const std::vector<float> &window,
+                                 int nb_frames)
+    {
+        float window_normalization_factor =
+            FFT_WINDOW_SIZE + FFT_HOP_SIZE * (nb_frames - 1);
+        std::vector<float> normalized_window =
+            std::vector<float>(window_normalization_factor, 0.0f);
+
+        // Compute the window normalization factor
+        // using librosa window_sumsquare to compute the squared window
+        // https://github.com/librosa/librosa/blob/main/librosa/filters.py#L1545
+        for (int i = 0; i < nb_frames; ++i)
+        {
+            auto sample = i * FFT_HOP_SIZE;
+            for (int j = sample; j < std::min((int)window_normalization_factor,
+                                              sample + FFT_WINDOW_SIZE);
+                 ++j)
+            {
+                normalized_window[j] += window[j - sample] * window[j - sample];
+            }
+        }
+        return normalized_window;
+    }
+};
+
+void stft(
+    struct stft_buffers &stft_buf,
+    const Eigen::MatrixXf &waveform,
+    Eigen::Tensor3dXcf &spec);
+
+void istft(
+    struct stft_buffers &stft_buf,
+    const Eigen::Tensor3dXcf &spec,
+    Eigen::MatrixXf &waveform);
+
+} // namespace demucsonnx
+
+#endif // DSP_HPP
diff --git a/cppscripts/src/model_apply.cpp b/cppscripts/src/model_apply.cpp
new file mode 100644
index 00000000..7dde466d
--- /dev/null
+++ b/cppscripts/src/model_apply.cpp
@@ -0,0 +1,214 @@
+#include "demucs.hpp"
+#include "dsp.hpp"
+#include <Eigen/Dense>
+#include <cmath>
+#include <cstdlib>
+#include <filesystem>
+#include <fstream>
+#include <iostream>
+#include <random>
+#include <sstream>
+#include <string>
+#include <tuple>
+#include <unsupported/Eigen/FFT>
+#include <unsupported/Eigen/MatrixFunctions>
+#include <vector>
+#include <Eigen/Dense>
+#include <onnxruntime/onnxruntime_cxx_api.h>
+#include <unsupported/Eigen/CXX11/Tensor>
+#include "demucs.hpp"
+
+// At global/class scope
+static std::random_device rd;  // Get entropy for seed
+static std::mt19937 gen(rd()); // Mersenne Twister generator
+
+static void reflect_padding(
+    Eigen::MatrixXf &padded_mix,
+    int left_padding,
+    int right_padding,
+    int N)
+{
+    // Reflect from the first 'left_padding' samples of the original data
+    for (int i = 0; i < left_padding; ++i)
+    {
+        padded_mix.block(0, left_padding - 1 - i, 2, 1) =
+            padded_mix.block(0, left_padding + i, 2, 1);
+    }
+
+    // Reflect from the last 'right_padding' samples of the original data
+    for (int i = 0; i < right_padding; ++i)
+    {
+        int last_elem = N + left_padding - 1 ;
+        padded_mix.block(0, last_elem + i + 1, 2, 1) =
+            padded_mix.block(0, last_elem - i, 2, 1);
+    }
+}
+
+Eigen::Tensor3dXf demucsonnx::demucs_inference(
+    struct demucsonnx::demucs_model &model,
+    const Eigen::MatrixXf &audio,
+    demucsonnx::ProgressCallback cb)
+{
+    int length = audio.cols();
+    int max_shift = static_cast<int>(demucsonnx::MAX_SHIFT_SECS * demucsonnx::SUPPORTED_SAMPLE_RATE);
+
+    // Calculate the overall mean and standard deviation
+    Eigen::VectorXf ref_mean_0 = audio.colwise().mean();
+    float ref_mean = ref_mean_0.mean();
+    float ref_std = std::sqrt((ref_mean_0.array() - ref_mean).square().sum() /
+                              (ref_mean_0.size() - 1));
+
+    // Create padded_mix with symmetric zero padding
+    int padded_length = length + 2 * max_shift;
+    Eigen::MatrixXf padded_mix(2, padded_length);
+    padded_mix.setZero();
+
+    // Normalize and copy audio into padded_mix starting at column max_shift
+    padded_mix.block(0, max_shift, 2, length) = (audio.array() - ref_mean) / ref_std;
+
+    // Generate random shift offset for time invariance
+    std::uniform_int_distribution<> dist(0, max_shift - 1);
+    int shift_offset = dist(gen);
+    std::cout << "shift offset is: " << shift_offset << std::endl;
+
+    // Create a block view for shifted_audio to avoid extra allocation
+    int shifted_length = length + max_shift - shift_offset;
+    Eigen::Ref<const Eigen::MatrixXf> shifted_audio = padded_mix.block(0, shift_offset, 2, shifted_length);
+
+    // --- Begin merged split_inference and segment_inference logic ---
+
+    // Calculate segment size in samples
+    int segment_samples = static_cast<int>(demucsonnx::SEGMENT_LEN_SECS * demucsonnx::SUPPORTED_SAMPLE_RATE);
+    int nb_out_sources = model.nb_sources;
+
+    // Create reusable buffers with padded sizes
+    demucsonnx::demucs_segment_buffers buffers(2, segment_samples, nb_out_sources);
+
+    // Calculate stride for overlapping segments
+    int stride_samples = static_cast<int>((1.0f - demucsonnx::OVERLAP) * segment_samples);
+
+    // Create an output tensor initialized to zero
+    Eigen::Tensor3dXf out(nb_out_sources, 2, shifted_length);
+    out.setZero();
+
+    // Create weight vector for overlapping segments
+    Eigen::VectorXf weight(segment_samples);
+    int half_segment = segment_samples / 2;
+    weight.head(half_segment) = Eigen::VectorXf::LinSpaced(half_segment, 1, half_segment);
+    weight.tail(half_segment) = weight.head(half_segment).reverse();
+    weight /= weight.maxCoeff();
+    weight = weight.array().pow(demucsonnx::TRANSITION_POWER);
+
+    // Initialize sum_weight as Eigen::VectorXf
+    Eigen::VectorXf sum_weight(shifted_length);
+    sum_weight.setZero();
+
+    // Calculate total number of chunks
+    int total_chunks = static_cast<int>(std::ceil(static_cast<float>(shifted_length) / stride_samples));
+    float increment_per_chunk = 1.0f / static_cast<float>(total_chunks);
+    float inference_progress = 0.0f;
+
+    // Preallocate a tensor for chunk_out to avoid repeated allocations
+    Eigen::Tensor3dXf chunk_out(nb_out_sources, 2, segment_samples);
+    chunk_out.setZero();
+
+    // Process each segment
+    for (int segment_offset = 0; segment_offset < shifted_length; segment_offset += stride_samples)
+    {
+        int chunk_length = std::min(segment_samples, shifted_length - segment_offset);
+
+        // Create a block view for the current chunk
+        Eigen::Ref<const Eigen::MatrixXf> chunk = shifted_audio.block(0, segment_offset, 2, chunk_length);
+
+        // first, symmetric padding with zeros to fit the smaller chunk
+        // into the bigger segment_samples
+        int symmetric_padding = buffers.padded_segment_samples - chunk_length;
+
+        buffers.padded_mix.setZero();
+        // copy chunk into padded_mix at position buffers.pad + symmetric_padding/2
+        int symmetric_padding_start = symmetric_padding / 2;
+        buffers.padded_mix.block(0, buffers.pad + symmetric_padding_start, 2, chunk_length) = chunk;
+
+        // then, reflect padding on the left and right for
+        // the stft boundary effects
+        reflect_padding(buffers.padded_mix, buffers.pad, buffers.pad_end, segment_samples);
+
+        // Run model inference
+        demucsonnx::model_inference(model, buffers);
+
+        // Update progress
+        cb(inference_progress + increment_per_chunk, "Segment inference complete");
+
+        // Copy from buffers.targets_out into chunk_out with center trimming
+        // Preallocate chunk_out and set to zero
+        chunk_out.setZero();
+
+        for (int i = 0; i < nb_out_sources; ++i)
+        {
+            for (int j = 0; j < 2; ++j)
+            {
+                for (int k = 0; k < chunk_length; ++k)
+                {
+                    auto kidx = k + symmetric_padding_start;
+                    kidx = std::min(kidx, int(buffers.targets_out.dimension(2)-1));
+                    // Undoing center_trim by offsetting with left_padding
+                    chunk_out(i, j, k) = buffers.targets_out(i, j, kidx);
+                }
+            }
+        }
+
+        // Accumulate the weighted chunk output using Eigen::Map for vectorization
+        for (int i = 0; i < nb_out_sources; ++i)
+        {
+            for (int j = 0; j < 2; ++j)
+            {
+                // Map the (i, j, chunk_length) slice of 'out' to an Eigen::ArrayXf
+                Eigen::Map<Eigen::ArrayXf> out_slice(&out(i, j, segment_offset), chunk_length);
+                // Map the (i, j, chunk_length) slice of 'chunk_out' to an Eigen::ArrayXf
+                Eigen::Map<const Eigen::ArrayXf> chunk_out_slice(&chunk_out(i, j, 0), chunk_length);
+                // Accumulate the weighted chunk output
+                out_slice += weight.head(chunk_length).array() * chunk_out_slice;
+            }
+        }
+
+        // Accumulate the weights using Eigen's segment operation
+        sum_weight.segment(segment_offset, chunk_length) += weight.head(chunk_length);
+
+        inference_progress += increment_per_chunk;
+    }
+
+    // Normalize the output by sum_weight using vectorized operations
+    for (int i = 0; i < nb_out_sources; ++i)
+    {
+        for (int j = 0; j < 2; ++j)
+        {
+            // Map the (i, j, shifted_length) slice of 'out' to an Eigen::ArrayXf
+            Eigen::Map<Eigen::ArrayXf> out_slice(&out(i, j, 0), shifted_length);
+            // Perform element-wise division
+            out_slice /= sum_weight.array();
+        }
+    }
+
+    // Inverse the normalization directly on 'out' using vectorized operations
+    for (int i = 0; i < nb_out_sources; ++i)
+    {
+        for (int j = 0; j < 2; ++j)
+        {
+            // Map the (i, j, shifted_length) slice of 'out' to an Eigen::ArrayXf
+            Eigen::Map<Eigen::ArrayXf> out_slice(&out(i, j, 0), shifted_length);
+            // Perform inverse normalization
+            out_slice = out_slice * ref_std + ref_mean;
+        }
+    }
+
+    // Create a view (slice) of 'out' without allocating new memory
+    int trim_start = max_shift - shift_offset;
+    Eigen::array<Eigen::Index, 3> offset_array = {0, 0, trim_start};
+    Eigen::array<Eigen::Index, 3> extent_array = {nb_out_sources, 2, length};
+    auto result = out.slice(offset_array, extent_array);
+
+    // Materialize the slice into a new tensor to return
+    Eigen::Tensor3dXf trimmed_waveform_outputs = result.eval();
+
+    return trimmed_waveform_outputs;
+}
diff --git a/cppscripts/src/model_inference.cpp b/cppscripts/src/model_inference.cpp
new file mode 100644
index 00000000..819928c5
--- /dev/null
+++ b/cppscripts/src/model_inference.cpp
@@ -0,0 +1,135 @@
+#include "demucs.hpp"
+#include "dsp.hpp"
+#include <Eigen/Dense>
+#include <cmath>
+#include <cstdlib>
+#include <filesystem>
+#include <fstream>
+#include <iostream>
+#include <random>
+#include <sstream>
+#include <string>
+#include <tuple>
+#include <unsupported/Eigen/FFT>
+#include <unsupported/Eigen/MatrixFunctions>
+#include <vector>
+#include <Eigen/Dense>
+#include <onnxruntime/onnxruntime_cxx_api.h>
+#include <unsupported/Eigen/CXX11/Tensor>
+#include "demucs.hpp"
+
+namespace demucsonnx {
+    Ort::AllocatorWithDefaultOptions allocator;
+    Ort::RunOptions run_options;
+}
+
+// Core function to create and load model from in-memory data (byte array)
+bool demucsonnx::load_model(
+    const char* model_data,
+    int n_bytes,
+    struct demucsonnx::demucs_model &model,
+    Ort::SessionOptions &session_options)
+{
+    const uint8_t* final_data = reinterpret_cast<const uint8_t*>(model_data);
+    size_t final_size = n_bytes;
+
+    model.sess = std::make_unique<Ort::Session>(model.env, final_data, final_size, session_options);
+
+    std::vector<Ort::AllocatedStringPtr> input_name_allocs;
+    input_name_allocs.push_back(model.sess->GetInputNameAllocated(0, demucsonnx::allocator));
+
+    model.input_names.push_back(input_name_allocs[0].get());  // Store as std::string
+
+    std::vector<Ort::AllocatedStringPtr> output_name_allocs;
+    output_name_allocs.push_back(model.sess->GetOutputNameAllocated(0, demucsonnx::allocator));
+
+    model.output_names.push_back(output_name_allocs[0].get());
+
+    for (const auto& name : model.input_names) {
+        model.input_names_ptrs.push_back(name.c_str());
+    }
+    for (const auto& name : model.output_names) {
+        model.output_names_ptrs.push_back(name.c_str());
+    }
+
+    auto output0_shape = model.sess->GetOutputTypeInfo(0).GetTensorTypeAndShapeInfo().GetShape();
+
+    model.nb_sources = output0_shape[1];
+    return true;
+}
+
+// Overload for std::vector<char>
+bool demucsonnx::load_model(
+    const std::vector<char> &model_data,
+    struct demucsonnx::demucs_model &model,
+    Ort::SessionOptions &session_options)
+{
+    return load_model(
+        model_data.data(), model_data.size(), model, session_options);
+}
+
+void RunONNXInference(
+    struct demucsonnx::demucs_model &model,
+    struct demucsonnx::demucs_segment_buffers &buffers
+) {
+    // Run the model
+    model.sess->Run(
+        demucsonnx::run_options,
+        model.input_names_ptrs.data(),
+        buffers.input_tensors.data(),
+        buffers.input_tensors.size(),
+        model.output_names_ptrs.data(),
+        buffers.output_tensors.data(),
+        model.output_names_ptrs.size()
+    );
+}
+
+// run core demucs inference using onnx
+void demucsonnx::model_inference(
+    struct demucsonnx::demucs_model &model,
+    struct demucsonnx::demucs_segment_buffers &buffers)
+    // struct demucsonnx::stft_buffers &stft_buf)
+{
+
+    // prepare time branch input by copying buffers.mix into  input_tensors[0]
+    float *xt_onnx_data = buffers.input_tensors[0].GetTensorMutableData<float>();
+
+    for (int i = 0; i < buffers.padded_mix.rows(); ++i)
+    {
+        for (int j = 0; j < buffers.segment_samples; ++j)
+        {
+            // calculate destination index, simple row major calculation
+            // given the onnx shape of (1, 2, segment_samples)
+            int dest_index = i * buffers.segment_samples + j;
+            xt_onnx_data[dest_index] = buffers.padded_mix(i, j + buffers.pad);
+        }
+    }
+    
+    // now we apply the core demucs inference
+    RunONNXInference(model, buffers);
+
+    std::cout << "ONNX inference completed." << std::endl;
+
+    int nb_out_sources = model.nb_sources;
+
+    // nb_sources sources, 2 channels, N samples
+    std::vector<Eigen::MatrixXf> xt_3d(
+        nb_out_sources,
+        Eigen::MatrixXf(2, buffers.segment_samples)
+    );
+
+    // Map output onnx tensors
+    float* xt_out_data = buffers.output_tensors[0].GetTensorMutableData<float>();
+
+    for (int s = 0; s < nb_out_sources; ++s)
+    { // loop over 4 sources
+        for (int i = 0; i < 2; ++i)
+        {
+            for (int j = 0; j < buffers.segment_samples; ++j)
+            {
+                int index = s * 2 * buffers.segment_samples + i * buffers.segment_samples + j;
+                buffers.targets_out(s, i, j) = xt_out_data[index];
+            }
+        }
+    }
+}
diff --git a/cppscripts/src/tensor.hpp b/cppscripts/src/tensor.hpp
new file mode 100644
index 00000000..b3ec8bae
--- /dev/null
+++ b/cppscripts/src/tensor.hpp
@@ -0,0 +1,24 @@
+#ifndef TENSOR_HPP
+#define TENSOR_HPP
+
+#include <Eigen/Dense>
+#include <complex>
+#include <iostream>
+#include <string>
+#include <unsupported/Eigen/CXX11/Tensor>
+#include <vector>
+#include <limits>
+#include <onnxruntime/onnxruntime_cxx_api.h>
+
+namespace Eigen
+{
+// define Tensor3dXf, Tensor3dXcf for spectrograms etc.
+typedef Tensor<float, 5, RowMajor> Tensor5dXf;
+typedef Tensor<float, 4, RowMajor> Tensor4dXf;
+typedef Tensor<float, 3, RowMajor> Tensor3dXf;
+typedef Tensor<float, 2, RowMajor> Tensor2dXf;
+typedef Tensor<float, 1, RowMajor> Tensor1dXf;
+typedef Tensor<std::complex<float>, 3, RowMajor> Tensor3dXcf;
+} // namespace Eigen
+
+#endif // TENSOR_HPP
diff --git a/cppscripts/src_cli/CMakeLists.txt b/cppscripts/src_cli/CMakeLists.txt
new file mode 100644
index 00000000..759d0fcd
--- /dev/null
+++ b/cppscripts/src_cli/CMakeLists.txt
@@ -0,0 +1,28 @@
+cmake_minimum_required(VERSION 3.0)
+
+project(demucs.onnx)
+enable_testing()
+
+set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+set(CMAKE_CXX_FLAGS "-Wall -Wextra")
+set(CMAKE_CXX_FLAGS_DEBUG "-g -DEIGEN_INTERNAL_DEBUGGING")
+set(CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -ffast-math -flto -fno-signed-zeros -fassociative-math -freciprocal-math -fno-math-errno -fno-rounding-math -funsafe-math-optimizations -fno-trapping-math -fno-rtti -DNDEBUG")
+
+# Set your onnxruntime installation path here
+set(ONNXRUNTIME_HOME /opt/homebrew/Cellar/onnxruntime/1.21.0)
+include_directories(${ONNXRUNTIME_HOME}/include)
+link_directories(${ONNXRUNTIME_HOME}/lib)
+
+set(LIBNYQUIST_BUILD_EXAMPLE OFF CACHE BOOL "Disable libnyquist example")
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../dependencies/libnyquist ${CMAKE_CURRENT_BINARY_DIR}/libnyquist_build)
+
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../src)
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../dependencies/eigen)
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../dependencies/libnyquist/include)
+
+file(GLOB SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/../src/*.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../src_cli/*.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../onnx-models/model/*.ort.c")
+add_executable(demucs ${SOURCES})
+
+target_link_libraries(demucs onnxruntime libnyquist)
diff --git a/cppscripts/src_cli/demucs.cpp b/cppscripts/src_cli/demucs.cpp
new file mode 100644
index 00000000..51b5f500
--- /dev/null
+++ b/cppscripts/src_cli/demucs.cpp
@@ -0,0 +1,289 @@
+#include "demucs.hpp"
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include <filesystem>
+#include <fstream>
+#include <iostream>
+#include <libnyquist/Common.h>
+#include <libnyquist/Decoders.h>
+#include <libnyquist/Encoders.h>
+#include <map>
+#include <numeric>
+#include <ranges>
+#include <sstream>
+#include <stddef.h>
+#include <tuple>
+#include <vector>
+
+using namespace nqr;
+
+// Overload for file path, calling one of the other overloads as needed
+static demucsonnx::demucs_model load_model(
+    const std::string& htdemucs_model_path,
+    Ort::SessionOptions& session_options
+) {
+    struct demucsonnx::demucs_model model;
+
+    std::ifstream file(htdemucs_model_path, std::ios::binary | std::ios::ate);
+    if (!file) {
+        throw std::runtime_error("Failed to open model file: " + htdemucs_model_path);
+    }
+
+    std::streamsize size = file.tellg();
+    file.seekg(0, std::ios::beg);
+
+    std::vector<char> file_data(size);
+    if (!file.read(file_data.data(), size)) {
+        throw std::runtime_error("Failed to read model file.");
+    }
+
+    bool success = demucsonnx::load_model(file_data, model, session_options);
+    if (!success) {
+        throw std::runtime_error("Failed to load model.");
+    }
+
+    return model;
+}
+
+static Eigen::MatrixXf load_audio_file(std::string filename)
+{
+    // load a wav file with libnyquist
+    std::shared_ptr<AudioData> fileData = std::make_shared<AudioData>();
+
+    NyquistIO loader;
+
+    loader.Load(fileData.get(), filename);
+
+    if (fileData->sampleRate != demucsonnx::SUPPORTED_SAMPLE_RATE)
+    {
+        std::cerr << "[ERROR] demucs.cpp only supports the following sample "
+                     "rate (Hz): "
+                  << demucsonnx::SUPPORTED_SAMPLE_RATE << std::endl;
+        exit(1);
+    }
+
+    std::cout << "Input samples: "
+              << fileData->samples.size() / fileData->channelCount << std::endl;
+    std::cout << "Length in seconds: " << fileData->lengthSeconds << std::endl;
+    std::cout << "Number of channels: " << fileData->channelCount << std::endl;
+
+    if (fileData->channelCount != 2 && fileData->channelCount != 1)
+    {
+        std::cerr << "[ERROR] demucs.cpp only supports mono and stereo audio"
+                  << std::endl;
+        exit(1);
+    }
+
+    // number of samples per channel
+    std::size_t N = fileData->samples.size() / fileData->channelCount;
+
+    // create a struct to hold two float vectors for left and right channels
+    Eigen::MatrixXf ret(2, N);
+
+    if (fileData->channelCount == 1)
+    {
+        // Mono case
+        for (std::size_t i = 0; i < N; ++i)
+        {
+            ret(0, i) = fileData->samples[i]; // left channel
+            ret(1, i) = fileData->samples[i]; // right channel
+        }
+    }
+    else
+    {
+        // Stereo case
+        for (std::size_t i = 0; i < N; ++i)
+        {
+            ret(0, i) = fileData->samples[2 * i];     // left channel
+            ret(1, i) = fileData->samples[2 * i + 1]; // right channel
+        }
+    }
+
+    return ret;
+}
+
+// write a function to write a StereoWaveform to a wav file
+static void write_audio_file(const Eigen::MatrixXf &waveform,
+                             std::string filename)
+{
+    // create a struct to hold the audio data
+    std::shared_ptr<AudioData> fileData = std::make_shared<AudioData>();
+
+    // set the sample rate
+    fileData->sampleRate = demucsonnx::SUPPORTED_SAMPLE_RATE;
+
+    // set the number of channels
+    fileData->channelCount = 2;
+
+    // set the number of samples
+    fileData->samples.resize(waveform.cols() * 2);
+
+    // write the left channel
+    for (long int i = 0; i < waveform.cols(); ++i)
+    {
+        fileData->samples[2 * i] = waveform(0, i);
+        fileData->samples[2 * i + 1] = waveform(1, i);
+    }
+
+    int encoderStatus =
+        encode_wav_to_disk({fileData->channelCount, PCM_FLT, DITHER_TRIANGLE},
+                           fileData.get(), filename);
+    std::cout << "Encoder Status: " << encoderStatus << std::endl;
+}
+
+int main(int argc, const char **argv)
+{
+
+    try {
+        // your existing main logic
+
+        if (argc != 4)
+        {
+            std::cerr << "Usage: " << argv[0] << " <model file> <wav file> <out dir>"
+                      << std::endl;
+            exit(1);
+        }
+    
+        std::cout << "demucs.onnx Main driver program" << std::endl;
+        std::string model_file = argv[1];
+    
+        // load audio passed as argument
+        std::string wav_file = argv[2];
+    
+        // output dir passed as argument
+        std::string out_dir = argv[3];
+    
+        // Check if the output directory exists, and create it if not
+        std::filesystem::path output_dir_path(out_dir);
+        if (!std::filesystem::exists(output_dir_path))
+        {
+            std::cerr << "Directory does not exist: " << out_dir << ". Creating it."
+                      << std::endl;
+            if (!std::filesystem::create_directories(output_dir_path))
+            {
+                std::cerr << "Error: Unable to create directory: " << out_dir
+                          << std::endl;
+                return 1;
+            }
+        }
+        else if (!std::filesystem::is_directory(output_dir_path))
+        {
+            std::cerr << "Error: " << out_dir << " exists but is not a directory!"
+                      << std::endl;
+            return 1;
+        }
+    
+        Eigen::MatrixXf audio = load_audio_file(wav_file);
+        Eigen::Tensor3dXf out_targets;
+    
+        std::cout << "Running Demucs.onnx inference for: " << wav_file << std::endl;
+    
+            // set output precision to 3 decimal places
+        std::cout << std::fixed << std::setprecision(3);
+    
+        demucsonnx::ProgressCallback progressCallback =
+            [](float progress, const std::string &log_message)
+        {
+            std::cout << "(" << std::setw(3) << std::setfill(' ')
+                      << progress * 100.0f << "%) " << log_message << std::endl;
+        };
+    
+        // create Ort::SessionOptions
+        Ort::SessionOptions session_options;
+    
+        // max out threads and increase performance to the max on my beefy
+        // desktop CPU
+        session_options.SetExecutionMode(ExecutionMode::ORT_PARALLEL);
+        session_options.SetIntraOpNumThreads(16);
+        session_options.SetInterOpNumThreads(16);
+    
+        // General optimizations
+        session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
+    
+        struct demucsonnx::demucs_model model = load_model(
+            model_file,
+            session_options
+        );
+    
+        // create 4 audio matrix same size, to hold output
+        Eigen::Tensor3dXf audio_targets =
+            demucsonnx::demucs_inference(model, audio, progressCallback);
+    
+        out_targets = audio_targets;
+    
+        int nb_out_sources = model.nb_sources;
+    
+        for (int target = 0; target < nb_out_sources; ++target)
+        {
+            // now write the 4 audio waveforms to files in the output dir
+            // using libnyquist
+            // join out_dir with "/target_0.wav"
+            // using std::filesystem::path;
+    
+            std::filesystem::path p = out_dir;
+            // make sure the directory exists
+            std::filesystem::create_directories(p);
+    
+            auto p_target = p / "target_0.wav";
+    
+            // target 0,1,2,3 map to drums,bass,other,vocals
+    
+            std::string target_name;
+    
+            switch (target)
+            {
+            case 0:
+                target_name = "drums";
+                break;
+            case 1:
+                target_name = "bass";
+                break;
+            case 2:
+                target_name = "other";
+                break;
+            case 3:
+                target_name = "vocals";
+                break;
+            case 4:
+                target_name = "guitar";
+                break;
+            case 5:
+                target_name = "piano";
+                break;
+            default:
+                std::cerr << "Error: target " << target << " not supported"
+                          << std::endl;
+                exit(1);
+            }
+    
+            // insert target_name into the path after the digit
+            // e.g. target_name_0_drums.wav
+            p_target.replace_filename("target_" + std::to_string(target) + "_" +
+                                      target_name + ".wav");
+    
+            std::cout << "Writing wav file " << p_target << std::endl;
+    
+            Eigen::MatrixXf target_waveform(2, audio.cols());
+    
+            // copy the input stereo wav file into all 4 targets
+            for (int channel = 0; channel < 2; ++channel)
+            {
+                for (int sample = 0; sample < audio.cols(); ++sample)
+                {
+                    target_waveform(channel, sample) =
+                        out_targets(target, channel, sample);
+                }
+            }
+    
+            write_audio_file(target_waveform, p_target);
+        }
+    
+        return 0;
+    } catch (const std::exception& e) {
+        std::cerr << "Caught exception: " << e.what() << std::endl;
+        return 1;
+    }
+
+}
diff --git a/docs/onnx.md b/docs/onnx.md
index ea6c84c4..39285239 100644
--- a/docs/onnx.md
+++ b/docs/onnx.md
@@ -2,14 +2,39 @@
 
 ## Convert PyTorch model to ONNX and ORT
 
-Convert Demucs PyTorch model to ONNX:
+- Convert Demucs PyTorch model to ONNX:
 
 ```python
 python ./scripts/convert-pth-to-onnx.py ./onnx-models
 ```
 
-Optionally, convert ONNX model to ORT:
+- Optionally, convert ONNX model to ORT:
 
 ```python
 python -m onnxruntime.tools.convert_onnx_models_to_ort ./onnx-models --enable_type_reduction 
 ```
+
+## Using C++ scripts
+
+- Install dependencies
+
+```git
+git submodule update --init --recursive
+```
+
+- Set ONNXRuntime path in `./cppscripts/src_cli/CMakeLists.txt`
+
+- Compile the C++ code
+
+```bash
+cd cppscripts
+make cli
+cd ..
+```
+
+- Run inference using the following command
+
+```bash
+mkdir ./separated/htdemucs_cpp/
+./cppscripts/build/build-cli/demucs ./onnx-models/htdemucs.ort ./test.mp3 ./separated/htdemucs_cpp/
+```