diff --git a/CMakeLists.txt b/CMakeLists.txt index ddc6dc15a2..6fada209fe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -166,10 +166,6 @@ else() endif() add_subdirectory(src/libtorchaudio) -if (BUILD_SOX) - add_subdirectory(third_party/sox) - add_subdirectory(src/libtorchaudio/sox) -endif() if (USE_FFMPEG) if (DEFINED ENV{FFMPEG_ROOT}) add_subdirectory(third_party/ffmpeg/single) diff --git a/docs/source/functional.rst b/docs/source/functional.rst index f58a6730b8..158ae54869 100644 --- a/docs/source/functional.rst +++ b/docs/source/functional.rst @@ -23,7 +23,6 @@ Utility mask_along_axis_iid mu_law_encoding mu_law_decoding - apply_codec resample loudness convolve diff --git a/docs/source/sox_effects.rst b/docs/source/sox_effects.rst deleted file mode 100644 index a8ee260144..0000000000 --- a/docs/source/sox_effects.rst +++ /dev/null @@ -1,34 +0,0 @@ -.. py:module:: torchaudio.sox_effects - -torchaudio.sox_effects -====================== - -.. currentmodule:: torchaudio.sox_effects - -.. warning:: - Starting with version 2.8, we are refactoring TorchAudio to transition it - into a maintenance phase. As a result, the ``sox_effect`` module is - deprecated in 2.8 and will be removed in 2.9. - -Applying effects ----------------- - -Apply SoX effects chain on torch.Tensor or on file and load as torch.Tensor. - -.. autosummary:: - :toctree: generated - :nosignatures: - - apply_effects_tensor - apply_effects_file - -.. minigallery:: torchaudio.sox_effects.apply_effects_tensor - -Utilities ---------- - -.. autosummary:: - :toctree: generated - :nosignatures: - - effect_names diff --git a/docs/source/torchaudio.rst b/docs/source/torchaudio.rst index aa933e84ad..629ffd312a 100644 --- a/docs/source/torchaudio.rst +++ b/docs/source/torchaudio.rst @@ -78,14 +78,6 @@ The following table summarizes the backends. to retrieve the supported codecs. This backend Supports various protocols, such as HTTPS and MP4, and file-like objects. - * - 2 - - SoX - - Linux, macOS - - Use :py:func:`~torchaudio.utils.sox_utils.list_read_formats` and - :py:func:`~torchaudio.utils.sox_utils.list_write_formats` - to retrieve the supported codecs. - - This backend does *not* support file-like objects. * - 3 - SoundFile - Linux, macOS, Windows diff --git a/docs/source/utils.rst b/docs/source/utils.rst index af42445765..70d29f3093 100644 --- a/docs/source/utils.rst +++ b/docs/source/utils.rst @@ -8,7 +8,7 @@ torchaudio.utils .. warning:: Starting with version 2.8, we are refactoring TorchAudio to transition it into a maintenance phase. As a result: - - ``sox_utils`` and `ffmpeg_utils`` are deprecated in 2.8 and will be removed in 2.9. + - ``ffmpeg_utils`` are deprecated in 2.8 and will be removed in 2.9. - The decoding and encoding capabilities of PyTorch for both audio and video are being consolidated into TorchCodec. Please see https://github.com/pytorch/audio/issues/3902 for more information. diff --git a/examples/libtorchaudio/CMakeLists.txt b/examples/libtorchaudio/CMakeLists.txt index b4cf58b375..e540f88044 100644 --- a/examples/libtorchaudio/CMakeLists.txt +++ b/examples/libtorchaudio/CMakeLists.txt @@ -2,8 +2,6 @@ cmake_minimum_required(VERSION 3.5) project(libtorchaudio-cpp-example) -SET(BUILD_SOX ON CACHE BOOL "Build libsox into libtorchaudio") - SET(BUILD_KALDI OFF CACHE BOOL "Build Kaldi into libtorchaudio") SET(BUILD_RNNT ON CACHE BOOL "Build RNN transducer into libtorchaudio") SET(BUILD_TORCHAUDIO_PYTHON_EXTENSION OFF CACHE BOOL "Build Python binding") diff --git a/src/libtorchaudio/sox/CMakeLists.txt b/src/libtorchaudio/sox/CMakeLists.txt deleted file mode 100644 index 5ffe782c82..0000000000 --- a/src/libtorchaudio/sox/CMakeLists.txt +++ /dev/null @@ -1,25 +0,0 @@ -set( - sources - io.cpp - utils.cpp - effects.cpp - effects_chain.cpp - types.cpp - ) -torchaudio_library( - libtorchaudio_sox - "${sources}" - "" - "torch;sox" - "" - ) - -if (BUILD_TORCHAUDIO_PYTHON_EXTENSION) - torchaudio_extension( - _torchaudio_sox - "pybind/pybind.cpp;" - "" - "libtorchaudio_sox" - "" - ) -endif() diff --git a/src/libtorchaudio/sox/effects.cpp b/src/libtorchaudio/sox/effects.cpp deleted file mode 100644 index 947c04e3fc..0000000000 --- a/src/libtorchaudio/sox/effects.cpp +++ /dev/null @@ -1,133 +0,0 @@ -#include -#include -#include -#include - -namespace torchaudio::sox { -namespace { - -enum SoxEffectsResourceState { NotInitialized, Initialized, ShutDown }; -SoxEffectsResourceState SOX_RESOURCE_STATE = NotInitialized; -std::mutex SOX_RESOUCE_STATE_MUTEX; - -} // namespace - -void initialize_sox_effects() { - const std::lock_guard lock(SOX_RESOUCE_STATE_MUTEX); - - switch (SOX_RESOURCE_STATE) { - case NotInitialized: - TORCH_CHECK( - sox_init() == SOX_SUCCESS, "Failed to initialize sox effects."); - SOX_RESOURCE_STATE = Initialized; - break; - case Initialized: - break; - case ShutDown: - TORCH_CHECK( - false, "SoX Effects has been shut down. Cannot initialize again."); - } -}; - -void shutdown_sox_effects() { - const std::lock_guard lock(SOX_RESOUCE_STATE_MUTEX); - - switch (SOX_RESOURCE_STATE) { - case NotInitialized: - TORCH_CHECK(false, "SoX Effects is not initialized. Cannot shutdown."); - case Initialized: - TORCH_CHECK( - sox_quit() == SOX_SUCCESS, "Failed to initialize sox effects."); - SOX_RESOURCE_STATE = ShutDown; - break; - case ShutDown: - break; - } -} - -auto apply_effects_tensor( - torch::Tensor waveform, - int64_t sample_rate, - const std::vector>& effects, - bool channels_first) -> std::tuple { - validate_input_tensor(waveform); - - // Create SoxEffectsChain - const auto dtype = waveform.dtype(); - SoxEffectsChain chain( - /*input_encoding=*/get_tensor_encodinginfo(dtype), - /*output_encoding=*/get_tensor_encodinginfo(dtype)); - - // Prepare output buffer - std::vector out_buffer; - out_buffer.reserve(waveform.numel()); - - // Build and run effects chain - chain.addInputTensor(&waveform, sample_rate, channels_first); - for (const auto& effect : effects) { - chain.addEffect(effect); - } - chain.addOutputBuffer(&out_buffer); - chain.run(); - - // Create tensor from buffer - auto out_tensor = convert_to_tensor( - /*buffer=*/out_buffer.data(), - /*num_samples=*/out_buffer.size(), - /*num_channels=*/chain.getOutputNumChannels(), - dtype, - /*normalize=*/false, - channels_first); - - return std::tuple( - out_tensor, chain.getOutputSampleRate()); -} - -auto apply_effects_file( - const std::string& path, - const std::vector>& effects, - std::optional normalize, - std::optional channels_first, - const std::optional& format) - -> std::tuple { - // Open input file - SoxFormat sf(sox_open_read( - path.c_str(), - /*signal=*/nullptr, - /*encoding=*/nullptr, - /*filetype=*/format.has_value() ? format.value().c_str() : nullptr)); - - validate_input_file(sf, path); - - const auto dtype = get_dtype(sf->encoding.encoding, sf->signal.precision); - - // Prepare output - std::vector out_buffer; - out_buffer.reserve(sf->signal.length); - - // Create and run SoxEffectsChain - SoxEffectsChain chain( - /*input_encoding=*/sf->encoding, - /*output_encoding=*/get_tensor_encodinginfo(dtype)); - - chain.addInputFile(sf); - for (const auto& effect : effects) { - chain.addEffect(effect); - } - chain.addOutputBuffer(&out_buffer); - chain.run(); - - // Create tensor from buffer - bool channels_first_ = channels_first.value_or(true); - auto tensor = convert_to_tensor( - /*buffer=*/out_buffer.data(), - /*num_samples=*/out_buffer.size(), - /*num_channels=*/chain.getOutputNumChannels(), - dtype, - normalize.value_or(true), - channels_first_); - - return std::tuple( - tensor, chain.getOutputSampleRate()); -} -} // namespace torchaudio::sox diff --git a/src/libtorchaudio/sox/effects.h b/src/libtorchaudio/sox/effects.h deleted file mode 100644 index 8b56427c1e..0000000000 --- a/src/libtorchaudio/sox/effects.h +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef TORCHAUDIO_SOX_EFFECTS_H -#define TORCHAUDIO_SOX_EFFECTS_H - -#include -#include - -namespace torchaudio::sox { - -void initialize_sox_effects(); - -void shutdown_sox_effects(); - -auto apply_effects_tensor( - torch::Tensor waveform, - int64_t sample_rate, - const std::vector>& effects, - bool channels_first) -> std::tuple; - -auto apply_effects_file( - const std::string& path, - const std::vector>& effects, - std::optional normalize, - std::optional channels_first, - const std::optional& format) - -> std::tuple; - -} // namespace torchaudio::sox - -#endif diff --git a/src/libtorchaudio/sox/effects_chain.cpp b/src/libtorchaudio/sox/effects_chain.cpp deleted file mode 100644 index 7f6109a343..0000000000 --- a/src/libtorchaudio/sox/effects_chain.cpp +++ /dev/null @@ -1,301 +0,0 @@ -#include -#include -#include "c10/util/Exception.h" - -using namespace torch::indexing; - -namespace torchaudio::sox { - -namespace { - -/// helper classes for passing the location of input tensor and output buffer -/// -/// drain/flow callback functions require plaing C style function signature and -/// the way to pass extra data is to attach data to sox_effect_t::priv pointer. -/// The following structs will be assigned to sox_effect_t::priv pointer which -/// gives sox_effect_t an access to input Tensor and output buffer object. -struct TensorInputPriv { - size_t index; - torch::Tensor* waveform; - int64_t sample_rate; - bool channels_first; -}; -struct TensorOutputPriv { - std::vector* buffer; -}; -struct FileOutputPriv { - sox_format_t* sf; -}; - -/// Callback function to feed Tensor data to SoxEffectChain. -int tensor_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp) { - // Retrieve the input Tensor and current index - auto priv = static_cast(effp->priv); - auto index = priv->index; - auto tensor = *(priv->waveform); - auto num_channels = effp->out_signal.channels; - - // Adjust the number of samples to read - const size_t num_samples = tensor.numel(); - if (index + *osamp > num_samples) { - *osamp = num_samples - index; - } - // Ensure that it's a multiple of the number of channels - *osamp -= *osamp % num_channels; - - // Slice the input Tensor - auto chunk = [&]() { - auto i_frame = index / num_channels; - auto num_frames = *osamp / num_channels; - auto t = (priv->channels_first) - ? tensor.index({Slice(), Slice(i_frame, i_frame + num_frames)}).t() - : tensor.index({Slice(i_frame, i_frame + num_frames), Slice()}); - return t.reshape({-1}); - }(); - - // Convert to sox_sample_t (int32_t) - switch (chunk.dtype().toScalarType()) { - case c10::ScalarType::Float: { - // Need to convert to 64-bit precision so that - // values around INT32_MIN/MAX are handled correctly. - chunk = chunk.to(c10::ScalarType::Double); - chunk *= 2147483648.; - chunk.clamp_(INT32_MIN, INT32_MAX); - chunk = chunk.to(c10::ScalarType::Int); - break; - } - case c10::ScalarType::Int: { - break; - } - case c10::ScalarType::Short: { - chunk = chunk.to(c10::ScalarType::Int); - chunk *= 65536; - break; - } - case c10::ScalarType::Byte: { - chunk = chunk.to(c10::ScalarType::Int); - chunk -= 128; - chunk *= 16777216; - break; - } - default: - TORCH_CHECK(false, "Unexpected dtype: ", chunk.dtype()); - } - // Write to buffer - chunk = chunk.contiguous(); - memcpy(obuf, chunk.data_ptr(), *osamp * 4); - priv->index += *osamp; - return (priv->index == num_samples) ? SOX_EOF : SOX_SUCCESS; -} - -/// Callback function to fetch data from SoxEffectChain. -int tensor_output_flow( - sox_effect_t* effp, - sox_sample_t const* ibuf, - sox_sample_t* obuf LSX_UNUSED, - size_t* isamp, - size_t* osamp) { - *osamp = 0; - // Get output buffer - auto out_buffer = static_cast(effp->priv)->buffer; - // Append at the end - out_buffer->insert(out_buffer->end(), ibuf, ibuf + *isamp); - return SOX_SUCCESS; -} - -int file_output_flow( - sox_effect_t* effp, - sox_sample_t const* ibuf, - sox_sample_t* obuf LSX_UNUSED, - size_t* isamp, - size_t* osamp) { - *osamp = 0; - if (*isamp) { - auto sf = static_cast(effp->priv)->sf; - if (sox_write(sf, ibuf, *isamp) != *isamp) { - TORCH_CHECK( - !sf->sox_errno, - sf->sox_errstr, - " ", - sox_strerror(sf->sox_errno), - " ", - sf->filename); - return SOX_EOF; - } - } - return SOX_SUCCESS; -} - -sox_effect_handler_t* get_tensor_input_handler() { - static sox_effect_handler_t handler{ - /*name=*/"input_tensor", - /*usage=*/nullptr, - /*flags=*/SOX_EFF_MCHAN, - /*getopts=*/nullptr, - /*start=*/nullptr, - /*flow=*/nullptr, - /*drain=*/tensor_input_drain, - /*stop=*/nullptr, - /*kill=*/nullptr, - /*priv_size=*/sizeof(TensorInputPriv)}; - return &handler; -} - -sox_effect_handler_t* get_tensor_output_handler() { - static sox_effect_handler_t handler{ - /*name=*/"output_tensor", - /*usage=*/nullptr, - /*flags=*/SOX_EFF_MCHAN, - /*getopts=*/nullptr, - /*start=*/nullptr, - /*flow=*/tensor_output_flow, - /*drain=*/nullptr, - /*stop=*/nullptr, - /*kill=*/nullptr, - /*priv_size=*/sizeof(TensorOutputPriv)}; - return &handler; -} - -sox_effect_handler_t* get_file_output_handler() { - static sox_effect_handler_t handler{ - /*name=*/"output_file", - /*usage=*/nullptr, - /*flags=*/SOX_EFF_MCHAN, - /*getopts=*/nullptr, - /*start=*/nullptr, - /*flow=*/file_output_flow, - /*drain=*/nullptr, - /*stop=*/nullptr, - /*kill=*/nullptr, - /*priv_size=*/sizeof(FileOutputPriv)}; - return &handler; -} - -} // namespace - -SoxEffect::SoxEffect(sox_effect_t* se) noexcept : se_(se) {} - -SoxEffect::~SoxEffect() { - if (se_ != nullptr) { - free(se_); - } -} - -SoxEffect::operator sox_effect_t*() const { - return se_; -} - -auto SoxEffect::operator->() noexcept -> sox_effect_t* { - return se_; -} - -SoxEffectsChain::SoxEffectsChain( - sox_encodinginfo_t input_encoding, - sox_encodinginfo_t output_encoding) - : in_enc_(input_encoding), - out_enc_(output_encoding), - in_sig_(), - interm_sig_(), - out_sig_(), - sec_(sox_create_effects_chain(&in_enc_, &out_enc_)) { - TORCH_CHECK(sec_, "Failed to create effect chain."); -} - -SoxEffectsChain::~SoxEffectsChain() { - if (sec_ != nullptr) { - sox_delete_effects_chain(sec_); - } -} - -void SoxEffectsChain::run() { - sox_flow_effects(sec_, nullptr, nullptr); -} - -void SoxEffectsChain::addInputTensor( - torch::Tensor* waveform, - int64_t sample_rate, - bool channels_first) { - in_sig_ = get_signalinfo(waveform, sample_rate, "wav", channels_first); - interm_sig_ = in_sig_; - SoxEffect e(sox_create_effect(get_tensor_input_handler())); - auto priv = static_cast(e->priv); - priv->index = 0; - priv->waveform = waveform; - priv->sample_rate = sample_rate; - priv->channels_first = channels_first; - TORCH_CHECK( - sox_add_effect(sec_, e, &interm_sig_, &in_sig_) == SOX_SUCCESS, - "Internal Error: Failed to add effect: input_tensor"); -} - -void SoxEffectsChain::addOutputBuffer( - std::vector* output_buffer) { - SoxEffect e(sox_create_effect(get_tensor_output_handler())); - static_cast(e->priv)->buffer = output_buffer; - TORCH_CHECK( - sox_add_effect(sec_, e, &interm_sig_, &in_sig_) == SOX_SUCCESS, - "Internal Error: Failed to add effect: output_tensor"); -} - -void SoxEffectsChain::addInputFile(sox_format_t* sf) { - in_sig_ = sf->signal; - interm_sig_ = in_sig_; - SoxEffect e(sox_create_effect(sox_find_effect("input"))); - char* opts[] = {(char*)sf}; - sox_effect_options(e, 1, opts); - TORCH_CHECK( - sox_add_effect(sec_, e, &interm_sig_, &in_sig_) == SOX_SUCCESS, - "Internal Error: Failed to add effect: input ", - sf->filename); -} - -void SoxEffectsChain::addOutputFile(sox_format_t* sf) { - out_sig_ = sf->signal; - SoxEffect e(sox_create_effect(get_file_output_handler())); - static_cast(e->priv)->sf = sf; - TORCH_CHECK( - sox_add_effect(sec_, e, &interm_sig_, &out_sig_) == SOX_SUCCESS, - "Internal Error: Failed to add effect: output ", - sf->filename); -} - -void SoxEffectsChain::addEffect(const std::vector& effect) { - const auto num_args = effect.size(); - TORCH_CHECK(num_args != 0, "Invalid argument: empty effect."); - const auto name = effect[0]; - TORCH_CHECK( - UNSUPPORTED_EFFECTS.find(name) == UNSUPPORTED_EFFECTS.end(), - "Unsupported effect: ", - name) - - auto returned_effect = sox_find_effect(name.c_str()); - TORCH_CHECK(returned_effect, "Unsupported effect: ", name) - - SoxEffect e(sox_create_effect(returned_effect)); - const auto num_options = num_args - 1; - - std::vector opts; - for (size_t i = 1; i < num_args; ++i) { - opts.push_back((char*)effect[i].c_str()); - } - TORCH_CHECK( - sox_effect_options(e, num_options, num_options ? opts.data() : nullptr) == - SOX_SUCCESS, - "Invalid effect option: ", - c10::Join(" ", effect)) - TORCH_CHECK( - sox_add_effect(sec_, e, &interm_sig_, &in_sig_) == SOX_SUCCESS, - "Internal Error: Failed to add effect: \"", - c10::Join(" ", effect), - "\""); -} - -int64_t SoxEffectsChain::getOutputNumChannels() { - return interm_sig_.channels; -} - -int64_t SoxEffectsChain::getOutputSampleRate() { - return interm_sig_.rate; -} - -} // namespace torchaudio::sox diff --git a/src/libtorchaudio/sox/effects_chain.h b/src/libtorchaudio/sox/effects_chain.h deleted file mode 100644 index e6a892b5e8..0000000000 --- a/src/libtorchaudio/sox/effects_chain.h +++ /dev/null @@ -1,61 +0,0 @@ -#ifndef TORCHAUDIO_SOX_EFFECTS_CHAIN_H -#define TORCHAUDIO_SOX_EFFECTS_CHAIN_H - -#include -#include - -namespace torchaudio::sox { - -// Helper struct to safely close sox_effect_t* pointer returned by -// sox_create_effect - -struct SoxEffect { - explicit SoxEffect(sox_effect_t* se) noexcept; - SoxEffect(const SoxEffect& other) = delete; - SoxEffect(SoxEffect&& other) = delete; - auto operator=(const SoxEffect& other) -> SoxEffect& = delete; - auto operator=(SoxEffect&& other) -> SoxEffect& = delete; - ~SoxEffect(); - operator sox_effect_t*() const; - auto operator->() noexcept -> sox_effect_t*; - - private: - sox_effect_t* se_; -}; - -// Helper struct to safely close sox_effects_chain_t with handy methods -class SoxEffectsChain { - const sox_encodinginfo_t in_enc_; - const sox_encodinginfo_t out_enc_; - - protected: - sox_signalinfo_t in_sig_; - sox_signalinfo_t interm_sig_; - sox_signalinfo_t out_sig_; - sox_effects_chain_t* sec_; - - public: - explicit SoxEffectsChain( - sox_encodinginfo_t input_encoding, - sox_encodinginfo_t output_encoding); - SoxEffectsChain(const SoxEffectsChain& other) = delete; - SoxEffectsChain(SoxEffectsChain&& other) = delete; - SoxEffectsChain& operator=(const SoxEffectsChain& other) = delete; - SoxEffectsChain& operator=(SoxEffectsChain&& other) = delete; - ~SoxEffectsChain(); - void run(); - void addInputTensor( - torch::Tensor* waveform, - int64_t sample_rate, - bool channels_first); - void addInputFile(sox_format_t* sf); - void addOutputBuffer(std::vector* output_buffer); - void addOutputFile(sox_format_t* sf); - void addEffect(const std::vector& effect); - int64_t getOutputNumChannels(); - int64_t getOutputSampleRate(); -}; - -} // namespace torchaudio::sox - -#endif diff --git a/src/libtorchaudio/sox/io.cpp b/src/libtorchaudio/sox/io.cpp deleted file mode 100644 index 474726ad1c..0000000000 --- a/src/libtorchaudio/sox/io.cpp +++ /dev/null @@ -1,128 +0,0 @@ -#include -#include -#include -#include -#include - -using namespace torch::indexing; - -namespace torchaudio::sox { - -std::tuple get_info_file( - const std::string& path, - const std::optional& format) { - SoxFormat sf(sox_open_read( - path.c_str(), - /*signal=*/nullptr, - /*encoding=*/nullptr, - /*filetype=*/format.has_value() ? format.value().c_str() : nullptr)); - - validate_input_file(sf, path); - - return std::make_tuple( - static_cast(sf->signal.rate), - static_cast(sf->signal.length / sf->signal.channels), - static_cast(sf->signal.channels), - static_cast(sf->encoding.bits_per_sample), - get_encoding(sf->encoding.encoding)); -} - -std::vector> get_effects( - const std::optional& frame_offset, - const std::optional& num_frames) { - const auto offset = frame_offset.value_or(0); - TORCH_CHECK( - offset >= 0, - "Invalid argument: frame_offset must be non-negative. Found: ", - offset); - const auto frames = num_frames.value_or(-1); - TORCH_CHECK( - frames > 0 || frames == -1, - "Invalid argument: num_frames must be -1 or greater than 0."); - - std::vector> effects; - if (frames != -1) { - std::ostringstream os_offset, os_frames; - os_offset << offset << "s"; - os_frames << "+" << frames << "s"; - effects.emplace_back( - std::vector{"trim", os_offset.str(), os_frames.str()}); - } else if (offset != 0) { - std::ostringstream os_offset; - os_offset << offset << "s"; - effects.emplace_back(std::vector{"trim", os_offset.str()}); - } - return effects; -} - -std::tuple load_audio_file( - const std::string& path, - const std::optional& frame_offset, - const std::optional& num_frames, - std::optional normalize, - std::optional channels_first, - const std::optional& format) { - auto effects = get_effects(frame_offset, num_frames); - return apply_effects_file(path, effects, normalize, channels_first, format); -} - -void save_audio_file( - const std::string& path, - torch::Tensor tensor, - int64_t sample_rate, - bool channels_first, - std::optional compression, - std::optional format, - std::optional encoding, - std::optional bits_per_sample) { - validate_input_tensor(tensor); - - const auto filetype = [&]() { - if (format.has_value()) { - return format.value(); - } - return get_filetype(path); - }(); - - if (filetype == "amr-nb") { - const auto num_channels = tensor.size(channels_first ? 0 : 1); - TORCH_CHECK( - num_channels == 1, "amr-nb format only supports single channel audio."); - } else if (filetype == "htk") { - const auto num_channels = tensor.size(channels_first ? 0 : 1); - TORCH_CHECK( - num_channels == 1, "htk format only supports single channel audio."); - } else if (filetype == "gsm") { - const auto num_channels = tensor.size(channels_first ? 0 : 1); - TORCH_CHECK( - num_channels == 1, "gsm format only supports single channel audio."); - TORCH_CHECK( - sample_rate == 8000, - "gsm format only supports a sampling rate of 8kHz."); - } - const auto signal_info = - get_signalinfo(&tensor, sample_rate, filetype, channels_first); - const auto encoding_info = get_encodinginfo_for_save( - filetype, tensor.dtype(), compression, encoding, bits_per_sample); - - SoxFormat sf(sox_open_write( - path.c_str(), - &signal_info, - &encoding_info, - /*filetype=*/filetype.c_str(), - /*oob=*/nullptr, - /*overwrite_permitted=*/nullptr)); - - TORCH_CHECK( - static_cast(sf) != nullptr, - "Error saving audio file: failed to open file ", - path); - - SoxEffectsChain chain( - /*input_encoding=*/get_tensor_encodinginfo(tensor.dtype()), - /*output_encoding=*/sf->encoding); - chain.addInputTensor(&tensor, sample_rate, channels_first); - chain.addOutputFile(sf); - chain.run(); -} -} // namespace torchaudio::sox diff --git a/src/libtorchaudio/sox/io.h b/src/libtorchaudio/sox/io.h deleted file mode 100644 index b011ef59be..0000000000 --- a/src/libtorchaudio/sox/io.h +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef TORCHAUDIO_SOX_IO_H -#define TORCHAUDIO_SOX_IO_H - -#include -#include - -namespace torchaudio::sox { - -auto get_effects( - const std::optional& frame_offset, - const std::optional& num_frames) - -> std::vector>; - -std::tuple get_info_file( - const std::string& path, - const std::optional& format); - -std::tuple load_audio_file( - const std::string& path, - const std::optional& frame_offset, - const std::optional& num_frames, - std::optional normalize, - std::optional channels_first, - const std::optional& format); - -void save_audio_file( - const std::string& path, - torch::Tensor tensor, - int64_t sample_rate, - bool channels_first, - std::optional compression, - std::optional format, - std::optional encoding, - std::optional bits_per_sample); - -} // namespace torchaudio::sox - -#endif diff --git a/src/libtorchaudio/sox/pybind/pybind.cpp b/src/libtorchaudio/sox/pybind/pybind.cpp deleted file mode 100644 index bd9c82c349..0000000000 --- a/src/libtorchaudio/sox/pybind/pybind.cpp +++ /dev/null @@ -1,39 +0,0 @@ -#include -#include -#include -#include - -namespace torchaudio { -namespace sox { -namespace { - -TORCH_LIBRARY(torchaudio_sox, m) { - m.def("torchaudio_sox::get_info", &get_info_file); - m.def("torchaudio_sox::load_audio_file", &load_audio_file); - m.def("torchaudio_sox::save_audio_file", &save_audio_file); - m.def("torchaudio_sox::initialize_sox_effects", &initialize_sox_effects); - m.def("torchaudio_sox::shutdown_sox_effects", &shutdown_sox_effects); - m.def("torchaudio_sox::apply_effects_tensor", &apply_effects_tensor); - m.def("torchaudio_sox::apply_effects_file", &apply_effects_file); -} - -PYBIND11_MODULE(_torchaudio_sox, m) { - m.def("set_seed", &set_seed, "Set random seed."); - m.def("set_verbosity", &set_verbosity, "Set verbosity."); - m.def("set_use_threads", &set_use_threads, "Set threading."); - m.def("set_buffer_size", &set_buffer_size, "Set buffer size."); - m.def("get_buffer_size", &get_buffer_size, "Get buffer size."); - m.def("list_effects", &list_effects, "List available effects."); - m.def( - "list_read_formats", - &list_read_formats, - "List supported formats for decoding."); - m.def( - "list_write_formats", - &list_write_formats, - "List supported formats for encoding."); -} - -} // namespace -} // namespace sox -} // namespace torchaudio diff --git a/src/libtorchaudio/sox/types.cpp b/src/libtorchaudio/sox/types.cpp deleted file mode 100644 index 12bd070105..0000000000 --- a/src/libtorchaudio/sox/types.cpp +++ /dev/null @@ -1,148 +0,0 @@ -#include - -namespace torchaudio::sox { - -Format get_format_from_string(const std::string& format) { - if (format == "wav") { - return Format::WAV; - } - if (format == "mp3") { - return Format::MP3; - } - if (format == "flac") { - return Format::FLAC; - } - if (format == "ogg" || format == "vorbis") { - return Format::VORBIS; - } - if (format == "amr-nb") { - return Format::AMR_NB; - } - if (format == "amr-wb") { - return Format::AMR_WB; - } - if (format == "amb") { - return Format::AMB; - } - if (format == "sph") { - return Format::SPHERE; - } - if (format == "htk") { - return Format::HTK; - } - if (format == "gsm") { - return Format::GSM; - } - TORCH_CHECK(false, "Internal Error: unexpected format value: ", format); -} - -std::string to_string(Encoding v) { - switch (v) { - case Encoding::UNKNOWN: - return "UNKNOWN"; - case Encoding::PCM_SIGNED: - return "PCM_S"; - case Encoding::PCM_UNSIGNED: - return "PCM_U"; - case Encoding::PCM_FLOAT: - return "PCM_F"; - case Encoding::FLAC: - return "FLAC"; - case Encoding::ULAW: - return "ULAW"; - case Encoding::ALAW: - return "ALAW"; - case Encoding::MP3: - return "MP3"; - case Encoding::VORBIS: - return "VORBIS"; - case Encoding::AMR_WB: - return "AMR_WB"; - case Encoding::AMR_NB: - return "AMR_NB"; - case Encoding::OPUS: - return "OPUS"; - default: - TORCH_CHECK(false, "Internal Error: unexpected encoding."); - } -} - -Encoding get_encoding_from_option(const std::optional& encoding) { - if (!encoding.has_value()) { - return Encoding::NOT_PROVIDED; - } - std::string v = encoding.value(); - if (v == "PCM_S") { - return Encoding::PCM_SIGNED; - } - if (v == "PCM_U") { - return Encoding::PCM_UNSIGNED; - } - if (v == "PCM_F") { - return Encoding::PCM_FLOAT; - } - if (v == "ULAW") { - return Encoding::ULAW; - } - if (v == "ALAW") { - return Encoding::ALAW; - } - TORCH_CHECK(false, "Internal Error: unexpected encoding value: ", v); -} - -BitDepth get_bit_depth_from_option(const std::optional& bit_depth) { - if (!bit_depth.has_value()) { - return BitDepth::NOT_PROVIDED; - } - int64_t v = bit_depth.value(); - switch (v) { - case 8: - return BitDepth::B8; - case 16: - return BitDepth::B16; - case 24: - return BitDepth::B24; - case 32: - return BitDepth::B32; - case 64: - return BitDepth::B64; - default: { - TORCH_CHECK(false, "Internal Error: unexpected bit depth value: ", v); - } - } -} - -std::string get_encoding(sox_encoding_t encoding) { - switch (encoding) { - case SOX_ENCODING_UNKNOWN: - return "UNKNOWN"; - case SOX_ENCODING_SIGN2: - return "PCM_S"; - case SOX_ENCODING_UNSIGNED: - return "PCM_U"; - case SOX_ENCODING_FLOAT: - return "PCM_F"; - case SOX_ENCODING_FLAC: - return "FLAC"; - case SOX_ENCODING_ULAW: - return "ULAW"; - case SOX_ENCODING_ALAW: - return "ALAW"; - case SOX_ENCODING_MP3: - return "MP3"; - case SOX_ENCODING_VORBIS: - return "VORBIS"; - case SOX_ENCODING_AMR_WB: - return "AMR_WB"; - case SOX_ENCODING_AMR_NB: - return "AMR_NB"; - case SOX_ENCODING_OPUS: - return "OPUS"; - case SOX_ENCODING_GSM: - return "GSM"; - default: - return "UNKNOWN"; - } -} - -} // namespace torchaudio::sox diff --git a/src/libtorchaudio/sox/types.h b/src/libtorchaudio/sox/types.h deleted file mode 100644 index 714d303313..0000000000 --- a/src/libtorchaudio/sox/types.h +++ /dev/null @@ -1,58 +0,0 @@ -#ifndef TORCHAUDIO_SOX_TYPES_H -#define TORCHAUDIO_SOX_TYPES_H - -#include -#include - -namespace torchaudio::sox { - -enum class Format { - WAV, - MP3, - FLAC, - VORBIS, - AMR_NB, - AMR_WB, - AMB, - SPHERE, - GSM, - HTK, -}; - -Format get_format_from_string(const std::string& format); - -enum class Encoding { - NOT_PROVIDED, - UNKNOWN, - PCM_SIGNED, - PCM_UNSIGNED, - PCM_FLOAT, - FLAC, - ULAW, - ALAW, - MP3, - VORBIS, - AMR_WB, - AMR_NB, - OPUS, -}; - -std::string to_string(Encoding v); -Encoding get_encoding_from_option(const std::optional& encoding); - -enum class BitDepth : unsigned { - NOT_PROVIDED = 0, - B8 = 8, - B16 = 16, - B24 = 24, - B32 = 32, - B64 = 64, -}; - -BitDepth get_bit_depth_from_option(const std::optional& bit_depth); - -std::string get_encoding(sox_encoding_t encoding); - -} // namespace torchaudio::sox - -#endif diff --git a/src/libtorchaudio/sox/utils.cpp b/src/libtorchaudio/sox/utils.cpp deleted file mode 100644 index 94748c5209..0000000000 --- a/src/libtorchaudio/sox/utils.cpp +++ /dev/null @@ -1,509 +0,0 @@ -#include -#include -#include -#include - -namespace torchaudio::sox { - -const std::unordered_set UNSUPPORTED_EFFECTS{ - "input", - "output", - "spectrogram", - "noiseprof", - "noisered", - "splice"}; - -void set_seed(const int64_t seed) { - sox_get_globals()->ranqd1 = static_cast(seed); -} - -void set_verbosity(const int64_t verbosity) { - sox_get_globals()->verbosity = static_cast(verbosity); -} - -void set_use_threads(const bool use_threads) { - sox_get_globals()->use_threads = static_cast(use_threads); -} - -void set_buffer_size(const int64_t buffer_size) { - sox_get_globals()->bufsiz = static_cast(buffer_size); -} - -int64_t get_buffer_size() { - return sox_get_globals()->bufsiz; -} - -std::vector> list_effects() { - std::vector> effects; - for (const sox_effect_fn_t* fns = sox_get_effect_fns(); *fns; ++fns) { - const sox_effect_handler_t* handler = (*fns)(); - if (handler && handler->name) { - if (UNSUPPORTED_EFFECTS.find(handler->name) == - UNSUPPORTED_EFFECTS.end()) { - effects.emplace_back(std::vector{ - handler->name, - handler->usage ? std::string(handler->usage) : std::string("")}); - } - } - } - return effects; -} - -std::vector list_write_formats() { - std::vector formats; - for (const sox_format_tab_t* fns = sox_get_format_fns(); fns->fn; ++fns) { - const sox_format_handler_t* handler = fns->fn(); - for (const char* const* names = handler->names; *names; ++names) { - if (!strchr(*names, '/') && handler->write) { - formats.emplace_back(*names); - } - } - } - return formats; -} - -std::vector list_read_formats() { - std::vector formats; - for (const sox_format_tab_t* fns = sox_get_format_fns(); fns->fn; ++fns) { - const sox_format_handler_t* handler = fns->fn(); - for (const char* const* names = handler->names; *names; ++names) { - if (!strchr(*names, '/') && handler->read) { - formats.emplace_back(*names); - } - } - } - return formats; -} - -SoxFormat::SoxFormat(sox_format_t* fd) noexcept : fd_(fd) {} -SoxFormat::~SoxFormat() { - close(); -} - -sox_format_t* SoxFormat::operator->() const noexcept { - return fd_; -} -SoxFormat::operator sox_format_t*() const noexcept { - return fd_; -} - -void SoxFormat::close() { - if (fd_ != nullptr) { - sox_close(fd_); - fd_ = nullptr; - } -} - -void validate_input_file(const SoxFormat& sf, const std::string& path) { - TORCH_CHECK( - static_cast(sf) != nullptr, - "Error loading audio file: failed to open file " + path); - TORCH_CHECK( - sf->encoding.encoding != SOX_ENCODING_UNKNOWN, - "Error loading audio file: unknown encoding."); -} - -void validate_input_tensor(const torch::Tensor& tensor) { - TORCH_CHECK(tensor.device().is_cpu(), "Input tensor has to be on CPU."); - - TORCH_CHECK(tensor.ndimension() == 2, "Input tensor has to be 2D."); - - switch (tensor.dtype().toScalarType()) { - case c10::ScalarType::Byte: - case c10::ScalarType::Short: - case c10::ScalarType::Int: - case c10::ScalarType::Float: - break; - default: - TORCH_CHECK( - false, - "Input tensor has to be one of float32, int32, int16 or uint8 type."); - } -} - -caffe2::TypeMeta get_dtype( - const sox_encoding_t encoding, - const unsigned precision) { - const auto dtype = [&]() { - switch (encoding) { - case SOX_ENCODING_UNSIGNED: // 8-bit PCM WAV - return torch::kUInt8; - case SOX_ENCODING_SIGN2: // 16-bit, 24-bit, or 32-bit PCM WAV - switch (precision) { - case 16: - return torch::kInt16; - case 24: // Cast 24-bit to 32-bit. - case 32: - return torch::kInt32; - default: - TORCH_CHECK( - false, - "Only 16, 24, and 32 bits are supported for signed PCM."); - } - default: - // default to float32 for the other formats, including - // 32-bit flaoting-point WAV, - // MP3, - // FLAC, - // VORBIS etc... - return torch::kFloat32; - } - }(); - return c10::scalarTypeToTypeMeta(dtype); -} - -torch::Tensor convert_to_tensor( - sox_sample_t* buffer, - const int32_t num_samples, - const int32_t num_channels, - const caffe2::TypeMeta dtype, - const bool normalize, - const bool channels_first) { - torch::Tensor t; - uint64_t dummy = 0; - SOX_SAMPLE_LOCALS; - if (normalize || dtype == torch::kFloat32) { - t = torch::empty( - {num_samples / num_channels, num_channels}, torch::kFloat32); - auto ptr = t.data_ptr(); - for (int32_t i = 0; i < num_samples; ++i) { - ptr[i] = SOX_SAMPLE_TO_FLOAT_32BIT(buffer[i], dummy); - } - } else if (dtype == torch::kInt32) { - t = torch::from_blob( - buffer, {num_samples / num_channels, num_channels}, torch::kInt32) - .clone(); - } else if (dtype == torch::kInt16) { - t = torch::empty({num_samples / num_channels, num_channels}, torch::kInt16); - auto ptr = t.data_ptr(); - for (int32_t i = 0; i < num_samples; ++i) { - ptr[i] = SOX_SAMPLE_TO_SIGNED_16BIT(buffer[i], dummy); - } - } else if (dtype == torch::kUInt8) { - t = torch::empty({num_samples / num_channels, num_channels}, torch::kUInt8); - auto ptr = t.data_ptr(); - for (int32_t i = 0; i < num_samples; ++i) { - ptr[i] = SOX_SAMPLE_TO_UNSIGNED_8BIT(buffer[i], dummy); - } - } else { - TORCH_CHECK(false, "Unsupported dtype: ", dtype); - } - if (channels_first) { - t = t.transpose(1, 0); - } - return t.contiguous(); -} - -const std::string get_filetype(const std::string& path) { - std::string ext = path.substr(path.find_last_of('.') + 1); - std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower); - return ext; -} - -namespace { - -std::tuple get_save_encoding_for_wav( - const std::string& format, - caffe2::TypeMeta dtype, - const Encoding& encoding, - const BitDepth& bits_per_sample) { - switch (encoding) { - case Encoding::NOT_PROVIDED: - switch (bits_per_sample) { - case BitDepth::NOT_PROVIDED: - switch (dtype.toScalarType()) { - case c10::ScalarType::Float: - return std::make_tuple<>(SOX_ENCODING_FLOAT, 32); - case c10::ScalarType::Int: - return std::make_tuple<>(SOX_ENCODING_SIGN2, 32); - case c10::ScalarType::Short: - return std::make_tuple<>(SOX_ENCODING_SIGN2, 16); - case c10::ScalarType::Byte: - return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8); - default: - TORCH_CHECK(false, "Internal Error: Unexpected dtype: ", dtype); - } - case BitDepth::B8: - return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8); - default: - return std::make_tuple<>( - SOX_ENCODING_SIGN2, static_cast(bits_per_sample)); - } - case Encoding::PCM_SIGNED: - switch (bits_per_sample) { - case BitDepth::NOT_PROVIDED: - return std::make_tuple<>(SOX_ENCODING_SIGN2, 16); - case BitDepth::B8: - TORCH_CHECK( - false, format, " does not support 8-bit signed PCM encoding."); - default: - return std::make_tuple<>( - SOX_ENCODING_SIGN2, static_cast(bits_per_sample)); - } - case Encoding::PCM_UNSIGNED: - switch (bits_per_sample) { - case BitDepth::NOT_PROVIDED: - case BitDepth::B8: - return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8); - default: - TORCH_CHECK( - false, format, " only supports 8-bit for unsigned PCM encoding."); - } - case Encoding::PCM_FLOAT: - switch (bits_per_sample) { - case BitDepth::NOT_PROVIDED: - case BitDepth::B32: - return std::make_tuple<>(SOX_ENCODING_FLOAT, 32); - case BitDepth::B64: - return std::make_tuple<>(SOX_ENCODING_FLOAT, 64); - default: - TORCH_CHECK( - false, - format, - " only supports 32-bit or 64-bit for floating-point PCM encoding."); - } - case Encoding::ULAW: - switch (bits_per_sample) { - case BitDepth::NOT_PROVIDED: - case BitDepth::B8: - return std::make_tuple<>(SOX_ENCODING_ULAW, 8); - default: - TORCH_CHECK( - false, format, " only supports 8-bit for mu-law encoding."); - } - case Encoding::ALAW: - switch (bits_per_sample) { - case BitDepth::NOT_PROVIDED: - case BitDepth::B8: - return std::make_tuple<>(SOX_ENCODING_ALAW, 8); - default: - TORCH_CHECK( - false, format, " only supports 8-bit for a-law encoding."); - } - default: - TORCH_CHECK( - false, format, " does not support encoding: " + to_string(encoding)); - } -} - -std::tuple get_save_encoding( - const std::string& format, - const caffe2::TypeMeta& dtype, - const std::optional& encoding, - const std::optional& bits_per_sample) { - const Format fmt = get_format_from_string(format); - const Encoding enc = get_encoding_from_option(encoding); - const BitDepth bps = get_bit_depth_from_option(bits_per_sample); - - switch (fmt) { - case Format::WAV: - case Format::AMB: - return get_save_encoding_for_wav(format, dtype, enc, bps); - case Format::MP3: - TORCH_CHECK( - enc == Encoding::NOT_PROVIDED, - "mp3 does not support `encoding` option."); - TORCH_CHECK( - bps == BitDepth::NOT_PROVIDED, - "mp3 does not support `bits_per_sample` option."); - return std::make_tuple<>(SOX_ENCODING_MP3, 16); - case Format::HTK: - TORCH_CHECK( - enc == Encoding::NOT_PROVIDED, - "htk does not support `encoding` option."); - TORCH_CHECK( - bps == BitDepth::NOT_PROVIDED, - "htk does not support `bits_per_sample` option."); - return std::make_tuple<>(SOX_ENCODING_SIGN2, 16); - case Format::VORBIS: - TORCH_CHECK( - enc == Encoding::NOT_PROVIDED, - "vorbis does not support `encoding` option."); - TORCH_CHECK( - bps == BitDepth::NOT_PROVIDED, - "vorbis does not support `bits_per_sample` option."); - return std::make_tuple<>(SOX_ENCODING_VORBIS, 0); - case Format::AMR_NB: - TORCH_CHECK( - enc == Encoding::NOT_PROVIDED, - "amr-nb does not support `encoding` option."); - TORCH_CHECK( - bps == BitDepth::NOT_PROVIDED, - "amr-nb does not support `bits_per_sample` option."); - return std::make_tuple<>(SOX_ENCODING_AMR_NB, 16); - case Format::FLAC: - TORCH_CHECK( - enc == Encoding::NOT_PROVIDED, - "flac does not support `encoding` option."); - switch (bps) { - case BitDepth::B32: - case BitDepth::B64: - TORCH_CHECK( - false, "flac does not support `bits_per_sample` larger than 24."); - default: - return std::make_tuple<>( - SOX_ENCODING_FLAC, static_cast(bps)); - } - case Format::SPHERE: - switch (enc) { - case Encoding::NOT_PROVIDED: - case Encoding::PCM_SIGNED: - switch (bps) { - case BitDepth::NOT_PROVIDED: - return std::make_tuple<>(SOX_ENCODING_SIGN2, 32); - default: - return std::make_tuple<>( - SOX_ENCODING_SIGN2, static_cast(bps)); - } - case Encoding::PCM_UNSIGNED: - TORCH_CHECK(false, "sph does not support unsigned integer PCM."); - case Encoding::PCM_FLOAT: - TORCH_CHECK(false, "sph does not support floating point PCM."); - case Encoding::ULAW: - switch (bps) { - case BitDepth::NOT_PROVIDED: - case BitDepth::B8: - return std::make_tuple<>(SOX_ENCODING_ULAW, 8); - default: - TORCH_CHECK( - false, "sph only supports 8-bit for mu-law encoding."); - } - case Encoding::ALAW: - switch (bps) { - case BitDepth::NOT_PROVIDED: - case BitDepth::B8: - return std::make_tuple<>(SOX_ENCODING_ALAW, 8); - default: - return std::make_tuple<>( - SOX_ENCODING_ALAW, static_cast(bps)); - } - default: - TORCH_CHECK( - false, "sph does not support encoding: ", encoding.value()); - } - case Format::GSM: - TORCH_CHECK( - enc == Encoding::NOT_PROVIDED, - "gsm does not support `encoding` option."); - TORCH_CHECK( - bps == BitDepth::NOT_PROVIDED, - "gsm does not support `bits_per_sample` option."); - return std::make_tuple<>(SOX_ENCODING_GSM, 16); - - default: - TORCH_CHECK(false, "Unsupported format: " + format); - } -} - -unsigned get_precision(const std::string& filetype, caffe2::TypeMeta dtype) { - if (filetype == "mp3") { - return SOX_UNSPEC; - } - if (filetype == "flac") { - return 24; - } - if (filetype == "ogg" || filetype == "vorbis") { - return SOX_UNSPEC; - } - if (filetype == "wav" || filetype == "amb") { - switch (dtype.toScalarType()) { - case c10::ScalarType::Byte: - return 8; - case c10::ScalarType::Short: - return 16; - case c10::ScalarType::Int: - return 32; - case c10::ScalarType::Float: - return 32; - default: - TORCH_CHECK(false, "Unsupported dtype: ", dtype); - } - } - if (filetype == "sph") { - return 32; - } - if (filetype == "amr-nb") { - return 16; - } - if (filetype == "gsm") { - return 16; - } - if (filetype == "htk") { - return 16; - } - TORCH_CHECK(false, "Unsupported file type: ", filetype); -} - -} // namespace - -sox_signalinfo_t get_signalinfo( - const torch::Tensor* waveform, - const int64_t sample_rate, - const std::string& filetype, - const bool channels_first) { - return sox_signalinfo_t{ - /*rate=*/static_cast(sample_rate), - /*channels=*/ - static_cast(waveform->size(channels_first ? 0 : 1)), - /*precision=*/get_precision(filetype, waveform->dtype()), - /*length=*/static_cast(waveform->numel()), - nullptr}; -} - -sox_encodinginfo_t get_tensor_encodinginfo(caffe2::TypeMeta dtype) { - sox_encoding_t encoding = [&]() { - switch (dtype.toScalarType()) { - case c10::ScalarType::Byte: - return SOX_ENCODING_UNSIGNED; - case c10::ScalarType::Short: - return SOX_ENCODING_SIGN2; - case c10::ScalarType::Int: - return SOX_ENCODING_SIGN2; - case c10::ScalarType::Float: - return SOX_ENCODING_FLOAT; - default: - TORCH_CHECK(false, "Unsupported dtype: ", dtype); - } - }(); - unsigned bits_per_sample = [&]() { - switch (dtype.toScalarType()) { - case c10::ScalarType::Byte: - return 8; - case c10::ScalarType::Short: - return 16; - case c10::ScalarType::Int: - return 32; - case c10::ScalarType::Float: - return 32; - default: - TORCH_CHECK(false, "Unsupported dtype: ", dtype); - } - }(); - return sox_encodinginfo_t{ - /*encoding=*/encoding, - /*bits_per_sample=*/bits_per_sample, - /*compression=*/HUGE_VAL, - /*reverse_bytes=*/sox_option_default, - /*reverse_nibbles=*/sox_option_default, - /*reverse_bits=*/sox_option_default, - /*opposite_endian=*/sox_false}; -} - -sox_encodinginfo_t get_encodinginfo_for_save( - const std::string& format, - const caffe2::TypeMeta& dtype, - const std::optional& compression, - const std::optional& encoding, - const std::optional& bits_per_sample) { - auto enc = get_save_encoding(format, dtype, encoding, bits_per_sample); - return sox_encodinginfo_t{ - /*encoding=*/std::get<0>(enc), - /*bits_per_sample=*/std::get<1>(enc), - /*compression=*/compression.value_or(HUGE_VAL), - /*reverse_bytes=*/sox_option_default, - /*reverse_nibbles=*/sox_option_default, - /*reverse_bits=*/sox_option_default, - /*opposite_endian=*/sox_false}; -} - -} // namespace torchaudio::sox diff --git a/src/libtorchaudio/sox/utils.h b/src/libtorchaudio/sox/utils.h deleted file mode 100644 index b26e25f65e..0000000000 --- a/src/libtorchaudio/sox/utils.h +++ /dev/null @@ -1,112 +0,0 @@ -#ifndef TORCHAUDIO_SOX_UTILS_H -#define TORCHAUDIO_SOX_UTILS_H - -#include -#include - -namespace torchaudio::sox { - -//////////////////////////////////////////////////////////////////////////////// -// APIs for Python interaction -//////////////////////////////////////////////////////////////////////////////// - -/// Set sox global options -void set_seed(const int64_t seed); - -void set_verbosity(const int64_t verbosity); - -void set_use_threads(const bool use_threads); - -void set_buffer_size(const int64_t buffer_size); - -int64_t get_buffer_size(); - -std::vector> list_effects(); - -std::vector list_read_formats(); - -std::vector list_write_formats(); - -//////////////////////////////////////////////////////////////////////////////// -// Utilities for sox_io / sox_effects implementations -//////////////////////////////////////////////////////////////////////////////// - -extern const std::unordered_set UNSUPPORTED_EFFECTS; - -/// helper class to automatically close sox_format_t* -struct SoxFormat { - explicit SoxFormat(sox_format_t* fd) noexcept; - SoxFormat(const SoxFormat& other) = delete; - SoxFormat(SoxFormat&& other) = delete; - SoxFormat& operator=(const SoxFormat& other) = delete; - SoxFormat& operator=(SoxFormat&& other) = delete; - ~SoxFormat(); - sox_format_t* operator->() const noexcept; - operator sox_format_t*() const noexcept; - - void close(); - - private: - sox_format_t* fd_; -}; - -/// -/// Verify that input file is found, has known encoding, and not empty -void validate_input_file(const SoxFormat& sf, const std::string& path); - -/// -/// Verify that input Tensor is 2D, CPU and either uin8, int16, int32 or float32 -void validate_input_tensor(const torch::Tensor&); - -/// -/// Get target dtype for the given encoding and precision. -caffe2::TypeMeta get_dtype( - const sox_encoding_t encoding, - const unsigned precision); - -/// -/// Convert sox_sample_t buffer to uint8/int16/int32/float32 Tensor -/// NOTE: This function might modify the values in the input buffer to -/// reduce the number of memory copy. -/// @param buffer Pointer to buffer that contains audio data. -/// @param num_samples The number of samples to read. -/// @param num_channels The number of channels. Used to reshape the resulting -/// Tensor. -/// @param dtype Target dtype. Determines the output dtype and value range in -/// conjunction with normalization. -/// @param noramlize Perform normalization. Only effective when dtype is not -/// kFloat32. When effective, the output tensor is kFloat32 type and value range -/// is [-1.0, 1.0] -/// @param channels_first When True, output Tensor has shape of [num_channels, -/// num_frames]. -torch::Tensor convert_to_tensor( - sox_sample_t* buffer, - const int32_t num_samples, - const int32_t num_channels, - const caffe2::TypeMeta dtype, - const bool normalize, - const bool channels_first); - -/// Extract extension from file path -const std::string get_filetype(const std::string& path); - -/// Get sox_signalinfo_t for passing a torch::Tensor object. -sox_signalinfo_t get_signalinfo( - const torch::Tensor* waveform, - const int64_t sample_rate, - const std::string& filetype, - const bool channels_first); - -/// Get sox_encodinginfo_t for Tensor I/O -sox_encodinginfo_t get_tensor_encodinginfo(const caffe2::TypeMeta dtype); - -/// Get sox_encodinginfo_t for saving to file/file object -sox_encodinginfo_t get_encodinginfo_for_save( - const std::string& format, - const caffe2::TypeMeta& dtype, - const std::optional& compression, - const std::optional& encoding, - const std::optional& bits_per_sample); - -} // namespace torchaudio::sox -#endif diff --git a/src/torchaudio/__init__.py b/src/torchaudio/__init__.py index e533cafe9d..db3ca5a293 100644 --- a/src/torchaudio/__init__.py +++ b/src/torchaudio/__init__.py @@ -27,7 +27,6 @@ kaldi_io, models, pipelines, - sox_effects, transforms, utils, ) @@ -56,7 +55,6 @@ "pipelines", "kaldi_io", "utils", - "sox_effects", "transforms", "list_audio_backends", "get_audio_backend", diff --git a/src/torchaudio/_backend/sox.py b/src/torchaudio/_backend/sox.py deleted file mode 100644 index f26ce83ca0..0000000000 --- a/src/torchaudio/_backend/sox.py +++ /dev/null @@ -1,91 +0,0 @@ -import os -from typing import BinaryIO, Optional, Tuple, Union - -import torch -import torchaudio - -from .backend import Backend -from .common import AudioMetaData - -sox_ext = torchaudio._extension.lazy_import_sox_ext() - - -class SoXBackend(Backend): - @staticmethod - def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData: - if hasattr(uri, "read"): - raise ValueError( - "SoX backend does not support reading from file-like objects. ", - "Please use an alternative backend that does support reading from file-like objects, e.g. FFmpeg.", - ) - else: - sinfo = sox_ext.get_info(uri, format) - if sinfo: - return AudioMetaData(*sinfo) - else: - raise RuntimeError(f"Failed to fetch metadata for {uri}.") - - @staticmethod - def load( - uri: Union[BinaryIO, str, os.PathLike], - frame_offset: int = 0, - num_frames: int = -1, - normalize: bool = True, - channels_first: bool = True, - format: Optional[str] = None, - buffer_size: int = 4096, - ) -> Tuple[torch.Tensor, int]: - if hasattr(uri, "read"): - raise ValueError( - "SoX backend does not support loading from file-like objects. ", - "Please use an alternative backend that does support loading from file-like objects, e.g. FFmpeg.", - ) - else: - ret = sox_ext.load_audio_file(str(uri), frame_offset, num_frames, normalize, channels_first, format) - if not ret: - raise RuntimeError(f"Failed to load audio from {uri}.") - return ret - - @staticmethod - def save( - uri: Union[BinaryIO, str, os.PathLike], - src: torch.Tensor, - sample_rate: int, - channels_first: bool = True, - format: Optional[str] = None, - encoding: Optional[str] = None, - bits_per_sample: Optional[int] = None, - buffer_size: int = 4096, - compression: Optional[Union[torchaudio.io.CodecConfig, float, int]] = None, - ) -> None: - if not isinstance(compression, (float, int, type(None))): - raise ValueError( - "SoX backend expects non-`None` value for argument `compression` to be of ", - f"type `float` or `int`, but received value of type {type(compression)}", - ) - if hasattr(uri, "write"): - raise ValueError( - "SoX backend does not support writing to file-like objects. ", - "Please use an alternative backend that does support writing to file-like objects, e.g. FFmpeg.", - ) - else: - sox_ext.save_audio_file( - str(uri), - src, - sample_rate, - channels_first, - compression, - format, - encoding, - bits_per_sample, - ) - - @staticmethod - def can_decode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool: - # i.e. not a file-like object. - return not hasattr(uri, "read") - - @staticmethod - def can_encode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool: - # i.e. not a file-like object. - return not hasattr(uri, "write") diff --git a/src/torchaudio/_backend/utils.py b/src/torchaudio/_backend/utils.py index eb7c51f0cb..4cc26ab7ae 100644 --- a/src/torchaudio/_backend/utils.py +++ b/src/torchaudio/_backend/utils.py @@ -5,7 +5,6 @@ import torch -from torchaudio._extension import lazy_import_sox_ext from torchaudio.io import CodecConfig from torio._extension import lazy_import_ffmpeg_ext @@ -15,7 +14,6 @@ from .common import AudioMetaData from .ffmpeg import FFmpegBackend from .soundfile import SoundfileBackend -from .sox import SoXBackend @lru_cache(None) @@ -23,8 +21,6 @@ def get_available_backends() -> Dict[str, Type[Backend]]: backend_specs: Dict[str, Type[Backend]] = {} if lazy_import_ffmpeg_ext().is_available(): backend_specs["ffmpeg"] = FFmpegBackend - if lazy_import_sox_ext().is_available(): - backend_specs["sox"] = SoXBackend if soundfile_backend._IS_SOUNDFILE_AVAILABLE: backend_specs["soundfile"] = SoundfileBackend return backend_specs @@ -86,7 +82,7 @@ def info( backend (str or None, optional): I/O backend to use. If ``None``, function selects backend given input and available backends. - Otherwise, must be one of [``"ffmpeg"``, ``"sox"``, ``"soundfile"``], + Otherwise, must be one of [``"ffmpeg"``, ``"soundfile"``], with the corresponding backend available. (Default: ``None``) diff --git a/src/torchaudio/_extension/__init__.py b/src/torchaudio/_extension/__init__.py index 5c2ff55583..b7e19fa38c 100644 --- a/src/torchaudio/_extension/__init__.py +++ b/src/torchaudio/_extension/__init__.py @@ -4,7 +4,7 @@ from torchaudio._internal.module_utils import fail_with_message, is_module_available, no_op -from .utils import _check_cuda_version, _init_dll_path, _init_sox, _LazyImporter, _load_lib +from .utils import _check_cuda_version, _init_dll_path, _LazyImporter, _load_lib _LG = logging.getLogger(__name__) @@ -17,7 +17,6 @@ "_check_cuda_version", "_IS_TORCHAUDIO_EXT_AVAILABLE", "_IS_RIR_AVAILABLE", - "lazy_import_sox_ext", ] @@ -44,18 +43,6 @@ _IS_ALIGN_AVAILABLE = torchaudio.lib._torchaudio.is_align_available() -_SOX_EXT = None - - -def lazy_import_sox_ext(): - """Load SoX integration based on availability in lazy manner""" - - global _SOX_EXT - if _SOX_EXT is None: - _SOX_EXT = _LazyImporter("_torchaudio_sox", _init_sox) - return _SOX_EXT - - fail_if_no_rir = ( no_op if _IS_RIR_AVAILABLE diff --git a/src/torchaudio/_extension/utils.py b/src/torchaudio/_extension/utils.py index c5660a1e22..8820c68e47 100644 --- a/src/torchaudio/_extension/utils.py +++ b/src/torchaudio/_extension/utils.py @@ -60,52 +60,6 @@ def _load_lib(lib: str) -> bool: torch.ops.load_library(path) return True - -def _import_sox_ext(): - if os.name == "nt": - raise RuntimeError("sox extension is not supported on Windows") - if not eval_env("TORCHAUDIO_USE_SOX", True): - raise RuntimeError("sox extension is disabled. (TORCHAUDIO_USE_SOX=0)") - - ext = "torchaudio.lib._torchaudio_sox" - - if not importlib.util.find_spec(ext): - raise RuntimeError( - # fmt: off - "TorchAudio is not built with sox extension. " - "Please build TorchAudio with libsox support. (BUILD_SOX=1)" - # fmt: on - ) - - _load_lib("libtorchaudio_sox") - return importlib.import_module(ext) - - -def _init_sox(): - ext = _import_sox_ext() - ext.set_verbosity(0) - - import atexit - - torch.ops.torchaudio_sox.initialize_sox_effects() - atexit.register(torch.ops.torchaudio_sox.shutdown_sox_effects) - - # Bundle functions registered with TORCH_LIBRARY into extension - # so that they can also be accessed in the same (lazy) manner - # from the extension. - keys = [ - "get_info", - "load_audio_file", - "save_audio_file", - "apply_effects_tensor", - "apply_effects_file", - ] - for key in keys: - setattr(ext, key, getattr(torch.ops.torchaudio_sox, key)) - - return ext - - class _LazyImporter(types.ModuleType): """Lazily import module/extension.""" diff --git a/src/torchaudio/backend/__init__.py b/src/torchaudio/backend/__init__.py index 84df7e7d69..dc3f6a3668 100644 --- a/src/torchaudio/backend/__init__.py +++ b/src/torchaudio/backend/__init__.py @@ -3,6 +3,6 @@ # New things should be added to `torchaudio._backend`. # Only things related to backward compatibility should be placed here. -from . import common, no_backend, soundfile_backend, sox_io_backend # noqa +from . import common, no_backend, soundfile_backend # noqa __all__ = [] diff --git a/src/torchaudio/backend/sox_io_backend.py b/src/torchaudio/backend/sox_io_backend.py deleted file mode 100644 index 7e83b8fbf4..0000000000 --- a/src/torchaudio/backend/sox_io_backend.py +++ /dev/null @@ -1,14 +0,0 @@ -def __getattr__(name: str): - import warnings - - warnings.warn( - "Torchaudio's I/O functions now support per-call backend dispatch. " - "Importing backend implementation directly is no longer guaranteed to work. " - "Please use `backend` keyword with load/save/info function, instead of " - "calling the underlying implementation directly.", - stacklevel=2, - ) - - from . import _sox_io_backend - - return getattr(_sox_io_backend, name) diff --git a/src/torchaudio/functional/__init__.py b/src/torchaudio/functional/__init__.py index 1c3b86b5da..1227b932c8 100644 --- a/src/torchaudio/functional/__init__.py +++ b/src/torchaudio/functional/__init__.py @@ -32,7 +32,6 @@ add_noise, amplitude_to_DB, apply_beamforming, - apply_codec, compute_deltas, convolve, create_dct, @@ -111,7 +110,6 @@ "riaa_biquad", "treble_biquad", "vad", - "apply_codec", "resample", "edit_distance", "pitch_shift", diff --git a/src/torchaudio/functional/functional.py b/src/torchaudio/functional/functional.py index 810d1f51fc..d1b2f630b2 100644 --- a/src/torchaudio/functional/functional.py +++ b/src/torchaudio/functional/functional.py @@ -34,7 +34,6 @@ "mask_along_axis_iid", "sliding_window_cmn", "spectral_centroid", - "apply_codec", "resample", "edit_distance", "loudness", @@ -1295,52 +1294,6 @@ def spectral_centroid( freq_dim = -2 return (freqs * specgram).sum(dim=freq_dim) / specgram.sum(dim=freq_dim) - -@deprecated("Please migrate to :py:class:`torchaudio.io.AudioEffector`.", remove=False) -def apply_codec( - waveform: Tensor, - sample_rate: int, - format: str, - channels_first: bool = True, - compression: Optional[float] = None, - encoding: Optional[str] = None, - bits_per_sample: Optional[int] = None, -) -> Tensor: - r""" - Apply codecs as a form of augmentation. - - .. devices:: CPU - - Args: - waveform (Tensor): Audio data. Must be 2 dimensional. See also ```channels_first```. - sample_rate (int): Sample rate of the audio waveform. - format (str): File format. - channels_first (bool, optional): - When True, both the input and output Tensor have dimension `(channel, time)`. - Otherwise, they have dimension `(time, channel)`. - compression (float or None, optional): Used for formats other than WAV. - For more details see :py:func:`torchaudio.backend.sox_io_backend.save`. - encoding (str or None, optional): Changes the encoding for the supported formats. - For more details see :py:func:`torchaudio.backend.sox_io_backend.save`. - bits_per_sample (int or None, optional): Changes the bit depth for the supported formats. - For more details see :py:func:`torchaudio.backend.sox_io_backend.save`. - - Returns: - Tensor: Resulting Tensor. - If ``channels_first=True``, it has `(channel, time)` else `(time, channel)`. - """ - from torchaudio.backend import _sox_io_backend - - with tempfile.NamedTemporaryFile() as f: - torchaudio.backend._sox_io_backend.save( - f.name, waveform, sample_rate, channels_first, compression, format, encoding, bits_per_sample - ) - augmented, sr = _sox_io_backend.load(f.name, channels_first=channels_first, format=format) - if sr != sample_rate: - augmented = resample(augmented, sr, sample_rate) - return augmented - - _CPU = torch.device("cpu") diff --git a/src/torchaudio/sox_effects/__init__.py b/src/torchaudio/sox_effects/__init__.py deleted file mode 100644 index 93c63cae1d..0000000000 --- a/src/torchaudio/sox_effects/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -from .sox_effects import apply_effects_file, apply_effects_tensor, effect_names, init_sox_effects, shutdown_sox_effects - - -__all__ = [ - "init_sox_effects", - "shutdown_sox_effects", - "effect_names", - "apply_effects_tensor", - "apply_effects_file", -] diff --git a/src/torchaudio/sox_effects/sox_effects.py b/src/torchaudio/sox_effects/sox_effects.py deleted file mode 100644 index 256c461edc..0000000000 --- a/src/torchaudio/sox_effects/sox_effects.py +++ /dev/null @@ -1,275 +0,0 @@ -import os -from typing import List, Optional, Tuple - -import torch -import torchaudio -from torchaudio._internal.module_utils import deprecated, dropping_support -from torchaudio.utils.sox_utils import list_effects - - -sox_ext = torchaudio._extension.lazy_import_sox_ext() - - -@deprecated("Please remove the call. This function is called automatically.") -def init_sox_effects(): - """Initialize resources required to use sox effects. - - Note: - You do not need to call this function manually. It is called automatically. - - Once initialized, you do not need to call this function again across the multiple uses of - sox effects though it is safe to do so as long as :func:`shutdown_sox_effects` is not called yet. - Once :func:`shutdown_sox_effects` is called, you can no longer use SoX effects and initializing - again will result in error. - """ - pass - - -@deprecated("Please remove the call. This function is called automatically.") -def shutdown_sox_effects(): - """Clean up resources required to use sox effects. - - Note: - You do not need to call this function manually. It is called automatically. - - It is safe to call this function multiple times. - Once :py:func:`shutdown_sox_effects` is called, you can no longer use SoX effects and - initializing again will result in error. - """ - pass - - -@dropping_support -def effect_names() -> List[str]: - """Gets list of valid sox effect names - - Returns: - List[str]: list of available effect names. - - Example - >>> torchaudio.sox_effects.effect_names() - ['allpass', 'band', 'bandpass', ... ] - """ - return list(list_effects().keys()) - - -@dropping_support -def apply_effects_tensor( - tensor: torch.Tensor, - sample_rate: int, - effects: List[List[str]], - channels_first: bool = True, -) -> Tuple[torch.Tensor, int]: - """Apply sox effects to given Tensor - - .. devices:: CPU - - .. properties:: TorchScript - - Note: - This function only works on CPU Tensors. - This function works in the way very similar to ``sox`` command, however there are slight - differences. For example, ``sox`` command adds certain effects automatically (such as - ``rate`` effect after ``speed`` and ``pitch`` and other effects), but this function does - only applies the given effects. (Therefore, to actually apply ``speed`` effect, you also - need to give ``rate`` effect with desired sampling rate.). - - Args: - tensor (torch.Tensor): Input 2D CPU Tensor. - sample_rate (int): Sample rate - effects (List[List[str]]): List of effects. - channels_first (bool, optional): Indicates if the input Tensor's dimension is - `[channels, time]` or `[time, channels]` - - Returns: - (Tensor, int): Resulting Tensor and sample rate. - The resulting Tensor has the same ``dtype`` as the input Tensor, and - the same channels order. The shape of the Tensor can be different based on the - effects applied. Sample rate can also be different based on the effects applied. - - Example - Basic usage - >>> - >>> # Defines the effects to apply - >>> effects = [ - ... ['gain', '-n'], # normalises to 0dB - ... ['pitch', '5'], # 5 cent pitch shift - ... ['rate', '8000'], # resample to 8000 Hz - ... ] - >>> - >>> # Generate pseudo wave: - >>> # normalized, channels first, 2ch, sampling rate 16000, 1 second - >>> sample_rate = 16000 - >>> waveform = 2 * torch.rand([2, sample_rate * 1]) - 1 - >>> waveform.shape - torch.Size([2, 16000]) - >>> waveform - tensor([[ 0.3138, 0.7620, -0.9019, ..., -0.7495, -0.4935, 0.5442], - [-0.0832, 0.0061, 0.8233, ..., -0.5176, -0.9140, -0.2434]]) - >>> - >>> # Apply effects - >>> waveform, sample_rate = apply_effects_tensor( - ... wave_form, sample_rate, effects, channels_first=True) - >>> - >>> # Check the result - >>> # The new waveform is sampling rate 8000, 1 second. - >>> # normalization and channel order are preserved - >>> waveform.shape - torch.Size([2, 8000]) - >>> waveform - tensor([[ 0.5054, -0.5518, -0.4800, ..., -0.0076, 0.0096, -0.0110], - [ 0.1331, 0.0436, -0.3783, ..., -0.0035, 0.0012, 0.0008]]) - >>> sample_rate - 8000 - - Example - Torchscript-able transform - >>> - >>> # Use `apply_effects_tensor` in `torch.nn.Module` and dump it to file, - >>> # then run sox effect via Torchscript runtime. - >>> - >>> class SoxEffectTransform(torch.nn.Module): - ... effects: List[List[str]] - ... - ... def __init__(self, effects: List[List[str]]): - ... super().__init__() - ... self.effects = effects - ... - ... def forward(self, tensor: torch.Tensor, sample_rate: int): - ... return sox_effects.apply_effects_tensor( - ... tensor, sample_rate, self.effects) - ... - ... - >>> # Create transform object - >>> effects = [ - ... ["lowpass", "-1", "300"], # apply single-pole lowpass filter - ... ["rate", "8000"], # change sample rate to 8000 - ... ] - >>> transform = SoxEffectTensorTransform(effects, input_sample_rate) - >>> - >>> # Dump it to file and load - >>> path = 'sox_effect.zip' - >>> torch.jit.script(trans).save(path) - >>> transform = torch.jit.load(path) - >>> - >>>> # Run transform - >>> waveform, input_sample_rate = torchaudio.load("input.wav") - >>> waveform, sample_rate = transform(waveform, input_sample_rate) - >>> assert sample_rate == 8000 - """ - return sox_ext.apply_effects_tensor(tensor, sample_rate, effects, channels_first) - - -@dropping_support -def apply_effects_file( - path: str, - effects: List[List[str]], - normalize: bool = True, - channels_first: bool = True, - format: Optional[str] = None, -) -> Tuple[torch.Tensor, int]: - """Apply sox effects to the audio file and load the resulting data as Tensor - - .. devices:: CPU - - .. properties:: TorchScript - - Note: - This function works in the way very similar to ``sox`` command, however there are slight - differences. For example, ``sox`` commnad adds certain effects automatically (such as - ``rate`` effect after ``speed``, ``pitch`` etc), but this function only applies the given - effects. Therefore, to actually apply ``speed`` effect, you also need to give ``rate`` - effect with desired sampling rate, because internally, ``speed`` effects only alter sampling - rate and leave samples untouched. - - Args: - path (path-like object): - Source of audio data. - effects (List[List[str]]): List of effects. - normalize (bool, optional): - When ``True``, this function converts the native sample type to ``float32``. - Default: ``True``. - - If input file is integer WAV, giving ``False`` will change the resulting Tensor type to - integer type. - This argument has no effect for formats other than integer WAV type. - - channels_first (bool, optional): When True, the returned Tensor has dimension `[channel, time]`. - Otherwise, the returned Tensor's dimension is `[time, channel]`. - format (str or None, optional): - Override the format detection with the given format. - Providing the argument might help when libsox can not infer the format - from header or extension, - - Returns: - (Tensor, int): Resulting Tensor and sample rate. - If ``normalize=True``, the resulting Tensor is always ``float32`` type. - If ``normalize=False`` and the input audio file is of integer WAV file, then the - resulting Tensor has corresponding integer type. (Note 24 bit integer type is not supported) - If ``channels_first=True``, the resulting Tensor has dimension `[channel, time]`, - otherwise `[time, channel]`. - - Example - Basic usage - >>> - >>> # Defines the effects to apply - >>> effects = [ - ... ['gain', '-n'], # normalises to 0dB - ... ['pitch', '5'], # 5 cent pitch shift - ... ['rate', '8000'], # resample to 8000 Hz - ... ] - >>> - >>> # Apply effects and load data with channels_first=True - >>> waveform, sample_rate = apply_effects_file("data.wav", effects, channels_first=True) - >>> - >>> # Check the result - >>> waveform.shape - torch.Size([2, 8000]) - >>> waveform - tensor([[ 5.1151e-03, 1.8073e-02, 2.2188e-02, ..., 1.0431e-07, - -1.4761e-07, 1.8114e-07], - [-2.6924e-03, 2.1860e-03, 1.0650e-02, ..., 6.4122e-07, - -5.6159e-07, 4.8103e-07]]) - >>> sample_rate - 8000 - - Example - Apply random speed perturbation to dataset - >>> - >>> # Load data from file, apply random speed perturbation - >>> class RandomPerturbationFile(torch.utils.data.Dataset): - ... \"\"\"Given flist, apply random speed perturbation - ... - ... Suppose all the input files are at least one second long. - ... \"\"\" - ... def __init__(self, flist: List[str], sample_rate: int): - ... super().__init__() - ... self.flist = flist - ... self.sample_rate = sample_rate - ... - ... def __getitem__(self, index): - ... speed = 0.5 + 1.5 * random.randn() - ... effects = [ - ... ['gain', '-n', '-10'], # apply 10 db attenuation - ... ['remix', '-'], # merge all the channels - ... ['speed', f'{speed:.5f}'], # duration is now 0.5 ~ 2.0 seconds. - ... ['rate', f'{self.sample_rate}'], - ... ['pad', '0', '1.5'], # add 1.5 seconds silence at the end - ... ['trim', '0', '2'], # get the first 2 seconds - ... ] - ... waveform, _ = torchaudio.sox_effects.apply_effects_file( - ... self.flist[index], effects) - ... return waveform - ... - ... def __len__(self): - ... return len(self.flist) - ... - >>> dataset = RandomPerturbationFile(file_list, sample_rate=8000) - >>> loader = torch.utils.data.DataLoader(dataset, batch_size=32) - >>> for batch in loader: - >>> pass - """ - if not torch.jit.is_scripting(): - if hasattr(path, "read"): - raise RuntimeError( - "apply_effects_file function does not support file-like object. " - "Please use torchaudio.io.AudioEffector." - ) - path = os.fspath(path) - return sox_ext.apply_effects_file(path, effects, normalize, channels_first, format) diff --git a/src/torchaudio/utils/__init__.py b/src/torchaudio/utils/__init__.py index 9d4dd2dd72..be1f0bad21 100644 --- a/src/torchaudio/utils/__init__.py +++ b/src/torchaudio/utils/__init__.py @@ -1,10 +1,8 @@ from torio.utils import ffmpeg_utils -from . import sox_utils from .download import _download_asset __all__ = [ - "sox_utils", "ffmpeg_utils", ] diff --git a/src/torchaudio/utils/sox_utils.py b/src/torchaudio/utils/sox_utils.py deleted file mode 100644 index 8cc68361d5..0000000000 --- a/src/torchaudio/utils/sox_utils.py +++ /dev/null @@ -1,118 +0,0 @@ -"""Module to change the configuration of libsox, which is used by I/O functions like -:py:mod:`~torchaudio.backend.sox_io_backend` and :py:mod:`~torchaudio.sox_effects`. - -.. warning:: - Starting with version 2.8, we are refactoring TorchAudio to transition it - into a maintenance phase. As a result: - - - Some APIs are deprecated in 2.8 and will be removed in 2.9. - - The decoding and encoding capabilities of PyTorch for both audio and video - are being consolidated into TorchCodec. - - Please see https://github.com/pytorch/audio/issues/3902 for more information. -""" - -from typing import Dict, List - -import torchaudio - -sox_ext = torchaudio._extension.lazy_import_sox_ext() - -from torchaudio._internal.module_utils import dropping_support - -@dropping_support -def set_seed(seed: int): - """Set libsox's PRNG - - Args: - seed (int): seed value. valid range is int32. - - See Also: - http://sox.sourceforge.net/sox.html - """ - sox_ext.set_seed(seed) - - -@dropping_support -def set_verbosity(verbosity: int): - """Set libsox's verbosity - - Args: - verbosity (int): Set verbosity level of libsox. - - * ``1`` failure messages - * ``2`` warnings - * ``3`` details of processing - * ``4``-``6`` increasing levels of debug messages - - See Also: - http://sox.sourceforge.net/sox.html - """ - sox_ext.set_verbosity(verbosity) - - -@dropping_support -def set_buffer_size(buffer_size: int): - """Set buffer size for sox effect chain - - Args: - buffer_size (int): Set the size in bytes of the buffers used for processing audio. - - See Also: - http://sox.sourceforge.net/sox.html - """ - sox_ext.set_buffer_size(buffer_size) - - -@dropping_support -def set_use_threads(use_threads: bool): - """Set multithread option for sox effect chain - - Args: - use_threads (bool): When ``True``, enables ``libsox``'s parallel effects channels processing. - To use mutlithread, the underlying ``libsox`` has to be compiled with OpenMP support. - - See Also: - http://sox.sourceforge.net/sox.html - """ - sox_ext.set_use_threads(use_threads) - - -@dropping_support -def list_effects() -> Dict[str, str]: - """List the available sox effect names - - Returns: - Dict[str, str]: Mapping from ``effect name`` to ``usage`` - """ - return dict(sox_ext.list_effects()) - - -@dropping_support -def list_read_formats() -> List[str]: - """List the supported audio formats for read - - Returns: - List[str]: List of supported audio formats - """ - return sox_ext.list_read_formats() - - -@dropping_support -def list_write_formats() -> List[str]: - """List the supported audio formats for write - - Returns: - List[str]: List of supported audio formats - """ - return sox_ext.list_write_formats() - - -@dropping_support -def get_buffer_size() -> int: - """Get buffer size for sox effect chain - - Returns: - int: size in bytes of buffers used for processing audio. - """ - return sox_ext.get_buffer_size() diff --git a/test/torchaudio_unittest/common_utils/__init__.py b/test/torchaudio_unittest/common_utils/__init__.py index ff58db8f6c..509d5208df 100644 --- a/test/torchaudio_unittest/common_utils/__init__.py +++ b/test/torchaudio_unittest/common_utils/__init__.py @@ -15,9 +15,6 @@ skipIfNoModule, skipIfNoQengine, skipIfNoRIR, - skipIfNoSox, - skipIfNoSoxDecoder, - skipIfNoSoxEncoder, skipIfPy310, skipIfRocm, TempDirMixin, @@ -63,9 +60,6 @@ def inject_request(self, request): "skipIfNoMacOS", "skipIfNoModule", "skipIfNoRIR", - "skipIfNoSox", - "skipIfNoSoxDecoder", - "skipIfNoSoxEncoder", "skipIfRocm", "skipIfNoQengine", "skipIfNoFFmpeg", diff --git a/test/torchaudio_unittest/common_utils/case_utils.py b/test/torchaudio_unittest/common_utils/case_utils.py index ae8ab05cee..7ce9c89dd3 100644 --- a/test/torchaudio_unittest/common_utils/case_utils.py +++ b/test/torchaudio_unittest/common_utils/case_utils.py @@ -109,7 +109,6 @@ class TorchaudioTestCase(TestBaseMixin, PytorchTestCase): _IS_FFMPEG_AVAILABLE = torio._extension.lazy_import_ffmpeg_ext().is_available() -_IS_SOX_AVAILABLE = torchaudio._extension.lazy_import_sox_ext().is_available() _IS_CTC_DECODER_AVAILABLE = None _IS_CUDA_CTC_DECODER_AVAILABLE = None @@ -206,28 +205,6 @@ def skipIfNoModule(module, display_name=None): reason="CUDA does not have enough memory.", key="CUDA_SMALL_MEMORY", ) -skipIfNoSox = _skipIf( - not _IS_SOX_AVAILABLE, - reason="Sox features are not available.", - key="NO_SOX", -) - - -def skipIfNoSoxDecoder(ext): - return _skipIf( - not _IS_SOX_AVAILABLE or ext not in torchaudio.utils.sox_utils.list_read_formats(), - f'sox does not handle "{ext}" for read.', - key="NO_SOX_DECODER", - ) - - -def skipIfNoSoxEncoder(ext): - return _skipIf( - not _IS_SOX_AVAILABLE or ext not in torchaudio.utils.sox_utils.list_write_formats(), - f'sox does not handle "{ext}" for write.', - key="NO_SOX_ENCODER", - ) - skipIfNoRIR = _skipIf( not torchaudio._extension._IS_RIR_AVAILABLE, diff --git a/test/torchaudio_unittest/deprecation_test.py b/test/torchaudio_unittest/deprecation_test.py index 04493c8dc3..c44d1907f9 100644 --- a/test/torchaudio_unittest/deprecation_test.py +++ b/test/torchaudio_unittest/deprecation_test.py @@ -3,7 +3,7 @@ import torch from torchaudio._internal.module_utils import UNSUPPORTED -from torchaudio.sox_effects import apply_effects_tensor +from torchaudio.prototype.functional import exp_sigmoid # Importing prototype modules is needed to trigger the registration of the # corresponding APIs in the UNSUPPORTED register. @@ -25,10 +25,8 @@ def test_deprecations(func): # deprecated for years. @pytest.mark.parametrize("scripted", (True, False)) def test_torchscript_fails(scripted): - f = apply_effects_tensor + f = exp_sigmoid if scripted: pytest.xfail("Deprecation decorator breaks torchscript") f = torch.jit.script(f) - _, out_sample_rate = f(torch.rand(2, 1000), sample_rate=16_000, effects=[["rate", "8000"]]) - assert out_sample_rate == 8000 - + f(torch.rand(2, 1000)) diff --git a/test/torchaudio_unittest/functional/functional_cpu_test.py b/test/torchaudio_unittest/functional/functional_cpu_test.py index 7b81cc92ac..9a6ad0a63d 100644 --- a/test/torchaudio_unittest/functional/functional_cpu_test.py +++ b/test/torchaudio_unittest/functional/functional_cpu_test.py @@ -4,7 +4,7 @@ import torchaudio.functional as F from parameterized import parameterized import unittest -from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoSox, TorchaudioTestCase +from torchaudio_unittest.common_utils import PytorchTestCase from .functional_impl import Functional, FunctionalCPUOnly @@ -21,38 +21,3 @@ def test_lfilter_9th_order_filter_stability(self): class TestFunctionalFloat64(Functional, PytorchTestCase): dtype = torch.float64 device = torch.device("cpu") - - -@unittest.skip("deprecated") -@skipIfNoSox -class TestApplyCodec(TorchaudioTestCase): - def _smoke_test(self, format, compression, check_num_frames): - """ - The purpose of this test suite is to verify that apply_codec functionalities do not exhibit - abnormal behaviors. - """ - sample_rate = 8000 - num_frames = 3 * sample_rate - num_channels = 2 - waveform = torch.rand(num_channels, num_frames) - - augmented = F.apply_codec(waveform, sample_rate, format, True, compression) - assert augmented.dtype == waveform.dtype - assert augmented.shape[0] == num_channels - if check_num_frames: - assert augmented.shape[1] == num_frames - - def test_wave(self): - self._smoke_test("wav", compression=None, check_num_frames=True) - - @parameterized.expand([(96,), (128,), (160,), (192,), (224,), (256,), (320,)]) - def test_mp3(self, compression): - self._smoke_test("mp3", compression, check_num_frames=False) - - @parameterized.expand([(0,), (1,), (2,), (3,), (4,), (5,), (6,), (7,), (8,)]) - def test_flac(self, compression): - self._smoke_test("flac", compression, check_num_frames=False) - - @parameterized.expand([(-1,), (0,), (1,), (2,), (3,), (3.6,), (5,), (10,)]) - def test_vorbis(self, compression): - self._smoke_test("vorbis", compression, check_num_frames=False) diff --git a/test/torchaudio_unittest/functional/sox_compatibility_test.py b/test/torchaudio_unittest/functional/sox_compatibility_test.py index 9ec8383272..51abb899bb 100644 --- a/test/torchaudio_unittest/functional/sox_compatibility_test.py +++ b/test/torchaudio_unittest/functional/sox_compatibility_test.py @@ -6,16 +6,11 @@ load_wav, save_wav, skipIfNoExec, - skipIfNoSox, - sox_utils, TempDirMixin, TorchaudioTestCase, RequestMixin ) - -@skipIfNoSox -@skipIfNoExec("sox") class TestFunctionalFiltering(TempDirMixin, TorchaudioTestCase, RequestMixin): def run_sox_effect(self, input_file, effect): output_file = self.get_temp_path("expected.wav") diff --git a/test/torchaudio_unittest/transforms/sox_compatibility_test.py b/test/torchaudio_unittest/transforms/sox_compatibility_test.py index 222bb463b3..3460b71bf4 100644 --- a/test/torchaudio_unittest/transforms/sox_compatibility_test.py +++ b/test/torchaudio_unittest/transforms/sox_compatibility_test.py @@ -9,7 +9,6 @@ load_wav, save_wav, skipIfNoExec, - skipIfNoSox, sox_utils, TempDirMixin, TorchaudioTestCase, @@ -17,8 +16,6 @@ ) -@skipIfNoSox -@skipIfNoExec("sox") class TestFunctionalFiltering(TempDirMixin, TorchaudioTestCase, RequestMixin): def run_sox_effect(self, input_file, effect): output_file = self.get_temp_path("expected.wav") diff --git a/test/torchaudio_unittest/utils/sox_utils_test.py b/test/torchaudio_unittest/utils/sox_utils_test.py deleted file mode 100644 index 8b88d966c3..0000000000 --- a/test/torchaudio_unittest/utils/sox_utils_test.py +++ /dev/null @@ -1,46 +0,0 @@ -from torchaudio.utils import sox_utils -from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoSox - - -@skipIfNoSox -class TestSoxUtils(PytorchTestCase): - """Smoke tests for sox_util module""" - - def test_set_seed(self): - """`set_seed` does not crush""" - sox_utils.set_seed(0) - - def test_set_verbosity(self): - """`set_verbosity` does not crush""" - for val in range(6, 0, -1): - sox_utils.set_verbosity(val) - - def test_set_buffer_size(self): - """`set_buffer_size` does not crush""" - sox_utils.set_buffer_size(131072) - # back to default - sox_utils.set_buffer_size(8192) - - def test_set_use_threads(self): - """`set_use_threads` does not crush""" - sox_utils.set_use_threads(True) - # back to default - sox_utils.set_use_threads(False) - - def test_list_effects(self): - """`list_effects` returns the list of available effects""" - effects = sox_utils.list_effects() - # We cannot infer what effects are available, so only check some of them. - assert "highpass" in effects - assert "phaser" in effects - assert "gain" in effects - - def test_list_read_formats(self): - """`list_read_formats` returns the list of supported formats""" - formats = sox_utils.list_read_formats() - assert "wav" in formats - - def test_list_write_formats(self): - """`list_write_formats` returns the list of supported formats""" - formats = sox_utils.list_write_formats() - assert "opus" not in formats diff --git a/tools/setup_helpers/extension.py b/tools/setup_helpers/extension.py index 58f5087854..b322541e36 100644 --- a/tools/setup_helpers/extension.py +++ b/tools/setup_helpers/extension.py @@ -51,13 +51,6 @@ def get_ext_modules(): Extension(name="torchaudio.lib.libtorchaudio", sources=[]), Extension(name="torchaudio.lib._torchaudio", sources=[]), ] - if _BUILD_SOX: - modules.extend( - [ - Extension(name="torchaudio.lib.libtorchaudio_sox", sources=[]), - Extension(name="torchaudio.lib._torchaudio_sox", sources=[]), - ] - ) if _BUILD_CUDA_CTC_DECODER: modules.extend( [