diff --git a/CMakeLists.txt b/CMakeLists.txt index ddc6dc15a2..6fada209fe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -166,10 +166,6 @@ else() endif() add_subdirectory(src/libtorchaudio) -if (BUILD_SOX) - add_subdirectory(third_party/sox) - add_subdirectory(src/libtorchaudio/sox) -endif() if (USE_FFMPEG) if (DEFINED ENV{FFMPEG_ROOT}) add_subdirectory(third_party/ffmpeg/single) diff --git a/docs/source/index.rst b/docs/source/index.rst index 819f72d813..785341c363 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -102,7 +102,6 @@ model implementations and application components. models models.decoder pipelines - sox_effects compliance.kaldi kaldi_io utils diff --git a/docs/source/installation.rst b/docs/source/installation.rst index cb0fa190b8..91136d52dd 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -70,27 +70,6 @@ Optional Dependencies files you installed follow this naming scheme, (and then make sure that they are in one of the directories listed in library search path.) -* `SoX `__ - - Required to use ``backend="sox"`` in `I/O functions <./torchaudio.html#i-o>`__. - - Starting version 2.1, TorchAudio requires separately installed libsox. - - If dynamic linking is causing an issue, you can set the environment variable - ``TORCHAUDIO_USE_SOX=0``, and TorchAudio won't use SoX. - - .. note:: - - TorchAudio looks for a library file with unversioned name, that is ``libsox.so`` - for Linux, and ``libsox.dylib`` for macOS. Some package managers install the library - file with different name. For example, aptitude on Ubuntu installs ``libsox.so.3``. - To have TorchAudio link against it, you can create a symbolic link to it with name - ``libsox.so`` (and put the symlink in a library search path). - - .. note:: - TorchAudio is tested on libsox 14.4.2. (And it is unlikely that other - versions would work.) - * `SoundFile `__ Required to use ``backend="soundfile"`` in `I/O functions <./torchaudio.html#i-o>`__. diff --git a/docs/source/sox_effects.rst b/docs/source/sox_effects.rst deleted file mode 100644 index a8ee260144..0000000000 --- a/docs/source/sox_effects.rst +++ /dev/null @@ -1,34 +0,0 @@ -.. py:module:: torchaudio.sox_effects - -torchaudio.sox_effects -====================== - -.. currentmodule:: torchaudio.sox_effects - -.. warning:: - Starting with version 2.8, we are refactoring TorchAudio to transition it - into a maintenance phase. As a result, the ``sox_effect`` module is - deprecated in 2.8 and will be removed in 2.9. - -Applying effects ----------------- - -Apply SoX effects chain on torch.Tensor or on file and load as torch.Tensor. - -.. autosummary:: - :toctree: generated - :nosignatures: - - apply_effects_tensor - apply_effects_file - -.. minigallery:: torchaudio.sox_effects.apply_effects_tensor - -Utilities ---------- - -.. autosummary:: - :toctree: generated - :nosignatures: - - effect_names diff --git a/docs/source/torchaudio.rst b/docs/source/torchaudio.rst index aa933e84ad..629ffd312a 100644 --- a/docs/source/torchaudio.rst +++ b/docs/source/torchaudio.rst @@ -78,14 +78,6 @@ The following table summarizes the backends. to retrieve the supported codecs. This backend Supports various protocols, such as HTTPS and MP4, and file-like objects. - * - 2 - - SoX - - Linux, macOS - - Use :py:func:`~torchaudio.utils.sox_utils.list_read_formats` and - :py:func:`~torchaudio.utils.sox_utils.list_write_formats` - to retrieve the supported codecs. - - This backend does *not* support file-like objects. * - 3 - SoundFile - Linux, macOS, Windows diff --git a/examples/libtorchaudio/augmentation/CMakeLists.txt b/examples/libtorchaudio/augmentation/CMakeLists.txt deleted file mode 100644 index e9bfece93a..0000000000 --- a/examples/libtorchaudio/augmentation/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -add_executable(augment main.cpp) -target_link_libraries(augment "${TORCH_LIBRARIES}" "${TORCHAUDIO_LIBRARY}") -set_property(TARGET augment PROPERTY CXX_STANDARD 14) diff --git a/examples/libtorchaudio/augmentation/README.md b/examples/libtorchaudio/augmentation/README.md deleted file mode 100644 index 81c58b3bd6..0000000000 --- a/examples/libtorchaudio/augmentation/README.md +++ /dev/null @@ -1,35 +0,0 @@ -# Augmentation - -This example demonstrates how you can use torchaudio's I/O features and augmentations in C++ application. - -**NOTE** -This example uses `"sox_io"` backend, thus does not work on Windows. - -## Steps -### 1. Create augmentation pipeline TorchScript file. - -First, we implement our data process pipeline as a regular Python, and save it as a TorchScript object. -We will load and execute it in our C++ application. The C++ code is found in [`main.cpp`](./main.cpp). - -```python -python create_jittable_pipeline.py \ - --rir-path "../data/rir.wav" \ - --output-path "./pipeline.zip" -``` - -### 2. Build the application - -Please refer to [the top level README.md](../README.md) - -### 3. Run the application - -Now we run the C++ application `augment`, with the TorchScript object we created in Step.1 and an input audio file. - -In [the top level directory](../) - -```bash -input_audio_file="./data/input.wav" -./build/augmentation/augment ./augmentation/pipeline.zip "${input_audio_file}" "output.wav" -``` - -When you give a clean speech file, the output audio sounds like it's a phone conversation. diff --git a/examples/libtorchaudio/augmentation/create_jittable_pipeline.py b/examples/libtorchaudio/augmentation/create_jittable_pipeline.py deleted file mode 100755 index 79f56819fc..0000000000 --- a/examples/libtorchaudio/augmentation/create_jittable_pipeline.py +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env python3 -""" -Create a data preprocess pipeline that can be run with libtorchaudio -""" -import argparse -import os - -import torch -import torchaudio - - -class Pipeline(torch.nn.Module): - """Example audio process pipeline. - - This example load waveform from a file then apply effects and save it to a file. - """ - - def __init__(self, rir_path: str): - super().__init__() - rir, sample_rate = torchaudio.load(rir_path) - self.register_buffer("rir", rir) - self.rir_sample_rate: int = sample_rate - - def forward(self, input_path: str, output_path: str): - torchaudio.sox_effects.init_sox_effects() - - # 1. load audio - waveform, sample_rate = torchaudio.load(input_path) - - # 2. Add background noise - alpha = 0.01 - waveform = alpha * torch.randn_like(waveform) + (1 - alpha) * waveform - - # 3. Reample the RIR filter to much the audio sample rate - rir, _ = torchaudio.sox_effects.apply_effects_tensor( - self.rir, self.rir_sample_rate, effects=[["rate", str(sample_rate)]] - ) - rir = rir / torch.linalg.vector_norm(rir, ord=2) - rir = torch.flip(rir, [1]) - - # 4. Apply RIR filter - waveform = torch.nn.functional.pad(waveform, (rir.shape[1] - 1, 0)) - waveform = torch.nn.functional.conv1d(waveform[None, ...], rir[None, ...])[0] - - # Save - torchaudio.save(output_path, waveform, sample_rate) - - -def _create_jit_pipeline(rir_path, output_path): - module = torch.jit.script(Pipeline(rir_path)) - print("*" * 40) - print("* Pipeline code") - print("*" * 40) - print() - print(module.code) - print("*" * 40) - module.save(output_path) - - -def _get_path(*paths): - return os.path.join(os.path.dirname(__file__), *paths) - - -def _parse_args(): - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument( - "--rir-path", default=_get_path("..", "data", "rir.wav"), help="Audio dara for room impulse response." - ) - parser.add_argument("--output-path", default=_get_path("pipeline.zip"), help="Output JIT file.") - return parser.parse_args() - - -def _main(): - args = _parse_args() - _create_jit_pipeline(args.rir_path, args.output_path) - - -if __name__ == "__main__": - _main() diff --git a/examples/libtorchaudio/augmentation/main.cpp b/examples/libtorchaudio/augmentation/main.cpp deleted file mode 100644 index 921c67972b..0000000000 --- a/examples/libtorchaudio/augmentation/main.cpp +++ /dev/null @@ -1,22 +0,0 @@ -#include - -int main(int argc, char* argv[]) { - if (argc != 4) { - std::cerr << "Usage: " << argv[0] - << " " << std::endl; - return -1; - } - - torch::jit::script::Module module; - std::cout << "Loading module from: " << argv[1] << std::endl; - try { - module = torch::jit::load(argv[1]); - } catch (const c10::Error& error) { - std::cerr << "Failed to load the module:" << error.what() << std::endl; - return -1; - } - - std::cout << "Performing the process ..." << std::endl; - module.forward({c10::IValue(argv[2]), c10::IValue(argv[3])}); - std::cout << "Done." << std::endl; -} diff --git a/examples/source_separation/conv_tasnet/train.py b/examples/source_separation/conv_tasnet/train.py index 133b1f4f5e..72b8f57824 100644 --- a/examples/source_separation/conv_tasnet/train.py +++ b/examples/source_separation/conv_tasnet/train.py @@ -189,8 +189,6 @@ def train(args): _LG.info("%s", args) args.save_dir.mkdir(parents=True, exist_ok=True) - if "sox_io" in torchaudio.list_audio_backends(): - torchaudio.set_audio_backend("sox_io") start_epoch = 1 if args.resume: diff --git a/src/libtorchaudio/sox/CMakeLists.txt b/src/libtorchaudio/sox/CMakeLists.txt deleted file mode 100644 index 5ffe782c82..0000000000 --- a/src/libtorchaudio/sox/CMakeLists.txt +++ /dev/null @@ -1,25 +0,0 @@ -set( - sources - io.cpp - utils.cpp - effects.cpp - effects_chain.cpp - types.cpp - ) -torchaudio_library( - libtorchaudio_sox - "${sources}" - "" - "torch;sox" - "" - ) - -if (BUILD_TORCHAUDIO_PYTHON_EXTENSION) - torchaudio_extension( - _torchaudio_sox - "pybind/pybind.cpp;" - "" - "libtorchaudio_sox" - "" - ) -endif() diff --git a/src/libtorchaudio/sox/effects.cpp b/src/libtorchaudio/sox/effects.cpp deleted file mode 100644 index 947c04e3fc..0000000000 --- a/src/libtorchaudio/sox/effects.cpp +++ /dev/null @@ -1,133 +0,0 @@ -#include -#include -#include -#include - -namespace torchaudio::sox { -namespace { - -enum SoxEffectsResourceState { NotInitialized, Initialized, ShutDown }; -SoxEffectsResourceState SOX_RESOURCE_STATE = NotInitialized; -std::mutex SOX_RESOUCE_STATE_MUTEX; - -} // namespace - -void initialize_sox_effects() { - const std::lock_guard lock(SOX_RESOUCE_STATE_MUTEX); - - switch (SOX_RESOURCE_STATE) { - case NotInitialized: - TORCH_CHECK( - sox_init() == SOX_SUCCESS, "Failed to initialize sox effects."); - SOX_RESOURCE_STATE = Initialized; - break; - case Initialized: - break; - case ShutDown: - TORCH_CHECK( - false, "SoX Effects has been shut down. Cannot initialize again."); - } -}; - -void shutdown_sox_effects() { - const std::lock_guard lock(SOX_RESOUCE_STATE_MUTEX); - - switch (SOX_RESOURCE_STATE) { - case NotInitialized: - TORCH_CHECK(false, "SoX Effects is not initialized. Cannot shutdown."); - case Initialized: - TORCH_CHECK( - sox_quit() == SOX_SUCCESS, "Failed to initialize sox effects."); - SOX_RESOURCE_STATE = ShutDown; - break; - case ShutDown: - break; - } -} - -auto apply_effects_tensor( - torch::Tensor waveform, - int64_t sample_rate, - const std::vector>& effects, - bool channels_first) -> std::tuple { - validate_input_tensor(waveform); - - // Create SoxEffectsChain - const auto dtype = waveform.dtype(); - SoxEffectsChain chain( - /*input_encoding=*/get_tensor_encodinginfo(dtype), - /*output_encoding=*/get_tensor_encodinginfo(dtype)); - - // Prepare output buffer - std::vector out_buffer; - out_buffer.reserve(waveform.numel()); - - // Build and run effects chain - chain.addInputTensor(&waveform, sample_rate, channels_first); - for (const auto& effect : effects) { - chain.addEffect(effect); - } - chain.addOutputBuffer(&out_buffer); - chain.run(); - - // Create tensor from buffer - auto out_tensor = convert_to_tensor( - /*buffer=*/out_buffer.data(), - /*num_samples=*/out_buffer.size(), - /*num_channels=*/chain.getOutputNumChannels(), - dtype, - /*normalize=*/false, - channels_first); - - return std::tuple( - out_tensor, chain.getOutputSampleRate()); -} - -auto apply_effects_file( - const std::string& path, - const std::vector>& effects, - std::optional normalize, - std::optional channels_first, - const std::optional& format) - -> std::tuple { - // Open input file - SoxFormat sf(sox_open_read( - path.c_str(), - /*signal=*/nullptr, - /*encoding=*/nullptr, - /*filetype=*/format.has_value() ? format.value().c_str() : nullptr)); - - validate_input_file(sf, path); - - const auto dtype = get_dtype(sf->encoding.encoding, sf->signal.precision); - - // Prepare output - std::vector out_buffer; - out_buffer.reserve(sf->signal.length); - - // Create and run SoxEffectsChain - SoxEffectsChain chain( - /*input_encoding=*/sf->encoding, - /*output_encoding=*/get_tensor_encodinginfo(dtype)); - - chain.addInputFile(sf); - for (const auto& effect : effects) { - chain.addEffect(effect); - } - chain.addOutputBuffer(&out_buffer); - chain.run(); - - // Create tensor from buffer - bool channels_first_ = channels_first.value_or(true); - auto tensor = convert_to_tensor( - /*buffer=*/out_buffer.data(), - /*num_samples=*/out_buffer.size(), - /*num_channels=*/chain.getOutputNumChannels(), - dtype, - normalize.value_or(true), - channels_first_); - - return std::tuple( - tensor, chain.getOutputSampleRate()); -} -} // namespace torchaudio::sox diff --git a/src/libtorchaudio/sox/effects.h b/src/libtorchaudio/sox/effects.h deleted file mode 100644 index 8b56427c1e..0000000000 --- a/src/libtorchaudio/sox/effects.h +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef TORCHAUDIO_SOX_EFFECTS_H -#define TORCHAUDIO_SOX_EFFECTS_H - -#include -#include - -namespace torchaudio::sox { - -void initialize_sox_effects(); - -void shutdown_sox_effects(); - -auto apply_effects_tensor( - torch::Tensor waveform, - int64_t sample_rate, - const std::vector>& effects, - bool channels_first) -> std::tuple; - -auto apply_effects_file( - const std::string& path, - const std::vector>& effects, - std::optional normalize, - std::optional channels_first, - const std::optional& format) - -> std::tuple; - -} // namespace torchaudio::sox - -#endif diff --git a/src/libtorchaudio/sox/effects_chain.cpp b/src/libtorchaudio/sox/effects_chain.cpp deleted file mode 100644 index 7f6109a343..0000000000 --- a/src/libtorchaudio/sox/effects_chain.cpp +++ /dev/null @@ -1,301 +0,0 @@ -#include -#include -#include "c10/util/Exception.h" - -using namespace torch::indexing; - -namespace torchaudio::sox { - -namespace { - -/// helper classes for passing the location of input tensor and output buffer -/// -/// drain/flow callback functions require plaing C style function signature and -/// the way to pass extra data is to attach data to sox_effect_t::priv pointer. -/// The following structs will be assigned to sox_effect_t::priv pointer which -/// gives sox_effect_t an access to input Tensor and output buffer object. -struct TensorInputPriv { - size_t index; - torch::Tensor* waveform; - int64_t sample_rate; - bool channels_first; -}; -struct TensorOutputPriv { - std::vector* buffer; -}; -struct FileOutputPriv { - sox_format_t* sf; -}; - -/// Callback function to feed Tensor data to SoxEffectChain. -int tensor_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp) { - // Retrieve the input Tensor and current index - auto priv = static_cast(effp->priv); - auto index = priv->index; - auto tensor = *(priv->waveform); - auto num_channels = effp->out_signal.channels; - - // Adjust the number of samples to read - const size_t num_samples = tensor.numel(); - if (index + *osamp > num_samples) { - *osamp = num_samples - index; - } - // Ensure that it's a multiple of the number of channels - *osamp -= *osamp % num_channels; - - // Slice the input Tensor - auto chunk = [&]() { - auto i_frame = index / num_channels; - auto num_frames = *osamp / num_channels; - auto t = (priv->channels_first) - ? tensor.index({Slice(), Slice(i_frame, i_frame + num_frames)}).t() - : tensor.index({Slice(i_frame, i_frame + num_frames), Slice()}); - return t.reshape({-1}); - }(); - - // Convert to sox_sample_t (int32_t) - switch (chunk.dtype().toScalarType()) { - case c10::ScalarType::Float: { - // Need to convert to 64-bit precision so that - // values around INT32_MIN/MAX are handled correctly. - chunk = chunk.to(c10::ScalarType::Double); - chunk *= 2147483648.; - chunk.clamp_(INT32_MIN, INT32_MAX); - chunk = chunk.to(c10::ScalarType::Int); - break; - } - case c10::ScalarType::Int: { - break; - } - case c10::ScalarType::Short: { - chunk = chunk.to(c10::ScalarType::Int); - chunk *= 65536; - break; - } - case c10::ScalarType::Byte: { - chunk = chunk.to(c10::ScalarType::Int); - chunk -= 128; - chunk *= 16777216; - break; - } - default: - TORCH_CHECK(false, "Unexpected dtype: ", chunk.dtype()); - } - // Write to buffer - chunk = chunk.contiguous(); - memcpy(obuf, chunk.data_ptr(), *osamp * 4); - priv->index += *osamp; - return (priv->index == num_samples) ? SOX_EOF : SOX_SUCCESS; -} - -/// Callback function to fetch data from SoxEffectChain. -int tensor_output_flow( - sox_effect_t* effp, - sox_sample_t const* ibuf, - sox_sample_t* obuf LSX_UNUSED, - size_t* isamp, - size_t* osamp) { - *osamp = 0; - // Get output buffer - auto out_buffer = static_cast(effp->priv)->buffer; - // Append at the end - out_buffer->insert(out_buffer->end(), ibuf, ibuf + *isamp); - return SOX_SUCCESS; -} - -int file_output_flow( - sox_effect_t* effp, - sox_sample_t const* ibuf, - sox_sample_t* obuf LSX_UNUSED, - size_t* isamp, - size_t* osamp) { - *osamp = 0; - if (*isamp) { - auto sf = static_cast(effp->priv)->sf; - if (sox_write(sf, ibuf, *isamp) != *isamp) { - TORCH_CHECK( - !sf->sox_errno, - sf->sox_errstr, - " ", - sox_strerror(sf->sox_errno), - " ", - sf->filename); - return SOX_EOF; - } - } - return SOX_SUCCESS; -} - -sox_effect_handler_t* get_tensor_input_handler() { - static sox_effect_handler_t handler{ - /*name=*/"input_tensor", - /*usage=*/nullptr, - /*flags=*/SOX_EFF_MCHAN, - /*getopts=*/nullptr, - /*start=*/nullptr, - /*flow=*/nullptr, - /*drain=*/tensor_input_drain, - /*stop=*/nullptr, - /*kill=*/nullptr, - /*priv_size=*/sizeof(TensorInputPriv)}; - return &handler; -} - -sox_effect_handler_t* get_tensor_output_handler() { - static sox_effect_handler_t handler{ - /*name=*/"output_tensor", - /*usage=*/nullptr, - /*flags=*/SOX_EFF_MCHAN, - /*getopts=*/nullptr, - /*start=*/nullptr, - /*flow=*/tensor_output_flow, - /*drain=*/nullptr, - /*stop=*/nullptr, - /*kill=*/nullptr, - /*priv_size=*/sizeof(TensorOutputPriv)}; - return &handler; -} - -sox_effect_handler_t* get_file_output_handler() { - static sox_effect_handler_t handler{ - /*name=*/"output_file", - /*usage=*/nullptr, - /*flags=*/SOX_EFF_MCHAN, - /*getopts=*/nullptr, - /*start=*/nullptr, - /*flow=*/file_output_flow, - /*drain=*/nullptr, - /*stop=*/nullptr, - /*kill=*/nullptr, - /*priv_size=*/sizeof(FileOutputPriv)}; - return &handler; -} - -} // namespace - -SoxEffect::SoxEffect(sox_effect_t* se) noexcept : se_(se) {} - -SoxEffect::~SoxEffect() { - if (se_ != nullptr) { - free(se_); - } -} - -SoxEffect::operator sox_effect_t*() const { - return se_; -} - -auto SoxEffect::operator->() noexcept -> sox_effect_t* { - return se_; -} - -SoxEffectsChain::SoxEffectsChain( - sox_encodinginfo_t input_encoding, - sox_encodinginfo_t output_encoding) - : in_enc_(input_encoding), - out_enc_(output_encoding), - in_sig_(), - interm_sig_(), - out_sig_(), - sec_(sox_create_effects_chain(&in_enc_, &out_enc_)) { - TORCH_CHECK(sec_, "Failed to create effect chain."); -} - -SoxEffectsChain::~SoxEffectsChain() { - if (sec_ != nullptr) { - sox_delete_effects_chain(sec_); - } -} - -void SoxEffectsChain::run() { - sox_flow_effects(sec_, nullptr, nullptr); -} - -void SoxEffectsChain::addInputTensor( - torch::Tensor* waveform, - int64_t sample_rate, - bool channels_first) { - in_sig_ = get_signalinfo(waveform, sample_rate, "wav", channels_first); - interm_sig_ = in_sig_; - SoxEffect e(sox_create_effect(get_tensor_input_handler())); - auto priv = static_cast(e->priv); - priv->index = 0; - priv->waveform = waveform; - priv->sample_rate = sample_rate; - priv->channels_first = channels_first; - TORCH_CHECK( - sox_add_effect(sec_, e, &interm_sig_, &in_sig_) == SOX_SUCCESS, - "Internal Error: Failed to add effect: input_tensor"); -} - -void SoxEffectsChain::addOutputBuffer( - std::vector* output_buffer) { - SoxEffect e(sox_create_effect(get_tensor_output_handler())); - static_cast(e->priv)->buffer = output_buffer; - TORCH_CHECK( - sox_add_effect(sec_, e, &interm_sig_, &in_sig_) == SOX_SUCCESS, - "Internal Error: Failed to add effect: output_tensor"); -} - -void SoxEffectsChain::addInputFile(sox_format_t* sf) { - in_sig_ = sf->signal; - interm_sig_ = in_sig_; - SoxEffect e(sox_create_effect(sox_find_effect("input"))); - char* opts[] = {(char*)sf}; - sox_effect_options(e, 1, opts); - TORCH_CHECK( - sox_add_effect(sec_, e, &interm_sig_, &in_sig_) == SOX_SUCCESS, - "Internal Error: Failed to add effect: input ", - sf->filename); -} - -void SoxEffectsChain::addOutputFile(sox_format_t* sf) { - out_sig_ = sf->signal; - SoxEffect e(sox_create_effect(get_file_output_handler())); - static_cast(e->priv)->sf = sf; - TORCH_CHECK( - sox_add_effect(sec_, e, &interm_sig_, &out_sig_) == SOX_SUCCESS, - "Internal Error: Failed to add effect: output ", - sf->filename); -} - -void SoxEffectsChain::addEffect(const std::vector& effect) { - const auto num_args = effect.size(); - TORCH_CHECK(num_args != 0, "Invalid argument: empty effect."); - const auto name = effect[0]; - TORCH_CHECK( - UNSUPPORTED_EFFECTS.find(name) == UNSUPPORTED_EFFECTS.end(), - "Unsupported effect: ", - name) - - auto returned_effect = sox_find_effect(name.c_str()); - TORCH_CHECK(returned_effect, "Unsupported effect: ", name) - - SoxEffect e(sox_create_effect(returned_effect)); - const auto num_options = num_args - 1; - - std::vector opts; - for (size_t i = 1; i < num_args; ++i) { - opts.push_back((char*)effect[i].c_str()); - } - TORCH_CHECK( - sox_effect_options(e, num_options, num_options ? opts.data() : nullptr) == - SOX_SUCCESS, - "Invalid effect option: ", - c10::Join(" ", effect)) - TORCH_CHECK( - sox_add_effect(sec_, e, &interm_sig_, &in_sig_) == SOX_SUCCESS, - "Internal Error: Failed to add effect: \"", - c10::Join(" ", effect), - "\""); -} - -int64_t SoxEffectsChain::getOutputNumChannels() { - return interm_sig_.channels; -} - -int64_t SoxEffectsChain::getOutputSampleRate() { - return interm_sig_.rate; -} - -} // namespace torchaudio::sox diff --git a/src/libtorchaudio/sox/effects_chain.h b/src/libtorchaudio/sox/effects_chain.h deleted file mode 100644 index e6a892b5e8..0000000000 --- a/src/libtorchaudio/sox/effects_chain.h +++ /dev/null @@ -1,61 +0,0 @@ -#ifndef TORCHAUDIO_SOX_EFFECTS_CHAIN_H -#define TORCHAUDIO_SOX_EFFECTS_CHAIN_H - -#include -#include - -namespace torchaudio::sox { - -// Helper struct to safely close sox_effect_t* pointer returned by -// sox_create_effect - -struct SoxEffect { - explicit SoxEffect(sox_effect_t* se) noexcept; - SoxEffect(const SoxEffect& other) = delete; - SoxEffect(SoxEffect&& other) = delete; - auto operator=(const SoxEffect& other) -> SoxEffect& = delete; - auto operator=(SoxEffect&& other) -> SoxEffect& = delete; - ~SoxEffect(); - operator sox_effect_t*() const; - auto operator->() noexcept -> sox_effect_t*; - - private: - sox_effect_t* se_; -}; - -// Helper struct to safely close sox_effects_chain_t with handy methods -class SoxEffectsChain { - const sox_encodinginfo_t in_enc_; - const sox_encodinginfo_t out_enc_; - - protected: - sox_signalinfo_t in_sig_; - sox_signalinfo_t interm_sig_; - sox_signalinfo_t out_sig_; - sox_effects_chain_t* sec_; - - public: - explicit SoxEffectsChain( - sox_encodinginfo_t input_encoding, - sox_encodinginfo_t output_encoding); - SoxEffectsChain(const SoxEffectsChain& other) = delete; - SoxEffectsChain(SoxEffectsChain&& other) = delete; - SoxEffectsChain& operator=(const SoxEffectsChain& other) = delete; - SoxEffectsChain& operator=(SoxEffectsChain&& other) = delete; - ~SoxEffectsChain(); - void run(); - void addInputTensor( - torch::Tensor* waveform, - int64_t sample_rate, - bool channels_first); - void addInputFile(sox_format_t* sf); - void addOutputBuffer(std::vector* output_buffer); - void addOutputFile(sox_format_t* sf); - void addEffect(const std::vector& effect); - int64_t getOutputNumChannels(); - int64_t getOutputSampleRate(); -}; - -} // namespace torchaudio::sox - -#endif diff --git a/src/libtorchaudio/sox/io.cpp b/src/libtorchaudio/sox/io.cpp deleted file mode 100644 index 474726ad1c..0000000000 --- a/src/libtorchaudio/sox/io.cpp +++ /dev/null @@ -1,128 +0,0 @@ -#include -#include -#include -#include -#include - -using namespace torch::indexing; - -namespace torchaudio::sox { - -std::tuple get_info_file( - const std::string& path, - const std::optional& format) { - SoxFormat sf(sox_open_read( - path.c_str(), - /*signal=*/nullptr, - /*encoding=*/nullptr, - /*filetype=*/format.has_value() ? format.value().c_str() : nullptr)); - - validate_input_file(sf, path); - - return std::make_tuple( - static_cast(sf->signal.rate), - static_cast(sf->signal.length / sf->signal.channels), - static_cast(sf->signal.channels), - static_cast(sf->encoding.bits_per_sample), - get_encoding(sf->encoding.encoding)); -} - -std::vector> get_effects( - const std::optional& frame_offset, - const std::optional& num_frames) { - const auto offset = frame_offset.value_or(0); - TORCH_CHECK( - offset >= 0, - "Invalid argument: frame_offset must be non-negative. Found: ", - offset); - const auto frames = num_frames.value_or(-1); - TORCH_CHECK( - frames > 0 || frames == -1, - "Invalid argument: num_frames must be -1 or greater than 0."); - - std::vector> effects; - if (frames != -1) { - std::ostringstream os_offset, os_frames; - os_offset << offset << "s"; - os_frames << "+" << frames << "s"; - effects.emplace_back( - std::vector{"trim", os_offset.str(), os_frames.str()}); - } else if (offset != 0) { - std::ostringstream os_offset; - os_offset << offset << "s"; - effects.emplace_back(std::vector{"trim", os_offset.str()}); - } - return effects; -} - -std::tuple load_audio_file( - const std::string& path, - const std::optional& frame_offset, - const std::optional& num_frames, - std::optional normalize, - std::optional channels_first, - const std::optional& format) { - auto effects = get_effects(frame_offset, num_frames); - return apply_effects_file(path, effects, normalize, channels_first, format); -} - -void save_audio_file( - const std::string& path, - torch::Tensor tensor, - int64_t sample_rate, - bool channels_first, - std::optional compression, - std::optional format, - std::optional encoding, - std::optional bits_per_sample) { - validate_input_tensor(tensor); - - const auto filetype = [&]() { - if (format.has_value()) { - return format.value(); - } - return get_filetype(path); - }(); - - if (filetype == "amr-nb") { - const auto num_channels = tensor.size(channels_first ? 0 : 1); - TORCH_CHECK( - num_channels == 1, "amr-nb format only supports single channel audio."); - } else if (filetype == "htk") { - const auto num_channels = tensor.size(channels_first ? 0 : 1); - TORCH_CHECK( - num_channels == 1, "htk format only supports single channel audio."); - } else if (filetype == "gsm") { - const auto num_channels = tensor.size(channels_first ? 0 : 1); - TORCH_CHECK( - num_channels == 1, "gsm format only supports single channel audio."); - TORCH_CHECK( - sample_rate == 8000, - "gsm format only supports a sampling rate of 8kHz."); - } - const auto signal_info = - get_signalinfo(&tensor, sample_rate, filetype, channels_first); - const auto encoding_info = get_encodinginfo_for_save( - filetype, tensor.dtype(), compression, encoding, bits_per_sample); - - SoxFormat sf(sox_open_write( - path.c_str(), - &signal_info, - &encoding_info, - /*filetype=*/filetype.c_str(), - /*oob=*/nullptr, - /*overwrite_permitted=*/nullptr)); - - TORCH_CHECK( - static_cast(sf) != nullptr, - "Error saving audio file: failed to open file ", - path); - - SoxEffectsChain chain( - /*input_encoding=*/get_tensor_encodinginfo(tensor.dtype()), - /*output_encoding=*/sf->encoding); - chain.addInputTensor(&tensor, sample_rate, channels_first); - chain.addOutputFile(sf); - chain.run(); -} -} // namespace torchaudio::sox diff --git a/src/libtorchaudio/sox/io.h b/src/libtorchaudio/sox/io.h deleted file mode 100644 index b011ef59be..0000000000 --- a/src/libtorchaudio/sox/io.h +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef TORCHAUDIO_SOX_IO_H -#define TORCHAUDIO_SOX_IO_H - -#include -#include - -namespace torchaudio::sox { - -auto get_effects( - const std::optional& frame_offset, - const std::optional& num_frames) - -> std::vector>; - -std::tuple get_info_file( - const std::string& path, - const std::optional& format); - -std::tuple load_audio_file( - const std::string& path, - const std::optional& frame_offset, - const std::optional& num_frames, - std::optional normalize, - std::optional channels_first, - const std::optional& format); - -void save_audio_file( - const std::string& path, - torch::Tensor tensor, - int64_t sample_rate, - bool channels_first, - std::optional compression, - std::optional format, - std::optional encoding, - std::optional bits_per_sample); - -} // namespace torchaudio::sox - -#endif diff --git a/src/libtorchaudio/sox/pybind/pybind.cpp b/src/libtorchaudio/sox/pybind/pybind.cpp deleted file mode 100644 index bd9c82c349..0000000000 --- a/src/libtorchaudio/sox/pybind/pybind.cpp +++ /dev/null @@ -1,39 +0,0 @@ -#include -#include -#include -#include - -namespace torchaudio { -namespace sox { -namespace { - -TORCH_LIBRARY(torchaudio_sox, m) { - m.def("torchaudio_sox::get_info", &get_info_file); - m.def("torchaudio_sox::load_audio_file", &load_audio_file); - m.def("torchaudio_sox::save_audio_file", &save_audio_file); - m.def("torchaudio_sox::initialize_sox_effects", &initialize_sox_effects); - m.def("torchaudio_sox::shutdown_sox_effects", &shutdown_sox_effects); - m.def("torchaudio_sox::apply_effects_tensor", &apply_effects_tensor); - m.def("torchaudio_sox::apply_effects_file", &apply_effects_file); -} - -PYBIND11_MODULE(_torchaudio_sox, m) { - m.def("set_seed", &set_seed, "Set random seed."); - m.def("set_verbosity", &set_verbosity, "Set verbosity."); - m.def("set_use_threads", &set_use_threads, "Set threading."); - m.def("set_buffer_size", &set_buffer_size, "Set buffer size."); - m.def("get_buffer_size", &get_buffer_size, "Get buffer size."); - m.def("list_effects", &list_effects, "List available effects."); - m.def( - "list_read_formats", - &list_read_formats, - "List supported formats for decoding."); - m.def( - "list_write_formats", - &list_write_formats, - "List supported formats for encoding."); -} - -} // namespace -} // namespace sox -} // namespace torchaudio diff --git a/src/libtorchaudio/sox/types.cpp b/src/libtorchaudio/sox/types.cpp deleted file mode 100644 index 12bd070105..0000000000 --- a/src/libtorchaudio/sox/types.cpp +++ /dev/null @@ -1,148 +0,0 @@ -#include - -namespace torchaudio::sox { - -Format get_format_from_string(const std::string& format) { - if (format == "wav") { - return Format::WAV; - } - if (format == "mp3") { - return Format::MP3; - } - if (format == "flac") { - return Format::FLAC; - } - if (format == "ogg" || format == "vorbis") { - return Format::VORBIS; - } - if (format == "amr-nb") { - return Format::AMR_NB; - } - if (format == "amr-wb") { - return Format::AMR_WB; - } - if (format == "amb") { - return Format::AMB; - } - if (format == "sph") { - return Format::SPHERE; - } - if (format == "htk") { - return Format::HTK; - } - if (format == "gsm") { - return Format::GSM; - } - TORCH_CHECK(false, "Internal Error: unexpected format value: ", format); -} - -std::string to_string(Encoding v) { - switch (v) { - case Encoding::UNKNOWN: - return "UNKNOWN"; - case Encoding::PCM_SIGNED: - return "PCM_S"; - case Encoding::PCM_UNSIGNED: - return "PCM_U"; - case Encoding::PCM_FLOAT: - return "PCM_F"; - case Encoding::FLAC: - return "FLAC"; - case Encoding::ULAW: - return "ULAW"; - case Encoding::ALAW: - return "ALAW"; - case Encoding::MP3: - return "MP3"; - case Encoding::VORBIS: - return "VORBIS"; - case Encoding::AMR_WB: - return "AMR_WB"; - case Encoding::AMR_NB: - return "AMR_NB"; - case Encoding::OPUS: - return "OPUS"; - default: - TORCH_CHECK(false, "Internal Error: unexpected encoding."); - } -} - -Encoding get_encoding_from_option(const std::optional& encoding) { - if (!encoding.has_value()) { - return Encoding::NOT_PROVIDED; - } - std::string v = encoding.value(); - if (v == "PCM_S") { - return Encoding::PCM_SIGNED; - } - if (v == "PCM_U") { - return Encoding::PCM_UNSIGNED; - } - if (v == "PCM_F") { - return Encoding::PCM_FLOAT; - } - if (v == "ULAW") { - return Encoding::ULAW; - } - if (v == "ALAW") { - return Encoding::ALAW; - } - TORCH_CHECK(false, "Internal Error: unexpected encoding value: ", v); -} - -BitDepth get_bit_depth_from_option(const std::optional& bit_depth) { - if (!bit_depth.has_value()) { - return BitDepth::NOT_PROVIDED; - } - int64_t v = bit_depth.value(); - switch (v) { - case 8: - return BitDepth::B8; - case 16: - return BitDepth::B16; - case 24: - return BitDepth::B24; - case 32: - return BitDepth::B32; - case 64: - return BitDepth::B64; - default: { - TORCH_CHECK(false, "Internal Error: unexpected bit depth value: ", v); - } - } -} - -std::string get_encoding(sox_encoding_t encoding) { - switch (encoding) { - case SOX_ENCODING_UNKNOWN: - return "UNKNOWN"; - case SOX_ENCODING_SIGN2: - return "PCM_S"; - case SOX_ENCODING_UNSIGNED: - return "PCM_U"; - case SOX_ENCODING_FLOAT: - return "PCM_F"; - case SOX_ENCODING_FLAC: - return "FLAC"; - case SOX_ENCODING_ULAW: - return "ULAW"; - case SOX_ENCODING_ALAW: - return "ALAW"; - case SOX_ENCODING_MP3: - return "MP3"; - case SOX_ENCODING_VORBIS: - return "VORBIS"; - case SOX_ENCODING_AMR_WB: - return "AMR_WB"; - case SOX_ENCODING_AMR_NB: - return "AMR_NB"; - case SOX_ENCODING_OPUS: - return "OPUS"; - case SOX_ENCODING_GSM: - return "GSM"; - default: - return "UNKNOWN"; - } -} - -} // namespace torchaudio::sox diff --git a/src/libtorchaudio/sox/types.h b/src/libtorchaudio/sox/types.h deleted file mode 100644 index 714d303313..0000000000 --- a/src/libtorchaudio/sox/types.h +++ /dev/null @@ -1,58 +0,0 @@ -#ifndef TORCHAUDIO_SOX_TYPES_H -#define TORCHAUDIO_SOX_TYPES_H - -#include -#include - -namespace torchaudio::sox { - -enum class Format { - WAV, - MP3, - FLAC, - VORBIS, - AMR_NB, - AMR_WB, - AMB, - SPHERE, - GSM, - HTK, -}; - -Format get_format_from_string(const std::string& format); - -enum class Encoding { - NOT_PROVIDED, - UNKNOWN, - PCM_SIGNED, - PCM_UNSIGNED, - PCM_FLOAT, - FLAC, - ULAW, - ALAW, - MP3, - VORBIS, - AMR_WB, - AMR_NB, - OPUS, -}; - -std::string to_string(Encoding v); -Encoding get_encoding_from_option(const std::optional& encoding); - -enum class BitDepth : unsigned { - NOT_PROVIDED = 0, - B8 = 8, - B16 = 16, - B24 = 24, - B32 = 32, - B64 = 64, -}; - -BitDepth get_bit_depth_from_option(const std::optional& bit_depth); - -std::string get_encoding(sox_encoding_t encoding); - -} // namespace torchaudio::sox - -#endif diff --git a/src/libtorchaudio/sox/utils.cpp b/src/libtorchaudio/sox/utils.cpp deleted file mode 100644 index 94748c5209..0000000000 --- a/src/libtorchaudio/sox/utils.cpp +++ /dev/null @@ -1,509 +0,0 @@ -#include -#include -#include -#include - -namespace torchaudio::sox { - -const std::unordered_set UNSUPPORTED_EFFECTS{ - "input", - "output", - "spectrogram", - "noiseprof", - "noisered", - "splice"}; - -void set_seed(const int64_t seed) { - sox_get_globals()->ranqd1 = static_cast(seed); -} - -void set_verbosity(const int64_t verbosity) { - sox_get_globals()->verbosity = static_cast(verbosity); -} - -void set_use_threads(const bool use_threads) { - sox_get_globals()->use_threads = static_cast(use_threads); -} - -void set_buffer_size(const int64_t buffer_size) { - sox_get_globals()->bufsiz = static_cast(buffer_size); -} - -int64_t get_buffer_size() { - return sox_get_globals()->bufsiz; -} - -std::vector> list_effects() { - std::vector> effects; - for (const sox_effect_fn_t* fns = sox_get_effect_fns(); *fns; ++fns) { - const sox_effect_handler_t* handler = (*fns)(); - if (handler && handler->name) { - if (UNSUPPORTED_EFFECTS.find(handler->name) == - UNSUPPORTED_EFFECTS.end()) { - effects.emplace_back(std::vector{ - handler->name, - handler->usage ? std::string(handler->usage) : std::string("")}); - } - } - } - return effects; -} - -std::vector list_write_formats() { - std::vector formats; - for (const sox_format_tab_t* fns = sox_get_format_fns(); fns->fn; ++fns) { - const sox_format_handler_t* handler = fns->fn(); - for (const char* const* names = handler->names; *names; ++names) { - if (!strchr(*names, '/') && handler->write) { - formats.emplace_back(*names); - } - } - } - return formats; -} - -std::vector list_read_formats() { - std::vector formats; - for (const sox_format_tab_t* fns = sox_get_format_fns(); fns->fn; ++fns) { - const sox_format_handler_t* handler = fns->fn(); - for (const char* const* names = handler->names; *names; ++names) { - if (!strchr(*names, '/') && handler->read) { - formats.emplace_back(*names); - } - } - } - return formats; -} - -SoxFormat::SoxFormat(sox_format_t* fd) noexcept : fd_(fd) {} -SoxFormat::~SoxFormat() { - close(); -} - -sox_format_t* SoxFormat::operator->() const noexcept { - return fd_; -} -SoxFormat::operator sox_format_t*() const noexcept { - return fd_; -} - -void SoxFormat::close() { - if (fd_ != nullptr) { - sox_close(fd_); - fd_ = nullptr; - } -} - -void validate_input_file(const SoxFormat& sf, const std::string& path) { - TORCH_CHECK( - static_cast(sf) != nullptr, - "Error loading audio file: failed to open file " + path); - TORCH_CHECK( - sf->encoding.encoding != SOX_ENCODING_UNKNOWN, - "Error loading audio file: unknown encoding."); -} - -void validate_input_tensor(const torch::Tensor& tensor) { - TORCH_CHECK(tensor.device().is_cpu(), "Input tensor has to be on CPU."); - - TORCH_CHECK(tensor.ndimension() == 2, "Input tensor has to be 2D."); - - switch (tensor.dtype().toScalarType()) { - case c10::ScalarType::Byte: - case c10::ScalarType::Short: - case c10::ScalarType::Int: - case c10::ScalarType::Float: - break; - default: - TORCH_CHECK( - false, - "Input tensor has to be one of float32, int32, int16 or uint8 type."); - } -} - -caffe2::TypeMeta get_dtype( - const sox_encoding_t encoding, - const unsigned precision) { - const auto dtype = [&]() { - switch (encoding) { - case SOX_ENCODING_UNSIGNED: // 8-bit PCM WAV - return torch::kUInt8; - case SOX_ENCODING_SIGN2: // 16-bit, 24-bit, or 32-bit PCM WAV - switch (precision) { - case 16: - return torch::kInt16; - case 24: // Cast 24-bit to 32-bit. - case 32: - return torch::kInt32; - default: - TORCH_CHECK( - false, - "Only 16, 24, and 32 bits are supported for signed PCM."); - } - default: - // default to float32 for the other formats, including - // 32-bit flaoting-point WAV, - // MP3, - // FLAC, - // VORBIS etc... - return torch::kFloat32; - } - }(); - return c10::scalarTypeToTypeMeta(dtype); -} - -torch::Tensor convert_to_tensor( - sox_sample_t* buffer, - const int32_t num_samples, - const int32_t num_channels, - const caffe2::TypeMeta dtype, - const bool normalize, - const bool channels_first) { - torch::Tensor t; - uint64_t dummy = 0; - SOX_SAMPLE_LOCALS; - if (normalize || dtype == torch::kFloat32) { - t = torch::empty( - {num_samples / num_channels, num_channels}, torch::kFloat32); - auto ptr = t.data_ptr(); - for (int32_t i = 0; i < num_samples; ++i) { - ptr[i] = SOX_SAMPLE_TO_FLOAT_32BIT(buffer[i], dummy); - } - } else if (dtype == torch::kInt32) { - t = torch::from_blob( - buffer, {num_samples / num_channels, num_channels}, torch::kInt32) - .clone(); - } else if (dtype == torch::kInt16) { - t = torch::empty({num_samples / num_channels, num_channels}, torch::kInt16); - auto ptr = t.data_ptr(); - for (int32_t i = 0; i < num_samples; ++i) { - ptr[i] = SOX_SAMPLE_TO_SIGNED_16BIT(buffer[i], dummy); - } - } else if (dtype == torch::kUInt8) { - t = torch::empty({num_samples / num_channels, num_channels}, torch::kUInt8); - auto ptr = t.data_ptr(); - for (int32_t i = 0; i < num_samples; ++i) { - ptr[i] = SOX_SAMPLE_TO_UNSIGNED_8BIT(buffer[i], dummy); - } - } else { - TORCH_CHECK(false, "Unsupported dtype: ", dtype); - } - if (channels_first) { - t = t.transpose(1, 0); - } - return t.contiguous(); -} - -const std::string get_filetype(const std::string& path) { - std::string ext = path.substr(path.find_last_of('.') + 1); - std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower); - return ext; -} - -namespace { - -std::tuple get_save_encoding_for_wav( - const std::string& format, - caffe2::TypeMeta dtype, - const Encoding& encoding, - const BitDepth& bits_per_sample) { - switch (encoding) { - case Encoding::NOT_PROVIDED: - switch (bits_per_sample) { - case BitDepth::NOT_PROVIDED: - switch (dtype.toScalarType()) { - case c10::ScalarType::Float: - return std::make_tuple<>(SOX_ENCODING_FLOAT, 32); - case c10::ScalarType::Int: - return std::make_tuple<>(SOX_ENCODING_SIGN2, 32); - case c10::ScalarType::Short: - return std::make_tuple<>(SOX_ENCODING_SIGN2, 16); - case c10::ScalarType::Byte: - return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8); - default: - TORCH_CHECK(false, "Internal Error: Unexpected dtype: ", dtype); - } - case BitDepth::B8: - return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8); - default: - return std::make_tuple<>( - SOX_ENCODING_SIGN2, static_cast(bits_per_sample)); - } - case Encoding::PCM_SIGNED: - switch (bits_per_sample) { - case BitDepth::NOT_PROVIDED: - return std::make_tuple<>(SOX_ENCODING_SIGN2, 16); - case BitDepth::B8: - TORCH_CHECK( - false, format, " does not support 8-bit signed PCM encoding."); - default: - return std::make_tuple<>( - SOX_ENCODING_SIGN2, static_cast(bits_per_sample)); - } - case Encoding::PCM_UNSIGNED: - switch (bits_per_sample) { - case BitDepth::NOT_PROVIDED: - case BitDepth::B8: - return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8); - default: - TORCH_CHECK( - false, format, " only supports 8-bit for unsigned PCM encoding."); - } - case Encoding::PCM_FLOAT: - switch (bits_per_sample) { - case BitDepth::NOT_PROVIDED: - case BitDepth::B32: - return std::make_tuple<>(SOX_ENCODING_FLOAT, 32); - case BitDepth::B64: - return std::make_tuple<>(SOX_ENCODING_FLOAT, 64); - default: - TORCH_CHECK( - false, - format, - " only supports 32-bit or 64-bit for floating-point PCM encoding."); - } - case Encoding::ULAW: - switch (bits_per_sample) { - case BitDepth::NOT_PROVIDED: - case BitDepth::B8: - return std::make_tuple<>(SOX_ENCODING_ULAW, 8); - default: - TORCH_CHECK( - false, format, " only supports 8-bit for mu-law encoding."); - } - case Encoding::ALAW: - switch (bits_per_sample) { - case BitDepth::NOT_PROVIDED: - case BitDepth::B8: - return std::make_tuple<>(SOX_ENCODING_ALAW, 8); - default: - TORCH_CHECK( - false, format, " only supports 8-bit for a-law encoding."); - } - default: - TORCH_CHECK( - false, format, " does not support encoding: " + to_string(encoding)); - } -} - -std::tuple get_save_encoding( - const std::string& format, - const caffe2::TypeMeta& dtype, - const std::optional& encoding, - const std::optional& bits_per_sample) { - const Format fmt = get_format_from_string(format); - const Encoding enc = get_encoding_from_option(encoding); - const BitDepth bps = get_bit_depth_from_option(bits_per_sample); - - switch (fmt) { - case Format::WAV: - case Format::AMB: - return get_save_encoding_for_wav(format, dtype, enc, bps); - case Format::MP3: - TORCH_CHECK( - enc == Encoding::NOT_PROVIDED, - "mp3 does not support `encoding` option."); - TORCH_CHECK( - bps == BitDepth::NOT_PROVIDED, - "mp3 does not support `bits_per_sample` option."); - return std::make_tuple<>(SOX_ENCODING_MP3, 16); - case Format::HTK: - TORCH_CHECK( - enc == Encoding::NOT_PROVIDED, - "htk does not support `encoding` option."); - TORCH_CHECK( - bps == BitDepth::NOT_PROVIDED, - "htk does not support `bits_per_sample` option."); - return std::make_tuple<>(SOX_ENCODING_SIGN2, 16); - case Format::VORBIS: - TORCH_CHECK( - enc == Encoding::NOT_PROVIDED, - "vorbis does not support `encoding` option."); - TORCH_CHECK( - bps == BitDepth::NOT_PROVIDED, - "vorbis does not support `bits_per_sample` option."); - return std::make_tuple<>(SOX_ENCODING_VORBIS, 0); - case Format::AMR_NB: - TORCH_CHECK( - enc == Encoding::NOT_PROVIDED, - "amr-nb does not support `encoding` option."); - TORCH_CHECK( - bps == BitDepth::NOT_PROVIDED, - "amr-nb does not support `bits_per_sample` option."); - return std::make_tuple<>(SOX_ENCODING_AMR_NB, 16); - case Format::FLAC: - TORCH_CHECK( - enc == Encoding::NOT_PROVIDED, - "flac does not support `encoding` option."); - switch (bps) { - case BitDepth::B32: - case BitDepth::B64: - TORCH_CHECK( - false, "flac does not support `bits_per_sample` larger than 24."); - default: - return std::make_tuple<>( - SOX_ENCODING_FLAC, static_cast(bps)); - } - case Format::SPHERE: - switch (enc) { - case Encoding::NOT_PROVIDED: - case Encoding::PCM_SIGNED: - switch (bps) { - case BitDepth::NOT_PROVIDED: - return std::make_tuple<>(SOX_ENCODING_SIGN2, 32); - default: - return std::make_tuple<>( - SOX_ENCODING_SIGN2, static_cast(bps)); - } - case Encoding::PCM_UNSIGNED: - TORCH_CHECK(false, "sph does not support unsigned integer PCM."); - case Encoding::PCM_FLOAT: - TORCH_CHECK(false, "sph does not support floating point PCM."); - case Encoding::ULAW: - switch (bps) { - case BitDepth::NOT_PROVIDED: - case BitDepth::B8: - return std::make_tuple<>(SOX_ENCODING_ULAW, 8); - default: - TORCH_CHECK( - false, "sph only supports 8-bit for mu-law encoding."); - } - case Encoding::ALAW: - switch (bps) { - case BitDepth::NOT_PROVIDED: - case BitDepth::B8: - return std::make_tuple<>(SOX_ENCODING_ALAW, 8); - default: - return std::make_tuple<>( - SOX_ENCODING_ALAW, static_cast(bps)); - } - default: - TORCH_CHECK( - false, "sph does not support encoding: ", encoding.value()); - } - case Format::GSM: - TORCH_CHECK( - enc == Encoding::NOT_PROVIDED, - "gsm does not support `encoding` option."); - TORCH_CHECK( - bps == BitDepth::NOT_PROVIDED, - "gsm does not support `bits_per_sample` option."); - return std::make_tuple<>(SOX_ENCODING_GSM, 16); - - default: - TORCH_CHECK(false, "Unsupported format: " + format); - } -} - -unsigned get_precision(const std::string& filetype, caffe2::TypeMeta dtype) { - if (filetype == "mp3") { - return SOX_UNSPEC; - } - if (filetype == "flac") { - return 24; - } - if (filetype == "ogg" || filetype == "vorbis") { - return SOX_UNSPEC; - } - if (filetype == "wav" || filetype == "amb") { - switch (dtype.toScalarType()) { - case c10::ScalarType::Byte: - return 8; - case c10::ScalarType::Short: - return 16; - case c10::ScalarType::Int: - return 32; - case c10::ScalarType::Float: - return 32; - default: - TORCH_CHECK(false, "Unsupported dtype: ", dtype); - } - } - if (filetype == "sph") { - return 32; - } - if (filetype == "amr-nb") { - return 16; - } - if (filetype == "gsm") { - return 16; - } - if (filetype == "htk") { - return 16; - } - TORCH_CHECK(false, "Unsupported file type: ", filetype); -} - -} // namespace - -sox_signalinfo_t get_signalinfo( - const torch::Tensor* waveform, - const int64_t sample_rate, - const std::string& filetype, - const bool channels_first) { - return sox_signalinfo_t{ - /*rate=*/static_cast(sample_rate), - /*channels=*/ - static_cast(waveform->size(channels_first ? 0 : 1)), - /*precision=*/get_precision(filetype, waveform->dtype()), - /*length=*/static_cast(waveform->numel()), - nullptr}; -} - -sox_encodinginfo_t get_tensor_encodinginfo(caffe2::TypeMeta dtype) { - sox_encoding_t encoding = [&]() { - switch (dtype.toScalarType()) { - case c10::ScalarType::Byte: - return SOX_ENCODING_UNSIGNED; - case c10::ScalarType::Short: - return SOX_ENCODING_SIGN2; - case c10::ScalarType::Int: - return SOX_ENCODING_SIGN2; - case c10::ScalarType::Float: - return SOX_ENCODING_FLOAT; - default: - TORCH_CHECK(false, "Unsupported dtype: ", dtype); - } - }(); - unsigned bits_per_sample = [&]() { - switch (dtype.toScalarType()) { - case c10::ScalarType::Byte: - return 8; - case c10::ScalarType::Short: - return 16; - case c10::ScalarType::Int: - return 32; - case c10::ScalarType::Float: - return 32; - default: - TORCH_CHECK(false, "Unsupported dtype: ", dtype); - } - }(); - return sox_encodinginfo_t{ - /*encoding=*/encoding, - /*bits_per_sample=*/bits_per_sample, - /*compression=*/HUGE_VAL, - /*reverse_bytes=*/sox_option_default, - /*reverse_nibbles=*/sox_option_default, - /*reverse_bits=*/sox_option_default, - /*opposite_endian=*/sox_false}; -} - -sox_encodinginfo_t get_encodinginfo_for_save( - const std::string& format, - const caffe2::TypeMeta& dtype, - const std::optional& compression, - const std::optional& encoding, - const std::optional& bits_per_sample) { - auto enc = get_save_encoding(format, dtype, encoding, bits_per_sample); - return sox_encodinginfo_t{ - /*encoding=*/std::get<0>(enc), - /*bits_per_sample=*/std::get<1>(enc), - /*compression=*/compression.value_or(HUGE_VAL), - /*reverse_bytes=*/sox_option_default, - /*reverse_nibbles=*/sox_option_default, - /*reverse_bits=*/sox_option_default, - /*opposite_endian=*/sox_false}; -} - -} // namespace torchaudio::sox diff --git a/src/libtorchaudio/sox/utils.h b/src/libtorchaudio/sox/utils.h deleted file mode 100644 index b26e25f65e..0000000000 --- a/src/libtorchaudio/sox/utils.h +++ /dev/null @@ -1,112 +0,0 @@ -#ifndef TORCHAUDIO_SOX_UTILS_H -#define TORCHAUDIO_SOX_UTILS_H - -#include -#include - -namespace torchaudio::sox { - -//////////////////////////////////////////////////////////////////////////////// -// APIs for Python interaction -//////////////////////////////////////////////////////////////////////////////// - -/// Set sox global options -void set_seed(const int64_t seed); - -void set_verbosity(const int64_t verbosity); - -void set_use_threads(const bool use_threads); - -void set_buffer_size(const int64_t buffer_size); - -int64_t get_buffer_size(); - -std::vector> list_effects(); - -std::vector list_read_formats(); - -std::vector list_write_formats(); - -//////////////////////////////////////////////////////////////////////////////// -// Utilities for sox_io / sox_effects implementations -//////////////////////////////////////////////////////////////////////////////// - -extern const std::unordered_set UNSUPPORTED_EFFECTS; - -/// helper class to automatically close sox_format_t* -struct SoxFormat { - explicit SoxFormat(sox_format_t* fd) noexcept; - SoxFormat(const SoxFormat& other) = delete; - SoxFormat(SoxFormat&& other) = delete; - SoxFormat& operator=(const SoxFormat& other) = delete; - SoxFormat& operator=(SoxFormat&& other) = delete; - ~SoxFormat(); - sox_format_t* operator->() const noexcept; - operator sox_format_t*() const noexcept; - - void close(); - - private: - sox_format_t* fd_; -}; - -/// -/// Verify that input file is found, has known encoding, and not empty -void validate_input_file(const SoxFormat& sf, const std::string& path); - -/// -/// Verify that input Tensor is 2D, CPU and either uin8, int16, int32 or float32 -void validate_input_tensor(const torch::Tensor&); - -/// -/// Get target dtype for the given encoding and precision. -caffe2::TypeMeta get_dtype( - const sox_encoding_t encoding, - const unsigned precision); - -/// -/// Convert sox_sample_t buffer to uint8/int16/int32/float32 Tensor -/// NOTE: This function might modify the values in the input buffer to -/// reduce the number of memory copy. -/// @param buffer Pointer to buffer that contains audio data. -/// @param num_samples The number of samples to read. -/// @param num_channels The number of channels. Used to reshape the resulting -/// Tensor. -/// @param dtype Target dtype. Determines the output dtype and value range in -/// conjunction with normalization. -/// @param noramlize Perform normalization. Only effective when dtype is not -/// kFloat32. When effective, the output tensor is kFloat32 type and value range -/// is [-1.0, 1.0] -/// @param channels_first When True, output Tensor has shape of [num_channels, -/// num_frames]. -torch::Tensor convert_to_tensor( - sox_sample_t* buffer, - const int32_t num_samples, - const int32_t num_channels, - const caffe2::TypeMeta dtype, - const bool normalize, - const bool channels_first); - -/// Extract extension from file path -const std::string get_filetype(const std::string& path); - -/// Get sox_signalinfo_t for passing a torch::Tensor object. -sox_signalinfo_t get_signalinfo( - const torch::Tensor* waveform, - const int64_t sample_rate, - const std::string& filetype, - const bool channels_first); - -/// Get sox_encodinginfo_t for Tensor I/O -sox_encodinginfo_t get_tensor_encodinginfo(const caffe2::TypeMeta dtype); - -/// Get sox_encodinginfo_t for saving to file/file object -sox_encodinginfo_t get_encodinginfo_for_save( - const std::string& format, - const caffe2::TypeMeta& dtype, - const std::optional& compression, - const std::optional& encoding, - const std::optional& bits_per_sample); - -} // namespace torchaudio::sox -#endif diff --git a/src/torchaudio/__init__.py b/src/torchaudio/__init__.py index f57572e5c8..f21454226c 100644 --- a/src/torchaudio/__init__.py +++ b/src/torchaudio/__init__.py @@ -15,7 +15,6 @@ kaldi_io, models, pipelines, - sox_effects, transforms, utils, ) @@ -205,6 +204,5 @@ def save( "pipelines", "kaldi_io", "utils", - "sox_effects", "transforms", ] diff --git a/src/torchaudio/_extension/__init__.py b/src/torchaudio/_extension/__init__.py index 5c2ff55583..11f7c6deec 100644 --- a/src/torchaudio/_extension/__init__.py +++ b/src/torchaudio/_extension/__init__.py @@ -4,7 +4,7 @@ from torchaudio._internal.module_utils import fail_with_message, is_module_available, no_op -from .utils import _check_cuda_version, _init_dll_path, _init_sox, _LazyImporter, _load_lib +from .utils import _check_cuda_version, _init_dll_path, _LazyImporter, _load_lib _LG = logging.getLogger(__name__) @@ -17,7 +17,6 @@ "_check_cuda_version", "_IS_TORCHAUDIO_EXT_AVAILABLE", "_IS_RIR_AVAILABLE", - "lazy_import_sox_ext", ] @@ -44,17 +43,6 @@ _IS_ALIGN_AVAILABLE = torchaudio.lib._torchaudio.is_align_available() -_SOX_EXT = None - - -def lazy_import_sox_ext(): - """Load SoX integration based on availability in lazy manner""" - - global _SOX_EXT - if _SOX_EXT is None: - _SOX_EXT = _LazyImporter("_torchaudio_sox", _init_sox) - return _SOX_EXT - fail_if_no_rir = ( no_op diff --git a/src/torchaudio/_extension/utils.py b/src/torchaudio/_extension/utils.py index c5660a1e22..1cbe3d93e5 100644 --- a/src/torchaudio/_extension/utils.py +++ b/src/torchaudio/_extension/utils.py @@ -61,51 +61,6 @@ def _load_lib(lib: str) -> bool: return True -def _import_sox_ext(): - if os.name == "nt": - raise RuntimeError("sox extension is not supported on Windows") - if not eval_env("TORCHAUDIO_USE_SOX", True): - raise RuntimeError("sox extension is disabled. (TORCHAUDIO_USE_SOX=0)") - - ext = "torchaudio.lib._torchaudio_sox" - - if not importlib.util.find_spec(ext): - raise RuntimeError( - # fmt: off - "TorchAudio is not built with sox extension. " - "Please build TorchAudio with libsox support. (BUILD_SOX=1)" - # fmt: on - ) - - _load_lib("libtorchaudio_sox") - return importlib.import_module(ext) - - -def _init_sox(): - ext = _import_sox_ext() - ext.set_verbosity(0) - - import atexit - - torch.ops.torchaudio_sox.initialize_sox_effects() - atexit.register(torch.ops.torchaudio_sox.shutdown_sox_effects) - - # Bundle functions registered with TORCH_LIBRARY into extension - # so that they can also be accessed in the same (lazy) manner - # from the extension. - keys = [ - "get_info", - "load_audio_file", - "save_audio_file", - "apply_effects_tensor", - "apply_effects_file", - ] - for key in keys: - setattr(ext, key, getattr(torch.ops.torchaudio_sox, key)) - - return ext - - class _LazyImporter(types.ModuleType): """Lazily import module/extension.""" diff --git a/src/torchaudio/_internal/module_utils.py b/src/torchaudio/_internal/module_utils.py index 45956cb175..2201055954 100644 --- a/src/torchaudio/_internal/module_utils.py +++ b/src/torchaudio/_internal/module_utils.py @@ -97,10 +97,6 @@ def decorator(func): {func.__doc__} """ - # This is a temporary fix to avoid depending on sox during testing. - # It will be removed once the sox dependency is removed from the rest of the codebase. - if 'sox' not in func.__module__: - UNSUPPORTED.append(wrapped) return wrapped return decorator diff --git a/src/torchaudio/sox_effects/__init__.py b/src/torchaudio/sox_effects/__init__.py deleted file mode 100644 index 93c63cae1d..0000000000 --- a/src/torchaudio/sox_effects/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -from .sox_effects import apply_effects_file, apply_effects_tensor, effect_names, init_sox_effects, shutdown_sox_effects - - -__all__ = [ - "init_sox_effects", - "shutdown_sox_effects", - "effect_names", - "apply_effects_tensor", - "apply_effects_file", -] diff --git a/src/torchaudio/sox_effects/sox_effects.py b/src/torchaudio/sox_effects/sox_effects.py deleted file mode 100644 index 256c461edc..0000000000 --- a/src/torchaudio/sox_effects/sox_effects.py +++ /dev/null @@ -1,275 +0,0 @@ -import os -from typing import List, Optional, Tuple - -import torch -import torchaudio -from torchaudio._internal.module_utils import deprecated, dropping_support -from torchaudio.utils.sox_utils import list_effects - - -sox_ext = torchaudio._extension.lazy_import_sox_ext() - - -@deprecated("Please remove the call. This function is called automatically.") -def init_sox_effects(): - """Initialize resources required to use sox effects. - - Note: - You do not need to call this function manually. It is called automatically. - - Once initialized, you do not need to call this function again across the multiple uses of - sox effects though it is safe to do so as long as :func:`shutdown_sox_effects` is not called yet. - Once :func:`shutdown_sox_effects` is called, you can no longer use SoX effects and initializing - again will result in error. - """ - pass - - -@deprecated("Please remove the call. This function is called automatically.") -def shutdown_sox_effects(): - """Clean up resources required to use sox effects. - - Note: - You do not need to call this function manually. It is called automatically. - - It is safe to call this function multiple times. - Once :py:func:`shutdown_sox_effects` is called, you can no longer use SoX effects and - initializing again will result in error. - """ - pass - - -@dropping_support -def effect_names() -> List[str]: - """Gets list of valid sox effect names - - Returns: - List[str]: list of available effect names. - - Example - >>> torchaudio.sox_effects.effect_names() - ['allpass', 'band', 'bandpass', ... ] - """ - return list(list_effects().keys()) - - -@dropping_support -def apply_effects_tensor( - tensor: torch.Tensor, - sample_rate: int, - effects: List[List[str]], - channels_first: bool = True, -) -> Tuple[torch.Tensor, int]: - """Apply sox effects to given Tensor - - .. devices:: CPU - - .. properties:: TorchScript - - Note: - This function only works on CPU Tensors. - This function works in the way very similar to ``sox`` command, however there are slight - differences. For example, ``sox`` command adds certain effects automatically (such as - ``rate`` effect after ``speed`` and ``pitch`` and other effects), but this function does - only applies the given effects. (Therefore, to actually apply ``speed`` effect, you also - need to give ``rate`` effect with desired sampling rate.). - - Args: - tensor (torch.Tensor): Input 2D CPU Tensor. - sample_rate (int): Sample rate - effects (List[List[str]]): List of effects. - channels_first (bool, optional): Indicates if the input Tensor's dimension is - `[channels, time]` or `[time, channels]` - - Returns: - (Tensor, int): Resulting Tensor and sample rate. - The resulting Tensor has the same ``dtype`` as the input Tensor, and - the same channels order. The shape of the Tensor can be different based on the - effects applied. Sample rate can also be different based on the effects applied. - - Example - Basic usage - >>> - >>> # Defines the effects to apply - >>> effects = [ - ... ['gain', '-n'], # normalises to 0dB - ... ['pitch', '5'], # 5 cent pitch shift - ... ['rate', '8000'], # resample to 8000 Hz - ... ] - >>> - >>> # Generate pseudo wave: - >>> # normalized, channels first, 2ch, sampling rate 16000, 1 second - >>> sample_rate = 16000 - >>> waveform = 2 * torch.rand([2, sample_rate * 1]) - 1 - >>> waveform.shape - torch.Size([2, 16000]) - >>> waveform - tensor([[ 0.3138, 0.7620, -0.9019, ..., -0.7495, -0.4935, 0.5442], - [-0.0832, 0.0061, 0.8233, ..., -0.5176, -0.9140, -0.2434]]) - >>> - >>> # Apply effects - >>> waveform, sample_rate = apply_effects_tensor( - ... wave_form, sample_rate, effects, channels_first=True) - >>> - >>> # Check the result - >>> # The new waveform is sampling rate 8000, 1 second. - >>> # normalization and channel order are preserved - >>> waveform.shape - torch.Size([2, 8000]) - >>> waveform - tensor([[ 0.5054, -0.5518, -0.4800, ..., -0.0076, 0.0096, -0.0110], - [ 0.1331, 0.0436, -0.3783, ..., -0.0035, 0.0012, 0.0008]]) - >>> sample_rate - 8000 - - Example - Torchscript-able transform - >>> - >>> # Use `apply_effects_tensor` in `torch.nn.Module` and dump it to file, - >>> # then run sox effect via Torchscript runtime. - >>> - >>> class SoxEffectTransform(torch.nn.Module): - ... effects: List[List[str]] - ... - ... def __init__(self, effects: List[List[str]]): - ... super().__init__() - ... self.effects = effects - ... - ... def forward(self, tensor: torch.Tensor, sample_rate: int): - ... return sox_effects.apply_effects_tensor( - ... tensor, sample_rate, self.effects) - ... - ... - >>> # Create transform object - >>> effects = [ - ... ["lowpass", "-1", "300"], # apply single-pole lowpass filter - ... ["rate", "8000"], # change sample rate to 8000 - ... ] - >>> transform = SoxEffectTensorTransform(effects, input_sample_rate) - >>> - >>> # Dump it to file and load - >>> path = 'sox_effect.zip' - >>> torch.jit.script(trans).save(path) - >>> transform = torch.jit.load(path) - >>> - >>>> # Run transform - >>> waveform, input_sample_rate = torchaudio.load("input.wav") - >>> waveform, sample_rate = transform(waveform, input_sample_rate) - >>> assert sample_rate == 8000 - """ - return sox_ext.apply_effects_tensor(tensor, sample_rate, effects, channels_first) - - -@dropping_support -def apply_effects_file( - path: str, - effects: List[List[str]], - normalize: bool = True, - channels_first: bool = True, - format: Optional[str] = None, -) -> Tuple[torch.Tensor, int]: - """Apply sox effects to the audio file and load the resulting data as Tensor - - .. devices:: CPU - - .. properties:: TorchScript - - Note: - This function works in the way very similar to ``sox`` command, however there are slight - differences. For example, ``sox`` commnad adds certain effects automatically (such as - ``rate`` effect after ``speed``, ``pitch`` etc), but this function only applies the given - effects. Therefore, to actually apply ``speed`` effect, you also need to give ``rate`` - effect with desired sampling rate, because internally, ``speed`` effects only alter sampling - rate and leave samples untouched. - - Args: - path (path-like object): - Source of audio data. - effects (List[List[str]]): List of effects. - normalize (bool, optional): - When ``True``, this function converts the native sample type to ``float32``. - Default: ``True``. - - If input file is integer WAV, giving ``False`` will change the resulting Tensor type to - integer type. - This argument has no effect for formats other than integer WAV type. - - channels_first (bool, optional): When True, the returned Tensor has dimension `[channel, time]`. - Otherwise, the returned Tensor's dimension is `[time, channel]`. - format (str or None, optional): - Override the format detection with the given format. - Providing the argument might help when libsox can not infer the format - from header or extension, - - Returns: - (Tensor, int): Resulting Tensor and sample rate. - If ``normalize=True``, the resulting Tensor is always ``float32`` type. - If ``normalize=False`` and the input audio file is of integer WAV file, then the - resulting Tensor has corresponding integer type. (Note 24 bit integer type is not supported) - If ``channels_first=True``, the resulting Tensor has dimension `[channel, time]`, - otherwise `[time, channel]`. - - Example - Basic usage - >>> - >>> # Defines the effects to apply - >>> effects = [ - ... ['gain', '-n'], # normalises to 0dB - ... ['pitch', '5'], # 5 cent pitch shift - ... ['rate', '8000'], # resample to 8000 Hz - ... ] - >>> - >>> # Apply effects and load data with channels_first=True - >>> waveform, sample_rate = apply_effects_file("data.wav", effects, channels_first=True) - >>> - >>> # Check the result - >>> waveform.shape - torch.Size([2, 8000]) - >>> waveform - tensor([[ 5.1151e-03, 1.8073e-02, 2.2188e-02, ..., 1.0431e-07, - -1.4761e-07, 1.8114e-07], - [-2.6924e-03, 2.1860e-03, 1.0650e-02, ..., 6.4122e-07, - -5.6159e-07, 4.8103e-07]]) - >>> sample_rate - 8000 - - Example - Apply random speed perturbation to dataset - >>> - >>> # Load data from file, apply random speed perturbation - >>> class RandomPerturbationFile(torch.utils.data.Dataset): - ... \"\"\"Given flist, apply random speed perturbation - ... - ... Suppose all the input files are at least one second long. - ... \"\"\" - ... def __init__(self, flist: List[str], sample_rate: int): - ... super().__init__() - ... self.flist = flist - ... self.sample_rate = sample_rate - ... - ... def __getitem__(self, index): - ... speed = 0.5 + 1.5 * random.randn() - ... effects = [ - ... ['gain', '-n', '-10'], # apply 10 db attenuation - ... ['remix', '-'], # merge all the channels - ... ['speed', f'{speed:.5f}'], # duration is now 0.5 ~ 2.0 seconds. - ... ['rate', f'{self.sample_rate}'], - ... ['pad', '0', '1.5'], # add 1.5 seconds silence at the end - ... ['trim', '0', '2'], # get the first 2 seconds - ... ] - ... waveform, _ = torchaudio.sox_effects.apply_effects_file( - ... self.flist[index], effects) - ... return waveform - ... - ... def __len__(self): - ... return len(self.flist) - ... - >>> dataset = RandomPerturbationFile(file_list, sample_rate=8000) - >>> loader = torch.utils.data.DataLoader(dataset, batch_size=32) - >>> for batch in loader: - >>> pass - """ - if not torch.jit.is_scripting(): - if hasattr(path, "read"): - raise RuntimeError( - "apply_effects_file function does not support file-like object. " - "Please use torchaudio.io.AudioEffector." - ) - path = os.fspath(path) - return sox_ext.apply_effects_file(path, effects, normalize, channels_first, format) diff --git a/src/torchaudio/utils/__init__.py b/src/torchaudio/utils/__init__.py index 9d4dd2dd72..be1f0bad21 100644 --- a/src/torchaudio/utils/__init__.py +++ b/src/torchaudio/utils/__init__.py @@ -1,10 +1,8 @@ from torio.utils import ffmpeg_utils -from . import sox_utils from .download import _download_asset __all__ = [ - "sox_utils", "ffmpeg_utils", ] diff --git a/src/torchaudio/utils/sox_utils.py b/src/torchaudio/utils/sox_utils.py deleted file mode 100644 index 8cc68361d5..0000000000 --- a/src/torchaudio/utils/sox_utils.py +++ /dev/null @@ -1,118 +0,0 @@ -"""Module to change the configuration of libsox, which is used by I/O functions like -:py:mod:`~torchaudio.backend.sox_io_backend` and :py:mod:`~torchaudio.sox_effects`. - -.. warning:: - Starting with version 2.8, we are refactoring TorchAudio to transition it - into a maintenance phase. As a result: - - - Some APIs are deprecated in 2.8 and will be removed in 2.9. - - The decoding and encoding capabilities of PyTorch for both audio and video - are being consolidated into TorchCodec. - - Please see https://github.com/pytorch/audio/issues/3902 for more information. -""" - -from typing import Dict, List - -import torchaudio - -sox_ext = torchaudio._extension.lazy_import_sox_ext() - -from torchaudio._internal.module_utils import dropping_support - -@dropping_support -def set_seed(seed: int): - """Set libsox's PRNG - - Args: - seed (int): seed value. valid range is int32. - - See Also: - http://sox.sourceforge.net/sox.html - """ - sox_ext.set_seed(seed) - - -@dropping_support -def set_verbosity(verbosity: int): - """Set libsox's verbosity - - Args: - verbosity (int): Set verbosity level of libsox. - - * ``1`` failure messages - * ``2`` warnings - * ``3`` details of processing - * ``4``-``6`` increasing levels of debug messages - - See Also: - http://sox.sourceforge.net/sox.html - """ - sox_ext.set_verbosity(verbosity) - - -@dropping_support -def set_buffer_size(buffer_size: int): - """Set buffer size for sox effect chain - - Args: - buffer_size (int): Set the size in bytes of the buffers used for processing audio. - - See Also: - http://sox.sourceforge.net/sox.html - """ - sox_ext.set_buffer_size(buffer_size) - - -@dropping_support -def set_use_threads(use_threads: bool): - """Set multithread option for sox effect chain - - Args: - use_threads (bool): When ``True``, enables ``libsox``'s parallel effects channels processing. - To use mutlithread, the underlying ``libsox`` has to be compiled with OpenMP support. - - See Also: - http://sox.sourceforge.net/sox.html - """ - sox_ext.set_use_threads(use_threads) - - -@dropping_support -def list_effects() -> Dict[str, str]: - """List the available sox effect names - - Returns: - Dict[str, str]: Mapping from ``effect name`` to ``usage`` - """ - return dict(sox_ext.list_effects()) - - -@dropping_support -def list_read_formats() -> List[str]: - """List the supported audio formats for read - - Returns: - List[str]: List of supported audio formats - """ - return sox_ext.list_read_formats() - - -@dropping_support -def list_write_formats() -> List[str]: - """List the supported audio formats for write - - Returns: - List[str]: List of supported audio formats - """ - return sox_ext.list_write_formats() - - -@dropping_support -def get_buffer_size() -> int: - """Get buffer size for sox effect chain - - Returns: - int: size in bytes of buffers used for processing audio. - """ - return sox_ext.get_buffer_size() diff --git a/test/torchaudio_unittest/common_utils/case_utils.py b/test/torchaudio_unittest/common_utils/case_utils.py index b99b96f5b0..7ce9c89dd3 100644 --- a/test/torchaudio_unittest/common_utils/case_utils.py +++ b/test/torchaudio_unittest/common_utils/case_utils.py @@ -109,7 +109,6 @@ class TorchaudioTestCase(TestBaseMixin, PytorchTestCase): _IS_FFMPEG_AVAILABLE = torio._extension.lazy_import_ffmpeg_ext().is_available() -_IS_SOX_AVAILABLE = torchaudio._extension.lazy_import_sox_ext().is_available() _IS_CTC_DECODER_AVAILABLE = None _IS_CUDA_CTC_DECODER_AVAILABLE = None diff --git a/third_party/sox/CMakeLists.txt b/third_party/sox/CMakeLists.txt deleted file mode 100644 index db96f05faf..0000000000 --- a/third_party/sox/CMakeLists.txt +++ /dev/null @@ -1,21 +0,0 @@ -include(FetchContent) - -FetchContent_Declare( - sox_src - URL https://downloads.sourceforge.net/project/sox/sox/14.4.2/sox-14.4.2.tar.bz2 - URL_HASH SHA256=81a6956d4330e75b5827316e44ae381e6f1e8928003c6aa45896da9041ea149c - PATCH_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - ) -# FetchContent_MakeAvailable will parse the downloaded content and setup the targets. -# We want to only download and not build, so we run Populate manually. -if(NOT sox_src_POPULATED) - FetchContent_Populate(sox_src) -endif() - -add_library(sox SHARED stub.c) -if(APPLE) - set_target_properties(sox PROPERTIES SUFFIX .dylib) -endif(APPLE) -target_include_directories(sox PUBLIC ${sox_src_SOURCE_DIR}/src) diff --git a/third_party/sox/stub.c b/third_party/sox/stub.c deleted file mode 100644 index 4e668caf37..0000000000 --- a/third_party/sox/stub.c +++ /dev/null @@ -1,85 +0,0 @@ -#include - -int sox_add_effect( - sox_effects_chain_t* chain, - sox_effect_t* effp, - sox_signalinfo_t* in, - sox_signalinfo_t const* out) { - return -1; -} -int sox_close(sox_format_t* ft) { - return -1; -} - -sox_effect_t* sox_create_effect(sox_effect_handler_t const* eh) { - return NULL; -} - -sox_effects_chain_t* sox_create_effects_chain( - sox_encodinginfo_t const* in_enc, - sox_encodinginfo_t const* out_enc) { - return NULL; -} - -void sox_delete_effect(sox_effect_t* effp) {} -void sox_delete_effects_chain(sox_effects_chain_t* ecp) {} - -int sox_effect_options(sox_effect_t* effp, int argc, char* const argv[]) { - return -1; -} - -const sox_effect_handler_t* sox_find_effect(char const* name) { - return NULL; -} - -int sox_flow_effects( - sox_effects_chain_t* chain, - int callback(sox_bool all_done, void* client_data), - void* client_data) { - return -1; -} - -const sox_effect_fn_t* sox_get_effect_fns(void) { - return NULL; -} - -const sox_format_tab_t* sox_get_format_fns(void) { - return NULL; -} - -sox_globals_t* sox_get_globals(void) { - return NULL; -} - -sox_format_t* sox_open_read( - char const* path, - sox_signalinfo_t const* signal, - sox_encodinginfo_t const* encoding, - char const* filetype) { - return NULL; -} - -sox_format_t* sox_open_write( - char const* path, - sox_signalinfo_t const* signal, - sox_encodinginfo_t const* encoding, - char const* filetype, - sox_oob_t const* oob, - sox_bool overwrite_permitted(char const* filename)) { - return NULL; -} - -const char* sox_strerror(int sox_errno) { - return NULL; -} - -size_t sox_write(sox_format_t* ft, const sox_sample_t* buf, size_t len) { - return 0; -} - -int sox_init() { - return -1; -}; -int sox_quit() { - return -1; -}; diff --git a/tools/setup_helpers/extension.py b/tools/setup_helpers/extension.py index 58f5087854..b322541e36 100644 --- a/tools/setup_helpers/extension.py +++ b/tools/setup_helpers/extension.py @@ -51,13 +51,6 @@ def get_ext_modules(): Extension(name="torchaudio.lib.libtorchaudio", sources=[]), Extension(name="torchaudio.lib._torchaudio", sources=[]), ] - if _BUILD_SOX: - modules.extend( - [ - Extension(name="torchaudio.lib.libtorchaudio_sox", sources=[]), - Extension(name="torchaudio.lib._torchaudio_sox", sources=[]), - ] - ) if _BUILD_CUDA_CTC_DECODER: modules.extend( [