diff --git a/CMakeLists.txt b/CMakeLists.txt
index ddc6dc15a2..6fada209fe 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -166,10 +166,6 @@ else()
endif()
add_subdirectory(src/libtorchaudio)
-if (BUILD_SOX)
- add_subdirectory(third_party/sox)
- add_subdirectory(src/libtorchaudio/sox)
-endif()
if (USE_FFMPEG)
if (DEFINED ENV{FFMPEG_ROOT})
add_subdirectory(third_party/ffmpeg/single)
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 819f72d813..785341c363 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -102,7 +102,6 @@ model implementations and application components.
models
models.decoder
pipelines
- sox_effects
compliance.kaldi
kaldi_io
utils
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
index cb0fa190b8..91136d52dd 100644
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -70,27 +70,6 @@ Optional Dependencies
files you installed follow this naming scheme, (and then make sure
that they are in one of the directories listed in library search path.)
-* `SoX `__
-
- Required to use ``backend="sox"`` in `I/O functions <./torchaudio.html#i-o>`__.
-
- Starting version 2.1, TorchAudio requires separately installed libsox.
-
- If dynamic linking is causing an issue, you can set the environment variable
- ``TORCHAUDIO_USE_SOX=0``, and TorchAudio won't use SoX.
-
- .. note::
-
- TorchAudio looks for a library file with unversioned name, that is ``libsox.so``
- for Linux, and ``libsox.dylib`` for macOS. Some package managers install the library
- file with different name. For example, aptitude on Ubuntu installs ``libsox.so.3``.
- To have TorchAudio link against it, you can create a symbolic link to it with name
- ``libsox.so`` (and put the symlink in a library search path).
-
- .. note::
- TorchAudio is tested on libsox 14.4.2. (And it is unlikely that other
- versions would work.)
-
* `SoundFile `__
Required to use ``backend="soundfile"`` in `I/O functions <./torchaudio.html#i-o>`__.
diff --git a/docs/source/sox_effects.rst b/docs/source/sox_effects.rst
deleted file mode 100644
index a8ee260144..0000000000
--- a/docs/source/sox_effects.rst
+++ /dev/null
@@ -1,34 +0,0 @@
-.. py:module:: torchaudio.sox_effects
-
-torchaudio.sox_effects
-======================
-
-.. currentmodule:: torchaudio.sox_effects
-
-.. warning::
- Starting with version 2.8, we are refactoring TorchAudio to transition it
- into a maintenance phase. As a result, the ``sox_effect`` module is
- deprecated in 2.8 and will be removed in 2.9.
-
-Applying effects
-----------------
-
-Apply SoX effects chain on torch.Tensor or on file and load as torch.Tensor.
-
-.. autosummary::
- :toctree: generated
- :nosignatures:
-
- apply_effects_tensor
- apply_effects_file
-
-.. minigallery:: torchaudio.sox_effects.apply_effects_tensor
-
-Utilities
----------
-
-.. autosummary::
- :toctree: generated
- :nosignatures:
-
- effect_names
diff --git a/docs/source/torchaudio.rst b/docs/source/torchaudio.rst
index aa933e84ad..629ffd312a 100644
--- a/docs/source/torchaudio.rst
+++ b/docs/source/torchaudio.rst
@@ -78,14 +78,6 @@ The following table summarizes the backends.
to retrieve the supported codecs.
This backend Supports various protocols, such as HTTPS and MP4, and file-like objects.
- * - 2
- - SoX
- - Linux, macOS
- - Use :py:func:`~torchaudio.utils.sox_utils.list_read_formats` and
- :py:func:`~torchaudio.utils.sox_utils.list_write_formats`
- to retrieve the supported codecs.
-
- This backend does *not* support file-like objects.
* - 3
- SoundFile
- Linux, macOS, Windows
diff --git a/examples/libtorchaudio/augmentation/CMakeLists.txt b/examples/libtorchaudio/augmentation/CMakeLists.txt
deleted file mode 100644
index e9bfece93a..0000000000
--- a/examples/libtorchaudio/augmentation/CMakeLists.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-add_executable(augment main.cpp)
-target_link_libraries(augment "${TORCH_LIBRARIES}" "${TORCHAUDIO_LIBRARY}")
-set_property(TARGET augment PROPERTY CXX_STANDARD 14)
diff --git a/examples/libtorchaudio/augmentation/README.md b/examples/libtorchaudio/augmentation/README.md
deleted file mode 100644
index 81c58b3bd6..0000000000
--- a/examples/libtorchaudio/augmentation/README.md
+++ /dev/null
@@ -1,35 +0,0 @@
-# Augmentation
-
-This example demonstrates how you can use torchaudio's I/O features and augmentations in C++ application.
-
-**NOTE**
-This example uses `"sox_io"` backend, thus does not work on Windows.
-
-## Steps
-### 1. Create augmentation pipeline TorchScript file.
-
-First, we implement our data process pipeline as a regular Python, and save it as a TorchScript object.
-We will load and execute it in our C++ application. The C++ code is found in [`main.cpp`](./main.cpp).
-
-```python
-python create_jittable_pipeline.py \
- --rir-path "../data/rir.wav" \
- --output-path "./pipeline.zip"
-```
-
-### 2. Build the application
-
-Please refer to [the top level README.md](../README.md)
-
-### 3. Run the application
-
-Now we run the C++ application `augment`, with the TorchScript object we created in Step.1 and an input audio file.
-
-In [the top level directory](../)
-
-```bash
-input_audio_file="./data/input.wav"
-./build/augmentation/augment ./augmentation/pipeline.zip "${input_audio_file}" "output.wav"
-```
-
-When you give a clean speech file, the output audio sounds like it's a phone conversation.
diff --git a/examples/libtorchaudio/augmentation/create_jittable_pipeline.py b/examples/libtorchaudio/augmentation/create_jittable_pipeline.py
deleted file mode 100755
index 79f56819fc..0000000000
--- a/examples/libtorchaudio/augmentation/create_jittable_pipeline.py
+++ /dev/null
@@ -1,79 +0,0 @@
-#!/usr/bin/env python3
-"""
-Create a data preprocess pipeline that can be run with libtorchaudio
-"""
-import argparse
-import os
-
-import torch
-import torchaudio
-
-
-class Pipeline(torch.nn.Module):
- """Example audio process pipeline.
-
- This example load waveform from a file then apply effects and save it to a file.
- """
-
- def __init__(self, rir_path: str):
- super().__init__()
- rir, sample_rate = torchaudio.load(rir_path)
- self.register_buffer("rir", rir)
- self.rir_sample_rate: int = sample_rate
-
- def forward(self, input_path: str, output_path: str):
- torchaudio.sox_effects.init_sox_effects()
-
- # 1. load audio
- waveform, sample_rate = torchaudio.load(input_path)
-
- # 2. Add background noise
- alpha = 0.01
- waveform = alpha * torch.randn_like(waveform) + (1 - alpha) * waveform
-
- # 3. Reample the RIR filter to much the audio sample rate
- rir, _ = torchaudio.sox_effects.apply_effects_tensor(
- self.rir, self.rir_sample_rate, effects=[["rate", str(sample_rate)]]
- )
- rir = rir / torch.linalg.vector_norm(rir, ord=2)
- rir = torch.flip(rir, [1])
-
- # 4. Apply RIR filter
- waveform = torch.nn.functional.pad(waveform, (rir.shape[1] - 1, 0))
- waveform = torch.nn.functional.conv1d(waveform[None, ...], rir[None, ...])[0]
-
- # Save
- torchaudio.save(output_path, waveform, sample_rate)
-
-
-def _create_jit_pipeline(rir_path, output_path):
- module = torch.jit.script(Pipeline(rir_path))
- print("*" * 40)
- print("* Pipeline code")
- print("*" * 40)
- print()
- print(module.code)
- print("*" * 40)
- module.save(output_path)
-
-
-def _get_path(*paths):
- return os.path.join(os.path.dirname(__file__), *paths)
-
-
-def _parse_args():
- parser = argparse.ArgumentParser(description=__doc__)
- parser.add_argument(
- "--rir-path", default=_get_path("..", "data", "rir.wav"), help="Audio dara for room impulse response."
- )
- parser.add_argument("--output-path", default=_get_path("pipeline.zip"), help="Output JIT file.")
- return parser.parse_args()
-
-
-def _main():
- args = _parse_args()
- _create_jit_pipeline(args.rir_path, args.output_path)
-
-
-if __name__ == "__main__":
- _main()
diff --git a/examples/libtorchaudio/augmentation/main.cpp b/examples/libtorchaudio/augmentation/main.cpp
deleted file mode 100644
index 921c67972b..0000000000
--- a/examples/libtorchaudio/augmentation/main.cpp
+++ /dev/null
@@ -1,22 +0,0 @@
-#include
-
-int main(int argc, char* argv[]) {
- if (argc != 4) {
- std::cerr << "Usage: " << argv[0]
- << " " << std::endl;
- return -1;
- }
-
- torch::jit::script::Module module;
- std::cout << "Loading module from: " << argv[1] << std::endl;
- try {
- module = torch::jit::load(argv[1]);
- } catch (const c10::Error& error) {
- std::cerr << "Failed to load the module:" << error.what() << std::endl;
- return -1;
- }
-
- std::cout << "Performing the process ..." << std::endl;
- module.forward({c10::IValue(argv[2]), c10::IValue(argv[3])});
- std::cout << "Done." << std::endl;
-}
diff --git a/examples/source_separation/conv_tasnet/train.py b/examples/source_separation/conv_tasnet/train.py
index 133b1f4f5e..72b8f57824 100644
--- a/examples/source_separation/conv_tasnet/train.py
+++ b/examples/source_separation/conv_tasnet/train.py
@@ -189,8 +189,6 @@ def train(args):
_LG.info("%s", args)
args.save_dir.mkdir(parents=True, exist_ok=True)
- if "sox_io" in torchaudio.list_audio_backends():
- torchaudio.set_audio_backend("sox_io")
start_epoch = 1
if args.resume:
diff --git a/src/libtorchaudio/sox/CMakeLists.txt b/src/libtorchaudio/sox/CMakeLists.txt
deleted file mode 100644
index 5ffe782c82..0000000000
--- a/src/libtorchaudio/sox/CMakeLists.txt
+++ /dev/null
@@ -1,25 +0,0 @@
-set(
- sources
- io.cpp
- utils.cpp
- effects.cpp
- effects_chain.cpp
- types.cpp
- )
-torchaudio_library(
- libtorchaudio_sox
- "${sources}"
- ""
- "torch;sox"
- ""
- )
-
-if (BUILD_TORCHAUDIO_PYTHON_EXTENSION)
- torchaudio_extension(
- _torchaudio_sox
- "pybind/pybind.cpp;"
- ""
- "libtorchaudio_sox"
- ""
- )
-endif()
diff --git a/src/libtorchaudio/sox/effects.cpp b/src/libtorchaudio/sox/effects.cpp
deleted file mode 100644
index 947c04e3fc..0000000000
--- a/src/libtorchaudio/sox/effects.cpp
+++ /dev/null
@@ -1,133 +0,0 @@
-#include
-#include
-#include
-#include
-
-namespace torchaudio::sox {
-namespace {
-
-enum SoxEffectsResourceState { NotInitialized, Initialized, ShutDown };
-SoxEffectsResourceState SOX_RESOURCE_STATE = NotInitialized;
-std::mutex SOX_RESOUCE_STATE_MUTEX;
-
-} // namespace
-
-void initialize_sox_effects() {
- const std::lock_guard lock(SOX_RESOUCE_STATE_MUTEX);
-
- switch (SOX_RESOURCE_STATE) {
- case NotInitialized:
- TORCH_CHECK(
- sox_init() == SOX_SUCCESS, "Failed to initialize sox effects.");
- SOX_RESOURCE_STATE = Initialized;
- break;
- case Initialized:
- break;
- case ShutDown:
- TORCH_CHECK(
- false, "SoX Effects has been shut down. Cannot initialize again.");
- }
-};
-
-void shutdown_sox_effects() {
- const std::lock_guard lock(SOX_RESOUCE_STATE_MUTEX);
-
- switch (SOX_RESOURCE_STATE) {
- case NotInitialized:
- TORCH_CHECK(false, "SoX Effects is not initialized. Cannot shutdown.");
- case Initialized:
- TORCH_CHECK(
- sox_quit() == SOX_SUCCESS, "Failed to initialize sox effects.");
- SOX_RESOURCE_STATE = ShutDown;
- break;
- case ShutDown:
- break;
- }
-}
-
-auto apply_effects_tensor(
- torch::Tensor waveform,
- int64_t sample_rate,
- const std::vector>& effects,
- bool channels_first) -> std::tuple {
- validate_input_tensor(waveform);
-
- // Create SoxEffectsChain
- const auto dtype = waveform.dtype();
- SoxEffectsChain chain(
- /*input_encoding=*/get_tensor_encodinginfo(dtype),
- /*output_encoding=*/get_tensor_encodinginfo(dtype));
-
- // Prepare output buffer
- std::vector out_buffer;
- out_buffer.reserve(waveform.numel());
-
- // Build and run effects chain
- chain.addInputTensor(&waveform, sample_rate, channels_first);
- for (const auto& effect : effects) {
- chain.addEffect(effect);
- }
- chain.addOutputBuffer(&out_buffer);
- chain.run();
-
- // Create tensor from buffer
- auto out_tensor = convert_to_tensor(
- /*buffer=*/out_buffer.data(),
- /*num_samples=*/out_buffer.size(),
- /*num_channels=*/chain.getOutputNumChannels(),
- dtype,
- /*normalize=*/false,
- channels_first);
-
- return std::tuple(
- out_tensor, chain.getOutputSampleRate());
-}
-
-auto apply_effects_file(
- const std::string& path,
- const std::vector>& effects,
- std::optional normalize,
- std::optional channels_first,
- const std::optional& format)
- -> std::tuple {
- // Open input file
- SoxFormat sf(sox_open_read(
- path.c_str(),
- /*signal=*/nullptr,
- /*encoding=*/nullptr,
- /*filetype=*/format.has_value() ? format.value().c_str() : nullptr));
-
- validate_input_file(sf, path);
-
- const auto dtype = get_dtype(sf->encoding.encoding, sf->signal.precision);
-
- // Prepare output
- std::vector out_buffer;
- out_buffer.reserve(sf->signal.length);
-
- // Create and run SoxEffectsChain
- SoxEffectsChain chain(
- /*input_encoding=*/sf->encoding,
- /*output_encoding=*/get_tensor_encodinginfo(dtype));
-
- chain.addInputFile(sf);
- for (const auto& effect : effects) {
- chain.addEffect(effect);
- }
- chain.addOutputBuffer(&out_buffer);
- chain.run();
-
- // Create tensor from buffer
- bool channels_first_ = channels_first.value_or(true);
- auto tensor = convert_to_tensor(
- /*buffer=*/out_buffer.data(),
- /*num_samples=*/out_buffer.size(),
- /*num_channels=*/chain.getOutputNumChannels(),
- dtype,
- normalize.value_or(true),
- channels_first_);
-
- return std::tuple(
- tensor, chain.getOutputSampleRate());
-}
-} // namespace torchaudio::sox
diff --git a/src/libtorchaudio/sox/effects.h b/src/libtorchaudio/sox/effects.h
deleted file mode 100644
index 8b56427c1e..0000000000
--- a/src/libtorchaudio/sox/effects.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef TORCHAUDIO_SOX_EFFECTS_H
-#define TORCHAUDIO_SOX_EFFECTS_H
-
-#include
-#include
-
-namespace torchaudio::sox {
-
-void initialize_sox_effects();
-
-void shutdown_sox_effects();
-
-auto apply_effects_tensor(
- torch::Tensor waveform,
- int64_t sample_rate,
- const std::vector>& effects,
- bool channels_first) -> std::tuple;
-
-auto apply_effects_file(
- const std::string& path,
- const std::vector>& effects,
- std::optional normalize,
- std::optional channels_first,
- const std::optional& format)
- -> std::tuple;
-
-} // namespace torchaudio::sox
-
-#endif
diff --git a/src/libtorchaudio/sox/effects_chain.cpp b/src/libtorchaudio/sox/effects_chain.cpp
deleted file mode 100644
index 7f6109a343..0000000000
--- a/src/libtorchaudio/sox/effects_chain.cpp
+++ /dev/null
@@ -1,301 +0,0 @@
-#include
-#include
-#include "c10/util/Exception.h"
-
-using namespace torch::indexing;
-
-namespace torchaudio::sox {
-
-namespace {
-
-/// helper classes for passing the location of input tensor and output buffer
-///
-/// drain/flow callback functions require plaing C style function signature and
-/// the way to pass extra data is to attach data to sox_effect_t::priv pointer.
-/// The following structs will be assigned to sox_effect_t::priv pointer which
-/// gives sox_effect_t an access to input Tensor and output buffer object.
-struct TensorInputPriv {
- size_t index;
- torch::Tensor* waveform;
- int64_t sample_rate;
- bool channels_first;
-};
-struct TensorOutputPriv {
- std::vector* buffer;
-};
-struct FileOutputPriv {
- sox_format_t* sf;
-};
-
-/// Callback function to feed Tensor data to SoxEffectChain.
-int tensor_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp) {
- // Retrieve the input Tensor and current index
- auto priv = static_cast(effp->priv);
- auto index = priv->index;
- auto tensor = *(priv->waveform);
- auto num_channels = effp->out_signal.channels;
-
- // Adjust the number of samples to read
- const size_t num_samples = tensor.numel();
- if (index + *osamp > num_samples) {
- *osamp = num_samples - index;
- }
- // Ensure that it's a multiple of the number of channels
- *osamp -= *osamp % num_channels;
-
- // Slice the input Tensor
- auto chunk = [&]() {
- auto i_frame = index / num_channels;
- auto num_frames = *osamp / num_channels;
- auto t = (priv->channels_first)
- ? tensor.index({Slice(), Slice(i_frame, i_frame + num_frames)}).t()
- : tensor.index({Slice(i_frame, i_frame + num_frames), Slice()});
- return t.reshape({-1});
- }();
-
- // Convert to sox_sample_t (int32_t)
- switch (chunk.dtype().toScalarType()) {
- case c10::ScalarType::Float: {
- // Need to convert to 64-bit precision so that
- // values around INT32_MIN/MAX are handled correctly.
- chunk = chunk.to(c10::ScalarType::Double);
- chunk *= 2147483648.;
- chunk.clamp_(INT32_MIN, INT32_MAX);
- chunk = chunk.to(c10::ScalarType::Int);
- break;
- }
- case c10::ScalarType::Int: {
- break;
- }
- case c10::ScalarType::Short: {
- chunk = chunk.to(c10::ScalarType::Int);
- chunk *= 65536;
- break;
- }
- case c10::ScalarType::Byte: {
- chunk = chunk.to(c10::ScalarType::Int);
- chunk -= 128;
- chunk *= 16777216;
- break;
- }
- default:
- TORCH_CHECK(false, "Unexpected dtype: ", chunk.dtype());
- }
- // Write to buffer
- chunk = chunk.contiguous();
- memcpy(obuf, chunk.data_ptr(), *osamp * 4);
- priv->index += *osamp;
- return (priv->index == num_samples) ? SOX_EOF : SOX_SUCCESS;
-}
-
-/// Callback function to fetch data from SoxEffectChain.
-int tensor_output_flow(
- sox_effect_t* effp,
- sox_sample_t const* ibuf,
- sox_sample_t* obuf LSX_UNUSED,
- size_t* isamp,
- size_t* osamp) {
- *osamp = 0;
- // Get output buffer
- auto out_buffer = static_cast(effp->priv)->buffer;
- // Append at the end
- out_buffer->insert(out_buffer->end(), ibuf, ibuf + *isamp);
- return SOX_SUCCESS;
-}
-
-int file_output_flow(
- sox_effect_t* effp,
- sox_sample_t const* ibuf,
- sox_sample_t* obuf LSX_UNUSED,
- size_t* isamp,
- size_t* osamp) {
- *osamp = 0;
- if (*isamp) {
- auto sf = static_cast(effp->priv)->sf;
- if (sox_write(sf, ibuf, *isamp) != *isamp) {
- TORCH_CHECK(
- !sf->sox_errno,
- sf->sox_errstr,
- " ",
- sox_strerror(sf->sox_errno),
- " ",
- sf->filename);
- return SOX_EOF;
- }
- }
- return SOX_SUCCESS;
-}
-
-sox_effect_handler_t* get_tensor_input_handler() {
- static sox_effect_handler_t handler{
- /*name=*/"input_tensor",
- /*usage=*/nullptr,
- /*flags=*/SOX_EFF_MCHAN,
- /*getopts=*/nullptr,
- /*start=*/nullptr,
- /*flow=*/nullptr,
- /*drain=*/tensor_input_drain,
- /*stop=*/nullptr,
- /*kill=*/nullptr,
- /*priv_size=*/sizeof(TensorInputPriv)};
- return &handler;
-}
-
-sox_effect_handler_t* get_tensor_output_handler() {
- static sox_effect_handler_t handler{
- /*name=*/"output_tensor",
- /*usage=*/nullptr,
- /*flags=*/SOX_EFF_MCHAN,
- /*getopts=*/nullptr,
- /*start=*/nullptr,
- /*flow=*/tensor_output_flow,
- /*drain=*/nullptr,
- /*stop=*/nullptr,
- /*kill=*/nullptr,
- /*priv_size=*/sizeof(TensorOutputPriv)};
- return &handler;
-}
-
-sox_effect_handler_t* get_file_output_handler() {
- static sox_effect_handler_t handler{
- /*name=*/"output_file",
- /*usage=*/nullptr,
- /*flags=*/SOX_EFF_MCHAN,
- /*getopts=*/nullptr,
- /*start=*/nullptr,
- /*flow=*/file_output_flow,
- /*drain=*/nullptr,
- /*stop=*/nullptr,
- /*kill=*/nullptr,
- /*priv_size=*/sizeof(FileOutputPriv)};
- return &handler;
-}
-
-} // namespace
-
-SoxEffect::SoxEffect(sox_effect_t* se) noexcept : se_(se) {}
-
-SoxEffect::~SoxEffect() {
- if (se_ != nullptr) {
- free(se_);
- }
-}
-
-SoxEffect::operator sox_effect_t*() const {
- return se_;
-}
-
-auto SoxEffect::operator->() noexcept -> sox_effect_t* {
- return se_;
-}
-
-SoxEffectsChain::SoxEffectsChain(
- sox_encodinginfo_t input_encoding,
- sox_encodinginfo_t output_encoding)
- : in_enc_(input_encoding),
- out_enc_(output_encoding),
- in_sig_(),
- interm_sig_(),
- out_sig_(),
- sec_(sox_create_effects_chain(&in_enc_, &out_enc_)) {
- TORCH_CHECK(sec_, "Failed to create effect chain.");
-}
-
-SoxEffectsChain::~SoxEffectsChain() {
- if (sec_ != nullptr) {
- sox_delete_effects_chain(sec_);
- }
-}
-
-void SoxEffectsChain::run() {
- sox_flow_effects(sec_, nullptr, nullptr);
-}
-
-void SoxEffectsChain::addInputTensor(
- torch::Tensor* waveform,
- int64_t sample_rate,
- bool channels_first) {
- in_sig_ = get_signalinfo(waveform, sample_rate, "wav", channels_first);
- interm_sig_ = in_sig_;
- SoxEffect e(sox_create_effect(get_tensor_input_handler()));
- auto priv = static_cast(e->priv);
- priv->index = 0;
- priv->waveform = waveform;
- priv->sample_rate = sample_rate;
- priv->channels_first = channels_first;
- TORCH_CHECK(
- sox_add_effect(sec_, e, &interm_sig_, &in_sig_) == SOX_SUCCESS,
- "Internal Error: Failed to add effect: input_tensor");
-}
-
-void SoxEffectsChain::addOutputBuffer(
- std::vector* output_buffer) {
- SoxEffect e(sox_create_effect(get_tensor_output_handler()));
- static_cast(e->priv)->buffer = output_buffer;
- TORCH_CHECK(
- sox_add_effect(sec_, e, &interm_sig_, &in_sig_) == SOX_SUCCESS,
- "Internal Error: Failed to add effect: output_tensor");
-}
-
-void SoxEffectsChain::addInputFile(sox_format_t* sf) {
- in_sig_ = sf->signal;
- interm_sig_ = in_sig_;
- SoxEffect e(sox_create_effect(sox_find_effect("input")));
- char* opts[] = {(char*)sf};
- sox_effect_options(e, 1, opts);
- TORCH_CHECK(
- sox_add_effect(sec_, e, &interm_sig_, &in_sig_) == SOX_SUCCESS,
- "Internal Error: Failed to add effect: input ",
- sf->filename);
-}
-
-void SoxEffectsChain::addOutputFile(sox_format_t* sf) {
- out_sig_ = sf->signal;
- SoxEffect e(sox_create_effect(get_file_output_handler()));
- static_cast(e->priv)->sf = sf;
- TORCH_CHECK(
- sox_add_effect(sec_, e, &interm_sig_, &out_sig_) == SOX_SUCCESS,
- "Internal Error: Failed to add effect: output ",
- sf->filename);
-}
-
-void SoxEffectsChain::addEffect(const std::vector& effect) {
- const auto num_args = effect.size();
- TORCH_CHECK(num_args != 0, "Invalid argument: empty effect.");
- const auto name = effect[0];
- TORCH_CHECK(
- UNSUPPORTED_EFFECTS.find(name) == UNSUPPORTED_EFFECTS.end(),
- "Unsupported effect: ",
- name)
-
- auto returned_effect = sox_find_effect(name.c_str());
- TORCH_CHECK(returned_effect, "Unsupported effect: ", name)
-
- SoxEffect e(sox_create_effect(returned_effect));
- const auto num_options = num_args - 1;
-
- std::vector opts;
- for (size_t i = 1; i < num_args; ++i) {
- opts.push_back((char*)effect[i].c_str());
- }
- TORCH_CHECK(
- sox_effect_options(e, num_options, num_options ? opts.data() : nullptr) ==
- SOX_SUCCESS,
- "Invalid effect option: ",
- c10::Join(" ", effect))
- TORCH_CHECK(
- sox_add_effect(sec_, e, &interm_sig_, &in_sig_) == SOX_SUCCESS,
- "Internal Error: Failed to add effect: \"",
- c10::Join(" ", effect),
- "\"");
-}
-
-int64_t SoxEffectsChain::getOutputNumChannels() {
- return interm_sig_.channels;
-}
-
-int64_t SoxEffectsChain::getOutputSampleRate() {
- return interm_sig_.rate;
-}
-
-} // namespace torchaudio::sox
diff --git a/src/libtorchaudio/sox/effects_chain.h b/src/libtorchaudio/sox/effects_chain.h
deleted file mode 100644
index e6a892b5e8..0000000000
--- a/src/libtorchaudio/sox/effects_chain.h
+++ /dev/null
@@ -1,61 +0,0 @@
-#ifndef TORCHAUDIO_SOX_EFFECTS_CHAIN_H
-#define TORCHAUDIO_SOX_EFFECTS_CHAIN_H
-
-#include
-#include
-
-namespace torchaudio::sox {
-
-// Helper struct to safely close sox_effect_t* pointer returned by
-// sox_create_effect
-
-struct SoxEffect {
- explicit SoxEffect(sox_effect_t* se) noexcept;
- SoxEffect(const SoxEffect& other) = delete;
- SoxEffect(SoxEffect&& other) = delete;
- auto operator=(const SoxEffect& other) -> SoxEffect& = delete;
- auto operator=(SoxEffect&& other) -> SoxEffect& = delete;
- ~SoxEffect();
- operator sox_effect_t*() const;
- auto operator->() noexcept -> sox_effect_t*;
-
- private:
- sox_effect_t* se_;
-};
-
-// Helper struct to safely close sox_effects_chain_t with handy methods
-class SoxEffectsChain {
- const sox_encodinginfo_t in_enc_;
- const sox_encodinginfo_t out_enc_;
-
- protected:
- sox_signalinfo_t in_sig_;
- sox_signalinfo_t interm_sig_;
- sox_signalinfo_t out_sig_;
- sox_effects_chain_t* sec_;
-
- public:
- explicit SoxEffectsChain(
- sox_encodinginfo_t input_encoding,
- sox_encodinginfo_t output_encoding);
- SoxEffectsChain(const SoxEffectsChain& other) = delete;
- SoxEffectsChain(SoxEffectsChain&& other) = delete;
- SoxEffectsChain& operator=(const SoxEffectsChain& other) = delete;
- SoxEffectsChain& operator=(SoxEffectsChain&& other) = delete;
- ~SoxEffectsChain();
- void run();
- void addInputTensor(
- torch::Tensor* waveform,
- int64_t sample_rate,
- bool channels_first);
- void addInputFile(sox_format_t* sf);
- void addOutputBuffer(std::vector* output_buffer);
- void addOutputFile(sox_format_t* sf);
- void addEffect(const std::vector& effect);
- int64_t getOutputNumChannels();
- int64_t getOutputSampleRate();
-};
-
-} // namespace torchaudio::sox
-
-#endif
diff --git a/src/libtorchaudio/sox/io.cpp b/src/libtorchaudio/sox/io.cpp
deleted file mode 100644
index 474726ad1c..0000000000
--- a/src/libtorchaudio/sox/io.cpp
+++ /dev/null
@@ -1,128 +0,0 @@
-#include
-#include
-#include
-#include
-#include
-
-using namespace torch::indexing;
-
-namespace torchaudio::sox {
-
-std::tuple get_info_file(
- const std::string& path,
- const std::optional& format) {
- SoxFormat sf(sox_open_read(
- path.c_str(),
- /*signal=*/nullptr,
- /*encoding=*/nullptr,
- /*filetype=*/format.has_value() ? format.value().c_str() : nullptr));
-
- validate_input_file(sf, path);
-
- return std::make_tuple(
- static_cast(sf->signal.rate),
- static_cast(sf->signal.length / sf->signal.channels),
- static_cast(sf->signal.channels),
- static_cast(sf->encoding.bits_per_sample),
- get_encoding(sf->encoding.encoding));
-}
-
-std::vector> get_effects(
- const std::optional& frame_offset,
- const std::optional& num_frames) {
- const auto offset = frame_offset.value_or(0);
- TORCH_CHECK(
- offset >= 0,
- "Invalid argument: frame_offset must be non-negative. Found: ",
- offset);
- const auto frames = num_frames.value_or(-1);
- TORCH_CHECK(
- frames > 0 || frames == -1,
- "Invalid argument: num_frames must be -1 or greater than 0.");
-
- std::vector> effects;
- if (frames != -1) {
- std::ostringstream os_offset, os_frames;
- os_offset << offset << "s";
- os_frames << "+" << frames << "s";
- effects.emplace_back(
- std::vector{"trim", os_offset.str(), os_frames.str()});
- } else if (offset != 0) {
- std::ostringstream os_offset;
- os_offset << offset << "s";
- effects.emplace_back(std::vector{"trim", os_offset.str()});
- }
- return effects;
-}
-
-std::tuple load_audio_file(
- const std::string& path,
- const std::optional& frame_offset,
- const std::optional& num_frames,
- std::optional normalize,
- std::optional channels_first,
- const std::optional& format) {
- auto effects = get_effects(frame_offset, num_frames);
- return apply_effects_file(path, effects, normalize, channels_first, format);
-}
-
-void save_audio_file(
- const std::string& path,
- torch::Tensor tensor,
- int64_t sample_rate,
- bool channels_first,
- std::optional compression,
- std::optional format,
- std::optional encoding,
- std::optional bits_per_sample) {
- validate_input_tensor(tensor);
-
- const auto filetype = [&]() {
- if (format.has_value()) {
- return format.value();
- }
- return get_filetype(path);
- }();
-
- if (filetype == "amr-nb") {
- const auto num_channels = tensor.size(channels_first ? 0 : 1);
- TORCH_CHECK(
- num_channels == 1, "amr-nb format only supports single channel audio.");
- } else if (filetype == "htk") {
- const auto num_channels = tensor.size(channels_first ? 0 : 1);
- TORCH_CHECK(
- num_channels == 1, "htk format only supports single channel audio.");
- } else if (filetype == "gsm") {
- const auto num_channels = tensor.size(channels_first ? 0 : 1);
- TORCH_CHECK(
- num_channels == 1, "gsm format only supports single channel audio.");
- TORCH_CHECK(
- sample_rate == 8000,
- "gsm format only supports a sampling rate of 8kHz.");
- }
- const auto signal_info =
- get_signalinfo(&tensor, sample_rate, filetype, channels_first);
- const auto encoding_info = get_encodinginfo_for_save(
- filetype, tensor.dtype(), compression, encoding, bits_per_sample);
-
- SoxFormat sf(sox_open_write(
- path.c_str(),
- &signal_info,
- &encoding_info,
- /*filetype=*/filetype.c_str(),
- /*oob=*/nullptr,
- /*overwrite_permitted=*/nullptr));
-
- TORCH_CHECK(
- static_cast(sf) != nullptr,
- "Error saving audio file: failed to open file ",
- path);
-
- SoxEffectsChain chain(
- /*input_encoding=*/get_tensor_encodinginfo(tensor.dtype()),
- /*output_encoding=*/sf->encoding);
- chain.addInputTensor(&tensor, sample_rate, channels_first);
- chain.addOutputFile(sf);
- chain.run();
-}
-} // namespace torchaudio::sox
diff --git a/src/libtorchaudio/sox/io.h b/src/libtorchaudio/sox/io.h
deleted file mode 100644
index b011ef59be..0000000000
--- a/src/libtorchaudio/sox/io.h
+++ /dev/null
@@ -1,38 +0,0 @@
-#ifndef TORCHAUDIO_SOX_IO_H
-#define TORCHAUDIO_SOX_IO_H
-
-#include
-#include
-
-namespace torchaudio::sox {
-
-auto get_effects(
- const std::optional& frame_offset,
- const std::optional& num_frames)
- -> std::vector>;
-
-std::tuple get_info_file(
- const std::string& path,
- const std::optional& format);
-
-std::tuple load_audio_file(
- const std::string& path,
- const std::optional& frame_offset,
- const std::optional& num_frames,
- std::optional normalize,
- std::optional channels_first,
- const std::optional& format);
-
-void save_audio_file(
- const std::string& path,
- torch::Tensor tensor,
- int64_t sample_rate,
- bool channels_first,
- std::optional compression,
- std::optional format,
- std::optional encoding,
- std::optional bits_per_sample);
-
-} // namespace torchaudio::sox
-
-#endif
diff --git a/src/libtorchaudio/sox/pybind/pybind.cpp b/src/libtorchaudio/sox/pybind/pybind.cpp
deleted file mode 100644
index bd9c82c349..0000000000
--- a/src/libtorchaudio/sox/pybind/pybind.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-#include
-#include
-#include
-#include
-
-namespace torchaudio {
-namespace sox {
-namespace {
-
-TORCH_LIBRARY(torchaudio_sox, m) {
- m.def("torchaudio_sox::get_info", &get_info_file);
- m.def("torchaudio_sox::load_audio_file", &load_audio_file);
- m.def("torchaudio_sox::save_audio_file", &save_audio_file);
- m.def("torchaudio_sox::initialize_sox_effects", &initialize_sox_effects);
- m.def("torchaudio_sox::shutdown_sox_effects", &shutdown_sox_effects);
- m.def("torchaudio_sox::apply_effects_tensor", &apply_effects_tensor);
- m.def("torchaudio_sox::apply_effects_file", &apply_effects_file);
-}
-
-PYBIND11_MODULE(_torchaudio_sox, m) {
- m.def("set_seed", &set_seed, "Set random seed.");
- m.def("set_verbosity", &set_verbosity, "Set verbosity.");
- m.def("set_use_threads", &set_use_threads, "Set threading.");
- m.def("set_buffer_size", &set_buffer_size, "Set buffer size.");
- m.def("get_buffer_size", &get_buffer_size, "Get buffer size.");
- m.def("list_effects", &list_effects, "List available effects.");
- m.def(
- "list_read_formats",
- &list_read_formats,
- "List supported formats for decoding.");
- m.def(
- "list_write_formats",
- &list_write_formats,
- "List supported formats for encoding.");
-}
-
-} // namespace
-} // namespace sox
-} // namespace torchaudio
diff --git a/src/libtorchaudio/sox/types.cpp b/src/libtorchaudio/sox/types.cpp
deleted file mode 100644
index 12bd070105..0000000000
--- a/src/libtorchaudio/sox/types.cpp
+++ /dev/null
@@ -1,148 +0,0 @@
-#include
-
-namespace torchaudio::sox {
-
-Format get_format_from_string(const std::string& format) {
- if (format == "wav") {
- return Format::WAV;
- }
- if (format == "mp3") {
- return Format::MP3;
- }
- if (format == "flac") {
- return Format::FLAC;
- }
- if (format == "ogg" || format == "vorbis") {
- return Format::VORBIS;
- }
- if (format == "amr-nb") {
- return Format::AMR_NB;
- }
- if (format == "amr-wb") {
- return Format::AMR_WB;
- }
- if (format == "amb") {
- return Format::AMB;
- }
- if (format == "sph") {
- return Format::SPHERE;
- }
- if (format == "htk") {
- return Format::HTK;
- }
- if (format == "gsm") {
- return Format::GSM;
- }
- TORCH_CHECK(false, "Internal Error: unexpected format value: ", format);
-}
-
-std::string to_string(Encoding v) {
- switch (v) {
- case Encoding::UNKNOWN:
- return "UNKNOWN";
- case Encoding::PCM_SIGNED:
- return "PCM_S";
- case Encoding::PCM_UNSIGNED:
- return "PCM_U";
- case Encoding::PCM_FLOAT:
- return "PCM_F";
- case Encoding::FLAC:
- return "FLAC";
- case Encoding::ULAW:
- return "ULAW";
- case Encoding::ALAW:
- return "ALAW";
- case Encoding::MP3:
- return "MP3";
- case Encoding::VORBIS:
- return "VORBIS";
- case Encoding::AMR_WB:
- return "AMR_WB";
- case Encoding::AMR_NB:
- return "AMR_NB";
- case Encoding::OPUS:
- return "OPUS";
- default:
- TORCH_CHECK(false, "Internal Error: unexpected encoding.");
- }
-}
-
-Encoding get_encoding_from_option(const std::optional& encoding) {
- if (!encoding.has_value()) {
- return Encoding::NOT_PROVIDED;
- }
- std::string v = encoding.value();
- if (v == "PCM_S") {
- return Encoding::PCM_SIGNED;
- }
- if (v == "PCM_U") {
- return Encoding::PCM_UNSIGNED;
- }
- if (v == "PCM_F") {
- return Encoding::PCM_FLOAT;
- }
- if (v == "ULAW") {
- return Encoding::ULAW;
- }
- if (v == "ALAW") {
- return Encoding::ALAW;
- }
- TORCH_CHECK(false, "Internal Error: unexpected encoding value: ", v);
-}
-
-BitDepth get_bit_depth_from_option(const std::optional& bit_depth) {
- if (!bit_depth.has_value()) {
- return BitDepth::NOT_PROVIDED;
- }
- int64_t v = bit_depth.value();
- switch (v) {
- case 8:
- return BitDepth::B8;
- case 16:
- return BitDepth::B16;
- case 24:
- return BitDepth::B24;
- case 32:
- return BitDepth::B32;
- case 64:
- return BitDepth::B64;
- default: {
- TORCH_CHECK(false, "Internal Error: unexpected bit depth value: ", v);
- }
- }
-}
-
-std::string get_encoding(sox_encoding_t encoding) {
- switch (encoding) {
- case SOX_ENCODING_UNKNOWN:
- return "UNKNOWN";
- case SOX_ENCODING_SIGN2:
- return "PCM_S";
- case SOX_ENCODING_UNSIGNED:
- return "PCM_U";
- case SOX_ENCODING_FLOAT:
- return "PCM_F";
- case SOX_ENCODING_FLAC:
- return "FLAC";
- case SOX_ENCODING_ULAW:
- return "ULAW";
- case SOX_ENCODING_ALAW:
- return "ALAW";
- case SOX_ENCODING_MP3:
- return "MP3";
- case SOX_ENCODING_VORBIS:
- return "VORBIS";
- case SOX_ENCODING_AMR_WB:
- return "AMR_WB";
- case SOX_ENCODING_AMR_NB:
- return "AMR_NB";
- case SOX_ENCODING_OPUS:
- return "OPUS";
- case SOX_ENCODING_GSM:
- return "GSM";
- default:
- return "UNKNOWN";
- }
-}
-
-} // namespace torchaudio::sox
diff --git a/src/libtorchaudio/sox/types.h b/src/libtorchaudio/sox/types.h
deleted file mode 100644
index 714d303313..0000000000
--- a/src/libtorchaudio/sox/types.h
+++ /dev/null
@@ -1,58 +0,0 @@
-#ifndef TORCHAUDIO_SOX_TYPES_H
-#define TORCHAUDIO_SOX_TYPES_H
-
-#include
-#include
-
-namespace torchaudio::sox {
-
-enum class Format {
- WAV,
- MP3,
- FLAC,
- VORBIS,
- AMR_NB,
- AMR_WB,
- AMB,
- SPHERE,
- GSM,
- HTK,
-};
-
-Format get_format_from_string(const std::string& format);
-
-enum class Encoding {
- NOT_PROVIDED,
- UNKNOWN,
- PCM_SIGNED,
- PCM_UNSIGNED,
- PCM_FLOAT,
- FLAC,
- ULAW,
- ALAW,
- MP3,
- VORBIS,
- AMR_WB,
- AMR_NB,
- OPUS,
-};
-
-std::string to_string(Encoding v);
-Encoding get_encoding_from_option(const std::optional& encoding);
-
-enum class BitDepth : unsigned {
- NOT_PROVIDED = 0,
- B8 = 8,
- B16 = 16,
- B24 = 24,
- B32 = 32,
- B64 = 64,
-};
-
-BitDepth get_bit_depth_from_option(const std::optional& bit_depth);
-
-std::string get_encoding(sox_encoding_t encoding);
-
-} // namespace torchaudio::sox
-
-#endif
diff --git a/src/libtorchaudio/sox/utils.cpp b/src/libtorchaudio/sox/utils.cpp
deleted file mode 100644
index 94748c5209..0000000000
--- a/src/libtorchaudio/sox/utils.cpp
+++ /dev/null
@@ -1,509 +0,0 @@
-#include
-#include
-#include
-#include
-
-namespace torchaudio::sox {
-
-const std::unordered_set UNSUPPORTED_EFFECTS{
- "input",
- "output",
- "spectrogram",
- "noiseprof",
- "noisered",
- "splice"};
-
-void set_seed(const int64_t seed) {
- sox_get_globals()->ranqd1 = static_cast(seed);
-}
-
-void set_verbosity(const int64_t verbosity) {
- sox_get_globals()->verbosity = static_cast(verbosity);
-}
-
-void set_use_threads(const bool use_threads) {
- sox_get_globals()->use_threads = static_cast(use_threads);
-}
-
-void set_buffer_size(const int64_t buffer_size) {
- sox_get_globals()->bufsiz = static_cast(buffer_size);
-}
-
-int64_t get_buffer_size() {
- return sox_get_globals()->bufsiz;
-}
-
-std::vector> list_effects() {
- std::vector> effects;
- for (const sox_effect_fn_t* fns = sox_get_effect_fns(); *fns; ++fns) {
- const sox_effect_handler_t* handler = (*fns)();
- if (handler && handler->name) {
- if (UNSUPPORTED_EFFECTS.find(handler->name) ==
- UNSUPPORTED_EFFECTS.end()) {
- effects.emplace_back(std::vector{
- handler->name,
- handler->usage ? std::string(handler->usage) : std::string("")});
- }
- }
- }
- return effects;
-}
-
-std::vector list_write_formats() {
- std::vector formats;
- for (const sox_format_tab_t* fns = sox_get_format_fns(); fns->fn; ++fns) {
- const sox_format_handler_t* handler = fns->fn();
- for (const char* const* names = handler->names; *names; ++names) {
- if (!strchr(*names, '/') && handler->write) {
- formats.emplace_back(*names);
- }
- }
- }
- return formats;
-}
-
-std::vector list_read_formats() {
- std::vector formats;
- for (const sox_format_tab_t* fns = sox_get_format_fns(); fns->fn; ++fns) {
- const sox_format_handler_t* handler = fns->fn();
- for (const char* const* names = handler->names; *names; ++names) {
- if (!strchr(*names, '/') && handler->read) {
- formats.emplace_back(*names);
- }
- }
- }
- return formats;
-}
-
-SoxFormat::SoxFormat(sox_format_t* fd) noexcept : fd_(fd) {}
-SoxFormat::~SoxFormat() {
- close();
-}
-
-sox_format_t* SoxFormat::operator->() const noexcept {
- return fd_;
-}
-SoxFormat::operator sox_format_t*() const noexcept {
- return fd_;
-}
-
-void SoxFormat::close() {
- if (fd_ != nullptr) {
- sox_close(fd_);
- fd_ = nullptr;
- }
-}
-
-void validate_input_file(const SoxFormat& sf, const std::string& path) {
- TORCH_CHECK(
- static_cast(sf) != nullptr,
- "Error loading audio file: failed to open file " + path);
- TORCH_CHECK(
- sf->encoding.encoding != SOX_ENCODING_UNKNOWN,
- "Error loading audio file: unknown encoding.");
-}
-
-void validate_input_tensor(const torch::Tensor& tensor) {
- TORCH_CHECK(tensor.device().is_cpu(), "Input tensor has to be on CPU.");
-
- TORCH_CHECK(tensor.ndimension() == 2, "Input tensor has to be 2D.");
-
- switch (tensor.dtype().toScalarType()) {
- case c10::ScalarType::Byte:
- case c10::ScalarType::Short:
- case c10::ScalarType::Int:
- case c10::ScalarType::Float:
- break;
- default:
- TORCH_CHECK(
- false,
- "Input tensor has to be one of float32, int32, int16 or uint8 type.");
- }
-}
-
-caffe2::TypeMeta get_dtype(
- const sox_encoding_t encoding,
- const unsigned precision) {
- const auto dtype = [&]() {
- switch (encoding) {
- case SOX_ENCODING_UNSIGNED: // 8-bit PCM WAV
- return torch::kUInt8;
- case SOX_ENCODING_SIGN2: // 16-bit, 24-bit, or 32-bit PCM WAV
- switch (precision) {
- case 16:
- return torch::kInt16;
- case 24: // Cast 24-bit to 32-bit.
- case 32:
- return torch::kInt32;
- default:
- TORCH_CHECK(
- false,
- "Only 16, 24, and 32 bits are supported for signed PCM.");
- }
- default:
- // default to float32 for the other formats, including
- // 32-bit flaoting-point WAV,
- // MP3,
- // FLAC,
- // VORBIS etc...
- return torch::kFloat32;
- }
- }();
- return c10::scalarTypeToTypeMeta(dtype);
-}
-
-torch::Tensor convert_to_tensor(
- sox_sample_t* buffer,
- const int32_t num_samples,
- const int32_t num_channels,
- const caffe2::TypeMeta dtype,
- const bool normalize,
- const bool channels_first) {
- torch::Tensor t;
- uint64_t dummy = 0;
- SOX_SAMPLE_LOCALS;
- if (normalize || dtype == torch::kFloat32) {
- t = torch::empty(
- {num_samples / num_channels, num_channels}, torch::kFloat32);
- auto ptr = t.data_ptr();
- for (int32_t i = 0; i < num_samples; ++i) {
- ptr[i] = SOX_SAMPLE_TO_FLOAT_32BIT(buffer[i], dummy);
- }
- } else if (dtype == torch::kInt32) {
- t = torch::from_blob(
- buffer, {num_samples / num_channels, num_channels}, torch::kInt32)
- .clone();
- } else if (dtype == torch::kInt16) {
- t = torch::empty({num_samples / num_channels, num_channels}, torch::kInt16);
- auto ptr = t.data_ptr();
- for (int32_t i = 0; i < num_samples; ++i) {
- ptr[i] = SOX_SAMPLE_TO_SIGNED_16BIT(buffer[i], dummy);
- }
- } else if (dtype == torch::kUInt8) {
- t = torch::empty({num_samples / num_channels, num_channels}, torch::kUInt8);
- auto ptr = t.data_ptr();
- for (int32_t i = 0; i < num_samples; ++i) {
- ptr[i] = SOX_SAMPLE_TO_UNSIGNED_8BIT(buffer[i], dummy);
- }
- } else {
- TORCH_CHECK(false, "Unsupported dtype: ", dtype);
- }
- if (channels_first) {
- t = t.transpose(1, 0);
- }
- return t.contiguous();
-}
-
-const std::string get_filetype(const std::string& path) {
- std::string ext = path.substr(path.find_last_of('.') + 1);
- std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower);
- return ext;
-}
-
-namespace {
-
-std::tuple get_save_encoding_for_wav(
- const std::string& format,
- caffe2::TypeMeta dtype,
- const Encoding& encoding,
- const BitDepth& bits_per_sample) {
- switch (encoding) {
- case Encoding::NOT_PROVIDED:
- switch (bits_per_sample) {
- case BitDepth::NOT_PROVIDED:
- switch (dtype.toScalarType()) {
- case c10::ScalarType::Float:
- return std::make_tuple<>(SOX_ENCODING_FLOAT, 32);
- case c10::ScalarType::Int:
- return std::make_tuple<>(SOX_ENCODING_SIGN2, 32);
- case c10::ScalarType::Short:
- return std::make_tuple<>(SOX_ENCODING_SIGN2, 16);
- case c10::ScalarType::Byte:
- return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8);
- default:
- TORCH_CHECK(false, "Internal Error: Unexpected dtype: ", dtype);
- }
- case BitDepth::B8:
- return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8);
- default:
- return std::make_tuple<>(
- SOX_ENCODING_SIGN2, static_cast(bits_per_sample));
- }
- case Encoding::PCM_SIGNED:
- switch (bits_per_sample) {
- case BitDepth::NOT_PROVIDED:
- return std::make_tuple<>(SOX_ENCODING_SIGN2, 16);
- case BitDepth::B8:
- TORCH_CHECK(
- false, format, " does not support 8-bit signed PCM encoding.");
- default:
- return std::make_tuple<>(
- SOX_ENCODING_SIGN2, static_cast(bits_per_sample));
- }
- case Encoding::PCM_UNSIGNED:
- switch (bits_per_sample) {
- case BitDepth::NOT_PROVIDED:
- case BitDepth::B8:
- return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8);
- default:
- TORCH_CHECK(
- false, format, " only supports 8-bit for unsigned PCM encoding.");
- }
- case Encoding::PCM_FLOAT:
- switch (bits_per_sample) {
- case BitDepth::NOT_PROVIDED:
- case BitDepth::B32:
- return std::make_tuple<>(SOX_ENCODING_FLOAT, 32);
- case BitDepth::B64:
- return std::make_tuple<>(SOX_ENCODING_FLOAT, 64);
- default:
- TORCH_CHECK(
- false,
- format,
- " only supports 32-bit or 64-bit for floating-point PCM encoding.");
- }
- case Encoding::ULAW:
- switch (bits_per_sample) {
- case BitDepth::NOT_PROVIDED:
- case BitDepth::B8:
- return std::make_tuple<>(SOX_ENCODING_ULAW, 8);
- default:
- TORCH_CHECK(
- false, format, " only supports 8-bit for mu-law encoding.");
- }
- case Encoding::ALAW:
- switch (bits_per_sample) {
- case BitDepth::NOT_PROVIDED:
- case BitDepth::B8:
- return std::make_tuple<>(SOX_ENCODING_ALAW, 8);
- default:
- TORCH_CHECK(
- false, format, " only supports 8-bit for a-law encoding.");
- }
- default:
- TORCH_CHECK(
- false, format, " does not support encoding: " + to_string(encoding));
- }
-}
-
-std::tuple get_save_encoding(
- const std::string& format,
- const caffe2::TypeMeta& dtype,
- const std::optional& encoding,
- const std::optional& bits_per_sample) {
- const Format fmt = get_format_from_string(format);
- const Encoding enc = get_encoding_from_option(encoding);
- const BitDepth bps = get_bit_depth_from_option(bits_per_sample);
-
- switch (fmt) {
- case Format::WAV:
- case Format::AMB:
- return get_save_encoding_for_wav(format, dtype, enc, bps);
- case Format::MP3:
- TORCH_CHECK(
- enc == Encoding::NOT_PROVIDED,
- "mp3 does not support `encoding` option.");
- TORCH_CHECK(
- bps == BitDepth::NOT_PROVIDED,
- "mp3 does not support `bits_per_sample` option.");
- return std::make_tuple<>(SOX_ENCODING_MP3, 16);
- case Format::HTK:
- TORCH_CHECK(
- enc == Encoding::NOT_PROVIDED,
- "htk does not support `encoding` option.");
- TORCH_CHECK(
- bps == BitDepth::NOT_PROVIDED,
- "htk does not support `bits_per_sample` option.");
- return std::make_tuple<>(SOX_ENCODING_SIGN2, 16);
- case Format::VORBIS:
- TORCH_CHECK(
- enc == Encoding::NOT_PROVIDED,
- "vorbis does not support `encoding` option.");
- TORCH_CHECK(
- bps == BitDepth::NOT_PROVIDED,
- "vorbis does not support `bits_per_sample` option.");
- return std::make_tuple<>(SOX_ENCODING_VORBIS, 0);
- case Format::AMR_NB:
- TORCH_CHECK(
- enc == Encoding::NOT_PROVIDED,
- "amr-nb does not support `encoding` option.");
- TORCH_CHECK(
- bps == BitDepth::NOT_PROVIDED,
- "amr-nb does not support `bits_per_sample` option.");
- return std::make_tuple<>(SOX_ENCODING_AMR_NB, 16);
- case Format::FLAC:
- TORCH_CHECK(
- enc == Encoding::NOT_PROVIDED,
- "flac does not support `encoding` option.");
- switch (bps) {
- case BitDepth::B32:
- case BitDepth::B64:
- TORCH_CHECK(
- false, "flac does not support `bits_per_sample` larger than 24.");
- default:
- return std::make_tuple<>(
- SOX_ENCODING_FLAC, static_cast(bps));
- }
- case Format::SPHERE:
- switch (enc) {
- case Encoding::NOT_PROVIDED:
- case Encoding::PCM_SIGNED:
- switch (bps) {
- case BitDepth::NOT_PROVIDED:
- return std::make_tuple<>(SOX_ENCODING_SIGN2, 32);
- default:
- return std::make_tuple<>(
- SOX_ENCODING_SIGN2, static_cast(bps));
- }
- case Encoding::PCM_UNSIGNED:
- TORCH_CHECK(false, "sph does not support unsigned integer PCM.");
- case Encoding::PCM_FLOAT:
- TORCH_CHECK(false, "sph does not support floating point PCM.");
- case Encoding::ULAW:
- switch (bps) {
- case BitDepth::NOT_PROVIDED:
- case BitDepth::B8:
- return std::make_tuple<>(SOX_ENCODING_ULAW, 8);
- default:
- TORCH_CHECK(
- false, "sph only supports 8-bit for mu-law encoding.");
- }
- case Encoding::ALAW:
- switch (bps) {
- case BitDepth::NOT_PROVIDED:
- case BitDepth::B8:
- return std::make_tuple<>(SOX_ENCODING_ALAW, 8);
- default:
- return std::make_tuple<>(
- SOX_ENCODING_ALAW, static_cast(bps));
- }
- default:
- TORCH_CHECK(
- false, "sph does not support encoding: ", encoding.value());
- }
- case Format::GSM:
- TORCH_CHECK(
- enc == Encoding::NOT_PROVIDED,
- "gsm does not support `encoding` option.");
- TORCH_CHECK(
- bps == BitDepth::NOT_PROVIDED,
- "gsm does not support `bits_per_sample` option.");
- return std::make_tuple<>(SOX_ENCODING_GSM, 16);
-
- default:
- TORCH_CHECK(false, "Unsupported format: " + format);
- }
-}
-
-unsigned get_precision(const std::string& filetype, caffe2::TypeMeta dtype) {
- if (filetype == "mp3") {
- return SOX_UNSPEC;
- }
- if (filetype == "flac") {
- return 24;
- }
- if (filetype == "ogg" || filetype == "vorbis") {
- return SOX_UNSPEC;
- }
- if (filetype == "wav" || filetype == "amb") {
- switch (dtype.toScalarType()) {
- case c10::ScalarType::Byte:
- return 8;
- case c10::ScalarType::Short:
- return 16;
- case c10::ScalarType::Int:
- return 32;
- case c10::ScalarType::Float:
- return 32;
- default:
- TORCH_CHECK(false, "Unsupported dtype: ", dtype);
- }
- }
- if (filetype == "sph") {
- return 32;
- }
- if (filetype == "amr-nb") {
- return 16;
- }
- if (filetype == "gsm") {
- return 16;
- }
- if (filetype == "htk") {
- return 16;
- }
- TORCH_CHECK(false, "Unsupported file type: ", filetype);
-}
-
-} // namespace
-
-sox_signalinfo_t get_signalinfo(
- const torch::Tensor* waveform,
- const int64_t sample_rate,
- const std::string& filetype,
- const bool channels_first) {
- return sox_signalinfo_t{
- /*rate=*/static_cast(sample_rate),
- /*channels=*/
- static_cast(waveform->size(channels_first ? 0 : 1)),
- /*precision=*/get_precision(filetype, waveform->dtype()),
- /*length=*/static_cast(waveform->numel()),
- nullptr};
-}
-
-sox_encodinginfo_t get_tensor_encodinginfo(caffe2::TypeMeta dtype) {
- sox_encoding_t encoding = [&]() {
- switch (dtype.toScalarType()) {
- case c10::ScalarType::Byte:
- return SOX_ENCODING_UNSIGNED;
- case c10::ScalarType::Short:
- return SOX_ENCODING_SIGN2;
- case c10::ScalarType::Int:
- return SOX_ENCODING_SIGN2;
- case c10::ScalarType::Float:
- return SOX_ENCODING_FLOAT;
- default:
- TORCH_CHECK(false, "Unsupported dtype: ", dtype);
- }
- }();
- unsigned bits_per_sample = [&]() {
- switch (dtype.toScalarType()) {
- case c10::ScalarType::Byte:
- return 8;
- case c10::ScalarType::Short:
- return 16;
- case c10::ScalarType::Int:
- return 32;
- case c10::ScalarType::Float:
- return 32;
- default:
- TORCH_CHECK(false, "Unsupported dtype: ", dtype);
- }
- }();
- return sox_encodinginfo_t{
- /*encoding=*/encoding,
- /*bits_per_sample=*/bits_per_sample,
- /*compression=*/HUGE_VAL,
- /*reverse_bytes=*/sox_option_default,
- /*reverse_nibbles=*/sox_option_default,
- /*reverse_bits=*/sox_option_default,
- /*opposite_endian=*/sox_false};
-}
-
-sox_encodinginfo_t get_encodinginfo_for_save(
- const std::string& format,
- const caffe2::TypeMeta& dtype,
- const std::optional& compression,
- const std::optional& encoding,
- const std::optional& bits_per_sample) {
- auto enc = get_save_encoding(format, dtype, encoding, bits_per_sample);
- return sox_encodinginfo_t{
- /*encoding=*/std::get<0>(enc),
- /*bits_per_sample=*/std::get<1>(enc),
- /*compression=*/compression.value_or(HUGE_VAL),
- /*reverse_bytes=*/sox_option_default,
- /*reverse_nibbles=*/sox_option_default,
- /*reverse_bits=*/sox_option_default,
- /*opposite_endian=*/sox_false};
-}
-
-} // namespace torchaudio::sox
diff --git a/src/libtorchaudio/sox/utils.h b/src/libtorchaudio/sox/utils.h
deleted file mode 100644
index b26e25f65e..0000000000
--- a/src/libtorchaudio/sox/utils.h
+++ /dev/null
@@ -1,112 +0,0 @@
-#ifndef TORCHAUDIO_SOX_UTILS_H
-#define TORCHAUDIO_SOX_UTILS_H
-
-#include
-#include
-
-namespace torchaudio::sox {
-
-////////////////////////////////////////////////////////////////////////////////
-// APIs for Python interaction
-////////////////////////////////////////////////////////////////////////////////
-
-/// Set sox global options
-void set_seed(const int64_t seed);
-
-void set_verbosity(const int64_t verbosity);
-
-void set_use_threads(const bool use_threads);
-
-void set_buffer_size(const int64_t buffer_size);
-
-int64_t get_buffer_size();
-
-std::vector> list_effects();
-
-std::vector list_read_formats();
-
-std::vector list_write_formats();
-
-////////////////////////////////////////////////////////////////////////////////
-// Utilities for sox_io / sox_effects implementations
-////////////////////////////////////////////////////////////////////////////////
-
-extern const std::unordered_set UNSUPPORTED_EFFECTS;
-
-/// helper class to automatically close sox_format_t*
-struct SoxFormat {
- explicit SoxFormat(sox_format_t* fd) noexcept;
- SoxFormat(const SoxFormat& other) = delete;
- SoxFormat(SoxFormat&& other) = delete;
- SoxFormat& operator=(const SoxFormat& other) = delete;
- SoxFormat& operator=(SoxFormat&& other) = delete;
- ~SoxFormat();
- sox_format_t* operator->() const noexcept;
- operator sox_format_t*() const noexcept;
-
- void close();
-
- private:
- sox_format_t* fd_;
-};
-
-///
-/// Verify that input file is found, has known encoding, and not empty
-void validate_input_file(const SoxFormat& sf, const std::string& path);
-
-///
-/// Verify that input Tensor is 2D, CPU and either uin8, int16, int32 or float32
-void validate_input_tensor(const torch::Tensor&);
-
-///
-/// Get target dtype for the given encoding and precision.
-caffe2::TypeMeta get_dtype(
- const sox_encoding_t encoding,
- const unsigned precision);
-
-///
-/// Convert sox_sample_t buffer to uint8/int16/int32/float32 Tensor
-/// NOTE: This function might modify the values in the input buffer to
-/// reduce the number of memory copy.
-/// @param buffer Pointer to buffer that contains audio data.
-/// @param num_samples The number of samples to read.
-/// @param num_channels The number of channels. Used to reshape the resulting
-/// Tensor.
-/// @param dtype Target dtype. Determines the output dtype and value range in
-/// conjunction with normalization.
-/// @param noramlize Perform normalization. Only effective when dtype is not
-/// kFloat32. When effective, the output tensor is kFloat32 type and value range
-/// is [-1.0, 1.0]
-/// @param channels_first When True, output Tensor has shape of [num_channels,
-/// num_frames].
-torch::Tensor convert_to_tensor(
- sox_sample_t* buffer,
- const int32_t num_samples,
- const int32_t num_channels,
- const caffe2::TypeMeta dtype,
- const bool normalize,
- const bool channels_first);
-
-/// Extract extension from file path
-const std::string get_filetype(const std::string& path);
-
-/// Get sox_signalinfo_t for passing a torch::Tensor object.
-sox_signalinfo_t get_signalinfo(
- const torch::Tensor* waveform,
- const int64_t sample_rate,
- const std::string& filetype,
- const bool channels_first);
-
-/// Get sox_encodinginfo_t for Tensor I/O
-sox_encodinginfo_t get_tensor_encodinginfo(const caffe2::TypeMeta dtype);
-
-/// Get sox_encodinginfo_t for saving to file/file object
-sox_encodinginfo_t get_encodinginfo_for_save(
- const std::string& format,
- const caffe2::TypeMeta& dtype,
- const std::optional& compression,
- const std::optional& encoding,
- const std::optional& bits_per_sample);
-
-} // namespace torchaudio::sox
-#endif
diff --git a/src/torchaudio/__init__.py b/src/torchaudio/__init__.py
index f57572e5c8..f21454226c 100644
--- a/src/torchaudio/__init__.py
+++ b/src/torchaudio/__init__.py
@@ -15,7 +15,6 @@
kaldi_io,
models,
pipelines,
- sox_effects,
transforms,
utils,
)
@@ -205,6 +204,5 @@ def save(
"pipelines",
"kaldi_io",
"utils",
- "sox_effects",
"transforms",
]
diff --git a/src/torchaudio/_extension/__init__.py b/src/torchaudio/_extension/__init__.py
index 5c2ff55583..11f7c6deec 100644
--- a/src/torchaudio/_extension/__init__.py
+++ b/src/torchaudio/_extension/__init__.py
@@ -4,7 +4,7 @@
from torchaudio._internal.module_utils import fail_with_message, is_module_available, no_op
-from .utils import _check_cuda_version, _init_dll_path, _init_sox, _LazyImporter, _load_lib
+from .utils import _check_cuda_version, _init_dll_path, _LazyImporter, _load_lib
_LG = logging.getLogger(__name__)
@@ -17,7 +17,6 @@
"_check_cuda_version",
"_IS_TORCHAUDIO_EXT_AVAILABLE",
"_IS_RIR_AVAILABLE",
- "lazy_import_sox_ext",
]
@@ -44,17 +43,6 @@
_IS_ALIGN_AVAILABLE = torchaudio.lib._torchaudio.is_align_available()
-_SOX_EXT = None
-
-
-def lazy_import_sox_ext():
- """Load SoX integration based on availability in lazy manner"""
-
- global _SOX_EXT
- if _SOX_EXT is None:
- _SOX_EXT = _LazyImporter("_torchaudio_sox", _init_sox)
- return _SOX_EXT
-
fail_if_no_rir = (
no_op
diff --git a/src/torchaudio/_extension/utils.py b/src/torchaudio/_extension/utils.py
index c5660a1e22..1cbe3d93e5 100644
--- a/src/torchaudio/_extension/utils.py
+++ b/src/torchaudio/_extension/utils.py
@@ -61,51 +61,6 @@ def _load_lib(lib: str) -> bool:
return True
-def _import_sox_ext():
- if os.name == "nt":
- raise RuntimeError("sox extension is not supported on Windows")
- if not eval_env("TORCHAUDIO_USE_SOX", True):
- raise RuntimeError("sox extension is disabled. (TORCHAUDIO_USE_SOX=0)")
-
- ext = "torchaudio.lib._torchaudio_sox"
-
- if not importlib.util.find_spec(ext):
- raise RuntimeError(
- # fmt: off
- "TorchAudio is not built with sox extension. "
- "Please build TorchAudio with libsox support. (BUILD_SOX=1)"
- # fmt: on
- )
-
- _load_lib("libtorchaudio_sox")
- return importlib.import_module(ext)
-
-
-def _init_sox():
- ext = _import_sox_ext()
- ext.set_verbosity(0)
-
- import atexit
-
- torch.ops.torchaudio_sox.initialize_sox_effects()
- atexit.register(torch.ops.torchaudio_sox.shutdown_sox_effects)
-
- # Bundle functions registered with TORCH_LIBRARY into extension
- # so that they can also be accessed in the same (lazy) manner
- # from the extension.
- keys = [
- "get_info",
- "load_audio_file",
- "save_audio_file",
- "apply_effects_tensor",
- "apply_effects_file",
- ]
- for key in keys:
- setattr(ext, key, getattr(torch.ops.torchaudio_sox, key))
-
- return ext
-
-
class _LazyImporter(types.ModuleType):
"""Lazily import module/extension."""
diff --git a/src/torchaudio/_internal/module_utils.py b/src/torchaudio/_internal/module_utils.py
index 45956cb175..2201055954 100644
--- a/src/torchaudio/_internal/module_utils.py
+++ b/src/torchaudio/_internal/module_utils.py
@@ -97,10 +97,6 @@ def decorator(func):
{func.__doc__}
"""
- # This is a temporary fix to avoid depending on sox during testing.
- # It will be removed once the sox dependency is removed from the rest of the codebase.
- if 'sox' not in func.__module__:
- UNSUPPORTED.append(wrapped)
return wrapped
return decorator
diff --git a/src/torchaudio/sox_effects/__init__.py b/src/torchaudio/sox_effects/__init__.py
deleted file mode 100644
index 93c63cae1d..0000000000
--- a/src/torchaudio/sox_effects/__init__.py
+++ /dev/null
@@ -1,10 +0,0 @@
-from .sox_effects import apply_effects_file, apply_effects_tensor, effect_names, init_sox_effects, shutdown_sox_effects
-
-
-__all__ = [
- "init_sox_effects",
- "shutdown_sox_effects",
- "effect_names",
- "apply_effects_tensor",
- "apply_effects_file",
-]
diff --git a/src/torchaudio/sox_effects/sox_effects.py b/src/torchaudio/sox_effects/sox_effects.py
deleted file mode 100644
index 256c461edc..0000000000
--- a/src/torchaudio/sox_effects/sox_effects.py
+++ /dev/null
@@ -1,275 +0,0 @@
-import os
-from typing import List, Optional, Tuple
-
-import torch
-import torchaudio
-from torchaudio._internal.module_utils import deprecated, dropping_support
-from torchaudio.utils.sox_utils import list_effects
-
-
-sox_ext = torchaudio._extension.lazy_import_sox_ext()
-
-
-@deprecated("Please remove the call. This function is called automatically.")
-def init_sox_effects():
- """Initialize resources required to use sox effects.
-
- Note:
- You do not need to call this function manually. It is called automatically.
-
- Once initialized, you do not need to call this function again across the multiple uses of
- sox effects though it is safe to do so as long as :func:`shutdown_sox_effects` is not called yet.
- Once :func:`shutdown_sox_effects` is called, you can no longer use SoX effects and initializing
- again will result in error.
- """
- pass
-
-
-@deprecated("Please remove the call. This function is called automatically.")
-def shutdown_sox_effects():
- """Clean up resources required to use sox effects.
-
- Note:
- You do not need to call this function manually. It is called automatically.
-
- It is safe to call this function multiple times.
- Once :py:func:`shutdown_sox_effects` is called, you can no longer use SoX effects and
- initializing again will result in error.
- """
- pass
-
-
-@dropping_support
-def effect_names() -> List[str]:
- """Gets list of valid sox effect names
-
- Returns:
- List[str]: list of available effect names.
-
- Example
- >>> torchaudio.sox_effects.effect_names()
- ['allpass', 'band', 'bandpass', ... ]
- """
- return list(list_effects().keys())
-
-
-@dropping_support
-def apply_effects_tensor(
- tensor: torch.Tensor,
- sample_rate: int,
- effects: List[List[str]],
- channels_first: bool = True,
-) -> Tuple[torch.Tensor, int]:
- """Apply sox effects to given Tensor
-
- .. devices:: CPU
-
- .. properties:: TorchScript
-
- Note:
- This function only works on CPU Tensors.
- This function works in the way very similar to ``sox`` command, however there are slight
- differences. For example, ``sox`` command adds certain effects automatically (such as
- ``rate`` effect after ``speed`` and ``pitch`` and other effects), but this function does
- only applies the given effects. (Therefore, to actually apply ``speed`` effect, you also
- need to give ``rate`` effect with desired sampling rate.).
-
- Args:
- tensor (torch.Tensor): Input 2D CPU Tensor.
- sample_rate (int): Sample rate
- effects (List[List[str]]): List of effects.
- channels_first (bool, optional): Indicates if the input Tensor's dimension is
- `[channels, time]` or `[time, channels]`
-
- Returns:
- (Tensor, int): Resulting Tensor and sample rate.
- The resulting Tensor has the same ``dtype`` as the input Tensor, and
- the same channels order. The shape of the Tensor can be different based on the
- effects applied. Sample rate can also be different based on the effects applied.
-
- Example - Basic usage
- >>>
- >>> # Defines the effects to apply
- >>> effects = [
- ... ['gain', '-n'], # normalises to 0dB
- ... ['pitch', '5'], # 5 cent pitch shift
- ... ['rate', '8000'], # resample to 8000 Hz
- ... ]
- >>>
- >>> # Generate pseudo wave:
- >>> # normalized, channels first, 2ch, sampling rate 16000, 1 second
- >>> sample_rate = 16000
- >>> waveform = 2 * torch.rand([2, sample_rate * 1]) - 1
- >>> waveform.shape
- torch.Size([2, 16000])
- >>> waveform
- tensor([[ 0.3138, 0.7620, -0.9019, ..., -0.7495, -0.4935, 0.5442],
- [-0.0832, 0.0061, 0.8233, ..., -0.5176, -0.9140, -0.2434]])
- >>>
- >>> # Apply effects
- >>> waveform, sample_rate = apply_effects_tensor(
- ... wave_form, sample_rate, effects, channels_first=True)
- >>>
- >>> # Check the result
- >>> # The new waveform is sampling rate 8000, 1 second.
- >>> # normalization and channel order are preserved
- >>> waveform.shape
- torch.Size([2, 8000])
- >>> waveform
- tensor([[ 0.5054, -0.5518, -0.4800, ..., -0.0076, 0.0096, -0.0110],
- [ 0.1331, 0.0436, -0.3783, ..., -0.0035, 0.0012, 0.0008]])
- >>> sample_rate
- 8000
-
- Example - Torchscript-able transform
- >>>
- >>> # Use `apply_effects_tensor` in `torch.nn.Module` and dump it to file,
- >>> # then run sox effect via Torchscript runtime.
- >>>
- >>> class SoxEffectTransform(torch.nn.Module):
- ... effects: List[List[str]]
- ...
- ... def __init__(self, effects: List[List[str]]):
- ... super().__init__()
- ... self.effects = effects
- ...
- ... def forward(self, tensor: torch.Tensor, sample_rate: int):
- ... return sox_effects.apply_effects_tensor(
- ... tensor, sample_rate, self.effects)
- ...
- ...
- >>> # Create transform object
- >>> effects = [
- ... ["lowpass", "-1", "300"], # apply single-pole lowpass filter
- ... ["rate", "8000"], # change sample rate to 8000
- ... ]
- >>> transform = SoxEffectTensorTransform(effects, input_sample_rate)
- >>>
- >>> # Dump it to file and load
- >>> path = 'sox_effect.zip'
- >>> torch.jit.script(trans).save(path)
- >>> transform = torch.jit.load(path)
- >>>
- >>>> # Run transform
- >>> waveform, input_sample_rate = torchaudio.load("input.wav")
- >>> waveform, sample_rate = transform(waveform, input_sample_rate)
- >>> assert sample_rate == 8000
- """
- return sox_ext.apply_effects_tensor(tensor, sample_rate, effects, channels_first)
-
-
-@dropping_support
-def apply_effects_file(
- path: str,
- effects: List[List[str]],
- normalize: bool = True,
- channels_first: bool = True,
- format: Optional[str] = None,
-) -> Tuple[torch.Tensor, int]:
- """Apply sox effects to the audio file and load the resulting data as Tensor
-
- .. devices:: CPU
-
- .. properties:: TorchScript
-
- Note:
- This function works in the way very similar to ``sox`` command, however there are slight
- differences. For example, ``sox`` commnad adds certain effects automatically (such as
- ``rate`` effect after ``speed``, ``pitch`` etc), but this function only applies the given
- effects. Therefore, to actually apply ``speed`` effect, you also need to give ``rate``
- effect with desired sampling rate, because internally, ``speed`` effects only alter sampling
- rate and leave samples untouched.
-
- Args:
- path (path-like object):
- Source of audio data.
- effects (List[List[str]]): List of effects.
- normalize (bool, optional):
- When ``True``, this function converts the native sample type to ``float32``.
- Default: ``True``.
-
- If input file is integer WAV, giving ``False`` will change the resulting Tensor type to
- integer type.
- This argument has no effect for formats other than integer WAV type.
-
- channels_first (bool, optional): When True, the returned Tensor has dimension `[channel, time]`.
- Otherwise, the returned Tensor's dimension is `[time, channel]`.
- format (str or None, optional):
- Override the format detection with the given format.
- Providing the argument might help when libsox can not infer the format
- from header or extension,
-
- Returns:
- (Tensor, int): Resulting Tensor and sample rate.
- If ``normalize=True``, the resulting Tensor is always ``float32`` type.
- If ``normalize=False`` and the input audio file is of integer WAV file, then the
- resulting Tensor has corresponding integer type. (Note 24 bit integer type is not supported)
- If ``channels_first=True``, the resulting Tensor has dimension `[channel, time]`,
- otherwise `[time, channel]`.
-
- Example - Basic usage
- >>>
- >>> # Defines the effects to apply
- >>> effects = [
- ... ['gain', '-n'], # normalises to 0dB
- ... ['pitch', '5'], # 5 cent pitch shift
- ... ['rate', '8000'], # resample to 8000 Hz
- ... ]
- >>>
- >>> # Apply effects and load data with channels_first=True
- >>> waveform, sample_rate = apply_effects_file("data.wav", effects, channels_first=True)
- >>>
- >>> # Check the result
- >>> waveform.shape
- torch.Size([2, 8000])
- >>> waveform
- tensor([[ 5.1151e-03, 1.8073e-02, 2.2188e-02, ..., 1.0431e-07,
- -1.4761e-07, 1.8114e-07],
- [-2.6924e-03, 2.1860e-03, 1.0650e-02, ..., 6.4122e-07,
- -5.6159e-07, 4.8103e-07]])
- >>> sample_rate
- 8000
-
- Example - Apply random speed perturbation to dataset
- >>>
- >>> # Load data from file, apply random speed perturbation
- >>> class RandomPerturbationFile(torch.utils.data.Dataset):
- ... \"\"\"Given flist, apply random speed perturbation
- ...
- ... Suppose all the input files are at least one second long.
- ... \"\"\"
- ... def __init__(self, flist: List[str], sample_rate: int):
- ... super().__init__()
- ... self.flist = flist
- ... self.sample_rate = sample_rate
- ...
- ... def __getitem__(self, index):
- ... speed = 0.5 + 1.5 * random.randn()
- ... effects = [
- ... ['gain', '-n', '-10'], # apply 10 db attenuation
- ... ['remix', '-'], # merge all the channels
- ... ['speed', f'{speed:.5f}'], # duration is now 0.5 ~ 2.0 seconds.
- ... ['rate', f'{self.sample_rate}'],
- ... ['pad', '0', '1.5'], # add 1.5 seconds silence at the end
- ... ['trim', '0', '2'], # get the first 2 seconds
- ... ]
- ... waveform, _ = torchaudio.sox_effects.apply_effects_file(
- ... self.flist[index], effects)
- ... return waveform
- ...
- ... def __len__(self):
- ... return len(self.flist)
- ...
- >>> dataset = RandomPerturbationFile(file_list, sample_rate=8000)
- >>> loader = torch.utils.data.DataLoader(dataset, batch_size=32)
- >>> for batch in loader:
- >>> pass
- """
- if not torch.jit.is_scripting():
- if hasattr(path, "read"):
- raise RuntimeError(
- "apply_effects_file function does not support file-like object. "
- "Please use torchaudio.io.AudioEffector."
- )
- path = os.fspath(path)
- return sox_ext.apply_effects_file(path, effects, normalize, channels_first, format)
diff --git a/src/torchaudio/utils/__init__.py b/src/torchaudio/utils/__init__.py
index 9d4dd2dd72..be1f0bad21 100644
--- a/src/torchaudio/utils/__init__.py
+++ b/src/torchaudio/utils/__init__.py
@@ -1,10 +1,8 @@
from torio.utils import ffmpeg_utils
-from . import sox_utils
from .download import _download_asset
__all__ = [
- "sox_utils",
"ffmpeg_utils",
]
diff --git a/src/torchaudio/utils/sox_utils.py b/src/torchaudio/utils/sox_utils.py
deleted file mode 100644
index 8cc68361d5..0000000000
--- a/src/torchaudio/utils/sox_utils.py
+++ /dev/null
@@ -1,118 +0,0 @@
-"""Module to change the configuration of libsox, which is used by I/O functions like
-:py:mod:`~torchaudio.backend.sox_io_backend` and :py:mod:`~torchaudio.sox_effects`.
-
-.. warning::
- Starting with version 2.8, we are refactoring TorchAudio to transition it
- into a maintenance phase. As a result:
-
- - Some APIs are deprecated in 2.8 and will be removed in 2.9.
- - The decoding and encoding capabilities of PyTorch for both audio and video
- are being consolidated into TorchCodec.
-
- Please see https://github.com/pytorch/audio/issues/3902 for more information.
-"""
-
-from typing import Dict, List
-
-import torchaudio
-
-sox_ext = torchaudio._extension.lazy_import_sox_ext()
-
-from torchaudio._internal.module_utils import dropping_support
-
-@dropping_support
-def set_seed(seed: int):
- """Set libsox's PRNG
-
- Args:
- seed (int): seed value. valid range is int32.
-
- See Also:
- http://sox.sourceforge.net/sox.html
- """
- sox_ext.set_seed(seed)
-
-
-@dropping_support
-def set_verbosity(verbosity: int):
- """Set libsox's verbosity
-
- Args:
- verbosity (int): Set verbosity level of libsox.
-
- * ``1`` failure messages
- * ``2`` warnings
- * ``3`` details of processing
- * ``4``-``6`` increasing levels of debug messages
-
- See Also:
- http://sox.sourceforge.net/sox.html
- """
- sox_ext.set_verbosity(verbosity)
-
-
-@dropping_support
-def set_buffer_size(buffer_size: int):
- """Set buffer size for sox effect chain
-
- Args:
- buffer_size (int): Set the size in bytes of the buffers used for processing audio.
-
- See Also:
- http://sox.sourceforge.net/sox.html
- """
- sox_ext.set_buffer_size(buffer_size)
-
-
-@dropping_support
-def set_use_threads(use_threads: bool):
- """Set multithread option for sox effect chain
-
- Args:
- use_threads (bool): When ``True``, enables ``libsox``'s parallel effects channels processing.
- To use mutlithread, the underlying ``libsox`` has to be compiled with OpenMP support.
-
- See Also:
- http://sox.sourceforge.net/sox.html
- """
- sox_ext.set_use_threads(use_threads)
-
-
-@dropping_support
-def list_effects() -> Dict[str, str]:
- """List the available sox effect names
-
- Returns:
- Dict[str, str]: Mapping from ``effect name`` to ``usage``
- """
- return dict(sox_ext.list_effects())
-
-
-@dropping_support
-def list_read_formats() -> List[str]:
- """List the supported audio formats for read
-
- Returns:
- List[str]: List of supported audio formats
- """
- return sox_ext.list_read_formats()
-
-
-@dropping_support
-def list_write_formats() -> List[str]:
- """List the supported audio formats for write
-
- Returns:
- List[str]: List of supported audio formats
- """
- return sox_ext.list_write_formats()
-
-
-@dropping_support
-def get_buffer_size() -> int:
- """Get buffer size for sox effect chain
-
- Returns:
- int: size in bytes of buffers used for processing audio.
- """
- return sox_ext.get_buffer_size()
diff --git a/test/torchaudio_unittest/common_utils/case_utils.py b/test/torchaudio_unittest/common_utils/case_utils.py
index b99b96f5b0..7ce9c89dd3 100644
--- a/test/torchaudio_unittest/common_utils/case_utils.py
+++ b/test/torchaudio_unittest/common_utils/case_utils.py
@@ -109,7 +109,6 @@ class TorchaudioTestCase(TestBaseMixin, PytorchTestCase):
_IS_FFMPEG_AVAILABLE = torio._extension.lazy_import_ffmpeg_ext().is_available()
-_IS_SOX_AVAILABLE = torchaudio._extension.lazy_import_sox_ext().is_available()
_IS_CTC_DECODER_AVAILABLE = None
_IS_CUDA_CTC_DECODER_AVAILABLE = None
diff --git a/third_party/sox/CMakeLists.txt b/third_party/sox/CMakeLists.txt
deleted file mode 100644
index db96f05faf..0000000000
--- a/third_party/sox/CMakeLists.txt
+++ /dev/null
@@ -1,21 +0,0 @@
-include(FetchContent)
-
-FetchContent_Declare(
- sox_src
- URL https://downloads.sourceforge.net/project/sox/sox/14.4.2/sox-14.4.2.tar.bz2
- URL_HASH SHA256=81a6956d4330e75b5827316e44ae381e6f1e8928003c6aa45896da9041ea149c
- PATCH_COMMAND ""
- CONFIGURE_COMMAND ""
- BUILD_COMMAND ""
- )
-# FetchContent_MakeAvailable will parse the downloaded content and setup the targets.
-# We want to only download and not build, so we run Populate manually.
-if(NOT sox_src_POPULATED)
- FetchContent_Populate(sox_src)
-endif()
-
-add_library(sox SHARED stub.c)
-if(APPLE)
- set_target_properties(sox PROPERTIES SUFFIX .dylib)
-endif(APPLE)
-target_include_directories(sox PUBLIC ${sox_src_SOURCE_DIR}/src)
diff --git a/third_party/sox/stub.c b/third_party/sox/stub.c
deleted file mode 100644
index 4e668caf37..0000000000
--- a/third_party/sox/stub.c
+++ /dev/null
@@ -1,85 +0,0 @@
-#include
-
-int sox_add_effect(
- sox_effects_chain_t* chain,
- sox_effect_t* effp,
- sox_signalinfo_t* in,
- sox_signalinfo_t const* out) {
- return -1;
-}
-int sox_close(sox_format_t* ft) {
- return -1;
-}
-
-sox_effect_t* sox_create_effect(sox_effect_handler_t const* eh) {
- return NULL;
-}
-
-sox_effects_chain_t* sox_create_effects_chain(
- sox_encodinginfo_t const* in_enc,
- sox_encodinginfo_t const* out_enc) {
- return NULL;
-}
-
-void sox_delete_effect(sox_effect_t* effp) {}
-void sox_delete_effects_chain(sox_effects_chain_t* ecp) {}
-
-int sox_effect_options(sox_effect_t* effp, int argc, char* const argv[]) {
- return -1;
-}
-
-const sox_effect_handler_t* sox_find_effect(char const* name) {
- return NULL;
-}
-
-int sox_flow_effects(
- sox_effects_chain_t* chain,
- int callback(sox_bool all_done, void* client_data),
- void* client_data) {
- return -1;
-}
-
-const sox_effect_fn_t* sox_get_effect_fns(void) {
- return NULL;
-}
-
-const sox_format_tab_t* sox_get_format_fns(void) {
- return NULL;
-}
-
-sox_globals_t* sox_get_globals(void) {
- return NULL;
-}
-
-sox_format_t* sox_open_read(
- char const* path,
- sox_signalinfo_t const* signal,
- sox_encodinginfo_t const* encoding,
- char const* filetype) {
- return NULL;
-}
-
-sox_format_t* sox_open_write(
- char const* path,
- sox_signalinfo_t const* signal,
- sox_encodinginfo_t const* encoding,
- char const* filetype,
- sox_oob_t const* oob,
- sox_bool overwrite_permitted(char const* filename)) {
- return NULL;
-}
-
-const char* sox_strerror(int sox_errno) {
- return NULL;
-}
-
-size_t sox_write(sox_format_t* ft, const sox_sample_t* buf, size_t len) {
- return 0;
-}
-
-int sox_init() {
- return -1;
-};
-int sox_quit() {
- return -1;
-};
diff --git a/tools/setup_helpers/extension.py b/tools/setup_helpers/extension.py
index 58f5087854..b322541e36 100644
--- a/tools/setup_helpers/extension.py
+++ b/tools/setup_helpers/extension.py
@@ -51,13 +51,6 @@ def get_ext_modules():
Extension(name="torchaudio.lib.libtorchaudio", sources=[]),
Extension(name="torchaudio.lib._torchaudio", sources=[]),
]
- if _BUILD_SOX:
- modules.extend(
- [
- Extension(name="torchaudio.lib.libtorchaudio_sox", sources=[]),
- Extension(name="torchaudio.lib._torchaudio_sox", sources=[]),
- ]
- )
if _BUILD_CUDA_CTC_DECODER:
modules.extend(
[