Commit f7b442b

Clean up
1 parent 7d09bdd commit f7b442b

5 files changed: +52 -92 lines changed

.github/scripts/unittest-windows/run_test.sh

Lines changed: 2 additions & 2 deletions
@@ -13,8 +13,8 @@ env | grep TORCHAUDIO || true
 
 cd test
 if [ -z "${CUDA_VERSION:-}" ] ; then
-    pytest -sv --continue-on-collection-errors --cov=torchaudio --junitxml=${RUNNER_TEST_RESULTS_DIR}/junit.xml -v --durations 20 torchaudio_unittest -k "not torchscript and not fairseq and not demucs and not librosa and (rnnt or forced_align)"
+    pytest --continue-on-collection-errors --cov=torchaudio --junitxml=${RUNNER_TEST_RESULTS_DIR}/junit.xml -v --durations 20 torchaudio_unittest -k "not torchscript and not fairseq and not demucs and not librosa"
else
-    pytest -sv --continue-on-collection-errors --cov=torchaudio --junitxml=${RUNNER_TEST_RESULTS_DIR}/junit.xml -v --durations 20 torchaudio_unittest -k "not cpu and (cuda or gpu) and not torchscript and not fairseq and not demucs and not librosa and (rnnt or forced_align)"
+    pytest --continue-on-collection-errors --cov=torchaudio --junitxml=${RUNNER_TEST_RESULTS_DIR}/junit.xml -v --durations 20 torchaudio_unittest -k "not cpu and (cuda or gpu) and not torchscript and not fairseq and not demucs and not librosa"
 fi
 coverage html

src/libtorchaudio/forced_align/cpu/compute.cpp

Lines changed: 3 additions & 3 deletions
@@ -38,9 +38,9 @@ void forced_align_impl(
   for (int i = 0; i < T * S; i++) {
     backPtr_a[i] = -1;
   }
-  auto logProbs_a = accessor<scalar_t, 3>(logProbs);
-  auto targets_a = accessor<target_t, 2>(targets);
-  auto paths_a = accessor<target_t, 2>(paths);
+  auto logProbs_a = torchaudio::accessor<scalar_t, 3>(logProbs);
+  auto targets_a = torchaudio::accessor<target_t, 2>(targets);
+  auto paths_a = torchaudio::accessor<target_t, 2>(paths);
   auto R = 0;
   for (auto i = 1; i < L; i++) {
     if (targets_a[batchIndex][i] == targets_a[batchIndex][i - 1]) {

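Note: the newly qualified torchaudio::accessor calls presumably resolve to a shared helper consolidated in libtorchaudio/utils.h (the include added to overdrive.cpp below). Judging from the local definition this commit deletes from overdrive.cpp, that helper is roughly equivalent to this sketch:

namespace torchaudio {
// Wrap a stable-ABI Tensor in a header-only accessor; a stopgap until
// Tensor::accessor<T, N> is supported in the stable ABI.
template <typename T, size_t N>
using TensorAccessor = torch::headeronly::HeaderOnlyTensorAccessor<T, N>;

template <typename T, size_t N>
inline TensorAccessor<T, N> accessor(torch::stable::Tensor t) {
  return TensorAccessor<T, N>(
      reinterpret_cast<T*>(t.data_ptr()), t.sizes().data(), t.strides().data());
}
} // namespace torchaudio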
src/libtorchaudio/forced_align/gpu/compute.cu

Lines changed: 13 additions & 37 deletions
@@ -5,7 +5,6 @@
 
 #include <cub/cub.cuh>
 #include <limits.h>
-#include <iostream>
 
 namespace {
 constexpr int kNumThreads =
@@ -23,9 +22,9 @@ using torch::headeronly::ScalarType;
 
 template <typename scalar_t, typename target_t>
 __global__ void falign_cuda_step_kernel(
-    const PackedTensorAccessor32<scalar_t, 3>
+    const torchaudio::PackedTensorAccessor32<scalar_t, 3>
         logProbs_a,
-    const PackedTensorAccessor32<target_t, 2>
+    const torchaudio::PackedTensorAccessor32<target_t, 2>
         targets_a,
     const int T,
     const int L,
@@ -36,9 +35,9 @@ __global__ void falign_cuda_step_kernel(
     int start,
     int end,
     int backPtrBufferLen,
-    PackedTensorAccessor32<scalar_t, 2>
+    torchaudio::PackedTensorAccessor32<scalar_t, 2>
        alphas_a,
-    PackedTensorAccessor32<int8_t, 2>
+    torchaudio::PackedTensorAccessor32<int8_t, 2>
         backPtrBuffer_a) {
   scalar_t kNegInfinity = -std::numeric_limits<scalar_t>::infinity();
   const int batchIndex =
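Why the kernel parameters use a packed accessor rather than the host-side torchaudio::accessor: a host accessor holds pointers into the tensor's sizes/strides arrays, which a __global__ kernel cannot dereference, whereas a packed accessor stores sizes and strides by value (with 32-bit index arithmetic) so the whole object can be passed to the kernel by copy. A self-contained sketch of the idea; the struct and its layout are illustrative, not torchaudio's actual implementation:

#include <cstddef>
#include <cstdint>

// Illustrative stand-in for a 32-bit packed accessor: everything is stored
// by value, so the object survives being copied into a CUDA kernel argument.
template <typename T, size_t N>
struct PackedAccessor32 {
  T* data;
  int32_t sizes[N];
  int32_t strides[N];

  // Compute the flat offset of an N-dimensional index via strides.
  T& at(const int32_t (&idx)[N]) const {
    int64_t off = 0;
    for (size_t d = 0; d < N; ++d) {
      off += static_cast<int64_t>(idx[d]) * strides[d];
    }
    return data[off];
  }
};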
@@ -120,44 +119,38 @@ void forced_align_impl(
     const Tensor& targets,
     const int64_t blank,
     Tensor& paths) {
-  std::cout << "forced_align_impl: entering" << std::endl;
   auto defaultStream = at::cuda::getCurrentCUDAStream();
   auto cpuDataTranferStream = at::cuda::getStreamFromPool();
   const scalar_t kNegInfinity = -std::numeric_limits<scalar_t>::infinity();
   using target_t = typename std::
       conditional<target_scalar_type == ScalarType::Int, int, int64_t>::type;
-  auto paths_a = accessor<target_t, 2>(paths);
+  auto paths_a = torchaudio::accessor<target_t, 2>(paths);
   const int batchIndex =
       0; // TODO: support batch version and use the real batch index
   const int T = logProbs.size(1); // num frames
   const int N = logProbs.size(2); // alphabet size
   const int L = targets.size(1); // label length
   const int S = 2 * L + 1;
 
-  std::cout << "forced_align_impl: 1" << std::endl;
   auto targetsCpu = torchaudio::stable::cpu(targets);
   // backPtrBuffer stores the index offset fthe best path at current position
   // We copy the values to CPU after running every kBackPtrBufferSize of
   // frames.
-  std::cout << "forced_align_impl: 2" << std::endl;
   Tensor backPtrBuffer = torch::stable::new_empty(logProbs, {min(kBackPtrBufferSize, T), S}, ScalarType::Char);
   torch::stable::fill_(backPtrBuffer, -1);
 
-  std::cout << "forced_align_impl: 3" << std::endl;
   Tensor backPtrCpu = torch::stable::new_empty(targetsCpu, {T, S}, ScalarType::Char);
   torch::stable::fill_(backPtrCpu, -1);
 
   // we store only two time frames for alphas
   // alphas for compute current timeframe can be computed only from previous
   // time frame.
-  std::cout << "forced_align_impl: 4" << std::endl;
   Tensor alphas = torch::stable::new_empty(logProbs, {2, S});
   torch::stable::fill_(alphas, kNegInfinity);
 
   // CPU accessors
-  std::cout << "forced_align_impl: 5" << std::endl;
-  auto targetsCpu_a = accessor<target_t, 2>(targetsCpu);
-  auto backPtrCpu_a = accessor<int8_t, 2>(backPtrCpu);
+  auto targetsCpu_a = torchaudio::accessor<target_t, 2>(targetsCpu);
+  auto backPtrCpu_a = torchaudio::accessor<int8_t, 2>(backPtrCpu);
   // count the number of repeats in label
   int R = 0;
   for (int i = 1; i < L; ++i) {
@@ -177,7 +170,6 @@ void forced_align_impl(
   int end = (S == 1) ? 1 : 2;
   int backPtrBufferLen = 0;
   Tensor bufferCopy;
-  std::cout << "forced_align_impl: 6" << std::endl;
   for (int t = 0; t < T; ++t) {
     if (t > 0) {
       if (T - t <= L + R) {
@@ -197,11 +189,10 @@ void forced_align_impl(
         end = end + 1;
       }
     }
-    std::cout << "forced_align_impl: t=" << t << std::endl;
     falign_cuda_step_kernel<scalar_t, target_t>
         <<<1, kNumThreads, 0, defaultStream>>>(
-            packed_accessor32<scalar_t, 3>(logProbs),
-            packed_accessor32<target_t, 2>(targets),
+            torchaudio::packed_accessor32<scalar_t, 3>(logProbs),
+            torchaudio::packed_accessor32<target_t, 2>(targets),
             T,
             L,
             N,
@@ -211,8 +202,8 @@ void forced_align_impl(
             start,
             end,
             backPtrBufferLen,
-            packed_accessor32<scalar_t, 2>(alphas),
-            packed_accessor32<int8_t, 2>(backPtrBuffer));
+            torchaudio::packed_accessor32<scalar_t, 2>(alphas),
+            torchaudio::packed_accessor32<int8_t, 2>(backPtrBuffer));
     C10_CUDA_KERNEL_LAUNCH_CHECK();
     ++backPtrBufferLen;
     if (backPtrBufferLen == kBackPtrBufferSize || t == T - 1) {
@@ -236,23 +227,20 @@ void forced_align_impl(
       backPtrBufferLen = 0;
     }
   }
-  std::cout << "forced_align_impl: 7" << std::endl;
   cpuDataTranferStream.synchronize();
   auto alphasCpu = torchaudio::stable::cpu(alphas);
-  auto alphasCpu_a = accessor<scalar_t, 2>(alphasCpu);
+  auto alphasCpu_a = torchaudio::accessor<scalar_t, 2>(alphasCpu);
   int curIdxOffset = ((T - 1) % 2);
   int ltrIdx =
       alphasCpu_a[curIdxOffset][S - 1] > alphasCpu_a[curIdxOffset][S - 2]
       ? S - 1
       : S - 2;
-  std::cout << "forced_align_impl: 8" << std::endl;
   for (int t = T - 1; t >= 0; --t) {
     auto lbl_idx =
         ltrIdx % 2 == 0 ? blank : targetsCpu_a[batchIndex][ltrIdx / 2];
     paths_a[batchIndex][t] = lbl_idx;
     ltrIdx -= backPtrCpu_a[t][ltrIdx];
   }
-  std::cout << "forced_align_impl: leaving" << std::endl;
 }
 
 std::tuple<Tensor, Tensor> compute(
@@ -261,7 +249,6 @@ std::tuple<Tensor, Tensor> compute(
     Tensor inputLengths,
     Tensor targetLengths,
     const int64_t blank) {
-  std::cout << "forced_align: compute" << std::endl;
   STD_TORCH_CHECK(logProbs.is_cuda(), "log_probs must be a CUDA tensor");
   STD_TORCH_CHECK(targets.is_cuda(), "targets must be a CUDA tensor");
   STD_TORCH_CHECK(
@@ -306,30 +293,19 @@ std::tuple<Tensor, Tensor> compute(
 
   auto B = logProbs.size(0);
   auto T = logProbs.size(1); // num frames
-  std::cout << "forced_align: compute: 1" << std::endl;
   Tensor paths = torchaudio::stable::new_zeros(targets, {B, T}, /*dtype=*/std::nullopt, /*layout=*/std::nullopt, /*device=*/torchaudio::stable::cpu_device());
-  std::cout << "forced_align: compute: 2" << std::endl;
   THO_DISPATCH_V2(logProbs.scalar_type(), "forced_align_impl", AT_WRAP([&] {
     if (targets.scalar_type() == ScalarType::Long) {
-      std::cout << "forced_align: compute: 2.1" << std::endl;
       (forced_align_impl<scalar_t, ScalarType::Long>(logProbs, targets, blank, paths));
-      std::cout << "forced_align: compute: 2.2" << std::endl;
     } else {
-      STD_TORCH_CHECK(targets.scalar_type() == ScalarType::Int, "unexpected dtype");
-      std::cout << "forced_align: compute: 2.3" << std::endl;
       (forced_align_impl<scalar_t, ScalarType::Int>(logProbs, targets, blank, paths));
-      std::cout << "forced_align: compute: 2.4" << std::endl;
-    }
+    }
   }), AT_EXPAND(AT_FLOATING_TYPES), ScalarType::Half);
-  std::cout << "forced_align: compute: 3" << std::endl;
   Tensor pathsCuda = torchaudio::stable::cuda(paths, logProbs.get_device_index());
-  std::cout << "forced_align: compute: 4" << std::endl;
   return std::make_tuple(pathsCuda, logProbs);
 }
 
-
 STABLE_TORCH_LIBRARY_IMPL(torchaudio, CUDA, m) {
-  std::cout << "forced_align: library impl" << std::endl;
   m.impl("forced_align", TORCH_BOX(&compute));
 }

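Note: most of the deletions in this file drop ad-hoc std::cout tracing left in the hot path (along with the now-unneeded <iostream> include). If tracing is ever needed again, a compile-time guard keeps release builds clean; a minimal sketch, where TORCHAUDIO_DEBUG_TRACE and TORCHAUDIO_DEBUG_LOG are hypothetical names, not existing macros:

#include <iostream>

// Compiles away entirely unless -DTORCHAUDIO_DEBUG_TRACE is set.
#ifdef TORCHAUDIO_DEBUG_TRACE
#define TORCHAUDIO_DEBUG_LOG(msg) (std::cerr << msg << std::endl)
#else
#define TORCHAUDIO_DEBUG_LOG(msg) ((void)0)
#endif

// Usage, e.g. inside forced_align_impl:
//   TORCHAUDIO_DEBUG_LOG("forced_align_impl: t=" << t);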
src/libtorchaudio/overdrive.cpp

Lines changed: 16 additions & 23 deletions
@@ -1,31 +1,20 @@
+#include <libtorchaudio/utils.h>
 #include <torch/csrc/stable/library.h>
 #include <torch/csrc/stable/ops.h>
 #include <torch/csrc/stable/tensor.h>
 #include <torch/headeronly/core/Dispatch_v2.h>
 #include <torch/headeronly/core/TensorAccessor.h>
 
 namespace {
-
 using torch::stable::Tensor;
 
-template <typename T, size_t N>
-using TensorAccessor = torch::headeronly::HeaderOnlyTensorAccessor<T, N>;
-
-// TODO: eliminate accessor<T, N>(t) in favor of t.accessor<T, N>
-// after Tensor::accessor is supported in stable ABI
-template <typename T, size_t N>
-inline TensorAccessor<T, N> accessor(Tensor t) {
-  return TensorAccessor<T, N>(
-      reinterpret_cast<T*>(t.data_ptr()), t.sizes().data(), t.strides().data());
-}
-
 template <typename scalar_t>
 void overdrive_cpu_kernel(
-    TensorAccessor<scalar_t, 2> waveform_accessor,
-    TensorAccessor<scalar_t, 2> temp_accessor,
-    TensorAccessor<scalar_t, 1> last_in_accessor,
-    TensorAccessor<scalar_t, 1> last_out_accessor,
-    TensorAccessor<scalar_t, 2> output_waveform_accessor) {
+    torchaudio::TensorAccessor<scalar_t, 2> waveform_accessor,
+    torchaudio::TensorAccessor<scalar_t, 2> temp_accessor,
+    torchaudio::TensorAccessor<scalar_t, 1> last_in_accessor,
+    torchaudio::TensorAccessor<scalar_t, 1> last_out_accessor,
+    torchaudio::TensorAccessor<scalar_t, 2> output_waveform_accessor) {
   int64_t n_frames = waveform_accessor.size(1);
   int64_t n_channels = waveform_accessor.size(0);
 
@@ -56,11 +45,11 @@ std::tuple<Tensor, Tensor, Tensor> overdrive_core_loop_cpu(
       "overdrive_cpu",
       AT_WRAP([&] {
         overdrive_cpu_kernel<scalar_t>(
-            accessor<scalar_t, 2>(waveform),
-            accessor<scalar_t, 2>(temp),
-            accessor<scalar_t, 1>(last_in),
-            accessor<scalar_t, 1>(last_out),
-            accessor<scalar_t, 2>(output_waveform));
+            torchaudio::accessor<scalar_t, 2>(waveform),
+            torchaudio::accessor<scalar_t, 2>(temp),
+            torchaudio::accessor<scalar_t, 1>(last_in),
+            torchaudio::accessor<scalar_t, 1>(last_out),
+            torchaudio::accessor<scalar_t, 2>(output_waveform));
       }),
       AT_FLOATING_TYPES);
   return std::make_tuple(last_in, last_out, output_waveform);
@@ -70,7 +59,11 @@ std::tuple<Tensor, Tensor, Tensor> overdrive_core_loop_cpu(
 
 STABLE_TORCH_LIBRARY_FRAGMENT(torchaudio, m) {
   m.def(
-      "_overdrive_core_loop(Tensor waveform, Tensor temp, Tensor(a!) last_in, Tensor(b!) last_out, Tensor(c!) output_waveform) -> (Tensor(a!), Tensor(b!), Tensor(c!))");
+      "_overdrive_core_loop(Tensor waveform,"
+      "Tensor temp,"
+      "Tensor(a!) last_in,"
+      "Tensor(b!) last_out,"
+      "Tensor(c!) output_waveform) -> (Tensor(a!), Tensor(b!), Tensor(c!))");
 }
 
 STABLE_TORCH_LIBRARY_IMPL(torchaudio, CPU, m) {

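Note: the schema reflow in the last hunk works because adjacent C++ string literals are concatenated at translation time with no separator inserted, so the split m.def(...) call still receives a single string. One detail worth noting: the fused schema reads "...waveform,Tensor temp,..." with no space after each comma, which assumes the schema parser tolerates missing whitespace there. A minimal check:

#include <cstdio>

int main() {
  // Adjacent literals fuse into one string constant at compile time.
  const char* schema =
      "_overdrive_core_loop(Tensor waveform,"
      "Tensor temp,"
      "Tensor(a!) last_in,"
      "Tensor(b!) last_out,"
      "Tensor(c!) output_waveform) -> (Tensor(a!), Tensor(b!), Tensor(c!))";
  std::puts(schema); // note: "waveform,Tensor" has no space between tokens
  return 0;
}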
src/libtorchaudio/rnnt/cpu/compute.cpp

Lines changed: 18 additions & 27 deletions
@@ -2,6 +2,7 @@
 #include <torch/csrc/stable/library.h>
 #include <torch/csrc/stable/ops.h>
 #include <torch/csrc/stable/tensor.h>
+#include <torch/headeronly/core/Dispatch_v2.h>
 
 namespace torchaudio {
 namespace rnnt {
@@ -109,40 +110,30 @@ std::tuple<Tensor, Tensor> compute(
       {DtypeWorkspace<float>::ComputeSizeFromOptions(options)},
       ScalarType::Float);
 
+  // TODO: use t.mutable_data_ptr<..>() instead of reinterpret_cast
+  // when stable ABI Tensor supports mutable_data_ptr templates.
   Workspace<float> workspace(
       /*options=*/options,
       /*dtype_data=*/reinterpret_cast<float*>(float_workspace.data_ptr()),
       /*dtype_size=*/float_workspace.numel(),
       /*int_data=*/reinterpret_cast<int*>(int_workspace.data_ptr()),
       /*int_size=*/int_workspace.numel());
 
-  switch (logits.scalar_type()) {
-    case ScalarType::Float: {
-      Compute</*DTYPE=*/float, /*CAST_DTYPE=*/float>(
-          /*workspace=*/workspace,
-          /*logits=*/reinterpret_cast<float*>(logits.data_ptr()),
-          /*targets=*/reinterpret_cast<int*>(targets.data_ptr()),
-          /*srcLengths=*/reinterpret_cast<int*>(logit_lengths.data_ptr()),
-          /*tgtLengths=*/reinterpret_cast<int*>(target_lengths.data_ptr()),
-          /*costs=*/reinterpret_cast<float*>(costs.data_ptr()),
-          /*gradients=*/reinterpret_cast<float*>(gradients.data_ptr()));
-      break;
-    }
-    case ScalarType::Half: {
-      Compute</*DTYPE=*/c10::Half, /*CAST_DTYPE=*/float>(
-          /*workspace=*/workspace,
-          /*logits=*/reinterpret_cast<c10::Half*>(logits.data_ptr()),
-          /*targets=*/reinterpret_cast<int*>(targets.data_ptr()),
-          /*srcLengths=*/reinterpret_cast<int*>(logit_lengths.data_ptr()),
-          /*tgtLengths=*/reinterpret_cast<int*>(target_lengths.data_ptr()),
-          /*costs=*/reinterpret_cast<c10::Half*>(costs.data_ptr()),
-          /*gradients=*/reinterpret_cast<c10::Half*>(gradients.data_ptr()));
-      break;
-    }
-    default: {
-      STD_TORCH_CHECK(false, "unreachable");
-    }
-  };
+  THO_DISPATCH_V2(
+      logits.scalar_type(),
+      "rnnt:compute",
+      AT_WRAP([&] {
+        (Compute</*DTYPE=*/scalar_t, /*CAST_DTYPE=*/float>(
+            /*workspace=*/workspace,
+            /*logits=*/reinterpret_cast<scalar_t*>(logits.data_ptr()),
+            /*targets=*/reinterpret_cast<int*>(targets.data_ptr()),
+            /*srcLengths=*/reinterpret_cast<int*>(logit_lengths.data_ptr()),
+            /*tgtLengths=*/reinterpret_cast<int*>(target_lengths.data_ptr()),
+            /*costs=*/reinterpret_cast<scalar_t*>(costs.data_ptr()),
+            /*gradients=*/reinterpret_cast<scalar_t*>(gradients.data_ptr())));
+      }),
+      ScalarType::Float,
+      ScalarType::Half);
 
   return std::make_tuple(costs, gradients);
 }

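Note: the THO_DISPATCH_V2 call replaces the hand-written switch deleted above; the macro instantiates the AT_WRAP lambda once per listed dtype, binding scalar_t to the matching C++ type (float for ScalarType::Float, the 16-bit float type for ScalarType::Half) and erroring on anything else. A self-contained sketch of the pattern, with hypothetical stand-in names (ScalarKind, Half, dispatch_float_half), not the real macro:

#include <cstdint>
#include <stdexcept>

enum class ScalarKind { Float, Half };
struct Half { uint16_t bits; }; // stand-in for the 16-bit float type

// Invoke body once with a value whose type plays the role of scalar_t.
template <typename Fn>
void dispatch_float_half(ScalarKind kind, Fn&& body) {
  switch (kind) {
    case ScalarKind::Float: body(float{}); break;
    case ScalarKind::Half:  body(Half{});  break;
    default: throw std::runtime_error("unsupported dtype");
  }
}

// Usage: a generic lambda recovers scalar_t, much as the dispatch body does.
// dispatch_float_half(kind, [&](auto tag) {
//   using scalar_t = decltype(tag);
//   // Compute<scalar_t, /*CAST_DTYPE=*/float>(...);
// });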