Skip to content

Commit 18efb7b

Browse files
committed
debug: 1
1 parent f4e4593 commit 18efb7b

File tree

2 files changed

+18
-5
lines changed

2 files changed

+18
-5
lines changed

.github/scripts/unittest-windows/run_test.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ env | grep TORCHAUDIO || true
1313

1414
cd test
1515
if [ -z "${CUDA_VERSION:-}" ] ; then
16-
pytest --continue-on-collection-errors --cov=torchaudio --junitxml=${RUNNER_TEST_RESULTS_DIR}/junit.xml -v --durations 20 torchaudio_unittest -k "not torchscript and not fairseq and not demucs and not librosa"
16+
pytest -sv --continue-on-collection-errors --cov=torchaudio --junitxml=${RUNNER_TEST_RESULTS_DIR}/junit.xml -v --durations 20 torchaudio_unittest -k "not torchscript and not fairseq and not demucs and not librosa"
1717
else
18-
pytest --continue-on-collection-errors --cov=torchaudio --junitxml=${RUNNER_TEST_RESULTS_DIR}/junit.xml -v --durations 20 torchaudio_unittest -k "not cpu and (cuda or gpu) and not torchscript and not fairseq and not demucs and not librosa"
18+
pytest -sv --continue-on-collection-errors --cov=torchaudio --junitxml=${RUNNER_TEST_RESULTS_DIR}/junit.xml -v --durations 20 torchaudio_unittest -k "not cpu and (cuda or gpu) and not torchscript and not fairseq and not demucs and not librosa"
1919
fi
2020
coverage html

src/libtorchaudio/forced_align/gpu/compute.cu

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
#include <cub/cub.cuh>
77
#include <limits.h>
8+
#include <iostream>
89

910
namespace {
1011
constexpr int kNumThreads =
@@ -119,6 +120,7 @@ void forced_align_impl(
119120
const Tensor& targets,
120121
const int64_t blank,
121122
Tensor& paths) {
123+
std::cout << "forced_align_impl: entering" << std::endl;
122124
auto defaultStream = at::cuda::getCurrentCUDAStream();
123125
auto cpuDataTranferStream = at::cuda::getStreamFromPool();
124126
const scalar_t kNegInfinity = -std::numeric_limits<scalar_t>::infinity();
@@ -132,23 +134,28 @@ void forced_align_impl(
132134
const int L = targets.size(1); // label length
133135
const int S = 2 * L + 1;
134136

137+
std::cout << "forced_align_impl: 1" << std::endl;
135138
auto targetsCpu = torchaudio::stable::cpu(targets);
136139
// backPtrBuffer stores the index offset of the best path at the current position
137140
// We copy the values to the CPU after processing every kBackPtrBufferSize
138141
// frames.
142+
std::cout << "forced_align_impl: 2" << std::endl;
139143
Tensor backPtrBuffer = torch::stable::new_empty(logProbs, {min(kBackPtrBufferSize, T), S}, ScalarType::Char);
140144
torch::stable::fill_(backPtrBuffer, -1);
141145

146+
std::cout << "forced_align_impl: 3" << std::endl;
142147
Tensor backPtrCpu = torch::stable::new_empty(targetsCpu, {T, S}, ScalarType::Char);
143148
torch::stable::fill_(backPtrCpu, -1);
144149

145150
// we store only two time frames for alphas
146151
// alphas for the current time frame can be computed from only the previous
147152
// time frame.
153+
std::cout << "forced_align_impl: 4" << std::endl;
148154
Tensor alphas = torch::stable::new_empty(logProbs, {2, S});
149155
torch::stable::fill_(alphas, kNegInfinity);
150156

151157
// CPU accessors
158+
std::cout << "forced_align_impl: 5" << std::endl;
152159
auto targetsCpu_a = torchaudio::stable::accessor<target_t, 2>(targetsCpu);
153160
auto backPtrCpu_a = torchaudio::stable::accessor<int8_t, 2>(backPtrCpu);
154161
// count the number of repeats in label
@@ -170,6 +177,7 @@ void forced_align_impl(
170177
int end = (S == 1) ? 1 : 2;
171178
int backPtrBufferLen = 0;
172179
Tensor bufferCopy;
180+
std::cout << "forced_align_impl: 6" << std::endl;
173181
for (int t = 0; t < T; ++t) {
174182
if (t > 0) {
175183
if (T - t <= L + R) {
@@ -189,6 +197,7 @@ void forced_align_impl(
189197
end = end + 1;
190198
}
191199
}
200+
std::cout << "forced_align_impl: t=" << t << std::endl;
192201
falign_cuda_step_kernel<scalar_t, target_t>
193202
<<<1, kNumThreads, 0, defaultStream>>>(
194203
packed_accessor32<scalar_t, 3>(logProbs),
@@ -227,6 +236,7 @@ void forced_align_impl(
227236
backPtrBufferLen = 0;
228237
}
229238
}
239+
std::cout << "forced_align_impl: 7" << std::endl;
230240
cpuDataTranferStream.synchronize();
231241
auto alphasCpu = torchaudio::stable::cpu(alphas);
232242
auto alphasCpu_a = torchaudio::stable::accessor<scalar_t, 2>(alphasCpu);
@@ -235,12 +245,14 @@ void forced_align_impl(
235245
alphasCpu_a[curIdxOffset][S - 1] > alphasCpu_a[curIdxOffset][S - 2]
236246
? S - 1
237247
: S - 2;
248+
std::cout << "forced_align_impl: 8" << std::endl;
238249
for (int t = T - 1; t >= 0; --t) {
239250
auto lbl_idx =
240251
ltrIdx % 2 == 0 ? blank : targetsCpu_a[batchIndex][ltrIdx / 2];
241252
paths_a[batchIndex][t] = lbl_idx;
242253
ltrIdx -= backPtrCpu_a[t][ltrIdx];
243254
}
255+
std::cout << "forced_align_impl: leaving" << std::endl;
244256
}
245257

246258
template <typename scalar_t>
@@ -256,7 +268,7 @@ std::tuple<Tensor, Tensor> compute(
256268
Tensor inputLengths,
257269
Tensor targetLengths,
258270
const int64_t blank) {
259-
271+
std::cout << "forced_align: compute" << std::endl;
260272
STD_TORCH_CHECK(logProbs.is_cuda(), "log_probs must be a CUDA tensor");
261273
STD_TORCH_CHECK(targets.is_cuda(), "targets must be a CUDA tensor");
262274
STD_TORCH_CHECK(
@@ -301,9 +313,9 @@ std::tuple<Tensor, Tensor> compute(
301313

302314
auto B = logProbs.size(0);
303315
auto T = logProbs.size(1); // num frames
304-
316+
std::cout << "forced_align: compute: 1" << std::endl;
305317
Tensor paths = torchaudio::stable::new_zeros(targets, {B, T}, /*dtype=*/std::nullopt, /*layout=*/std::nullopt, /*device=*/torchaudio::stable::cpu_device());
306-
318+
std::cout << "forced_align: compute: 2" << std::endl;
307319
THO_DISPATCH_V2(logProbs.scalar_type(), "forced_align_impl", AT_WRAP([&] {
308320
if (targets.scalar_type() == ScalarType::Long) {
309321
forced_align_long_impl<scalar_t>(logProbs, targets, blank, paths);
@@ -318,6 +330,7 @@ std::tuple<Tensor, Tensor> compute(
318330

319331

320332
STABLE_TORCH_LIBRARY_IMPL(torchaudio, CUDA, m) {
333+
std::cout << "forced_align: library impl" << std::endl;
321334
m.impl("forced_align", TORCH_BOX(&compute));
322335
}
323336

0 commit comments

Comments
 (0)