#include <torch/csrc/inductor/aoti_torch/c/shim.h>
#include <torch/csrc/inductor/aoti_torch/utils.h>
#include <libtorchaudio/accessor.h>
+#include <torch/headeronly/util/Half.h>

using namespace std;
@@ -22,7 +23,7 @@ template <typename scalar_t, typename target_t>
void forced_align_impl(
    const Tensor logProbs,
    const Tensor targets,
-    const Tensor blank,
+    target_t blank,
    Tensor paths) {
  const scalar_t kNegInfinity = -std::numeric_limits<scalar_t>::infinity();
  const auto batchIndex =
@@ -143,15 +144,15 @@ std::tuple<Tensor, Tensor> compute(
  TORCH_CHECK(logProbs.is_cpu(), "log_probs must be a CPU tensor");
  TORCH_CHECK(targets.is_cpu(), "targets must be a CPU tensor");
  TORCH_CHECK(
-      logProbs.device() == targets.device(),
+      logProbs.get_device() == targets.get_device(),
      "log_probs and targets need to be on the same device");
  TORCH_CHECK(
-      logProbs.dtype() == torch::kFloat64 ||
-          logProbs.dtype() == torch::kFloat32 ||
-          logProbs.dtype() == torch::kFloat16,
+      logProbs.dtype() == aoti_torch_dtype_float64() ||
+          logProbs.dtype() == aoti_torch_dtype_float32() ||
+          logProbs.dtype() == aoti_torch_dtype_float16(),
      "log_probs must be float64, float32 or float16 (half) type");
  TORCH_CHECK(
-      targets.dtype() == torch::kInt32 || targets.dtype() == torch::kInt64,
+      targets.dtype() == aoti_torch_dtype_int32() || targets.dtype() == aoti_torch_dtype_int64(),
      "targets must be int32 or int64 type");
  TORCH_CHECK(logProbs.is_contiguous(), "log_probs must be contiguous");
  TORCH_CHECK(targets.is_contiguous(), "targets must be contiguous");
@@ -174,38 +175,41 @@ std::tuple<Tensor, Tensor> compute(
      blank >= 0 && blank < logProbs.size(-1),
      "blank must be within [0, num classes)");

-  TORCH_CHECK(
-      logProbs.size(1) == at::max(inputLengths).item().toInt(),
-      "input length mismatch");
-  TORCH_CHECK(
-      targets.size(1) == at::max(targetLengths).item().toInt(),
-      "target length mismatch");
+  // TODO: Requires port of `max` operator.
+  // TORCH_CHECK(
+  //     logProbs.size(1) == at::max(inputLengths).item().toInt(),
+  //     "input length mismatch");
+  // TORCH_CHECK(
+  //     targets.size(1) == at::max(targetLengths).item().toInt(),
+  //     "target length mismatch");

  const auto B = logProbs.size(0);
  const auto T = logProbs.size(1);

  int64_t paths_size[2] = {B, T};
  int64_t paths_stride[2] = {T, 1};
  AtenTensorHandle paths_h;
-  aoti_torch_empty_strided(1, paths_size, paths_stride, targets_dtype, targets_device, targets_device_index, &paths_h);
+  int32_t targets_device;
+  aoti_torch_get_device_type(targets.get(), &targets_device);
+  aoti_torch_empty_strided(2, paths_size, paths_stride, targets.dtype(), targets_device, targets.get_device(), &paths_h);
  auto paths = Tensor(paths_h);

  if (targets.dtype() == aoti_torch_dtype_int64()) {
-    if (logProbs.scalar_type() == aoti_torch_dtype_float64()) {
-      forced_align_impl<float64, int64>(logProbs, targets, blank, paths);
-    } else if (logProbs.scalar_type() == aoti_torch_dtype_float32()) {
-      forced_align_impl<float32, int64>(logProbs, targets, blank, paths);
-    } else if (logProbs.scalar_type() == aoti_torch_dtype_float16()) {
-      forced_align_impl<float16, int64>(logProbs, targets, blank, paths);
+    if (logProbs.dtype() == aoti_torch_dtype_float64()) {
+      forced_align_impl<double, int64_t>(logProbs, targets, blank, paths);
+    } else if (logProbs.dtype() == aoti_torch_dtype_float32()) {
+      forced_align_impl<float, int64_t>(logProbs, targets, blank, paths);
+    } else if (logProbs.dtype() == aoti_torch_dtype_float16()) {
+      forced_align_impl<c10::Half, int64_t>(logProbs, targets, blank, paths);
    }
-  } else if (targets.scalar_type() == aoti_torch_dtype_int32()) {
-    if (logProbs.scalar_type() == aoti_torch_dtype_float64()) {
-      forced_align_impl<float64, int32>(logProbs, targets, blank, paths);
-    } else if (logProbs.scalar_type() == aoti_torch_dtype_float32()) {
-      forced_align_impl<float32, int32>(logProbs, targets, blank, paths);
-    } else if (logProbs.scalar_type() == aoti_torch_dtype_float16()) {
-      forced_align_impl<float16, int32>(logProbs, targets, blank, paths);
+  } else if (targets.dtype() == aoti_torch_dtype_int32()) {
+    if (logProbs.dtype() == aoti_torch_dtype_float64()) {
+      forced_align_impl<double, int32_t>(logProbs, targets, blank, paths);
+    } else if (logProbs.dtype() == aoti_torch_dtype_float32()) {
+      forced_align_impl<float, int32_t>(logProbs, targets, blank, paths);
+    } else if (logProbs.dtype() == aoti_torch_dtype_float16()) {
+      forced_align_impl<c10::Half, int32_t>(logProbs, targets, blank, paths);
    }
  }
  return std::make_tuple(
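Side note, not part of the diff: the length checks commented out above with "TODO: Requires port of `max` operator" could be restored without `at::max` by scanning the length tensors directly. A minimal sketch, assuming `inputLengths`/`targetLengths` are contiguous 1-D int64 CPU tensors, that the `Tensor` wrapper exposes `get()` and `size()` as used elsewhere in this file, and that `<algorithm>` and `<limits>` are available:

// Hedged sketch: hand-rolled max over a contiguous 1-D int64 CPU tensor,
// as a stand-in until at::max is available through the stable ABI.
// max_length is a hypothetical helper, not part of the PR.
static int64_t max_length(const Tensor& lengths) {
  void* data = nullptr;
  // aoti_torch_get_data_ptr comes from the AOTI C shim included above.
  aoti_torch_get_data_ptr(lengths.get(), &data);
  const int64_t* p = static_cast<const int64_t*>(data);
  int64_t m = std::numeric_limits<int64_t>::min();
  for (int64_t i = 0; i < lengths.size(0); ++i) {
    m = std::max(m, p[i]);
  }
  return m;
}

// Usage, mirroring the original checks:
// TORCH_CHECK(logProbs.size(1) == max_length(inputLengths), "input length mismatch");
// TORCH_CHECK(targets.size(1) == max_length(targetLengths), "target length mismatch");

If the length tensors can also be int32, the sketch would need a dtype branch like the dispatch above before casting the data pointer.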