Commit 38e0f03

cat: improve error handling and error messages. (#9548)
1 parent 095faec commit 38e0f03

4 files changed: +36 -12 lines changed

test/test_operations.py

Lines changed: 13 additions & 0 deletions
@@ -2473,6 +2473,19 @@ def test_construct_large_tensor_raises_error(self):
     # OOM is raised when we try to bring data from the device.
     b.cpu()
 
+  def test_cat_raises_error_on_incompatible_shapes(self):
+    a = torch.rand(2, 2, device=torch_xla.device())
+    b = torch.rand(5, 1, device=torch_xla.device())
+
+    try:
+      torch.cat([a, b])
+    except RuntimeError as e:
+      expected_error = (
+          "cat(): cannot concatenate tensors of shape f32[2,2] with f32[5,1] "
+          "at dimension 0. Expected shapes to be equal (except at dimension 0) "
+          "or that either of them was a 1D empty tensor of size (0,).")
+      self.assertEqual(str(e), expected_error)
+
 
 class MNISTComparator(nn.Module):

torch_xla/csrc/aten_xla_type.cpp

Lines changed: 4 additions & 3 deletions
@@ -1314,9 +1314,10 @@ at::Tensor XLANativeFunctions::bmm(const at::Tensor& self,
 at::Tensor XLANativeFunctions::cat(const at::ITensorListRef& tensors,
                                    int64_t dim) {
   TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::");
-  return bridge::AtenFromXlaTensor(
-      tensor_methods::cat(GetValueOrThrow(bridge::GetXlaTensors(tensors)), dim,
-                          at::native::result_type(tensors)));
+  auto xtensors = GetValueOrThrow(bridge::GetXlaTensors(tensors));
+  auto output = GetValueOrThrow(
+      tensor_methods::cat(xtensors, dim, at::native::result_type(tensors)));
+  return bridge::AtenFromXlaTensor(std::move(output));
 }
 
 at::Tensor XLANativeFunctions::celu(const at::Tensor& self,
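
For context: tensor_methods::cat now returns a StatusOr, so this caller unwraps each fallible step with GetValueOrThrow, which converts a failed absl::Status into a C++ exception that Python ultimately sees as a RuntimeError (as the new test expects). A minimal sketch of that unwrap-or-throw pattern, with a hypothetical ValueOrThrow helper standing in for the repo's GetValueOrThrow:

#include <stdexcept>
#include <string>
#include <utility>

#include "absl/status/status.h"
#include "absl/status/statusor.h"

// Hypothetical stand-in for torch_xla's GetValueOrThrow: return the value
// on success, throw on error. (The real helper presumably raises the
// c10-style exception that surfaces in Python as RuntimeError.)
template <typename T>
T ValueOrThrow(absl::StatusOr<T> maybe) {
  if (!maybe.ok()) {
    throw std::runtime_error(std::string(maybe.status().message()));
  }
  return *std::move(maybe);
}

// A fallible operation in the style of the new tensor_methods::cat.
absl::StatusOr<int> OnlyPositive(int x) {
  if (x <= 0) {
    return absl::InvalidArgumentError("expected a positive value");
  }
  return x;
}

int main() {
  int three = ValueOrThrow(OnlyPositive(3));  // returns 3
  int boom = ValueOrThrow(OnlyPositive(-1));  // throws std::runtime_error
  (void)three;
  (void)boom;
  return 0;
}

Splitting the call into two unwraps keeps each fallible step on its own line, so an error from bridge::GetXlaTensors is distinguishable from one produced by cat itself.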

torch_xla/csrc/tensor_methods.cpp

Lines changed: 16 additions & 7 deletions
@@ -9,6 +9,7 @@
 #include <algorithm>
 #include <functional>
 
+#include "absl/log/absl_check.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_split.h"
 #include "torch_xla/csrc/LazyIr.h"
@@ -1160,18 +1161,19 @@ std::vector<XLATensorPtr> broadcast_tensors(
   return tensors.front()->MakeOutputTensors(node);
 }
 
-XLATensorPtr cat(absl::Span<const XLATensorPtr> tensors, int64_t dim,
-                 at::ScalarType dtype) {
+absl::StatusOr<absl_nonnull XLATensorPtr> cat(
+    absl::Span<const XLATensorPtr> tensors, int64_t dim, at::ScalarType dtype) {
   // Shape checks for cat:
   // - If not empty, every tensor shape must be the same.
   // - Empty tensor passes but is simply ignore in implementation,
   //   e.g. ([2, 3, 5], [])
   // - If empty dimension, other dimensions must be the same.
   //   e.g. ([4, 0, 32, 32], [4, 2, 32, 32], dim=1) passes.
   //        ([4, 0, 32, 32], [4, 2, 31, 32], dim=1) throws.
-  XLA_CHECK_GT(tensors.size(), 0);
+  ABSL_CHECK(tensors.size() > 0);
   std::vector<torch::lazy::Value> values;
   std::vector<xla::Shape> shapes;
+  size_t last_tensor_index;
   for (size_t i = 0; i < tensors.size(); ++i) {
     xla::Shape tensor_shape = tensors[i]->shape();
     if (tensor_shape.dimensions_size() == 1 &&
@@ -1181,13 +1183,20 @@ XLATensorPtr cat(absl::Span<const XLATensorPtr> tensors, int64_t dim,
     dim = torch::lazy::GetCanonicalDimensionIndex(
         dim, tensor_shape.dimensions_size());
     tensor_shape.DeleteDimension(dim);
-    if (!shapes.empty()) {
-      XLA_CHECK(xla::ShapeUtil::CompatibleIgnoringElementType(shapes.back(),
-                                                              tensor_shape))
-          << shapes.back() << " vs. " << tensor_shape;
+    if (!shapes.empty() && !xla::ShapeUtil::CompatibleIgnoringElementType(
+                               shapes.back(), tensor_shape)) {
+      auto last_tensor = tensors[last_tensor_index];
+      auto tensor = tensors[i];
+      return XLA_ERROR_WITH_LOCATION(absl::InvalidArgumentError(absl::StrCat(
+          "cat(): cannot concatenate tensors of shape ",
+          last_tensor->shape().get().ToString(), " with ",
+          tensor->shape().get().ToString(), " at dimension ", dim,
+          ". Expected shapes to be equal (except at dimension ", dim,
+          ") or that either of them was a 1D empty tensor of size (0,).")));
     }
     shapes.push_back(tensor_shape);
     values.push_back(tensors[i]->GetIrValue());
+    last_tensor_index = i;
   }
   if (values.empty()) {
     return tensors[0];
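
This is the substantive change: the hard XLA_CHECK (which aborts with an internal-looking message) becomes an early return of absl::InvalidArgumentError carrying a user-oriented message, with XLA_ERROR_WITH_LOCATION apparently attaching the source location; last_tensor_index tracks the most recent non-skipped tensor so the message names the actual offending pair, and the branch only fires when shapes is non-empty, so last_tensor_index is always set by then. A simplified, self-contained sketch of this report-don't-crash pattern; Shape, ToString, CompatibleExceptAt, and CheckCatShapes are illustrative stand-ins, not repo APIs:

#include <cstdint>
#include <string>
#include <vector>

#include "absl/status/status.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_join.h"
#include "absl/types/span.h"

// Minimal shape stand-in: a list of dimension sizes.
using Shape = std::vector<int64_t>;

std::string ToString(const Shape& s) {
  return absl::StrCat("[", absl::StrJoin(s, ","), "]");
}

// True when the shapes agree on every dimension except `dim` (assumes
// 0 <= dim < rank, i.e. an already-canonicalized dimension index).
bool CompatibleExceptAt(Shape a, Shape b, int64_t dim) {
  if (a.size() != b.size()) return false;
  a.erase(a.begin() + dim);
  b.erase(b.begin() + dim);
  return a == b;
}

// Report-don't-crash: instead of CHECK-failing on the first incompatible
// pair, return an InvalidArgumentError naming both offending shapes.
absl::Status CheckCatShapes(absl::Span<const Shape> shapes, int64_t dim) {
  for (size_t i = 1; i < shapes.size(); ++i) {
    if (!CompatibleExceptAt(shapes[i - 1], shapes[i], dim)) {
      return absl::InvalidArgumentError(absl::StrCat(
          "cat(): cannot concatenate tensors of shape ",
          ToString(shapes[i - 1]), " with ", ToString(shapes[i]),
          " at dimension ", dim));
    }
  }
  return absl::OkStatus();
}

With shapes {4,0,32,32} and {4,2,31,32} at dim=1, the sketch (like the real code) reports the mismatch instead of crashing, since dimensions other than 1 differ.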

torch_xla/csrc/tensor_methods.h

Lines changed: 3 additions & 2 deletions
@@ -1,6 +1,7 @@
 #ifndef XLA_TORCH_XLA_CSRC_TENSOR_METHODS_H_
 #define XLA_TORCH_XLA_CSRC_TENSOR_METHODS_H_
 
+#include "absl/base/nullability.h"
 #include "torch_xla/csrc/cross_replica_reduces.h"
 #include "torch_xla/csrc/ops/custom_sharding.h"
 #include "torch_xla/csrc/runtime/computation_client.h"
@@ -307,8 +308,8 @@ XLATensorPtr bmm(const XLATensorPtr& batch1, const XLATensorPtr& batch2);
 std::vector<XLATensorPtr> broadcast_tensors(
     absl::Span<const XLATensorPtr> tensors);
 
-XLATensorPtr cat(absl::Span<const XLATensorPtr> tensors, int64_t dim,
-                 at::ScalarType dtype);
+absl::StatusOr<absl_nonnull XLATensorPtr> cat(
+    absl::Span<const XLATensorPtr> tensors, int64_t dim, at::ScalarType dtype);
 
 XLATensorPtr cdist_forward(const XLATensorPtr& x1, const XLATensorPtr& x2,
                            double p);
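
The new declaration also carries Abseil's absl_nonnull annotation (hence the added nullability.h include), documenting that the pointer inside a successful StatusOr is never null. A tiny sketch of the same annotation on a standard smart pointer; MakeCounter is a made-up example, not a repo API:

#include <memory>

#include "absl/base/nullability.h"
#include "absl/status/statusor.h"

// absl_nonnull documents (and lets nullability-aware tooling verify) that
// the smart pointer returned on the success path is never null.
absl::StatusOr<absl_nonnull std::unique_ptr<int>> MakeCounter() {
  return std::make_unique<int>(0);
}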
