
Commit 6050927

Improve error message of functions related to GetXlaTensor(). (#9520)
1 parent: 30ad68a

File tree: 7 files changed (+160, -46 lines)

- test/quantized_ops/test_dot_general.py
- torch_xla/csrc/BUILD
- torch_xla/csrc/aten_xla_bridge.cpp
- torch_xla/csrc/aten_xla_bridge.h
- torch_xla/csrc/init_python_bindings.cpp
- torch_xla/csrc/status.cpp
- torch_xla/csrc/status.h

test/quantized_ops/test_dot_general.py

Lines changed: 19 additions & 0 deletions
@@ -56,6 +56,25 @@ def test_dot_general_int32_dtype(self):
         preferred_element_type=torch.int32)
     self.assertTrue(torch.allclose(xla_out.cpu(), expected_out))

+  def test_raises_error_on_non_xla_tensor(self):
+    lhs = torch.rand(10, 3, 4, dtype=torch.bfloat16)
+    rhs = torch.rand(10, 4, 5, dtype=torch.bfloat16)
+
+    def test(args, non_xla_tensor_arg):
+      arg_number_to_str = ["first", "second"]
+      position = arg_number_to_str[non_xla_tensor_arg]
+      try:
+        torch_xla._XLAC._xla_dot_general(*args, (([2], [1]), ([0], [0])))
+      except RuntimeError as err:
+        error_message = (
+            f"Expected input tensor ({position} argument) to be an actual XLA tensor. "
+            f"Got: CPUBFloat16Type. Consider moving it ({position} argument) to XLA."
+        )
+        self.assertEqual(str(err), error_message)
+
+    test((lhs, rhs.to(device)), non_xla_tensor_arg=0)
+    test((lhs.to(device), rhs), non_xla_tensor_arg=1)
+

 if __name__ == '__main__':
   test = unittest.main()
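
Note: the literal "CPUBFloat16Type" asserted above is produced by at::Tensor::toString(), which names the tensor's backend and dtype. A minimal standalone C++ sketch (illustrative, not part of this commit) showing where that string comes from:

// Minimal sketch, assuming only ATen: prints the backend/dtype string that
// the new error message splices in for a plain CPU bfloat16 input.
#include <ATen/ATen.h>
#include <iostream>

int main() {
  at::Tensor t = at::rand({10, 3, 4}, at::kBFloat16);  // CPU tensor, not XLA
  std::cout << t.toString() << std::endl;  // prints "CPUBFloat16Type"
}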

torch_xla/csrc/BUILD

Lines changed: 1 addition & 0 deletions
@@ -280,6 +280,7 @@ ptxla_cc_library(
         "//torch_xla/csrc/runtime:xla_coordinator",
         "//torch_xla/csrc/runtime:xla_util",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/log:absl_check",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/synchronization",
         "@com_google_absl//absl/types:variant",

torch_xla/csrc/aten_xla_bridge.cpp

Lines changed: 38 additions & 12 deletions
@@ -7,7 +7,7 @@
 #include <string>
 #include <vector>

-#include "absl/status/status.h"
+#include "absl/log/absl_check.h"
 #include "absl/strings/str_cat.h"
 #include "torch_xla/csrc/device.h"
 #include "torch_xla/csrc/runtime/debug_macros.h"
@@ -80,8 +80,12 @@ static absl::StatusOr<XLATensorImpl * absl_nonnull> GetXlaTensorImpl(
   XLATensorImpl* impl =
       dynamic_cast<XLATensorImpl*>(inner_tensor.unsafeGetTensorImpl());
   if (impl == nullptr) {
-    return XLA_ERROR_WITH_LOCATION(absl::InvalidArgumentError(absl::StrCat(
-        "Input tensor is not an XLA tensor: ", tensor.toString())));
+    auto error_message =
+        absl::StrCat("Failed retrieving the inner XLATensorImpl* from ",
+                     tensor.toString(), ". ",
+                     "It's likely that `tensor` is not an actual XLA tensor, "
+                     "i.e. it wasn't created inside PyTorch/XLA.");
+    return XLA_ERROR_WITH_LOCATION(absl::InvalidArgumentError(error_message));
   }
   return impl;
 }
@@ -99,41 +103,63 @@ absl::StatusOr<absl_nonnull XLATensorPtr> GetXlaTensor(
     // To make sure we have the most updated version of tensor.
     at::functionalization::impl::sync(tensor);
   }
-  XLA_ASSIGN_OR_RETURN(XLATensorImpl * impl, GetXlaTensorImpl(tensor));
+  XLA_ASSIGN_OR_RETURN(
+      XLATensorImpl * impl, GetXlaTensorImpl(tensor),
+      absl::StrCat("Expected XLA tensor. Got: ", tensor.toString()));
   return impl->tensor();
 }

 absl::StatusOr<std::vector<absl_nonnull XLATensorPtr>> GetXlaTensors(
     const at::ITensorListRef& tensors) {
   std::vector<absl_nonnull XLATensorPtr> xla_tensors;
   xla_tensors.reserve(tensors.size());
+  std::size_t index = 0;
   for (const auto& tensor : tensors) {
-    XLA_ASSIGN_OR_RETURN(XLATensorPtr ptr, bridge::GetXlaTensor(tensor));
+    XLA_ASSIGN_OR_RETURN(
+        XLATensorPtr ptr, bridge::GetXlaTensor(tensor),
+        absl::StrCat("Expected all tensors in the given list to be XLA "
+                     "tensors. Element at index ",
+                     index, " is not an XLA tensor. Got: ", tensor.toString()));
     xla_tensors.push_back(std::move(ptr));
+    index += 1;
   }
   return xla_tensors;
 }

+absl::StatusOr<absl_nonnull XLATensorPtr> GetInputXlaTensor(
+    const at::Tensor& tensor, const std::string_view param) {
+  XLA_ASSIGN_OR_RETURN(
+      XLATensorPtr ptr, GetXlaTensor(tensor),
+      absl::StrCat("Expected input tensor (", param,
+                   ") to be an actual XLA tensor. Got: ", tensor.toString(),
+                   ". Consider moving it (", param, ") to XLA."));
+  return ptr;
+}
+
 bool IsXlaTensor(const at::Tensor& tensor) {
   return GetXlaTensorImpl(tensor).ok();
 }

 absl::Status ReplaceXlaTensor(const at::Tensor& tensor,
                               XLATensorPtr new_xla_tensor) {
-  XLA_ASSIGN_OR_RETURN(XLATensorImpl * impl, GetXlaTensorImpl(tensor));
+  XLA_ASSIGN_OR_RETURN(XLATensorImpl * impl, GetXlaTensorImpl(tensor),
+                       "Failed replacing the XLA tensor in the given tensor.");
   impl->set_tensor(std::move(new_xla_tensor));
   return absl::OkStatus();
 }

 absl::Status ReplaceXlaTensor(const std::vector<at::Tensor>& tensors,
                               const std::vector<XLATensorPtr> new_xla_tensors) {
-  if (tensors.size() != new_xla_tensors.size()) {
-    return XLA_ERROR_WITH_LOCATION(absl::InvalidArgumentError(
-        absl::StrCat("The size of tensors and new_xla_tensors are not equal: ",
-                     tensors.size(), " vs. ", new_xla_tensors.size())));
-  }
+  ABSL_CHECK(tensors.size() == new_xla_tensors.size())
+      << "Expected the size of the list of tensors (" << tensors.size()
+      << ") to match the size of the list of XLATensorPtr ("
+      << new_xla_tensors.size() << ")";
   for (size_t i = 0; i < tensors.size(); ++i) {
-    XLA_RETURN_IF_ERROR(ReplaceXlaTensor(tensors[i], new_xla_tensors[i]));
+    XLA_RETURN_IF_ERROR(
+        ReplaceXlaTensor(tensors[i], new_xla_tensors[i]),
+        absl::StrCat(
+            "Failed replacing the XLA tensor at index ", i,
+            ". The reason being that that tensor is not an XLA tensor."));
   }
   return absl::OkStatus();
 }
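
The change above relies on the three-argument forms of XLA_ASSIGN_OR_RETURN and XLA_RETURN_IF_ERROR, which attach a context message to the propagated status. A rough sketch of that pattern, assuming only Abseil (an illustration of the idea, not PyTorch/XLA's actual macro definition):

#include <utility>

#include "absl/status/status.h"
#include "absl/status/statusor.h"
#include "absl/strings/str_cat.h"

// Illustrative macro: evaluates `expr`; on error, returns a status whose
// message is prefixed with `ctx`. Single-use-per-scope only: the temporary
// name is not uniquified, unlike production macros.
#define SKETCH_ASSIGN_OR_RETURN(lhs, expr, ctx)                      \
  auto statusor_tmp = (expr);                                        \
  if (!statusor_tmp.ok()) {                                          \
    return absl::Status(                                             \
        statusor_tmp.status().code(),                                \
        absl::StrCat(ctx, " ", statusor_tmp.status().message()));    \
  }                                                                  \
  lhs = std::move(statusor_tmp).value();

absl::StatusOr<int> ParseInt(const char* s);  // hypothetical helper

absl::StatusOr<int> ParsePort(const char* s) {
  SKETCH_ASSIGN_OR_RETURN(int port, ParseInt(s), "Expected a valid port.");
  return port;
}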

torch_xla/csrc/aten_xla_bridge.h

Lines changed: 9 additions & 0 deletions
@@ -59,6 +59,15 @@ absl::StatusOr<absl_nonnull XLATensorPtr> GetXlaTensor(
 absl::StatusOr<std::vector<absl_nonnull XLATensorPtr>> GetXlaTensors(
     const at::ITensorListRef& tensors);

+// Retrieves the underlying `XLATensorPtr` from `tensor`.
+//
+// If `tensor` is not an actual XLA tensor, this function will craft a
+// specialized error message for PyTorch operations or Python API
+// functions, i.e. functions where the parameter name makes sense for
+// the end user.
+absl::StatusOr<absl_nonnull XLATensorPtr> GetInputXlaTensor(
+    const at::Tensor& tensor, std::string_view param);
+
 bool IsXlaTensor(const at::Tensor& tensor);

 // Replaces the XLA tensor embedded within `tensor`'s XLA TensorImpl with
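
A hypothetical call site for the new helper (MyUnaryOp and its "input" parameter are illustrative names, not part of this commit), showing the intended usage where a user-facing parameter name is available:

// Hedged usage sketch: validates a single tensor argument with
// GetInputXlaTensor() so the user sees which parameter was at fault.
absl::StatusOr<at::Tensor> MyUnaryOp(const at::Tensor& input) {
  XLA_ASSIGN_OR_RETURN(
      XLATensorPtr xinput,
      bridge::GetInputXlaTensor(input, /* param= */ "input"));
  // ... build and run the actual XLA computation on `xinput` ...
  return bridge::AtenFromXlaTensor(xinput);
}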

torch_xla/csrc/init_python_bindings.cpp

Lines changed: 71 additions & 32 deletions
@@ -25,7 +25,7 @@
 #include <vector>

 #include "absl/container/flat_hash_map.h"
-#include "absl/status/status.h"
+#include "absl/log/absl_check.h"
 #include "absl/strings/str_cat.h"
 #include "absl/synchronization/blocking_counter.h"
 #include "absl/types/variant.h"
@@ -38,6 +38,7 @@
 #include "pybind11/pytypes.h"
 #include "pybind11/stl.h"
 #include "pybind11/stl_bind.h"
+#include "status.h"
 #include "torch_xla/csrc/XLANativeFunctions.h"
 #include "torch_xla/csrc/aten_autograd_ops.h"
 #include "torch_xla/csrc/aten_fallback.h"
@@ -87,6 +88,23 @@ namespace {

 constexpr int64_t kSeedInfoId = -127389;

+// Traits related to the return type of the lambda function that wraps the
+// actual implementation inside PythonScope.
+template <class T>
+struct RemoveStatus {
+  using type = T;
+};
+
+template <>
+struct RemoveStatus<absl::Status> {
+  using type = void;
+};
+
+template <class T>
+struct RemoveStatus<absl::StatusOr<T>> {
+  using type = T;
+};
+
 // Wraps a python scope (e.g. py::module) to provide more convenient APIs.
 // It behaves like a Scope object but has enhanced behaviors for the def*()
 // methods. This class has reference semantics, just like the Scope class.
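
RemoveStatus reads as a type-level function: identity for ordinary types, void for absl::Status, and T for absl::StatusOr<T>. A few illustrative static_asserts (not in the commit; assumes <type_traits> is available alongside the headers above) make the mapping concrete:

static_assert(std::is_same_v<RemoveStatus<int>::type, int>);
static_assert(std::is_same_v<RemoveStatus<absl::Status>::type, void>);
static_assert(
    std::is_same_v<RemoveStatus<absl::StatusOr<at::Tensor>>::type, at::Tensor>);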
@@ -153,15 +171,29 @@ class PythonScope : public Scope {
   template <typename F>
   static void Bind(Scope& scope, const char* const name, F&& f,
                    const Extra&... extra) {
-    using RetType =
+    // `f` return type.
+    using FnRetType =
         typename c10::guts::infer_function_traits<F>::type::return_type;
-    auto lambda = [f = std::move(f)](Args... args) -> RetType {
+    // Wrapper lambda return type.
+    // This is needed for handling returning status types.
+    using LambdaRetType = typename RemoveStatus<FnRetType>::type;
+    // FnRetType is a status type iff after unwrapping the status type,
+    // the resulting type (i.e. LambdaRetType) is NOT the same as FnRetType.
+    constexpr bool returns_status_type =
+        !std::is_same<FnRetType, LambdaRetType>::value;
+
+    auto lambda = [f = std::move(f)](Args... args) -> LambdaRetType {
       // RAII for emitting Python warnings.
       //
       // This turns messages passed to `TORCH_WARN()` in `f` into Python
       // warnings.
       torch::PyWarningHandler handler;
-      return f(args...);
+
+      if constexpr (returns_status_type) {
+        return GetValueOrThrow(f(args...));
+      } else {
+        return f(args...);
+      }
     };

     if constexpr (kind == FunctionKind::kInit) {
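
The net effect: a bound function returning absl::Status or absl::StatusOr<T> surfaces to Python as void or T, with non-ok statuses converted into exceptions by GetValueOrThrow(). A condensed standalone sketch of the dispatch (reusing the RemoveStatus trait above; CallAndUnwrap is an illustrative name, not the commit's code):

template <class F, class... Args>
auto CallAndUnwrap(F&& f, Args&&... args) {
  using R = decltype(f(std::forward<Args>(args)...));
  using Unwrapped = typename RemoveStatus<R>::type;
  if constexpr (!std::is_same_v<R, Unwrapped>) {
    // Status-returning: unwrap the value (or throw) before crossing into
    // Python.
    return GetValueOrThrow(f(std::forward<Args>(args)...));
  } else {
    // Plain return type: forward as-is.
    return f(std::forward<Args>(args)...);
  }
}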
@@ -237,13 +269,11 @@ std::string GetTensorsDump(
     const std::vector<at::Tensor>& tensors,
     const std::function<
         std::string(absl::Span<const torch::lazy::Node* const>)>& coverter) {
+  auto xtensors = GetValueOrThrow(bridge::GetXlaTensors(tensors));
   std::vector<const torch::lazy::Node*> nodes;
-  std::vector<torch::lazy::Value> values;
-  for (auto& tensor : tensors) {
-    XLATensorPtr xtensor = GetValueOrThrow(bridge::GetXlaTensor(tensor));
-    values.push_back(xtensor->GetIrValue());
-    nodes.push_back(values.back().node.get());
-  }
+  std::transform(
+      xtensors.begin(), xtensors.end(), std::back_inserter(nodes),
+      [](const XLATensorPtr& ptr) { return ptr->GetIrValue().node.get(); });
   return coverter(nodes);
 }
@@ -363,7 +393,7 @@ std::vector<std::vector<int>> ExtractXlaDotGeneralDimVectors(
   return dim_vectors;
 }

-at::Tensor XlaDotGeneral(const at::Tensor& lhs, const at::Tensor& rhs,
+at::Tensor XlaDotGeneral(const XLATensorPtr& xlhs, const XLATensorPtr& xrhs,
                          const std::vector<std::vector<int>>& dim_vectors,
                          std::optional<py::object> preferred_element_type) {
   std::optional<at::ScalarType> at_preferred_element_type;
@@ -373,9 +403,7 @@ at::Tensor XlaDotGeneral(const at::Tensor& lhs, const at::Tensor& rhs,
         ->scalar_type;
   }
   return bridge::AtenFromXlaTensor(tensor_methods::xla_dot_general(
-      GetValueOrThrow(bridge::GetXlaTensor(lhs)),
-      GetValueOrThrow(bridge::GetXlaTensor(rhs)), dim_vectors,
-      at_preferred_element_type));
+      xlhs, xrhs, dim_vectors, at_preferred_element_type));
 }

 std::vector<std::pair<int64_t, int64_t>> CreateSourceTargetPairs(
@@ -1841,20 +1869,25 @@ void InitXlaModuleBindings(py::module m) {
          })
      .def(
          "_xla_dot_general",
-         [](const at::Tensor& lhs, const at::Tensor& rhs,
+         [](const at::Tensor& lhs,
+            const at::Tensor& rhs,
             py::tuple dimension_numbers,
             std::optional<std::string>& precision_config,
-            std::optional<py::object>& preferred_element_type) -> at::Tensor {
+            std::optional<py::object>& preferred_element_type) -> absl::StatusOr<at::Tensor> {
            // Python binding for xla::DotGeneral
            // https://openxla.org/xla/operation_semantics#dotgeneral
            std::vector<std::vector<int>> dim_vectors =
                ExtractXlaDotGeneralDimVectors(dimension_numbers);
            XLA_CHECK(!precision_config.has_value())
                << "_xla_dot_general: precision_config is not supported yet, "
                   "default precision setting will be applied.";
-           at::Tensor result =
-               XlaDotGeneral(lhs, rhs, dim_vectors, preferred_element_type);
-           return result;
+           XLA_ASSIGN_OR_RETURN(
+               XLATensorPtr xlhs,
+               bridge::GetInputXlaTensor(lhs, /* param= */ "first argument"));
+           XLA_ASSIGN_OR_RETURN(
+               XLATensorPtr xrhs,
+               bridge::GetInputXlaTensor(rhs, /* param= */ "second argument"));
+           return XlaDotGeneral(xlhs, xrhs, dim_vectors, preferred_element_type);
         },
         py::arg("lhs"),  //
         py::arg("rhs"),  //
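
With the lambda now returning absl::StatusOr<at::Tensor>, the PythonScope::Bind() wrapper above unwraps the result: on success Python receives a plain tensor, and on failure GetValueOrThrow() raises a RuntimeError carrying the GetInputXlaTensor() message that test_raises_error_on_non_xla_tensor asserts against.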
@@ -3340,19 +3373,25 @@ void InitXlaModuleBindings(py::module m) {
               opt_device ? &opt_device.value() : nullptr);
           return check_materialization_helper(xtensors);
         })
-     .def(
-         "_get_graph_hash",
-         [](const std::vector<at::Tensor>& tensors) {
-           std::vector<XLATensorPtr> xtensors;
-           xtensors.reserve(tensors.size());
-           for (auto& tensor : tensors) {
-             xtensors.push_back(GetValueOrThrow(bridge::GetXlaTensor(tensor)));
-           }
-           torch::lazy::hash_t hash =
-               XLAGraphExecutor::Get()->GetGraphHash(xtensors);
-           std::string bin((const char*)&hash, sizeof(hash));
-           return py::bytes(bin);
-         })
+     .def("_get_graph_hash",
+          [](const std::vector<at::Tensor>& tensors) -> py::bytes {
+            absl::StatusOr<std::vector<absl_nonnull XLATensorPtr>>
+                xtensors_status = bridge::GetXlaTensors(tensors);
+            ABSL_CHECK(xtensors_status.ok())
+                << "_get_graph_hash(): error retrieving the XLA tensors from "
+                << "the given tensor arguments. "
+                << "This is a bug! Please, open an issue in the PyTorch/XLA "
+                << "GitHub repository: https://github.com/pytorch/xla"
+                << std::endl
+                << "Status Error: "
+                << BuildStatusErrorMessage(xtensors_status.status());
+            std::vector<absl_nonnull XLATensorPtr> xtensors =
+                xtensors_status.value();
+            torch::lazy::hash_t hash =
+                XLAGraphExecutor::Get()->GetGraphHash(xtensors);
+            std::string bin((const char*)&hash, sizeof(hash));
+            return py::bytes(bin);
+          })
      .def("_clear_pending_irs",
           [](const std::string& device) {
             // Use with caution. Those tensor whole ir was cleared
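
Unlike _xla_dot_general, _get_graph_hash keeps its plain py::bytes return type and treats a non-XLA input as an internal invariant violation: ABSL_CHECK() aborts with a bug-report message (including the status formatted by BuildStatusErrorMessage()) instead of propagating the error to the caller.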

torch_xla/csrc/status.cpp

Lines changed: 8 additions & 2 deletions
@@ -119,9 +119,15 @@ static std::string MaybeGetMessageWithLineBreak(const absl::Status& status) {
       : std::string(status.message());
 }

+std::string BuildStatusErrorMessage(const absl::Status& status) {
+  return absl::StrCat(MaybeGetMessageWithLineBreak(status),
+                      GetFormattedStatusPropagationTrace(status));
+}
+
 void MaybeThrow(const absl::Status& status) {
-  TORCH_CHECK(status.ok(), MaybeGetMessageWithLineBreak(status),
-              GetFormattedStatusPropagationTrace(status));
+  TORCH_CHECK(status.ok(), BuildStatusErrorMessage(status));
 }

+void GetValueOrThrow(const absl::Status& status) { MaybeThrow(status); }
+
 }  // namespace torch_xla

torch_xla/csrc/status.h

Lines changed: 14 additions & 0 deletions
@@ -174,6 +174,17 @@ absl::Status MaybeWithNewMessage(const absl::Status& status, const char* file,

 }  // namespace status_internal

+// Builds the complete error message for the given `status`.
+//
+// If `TORCH_SHOW_CPP_STACKTRACES` is enabled, returns the concatenation of
+// `status.message()` with its inner status propagation trace.
+//
+// TODO(ysiraichi): this call does not append the C++ stacktrace, which,
+// ideally, should. It can be done by not using `TORCH_CHECK()` macro directly
+// in `MaybeThrow()`, but using PyTorch `c10::get_lazy_backtrace()`
+// (at c10/util/Backtrace.h).
+std::string BuildStatusErrorMessage(const absl::Status& status);
+
 // Maybe throws an exception if `status` has a non-ok code.
 //
 // Ideally, this function should be used only used in the project's
@@ -200,6 +211,9 @@ T GetValueOrThrow(absl::StatusOr<T>&& status) {
   return std::move(status).value();
 }

+// `GetValueOrThrow` overload for `Status`.
+void GetValueOrThrow(const absl::Status& status);
+
 }  // namespace torch_xla

 #endif  // XLA_TORCH_XLA_CSRC_STATUS_H_
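
A short usage sketch of the error-reporting entry points declared here (SyncDevice and CountLiveTensors are hypothetical helpers, not part of the commit):

#include <iostream>

#include "absl/status/status.h"
#include "absl/status/statusor.h"
#include "torch_xla/csrc/status.h"

absl::Status SyncDevice();               // hypothetical status-returning call
absl::StatusOr<int> CountLiveTensors();  // hypothetical value-returning call

void Example() {
  // StatusOr<T> overload: yields the wrapped value, or throws via
  // TORCH_CHECK().
  int n = torch_xla::GetValueOrThrow(CountLiveTensors());
  (void)n;
  // New Status overload: nothing to yield; throws only on a non-ok status.
  torch_xla::GetValueOrThrow(SyncDevice());
  // Manual formatting: message plus propagation trace (when
  // TORCH_SHOW_CPP_STACKTRACES is enabled).
  if (absl::Status s = SyncDevice(); !s.ok()) {
    std::cerr << torch_xla::BuildStatusErrorMessage(s) << std::endl;
  }
}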
