Skip to content

Commit 8cee9e6

Browse files
WIP: linalg.eig (CUDA)
- Fixed edge cases (especially empty matrices with various batch and non-batch dimensions)
1 parent 653cb40 commit 8cee9e6

File tree

3 files changed

+25
-48
lines changed

3 files changed

+25
-48
lines changed

aten/src/ATen/native/BatchLinearAlgebra.cpp

Lines changed: 1 addition & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -2919,11 +2919,10 @@ static Tensor& linalg_eig_make_complex_eigenvectors(Tensor& complex_vectors, con
29192919
DEFINE_DISPATCH(linalg_eig_stub);
29202920

29212921
static std::tuple<Tensor&, Tensor&> linalg_eig_out_info(const Tensor& input, Tensor& values, Tensor& vectors, Tensor& infos, bool compute_eigenvectors) {
2922-
TORCH_WARN("input dtype: ", input.scalar_type());
2923-
TORCH_WARN("input device", input.device());
29242922
auto options = input.options();
29252923

29262924

2925+
29272926
// These internal asserts make explicit the assumptions in the implementation
29282927
// Error check with the actual error messages are done on the higher level of the hierarchy of calls
29292928
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.dim() >= 2);
@@ -3000,24 +2999,8 @@ static std::tuple<Tensor&, Tensor&> linalg_eig_out_info(const Tensor& input, Ten
30002999
// }
30013000

30023001
//call to the device-specific linalg_eig_stub (LAPACK, MAGMA or cuSOLVER)
3003-
TORCH_WARN("input device before linalg_eig_stub call: ", input.device());
3004-
TORCH_WARN("input dtype before linalg_eig_stub call: ", input.scalar_type());
3005-
3006-
TORCH_WARN("values device before linalg_eig_stub call: ", real_imag_values.device());
3007-
TORCH_WARN("values dtype before linalg_eig_stub call: ", real_imag_values.scalar_type());
3008-
3009-
TORCH_WARN("vectors device before linalg_eig_stub call: ", maybe_complex_vectors.device());
3010-
TORCH_WARN("vectors dtype before linalg_eig_stub call: ", maybe_complex_vectors.scalar_type());
3011-
3012-
TORCH_WARN("infos device before linalg_eig_stub call: ", infos.device());
3013-
TORCH_WARN("infos dtype before linalg_eig_stub call: ", infos.scalar_type());
3014-
3015-
TORCH_WARN("compute eigenvectors", compute_eigenvectors);
3016-
30173002
linalg_eig_stub(input.device().type(), real_imag_values, maybe_complex_vectors, infos, input, compute_eigenvectors);
30183003

3019-
TORCH_WARN("passed linalg_eig_stub");
3020-
30213004
// if input is not complex we need to do some post-processing
30223005
if (!input.is_complex()) {
30233006
// extract real and imaginary parts of the output
@@ -3062,13 +3045,6 @@ static std::tuple<Tensor&, Tensor&> linalg_eig_out_info(const Tensor& input, Ten
30623045
}
30633046
}
30643047

3065-
auto n = input.size(-1);
3066-
TORCH_CHECK(values.is_complex(), "values (complex_values) not complex");
3067-
TORCH_CHECK(values.numel() >= n, "values tensor too small: ", values.numel(), " < ", n);
3068-
TORCH_CHECK(values.is_contiguous(), "values tensor not contiguous");
3069-
TORCH_CHECK(real_imag_values.is_contiguous(), "real_imag_values not contiguous");
3070-
3071-
30723048
return std::tuple<Tensor&, Tensor&>(values, vectors);
30733049
}
30743050

@@ -3155,25 +3131,17 @@ std::tuple<Tensor&, Tensor&> linalg_eig_out(const Tensor& input, Tensor& values,
31553131
}
31563132

31573133
std::tuple<Tensor, Tensor> linalg_eig(const Tensor& input) {
3158-
TORCH_WARN("input dtype: ", input.scalar_type());
31593134
ScalarType complex_dtype = toComplexType(input.scalar_type());
31603135
Tensor values = at::empty({0}, input.options().dtype(complex_dtype));
31613136
Tensor vectors = at::empty({0}, input.options().dtype(complex_dtype));
31623137

3163-
// TORCH_WARN("input shape: ", input.sizes());
3164-
// TORCH_WARN("values shape: ", values.sizes());
3165-
// TORCH_WARN("vectors shape: ", vectors.sizes());
3166-
31673138

31683139
at::linalg_eig_outf(input, values, vectors);
31693140

31703141
return std::tuple<Tensor, Tensor>(values, vectors);
31713142
}
31723143

31733144
Tensor& linalg_eigvals_out(const Tensor& input, Tensor& values) {
3174-
TORCH_WARN("entered linalg_eigvals_out");
3175-
TORCH_WARN("input dtype: ", input.scalar_type());
3176-
TORCH_WARN("input device: ", input.device());
31773145
squareCheckInputs(input, "linalg.eigvals");
31783146
TORCH_CHECK(input.isfinite().all().item<bool>(), "torch.linalg.eigvals: input tensor should not contain infs or NaNs.");
31793147

@@ -3228,11 +3196,9 @@ Tensor& linalg_eigvals_out(const Tensor& input, Tensor& values) {
32283196
}
32293197

32303198
Tensor linalg_eigvals(const Tensor& input) {
3231-
TORCH_WARN("entered linalg_eigvals");
32323199
// if input requires grad we must compute the eigenvectors to make this function differentiable
32333200
// the eigenvectors are not exposed to the user
32343201
if (_may_require_fw_or_bw_grad(input)) {
3235-
TORCH_WARN("Gradient required, computing eigenvectors in linalg.eigvals");
32363202
return std::get<0>(at::linalg_eig(input));
32373203
}
32383204
return at::_linalg_eigvals(input);

aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2066,16 +2066,15 @@ TORCH_CHECK(false, "Calling torch.linalg.eig on a CUDA tensor requires compiling
20662066
}
20672067

20682068
void linalg_eig_kernel(Tensor& eigenvalues, Tensor& eigenvectors, Tensor& infos, const Tensor& input, bool compute_eigenvectors) {
2069-
TORCH_WARN("entered linalg_eig_kernel CUDA implementation");
20702069
// This function calculates the non-symmetric eigendecomposition in-place
20712070
// tensors should be in batched column major memory format
20722071
// the content of eigenvalues, eigenvectors and infos is overwritten by 'linalg_eig_magma' or 'linalg_eig_cusolver_xgeev'
2073-
20742072
// both geev routines modify the provided input matrix in-place, therefore we need a copy
2073+
20752074
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.is_cuda());
20762075
#if defined(CUSOLVER_VERSION) && (CUSOLVER_VERSION >= 11702)
20772076
// ───────────────────────────────────────────────
2078-
// New CUDA 12.6+ path using cuSOLVER Xgeev
2077+
// New CUDA 12.8+ path using cuSOLVER Xgeev
20792078
// ───────────────────────────────────────────────
20802079
auto preferred_backend = at::globalContext().linalgPreferredBackend();
20812080
switch (preferred_backend) {

aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1638,23 +1638,40 @@ void apply_xgeev(const Tensor& values, const Tensor& vectors, const Tensor& inpu
16381638
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(vectors.is_cuda());
16391639
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.is_cuda());
16401640
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(infos.is_cuda());
1641-
TORCH_WARN("entered apply_xgeev")
16421641

1643-
auto device = input.device();
16441642

16451643

16461644
int n = cuda_int_cast(input.size(-1), "n");
16471645
int lda = std::max<int64_t>(1, n);
16481646
auto batch_size = batchCount(vectors);
16491647

1650-
TORCH_WARN("---0---")
1648+
if (n == 0 || batch_size == 0) {
1649+
//XGeev does not support empty input, so we need to handle this case separately to
1650+
// emulate CPU semantics for empty input
1651+
auto values_shape = IntArrayRef(input.sizes().data(), input.dim() - 1);
1652+
values.resize_(values_shape, MemoryFormat::Contiguous);
1653+
values.zero_(); // optional
1654+
1655+
if (compute_eigenvectors) {
1656+
vectors.resize_(input.sizes(), MemoryFormat::Contiguous);
1657+
vectors.zero_(); // optional
1658+
} else {
1659+
// ensure defined but empty (e.g. for eigvals)
1660+
vectors.resize_({0});
1661+
}
1662+
1663+
infos.resize_({std::max<int64_t>(1, batch_size)}, MemoryFormat::Contiguous);
1664+
infos.zero_();
1665+
1666+
// early exit – nothing to compute
1667+
return;
1668+
}
1669+
16511670
int64_t vectors_stride = 0;
16521671
if (compute_eigenvectors){
16531672
vectors_stride = matrixStride(vectors);
16541673
}
16551674

1656-
TORCH_WARN("---1---")
1657-
16581675
auto values_stride = values.size(-1);
16591676

16601677

@@ -1683,8 +1700,6 @@ void apply_xgeev(const Tensor& values, const Tensor& vectors, const Tensor& inpu
16831700
jobvr = CUSOLVER_EIG_MODE_NOVECTOR;
16841701
}
16851702

1686-
TORCH_WARN("---2---")
1687-
16881703

16891704
scalar_t* W = values.data_ptr<scalar_t>();
16901705
scalar_t* VL = nullptr;
@@ -1697,7 +1712,6 @@ void apply_xgeev(const Tensor& values, const Tensor& vectors, const Tensor& inpu
16971712
const scalar_t* VL_const = VL;
16981713
const scalar_t* VR_const = VR;
16991714

1700-
TORCH_WARN("calling bufferSize")
17011715
size_t ws_dev = 0, ws_host = 0;
17021716
at::cuda::solver::xgeev_bufferSize<scalar_t>(
17031717
handle, params,
@@ -1746,8 +1760,6 @@ void apply_xgeev(const Tensor& values, const Tensor& vectors, const Tensor& inpu
17461760
info);
17471761
}
17481762
TORCH_CUSOLVER_CHECK(cusolverDnDestroyParams(params));
1749-
TORCH_WARN("passed apply_xgeev")
1750-
17511763

17521764
}
17531765

0 commit comments

Comments (0)