Add CUDA extension for GPU-accelerated solvers #574
New file (+109 lines):

```julia
"""
    ManoptCUDAExt

CUDA extension for Manopt.jl, enabling solvers to work transparently with
`CuArray`-backed manifold points.

The main issue is that several linesearch stepsizes pre-allocate workspace
arrays (e.g. `candidate_point`) via `allocate_result(M, rand)` at construction
time, which always returns CPU `Array`s. When the optimization iterates are
`CuArray`s, the type mismatch between the CPU workspace and the GPU iterates
causes `retract_fused!` and broadcasting to fail.

Since these workspace arrays are stored in parametric `mutable struct` fields
(e.g. `candidate_point::P` where `P = Vector{Float64}`), their types cannot
be changed after construction. Instead, this extension intercepts the
linesearch functions and allocates GPU-compatible scratch buffers when a
CPU/GPU type mismatch is detected.

This extension fixes the problem by:
1. Overriding `ManifoldsBase.allocate` for `CuArray` points so that
   `linesearch_backtrack` (non-mutating) allocates GPU arrays.
2. Overriding `linesearch_backtrack!` (mutating) to detect CPU workspace /
   GPU point mismatches and allocate a GPU scratch buffer. This covers
   `ArmijoLinesearchStepsize` and `NonmonotoneLinesearchStepsize`.
"""
module ManoptCUDAExt

# === Imports (following Manopt extension conventions) ===
using Manopt
using ManifoldsBase
using ManifoldsBase: AbstractRetractionMethod, default_retraction_method,
    default_vector_transport_method
using LinearAlgebra

# Import functions we extend with new methods
import Manopt: linesearch_backtrack!

using CUDA

# === CuArray type union (following the OMEinsum.jl pattern) ===

"""
    CUDAManifoldPoint{T,N}

Type union covering `CuArray` and its wrapped variants (`Transpose`, `Adjoint`)
for dispatch in GPU-aware methods.
"""
const CUDAManifoldPoint{T,N} = Union{
    CuArray{T,N},
    LinearAlgebra.Transpose{T,<:CuArray{T,N}},
    LinearAlgebra.Adjoint{T,<:CuArray{T,N}},
}

# === GPU-aware allocations ===

# Override allocate for CuArray manifold points.
# This fixes linesearch_backtrack (linesearch.jl:148), which calls allocate(M, p)
# and would otherwise create a CPU Array.

function ManifoldsBase.allocate(::ManifoldsBase.AbstractManifold, p::CuArray{T,N}) where {T,N}
    return CuArray{T,N}(undef, size(p))
end

function ManifoldsBase.allocate(
    ::ManifoldsBase.AbstractManifold, p::CuArray{T,N}, ::Type{S}
) where {T,N,S}
    return CuArray{S,N}(undef, size(p))
end

# Allocate without a manifold argument (used by some ManifoldsBase paths)
function ManifoldsBase.allocate(p::CuArray{T,N}) where {T,N}
    return CuArray{T,N}(undef, size(p))
end

function ManifoldsBase.allocate(p::CuArray{T,N}, ::Type{S}) where {T,N,S}
    return CuArray{S,N}(undef, size(p))
end

# === Fix linesearch_backtrack! for CPU workspace / GPU point mismatch ===

# ArmijoLinesearchStepsize and NonmonotoneLinesearchStepsize both call
# linesearch_backtrack!(M, candidate_point, f, p, ...), where candidate_point
# is a CPU Array (pre-allocated at stepsize construction) and p is a CuArray
# (the current iterate). The function only uses q as scratch space and returns
# the step size, so we can safely allocate a new GPU buffer for q.

function Manopt.linesearch_backtrack!(
    M::ManifoldsBase.AbstractManifold,
    q::Array,     # CPU workspace (from stepsize.candidate_point)
    f::TF,
    p::CuArray,   # GPU iterate
    s,
    decrease,
    contract,
    η;
    kwargs...,
) where {TF}
    q_gpu = ManifoldsBase.allocate(M, p)
    return Manopt.linesearch_backtrack!(M, q_gpu, f, p, s, decrease, contract, η; kwargs...)
end
```
Review comments on lines +22 to +37:

**Member:**
> I think a better solution would be to extend …

**Member:**
> Effectively …

**Member:**
> While it is an internal function and we could even break that without having a breaking change, it would even be possible to do …
```julia
# Note: WolfePowellLinesearchStepsize and CubicBracketingLinesearchStepsize
# use candidate_point directly (not via linesearch_backtrack!). GPU support
# for these stepsizes requires changes to the Manopt.jl base package to make
# the candidate_point field non-parametric or to add a dispatch hook. For now,
# use ArmijoLinesearch (the default for gradient_descent and
# conjugate_gradient_descent) or ConstantLength with CuArray points.

end # module
```
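For context, a minimal usage sketch of the solver path this extension unblocks. This is a hedged illustration, not part of the PR: `Euclidean`, `gradient_descent`, and `StopAfterIteration` are real Manopt.jl/Manifolds.jl API, but the exact values and iteration bound are arbitrary, and behavior depends on package versions.

```julia
# Hedged sketch: gradient_descent with CuArray iterates.
# Without the extension, the default ArmijoLinesearchStepsize pre-allocates
# its candidate_point as a CPU Array, which then clashes with GPU iterates;
# with the extension loaded, the linesearch_backtrack! override swaps in a
# freshly allocated GPU scratch buffer instead.
using Manopt, Manifolds, CUDA

M = Euclidean(4)
target = CuArray(randn(4))
f(M, p) = 0.5 * sum((p .- target) .^ 2)
grad_f(M, p) = p .- target

p_opt = gradient_descent(M, f, grad_f, CUDA.zeros(Float64, 4);
    stopping_criterion=StopAfterIteration(20))
# p_opt stays a CuArray and should approach target
```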
New test file (+82 lines):

```julia
using Manopt, Manifolds, ManifoldsBase, Test
using CUDA

@testset "ManoptCUDAExt" begin
    if CUDA.functional()
        @testset "GPU allocate" begin
            M = Euclidean(10)
            p_gpu = CUDA.zeros(10)
            q = ManifoldsBase.allocate(M, p_gpu)
            @test q isa CuArray
            @test size(q) == (10,)

            q2 = ManifoldsBase.allocate(M, p_gpu, Float32)
            @test q2 isa CuArray{Float32}
            @test size(q2) == (10,)
        end

        @testset "Gradient descent with ConstantLength on Euclidean" begin
            # Simple quadratic on R^n.
            # Euclidean uses broadcasting-based retract/inner/norm, so this
            # tests the basic GPU solver path without linesearch workspace.
            M = Euclidean(10)
            target = randn(10)
            target_gpu = CuArray(target)

            f_cpu(M, p) = 0.5 * sum((p .- target) .^ 2)
            grad_f_cpu(M, p) = p .- target

            f_gpu(M, p) = 0.5 * sum((p .- target_gpu) .^ 2)
            grad_f_gpu(M, p) = p .- target_gpu

            p0_cpu = zeros(10)
            result_cpu = gradient_descent(
                M, f_cpu, grad_f_cpu, p0_cpu;
                stopping_criterion=StopAfterIteration(100),
                stepsize=ConstantLength(0.1),
            )

            p0_gpu = CuArray(zeros(10))
            result_gpu = gradient_descent(
                M, f_gpu, grad_f_gpu, p0_gpu;
                stopping_criterion=StopAfterIteration(100),
                stepsize=ConstantLength(0.1),
            )

            @test result_gpu isa CuArray
            @test isapprox(Array(result_gpu), result_cpu; atol=1e-10)
        end

        @testset "Gradient descent with ArmijoLinesearch on Euclidean" begin
            # Tests the candidate_point adaptation path.
            # ArmijoLinesearchStepsize is the default stepsize for gradient_descent.
            M = Euclidean(10)
            target = randn(10)
            target_gpu = CuArray(target)

            f_cpu(M, p) = 0.5 * sum((p .- target) .^ 2)
            grad_f_cpu(M, p) = p .- target

            f_gpu(M, p) = 0.5 * sum((p .- target_gpu) .^ 2)
            grad_f_gpu(M, p) = p .- target_gpu

            p0_cpu = zeros(10)
            # Default stepsize is ArmijoLinesearchStepsize
            result_cpu = gradient_descent(
                M, f_cpu, grad_f_cpu, p0_cpu;
                stopping_criterion=StopAfterIteration(50),
            )

            p0_gpu = CuArray(zeros(10))
            result_gpu = gradient_descent(
                M, f_gpu, grad_f_gpu, p0_gpu;
                stopping_criterion=StopAfterIteration(50),
            )

            @test result_gpu isa CuArray
            @test isapprox(Array(result_gpu), result_cpu; atol=1e-10)
        end
    else
        @info "CUDA not functional, skipping ManoptCUDAExt GPU tests"
    end
end
```
Review comment:

**Member:**
> Could you write a test which is fixed by these methods? I would expect these to not be necessary. All added allocation tests pass on unpatched JuliaManifolds.