Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,17 @@ The file was started with Version `0.4`.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.5.33] – unreleased

### Added

* Added `ManoptCUDAExt` extension for CUDA/GPU support, enabling solvers
(`gradient_descent`, `conjugate_gradient_descent`) to work transparently
with `CuArray`-backed manifold points. The extension overrides
`ManifoldsBase.allocate` and `linesearch_backtrack!` so that GPU arrays
stay on device throughout the solver loop. Supported stepsizes:
`ConstantLength`, `ArmijoLinesearchStepsize`, `NonmonotoneLinesearchStepsize`.

## [0.5.32] January 15, 2026

### Fixed
Expand Down
3 changes: 3 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

[weakdeps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
JuMP = "4076af6c-e467-56ae-b986-b466b2749572"
LRUCache = "8ac3fa9e-de4c-5943-b1dc-09c6b5f20637"
LineSearches = "d3d80556-e9d4-5f37-9878-2ab0fcc64255"
Expand All @@ -34,6 +35,7 @@ RecursiveArrayTools = "731186ca-8d62-57ce-b412-fbd966d074cd"
RipQP = "1e40b3f8-35eb-4cd8-8edd-3e515bb9de08"

[extensions]
ManoptCUDAExt = "CUDA"
ManoptJuMPExt = "JuMP"
ManoptLRUCacheExt = "LRUCache"
ManoptLineSearchesExt = "LineSearches"
Expand All @@ -42,6 +44,7 @@ ManoptRecursiveArrayToolsExt = "RecursiveArrayTools"
ManoptRipQPQuadraticModelsExt = ["RipQP", "QuadraticModels"]

[compat]
CUDA = "5"
ColorSchemes = "3.5.0"
ColorTypes = "0.9.1, 0.10, 0.11, 0.12"
Colors = "0.11.2, 0.12, 0.13"
Expand Down
109 changes: 109 additions & 0 deletions ext/ManoptCUDAExt.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
"""
ManoptCUDAExt

CUDA extension for Manopt.jl, enabling solvers to work transparently with
`CuArray`-backed manifold points.

The main issue is that several linesearch stepsizes pre-allocate workspace
arrays (e.g. `candidate_point`) via `allocate_result(M, rand)` at construction
time, which always returns CPU `Array`s. When the optimization iterates are
`CuArray`s, the type mismatch between CPU workspace and GPU iterates causes
`retract_fused!` and broadcasting to fail.

Since these workspace arrays are stored in parametric `mutable struct` fields
(e.g. `candidate_point::P` where `P = Vector{Float64}`), their types cannot
be changed after construction. Instead, this extension intercepts the
linesearch functions and allocates GPU-compatible scratch buffers when a
CPU/GPU type mismatch is detected.

This extension fixes the problem by:
1. Overriding `ManifoldsBase.allocate` for `CuArray` points so that
`linesearch_backtrack` (non-mutating) allocates GPU arrays.
2. Overriding `linesearch_backtrack!` (mutating) to detect CPU workspace /
GPU point mismatches and allocate a GPU scratch buffer. This covers
`ArmijoLinesearchStepsize` and `NonmonotoneLinesearchStepsize`.
"""
module ManoptCUDAExt

# === Imports (following Manopt extension conventions) ===
using Manopt
using ManifoldsBase
using ManifoldsBase: AbstractRetractionMethod, default_retraction_method,
default_vector_transport_method
using LinearAlgebra

# Import functions we extend with new methods
import Manopt: linesearch_backtrack!

using CUDA

# === CuArray type union (following OMEinsum.jl pattern) ===

"""
    CUDAManifoldPoint{T,N}

Type union covering `CuArray{T,N}` and its lazily wrapped variants
(`LinearAlgebra.Transpose`, `LinearAlgebra.Adjoint`) for dispatch in
GPU-aware methods.

NOTE(review): no method in this extension currently dispatches on this
union — presumably it is intended for future GPU-aware methods; confirm
whether it should be used by the methods below or removed before merging.
"""
const CUDAManifoldPoint{T,N} = Union{
    CuArray{T,N},
    LinearAlgebra.Transpose{T,<:CuArray{T,N}},
    LinearAlgebra.Adjoint{T,<:CuArray{T,N}},
}

# === GPU-aware allocations ===

# Override `allocate` for `CuArray` manifold points so that the
# non-mutating `linesearch_backtrack` (linesearch.jl:148), which calls
# `allocate(M, p)`, creates its scratch buffer on the GPU rather than as
# a CPU `Array`.
# NOTE(review): this adds a method of a ManifoldsBase-owned function on a
# CUDA-owned type from within Manopt (type piracy); confirm the generic
# `similar`-based fallback in ManifoldsBase does not already cover this.
function ManifoldsBase.allocate(::ManifoldsBase.AbstractManifold, p::CuArray{T,N}) where {T,N}
    # `similar` is the idiomatic allocation: it preserves both the array
    # type (`CuArray{T,N}`) and the shape of `p`.
    return similar(p)
end

# Typed variant: allocate a GPU buffer with element type `S` and the
# shape of `p` (used by allocation paths that request a specific eltype).
function ManifoldsBase.allocate(
    ::ManifoldsBase.AbstractManifold, p::CuArray{T,N}, ::Type{S}
) where {T,N,S}
    # `similar(p, S)` yields a `CuArray{S,N}` of matching size.
    return similar(p, S)
end

# Manifold-free variant (used by some ManifoldsBase allocation paths):
# allocate an uninitialized GPU buffer matching `p` exactly.
function ManifoldsBase.allocate(p::CuArray{T,N}) where {T,N}
    return similar(p)
end

# Manifold-free, typed variant: GPU buffer with element type `S` and the
# shape of `p`.
function ManifoldsBase.allocate(p::CuArray{T,N}, ::Type{S}) where {T,N,S}
    return similar(p, S)
end
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you write a test which is fixed by these methods? I would expect these to not be necessary. All added allocation tests pass on unpatched JuliaManifolds.


# === Fix linesearch_backtrack! for CPU workspace / GPU point mismatch ===

# Both ArmijoLinesearchStepsize and NonmonotoneLinesearchStepsize invoke
# linesearch_backtrack!(M, candidate_point, f, p, ...) with a CPU `Array`
# workspace (pre-allocated when the stepsize was constructed) while the
# current iterate `p` is a `CuArray`. Since the workspace serves purely as
# scratch space and only the step size is returned, we may substitute a
# freshly allocated device-side buffer and delegate to the base method.
# NOTE(review): this allocates a new GPU array on every linesearch call —
# consider caching the buffer if this shows up in profiles.
function Manopt.linesearch_backtrack!(
    M::ManifoldsBase.AbstractManifold,
    q::Array,    # CPU workspace handed over by the stepsize struct
    f::TF,
    p::CuArray,  # device-resident iterate
    s,
    decrease,
    contract,
    η;
    kwargs...,
) where {TF}
    device_scratch = ManifoldsBase.allocate(M, p)
    return Manopt.linesearch_backtrack!(
        M, device_scratch, f, p, s, decrease, contract, η; kwargs...
    )
end
Comment on lines +22 to +37
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think a better solution would be to extend _produce_type to pass a point along the manifold, which would allow us to properly propagate the type. @kellertuer , what do you think about it?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Effectively _produce_type(stepsize, M) would be _produce_type(stepsize, M, p) and then ArmijoLinesearchStepsize constructor could use that p to initialize candidate_point::P = allocate(p).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

While it is an internal function and we could even break it without this counting as a breaking change, it would also be possible to do p = rand(M) or similar in order to pass the point type as well; but sure, we could do that.


# Note: WolfePowellLinesearchStepsize and CubicBracketingLinesearchStepsize
# use candidate_point directly (not via linesearch_backtrack!). GPU support
# for these stepsizes requires changes to the Manopt.jl base package to make
# the candidate_point field non-parametric or to add a dispatch hook. For now,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is a parametric field? It is unclear what this refers to, but I would prefer generic solutions that work with as many step sizes as possible — in the best case, all of them.

# use ArmijoLinesearch (the default for gradient_descent and
# conjugate_gradient_descent) or ConstantLength with CuArray points.

end # module
82 changes: 82 additions & 0 deletions test/test_cuda_ext.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
using Manopt, Manifolds, ManifoldsBase, Test
using CUDA

@testset "ManoptCUDAExt" begin
    if CUDA.functional()
        @testset "GPU allocate" begin
            M = Euclidean(10)
            p_gpu = CUDA.zeros(10)
            q = ManifoldsBase.allocate(M, p_gpu)
            @test q isa CuArray
            @test size(q) == (10,)

            q2 = ManifoldsBase.allocate(M, p_gpu, Float32)
            @test q2 isa CuArray{Float32}
            @test size(q2) == (10,)
        end

        @testset "Gradient descent with ConstantLength on Euclidean" begin
            # Simple quadratic f(p) = 0.5‖p - target‖² on R^10; its unique
            # minimizer is `target`, so we compare against the closed-form
            # solution directly instead of re-running the solver on the CPU.
            # A fixed (non-random) target keeps the test deterministic.
            M = Euclidean(10)
            target = collect(1.0:10.0) ./ 10
            target_gpu = CuArray(target)

            f_gpu(M, p) = 0.5 * sum((p .- target_gpu) .^ 2)
            grad_f_gpu(M, p) = p .- target_gpu

            p0_gpu = CuArray(zeros(10))
            result_gpu = gradient_descent(
                M, f_gpu, grad_f_gpu, p0_gpu;
                stopping_criterion=StopAfterIteration(100),
                stepsize=ConstantLength(0.1),
            )

            @test result_gpu isa CuArray
            # With step 0.1 the error contracts by 0.9 per iteration, so
            # after 100 iterations ‖p - target‖ ≤ 0.9^100‖p0 - target‖ ≈ 5e-5.
            @test isapprox(Array(result_gpu), target; atol=1e-3)
        end

        @testset "Gradient descent with ArmijoLinesearch on Euclidean" begin
            # Exercises the CPU-workspace/GPU-point adaptation path in
            # linesearch_backtrack!; ArmijoLinesearchStepsize is the default
            # stepsize for gradient_descent.
            M = Euclidean(10)
            target = collect(1.0:10.0) ./ 10
            target_gpu = CuArray(target)

            f_gpu(M, p) = 0.5 * sum((p .- target_gpu) .^ 2)
            grad_f_gpu(M, p) = p .- target_gpu

            p0_gpu = CuArray(zeros(10))
            result_gpu = gradient_descent(
                M, f_gpu, grad_f_gpu, p0_gpu;
                stopping_criterion=StopAfterIteration(50),
            )

            @test result_gpu isa CuArray
            # Compare to the known minimizer of the quadratic.
            @test isapprox(Array(result_gpu), target; atol=1e-6)
        end
    else
        @info "CUDA not functional, skipping ManoptCUDAExt GPU tests"
    end
end