Here is my implementation:
using CuArrays, CUDAnative, LinearAlgebra, MLKernel
function make_symmetric(A::Mat, uplo::Char='U') where {T<:AbstractFloat, Vec<:AbstractVector{T}, Mat<:AbstractMatrix{T}}
return LinearAlgebra.copytri!(A |> Matrix{T}, uplo)
end
function nystrom_inv_gpu!(A::Mat) where {T<:AbstractFloat, Vec<:AbstractVector{T}, Mat<:AbstractMatrix{T}}
A = cu(A)
vals, vectors = CuArrays.CUSOLVER.syevd!('V', 'U', A)
tol = eps(T)*size(A,1)
max_eig = maximum(vals)
# for i in eachindex(vals)
# vals[i] = abs(vals[i]) <= max_eig * tol ? zero(T) : one(T) / sqrt(vals[i])
# end
predicate = one(T) .* (vals .>= max_eig * tol)
vals .= predicate .* CUDAnative.rsqrt.(vals .^ predicate)
QD = CuArrays.CUBLAS.dgmm!('R', vectors, vals, vectors)
W = CuArrays.CUBLAS.syrk('U', 'N', QD)
return make_symmetric(W)
end