Use only device memory for now

kshyatt · kshyatt · commit 1c023a1486be · 2025-08-29T02:35:56.000-04:00
diff --git a/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl b/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl
@@ -5,6 +5,7 @@ using CUDA: @allowscalar
 using cuTENSOR: cuTENSOR
 
 using TensorKit
+import TensorKit.VectorInterface: scalartype as vi_scalartype
 using TensorKit.Factorizations
 using TensorKit.Factorizations: select_svd_algorithm, OFA, initialize_output, AbstractAlgorithm
 using TensorKit: SectorDict, tensormaptype, scalar, similarstoragetype, AdjointTensorMap
@@ -19,7 +20,7 @@ TensorKit.Factorizations.select_svd_algorithm(::CuTensorMap, ::TensorKit.Factori
 TensorKit.Factorizations.select_svd_algorithm(::CuTensorMap, ::TensorKit.Factorizations.SDD) = throw(ArgumentError("DivideAndConquer unavailable on CUDA")) 
 TensorKit.Factorizations.select_svd_algorithm(::CuTensorMap, alg::OFA) = throw(ArgumentError(lazy"Unknown algorithm $alg"))
 
-const CuDiagonalTensorMap{T, S} = DiagonalTensorMap{T, S, CuVector{T}}
+const CuDiagonalTensorMap{T, S} = DiagonalTensorMap{T, S, CuVector{T, CUDA.DeviceMemory}} # storage is pinned to a device-memory CuVector
 
 """
     CuDiagonalTensorMap{T}(undef, domain::S) where {T,S<:IndexSpace}
@@ -82,4 +83,8 @@ end
 
 # TODO
 # add VectorInterface extensions for proper CUDA promotion
+function TensorKit.VectorInterface.promote_add(TA::Type{<:CUDA.StridedCuMatrix{Tx}}, TB::Type{<:CUDA.StridedCuMatrix{Ty}}, α::Tα = TensorKit.VectorInterface.One(), β::Tβ = TensorKit.VectorInterface.One()) where {Tx, Ty, Tα, Tβ} # result is computed from the element types Tx, Ty and the types of α, β
+    return Base.promote_op(TensorKit.VectorInterface.add, Tx, Ty, Tα, Tβ) # `add` is qualified like `promote_add`/`One` above, so it resolves regardless of what `using TensorKit` exports
+end
+
 end
diff --git a/ext/TensorKitCUDAExt/cutensormap.jl b/ext/TensorKitCUDAExt/cutensormap.jl
@@ -1,16 +1,16 @@
-const CuTensorMap{T,S,N₁,N₂,A<:CuVector{T}} = TensorMap{T,S,N₁,N₂,A}
-const CuTensor{T, S, N, A<:CuVector{T}} = CuTensorMap{T, S, N, 0, A}
+const CuTensorMap{T,S,N₁,N₂} = TensorMap{T,S,N₁,N₂, CuVector{T,CUDA.DeviceMemory}} # storage parameter is no longer free: always a device-memory CuVector
+const CuTensor{T, S, N} = CuTensorMap{T, S, N, 0} # CuTensorMap with N₂ fixed to 0
 
 function TensorKit.tensormaptype(S::Type{<:IndexSpace}, N₁, N₂, TorA::Type{<:StridedCuArray})
     if TorA <: CuArray
-        return TensorMap{eltype(TorA),S,N₁,N₂,CuVector{eltype(TorA)}}
+        return TensorMap{eltype(TorA),S,N₁,N₂,CuVector{eltype(TorA), CUDA.DeviceMemory}} # only eltype(TorA) is kept; storage is always the device-memory vector type
     else
         throw(ArgumentError("argument $TorA should specify a scalar type (`<:Number`) or a storage type `<:CuVector{<:Number}`"))
     end
 end
 
 function CuTensorMap{T}(::UndefInitializer, V::TensorMapSpace{S, N₁, N₂}) where {T, S, N₁, N₂}
-    return CuTensorMap{T,S,N₁,N₂,CuVector{T}}(undef, V)
+    return CuTensorMap{T,S,N₁,N₂}(undef, V) # no storage argument: the CuTensorMap alias already fixes it
 end
 
 function CuTensorMap{T}(::UndefInitializer, codomain::TensorSpace{S},
@@ -164,14 +164,16 @@ function TensorKit.scalar(t::CuTensorMap)
            first(blocks(t))[2][1, 1] : throw(DimensionMismatch())
 end
 
-TensorKit.scalartype(A::CuArray{T}) where {T} = T
+TensorKit.scalartype(A::StridedCuArray{T}) where {T} = T # instance-level method: element type of a strided CUDA array
+vi_scalartype(::Type{<:CuTensorMap{T}}) where {T} = T # type-level method; vi_scalartype is the alias imported from TensorKit.VectorInterface in TensorKitCUDAExt.jl
+vi_scalartype(::Type{<:CuArray{T}}) where {T} = T # type-level method for CuArray types
 
-function TensorKit.similarstoragetype(TT::Type{<:CuTensorMap}, ::Type{T}) where {T}
-    return CuVector{T}
+function TensorKit.similarstoragetype(TT::Type{<:CuTensorMap{TTT,S,N₁,N₂}}, ::Type{T}) where {TTT,T,S,N₁,N₂}
+    return CuVector{T, CUDA.DeviceMemory} # device-memory vector with the requested element type T; TTT of the tensor type is not used
 end
 
-function Base.convert(TT::Type{CuTensorMap{T,S,N₁,N₂,A}},
-                      t::AbstractTensorMap{<:Any,S,N₁,N₂}) where {T,S,N₁,N₂,A<:CuVector{T}}
+function Base.convert(TT::Type{CuTensorMap{T,S,N₁,N₂}},
+                      t::AbstractTensorMap{<:Any,S,N₁,N₂}) where {T,S,N₁,N₂}
     if typeof(t) === TT
         return t
     else
@@ -180,7 +182,7 @@ function Base.convert(TT::Type{CuTensorMap{T,S,N₁,N₂,A}},
     end
 end
 
-function Base.copy!(tdst::CuTensorMap{T, S, N₁, N₂, A}, tsrc::CuTensorMap{T, S, N₁, N₂, A}) where {T, S, N₁, N₂, A}
+function Base.copy!(tdst::CuTensorMap{T, S, N₁, N₂}, tsrc::CuTensorMap{T, S, N₁, N₂}) where {T, S, N₁, N₂}
     space(tdst) == space(tsrc) || throw(SpaceMismatch("$(space(tdst)) ≠ $(space(tsrc))"))
     for ((c, bdst), (_, bsrc)) in zip(blocks(tdst), blocks(tsrc))
         copy!(bdst, bsrc)
@@ -195,3 +197,11 @@ function Base.copy!(tdst::CuTensorMap, tsrc::TensorKit.AdjointTensorMap)
     end
     return tdst
 end
+
+# Promote two CuTensorMap types that share S, N₁, N₂ by promoting their scalar types.
+function Base.promote_rule(
+    ::Type{<:TT₁}, ::Type{<:TT₂}
+) where {S,N₁,N₂,TTT₁,TTT₂,TT₁<:CuTensorMap{TTT₁,S,N₁,N₂},TT₂<:CuTensorMap{TTT₂,S,N₁,N₂}}
+    promoted = TensorKit.VectorInterface.promote_add(TTT₁, TTT₂)
+    return CuTensorMap{promoted,S,N₁,N₂}
+end
diff --git a/src/tensors/linalg.jl b/src/tensors/linalg.jl
@@ -545,7 +545,7 @@ function ⊗(t1::AbstractTensorMap, t2::AbstractTensorMap)
                             m1 = sreshape(t1[f1l, f1r], (d1, 1, d3, 1))
                             m2 = sreshape(t2[f2l, f2r], (1, d2, 1, d4))
                             m  = sreshape(t[fl, fr], (d1, d2, d3, d4))
-                            @. m += coeff1 * conj(coeff2) * m1 * m2
+                            m .+= coeff1 .* conj.(coeff2) .* m1 .* m2
                         end
                     end
                 end