Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,24 @@ TupleTools = "9d95972d-f1c8-5527-a6e0-b4b365fa01f6"
VectorInterface = "409d34a3-91d5-4945-b6ec-7529ddf182d8"

[weakdeps]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000"

[extensions]
TensorKitAMDGPUExt = "AMDGPU"
TensorKitCUDAExt = ["CUDA", "cuTENSOR"]
TensorKitChainRulesCoreExt = "ChainRulesCore"
TensorKitFiniteDifferencesExt = "FiniteDifferences"

[compat]
AMDGPU = "2"
Adapt = "4"
Aqua = "0.6, 0.7, 0.8"
CUDA = "5"
cuTENSOR = "2"
ChainRulesCore = "1"
ChainRulesTestUtils = "1"
Combinatorics = "1"
Expand All @@ -49,7 +58,10 @@ Zygote = "0.7"
julia = "1.10"

[extras]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1"
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a"
Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
Expand All @@ -61,4 +73,10 @@ TestExtras = "5ed8adda-3752-4e41-b88a-e8b09835ee3a"
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

[targets]
test = ["Aqua", "Combinatorics", "LinearAlgebra", "TensorOperations", "Test", "TestExtras", "ChainRulesCore", "ChainRulesTestUtils", "FiniteDifferences", "Zygote"]
test = ["Adapt", "Aqua", "Combinatorics", "CUDA", "cuTENSOR", "LinearAlgebra", "TensorOperations", "Test", "TestExtras", "ChainRulesCore", "ChainRulesTestUtils", "FiniteDifferences", "Zygote"]

[sources]
CUDA = {url = "https://github.com/JuliaGPU/CUDA.jl", rev = "master"}
cuTENSOR = {url = "https://github.com/JuliaGPU/CUDA.jl", subdir="lib/cutensor", rev = "ksh/cutensor_bump"}
MatrixAlgebraKit = {url = "https://github.com/QuantumKitHub/MatrixAlgebraKit.jl", rev = "ksh/tk"}
TensorOperations = {url = "https://github.com/QuantumKitHub/TensorOperations.jl", rev = "ksh/cutensor_bump"}
10 changes: 10 additions & 0 deletions ext/TensorKitAMDGPUExt/TensorKitAMDGPUExt.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
module TensorKitAMDGPUExt

# Package extension providing AMDGPU (ROCm) array storage for TensorKit tensors.
# Loaded automatically when both TensorKit and AMDGPU are in the environment.
using TensorKit
using TensorKit: SectorDict
using AMDGPU
using Random

# Constructors, random initializers and converters for ROC-backed TensorMaps.
include("roctensormap.jl")

end
103 changes: 103 additions & 0 deletions ext/TensorKitAMDGPUExt/roctensormap.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
# Block storage of a ROC-backed tensor: either a single ROCMatrix (trivial
# sector case) or a SectorDict mapping sectors of type `I` to ROCMatrix blocks.
const _ROCMatOrDict{I,T} = Union{ROCMatrix{T},SectorDict{I,ROCMatrix{T}}}
# Aliases for TensorMap / Tensor whose data lives on an AMD GPU.
const ROCTensorMap{T,S,N₁,N₂,I,A<:_ROCMatOrDict{I,T}} = TensorMap{T,S,N₁,N₂,A}
const ROCTensor{T, S, N, I, A <: _ROCMatOrDict{I, T}} = ROCTensorMap{T, S, N, 0, I, A}

"""
    ROCTensorMap{T}(undef, V::TensorMapSpace)

Construct an uninitialized ROC-backed `TensorMap` with element type `T` over
the homomorphism space `V`, using `ROCMatrix` blocks in AMDGPU's default
memory space.
"""
function ROCTensorMap{T}(::UndefInitializer, V::TensorMapSpace{S, N₁, N₂}) where {T, S, N₁, N₂}
    A = ROCMatrix{T, AMDGPU.default_memory}
    # BUG FIX: `tensormaptype` is a function, not a parametric type — it must be
    # called with parentheses. `tensormaptype{S, N₁, N₂, A}` throws at runtime.
    TT = tensormaptype(S, N₁, N₂, A)
    return TT(undef, codomain(V), domain(V))
end

# Convenience constructor: combine a separate codomain and domain into a
# HomSpace via `←` and forward to the primary constructor.
function ROCTensorMap{T}(::UndefInitializer, cod::TensorSpace{S},
                         dom::TensorSpace{S}) where {T,S}
    return ROCTensorMap{T}(undef, cod ← dom)
end
# A `Tensor` is a `TensorMap` whose domain is the trivial (unit) space.
ROCTensor{T}(::UndefInitializer, V::TensorSpace{S}) where {T,S} =
    ROCTensorMap{T}(undef, V ← one(V))

# Extend `AMDGPU.zeros` / `AMDGPU.ones` to TensorKit spaces, producing
# ROC-backed tensors filled with `zero(T)` / `one(T)`. The element type
# defaults to Float64 when not given. Methods are generated for both
# functions via metaprogramming to avoid duplication.
for (fname, felt) in ((:zeros, :zero), (:ones, :one))
    @eval begin
        # Separate codomain/domain arguments: combine them into a HomSpace.
        function AMDGPU.$fname(codomain::TensorSpace{S},
                               domain::TensorSpace{S}=one(codomain)) where {S<:IndexSpace}
            return AMDGPU.$fname(codomain ← domain)
        end
        function AMDGPU.$fname(::Type{T}, codomain::TensorSpace{S},
                               domain::TensorSpace{S}=one(codomain)) where {T,S<:IndexSpace}
            return AMDGPU.$fname(T, codomain ← domain)
        end
        # Default element type.
        AMDGPU.$fname(V::TensorMapSpace) = AMDGPU.$fname(Float64, V)
        # Implementation: allocate uninitialized, then fill with zero/one.
        function AMDGPU.$fname(::Type{T}, V::TensorMapSpace) where {T}
            t = ROCTensorMap{T}(undef, V)
            fill!(t, $felt(T))
            return t
        end
    end
end

# Extend `AMDGPU.rand` / `AMDGPU.randn` to TensorKit spaces, producing
# ROC-backed tensors with random entries. Methods are generated for both
# functions via metaprogramming; all convenience signatures funnel into the
# final (rng, T, V::TensorMapSpace) implementation.
for randfun in (:rand, :randn)
    randfun! = Symbol(randfun, :!)
    @eval begin
        # converting `codomain` and `domain` into `HomSpace`
        function AMDGPU.$randfun(codomain::TensorSpace{S},
                                 domain::TensorSpace{S}) where {S<:IndexSpace}
            return AMDGPU.$randfun(codomain ← domain)
        end
        function AMDGPU.$randfun(::Type{T}, codomain::TensorSpace{S},
                                 domain::TensorSpace{S}) where {T,S<:IndexSpace}
            return AMDGPU.$randfun(T, codomain ← domain)
        end
        function AMDGPU.$randfun(rng::Random.AbstractRNG, ::Type{T},
                                 codomain::TensorSpace{S},
                                 domain::TensorSpace{S}) where {T,S<:IndexSpace}
            return AMDGPU.$randfun(rng, T, codomain ← domain)
        end

        # accepting single `TensorSpace`
        AMDGPU.$randfun(codomain::TensorSpace) = AMDGPU.$randfun(codomain ← one(codomain))
        function AMDGPU.$randfun(::Type{T}, codomain::TensorSpace) where {T}
            return AMDGPU.$randfun(T, codomain ← one(codomain))
        end
        function AMDGPU.$randfun(rng::Random.AbstractRNG, ::Type{T},
                                 codomain::TensorSpace) where {T}
            # BUG FIX: was `one(domain)`, but `domain` is not an argument of
            # this method (it is the TensorKit function `domain`), which made
            # this method error; the trivial space of `codomain` is intended.
            return AMDGPU.$randfun(rng, T, codomain ← one(codomain))
        end

        # filling in default eltype
        AMDGPU.$randfun(V::TensorMapSpace) = AMDGPU.$randfun(Float64, V)
        function AMDGPU.$randfun(rng::Random.AbstractRNG, V::TensorMapSpace)
            return AMDGPU.$randfun(rng, Float64, V)
        end

        # filling in default rng
        function AMDGPU.$randfun(::Type{T}, V::TensorMapSpace) where {T}
            return AMDGPU.$randfun(Random.default_rng(), T, V)
        end

        # implementation: allocate uninitialized, then fill in place
        function AMDGPU.$randfun(rng::Random.AbstractRNG, ::Type{T},
                                 V::TensorMapSpace) where {T}
            t = ROCTensorMap{T}(undef, V)
            AMDGPU.$randfun!(rng, t)
            return t
        end
    end
end

# converters
# ----------
# Reconstruct a ROC-backed TensorMap from the Dict representation produced by
# TensorKit's serialization (string-encoded spaces/sectors, raw block arrays).
# NOTE(review): this evaluates strings from the input dict via
# `eval(Meta.parse(...))` — only use on trusted data, never on external input.
function Base.convert(::Type{ROCTensorMap}, d::Dict{Symbol,Any})
    try
        # Spaces and sector keys are stored as strings; parse and evaluate them
        # in this module, and upload each block to the GPU via `ROCArray`.
        codomain = eval(Meta.parse(d[:codomain]))
        domain = eval(Meta.parse(d[:domain]))
        data = SectorDict(eval(Meta.parse(c)) => ROCArray(b) for (c, b) in d[:data])
        return TensorMap(data, codomain, domain)
    catch e # sector unknown in TensorKit.jl; user-defined, hopefully accessible in Main
        # Retry evaluation in Main so user-defined sector types can be resolved.
        codomain = Base.eval(Main, Meta.parse(d[:codomain]))
        domain = Base.eval(Main, Meta.parse(d[:domain]))
        data = SectorDict(Base.eval(Main, Meta.parse(c)) => ROCArray(b)
                          for (c, b) in d[:data])
        return TensorMap(data, codomain, domain)
    end
end

90 changes: 90 additions & 0 deletions ext/TensorKitCUDAExt/TensorKitCUDAExt.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
module TensorKitCUDAExt

using CUDA, CUDA.CUBLAS, LinearAlgebra
using CUDA: @allowscalar
using cuTENSOR: cuTENSOR

using TensorKit
import TensorKit.VectorInterface: scalartype as vi_scalartype
using TensorKit.Factorizations
using TensorKit.Factorizations: select_svd_algorithm, OFA, initialize_output, AbstractAlgorithm
using TensorKit: SectorDict, tensormaptype, scalar, similarstoragetype, AdjointTensorMap

using TensorKit.MatrixAlgebraKit

using Random

include("cutensormap.jl")

# Map TensorKit's generic SVD algorithm selectors onto CUDA-capable backends:
# QR iteration is available through cuSOLVER; divide-and-conquer is not.
function TensorKit.Factorizations.select_svd_algorithm(::CuTensorMap, ::TensorKit.Factorizations.SVD)
    return CUSOLVER_QRIteration()
end
function TensorKit.Factorizations.select_svd_algorithm(::CuTensorMap, ::TensorKit.Factorizations.SDD)
    throw(ArgumentError("DivideAndConquer unavailable on CUDA"))
end
# Catch-all for any other algorithm token.
function TensorKit.Factorizations.select_svd_algorithm(::CuTensorMap, alg::OFA)
    throw(ArgumentError(lazy"Unknown algorithm $alg"))
end

# Alias for a DiagonalTensorMap whose diagonal is stored in a CuVector
# residing in CUDA device memory.
const CuDiagonalTensorMap{T, S} = DiagonalTensorMap{T, S, CuVector{T, CUDA.DeviceMemory}}

"""
    CuDiagonalTensorMap{T}(undef, domain::S) where {T,S<:IndexSpace}

Construct a `CuDiagonalTensorMap` (a `DiagonalTensorMap` with CUDA device
storage) with uninitialized data over `domain`. Convenience methods also
accept a `TensorMapSpace` or a length-one `ProductSpace`, provided they
describe a square, single-index map. The element type defaults to `Float64`.
"""
function CuDiagonalTensorMap{T}(::UndefInitializer, V::TensorMapSpace) where {T}
    # A diagonal map must be square with exactly one input and one output index.
    (numin(V) == numout(V) == 1 && domain(V) == codomain(V)) ||
        throw(ArgumentError("DiagonalTensorMap requires a space with equal domain and codomain and 2 indices"))
    return CuDiagonalTensorMap{T}(undef, domain(V))
end
function CuDiagonalTensorMap{T}(::UndefInitializer, V::ProductSpace) where {T}
    # Unwrap a length-one ProductSpace to its single IndexSpace factor.
    length(V) == 1 ||
        throw(ArgumentError("DiagonalTensorMap requires `numin(d) == numout(d) == 1`"))
    return CuDiagonalTensorMap{T}(undef, only(V))
end
function CuDiagonalTensorMap{T}(::UndefInitializer, V::S) where {T,S<:IndexSpace}
    # Base case: delegate to the fully-parameterized DiagonalTensorMap constructor.
    return CuDiagonalTensorMap{T,S}(undef, V)
end
# Default element type is Float64.
CuDiagonalTensorMap(::UndefInitializer, V::IndexSpace) = CuDiagonalTensorMap{Float64}(undef, V)

# Allocate (U, S, Vᴴ) for a compact SVD of a CUDA-backed tensor map: the
# inner dimension is the smaller of the fused codomain and fused domain, and
# the singular values live in a real CUDA diagonal tensor.
function TensorKit.Factorizations.initialize_output(::typeof(svd_compact!), t::CuTensorMap, ::AbstractAlgorithm)
    V_inner = infimum(fuse(codomain(t)), fuse(domain(t)))
    U = similar(t, codomain(t) ← V_inner)
    S = CuDiagonalTensorMap{real(scalartype(t))}(undef, V_inner)
    Vᴴ = similar(t, V_inner ← domain(t))
    return (U, S, Vᴴ)
end

# Allocate (D, V) for a hermitian eigendecomposition: eigenvalues are real
# and stored diagonally on the device; eigenvectors share t's element type.
function TensorKit.Factorizations.initialize_output(::typeof(eigh_full!), t::CuTensorMap, ::AbstractAlgorithm)
    fused = fuse(domain(t))
    Treal = real(scalartype(t))
    D = CuDiagonalTensorMap{Treal}(undef, fused)
    V = similar(t, codomain(t) ← fused)
    return D, V
end

# Allocate (D, V) for a general eigendecomposition: eigenvalues and
# eigenvectors may be complex even for a real input.
function TensorKit.Factorizations.initialize_output(::typeof(eig_full!), t::CuTensorMap, ::AbstractAlgorithm)
    fused = fuse(domain(t))
    Tcomplex = complex(scalartype(t))
    D = CuDiagonalTensorMap{Tcomplex}(undef, fused)
    V = similar(t, Tcomplex, codomain(t) ← fused)
    return D, V
end

# Allocate the eigenvalue output for a hermitian eigenvalue-only computation:
# hermitian eigenvalues are real, so the diagonal output uses real(scalartype).
function TensorKit.Factorizations.initialize_output(::typeof(eigh_vals!), t::CuTensorMap, alg::AbstractAlgorithm)
    V_D = fuse(domain(t))
    T = real(scalartype(t))
    # BUG FIX: previously returned `CuDiagonalTensorMap{Tc}` but `Tc` was never
    # defined in this method (copy-paste from the eig_vals! method); the real
    # element type `T` computed above is the intended parameter.
    return CuDiagonalTensorMap{T}(undef, V_D)
end

# Allocate the eigenvalue output for a general eigenvalue-only computation:
# eigenvalues may be complex, so the diagonal output uses complex(scalartype).
function TensorKit.Factorizations.initialize_output(::typeof(eig_vals!), t::CuTensorMap, alg::AbstractAlgorithm)
    fused = fuse(domain(t))
    return CuDiagonalTensorMap{complex(scalartype(t))}(undef, fused)
end


# TODO
# add VectorInterface extensions for proper CUDA promotion
# VectorInterface promotion rule for adding two strided CUDA matrices with
# scalar coefficients α, β: the result element type is what `add` would
# produce on the element types. TODO: extend to cover the remaining
# VectorInterface promotion hooks for proper CUDA promotion.
function TensorKit.VectorInterface.promote_add(TA::Type{<:CUDA.StridedCuMatrix{Tx}},
                                               TB::Type{<:CUDA.StridedCuMatrix{Ty}},
                                               α::Tα = TensorKit.VectorInterface.One(),
                                               β::Tβ = TensorKit.VectorInterface.One()) where {Tx, Ty, Tα, Tβ}
    # BUG FIX: `add` was referenced unqualified, but this module only imports
    # `scalartype` from VectorInterface, so the bare name is not in scope;
    # qualify it explicitly.
    return Base.promote_op(TensorKit.VectorInterface.add, Tx, Ty, Tα, Tβ)
end

end
Loading
Loading