diff --git a/Project.toml b/Project.toml
index a1847b6dd..132a19ae1 100644
--- a/Project.toml
+++ b/Project.toml
@@ -18,15 +18,24 @@ TupleTools = "9d95972d-f1c8-5527-a6e0-b4b365fa01f6"
 VectorInterface = "409d34a3-91d5-4945-b6ec-7529ddf182d8"
 
 [weakdeps]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
+cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000"
 
 [extensions]
+TensorKitAMDGPUExt = "AMDGPU"
+TensorKitCUDAExt = ["CUDA", "cuTENSOR"]
 TensorKitChainRulesCoreExt = "ChainRulesCore"
 TensorKitFiniteDifferencesExt = "FiniteDifferences"
 
 [compat]
+AMDGPU = "2"
+Adapt = "4"
 Aqua = "0.6, 0.7, 0.8"
+CUDA = "5"
+cuTENSOR = "2"
 ChainRulesCore = "1"
 ChainRulesTestUtils = "1"
 Combinatorics = "1"
@@ -49,7 +58,10 @@ Zygote = "0.7"
 julia = "1.10"
 
 [extras]
+Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1"
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a"
 Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
@@ -61,4 +73,10 @@ TestExtras = "5ed8adda-3752-4e41-b88a-e8b09835ee3a"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [targets]
-test = ["Aqua", "Combinatorics", "LinearAlgebra", "TensorOperations", "Test", "TestExtras", "ChainRulesCore", "ChainRulesTestUtils", "FiniteDifferences", "Zygote"]
+test = ["Adapt", "Aqua", "Combinatorics", "CUDA", "cuTENSOR", "LinearAlgebra", "TensorOperations", "Test", "TestExtras", "ChainRulesCore", "ChainRulesTestUtils", "FiniteDifferences", "Zygote"]
+
+[sources]
+CUDA = {url = "https://github.com/JuliaGPU/CUDA.jl", rev = "master"}
+cuTENSOR = {url = "https://github.com/JuliaGPU/CUDA.jl", subdir = "lib/cutensor", rev = "ksh/cutensor_bump"}
+MatrixAlgebraKit = {url = "https://github.com/QuantumKitHub/MatrixAlgebraKit.jl", rev = "ksh/tk"}
+TensorOperations = {url = "https://github.com/QuantumKitHub/TensorOperations.jl", rev = "ksh/cutensor_bump"}
diff --git a/ext/TensorKitAMDGPUExt/TensorKitAMDGPUExt.jl b/ext/TensorKitAMDGPUExt/TensorKitAMDGPUExt.jl
new file mode 100644
index 000000000..db5a5c733
--- /dev/null
+++ b/ext/TensorKitAMDGPUExt/TensorKitAMDGPUExt.jl
@@ -0,0 +1,10 @@
+module TensorKitAMDGPUExt
+
+using TensorKit
+using TensorKit: SectorDict
+using AMDGPU
+using Random
+
+include("roctensormap.jl")
+
+end
diff --git a/ext/TensorKitAMDGPUExt/roctensormap.jl b/ext/TensorKitAMDGPUExt/roctensormap.jl
new file mode 100644
index 000000000..38a3e1355
--- /dev/null
+++ b/ext/TensorKitAMDGPUExt/roctensormap.jl
@@ -0,0 +1,103 @@
+const _ROCMatOrDict{I,T} = Union{ROCMatrix{T},SectorDict{I,ROCMatrix{T}}}
+const ROCTensorMap{T,S,N₁,N₂,I,A<:_ROCMatOrDict{I,T}} = TensorMap{T,S,N₁,N₂,A}
+const ROCTensor{T,S,N,I,A<:_ROCMatOrDict{I,T}} = ROCTensorMap{T,S,N,0,I,A}
+
+function ROCTensorMap{T}(::UndefInitializer, V::TensorMapSpace{S,N₁,N₂}) where {T,S,N₁,N₂}
+    A = ROCMatrix{T,AMDGPU.default_memory}
+    TT = tensormaptype(S, N₁, N₂, A)
+    return TT(undef, codomain(V), domain(V))
+end
+
+function ROCTensorMap{T}(::UndefInitializer, codomain::TensorSpace{S},
+                         domain::TensorSpace{S}) where {T,S}
+    return ROCTensorMap{T}(undef, codomain ← domain)
+end
+function ROCTensor{T}(::UndefInitializer, V::TensorSpace{S}) where {T,S}
+    return ROCTensorMap{T}(undef, V ← one(V))
+end
+
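+# The generated methods below mirror TensorKit's dense-array constructors for the ROC
+# backend: `AMDGPU.zeros`/`AMDGPU.ones` and `AMDGPU.rand`/`AMDGPU.randn` accept tensor
+# spaces. A usage sketch (assuming `V` and `W` are `ElementarySpace`s of the same type):
+#     AMDGPU.zeros(ComplexF64, V ⊗ W, V)    # zero-filled ROCTensorMap
+#     AMDGPU.randn(V ⊗ W ← V)               # eltype defaults to Float64
+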
+for (fname, felt) in ((:zeros, :zero), (:ones, :one))
+    @eval begin
+        function AMDGPU.$fname(codomain::TensorSpace{S},
+                               domain::TensorSpace{S}=one(codomain)) where {S<:IndexSpace}
+            return AMDGPU.$fname(codomain ← domain)
+        end
+        function AMDGPU.$fname(::Type{T}, codomain::TensorSpace{S},
+                               domain::TensorSpace{S}=one(codomain)) where {T,S<:IndexSpace}
+            return AMDGPU.$fname(T, codomain ← domain)
+        end
+        AMDGPU.$fname(V::TensorMapSpace) = AMDGPU.$fname(Float64, V)
+        function AMDGPU.$fname(::Type{T}, V::TensorMapSpace) where {T}
+            t = ROCTensorMap{T}(undef, V)
+            fill!(t, $felt(T))
+            return t
+        end
+    end
+end
+
+for randfun in (:rand, :randn)
+    randfun! = Symbol(randfun, :!)
+    @eval begin
+        # converting `codomain` and `domain` into `HomSpace`
+        function AMDGPU.$randfun(codomain::TensorSpace{S},
+                                 domain::TensorSpace{S}) where {S<:IndexSpace}
+            return AMDGPU.$randfun(codomain ← domain)
+        end
+        function AMDGPU.$randfun(::Type{T}, codomain::TensorSpace{S},
+                                 domain::TensorSpace{S}) where {T,S<:IndexSpace}
+            return AMDGPU.$randfun(T, codomain ← domain)
+        end
+        function AMDGPU.$randfun(rng::Random.AbstractRNG, ::Type{T},
+                                 codomain::TensorSpace{S},
+                                 domain::TensorSpace{S}) where {T,S<:IndexSpace}
+            return AMDGPU.$randfun(rng, T, codomain ← domain)
+        end
+
+        # accepting single `TensorSpace`
+        AMDGPU.$randfun(codomain::TensorSpace) = AMDGPU.$randfun(codomain ← one(codomain))
+        function AMDGPU.$randfun(::Type{T}, codomain::TensorSpace) where {T}
+            return AMDGPU.$randfun(T, codomain ← one(codomain))
+        end
+        function AMDGPU.$randfun(rng::Random.AbstractRNG, ::Type{T},
+                                 codomain::TensorSpace) where {T}
+            return AMDGPU.$randfun(rng, T, codomain ← one(codomain))
+        end
+
+        # filling in default eltype
+        AMDGPU.$randfun(V::TensorMapSpace) = AMDGPU.$randfun(Float64, V)
+        function AMDGPU.$randfun(rng::Random.AbstractRNG, V::TensorMapSpace)
+            return AMDGPU.$randfun(rng, Float64, V)
+        end
+
+        # filling in default rng
+        function AMDGPU.$randfun(::Type{T}, V::TensorMapSpace) where {T}
+            return AMDGPU.$randfun(Random.default_rng(), T, V)
+        end
+
+        # implementation
+        function AMDGPU.$randfun(rng::Random.AbstractRNG, ::Type{T},
+                                 V::TensorMapSpace) where {T}
+            t = ROCTensorMap{T}(undef, V)
+            AMDGPU.$randfun!(rng, t)
+            return t
+        end
+    end
+end
+
+# converters
+# ----------
+function Base.convert(::Type{ROCTensorMap}, d::Dict{Symbol,Any})
+    try
+        codomain = eval(Meta.parse(d[:codomain]))
+        domain = eval(Meta.parse(d[:domain]))
+        data = SectorDict(eval(Meta.parse(c)) => ROCArray(b) for (c, b) in d[:data])
+        return TensorMap(data, codomain, domain)
+    catch e # sector unknown in TensorKit.jl; user-defined, hopefully accessible in Main
+        codomain = Base.eval(Main, Meta.parse(d[:codomain]))
+        domain = Base.eval(Main, Meta.parse(d[:domain]))
+        data = SectorDict(Base.eval(Main, Meta.parse(c)) => ROCArray(b)
+                          for (c, b) in d[:data])
+        return TensorMap(data, codomain, domain)
+    end
+end
+
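+# Round-trip sketch for the `Dict` converter above (assuming `t` is a `ROCTensorMap`):
+#     d = convert(Dict, t)
+#     t′ = convert(ROCTensorMap, d)
+#     t′ == t    # expected to hold; exercised in test/amdgpu.jl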
diff --git a/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl b/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl
new file mode 100644
index 000000000..ae776673c
--- /dev/null
+++ b/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl
@@ -0,0 +1,90 @@
+module TensorKitCUDAExt
+
+using CUDA, CUDA.CUBLAS, LinearAlgebra
+using CUDA: @allowscalar
+using cuTENSOR: cuTENSOR
+
+using TensorKit
+import TensorKit.VectorInterface: scalartype as vi_scalartype
+using TensorKit.Factorizations
+using TensorKit.Factorizations: select_svd_algorithm, OFA, initialize_output, AbstractAlgorithm
+using TensorKit: SectorDict, tensormaptype, scalar, similarstoragetype, AdjointTensorMap
+
+using TensorKit.MatrixAlgebraKit
+
+using Random
+
+include("cutensormap.jl")
+
+TensorKit.Factorizations.select_svd_algorithm(::CuTensorMap, ::TensorKit.Factorizations.SVD) = CUSOLVER_QRIteration()
+TensorKit.Factorizations.select_svd_algorithm(::CuTensorMap, ::TensorKit.Factorizations.SDD) = throw(ArgumentError("DivideAndConquer unavailable on CUDA"))
+TensorKit.Factorizations.select_svd_algorithm(::CuTensorMap, alg::OFA) = throw(ArgumentError(lazy"Unknown algorithm $alg"))
+
+const CuDiagonalTensorMap{T,S} = DiagonalTensorMap{T,S,CuVector{T,CUDA.DeviceMemory}}
+
+"""
+    CuDiagonalTensorMap{T}(undef, domain::S) where {T,S<:IndexSpace}
+    # expert mode: select storage type `A`
+    DiagonalTensorMap{T,S,A}(undef, domain::S) where {T,S<:IndexSpace,A<:DenseVector{T}}
+
+Construct a `CuDiagonalTensorMap` (a `DiagonalTensorMap` with `CuVector` storage) with
+uninitialized data.
+"""
+function CuDiagonalTensorMap{T}(::UndefInitializer, V::TensorMapSpace) where {T}
+    (numin(V) == numout(V) == 1 && domain(V) == codomain(V)) ||
+        throw(ArgumentError("DiagonalTensorMap requires a space with equal domain and codomain and 2 indices"))
+    return CuDiagonalTensorMap{T}(undef, domain(V))
+end
+function CuDiagonalTensorMap{T}(::UndefInitializer, V::ProductSpace) where {T}
+    length(V) == 1 ||
+        throw(ArgumentError("DiagonalTensorMap requires `numin(d) == numout(d) == 1`"))
+    return CuDiagonalTensorMap{T}(undef, only(V))
+end
+function CuDiagonalTensorMap{T}(::UndefInitializer, V::S) where {T,S<:IndexSpace}
+    return CuDiagonalTensorMap{T,S}(undef, V)
+end
+CuDiagonalTensorMap(::UndefInitializer, V::IndexSpace) = CuDiagonalTensorMap{Float64}(undef, V)
+
+function TensorKit.Factorizations.initialize_output(::typeof(svd_compact!), t::CuTensorMap, ::AbstractAlgorithm)
+    V_cod = V_dom = infimum(fuse(codomain(t)), fuse(domain(t)))
+    U = similar(t, codomain(t) ← V_cod)
+    S = CuDiagonalTensorMap{real(scalartype(t))}(undef, V_cod)
+    Vᴴ = similar(t, V_dom ← domain(t))
+    return U, S, Vᴴ
+end
+
+function TensorKit.Factorizations.initialize_output(::typeof(eigh_full!), t::CuTensorMap, ::AbstractAlgorithm)
+    V_D = fuse(domain(t))
+    T = real(scalartype(t))
+    D = CuDiagonalTensorMap{T}(undef, V_D)
+    V = similar(t, codomain(t) ← V_D)
+    return D, V
+end
+
+function TensorKit.Factorizations.initialize_output(::typeof(eig_full!), t::CuTensorMap, ::AbstractAlgorithm)
+    V_D = fuse(domain(t))
+    Tc = complex(scalartype(t))
+    D = CuDiagonalTensorMap{Tc}(undef, V_D)
+    V = similar(t, Tc, codomain(t) ← V_D)
+    return D, V
+end
+
+function TensorKit.Factorizations.initialize_output(::typeof(eigh_vals!), t::CuTensorMap, alg::AbstractAlgorithm)
+    V_D = fuse(domain(t))
+    T = real(scalartype(t))
+    return CuDiagonalTensorMap{T}(undef, V_D)
+end
+
+function TensorKit.Factorizations.initialize_output(::typeof(eig_vals!), t::CuTensorMap, alg::AbstractAlgorithm)
+    V_D = fuse(domain(t))
+    Tc = complex(scalartype(t))
+    return CuDiagonalTensorMap{Tc}(undef, V_D)
+end
+
+
+# TODO
+# add VectorInterface extensions for proper CUDA promotion
+function TensorKit.VectorInterface.promote_add(TA::Type{<:CUDA.StridedCuMatrix{Tx}}, TB::Type{<:CUDA.StridedCuMatrix{Ty}}, α::Tα = TensorKit.VectorInterface.One(), β::Tβ = TensorKit.VectorInterface.One()) where {Tx, Ty, Tα, Tβ}
+    return Base.promote_op(TensorKit.VectorInterface.add, Tx, Ty, Tα, Tβ)
+end
+
+end
diff --git a/ext/TensorKitCUDAExt/cutensormap.jl b/ext/TensorKitCUDAExt/cutensormap.jl
new file mode 100644
index 000000000..e3f6cf203
--- /dev/null
+++ b/ext/TensorKitCUDAExt/cutensormap.jl
@@ -0,0 +1,207 @@
+const CuTensorMap{T,S,N₁,N₂} = TensorMap{T,S,N₁,N₂, CuVector{T,CUDA.DeviceMemory}}
+const CuTensor{T, S, N} 
= CuTensorMap{T, S, N, 0} + +function TensorKit.tensormaptype(S::Type{<:IndexSpace}, N₁, N₂, TorA::Type{<:StridedCuArray}) + if TorA <: CuArray + return TensorMap{eltype(TorA),S,N₁,N₂,CuVector{eltype(TorA), CUDA.DeviceMemory}} + else + throw(ArgumentError("argument $TorA should specify a scalar type (`<:Number`) or a storage type `<:CuVector{<:Number}`")) + end +end + +function CuTensorMap{T}(::UndefInitializer, V::TensorMapSpace{S, N₁, N₂}) where {T, S, N₁, N₂} + return CuTensorMap{T,S,N₁,N₂}(undef, V) +end + +function CuTensorMap{T}(::UndefInitializer, codomain::TensorSpace{S}, + domain::TensorSpace{S}) where {T,S} + return CuTensorMap{T}(undef, codomain ← domain) +end +function CuTensor{T}(::UndefInitializer, V::TensorSpace{S}) where {T,S} + return CuTensorMap{T}(undef, V ← one(V)) +end +# constructor starting from block data +""" + CuTensorMap(data::AbstractDict{<:Sector,<:CuMatrix}, codomain::ProductSpace{S,N₁}, + domain::ProductSpace{S,N₂}) where {S<:ElementarySpace,N₁,N₂} + CuTensorMap(data, codomain ← domain) + CuTensorMap(data, domain → codomain) + +Construct a `CuTensorMap` by explicitly specifying its block data. + +## Arguments +- `data::AbstractDict{<:Sector,<:CuMatrix}`: dictionary containing the block data for + each coupled sector `c` as a matrix of size `(blockdim(codomain, c), blockdim(domain, c))`. +- `codomain::ProductSpace{S,N₁}`: the codomain as a `ProductSpace` of `N₁` spaces of type + `S<:ElementarySpace`. +- `domain::ProductSpace{S,N₂}`: the domain as a `ProductSpace` of `N₂` spaces of type + `S<:ElementarySpace`. + +Alternatively, the domain and codomain can be specified by passing a [`HomSpace`](@ref) +using the syntax `codomain ← domain` or `domain → codomain`. +""" +function CuTensorMap(data::AbstractDict{<:Sector,<:CuArray}, + V::TensorMapSpace{S,N₁,N₂}) where {S,N₁,N₂} + T = eltype(valtype(data)) + t = CuTensorMap{T}(undef, V) + for (c, b) in blocks(t) + haskey(data, c) || throw(SectorMismatch("no data for block sector $c")) + datac = data[c] + size(datac) == size(b) || + throw(DimensionMismatch("wrong size of block for sector $c")) + copy!(b, datac) + end + for (c, b) in data + c ∈ blocksectors(t) || isempty(b) || + throw(SectorMismatch("data for block sector $c not expected")) + end + return t +end +function CuTensorMap{T}(data::DenseVector{T}, codomain::TensorSpace{S}, + domain::TensorSpace{S}) where {T,S} + return CuTensorMap(data, codomain ← domain) +end +function CuTensorMap(data::AbstractDict{<:Sector,<:CuMatrix}, codom::TensorSpace{S}, + dom::TensorSpace{S}) where {S} + return CuTensorMap(data, codom ← dom) +end + +for (fname, felt) in ((:zeros, :zero), (:ones, :one)) + @eval begin + function CUDA.$fname(codomain::TensorSpace{S}, + domain::TensorSpace{S}=one(codomain)) where {S<:IndexSpace} + return CUDA.$fname(codomain ← domain) + end + function CUDA.$fname(::Type{T}, codomain::TensorSpace{S}, + domain::TensorSpace{S}=one(codomain)) where {T,S<:IndexSpace} + return CUDA.$fname(T, codomain ← domain) + end + CUDA.$fname(V::TensorMapSpace) = CUDA.$fname(Float64, V) + function CUDA.$fname(::Type{T}, V::TensorMapSpace) where {T} + t = CuTensorMap{T}(undef, V) + fill!(t, $felt(T)) + return t + end + end +end + +for randfun in (:rand, :randn) + randfun! = Symbol(randfun, :!) 
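+    # The `@eval` block below generates the usual ladder of convenience methods for
+    # `CUDA.rand`/`CUDA.randn` on spaces: converting `(codomain, domain)` pairs into a
+    # `HomSpace`, and filling in the default eltype (`Float64`) and default RNG. A usage
+    # sketch (assuming `V` and `W` are `ElementarySpace`s of the same type):
+    #     CUDA.rand(V ⊗ W)                            # Float64, default RNG
+    #     CUDA.randn(ComplexF32, V ⊗ W ← V)           # chosen eltype
+    #     CUDA.rand(Random.default_rng(), Float32, V ← V)
+    # All of these funnel into `CuTensorMap{T}(undef, V)` followed by `CUDA.$randfun!`.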
+    @eval begin
+        # converting `codomain` and `domain` into `HomSpace`
+        function CUDA.$randfun(codomain::TensorSpace{S},
+                               domain::TensorSpace{S}) where {S<:IndexSpace}
+            return CUDA.$randfun(codomain ← domain)
+        end
+        function CUDA.$randfun(::Type{T}, codomain::TensorSpace{S},
+                               domain::TensorSpace{S}) where {T,S<:IndexSpace}
+            return CUDA.$randfun(T, codomain ← domain)
+        end
+        function CUDA.$randfun(rng::Random.AbstractRNG, ::Type{T},
+                               codomain::TensorSpace{S},
+                               domain::TensorSpace{S}) where {T,S<:IndexSpace}
+            return CUDA.$randfun(rng, T, codomain ← domain)
+        end
+
+        # accepting single `TensorSpace`
+        CUDA.$randfun(codomain::TensorSpace) = CUDA.$randfun(codomain ← one(codomain))
+        function CUDA.$randfun(::Type{T}, codomain::TensorSpace) where {T}
+            return CUDA.$randfun(T, codomain ← one(codomain))
+        end
+        function CUDA.$randfun(rng::Random.AbstractRNG, ::Type{T},
+                               codomain::TensorSpace) where {T}
+            return CUDA.$randfun(rng, T, codomain ← one(codomain))
+        end
+
+        # filling in default eltype
+        CUDA.$randfun(V::TensorMapSpace) = CUDA.$randfun(Float64, V)
+        function CUDA.$randfun(rng::Random.AbstractRNG, V::TensorMapSpace)
+            return CUDA.$randfun(rng, Float64, V)
+        end
+
+        # filling in default rng
+        function CUDA.$randfun(::Type{T}, V::TensorMapSpace) where {T}
+            return CUDA.$randfun(Random.default_rng(), T, V)
+        end
+
+        # implementation
+        function CUDA.$randfun(rng::Random.AbstractRNG, ::Type{T},
+                               V::TensorMapSpace) where {T}
+            t = CuTensorMap{T}(undef, V)
+            CUDA.$randfun!(rng, t)
+            return t
+        end
+    end
+end
+
+# converters
+# ----------
+function Base.convert(::Type{CuTensorMap}, d::Dict{Symbol,Any})
+    try
+        codomain = eval(Meta.parse(d[:codomain]))
+        domain = eval(Meta.parse(d[:domain]))
+        data = SectorDict(eval(Meta.parse(c)) => CuArray(b) for (c, b) in d[:data])
+        return CuTensorMap(data, codomain, domain)
+    catch e # sector unknown in TensorKit.jl; user-defined, hopefully accessible in Main
+        codomain = Base.eval(Main, Meta.parse(d[:codomain]))
+        domain = Base.eval(Main, Meta.parse(d[:domain]))
+        data = SectorDict(Base.eval(Main, Meta.parse(c)) => CuArray(b)
+                          for (c, b) in d[:data])
+        return CuTensorMap(data, codomain, domain)
+    end
+end
+
+function Base.convert(::Type{CuTensorMap}, t::AbstractTensorMap)
+    return copy!(CuTensorMap{scalartype(t)}(undef, space(t)), t)
+end
+
+# Scalar implementation
+#-----------------------
+function TensorKit.scalar(t::CuTensorMap)
+    # TODO: should scalar only work if N₁ == N₂ == 0?
+    return @allowscalar dim(codomain(t)) == dim(domain(t)) == 1 ? 
+ first(blocks(t))[2][1, 1] : throw(DimensionMismatch()) +end + +TensorKit.scalartype(A::StridedCuArray{T}) where {T} = T +vi_scalartype(::Type{<:CuTensorMap{T}}) where {T} = T +vi_scalartype(::Type{<:CuArray{T}}) where {T} = T + +function TensorKit.similarstoragetype(TT::Type{<:CuTensorMap{TTT,S,N₁,N₂}}, ::Type{T}) where {TTT,T,S,N₁,N₂} + return CuVector{T, CUDA.DeviceMemory} +end + +function Base.convert(TT::Type{CuTensorMap{T,S,N₁,N₂}}, + t::AbstractTensorMap{<:Any,S,N₁,N₂}) where {T,S,N₁,N₂} + if typeof(t) === TT + return t + else + tnew = TT(undef, space(t)) + return copy!(tnew, t) + end +end + +function Base.copy!(tdst::CuTensorMap{T, S, N₁, N₂}, tsrc::CuTensorMap{T, S, N₁, N₂}) where {T, S, N₁, N₂} + space(tdst) == space(tsrc) || throw(SpaceMismatch("$(space(tdst)) ≠ $(space(tsrc))")) + for ((c, bdst), (_, bsrc)) in zip(blocks(tdst), blocks(tsrc)) + copy!(bdst, bsrc) + end + return tdst +end + +function Base.copy!(tdst::CuTensorMap, tsrc::TensorKit.AdjointTensorMap) + space(tdst) == space(tsrc) || throw(SpaceMismatch("$(space(tdst)) ≠ $(space(tsrc))")) + for ((c, bdst), (_, bsrc)) in zip(blocks(tdst), blocks(tsrc)) + copy!(bdst, bsrc) + end + return tdst +end + +function Base.promote_rule(::Type{<:TT₁}, + ::Type{<:TT₂}) where {S,N₁,N₂, TTT₁, TTT₂, + TT₁<:CuTensorMap{TTT₁,S,N₁,N₂}, + TT₂<:CuTensorMap{TTT₂,S,N₁,N₂}} + T = TensorKit.VectorInterface.promote_add(TTT₁, TTT₂) + return CuTensorMap{T,S,N₁,N₂} +end diff --git a/src/tensors/factorizations/factorizations.jl b/src/tensors/factorizations/factorizations.jl index db1b1c3c7..f56e44a1b 100644 --- a/src/tensors/factorizations/factorizations.jl +++ b/src/tensors/factorizations/factorizations.jl @@ -99,10 +99,10 @@ eigh!(d::DiagonalTensorMap{<:Real}) = d, one(d) eigh!(d::DiagonalTensorMap{<:Complex}) = DiagonalTensorMap(real(d.data), d.domain), one(d) function LinearAlgebra.svdvals(d::DiagonalTensorMap) - return SectorDict(c => LinearAlgebra.svdvals(b) for (c, b) in blocks(d)) + return SectorDict(c => svd_vals(b) for (c, b) in blocks(d)) end function LinearAlgebra.eigvals(d::DiagonalTensorMap) - return SectorDict(c => LinearAlgebra.eigvals(b) for (c, b) in blocks(d)) + return SectorDict(c => eig_vals(b) for (c, b) in blocks(d)) end function LinearAlgebra.cond(d::DiagonalTensorMap, p::Real=2) @@ -121,11 +121,11 @@ LinearAlgebra.svdvals!(t::AdjointTensorMap) = svdvals!(adjoint(t)) #--------------------------# function LinearAlgebra.eigvals!(t::TensorMap{<:RealOrComplexFloat}; kwargs...) - return SectorDict(c => complex(LinearAlgebra.eigvals!(b; kwargs...)) + return SectorDict(c => complex(eig_vals!(b; kwargs...)) for (c, b) in blocks(t)) end function LinearAlgebra.eigvals!(t::AdjointTensorMap{<:RealOrComplexFloat}; kwargs...) 
- return SectorDict(c => conj!(complex(LinearAlgebra.eigvals!(b; kwargs...))) + return SectorDict(c => conj!(complex(eig_vals!(b; kwargs...))) for (c, b) in blocks(t)) end diff --git a/src/tensors/factorizations/implementations.jl b/src/tensors/factorizations/implementations.jl index d00703549..f341b4e70 100644 --- a/src/tensors/factorizations/implementations.jl +++ b/src/tensors/factorizations/implementations.jl @@ -3,6 +3,11 @@ _kindof(::Union{QR,QRpos}) = :qr _kindof(::Union{LQ,LQpos}) = :lq _kindof(::Polar) = :polar +select_svd_algorithm(t, ::SVD) = LAPACK_QRIteration() +select_svd_algorithm(t, ::SDD) = LAPACK_DivideAndConquer() +select_svd_algorithm(t, alg::OFA) = throw(ArgumentError(lazy"Unknown algorithm $alg")) +select_svd_algorithm(t::Base.ReshapedArray, alg) = select_svd_algorithm(parent(parent(t)), alg) + leftorth!(t; alg=nothing, kwargs...) = _leftorth!(t, alg; kwargs...) function _leftorth!(t::AbstractTensorMap, alg::Nothing, ; kwargs...) @@ -26,9 +31,7 @@ function _leftorth!(t, alg::OFA; kwargs...) kind = _kindof(alg) if kind == :svd - alg_svd = alg === SVD() ? LAPACK_QRIteration() : - alg === SDD() ? LAPACK_DivideAndConquer() : - throw(ArgumentError(lazy"Unknown algorithm $alg")) + alg_svd = select_svd_algorithm(t, alg) return left_orth!(t; kind, alg_svd, trunc) elseif kind == :qr alg_qr = (; positive=(alg == QRpos())) @@ -52,9 +55,7 @@ function leftnull!(t::AbstractTensorMap; kind = _kindof(alg) if kind == :svd - alg_svd = alg === SVD() ? LAPACK_QRIteration() : - alg === SDD() ? LAPACK_DivideAndConquer() : - throw(ArgumentError(lazy"Unknown algorithm $alg")) + alg_svd = select_svd_algorithm(t, alg) return left_null!(t; kind, alg_svd, trunc) elseif kind == :qr alg_qr = (; positive=(alg == QRpos())) @@ -82,9 +83,7 @@ function rightorth!(t::AbstractTensorMap; kind = _kindof(alg) if kind == :svd - alg_svd = alg === SVD() ? LAPACK_QRIteration() : - alg === SDD() ? LAPACK_DivideAndConquer() : - throw(ArgumentError(lazy"Unknown algorithm $alg")) + alg_svd = select_svd_algorithm(t, alg) return right_orth!(t; kind, alg_svd, trunc) elseif kind == :lq alg_lq = (; positive=(alg == LQpos())) @@ -106,9 +105,7 @@ function rightnull!(t::AbstractTensorMap; kind = _kindof(alg) if kind == :svd - alg_svd = alg === SVD() ? LAPACK_QRIteration() : - alg === SDD() ? LAPACK_DivideAndConquer() : - throw(ArgumentError(lazy"Unknown algorithm $alg")) + alg_svd = select_svd_algorithm(t, alg) return right_null!(t; kind, alg_svd, trunc) elseif kind == :lq alg_lq = (; positive=(alg == LQpos())) @@ -149,16 +146,16 @@ function tsvd!(t::AbstractTensorMap; trunc=notrunc(), p=nothing, alg=nothing, kw InnerProductStyle(t) === EuclideanInnerProduct() || throw_invalid_innerproduct(:tsvd!) isnothing(p) || Base.depwarn("p is no longer supported", :tsvd!) - if alg isa OFA + if alg isa Union{OFA, SVD} Base.depwarn(lazy"$alg is deprecated", :tsvd!) - alg = alg === SVD() ? LAPACK_QRIteration() : - alg === SDD() ? LAPACK_DivideAndConquer() : - throw(ArgumentError(lazy"Unknown algorithm $alg")) + alg_svd = select_svd_algorithm(t, alg) + else + alg_svd = alg end if trunc == notrunc() - return svd_compact!(t; alg, kwargs...) + return svd_compact!(t; alg_svd, kwargs...) else - return svd_trunc!(t; trunc, alg, kwargs...) + return svd_trunc!(t; trunc, alg_svd, kwargs...) 
end end diff --git a/src/tensors/factorizations/matrixalgebrakit.jl b/src/tensors/factorizations/matrixalgebrakit.jl index eb27654a5..2ddf3eb5e 100644 --- a/src/tensors/factorizations/matrixalgebrakit.jl +++ b/src/tensors/factorizations/matrixalgebrakit.jl @@ -29,7 +29,6 @@ for f! in (:qr_compact!, :qr_full!, :left_orth!, :right_orth!) @eval function $f!(t::AbstractTensorMap, F, alg::AbstractAlgorithm) check_input($f!, t, F, alg) - foreachblock(t, F...) do _, bs factors = Base.tail(bs) factors′ = $f!(first(bs), factors, alg) diff --git a/src/tensors/factorizations/truncation.jl b/src/tensors/factorizations/truncation.jl index d553c5c93..b37f6dd8a 100644 --- a/src/tensors/factorizations/truncation.jl +++ b/src/tensors/factorizations/truncation.jl @@ -24,26 +24,25 @@ truncspace(space::ElementarySpace) = TruncationSpace(space) function truncate!(::typeof(svd_trunc!), (U, S, Vᴴ)::_T_USVᴴ, strategy::TruncationStrategy) ind = findtruncated_sorted(diagview(S), strategy) V_truncated = spacetype(S)(c => length(I) for (c, I) in ind) - Ũ = similar(U, codomain(U) ← V_truncated) for (c, b) in blocks(Ũ) I = get(ind, c, nothing) @assert !isnothing(I) - copy!(b, @view(block(U, c)[:, I])) + copyto!(b, @view(block(U, c)[:, I])) end - S̃ = DiagonalTensorMap{scalartype(S)}(undef, V_truncated) + S̃ = DiagonalTensorMap{scalartype(S), spacetype(S), storagetype(S)}(undef, V_truncated) for (c, b) in blocks(S̃) I = get(ind, c, nothing) @assert !isnothing(I) - copy!(b.diag, @view(block(S, c).diag[I])) + copyto!(b.diag, @view(block(S, c).diag[I])) end Ṽᴴ = similar(Vᴴ, V_truncated ← domain(Vᴴ)) for (c, b) in blocks(Ṽᴴ) I = get(ind, c, nothing) @assert !isnothing(I) - copy!(b, @view(block(Vᴴ, c)[I, :])) + copyto!(b, @view(block(Vᴴ, c)[I, :])) end return Ũ, S̃, Ṽᴴ @@ -68,8 +67,8 @@ end function truncate!(::typeof(eigh_trunc!), (D, V)::_T_DV, strategy::TruncationStrategy) ind = findtruncated(diagview(D), strategy) V_truncated = spacetype(D)(c => length(I) for (c, I) in ind) - - D̃ = DiagonalTensorMap{scalartype(D)}(undef, V_truncated) + + D̃ = DiagonalTensorMap{scalartype(D), spacetype(D), storagetype(D)}(undef, V_truncated) for (c, b) in blocks(D̃) I = get(ind, c, nothing) @assert !isnothing(I) @@ -89,7 +88,7 @@ function truncate!(::typeof(eig_trunc!), (D, V)::_T_DV, strategy::TruncationStra ind = findtruncated(diagview(D), strategy) V_truncated = spacetype(D)(c => length(I) for (c, I) in ind) - D̃ = DiagonalTensorMap{scalartype(D)}(undef, V_truncated) + D̃ = DiagonalTensorMap{scalartype(D), spacetype(D), storagetype(D)}(undef, V_truncated) for (c, b) in blocks(D̃) I = get(ind, c, nothing) @assert !isnothing(I) diff --git a/src/tensors/linalg.jl b/src/tensors/linalg.jl index 51567dec7..4f00f1444 100644 --- a/src/tensors/linalg.jl +++ b/src/tensors/linalg.jl @@ -268,7 +268,7 @@ function _norm(blockiter, p::Real, init::Real) end elseif p == 2 n² = mapreduce(+, blockiter; init=init) do (c, b) - return isempty(b) ? init : oftype(init, dim(c) * LinearAlgebra.norm2(b)^2) + return isempty(b) ? init : oftype(init, dim(c) * norm(b, 2)^2) end return sqrt(n²) elseif p == 1 @@ -277,7 +277,7 @@ function _norm(blockiter, p::Real, init::Real) end elseif p > 0 nᵖ = mapreduce(+, blockiter; init=init) do (c, b) - return isempty(b) ? init : oftype(init, dim(c) * LinearAlgebra.normp(b, p)^p) + return isempty(b) ? 
init : oftype(init, dim(c) * norm(b, p)^p) end return (nᵖ)^inv(oftype(nᵖ, p)) else @@ -544,8 +544,8 @@ function ⊗(t1::AbstractTensorMap, t2::AbstractTensorMap) d4 = dim(dom2, f2r.uncoupled) m1 = sreshape(t1[f1l, f1r], (d1, 1, d3, 1)) m2 = sreshape(t2[f2l, f2r], (1, d2, 1, d4)) - m = sreshape(t[fl, fr], (d1, d2, d3, d4)) - m .+= coeff1 .* conj(coeff2) .* m1 .* m2 + m = sreshape(t[fl, fr], (d1, d2, d3, d4)) + m .+= coeff1 .* conj.(coeff2) .* m1 .* m2 end end end diff --git a/src/tensors/tensor.jl b/src/tensors/tensor.jl index a918478c3..914da3c3b 100644 --- a/src/tensors/tensor.jl +++ b/src/tensors/tensor.jl @@ -138,7 +138,7 @@ function TensorMap(data::AbstractDict{<:Sector,<:AbstractMatrix}, datac = data[c] size(datac) == size(b) || throw(DimensionMismatch("wrong size of block for sector $c")) - copy!(b, datac) + copyto!(b, datac) end for (c, b) in data c ∈ blocksectors(t) || isempty(b) || diff --git a/test/amdgpu.jl b/test/amdgpu.jl new file mode 100644 index 000000000..b7d1bfb41 --- /dev/null +++ b/test/amdgpu.jl @@ -0,0 +1,597 @@ +using Adapt +ad = adapt(Array) + +const AMDGPUExt = Base.get_extension(TensorKit, :TensorKitAMDGPUExt) +@assert !isnothing(AMDGPUExt) +# const ROCTensorMap{T,S,N1,N2,I,A} = AMDGPUExt.ROCTensorMap{T,S,N1,N2,I,A} +const ROCTensorMap = getglobal(AMDGPUExt, :ROCTensorMap) + +for V in (Vtr, Vℤ₂, Vfℤ₂, Vℤ₃, VU₁, VfU₁, VCU₁, VSU₂, VfSU₂)#, VSU₃) + V1, V2, V3, V4, V5 = V + @assert V3 * V4 * V2 ≿ V1' * V5' # necessary for leftorth tests + @assert V3 * V4 ≾ V1' * V2' * V5' # necessary for rightorth tests +end + +spacelist = try + if ENV["CI"] == "true" + println("Detected running on CI") + if Sys.iswindows() + (Vtr, Vℤ₂, Vfℤ₂, Vℤ₃, VU₁, VfU₁, VCU₁, VSU₂) + elseif Sys.isapple() + (Vtr, Vℤ₂, Vfℤ₂, Vℤ₃, VfU₁, VfSU₂)#, VSU₃) + else + (Vtr, Vℤ₂, Vfℤ₂, VU₁, VCU₁, VSU₂, VfSU₂)#, VSU₃) + end + else + (Vtr, VU₁, VSU₂, Vfℤ₂) + end +catch + (Vtr, Vℤ₂, Vfℤ₂, Vℤ₃, VU₁, VfU₁, VCU₁, VSU₂, VfSU₂)#, VSU₃) +end + +for V in spacelist + I = sectortype(first(V)) + Istr = TensorKit.type_repr(I) + println("---------------------------------------") + println("AMDGPU Tensors with symmetry: $Istr") + println("---------------------------------------") + @timedtestset "Tensors with symmetry: $Istr" verbose = true begin + V1, V2, V3, V4, V5 = V + @timedtestset "Basic tensor properties" begin + W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5 + for T in (Int, Float32, Float64, ComplexF32, ComplexF64) + t = @constinferred AMDGPU.zeros(T, W) + # @test @constinferred(hash(t)) == hash(deepcopy(t)) # hash is not defined for CuArray? + @test scalartype(t) == T + @test norm(t) == 0 + @test codomain(t) == W + @test space(t) == (W ← one(W)) + @test domain(t) == one(W) + @test typeof(t) == + @constinferred tensormaptype(spacetype(t), 5, 0, storagetype(t)) + end + end + @timedtestset "Tensor Dict conversion" begin + W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5 + for T in (Int, Float32, ComplexF64) + t = @constinferred AMDGPU.rand(T, W) + d = convert(Dict, t) + @test t == convert(ROCTensorMap, d) + end + end + if hasfusiontensor(I) + @timedtestset "Tensor Array conversion" begin + W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5 + + # cuTENSOR does not support Int + Ts = sectortype(W) === Trivial ? 
(Int, Float32, ComplexF64) :
+                     (Float32, ComplexF64)
+                for T in Ts
+                    t = @constinferred AMDGPU.rand(T, W)
+                    a = @constinferred convert(ROCArray, t)
+                    @test t ≈ @constinferred TensorMap(a, W)
+                    # also test if input is matrix
+                    a2 = reshape(a, prod(dim, codomain(t)), prod(dim, domain(t)))
+                    @test t ≈ @constinferred TensorMap(a2, codomain(t), domain(t))
+                end
+            end
+        end
+        @timedtestset "Basic linear algebra" begin
+            W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5
+            for T in (Float32, ComplexF64)
+                t = @constinferred AMDGPU.rand(T, W)
+                @test scalartype(t) == T
+                @test space(t) == W
+                @test space(t') == W'
+                @test dim(t) == dim(space(t))
+                @test codomain(t) == codomain(W)
+                @test domain(t) == domain(W)
+                @test isa(@constinferred(norm(t)), real(T))
+                @test norm(t)^2 ≈ dot(t, t)
+                α = rand(T)
+                @test norm(α * t) ≈ abs(α) * norm(t)
+                @test norm(t + t, 2) ≈ 2 * norm(t, 2)
+                @test norm(t + t, 1) ≈ 2 * norm(t, 1)
+                @test norm(t + t, Inf) ≈ 2 * norm(t, Inf)
+                p = 3 * rand(Float64)
+                @test norm(t + t, p) ≈ 2 * norm(t, p)
+                @test norm(t) ≈ norm(t')
+
+                t2 = @constinferred rand!(similar(t))
+                β = rand(T)
+                @test @constinferred(dot(β * t2, α * t)) ≈ conj(β) * α * conj(dot(t, t2))
+                @test dot(t2, t) ≈ conj(dot(t, t2))
+                @test dot(t2, t) ≈ conj(dot(t2', t'))
+                @test dot(t2, t) ≈ dot(t', t2')
+
+                A = storagetype(t)
+
+                i1 = @constinferred(isomorphism(ROCMatrix{T}, V1 ⊗ V2, V2 ⊗ V1))
+                i2 = @constinferred(isomorphism(ROCMatrix{T}, V2 ⊗ V1, V1 ⊗ V2))
+                @test i1 * i2 == @constinferred(id(ROCMatrix{T}, V1 ⊗ V2))
+                @test i2 * i1 == @constinferred(id(ROCMatrix{T}, V2 ⊗ V1))
+
+                w = @constinferred(isometry(ROCMatrix{T}, V1 ⊗ (oneunit(V1) ⊕ oneunit(V1)),
+                                            V1))
+                @test dim(w) == 2 * dim(V1 ← V1)
+                @test w' * w == id(ROCMatrix{T}, V1)
+                @test w * w' == (w * w')^2
+            end
+        end
+        if hasfusiontensor(I)
+            @timedtestset "Basic linear algebra: test via conversion" begin
+                W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5
+                for T in (Float32, ComplexF64)
+                    t = AMDGPU.rand(T, W)
+                    t2 = @constinferred rand!(similar(t))
+                    @test norm(t, 2) ≈ norm(ad(t), 2)
+                    @test dot(t2, t) ≈ dot(ad(t2), ad(t))
+                    α = rand(T)
+                    @test ad(α * t) ≈ α * ad(t)
+                    @test ad(t + t) ≈ 2 * ad(t)
+                end
+            end
+            @timedtestset "Real and imaginary parts" begin
+                W = V1 ⊗ V2
+                for T in (Float64, ComplexF64, ComplexF32)
+                    t = @constinferred AMDGPU.randn(T, W, W)
+                    @test real(ad(t)) == ad(@constinferred real(t))
+                    @test imag(ad(t)) == ad(@constinferred imag(t))
+                end
+            end
+        end
+        @timedtestset "Tensor conversion" begin
+            W = V1 ⊗ V2
+            t = @constinferred AMDGPU.randn(W ← W)
+            @test typeof(convert(TensorMap, t')) == typeof(t)
+            tc = complex(t)
+            @test convert(typeof(tc), t) == tc
+            @test typeof(convert(typeof(tc), t)) == typeof(tc)
+            @test typeof(convert(typeof(tc), t')) == typeof(tc)
+            @test Base.promote_typeof(t, tc) == typeof(tc)
+            @test Base.promote_typeof(tc, t) == typeof(tc + t)
+        end
+        @timedtestset "diag/diagm" begin
+            W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5
+            t = AMDGPU.randn(ComplexF64, W)
+            d = LinearAlgebra.diag(t)
+            D = LinearAlgebra.diagm(codomain(t), domain(t), d)
+            @test LinearAlgebra.isdiag(D)
+            @test LinearAlgebra.diag(D) == d
+        end
+        @timedtestset "Permutations: test via inner product invariance" begin
+            W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5
+            t = AMDGPU.rand(ComplexF64, W)
+            t′ = randn!(similar(t))
+            for k in 0:5
+                for p in permutations(1:5)
+                    p1 = ntuple(n -> p[n], k)
+                    p2 = ntuple(n -> p[k + n], 5 - k)
+                    t2 = @constinferred permute(t, (p1, p2))
+                    @test norm(t2) ≈ norm(t)
+                    t2′ = permute(t′, (p1, p2))
+                    @test dot(t2′, t2) ≈ dot(t′, t) ≈ dot(transpose(t2′), transpose(t2))
+                end
+
+                t3 = VERSION < v"1.7" ? 
repartition(t, k) : + @constinferred repartition(t, $k) + @test norm(t3) ≈ norm(t) + t3′ = @constinferred repartition!(similar(t3), t′) + @test norm(t3′) ≈ norm(t′) + @test dot(t′, t) ≈ dot(t3′, t3) + end + end + if BraidingStyle(I) isa Bosonic && hasfusiontensor(I) + @timedtestset "Permutations: test via conversion" begin + W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5 + t = AMDGPU.rand(ComplexF64, W) + a = ad(t) + for k in 0:5 + for p in permutations(1:5) + p1 = ntuple(n -> p[n], k) + p2 = ntuple(n -> p[k + n], 5 - k) + t2 = permute(t, (p1, p2)) + a2 = ad(t2) + @test a2 ≈ permute(a, (p1, p2)) + @test ad(transpose(t2)) ≈ transpose(a2) + end + + t3 = repartition(t, k) + a3 = ad(t3) + @test a3 ≈ repartition(a, k) + end + end + end + @timedtestset "Full trace: test self-consistency" begin + t = AMDGPU.rand(ComplexF64, V1 ⊗ V2' ⊗ V2 ⊗ V1') + t2 = permute(t, ((1, 2), (4, 3))) + s = @constinferred tr(t2) + @test conj(s) ≈ tr(t2') + if !isdual(V1) + t2 = twist!(t2, 1) + end + if isdual(V2) + t2 = twist!(t2, 2) + end + ss = tr(t2) + @tensor s2 = t[a, b, b, a] + @tensor t3[a, b] := t[a, c, c, b] + @tensor s3 = t3[a, a] + @test ss ≈ s2 + @test ss ≈ s3 + end + @timedtestset "Partial trace: test self-consistency" begin + t = AMDGPU.rand(ComplexF64, V1 ⊗ V2' ⊗ V3 ⊗ V2 ⊗ V1' ⊗ V3') + @tensor t2[a, b] := t[c, d, b, d, c, a] + @tensor t4[a, b, c, d] := t[d, e, b, e, c, a] + @tensor t5[a, b] := t4[a, b, c, c] + @test t2 ≈ t5 + end + if BraidingStyle(I) isa Bosonic && hasfusiontensor(I) + @timedtestset "Trace: test via conversion" begin + t = AMDGPU.rand(ComplexF64, V1 ⊗ V2' ⊗ V3 ⊗ V2 ⊗ V1' ⊗ V3') + @tensor t2[a, b] := t[c, d, b, d, c, a] + @tensor t3[a, b] := ad(t)[c, d, b, d, c, a] + @test t3 ≈ ad(t2) + end + end + @timedtestset "Trace and contraction" begin + t1 = AMDGPU.rand(ComplexF64, V1 ⊗ V2 ⊗ V3) + t2 = AMDGPU.rand(ComplexF64, V2' ⊗ V4 ⊗ V1') + t3 = t1 ⊗ t2 + @tensor ta[a, b] := t1[x, y, a] * t2[y, b, x] + @tensor tb[a, b] := t3[x, y, a, y, b, x] + @test ta ≈ tb + end + if BraidingStyle(I) isa Bosonic && hasfusiontensor(I) + @timedtestset "Tensor contraction: test via conversion" begin + A1 = AMDGPU.randn(ComplexF64, V1' * V2', V3') + A2 = AMDGPU.randn(ComplexF64, V3 * V4, V5) + rhoL = AMDGPU.randn(ComplexF64, V1, V1) + rhoR = AMDGPU.randn(ComplexF64, V5, V5)' # test adjoint tensor + H = AMDGPU.randn(ComplexF64, V2 * V4, V2 * V4) + @tensor HrA12[a, s1, s2, c] := rhoL[a, a'] * conj(A1[a', t1, b]) * + A2[b, t2, c'] * rhoR[c', c] * + H[s1, s2, t1, t2] + + @tensor HrA12array[a, s1, s2, c] := ad(rhoL)[a, a'] * + conj(ad(A1)[a', t1, b]) * + ad(A2)[b, t2, c'] * + ad(rhoR)[c', c] * + ad(H)[s1, s2, t1, t2] + + @test HrA12array ≈ ad(HrA12) + end + end + @timedtestset "Multiplication and inverse: test compatibility" begin + W1 = V1 ⊗ V2 ⊗ V3 + W2 = V4 ⊗ V5 + for T in (Float64, ComplexF64) + t1 = AMDGPU.rand(T, W1, W1) + t2 = AMDGPU.rand(T, W2, W2) + t = AMDGPU.rand(T, W1, W2) + @test t1 * (t1 \ t) ≈ t + @test (t / t2) * t2 ≈ t + @test t1 \ one(t1) ≈ inv(t1) + @test one(t1) / t1 ≈ pinv(t1) + @test_throws SpaceMismatch inv(t) + @test_throws SpaceMismatch t2 \ t + @test_throws SpaceMismatch t / t1 + tp = pinv(t) * t + @test tp ≈ tp * tp + end + end + if BraidingStyle(I) isa Bosonic && hasfusiontensor(I) + @timedtestset "Multiplication and inverse: test via conversion" begin + W1 = V1 ⊗ V2 ⊗ V3 + W2 = V4 ⊗ V5 + for T in (Float32, Float64, ComplexF32, ComplexF64) + t1 = AMDGPU.rand(T, W1, W1) + t2 = AMDGPU.rand(T, W2, W2) + t = AMDGPU.rand(T, W1, W2) + d1 = dim(W1) + d2 = dim(W2) + At1 = reshape(convert(Array, t1), d1, d1) + At2 = 
reshape(convert(Array, t2), d2, d2) + At = reshape(convert(Array, t), d1, d2) + @test ad(t1 * t) ≈ ad(t1) * ad(t) + @test ad(t1' * t) ≈ ad(t1)' * ad(t) + @test ad(t2 * t') ≈ ad(t2) * ad(t)' + @test ad(t2' * t') ≈ ad(t2)' * ad(t)' + @test ad(inv(t1)) ≈ inv(ad(t1)) + @test ad(pinv(t)) ≈ pinv(ad(t)) + + if T == Float32 || T == ComplexF32 + continue + end + + @test ad(t1 \ t) ≈ ad(t1) \ ad(t) + @test ad(t1' \ t) ≈ ad(t1)' \ ad(t) + @test ad(t2 \ t') ≈ ad(t2) \ ad(t)' + @test ad(t2' \ t') ≈ ad(t2)' \ ad(t)' + + @test ad(t2 / t) ≈ ad(t2) / ad(t) + @test ad(t2' / t) ≈ ad(t2)' / ad(t) + @test ad(t1 / t') ≈ ad(t1) / ad(t)' + @test ad(t1' / t') ≈ ad(t1)' / ad(t)' + end + end + end + @timedtestset "Factorization" begin + W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5 + for T in (Float32, ComplexF64) + # Test both a normal tensor and an adjoint one. + ts = (AMDGPU.rand(T, W), AMDGPU.rand(T, W)') + for t in ts + @testset "leftorth with $alg" for alg in + (TensorKit.QR(), TensorKit.QRpos(), + TensorKit.QL(), TensorKit.QLpos(), + TensorKit.Polar(), # TensorKit.SVD(), + TensorKit.SDD()) + Q, R = @constinferred leftorth(t, ((3, 4, 2), (1, 5)); alg=alg) + QdQ = Q' * Q + @test QdQ ≈ one(QdQ) + @test Q * R ≈ permute(t, ((3, 4, 2), (1, 5))) + if alg isa Polar + # @test isposdef(R) # not defined for AMDGPU + @test domain(R) == codomain(R) == space(t, 1)' ⊗ space(t, 5)' + end + end + @testset "leftnull with $alg" for alg in + (TensorKit.QR(), # TensorKit.SVD(), + TensorKit.SDD()) + N = @constinferred leftnull(t, ((3, 4, 2), (1, 5)); alg=alg) + NdN = N' * N + @test NdN ≈ one(NdN) + @test norm(N' * permute(t, ((3, 4, 2), (1, 5)))) < + 100 * eps(norm(t)) + end + @testset "rightorth with $alg" for alg in + (TensorKit.RQ(), TensorKit.RQpos(), + TensorKit.LQ(), TensorKit.LQpos(), + TensorKit.Polar(), #TensorKit.SVD(), + TensorKit.SDD()) + # cusolver SVD requires m >= n for some reason + L, Q = @constinferred rightorth(t, ((3, 4), (2, 1, 5)); alg=alg) + QQd = Q * Q' + @test QQd ≈ one(QQd) + @test L * Q ≈ permute(t, ((3, 4), (2, 1, 5))) + if alg isa Polar + # @test isposdef(L) # not defined for AMDGPU + @test domain(L) == codomain(L) == space(t, 3) ⊗ space(t, 4) + end + end + @testset "rightnull with $alg" for alg in + (TensorKit.LQ(), # TensorKit.SVD(), + TensorKit.SDD()) + M = @constinferred rightnull(t, ((3, 4), (2, 1, 5)); alg=alg) + MMd = M * M' + @test MMd ≈ one(MMd) + @test norm(permute(t, ((3, 4), (2, 1, 5))) * M') < + 100 * eps(norm(t)) + end + @testset "tsvd with $alg" for alg in (TensorKit.SDD(),) + U, S, V = @constinferred tsvd(t, ((3, 4, 2), (1, 5)); alg=alg) + UdU = U' * U + @test UdU ≈ one(UdU) + VVd = V * V' + @test VVd ≈ one(VVd) + t2 = permute(t, ((3, 4, 2), (1, 5))) + @test U * S * V ≈ t2 + + s = LinearAlgebra.svdvals(t2) + s′ = LinearAlgebra.diag(S) + for (c, b) in s + @test b ≈ s′[c] + end + end + end + @testset "empty tensor" begin + t = AMDGPU.randn(T, V1 ⊗ V2, typeof(V1)()) + @testset "leftorth with $alg" for alg in + (TensorKit.QR(), TensorKit.QRpos(), + TensorKit.QL(), TensorKit.QLpos(), + TensorKit.Polar(), TensorKit.SVD(), + TensorKit.SDD()) + Q, R = @constinferred leftorth(t; alg=alg) + @test Q == t + @test dim(Q) == dim(R) == 0 + end + @testset "leftnull with $alg" for alg in + (TensorKit.QR(), TensorKit.SVD(), + TensorKit.SDD()) + N = @constinferred leftnull(t; alg=alg) + @test N' * N ≈ id(storagetype(t), domain(N)) + @test N * N' ≈ id(storagetype(t), codomain(N)) + end + @testset "rightorth with $alg" for alg in + (TensorKit.RQ(), TensorKit.RQpos(), + TensorKit.LQ(), TensorKit.LQpos(), + TensorKit.Polar(), 
TensorKit.SVD(), + TensorKit.SDD()) + L, Q = @constinferred rightorth(copy(t'); alg=alg) + @test Q == t' + @test dim(Q) == dim(L) == 0 + end + @testset "rightnull with $alg" for alg in + (TensorKit.LQ(), TensorKit.SVD(), + TensorKit.SDD()) + M = @constinferred rightnull(copy(t'); alg=alg) + @test M * M' ≈ id(storagetype(t), codomain(M)) + @test M' * M ≈ id(storagetype(t), domain(M)) + end + @testset "tsvd with $alg" for alg in (TensorKit.SVD(), TensorKit.SDD()) + U, S, V = @constinferred tsvd(t; alg=alg) + @test U == t + @test dim(U) == dim(S) == dim(V) + end + end + + # AMDGPU only supports symmetric/hermitian eigen + @testset "eig and isposdef" begin + t = AMDGPU.rand(T, V1 ⊗ V2 ← V1 ⊗ V2) + t += t' + D, V = eigen(t) + @test t * V ≈ V * D + + # d = LinearAlgebra.eigvals(t; sortby=nothing) + # d′ = LinearAlgebra.diag(D) + # for (c, b) in d + # @test b ≈ d′[c] + # end + + # Somehow moving these test before the previous one gives rise to errors + # with T=Float32 on x86 platforms. Is this an OpenBLAS issue? + # VdV = V' * V + # VdV = (VdV + VdV') / 2 + # @test isposdef(VdV) + # + # @test !isposdef(t2) # unlikely for non-hermitian map + # t2 = (t2 + t2') + # D, V = eigen(t2) + # VdV = V' * V + # @test VdV ≈ one(VdV) + # D̃, Ṽ = @constinferred eigh(t2) + # @test D ≈ D̃ + # @test V ≈ Ṽ + # λ = minimum(minimum(real(LinearAlgebra.diag(b))) + # for (c, b) in blocks(D)) + # @test isposdef(t2) == isposdef(λ) + # @test isposdef(t2 - λ * one(t2) + 0.1 * one(t2)) + # @test !isposdef(t2 - λ * one(t2) - 0.1 * one(t2)) + end + end + end + @timedtestset "Tensor truncation" begin + for T in (Float32, ComplexF64) + for p in (1, 2, 3, Inf) + # Test both a normal tensor and an adjoint one. + ts = (AMDGPU.randn(T, V1 ⊗ V2 ⊗ V3, V4 ⊗ V5), + AMDGPU.randn(T, V4 ⊗ V5, V1 ⊗ V2 ⊗ V3)') + for t in ts + U₀, S₀, V₀, = tsvd(t) + t = rmul!(t, 1 / norm(S₀, p)) + # Probably shouldn't allow truncerr and truncdim, as these require scalar indexing? 
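+                        # truncbelow(η) discards singular values below the threshold η,
+                        # while truncspace prescribes the truncated space directly;
+                        # constructed as below, both should select the same singular
+                        # values, hence the equality check between the two calls.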
+                        U, S, V = tsvd(t; trunc=truncbelow(1 / dim(domain(S₀))), p=p)
+                        U′, S′, V′ = tsvd(t; trunc=truncspace(space(S, 1)), p=p)
+                        @test (U, S, V) == (U′, S′, V′)
+                    end
+                end
+            end
+        end
+        if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
+            @timedtestset "Tensor functions" begin
+                W = V1 ⊗ V2
+                for T in (Float64, ComplexF64)
+                    t = AMDGPU.randn(T, W, W)
+                    s = dim(W)
+                    expt = @constinferred exp(t)
+                    @test ad(expt) ≈ exp(ad(t))
+
+                    @test (@constinferred sqrt(t))^2 ≈ t
+                    @test ad(sqrt(t)) ≈ sqrt(ad(t))
+
+                    @test exp(@constinferred log(expt)) ≈ expt
+                    @test ad(log(expt)) ≈ log(ad(expt))
+
+                    @test (@constinferred cos(t))^2 + (@constinferred sin(t))^2 ≈
+                          id(storagetype(t), W)
+                    @test (@constinferred tan(t)) ≈ sin(t) / cos(t)
+                    @test (@constinferred cot(t)) ≈ cos(t) / sin(t)
+                    @test (@constinferred cosh(t))^2 - (@constinferred sinh(t))^2 ≈
+                          id(storagetype(t), W)
+                    @test (@constinferred tanh(t)) ≈ sinh(t) / cosh(t)
+                    @test (@constinferred coth(t)) ≈ cosh(t) / sinh(t)
+
+                    t1 = sin(t)
+                    @test sin(@constinferred asin(t1)) ≈ t1
+                    t2 = cos(t)
+                    @test cos(@constinferred acos(t2)) ≈ t2
+                    t3 = sinh(t)
+                    @test sinh(@constinferred asinh(t3)) ≈ t3
+                    t4 = cosh(t)
+                    @test cosh(@constinferred acosh(t4)) ≈ t4
+                    t5 = tan(t)
+                    @test tan(@constinferred atan(t5)) ≈ t5
+                    t6 = cot(t)
+                    @test cot(@constinferred acot(t6)) ≈ t6
+                    t7 = tanh(t)
+                    @test tanh(@constinferred atanh(t7)) ≈ t7
+                    t8 = coth(t)
+                    @test coth(@constinferred acoth(t8)) ≈ t8
+                end
+            end
+        end
+        # Sylvester not defined for AMDGPU
+        # @timedtestset "Sylvester equation" begin
+        #     for T in (Float32, ComplexF64)
+        #         tA = AMDGPU.rand(T, V1 ⊗ V3, V1 ⊗ V3)
+        #         tB = AMDGPU.rand(T, V2 ⊗ V4, V2 ⊗ V4)
+        #         tA = 3 // 2 * leftorth(tA; alg=Polar())[1]
+        #         tB = 1 // 5 * leftorth(tB; alg=Polar())[1]
+        #         tC = AMDGPU.rand(T, V1 ⊗ V3, V2 ⊗ V4)
+        #         t = @constinferred sylvester(tA, tB, tC)
+        #         @test codomain(t) == V1 ⊗ V3
+        #         @test domain(t) == V2 ⊗ V4
+        #         @test norm(tA * t + t * tB + tC) <
+        #               (norm(tA) + norm(tB) + norm(tC)) * eps(real(T))^(2 / 3)
+        #         if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
+        #             matrix(x) = reshape(convert(Array, x), dim(codomain(x)), dim(domain(x)))
+        #             @test matrix(t) ≈ sylvester(matrix(tA), matrix(tB), matrix(tC))
+        #         end
+        #     end
+        # end
+        @timedtestset "Tensor product: test via norm preservation" begin
+            for T in (Float32, ComplexF64)
+                t1 = AMDGPU.rand(T, V2 ⊗ V3 ⊗ V1, V1 ⊗ V2)
+                t2 = AMDGPU.rand(T, V2 ⊗ V1 ⊗ V3, V1 ⊗ V1)
+                t = @constinferred (t1 ⊗ t2)
+                @test norm(t) ≈ norm(t1) * norm(t2)
+            end
+        end
+        if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
+            @timedtestset "Tensor product: test via conversion" begin
+                for T in (Float32, ComplexF64)
+                    t1 = AMDGPU.rand(T, V2 ⊗ V3 ⊗ V1, V1)
+                    t2 = AMDGPU.rand(T, V2 ⊗ V1 ⊗ V3, V2)
+                    t = @constinferred (t1 ⊗ t2)
+                    d1 = dim(codomain(t1))
+                    d2 = dim(codomain(t2))
+                    d3 = dim(domain(t1))
+                    d4 = dim(domain(t2))
+                    At = ad(t)
+                    @test ad(t) ≈ ad(t1) ⊗ ad(t2)
+                end
+            end
+        end
+        @timedtestset "Tensor product: test via tensor contraction" begin
+            for T in (Float32, ComplexF64)
+                t1 = AMDGPU.rand(T, V2 ⊗ V3 ⊗ V1)
+                t2 = AMDGPU.rand(T, V2 ⊗ V1 ⊗ V3)
+                t = @constinferred (t1 ⊗ t2)
+                @tensor t′[1, 2, 3, 4, 5, 6] := t1[1, 2, 3] * t2[4, 5, 6]
+                @test t ≈ t′
+            end
+        end
+    end
+end
+
+@timedtestset "Deligne tensor product: test via conversion" begin
+    Vlists1 = (Vtr,) # VSU₂)
+    Vlists2 = (Vtr,) # Vℤ₂)
+    @testset for Vlist1 in Vlists1, Vlist2 in Vlists2
+        V1, V2, V3, V4, V5 = Vlist1
+        W1, W2, W3, W4, W5 = Vlist2
+        for T in (Float32, ComplexF64)
+            t1 = AMDGPU.rand(T, V1 ⊗ V2, V3' ⊗ V4)
+            t2 = 
AMDGPU.rand(T, W2, W1 ⊗ W1') + t = @constinferred (t1 ⊠ t2) + d1 = dim(codomain(t1)) + d2 = dim(codomain(t2)) + d3 = dim(domain(t1)) + d4 = dim(domain(t2)) + @test ad(t1) ⊠ ad(t2) ≈ ad(t1 ⊠ t2) + end + end +end + diff --git a/test/cuda.jl b/test/cuda.jl new file mode 100644 index 000000000..f40402a1c --- /dev/null +++ b/test/cuda.jl @@ -0,0 +1,591 @@ +using Adapt +ad = adapt(Array) + +const CUDAExt = Base.get_extension(TensorKit, :TensorKitCUDAExt) +@assert !isnothing(CUDAExt) +# const CuTensorMap{T,S,N1,N2,I,A} = CUDAExt.CuTensorMap{T,S,N1,N2,I,A} +const CuTensorMap = getglobal(CUDAExt, :CuTensorMap) + +for V in (Vtr, Vℤ₂, Vfℤ₂, Vℤ₃, VU₁, VfU₁, VCU₁, VSU₂, VfSU₂)#, VSU₃) + V1, V2, V3, V4, V5 = V + @assert V3 * V4 * V2 ≿ V1' * V5' # necessary for leftorth tests + @assert V3 * V4 ≾ V1' * V2' * V5' # necessary for rightorth tests +end + +spacelist = try + if ENV["CI"] == "true" + println("Detected running on CI") + if Sys.iswindows() + (Vtr, Vℤ₂, Vfℤ₂, Vℤ₃, VU₁, VfU₁, VCU₁, VSU₂) + elseif Sys.isapple() + (Vtr, Vℤ₂, Vfℤ₂, Vℤ₃, VfU₁, VfSU₂)#, VSU₃) + else + (Vtr, Vℤ₂, Vfℤ₂, VU₁, VCU₁, VSU₂, VfSU₂)#, VSU₃) + end + else + (Vtr, VU₁, VSU₂, Vfℤ₂) + end +catch + (Vtr, Vℤ₂, Vfℤ₂, Vℤ₃, VU₁, VfU₁, VCU₁, VSU₂, VfSU₂)#, VSU₃) +end + +for V in spacelist + I = sectortype(first(V)) + Istr = TensorKit.type_repr(I) + println("---------------------------------------") + println("CUDA Tensors with symmetry: $Istr") + println("---------------------------------------") + @timedtestset "Tensors with symmetry: $Istr" verbose = true begin + V1, V2, V3, V4, V5 = V + @timedtestset "Basic tensor properties" begin + W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5 + for T in (Int, Float32, Float64, ComplexF32, ComplexF64) + t = @constinferred CUDA.zeros(T, W) + # @test @constinferred(hash(t)) == hash(deepcopy(t)) # hash is not defined for CuArray? + @test scalartype(t) == T + @test norm(t) == 0 + @test codomain(t) == W + @test space(t) == (W ← one(W)) + @test domain(t) == one(W) + @test typeof(t) == + @constinferred tensormaptype(spacetype(t), 5, 0, storagetype(t)) + end + end + @timedtestset "Tensor Dict conversion" begin + W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5 + for T in (Int, Float32, ComplexF64) + t = @constinferred CUDA.rand(T, W) + d = convert(Dict, t) + @test t == convert(CuTensorMap, d) + end + end + if hasfusiontensor(I) + @timedtestset "Tensor Array conversion" begin + W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5 + + # cuTENSOR does not support Int + Ts = sectortype(W) === Trivial ? 
(Int, Float32, ComplexF64) : + (Float32, ComplexF64) + for T in Ts + t = @constinferred CUDA.rand(T, W) + a = @constinferred convert(CuArray, t) + @test t ≈ @constinferred TensorMap(a, W) + # also test if input is matrix + a2 = reshape(a, prod(dim, codomain(t)), prod(dim, domain(t))) + @test t ≈ @constinferred TensorMap(a2, codomain(t), domain(t)) + end + end + end + @timedtestset "Basic linear algebra" begin + W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5 + for T in (Float32, ComplexF64) + t = @constinferred CUDA.rand(T, W) + @test scalartype(t) == T + @test space(t) == W + @test space(t') == W' + @test dim(t) == dim(space(t)) + @test codomain(t) == codomain(W) + @test domain(t) == domain(W) + @test isa(@constinferred(norm(t)), real(T)) + @test norm(t)^2 ≈ dot(t, t) + α = rand(T) + @test norm(α * t) ≈ abs(α) * norm(t) + @test norm(t + t, 2) ≈ 2 * norm(t, 2) + @test norm(t + t, 1) ≈ 2 * norm(t, 1) + @test norm(t + t, Inf) ≈ 2 * norm(t, Inf) + p = 3 * rand(Float64) + @test norm(t + t, p) ≈ 2 * norm(t, p) + @test norm(t) ≈ norm(t') + + t2 = @constinferred CUDA.rand!(similar(t)) + β = rand(T) + @test @constinferred(dot(β * t2, α * t)) ≈ conj(β) * α * conj(dot(t, t2)) + @test dot(t2, t) ≈ conj(dot(t, t2)) + @test dot(t2, t) ≈ conj(dot(t2', t')) + @test dot(t2, t) ≈ dot(t', t2') + + A = storagetype(t) + + i1 = @constinferred(isomorphism(CuMatrix{T}, V1 ⊗ V2, V2 ⊗ V1)) + i2 = @constinferred(isomorphism(CuMatrix{T}, V2 ⊗ V1, V1 ⊗ V2)) + @test i1 * i2 == @constinferred(id(CuMatrix{T}, V1 ⊗ V2)) + @test i2 * i1 == @constinferred(id(CuMatrix{T}, V2 ⊗ V1)) + + w = @constinferred(isometry(CuMatrix{T}, V1 ⊗ (oneunit(V1) ⊕ oneunit(V1)), + V1)) + @test dim(w) == 2 * dim(V1 ← V1) + @test w' * w == id(CuMatrix{T}, V1) + @test w * w' == (w * w')^2 + end + end + if hasfusiontensor(I) + @timedtestset "Basic linear algebra: test via conversion" begin + W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5 + for T in (Float32, ComplexF64) + t = CUDA.rand(T, W) + t2 = @constinferred CUDA.rand!(similar(t)) + @test norm(t, 2) ≈ norm(ad(t), 2) + @test dot(t2, t) ≈ dot(ad(t2), ad(t)) + α = rand(T) + @test ad(α * t) ≈ α * ad(t) + @test ad(t + t) ≈ 2 * ad(t) + end + end + @timedtestset "Real and imaginary parts" begin + W = V1 ⊗ V2 + for T in (Float64, ComplexF64, ComplexF32) + t = @constinferred CUDA.randn(T, W, W) + @test real(ad(t)) == ad(@constinferred real(t)) + @test imag(ad(t)) == ad(@constinferred imag(t)) + end + end + end + @timedtestset "Tensor conversion" begin + W = V1 ⊗ V2 + t = @constinferred CUDA.randn(W ← W) + @test typeof(convert(CuTensorMap, t')) == typeof(t) + tc = complex(t) + @test convert(typeof(tc), t) == tc + @test typeof(convert(typeof(tc), t)) == typeof(tc) + @test typeof(convert(typeof(tc), t')) == typeof(tc) + @test Base.promote_typeof(t, tc) == typeof(tc) + @test Base.promote_typeof(tc, t) == typeof(tc + t) + end + #=@timedtestset "diag/diagm" begin + W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5 + t = CUDA.randn(ComplexF64, W) + d = LinearAlgebra.diag(t) + # TODO find a way to use CUDA here + D = LinearAlgebra.diagm(codomain(t), domain(t), d) + @test LinearAlgebra.isdiag(D) + @test LinearAlgebra.diag(D) == d + end=# + @timedtestset "Permutations: test via inner product invariance" begin + W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5 + t = CUDA.rand(ComplexF64, W) + t′ = CUDA.randn!(similar(t)) + for k in 0:5 + for p in permutations(1:5) + p1 = ntuple(n -> p[n], k) + p2 = ntuple(n -> p[k + n], 5 - k) + t2 = @constinferred permute(t, (p1, p2)) + t2 = permute(t, (p1, p2)) + @test norm(t2) ≈ norm(t) + t2′ = permute(t′, (p1, p2)) + @test dot(t2′, t2) ≈ dot(t′, t) ≈ 
dot(transpose(t2′), transpose(t2)) + end + + t3 = @constinferred repartition(t, $k) + t3 = repartition(t, k) + @test norm(t3) ≈ norm(t) + t3′ = @constinferred repartition!(similar(t3), t′) + @test norm(t3′) ≈ norm(t′) + @test dot(t′, t) ≈ dot(t3′, t3) + end + end + if BraidingStyle(I) isa Bosonic && hasfusiontensor(I) + @timedtestset "Permutations: test via conversion" begin + W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5 + t = CUDA.rand(ComplexF64, W) + a = ad(t) + for k in 0:5 + for p in permutations(1:5) + p1 = ntuple(n -> p[n], k) + p2 = ntuple(n -> p[k + n], 5 - k) + t2 = permute(t, (p1, p2)) + a2 = ad(t2) + @test a2 ≈ permute(a, (p1, p2)) + @test ad(transpose(t2)) ≈ transpose(a2) + end + + t3 = repartition(t, k) + a3 = ad(t3) + @test a3 ≈ repartition(a, k) + end + end + end + @timedtestset "Full trace: test self-consistency" begin + t = CUDA.rand(ComplexF64, V1 ⊗ V2' ⊗ V2 ⊗ V1') + t2 = permute(t, ((1, 2), (4, 3))) + s = @constinferred tr(t2) + @test conj(s) ≈ tr(t2') + if !isdual(V1) + t2 = twist!(t2, 1) + end + if isdual(V2) + t2 = twist!(t2, 2) + end + ss = tr(t2) + @tensor s2 = t[a, b, b, a] + @tensor t3[a, b] := t[a, c, c, b] + @tensor s3 = t3[a, a] + @test ss ≈ s2 + @test ss ≈ s3 + end + @timedtestset "Partial trace: test self-consistency" begin + t = CUDA.rand(ComplexF64, V1 ⊗ V2' ⊗ V3 ⊗ V2 ⊗ V1' ⊗ V3') + @tensor t2[a, b] := t[c, d, b, d, c, a] + @tensor t4[a, b, c, d] := t[d, e, b, e, c, a] + @tensor t5[a, b] := t4[a, b, c, c] + @test t2 ≈ t5 + end + if BraidingStyle(I) isa Bosonic && hasfusiontensor(I) + @timedtestset "Trace: test via conversion" begin + t = CUDA.rand(ComplexF64, V1 ⊗ V2' ⊗ V3 ⊗ V2 ⊗ V1' ⊗ V3') + @tensor t2[a, b] := t[c, d, b, d, c, a] + @tensor t3[a, b] := ad(t)[c, d, b, d, c, a] + @test t3 ≈ ad(t2) + end + end + @timedtestset "Trace and contraction" begin + t1 = CUDA.rand(ComplexF64, V1 ⊗ V2 ⊗ V3) + t2 = CUDA.rand(ComplexF64, V2' ⊗ V4 ⊗ V1') + t3 = t1 ⊗ t2 + @tensor ta[a, b] := t1[x, y, a] * t2[y, b, x] + @tensor tb[a, b] := t3[x, y, a, y, b, x] + @test ta ≈ tb + end + if BraidingStyle(I) isa Bosonic && hasfusiontensor(I) + @timedtestset "Tensor contraction: test via conversion" begin + A1 = CUDA.randn(ComplexF64, V1' * V2', V3') + A2 = CUDA.randn(ComplexF64, V3 * V4, V5) + rhoL = CUDA.randn(ComplexF64, V1, V1) + rhoR = CUDA.randn(ComplexF64, V5, V5)' # test adjoint tensor + H = CUDA.randn(ComplexF64, V2 * V4, V2 * V4) + @tensor HrA12[a, s1, s2, c] := rhoL[a, a'] * conj(A1[a', t1, b]) * + A2[b, t2, c'] * rhoR[c', c] * + H[s1, s2, t1, t2] + + @tensor HrA12array[a, s1, s2, c] := ad(rhoL)[a, a'] * + conj(ad(A1)[a', t1, b]) * + ad(A2)[b, t2, c'] * + ad(rhoR)[c', c] * + ad(H)[s1, s2, t1, t2] + + @test HrA12array ≈ ad(HrA12) + end + end + @timedtestset "Multiplication and inverse: test compatibility" begin + W1 = V1 ⊗ V2 ⊗ V3 + W2 = V4 ⊗ V5 + for T in (Float64, ComplexF64) + t1 = CUDA.rand(T, W1, W1) + t2 = CUDA.rand(T, W2, W2) + t = CUDA.rand(T, W1, W2) + @test t1 * (t1 \ t) ≈ t + @test (t / t2) * t2 ≈ t + @test t1 \ one(t1) ≈ inv(t1) + # @test one(t1) / t1 ≈ pinv(t1) # pinv not available in CUDA + @test_throws SpaceMismatch inv(t) + @test_throws SpaceMismatch t2 \ t + @test_throws SpaceMismatch t / t1 + # pinv not available in CUDA + # tp = pinv(t) * t + # @test tp ≈ tp * tp + end + end + if BraidingStyle(I) isa Bosonic && hasfusiontensor(I) + @timedtestset "Multiplication and inverse: test via conversion" begin + W1 = V1 ⊗ V2 ⊗ V3 + W2 = V4 ⊗ V5 + for T in (Float32, Float64, ComplexF32, ComplexF64) + t1 = CUDA.rand(T, W1, W1) + t2 = CUDA.rand(T, W2, W2) + t = CUDA.rand(T, W1, 
W2) + d1 = dim(W1) + d2 = dim(W2) + At1 = reshape(convert(Array, t1), d1, d1) + At2 = reshape(convert(Array, t2), d2, d2) + At = reshape(convert(Array, t), d1, d2) + @test ad(t1 * t) ≈ ad(t1) * ad(t) + @test ad(t1' * t) ≈ ad(t1)' * ad(t) + @test ad(t2 * t') ≈ ad(t2) * ad(t)' + @test ad(t2' * t') ≈ ad(t2)' * ad(t)' + @test ad(inv(t1)) ≈ inv(ad(t1)) + # pinv not available in CUDA + #@test ad(pinv(t)) ≈ pinv(ad(t)) + + if T == Float32 || T == ComplexF32 + continue + end + + @test ad(t1 \ t) ≈ ad(t1) \ ad(t) + @test ad(t1' \ t) ≈ ad(t1)' \ ad(t) + @test ad(t2 \ t') ≈ ad(t2) \ ad(t)' + @test ad(t2' \ t') ≈ ad(t2)' \ ad(t)' + + @test ad(t2 / t) ≈ ad(t2) / ad(t) + @test ad(t2' / t) ≈ ad(t2)' / ad(t) + @test ad(t1 / t') ≈ ad(t1) / ad(t)' + @test ad(t1' / t') ≈ ad(t1)' / ad(t)' + end + end + end + @timedtestset "Factorization" begin + W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5 + for T in (Float32, ComplexF64) + # Test both a normal tensor and an adjoint one. + ts = (CUDA.rand(T, W), CUDA.rand(T, W)') + for t in ts + @testset "leftorth with $alg" for alg in + (TensorKit.QR(), TensorKit.QRpos(), + TensorKit.QL(), TensorKit.QLpos(), + TensorKit.Polar(), TensorKit.SVD(), + ) + Q, R = @constinferred leftorth(t, ((3, 4, 2), (1, 5)); alg=alg) + @test isisometry(Q) + @test Q * R ≈ permute(t, ((3, 4, 2), (1, 5))) + end + @testset "leftnull with $alg" for alg in + (TensorKit.QR(), TensorKit.SVD(), + ) + N = @constinferred leftnull(t, ((3, 4, 2), (1, 5)); alg=alg) + @test isisometry(N) + @test norm(N' * permute(t, ((3, 4, 2), (1, 5)))) < + 100 * eps(norm(t)) + end + @testset "rightorth with $alg" for alg in + (TensorKit.RQ(), TensorKit.RQpos(), + TensorKit.LQ(), TensorKit.LQpos(), + TensorKit.Polar(), TensorKit.SVD(), + ) + # cusolver SVD requires m >= n for some reason + L, Q = @constinferred rightorth(t, ((3, 4), (2, 1, 5)); alg=alg) + @test isisometry(Q; side=:right) + @test L * Q ≈ permute(t, ((3, 4), (2, 1, 5))) + end + @testset "rightnull with $alg" for alg in + (TensorKit.LQ(), TensorKit.SVD(), + ) + M = @constinferred rightnull(t, ((3, 4), (2, 1, 5)); alg=alg) + @test isisometry(M; side=:right) + @test norm(permute(t, ((3, 4), (2, 1, 5))) * M') < + 100 * eps(norm(t)) + end + @testset "tsvd with $alg" for alg in (TensorKit.SVD(),) + U, S, V = @constinferred tsvd(t, ((3, 4, 2), (1, 5)); alg=alg) + @test isisometry(U) + @test isisometry(V; side=:right) + t2 = permute(t, ((3, 4, 2), (1, 5))) + US = U * S + USV = US * V + @test U * S * V ≈ t2 + + s = LinearAlgebra.svdvals(t2) + s′ = LinearAlgebra.diag(S) + for (c, b) in s + @test b ≈ s′[c] + end + end + end + @testset "empty tensor" begin + t = CUDA.randn(T, V1 ⊗ V2, typeof(V1)()) + @testset "leftorth with $alg" for alg in + (TensorKit.QR(), TensorKit.QRpos(), + TensorKit.QL(), TensorKit.QLpos(), + TensorKit.Polar(), TensorKit.SVD(), + ) + Q, R = @constinferred leftorth(t; alg=alg) + @test Q == t + @test dim(Q) == dim(R) == 0 + end + @testset "leftnull with $alg" for alg in + (TensorKit.QR(), TensorKit.SVD(), + ) + N = @constinferred leftnull(t; alg=alg) + @test N' * N ≈ id(storagetype(t), domain(N)) + @test N * N' ≈ id(storagetype(t), codomain(N)) + end + @testset "rightorth with $alg" for alg in + (TensorKit.RQ(), TensorKit.RQpos(), + TensorKit.LQ(), TensorKit.LQpos(), + TensorKit.Polar(), TensorKit.SVD(), + ) + L, Q = @constinferred rightorth(copy(t'); alg=alg) + @test Q == t' + @test dim(Q) == dim(L) == 0 + end + @testset "rightnull with $alg" for alg in + (TensorKit.LQ(), TensorKit.SVD(), + ) + M = @constinferred rightnull(copy(t'); alg=alg) + @test M * M' ≈ 
+        @timedtestset "Factorization" begin
+            W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5
+            for T in (Float32, ComplexF64)
+                # Test both a normal tensor and an adjoint one.
+                ts = (CUDA.rand(T, W), CUDA.rand(T, W)')
+                for t in ts
+                    @testset "leftorth with $alg" for alg in
+                                                      (TensorKit.QR(), TensorKit.QRpos(),
+                                                       TensorKit.QL(), TensorKit.QLpos(),
+                                                       TensorKit.Polar(), TensorKit.SVD())
+                        Q, R = @constinferred leftorth(t, ((3, 4, 2), (1, 5)); alg=alg)
+                        @test isisometry(Q)
+                        @test Q * R ≈ permute(t, ((3, 4, 2), (1, 5)))
+                    end
+                    @testset "leftnull with $alg" for alg in
+                                                      (TensorKit.QR(), TensorKit.SVD())
+                        N = @constinferred leftnull(t, ((3, 4, 2), (1, 5)); alg=alg)
+                        @test isisometry(N)
+                        @test norm(N' * permute(t, ((3, 4, 2), (1, 5)))) <
+                              100 * eps(norm(t))
+                    end
+                    @testset "rightorth with $alg" for alg in
+                                                       (TensorKit.RQ(), TensorKit.RQpos(),
+                                                        TensorKit.LQ(), TensorKit.LQpos(),
+                                                        TensorKit.Polar(), TensorKit.SVD())
+                        # cusolver SVD requires m >= n for some reason
+                        L, Q = @constinferred rightorth(t, ((3, 4), (2, 1, 5)); alg=alg)
+                        @test isisometry(Q; side=:right)
+                        @test L * Q ≈ permute(t, ((3, 4), (2, 1, 5)))
+                    end
+                    @testset "rightnull with $alg" for alg in
+                                                       (TensorKit.LQ(), TensorKit.SVD())
+                        M = @constinferred rightnull(t, ((3, 4), (2, 1, 5)); alg=alg)
+                        @test isisometry(M; side=:right)
+                        @test norm(permute(t, ((3, 4), (2, 1, 5))) * M') <
+                              100 * eps(norm(t))
+                    end
+                    @testset "tsvd with $alg" for alg in (TensorKit.SVD(),)
+                        U, S, V = @constinferred tsvd(t, ((3, 4, 2), (1, 5)); alg=alg)
+                        @test isisometry(U)
+                        @test isisometry(V; side=:right)
+                        t2 = permute(t, ((3, 4, 2), (1, 5)))
+                        @test U * S * V ≈ t2
+
+                        s = LinearAlgebra.svdvals(t2)
+                        s′ = LinearAlgebra.diag(S)
+                        for (c, b) in s
+                            @test b ≈ s′[c]
+                        end
+                    end
+                end
+                @testset "empty tensor" begin
+                    t = CUDA.randn(T, V1 ⊗ V2, typeof(V1)())
+                    @testset "leftorth with $alg" for alg in
+                                                      (TensorKit.QR(), TensorKit.QRpos(),
+                                                       TensorKit.QL(), TensorKit.QLpos(),
+                                                       TensorKit.Polar(), TensorKit.SVD())
+                        Q, R = @constinferred leftorth(t; alg=alg)
+                        @test Q == t
+                        @test dim(Q) == dim(R) == 0
+                    end
+                    @testset "leftnull with $alg" for alg in
+                                                      (TensorKit.QR(), TensorKit.SVD())
+                        N = @constinferred leftnull(t; alg=alg)
+                        @test N' * N ≈ id(storagetype(t), domain(N))
+                        @test N * N' ≈ id(storagetype(t), codomain(N))
+                    end
+                    @testset "rightorth with $alg" for alg in
+                                                       (TensorKit.RQ(), TensorKit.RQpos(),
+                                                        TensorKit.LQ(), TensorKit.LQpos(),
+                                                        TensorKit.Polar(), TensorKit.SVD())
+                        L, Q = @constinferred rightorth(copy(t'); alg=alg)
+                        @test Q == t'
+                        @test dim(Q) == dim(L) == 0
+                    end
+                    @testset "rightnull with $alg" for alg in
+                                                       (TensorKit.LQ(), TensorKit.SVD())
+                        M = @constinferred rightnull(copy(t'); alg=alg)
+                        @test M * M' ≈ id(storagetype(t), codomain(M))
+                        @test M' * M ≈ id(storagetype(t), domain(M))
+                    end
+                    @testset "tsvd with $alg" for alg in (TensorKit.SVD(),)
+                        U, S, V = @constinferred tsvd(t; alg=alg)
+                        @test U == t
+                        @test dim(U) == dim(S) == dim(V)
+                    end
+                end
+
+                # CUDA only supports symmetric/hermitian eigen
+                @testset "eig and isposdef" begin
+                    t = CUDA.rand(T, V1 ⊗ V2 ← V1 ⊗ V2)
+                    t += t'
+                    D, V = eigen(t)
+                    @test t * V ≈ V * D
+
+                    d = LinearAlgebra.eigvals(t; sortby=nothing)
+                    d′ = LinearAlgebra.diag(D)
+                    for (c, b) in d
+                        @test sort(real.(b)) ≈ sort(real.(d′[c]))
+                    end
+
+                    # Somehow moving these tests before the previous one gives rise to
+                    # errors with T=Float32 on x86 platforms. Is this an OpenBLAS issue?
+                    # VdV = V' * V
+                    # VdV = (VdV + VdV') / 2
+                    # @test isposdef(VdV)
+                    #
+                    # @test !isposdef(t2) # unlikely for non-hermitian map
+                    # t2 = (t2 + t2')
+                    # D, V = eigen(t2)
+                    # VdV = V' * V
+                    # @test VdV ≈ one(VdV)
+                    # D̃, Ṽ = @constinferred eigh(t2)
+                    # @test D ≈ D̃
+                    # @test V ≈ Ṽ
+                    # λ = minimum(minimum(real(LinearAlgebra.diag(b)))
+                    #             for (c, b) in blocks(D))
+                    # @test isposdef(t2) == isposdef(λ)
+                    # @test isposdef(t2 - λ * one(t2) + 0.1 * one(t2))
+                    # @test !isposdef(t2 - λ * one(t2) - 0.1 * one(t2))
+                end
+            end
+        end
+        @timedtestset "Tensor truncation" begin
+            for T in (Float32, ComplexF64)
+                for p in (1, 2, 3, Inf)
+                    # Test both a normal tensor and an adjoint one.
+                    ts = (CUDA.randn(T, V1 ⊗ V2 ⊗ V3, V4 ⊗ V5),
+                          CUDA.randn(T, V4 ⊗ V5, V1 ⊗ V2 ⊗ V3)')
+                    for t in ts
+                        U₀, S₀, V₀, = tsvd(t)
+                        t = rmul!(t, 1 / norm(S₀, p))
+                        # Probably shouldn't allow truncerr and truncdim, as these
+                        # require scalar indexing?
+                        U, S, V = tsvd(t; trunc=truncbelow(1 / dim(domain(S₀))), p=p)
+                        U′, S′, V′ = tsvd(t; trunc=truncspace(space(S, 1)), p=p)
+                        @test (U, S, V) == (U′, S′, V′)
+                    end
+                end
+            end
+        end
+        if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
+            @timedtestset "Tensor functions" begin
+                W = V1 ⊗ V2
+                for T in (Float64, ComplexF64)
+                    t = CUDA.randn(T, W, W)
+                    expt = @constinferred exp(t)
+                    @test ad(expt) ≈ exp(ad(t))
+
+                    @test (@constinferred sqrt(t))^2 ≈ t
+                    @test ad(sqrt(t)) ≈ sqrt(ad(t))
+
+                    @test exp(@constinferred log(expt)) ≈ expt
+                    @test ad(log(expt)) ≈ log(ad(expt))
+
+                    @test (@constinferred cos(t))^2 + (@constinferred sin(t))^2 ≈
+                          id(storagetype(t), W)
+                    @test (@constinferred tan(t)) ≈ sin(t) / cos(t)
+                    @test (@constinferred cot(t)) ≈ cos(t) / sin(t)
+                    @test (@constinferred cosh(t))^2 - (@constinferred sinh(t))^2 ≈
+                          id(storagetype(t), W)
+                    @test (@constinferred tanh(t)) ≈ sinh(t) / cosh(t)
+                    @test (@constinferred coth(t)) ≈ cosh(t) / sinh(t)
+
+                    t1 = sin(t)
+                    @test sin(@constinferred asin(t1)) ≈ t1
+                    t2 = cos(t)
+                    @test cos(@constinferred acos(t2)) ≈ t2
+                    t3 = sinh(t)
+                    @test sinh(@constinferred asinh(t3)) ≈ t3
+                    t4 = cosh(t)
+                    @test cosh(@constinferred acosh(t4)) ≈ t4
+                    t5 = tan(t)
+                    @test tan(@constinferred atan(t5)) ≈ t5
+                    t6 = cot(t)
+                    @test cot(@constinferred acot(t6)) ≈ t6
+                    t7 = tanh(t)
+                    @test tanh(@constinferred atanh(t7)) ≈ t7
+                    t8 = coth(t)
+                    @test coth(@constinferred acoth(t8)) ≈ t8
+                end
+            end
+        end
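On the truncation strategies used above: `truncbelow(ϵ)` keeps only singular values at or above the cutoff, while `truncspace(V)` prescribes the retained space directly, and the testset checks that the two routes agree. A CPU sketch with hypothetical spaces:

```julia
using TensorKit

t = randn(ComplexF64, ℂ^4 ⊗ ℂ^4, ℂ^4)
U, S, V = tsvd(t; trunc=truncbelow(1e-1))            # drop singular values below 1e-1
U′, S′, V′ = tsvd(t; trunc=truncspace(space(S, 1)))  # request the same retained space
@assert (U, S, V) == (U′, S′, V′)
```

`truncerr` and `truncdim` are deliberately left untested here, matching the comment in the testset: those strategies currently rely on scalar indexing, which GPU arrays disallow by default.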
+        # Sylvester not defined for CUDA
+        # @timedtestset "Sylvester equation" begin
+        #     for T in (Float32, ComplexF64)
+        #         tA = CUDA.rand(T, V1 ⊗ V3, V1 ⊗ V3)
+        #         tB = CUDA.rand(T, V2 ⊗ V4, V2 ⊗ V4)
+        #         tA = 3 // 2 * leftorth(tA; alg=Polar())[1]
+        #         tB = 1 // 5 * leftorth(tB; alg=Polar())[1]
+        #         tC = CUDA.rand(T, V1 ⊗ V3, V2 ⊗ V4)
+        #         t = @constinferred sylvester(tA, tB, tC)
+        #         @test codomain(t) == V1 ⊗ V3
+        #         @test domain(t) == V2 ⊗ V4
+        #         @test norm(tA * t + t * tB + tC) <
+        #               (norm(tA) + norm(tB) + norm(tC)) * eps(real(T))^(2 / 3)
+        #         if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
+        #             matrix(x) = reshape(convert(Array, x), dim(codomain(x)), dim(domain(x)))
+        #             @test matrix(t) ≈ sylvester(matrix(tA), matrix(tB), matrix(tC))
+        #         end
+        #     end
+        # end
+        #
+        # TODO
+        @timedtestset "Tensor product: test via norm preservation" begin
+            for T in (Float32, ComplexF64)
+                t1 = CUDA.rand(T, V2 ⊗ V3 ⊗ V1, V1 ⊗ V2)
+                t2 = CUDA.rand(T, V2 ⊗ V1 ⊗ V3, V1 ⊗ V1)
+                t = @constinferred (t1 ⊗ t2)
+                @test norm(t) ≈ norm(t1) * norm(t2)
+            end
+        end
+        if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
+            @timedtestset "Tensor product: test via conversion" begin
+                for T in (Float32, ComplexF64)
+                    t1 = CUDA.rand(T, V2 ⊗ V3 ⊗ V1, V1)
+                    t2 = CUDA.rand(T, V2 ⊗ V1 ⊗ V3, V2)
+                    t = @constinferred (t1 ⊗ t2)
+                    @test ad(t) ≈ ad(t1) ⊗ ad(t2)
+                end
+            end
+        end
+        @timedtestset "Tensor product: test via tensor contraction" begin
+            for T in (Float32, ComplexF64)
+                t1 = CUDA.rand(T, V2 ⊗ V3 ⊗ V1)
+                t2 = CUDA.rand(T, V2 ⊗ V1 ⊗ V3)
+                t = @constinferred (t1 ⊗ t2)
+                @tensor t′[1, 2, 3, 4, 5, 6] := t1[1, 2, 3] * t2[4, 5, 6]
+                @test t ≈ t′
+            end
+        end
+    end
+end
+
+@timedtestset "Deligne tensor product: test via conversion" begin
+    Vlists1 = (Vtr,) # VSU₂)
+    Vlists2 = (Vtr,) # Vℤ₂)
+    @testset for Vlist1 in Vlists1, Vlist2 in Vlists2
+        V1, V2, V3, V4, V5 = Vlist1
+        W1, W2, W3, W4, W5 = Vlist2
+        for T in (Float32, ComplexF64)
+            t1 = CUDA.rand(T, V1 ⊗ V2, V3' ⊗ V4)
+            t2 = CUDA.rand(T, W2, W1 ⊗ W1')
+            t = @constinferred (t1 ⊠ t2)
+            @test ad(t1) ⊠ ad(t2) ≈ ad(t)
+        end
+    end
+end
+
diff --git a/test/runtests.jl b/test/runtests.jl
index 9e44b8a3a..70124f857 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -112,24 +112,39 @@ VSU₂U₁ = (Vect[SU2Irrep ⊠ U1Irrep]((0, 0) => 1, (1 // 2, -1) => 1),
 #           ℂ[SU3Irrep]((1, 0, 0) => 1, (2, 0, 0) => 1),
 #           ℂ[SU3Irrep]((0, 0, 0) => 1, (1, 0, 0) => 1, (1, 1, 0) => 1)')
 
+is_buildkite = get(ENV, "BUILDKITE", "false") == "true"
+
 Ti = time()
-include("fusiontrees.jl")
-include("spaces.jl")
-include("tensors.jl")
-include("diagonal.jl")
-include("planar.jl")
-# TODO: remove once we know AD is slow on macOS CI
-if !(Sys.isapple() && get(ENV, "CI", "false") == "true") && isempty(VERSION.prerelease)
-    include("ad.jl")
+if !is_buildkite
+    include("fusiontrees.jl")
+    include("spaces.jl")
+    include("tensors.jl")
+    include("diagonal.jl")
+    include("planar.jl")
+    # TODO: remove once we know AD is slow on macOS CI
+    if !(Sys.isapple() && get(ENV, "CI", "false") == "true") && isempty(VERSION.prerelease)
+        include("ad.jl")
+    end
+    include("bugfixes.jl")
+    Tf = time()
+    @testset "Aqua" verbose = true begin
+        using Aqua
+        Aqua.test_all(TensorKit)
+    end
+else
+    using CUDA, cuTENSOR
+    if CUDA.functional()
+        include("cuda.jl")
+    end
+
+    using AMDGPU
+    if AMDGPU.functional()
+        include("amdgpu.jl")
+    end
+    Tf = time()
 end
-include("bugfixes.jl")
-Tf = time()
+
 printstyled("Finished all tests in ",
             string(round((Tf - Ti) / 60; sigdigits=3)),
             " minutes."; bold=true, color=Base.info_color())
 println()
-
-@testset "Aqua" verbose = true begin
-    using Aqua
-    Aqua.test_all(TensorKit)
-end
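With this split, the Buildkite agents run only the GPU suites while regular CI keeps the CPU path. A sketch of how one might reproduce the GPU-only path locally, assuming a functional CUDA or ROCm setup (`BUILDKITE` is the same variable the script reads; the exact invocation is illustrative):

```julia
# Run from the TensorKit.jl repository root; the environment variable is
# inherited by the subprocess in which runtests.jl executes.
ENV["BUILDKITE"] = "true"

using Pkg
Pkg.activate(".")
Pkg.test()   # runtests.jl now skips the CPU suites and includes cuda.jl / amdgpu.jl
```

Note that the `CUDA.functional()` / `AMDGPU.functional()` guards mean the Buildkite path is a no-op on a machine without the corresponding GPU.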