diff --git a/Project.toml b/Project.toml index 8ca217db8..dd10c3c8d 100644 --- a/Project.toml +++ b/Project.toml @@ -19,6 +19,7 @@ VectorInterface = "409d34a3-91d5-4945-b6ec-7529ddf182d8" [weakdeps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000" @@ -26,12 +27,14 @@ cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1" [extensions] TensorKitAdaptExt = "Adapt" +TensorKitAMDGPUExt = "AMDGPU" TensorKitCUDAExt = ["CUDA", "cuTENSOR"] TensorKitChainRulesCoreExt = "ChainRulesCore" TensorKitFiniteDifferencesExt = "FiniteDifferences" [compat] Adapt = "4" +AMDGPU = "2" Aqua = "0.6, 0.7, 0.8" ArgParse = "1.2.0" CUDA = "5.9" @@ -61,6 +64,7 @@ julia = "1.10" [extras] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" @@ -78,4 +82,4 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1" [targets] -test = ["ArgParse", "Adapt", "Aqua", "Combinatorics", "CUDA", "cuTENSOR", "GPUArrays", "LinearAlgebra", "SafeTestsets", "TensorOperations", "Test", "TestExtras", "ChainRulesCore", "ChainRulesTestUtils", "FiniteDifferences", "Zygote"] +test = ["ArgParse", "Adapt", "Aqua", "Combinatorics", "AMDGPU", "CUDA", "cuTENSOR", "GPUArrays", "LinearAlgebra", "SafeTestsets", "TensorOperations", "Test", "TestExtras", "ChainRulesCore", "ChainRulesTestUtils", "FiniteDifferences", "Zygote"] diff --git a/ext/TensorKitAMDGPUExt/TensorKitAMDGPUExt.jl b/ext/TensorKitAMDGPUExt/TensorKitAMDGPUExt.jl new file mode 100644 index 000000000..9fa2dc2f8 --- /dev/null +++ b/ext/TensorKitAMDGPUExt/TensorKitAMDGPUExt.jl @@ -0,0 +1,20 @@ +module TensorKitAMDGPUExt + +using AMDGPU, AMDGPU.rocBLAS, AMDGPU.rocSOLVER, LinearAlgebra +using AMDGPU: @allowscalar +import AMDGPU: rand as rocrand, rand! as rocrand!, randn as rocrandn, randn! as rocrandn! + +using TensorKit +using TensorKit.Factorizations +using TensorKit.Strided +using TensorKit.Factorizations: AbstractAlgorithm +using TensorKit: SectorDict, tensormaptype, scalar, similarstoragetype, AdjointTensorMap, scalartype, project_symmetric_and_check +import TensorKit: randisometry, rand, randn + +using TensorKit: MatrixAlgebraKit + +using Random + +include("roctensormap.jl") + +end diff --git a/ext/TensorKitAMDGPUExt/roctensormap.jl b/ext/TensorKitAMDGPUExt/roctensormap.jl new file mode 100644 index 000000000..a9a937912 --- /dev/null +++ b/ext/TensorKitAMDGPUExt/roctensormap.jl @@ -0,0 +1,166 @@ +const ROCTensorMap{T, S, N₁, N₂} = TensorMap{T, S, N₁, N₂, ROCVector{T, AMDGPU.Mem.HIPBuffer}} +const ROCTensor{T, S, N} = ROCTensorMap{T, S, N, 0} + +const AdjointROCTensorMap{T, S, N₁, N₂} = AdjointTensorMap{T, S, N₁, N₂, ROCTensorMap{T, S, N₁, N₂}} + +function ROCTensorMap(t::TensorMap{T, S, N₁, N₂, A}) where {T, S, N₁, N₂, A} + return ROCTensorMap{T, S, N₁, N₂}(ROCArray{T}(t.data), space(t)) +end + +# project_symmetric! 
doesn't yet work for GPU types, so do this on the host, then copy +function TensorKit.project_symmetric_and_check(::Type{T}, ::Type{A}, data::AbstractArray, V::TensorMapSpace; tol = sqrt(eps(real(float(eltype(data)))))) where {T, A <: ROCVector{T}} + h_t = TensorKit.TensorMapWithStorage{T, Vector{T}}(undef, V) + h_t = TensorKit.project_symmetric!(h_t, Array(data)) + # verify result + isapprox(Array(reshape(data, dims(h_t))), convert(Array, h_t); atol = tol) || + throw(ArgumentError("Data has non-zero elements at incompatible positions")) + return TensorKit.TensorMapWithStorage{T, A}(A(h_t.data), V) +end + +for (fname, felt) in ((:zeros, :zero), (:ones, :one)) + @eval begin + function AMDGPU.$fname( + codomain::TensorSpace{S}, + domain::TensorSpace{S} = one(codomain) + ) where {S <: IndexSpace} + return AMDGPU.$fname(codomain ← domain) + end + function AMDGPU.$fname( + ::Type{T}, codomain::TensorSpace{S}, + domain::TensorSpace{S} = one(codomain) + ) where {T, S <: IndexSpace} + return AMDGPU.$fname(T, codomain ← domain) + end + AMDGPU.$fname(V::TensorMapSpace) = AMDGPU.$fname(Float64, V) + function AMDGPU.$fname(::Type{T}, V::TensorMapSpace) where {T} + t = ROCTensorMap{T}(undef, V) + fill!(t, $felt(T)) + return t + end + end +end + +for randfun in (:rocrand, :rocrandn) + randfun! = Symbol(randfun, :!) + @eval begin + # converting `codomain` and `domain` into `HomSpace` + function $randfun( + codomain::TensorSpace{S}, + domain::TensorSpace{S} = one(codomain), + ) where {S <: IndexSpace} + return $randfun(codomain ← domain) + end + function $randfun( + ::Type{T}, codomain::TensorSpace{S}, + domain::TensorSpace{S} = one(codomain), + ) where {T, S <: IndexSpace} + return $randfun(T, codomain ← domain) + end + function $randfun( + rng::Random.AbstractRNG, ::Type{T}, + codomain::TensorSpace{S}, + domain::TensorSpace{S} = one(codomain), + ) where {T, S <: IndexSpace} + return $randfun(rng, T, codomain ← domain) + end + + # filling in default eltype + $randfun(V::TensorMapSpace) = $randfun(Float64, V) + function $randfun(rng::Random.AbstractRNG, V::TensorMapSpace) + return $randfun(rng, Float64, V) + end + + # filling in default rng + function $randfun(::Type{T}, V::TensorMapSpace) where {T} + return $randfun(Random.default_rng(), T, V) + end + + # implementation + function $randfun( + rng::Random.AbstractRNG, ::Type{T}, + V::TensorMapSpace + ) where {T} + t = ROCTensorMap{T}(undef, V) + $randfun!(rng, t) + return t + end + + function $randfun!(rng::Random.AbstractRNG, t::ROCTensorMap) + for (_, b) in blocks(t) + $randfun!(rng, b) + end + return t + end + end +end + +# Scalar implementation +#----------------------- +function TensorKit.scalar(t::ROCTensorMap{T, S, 0, 0}) where {T, S} + inds = findall(!iszero, t.data) + return isempty(inds) ? 
zero(scalartype(t)) : @allowscalar @inbounds t.data[only(inds)]
+end
+
+function Base.convert(
+        TT::Type{ROCTensorMap{T, S, N₁, N₂}},
+        t::AbstractTensorMap{<:Any, S, N₁, N₂}
+    ) where {T, S, N₁, N₂}
+    if typeof(t) === TT
+        return t
+    else
+        tnew = TT(undef, space(t))
+        return copy!(tnew, t)
+    end
+end
+
+function LinearAlgebra.isposdef(t::ROCTensorMap)
+    domain(t) == codomain(t) ||
+        throw(SpaceMismatch("`isposdef` requires domain and codomain to be the same"))
+    InnerProductStyle(spacetype(t)) === EuclideanInnerProduct() || return false
+    for (c, b) in blocks(t)
+        # do our own hermitian check, with explicit tolerances
+        isherm = MatrixAlgebraKit.ishermitian(b; atol = eps(real(eltype(b))), rtol = eps(real(eltype(b))))
+        isherm || return false
+        isposdef(Hermitian(b)) || return false
+    end
+    return true
+end
+
+function Base.promote_rule(
+        ::Type{<:TT₁},
+        ::Type{<:TT₂}
+    ) where {
+        S, N₁, N₂, TTT₁, TTT₂,
+        TT₁ <: ROCTensorMap{TTT₁, S, N₁, N₂},
+        TT₂ <: ROCTensorMap{TTT₂, S, N₁, N₂},
+    }
+    T = TensorKit.VectorInterface.promote_add(TTT₁, TTT₂)
+    return ROCTensorMap{T, S, N₁, N₂}
+end
+
+# ROCTensorMap exponentiation:
+function TensorKit.exp!(t::ROCTensorMap)
+    domain(t) == codomain(t) ||
+        error("Exponential of a tensor only exists when domain == codomain.")
+    !MatrixAlgebraKit.ishermitian(t) && throw(ArgumentError("`exp!` is currently only supported on hermitian AMDGPU tensors"))
+    for (c, b) in blocks(t)
+        copy!(b, parent(Base.exp(Hermitian(b))))
+    end
+    return t
+end
+
+# functions that don't map ℝ to (a subset of) ℝ
+for f in (:sqrt, :log, :asin, :acos, :acosh, :atanh, :acoth)
+    sf = string(f)
+    @eval function Base.$f(t::ROCTensorMap)
+        domain(t) == codomain(t) ||
+            throw(SpaceMismatch("`$($sf)` of a tensor only exists when domain == codomain"))
+        !MatrixAlgebraKit.ishermitian(t) && throw(ArgumentError("`$($sf)` is currently only supported on hermitian AMDGPU tensors"))
+        T = complex(float(scalartype(t)))
+        tf = similar(t, T)
+        for (c, b) in blocks(t)
+            copy!(block(tf, c), parent($f(Hermitian(b))))
+        end
+        return tf
+    end
+end
diff --git a/test/amd/tensors.jl b/test/amd/tensors.jl
new file mode 100644
index 000000000..459212fa5
--- /dev/null
+++ b/test/amd/tensors.jl
@@ -0,0 +1,606 @@
+using Adapt, AMDGPU
+using Test, TestExtras
+using TensorKit, Combinatorics
+ad = adapt(Array)
+const AMDGPUExt = Base.get_extension(TensorKit, :TensorKitAMDGPUExt)
+@assert !isnothing(AMDGPUExt)
+const ROCTensorMap = getglobal(AMDGPUExt, :ROCTensorMap)
+using AMDGPU: rand as rocrand, rand! as rocrand!, randn as rocrandn, randn! as rocrandn!
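+# NOTE: `Base.get_extension`/`getglobal` reaches names the extension defines without
+# TensorKit re-exporting them; the `rand as rocrand, ...` aliases above resolve to the
+# same functions, since the extension itself imports them from AMDGPU under these names.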
+ +@isdefined(TestSetup) || include("../setup.jl") +using .TestSetup + +for V in (Vtr, Vℤ₂, Vfℤ₂, Vℤ₃, VU₁, VfU₁, VCU₁, VSU₂, VfSU₂) #, VSU₃) + V1, V2, V3, V4, V5 = V + @assert V3 * V4 * V2 ≿ V1' * V5' # necessary for leftorth tests + @assert V3 * V4 ≾ V1' * V2' * V5' # necessary for rightorth tests +end + +spacelist = try + if ENV["CI"] == "true" + println("Detected running on CI") + if Sys.iswindows() + (Vtr, Vℤ₂, Vfℤ₂, Vℤ₃, VU₁, VfU₁, VCU₁, VSU₂) + elseif Sys.isapple() + (Vtr, Vℤ₂, Vfℤ₂, Vℤ₃, VfU₁, VfSU₂) #, VSU₃) + else + (Vtr, Vℤ₂, Vfℤ₂, VU₁, VCU₁, VSU₂, VfSU₂) #, VSU₃) + end + else + (Vtr, VU₁, VSU₂, Vfℤ₂) + end +catch + (Vtr, Vℤ₂, Vfℤ₂, Vℤ₃, VU₁, VfU₁, VCU₁, VSU₂, VfSU₂) #, VSU₃) +end + +for V in spacelist + I = sectortype(first(V)) + Istr = TensorKit.type_repr(I) + println("---------------------------------------") + println("AMDGPU Tensors with symmetry: $Istr") + println("---------------------------------------") + @timedtestset "Tensors with symmetry: $Istr" verbose = true begin + V1, V2, V3, V4, V5 = V + @timedtestset "Basic tensor properties" begin + W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5 + # test default pass-throughs + for f in (AMDGPU.zeros, AMDGPU.ones, rocrand, rocrandn) + t = @constinferred f(W) + @test scalartype(t) == Float64 + @test codomain(t) == W + @test space(t) == (W ← one(W)) + @test domain(t) == one(W) + @test typeof(t) == TensorMap{Float64, spacetype(t), 5, 0, ROCVector{Float64, AMDGPU.Mem.HIPBuffer}} + end + for f in (rand, randn) + t = @constinferred f(ROCVector{Float64, AMDGPU.Mem.HIPBuffer}, W) + @test scalartype(t) == Float64 + @test codomain(t) == W + @test space(t) == (W ← one(W)) + @test domain(t) == one(W) + @test typeof(t) == TensorMap{Float64, spacetype(t), 5, 0, ROCVector{Float64, AMDGPU.Mem.HIPBuffer}} + end + for f! in (rocrand!, rocrandn!) 
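+                # in-place fill of a freshly allocated tensor: eltype, spaces and the
+                # ROC storage type must be preserved, only the data changes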
+ t = @constinferred AMDGPU.zeros(W) + f!(t) + @test scalartype(t) == Float64 + @test codomain(t) == W + @test space(t) == (W ← one(W)) + @test domain(t) == one(W) + @test typeof(t) == TensorMap{Float64, spacetype(t), 5, 0, ROCVector{Float64, AMDGPU.Mem.HIPBuffer}} + end + for T in (Int, Float32, Float64, ComplexF32, ComplexF64) + t = @constinferred AMDGPU.zeros(T, W) + AMDGPU.@allowscalar begin + @test @constinferred(hash(t)) == hash(deepcopy(t)) + end + @test scalartype(t) == T + @test norm(t) == 0 + @test codomain(t) == W + @test space(t) == (W ← one(W)) + @test domain(t) == one(W) + @test typeof(t) == TensorMap{T, spacetype(t), 5, 0, ROCVector{T, AMDGPU.Mem.HIPBuffer}} + # blocks + bs = @constinferred blocks(t) + (c, b1), state = @constinferred Nothing iterate(bs) + @test c == first(blocksectors(W)) + next = @constinferred Nothing iterate(bs, state) + b2 = @constinferred block(t, first(blocksectors(t))) + @test b1 == b2 + @test_broken eltype(bs) === Pair{typeof(c), typeof(b1)} + @test_broken typeof(b1) === TensorKit.blocktype(t) + @test typeof(c) === sectortype(t) + end + end + @timedtestset "Conversion to/from host" begin + W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5 + for T in (Int, Float32, ComplexF64) + h_t = @constinferred rand(T, W) + t1 = convert(ROCTensorMap{T}, h_t) + @test collect(t1.data) == h_t.data + @test space(t1) == space(h_t) + @test scalartype(t1) == T + @test codomain(t1) == W + @test space(t1) == (W ← one(W)) + @test domain(t1) == one(W) + t2 = ROCTensorMap(h_t) + @test collect(t2.data) == h_t.data + @test space(t2) == space(h_t) + @test scalartype(t2) == T + @test codomain(t2) == W + @test space(t2) == (W ← one(W)) + @test domain(t2) == one(W) + end + end + @timedtestset "Tensor Dict conversion" begin + W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5 + for T in (Int, Float32, ComplexF64) + t = @constinferred AMDGPU.rand(T, W) + d = convert(Dict, t) + @test TensorKit.to_cpu(t) == convert(TensorMap, d) + end + end + @timedtestset "Basic linear algebra" begin + W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5 + for T in (Float32, ComplexF64) + t = @constinferred AMDGPU.rand(T, W) + @test scalartype(t) == T + @test space(t) == W + @test space(t') == W' + @test dim(t) == dim(space(t)) + @test codomain(t) == codomain(W) + @test domain(t) == domain(W) + # blocks for adjoint + bs = @constinferred blocks(t') + (c, b1), state = @constinferred Nothing iterate(bs) + @test c == first(blocksectors(W')) + next = @constinferred Nothing iterate(bs, state) + b2 = @constinferred block(t', first(blocksectors(t'))) + @test b1 == b2 + @test_broken eltype(bs) === Pair{typeof(c), typeof(b1)} + @test_broken typeof(b1) === TensorKit.blocktype(t') + @test typeof(c) === sectortype(t) + # linear algebra + @test isa(@constinferred(norm(t)), real(T)) + @test norm(t)^2 ≈ dot(t, t) + α = rand(T) + @test norm(α * t) ≈ abs(α) * norm(t) + @test norm(t + t, 2) ≈ 2 * norm(t, 2) + @test norm(t + t, 1) ≈ 2 * norm(t, 1) + @test norm(t + t, Inf) ≈ 2 * norm(t, Inf) + p = 3 * rand(Float64) + @test norm(t + t, p) ≈ 2 * norm(t, p) + @test norm(t) ≈ norm(t') + + t2 = @constinferred rand!(similar(t)) + β = rand(T) + #@test @constinferred(dot(β * t2, α * t)) ≈ conj(β) * α * conj(dot(t, t2)) # broken for Irrep[CU₁] + @test dot(β * t2, α * t) ≈ conj(β) * α * conj(dot(t, t2)) + @test dot(t2, t) ≈ conj(dot(t, t2)) + @test dot(t2, t) ≈ conj(dot(t2', t')) + @test dot(t2, t) ≈ dot(t', t2') + + i1 = @constinferred(isomorphism(ROCVector{T, AMDGPU.Mem.HIPBuffer}, V1 ⊗ V2, V2 ⊗ V1)) + i2 = @constinferred(isomorphism(ROCVector{T, AMDGPU.Mem.HIPBuffer}, V2 ⊗ V1, V1 ⊗ V2)) + @test i1 * 
i2 == @constinferred(id(ROCVector{T, AMDGPU.Mem.HIPBuffer}, V1 ⊗ V2)) + @test i2 * i1 == @constinferred(id(ROCVector{T, AMDGPU.Mem.HIPBuffer}, V2 ⊗ V1)) + w = @constinferred(isometry(ROCVector{T, AMDGPU.Mem.HIPBuffer}, V1 ⊗ (oneunit(V1) ⊕ oneunit(V1)), V1)) + @test dim(w) == 2 * dim(V1 ← V1) + @test w' * w == id(ROCVector{T, AMDGPU.Mem.HIPBuffer}, V1) + @test w * w' == (w * w')^2 + end + end + @timedtestset "Trivial space insertion and removal" begin + W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5 + for T in (Float32, ComplexF64) + t = @constinferred AMDGPU.rand(T, W) + t2 = @constinferred insertleftunit(t) + @test t2 == @constinferred insertrightunit(t) + @test numind(t2) == numind(t) + 1 + @test space(t2) == insertleftunit(space(t)) + @test scalartype(t2) === T + @test t.data === t2.data + @test @constinferred(removeunit(t2, $(numind(t2)))) == t + t3 = @constinferred insertleftunit(t; copy = true) + @test t3 == @constinferred insertrightunit(t; copy = true) + @test t.data !== t3.data + for (c, b) in blocks(t) + @test b == block(t3, c) + end + @test @constinferred(removeunit(t3, $(numind(t3)))) == t + t4 = @constinferred insertrightunit(t, 3; dual = true) + @test numin(t4) == numin(t) && numout(t4) == numout(t) + 1 + for (c, b) in blocks(t) + @test b == block(t4, c) + end + @test @constinferred(removeunit(t4, 4)) == t + t5 = @constinferred insertleftunit(t, 4; dual = true) + @test numin(t5) == numin(t) + 1 && numout(t5) == numout(t) + for (c, b) in blocks(t) + @test b == block(t5, c) + end + @test @constinferred(removeunit(t5, 4)) == t + end + end + if hasfusiontensor(I) + @timedtestset "Basic linear algebra: test via CPU" begin + W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5 + for T in (Float32, ComplexF64) + t = AMDGPU.rand(T, W) + t2 = @constinferred AMDGPU.rand!(similar(t)) + α = rand(T) + @test norm(t, 2) ≈ norm(TensorKit.to_cpu(t), 2) + @test dot(t2, t) ≈ dot(TensorKit.to_cpu(t2), TensorKit.to_cpu(t)) + @test TensorKit.to_cpu(α * t) ≈ α * TensorKit.to_cpu(t) + @test TensorKit.to_cpu(t + t) ≈ 2 * TensorKit.to_cpu(t) + end + end + @timedtestset "Real and imaginary parts" begin + W = V1 ⊗ V2 + for T in (Float64, ComplexF64, ComplexF32) + t = @constinferred AMDGPU.randn(T, W, W) + + tr = @constinferred real(t) + @test scalartype(tr) <: Real + @test real(TensorKit.to_cpu(t)) == TensorKit.to_cpu(tr) + @test storagetype(tr) == ROCVector{real(T), AMDGPU.Mem.HIPBuffer} + + ti = @constinferred imag(t) + @test scalartype(ti) <: Real + @test imag(TensorKit.to_cpu(t)) == TensorKit.to_cpu(ti) + @test storagetype(ti) == ROCVector{real(T), AMDGPU.Mem.HIPBuffer} + + tc = @inferred complex(t) + @test scalartype(tc) <: Complex + @test complex(TensorKit.to_cpu(t)) == TensorKit.to_cpu(tc) + @test storagetype(tc) == ROCVector{complex(T), AMDGPU.Mem.HIPBuffer} + + tc2 = @inferred complex(tr, ti) + @test tc2 ≈ tc + @test storagetype(tc2) == ROCVector{complex(T), AMDGPU.Mem.HIPBuffer} + end + end + end + @timedtestset "Tensor conversion" begin # TODO adjoint conversion methods don't work yet + W = V1 ⊗ V2 + t = @constinferred AMDGPU.randn(W ← W) + #@test typeof(convert(TensorMap, t')) == typeof(t) # TODO Adjoint not supported yet + tc = complex(t) + @test convert(typeof(tc), t) == tc + @test typeof(convert(typeof(tc), t)) == typeof(tc) + # @test typeof(convert(typeof(tc), t')) == typeof(tc) # TODO Adjoint not supported yet + @test Base.promote_typeof(t, tc) == typeof(tc) + @test Base.promote_typeof(tc, t) == typeof(tc + t) + end + #=@timedtestset "diag/diagm" begin + W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5 + t = AMDGPU.randn(ComplexF64, W) + d = 
LinearAlgebra.diag(t) + # TODO find a way to use AMDGPU here + D = LinearAlgebra.diagm(codomain(t), domain(t), d) + @test LinearAlgebra.isdiag(D) + @test LinearAlgebra.diag(D) == d + end=# + @timedtestset "Permutations: test via inner product invariance" begin + W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5 + t = AMDGPU.rand(ComplexF64, W) + t′ = AMDGPU.randn!(similar(t)) + for k in 0:5 + for p in permutations(1:5) + p1 = ntuple(n -> p[n], k) + p2 = ntuple(n -> p[k + n], 5 - k) + AMDGPU.@allowscalar begin + t2 = @constinferred permute(t, (p1, p2)) + t2 = permute(t, (p1, p2)) + @test norm(t2) ≈ norm(t) + t2′ = permute(t′, (p1, p2)) + @test dot(t2′, t2) ≈ dot(t′, t) ≈ dot(transpose(t2′), transpose(t2)) + end + end + + AMDGPU.@allowscalar begin + t3 = @constinferred repartition(t, $k) + t3 = repartition(t, k) + @test norm(t3) ≈ norm(t) + t3′ = @constinferred repartition!(similar(t3), t′) + @test norm(t3′) ≈ norm(t′) + @test dot(t′, t) ≈ dot(t3′, t3) + end + end + end + if BraidingStyle(I) isa SymmetricBraiding + @timedtestset "Permutations: test via CPU" begin + W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5 + t = AMDGPU.rand(ComplexF64, W) + for k in 0:5 + for p in permutations(1:5) + p1 = ntuple(n -> p[n], k) + p2 = ntuple(n -> p[k + n], 5 - k) + dt2 = AMDGPU.@allowscalar permute(t, (p1, p2)) + ht2 = permute(TensorKit.to_cpu(t), (p1, p2)) + @test ht2 == TensorKit.to_cpu(dt2) + end + + dt3 = AMDGPU.@allowscalar repartition(t, k) + ht3 = repartition(TensorKit.to_cpu(t), k) + @test ht3 == TensorKit.to_cpu(dt3) + end + end + end + @timedtestset "Full trace: test self-consistency" begin + t = AMDGPU.rand(ComplexF64, V1 ⊗ V2' ⊗ V2 ⊗ V1') + AMDGPU.@allowscalar begin + t2 = permute(t, ((1, 2), (4, 3))) + s = @constinferred tr(t2) + @test conj(s) ≈ tr(t2') + if !isdual(V1) + t2 = twist!(t2, 1) + end + if isdual(V2) + t2 = twist!(t2, 2) + end + ss = tr(t2) + @tensor s2 = t[a, b, b, a] + @tensor t3[a, b] := t[a, c, c, b] + @tensor s3 = t3[a, a] + end + @test ss ≈ s2 + @test ss ≈ s3 + end + #=@timedtestset "Partial trace: test self-consistency" begin + t = AMDGPU.rand(ComplexF64, V1 ⊗ V2' ⊗ V3 ⊗ V2 ⊗ V1' ⊗ V3') + @tensor t2[a, b] := t[c, d, b, d, c, a] + @tensor t4[a, b, c, d] := t[d, e, b, e, c, a] + @tensor t5[a, b] := t4[a, b, c, c] + @test t2 ≈ t5 + end + if BraidingStyle(I) isa Bosonic && hasfusiontensor(I) + @timedtestset "Trace: test via conversion" begin + t = AMDGPU.rand(ComplexF64, V1 ⊗ V2' ⊗ V3 ⊗ V2 ⊗ V1' ⊗ V3') + AMDGPU.@allowscalar begin + @tensor t2[a, b] := t[c, d, b, d, c, a] + @tensor t3[a, b] := ad(t)[c, d, b, d, c, a] + end + @test t3 ≈ ad(t2) + end + end + @timedtestset "Trace and contraction" begin + t1 = AMDGPU.rand(ComplexF64, V1 ⊗ V2 ⊗ V3) + t2 = AMDGPU.rand(ComplexF64, V2' ⊗ V4 ⊗ V1') + AMDGPU.@allowscalar begin + t3 = t1 ⊗ t2 + @tensor ta[a, b] := t1[x, y, a] * t2[y, b, x] + @tensor tb[a, b] := t3[x, y, a, y, b, x] + end + @test ta ≈ tb + end + #=if BraidingStyle(I) isa Bosonic && hasfusiontensor(I) + @timedtestset "Tensor contraction: test via CPU" begin + dA1 = AMDGPU.randn(ComplexF64, V1' * V2', V3') + dA2 = AMDGPU.randn(ComplexF64, V3 * V4, V5) + drhoL = AMDGPU.randn(ComplexF64, V1, V1) + drhoR = AMDGPU.randn(ComplexF64, V5, V5)' # test adjoint tensor + dH = AMDGPU.randn(ComplexF64, V2 * V4, V2 * V4) + @tensor dHrA12[a, s1, s2, c] := drhoL[a, a'] * conj(dA1[a', t1, b]) * + dA2[b, t2, c'] * drhoR[c', c] * + dH[s1, s2, t1, t2] + @tensor hHrA12[a, s1, s2, c] := TensorKit.to_cpu(drhoL)[a, a'] * conj(TensorKit.to_cpu(dA1)[a', t1, b]) * + TensorKit.to_cpu(dA2)[b, t2, c'] * TensorKit.to_cpu(drhoR)[c', c] * + 
TensorKit.to_cpu(dH)[s1, s2, t1, t2] + @test TensorKit.to_cpu(dHrA12) ≈ hHrA12 + end + end=# # doesn't yet work because of AdjointTensor + @timedtestset "Index flipping: test flipping inverse" begin + t = AMDGPU.rand(ComplexF64, V1 ⊗ V1' ← V1' ⊗ V1) + for i in 1:4 + AMDGPU.@allowscalar begin + @test t ≈ flip(flip(t, i), i; inv = true) + @test t ≈ flip(flip(t, i; inv = true), i) + end + end + end + #=@timedtestset "Index flipping: test via explicit flip" begin + t = AMDGPU.rand(ComplexF64, V1 ⊗ V1' ← V1' ⊗ V1) + F1 = unitary(flip(V1), V1) + + AMDGPU.@allowscalar begin + @tensor tf[a, b; c, d] := F1[a, a'] * t[a', b; c, d] + @test flip(t, 1) ≈ tf + @tensor tf[a, b; c, d] := conj(F1[b, b']) * t[a, b'; c, d] + @test twist!(flip(t, 2), 2) ≈ tf + @tensor tf[a, b; c, d] := F1[c, c'] * t[a, b; c', d] + @test flip(t, 3) ≈ tf + @tensor tf[a, b; c, d] := conj(F1[d, d']) * t[a, b; c, d'] + @test twist!(flip(t, 4), 4) ≈ tf + end + end + @timedtestset "Index flipping: test via contraction" begin + t1 = AMDGPU.rand(ComplexF64, V1 ⊗ V2 ⊗ V3 ← V4) + t2 = AMDGPU.rand(ComplexF64, V2' ⊗ V5 ← V4' ⊗ V1) + AMDGPU.@allowscalar begin + @tensor ta[a, b] := t1[x, y, a, z] * t2[y, b, z, x] + @tensor tb[a, b] := flip(t1, 1)[x, y, a, z] * flip(t2, 4)[y, b, z, x] + @test ta ≈ tb + @tensor tb[a, b] := flip(t1, (2, 4))[x, y, a, z] * flip(t2, (1, 3))[y, b, z, x] + @test ta ≈ tb + @tensor tb[a, b] := flip(t1, (1, 2, 4))[x, y, a, z] * flip(t2, (1, 3, 4))[y, b, z, x] + @tensor tb[a, b] := flip(t1, (1, 3))[x, y, a, z] * flip(t2, (2, 4))[y, b, z, x] + @test flip(ta, (1, 2)) ≈ tb + end + end=# # TODO =# # None of this works due to lack of HIPTensor support + @timedtestset "Multiplication of isometries: test properties" begin + W2 = V4 ⊗ V5 + W1 = W2 ⊗ (oneunit(V1) ⊕ oneunit(V1)) + for T in (Float64, ComplexF64) + t1 = randisometry(ROCMatrix{T}, W1, W2) + t2 = randisometry(ROCMatrix{T}, W2 ← W2) + @test isisometric(t1) + @test isunitary(t2) + P = t1 * t1' + @test P * P ≈ P + end + end + @timedtestset "Multiplication and inverse: test compatibility" begin + W1 = V1 ⊗ V2 ⊗ V3 + W2 = V4 ⊗ V5 + for T in (Float64, ComplexF64) + t1 = AMDGPU.rand(T, W1, W1) + t2 = AMDGPU.rand(T, W2, W2) + t = AMDGPU.rand(T, W1, W2) + @test t1 * (t1 \ t) ≈ t + @test (t / t2) * t2 ≈ t + @test t1 \ one(t1) ≈ inv(t1) + @test one(t1) / t1 ≈ pinv(t1) + @test_throws SpaceMismatch inv(t) + @test_throws SpaceMismatch t2 \ t + @test_throws SpaceMismatch t / t1 + tp = pinv(t) * t + @test tp ≈ tp * tp + end + end + @timedtestset "Multiplication and inverse: test via CPU" begin + W1 = V1 ⊗ V2 ⊗ V3 + W2 = V4 ⊗ V5 + for T in (Float32, Float64, ComplexF32, ComplexF64) + t1 = AMDGPU.rand(T, W1, W1) + t2 = AMDGPU.rand(T, W2, W2) + t = AMDGPU.rand(T, W1, W2) + ht1 = TensorKit.to_cpu(t1) + ht2 = TensorKit.to_cpu(t2) + ht = TensorKit.to_cpu(t) + @test TensorKit.to_cpu(t1 * t) ≈ ht1 * ht + @test TensorKit.to_cpu(t1' * t) ≈ ht1' * ht + @test TensorKit.to_cpu(t2 * t') ≈ ht2 * ht' + @test TensorKit.to_cpu(t2' * t') ≈ ht2' * ht' + + @test TensorKit.to_cpu(inv(t1)) ≈ inv(ht1) + @test TensorKit.to_cpu(pinv(t)) ≈ pinv(ht) + + if T == Float32 || T == ComplexF32 + continue + end + + @test TensorKit.to_cpu(t1 \ t) ≈ ht1 \ ht + @test TensorKit.to_cpu(t1' \ t) ≈ ht1' \ ht + @test TensorKit.to_cpu(t2 \ t') ≈ ht2 \ ht' + @test TensorKit.to_cpu(t2' \ t') ≈ ht2' \ ht' + + @test TensorKit.to_cpu(t2 / t) ≈ ht2 / ht + @test TensorKit.to_cpu(t2' / t) ≈ ht2' / ht + @test TensorKit.to_cpu(t1 / t') ≈ ht1 / ht' + @test TensorKit.to_cpu(t1' / t') ≈ ht1' / ht' + end + end + if BraidingStyle(I) isa 
Bosonic && hasfusiontensor(I) + @timedtestset "Tensor functions" begin + W = V1 ⊗ V2 + for T in (Float64, ComplexF64) + t = project_hermitian!(AMDGPU.randn(T, W, W)) + s = dim(W) + #@test (@constinferred sqrt(t))^2 ≈ t + #@test TensorKit.to_cpu(sqrt(t)) ≈ sqrt(TensorKit.to_cpu(t)) + + expt = @constinferred exp(t) + @test TensorKit.to_cpu(expt) ≈ exp(TensorKit.to_cpu(t)) + + # log doesn't work on AMDGPU yet (scalar indexing) + #@test exp(@constinferred log(project_hermitian!(expt))) ≈ expt + #@test TensorKit.to_cpu(log(project_hermitian!(expt))) ≈ log(TensorKit.to_cpu(expt)) + + #=@test (@constinferred cos(t))^2 + (@constinferred sin(t))^2 ≈ + id(storagetype(t), W) + @test (@constinferred tan(t)) ≈ sin(t) / cos(t) + @test (@constinferred cot(t)) ≈ cos(t) / sin(t) + @test (@constinferred cosh(t))^2 - (@constinferred sinh(t))^2 ≈ + id(storagetype(t), W) + @test (@constinferred tanh(t)) ≈ sinh(t) / cosh(t) + @test (@constinferred coth(t)) ≈ cosh(t) / sinh(t)=# # TODO in AMDGPU + + #=t1 = sin(t) + @test sin(@constinferred asin(t1)) ≈ t1 + t2 = cos(t) + @test cos(@constinferred acos(t2)) ≈ t2 + t3 = sinh(t) + @test sinh(@constinferred asinh(t3)) ≈ t3 + t4 = cosh(t) + @test cosh(@constinferred acosh(t4)) ≈ t4 + t5 = tan(t) + @test tan(@constinferred atan(t5)) ≈ t5 + t6 = cot(t) + @test cot(@constinferred acot(t6)) ≈ t6 + t7 = tanh(t) + @test tanh(@constinferred atanh(t7)) ≈ t7 + t8 = coth(t) + @test coth(@constinferred acoth(t8)) ≈ t8=# + # TODO in AMDGPU + end + end + end + # Sylvester not defined for AMDGPU + # @timedtestset "Sylvester equation" begin + # for T in (Float32, ComplexF64) + # tA = AMDGPU.rand(T, V1 ⊗ V3, V1 ⊗ V3) + # tB = AMDGPU.rand(T, V2 ⊗ V4, V2 ⊗ V4) + # tA = 3 // 2 * leftorth(tA; alg=Polar())[1] + # tB = 1 // 5 * leftorth(tB; alg=Polar())[1] + # tC = AMDGPU.rand(T, V1 ⊗ V3, V2 ⊗ V4) + # t = @constinferred sylvester(tA, tB, tC) + # @test codomain(t) == V1 ⊗ V3 + # @test domain(t) == V2 ⊗ V4 + # @test norm(tA * t + t * tB + tC) < + # (norm(tA) + norm(tB) + norm(tC)) * eps(real(T))^(2 / 3) + # if BraidingStyle(I) isa Bosonic && hasfusiontensor(I) + # matrix(x) = reshape(convert(Array, x), dim(codomain(x)), dim(domain(x))) + # @test matrix(t) ≈ sylvester(matrix(tA), matrix(tB), matrix(tC)) + # end + # end + # end + # + # TODO + @timedtestset "Tensor product: test via norm preservation" begin + for T in (Float32, ComplexF64) + t1 = AMDGPU.rand(T, V2 ⊗ V3 ⊗ V1, V1 ⊗ V2) + t2 = AMDGPU.rand(T, V2 ⊗ V1 ⊗ V3, V1 ⊗ V1) + AMDGPU.@allowscalar begin + t = @constinferred (t1 ⊗ t2) + end + @test norm(t) ≈ norm(t1) * norm(t2) + end + end + if BraidingStyle(I) isa Bosonic && hasfusiontensor(I) + @timedtestset "Tensor product: test via conversion" begin + for T in (Float32, ComplexF64) + t1 = AMDGPU.rand(T, V2 ⊗ V3 ⊗ V1, V1) + t2 = AMDGPU.rand(T, V2 ⊗ V1 ⊗ V3, V2) + d1 = dim(codomain(t1)) + d2 = dim(codomain(t2)) + d3 = dim(domain(t1)) + d4 = dim(domain(t2)) + AMDGPU.@allowscalar begin + t = @constinferred (t1 ⊗ t2) + At = ad(t) + @test ad(t) ≈ ad(t1) ⊗ ad(t2) + end + end + end + end + #=@timedtestset "Tensor product: test via tensor contraction" begin + for T in (Float32, ComplexF64) + t1 = AMDGPU.rand(T, V2 ⊗ V3 ⊗ V1) + t2 = AMDGPU.rand(T, V2 ⊗ V1 ⊗ V3) + AMDGPU.@allowscalar begin + t = @constinferred (t1 ⊗ t2) + @tensor t′[1, 2, 3, 4, 5, 6] := t1[1, 2, 3] * t2[4, 5, 6] + # @test t ≈ t′ # TODO broken for symmetry: Irrep[ℤ₃] + end + end + end=# # broken due to no HIPTensor + end + TensorKit.empty_globalcaches!() +end + +@timedtestset "Deligne tensor product: test via conversion" begin + 
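+    # NOTE: only trivial-symmetry spaces are enabled here for now; the commented
+    # candidates (VSU₂, Vℤ₂) are presumably blocked on device support for nontrivial sectors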
Vlists1 = (Vtr,) # VSU₂) + Vlists2 = (Vtr,) # Vℤ₂) + @testset for Vlist1 in Vlists1, Vlist2 in Vlists2 + V1, V2, V3, V4, V5 = Vlist1 + W1, W2, W3, W4, W5 = Vlist2 + for T in (Float32, ComplexF64) + t1 = AMDGPU.rand(T, V1 ⊗ V2, V3' ⊗ V4) + t2 = AMDGPU.rand(T, W2, W1 ⊗ W1') + AMDGPU.@allowscalar begin + t = @constinferred (t1 ⊠ t2) + end + d1 = dim(codomain(t1)) + d2 = dim(codomain(t2)) + d3 = dim(domain(t1)) + d4 = dim(domain(t2)) + AMDGPU.@allowscalar begin + @test ad(t1) ⊠ ad(t2) ≈ ad(t1 ⊠ t2) + end + end + end +end diff --git a/test/runtests.jl b/test/runtests.jl index 3b0bfe8b0..3de200d3d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -51,6 +51,10 @@ istestfile(fn) = endswith(fn, ".jl") && !contains(fn, "setup") CUDA.functional() || continue @time include("cuda/tensors.jl") @time include("cuda/factorizations.jl") + elseif group == "amd" + using AMDGPU + AMDGPU.functional() || continue + @time include("amd/tensors.jl") elseif is_buildkite continue end
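For reference, a minimal usage sketch of what this diff enables (a sketch, not part of the diff: it assumes a functional ROCm device, and uses plain `ℂ^n` spaces plus the `TensorKit.to_cpu` round-trip seen in the tests above):

```julia
using TensorKit, AMDGPU

V = ℂ^2 ⊗ ℂ^3                       # trivially-symmetric product space
t = AMDGPU.rand(Float64, V, V)      # ROC-backed TensorMap via the new extension methods
u = AMDGPU.zeros(ComplexF64, V ← V) # same, with an explicit HomSpace
@assert norm(u) == 0                # freshly zeroed blocks
h = TensorKit.to_cpu(t)             # copy back to host storage
@assert norm(h) ≈ norm(t)           # norms agree across device and host
```

As wired up in test/runtests.jl, the new tests run only when the "amd" test group is selected and `AMDGPU.functional()` returns true, mirroring the existing cuda group.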