diff --git a/Project.toml b/Project.toml index b148cea6a..6bb50f310 100644 --- a/Project.toml +++ b/Project.toml @@ -1,14 +1,13 @@ name = "TensorKit" uuid = "07d1fe3e-3e46-537d-9eac-e9e13d0d4cec" authors = ["Jutho Haegeman"] -version = "0.14.6" +version = "0.14.7" [deps] LRUCache = "8ac3fa9e-de4c-5943-b1dc-09c6b5f20637" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" PackageExtensionCompat = "65ce6f38-6b18-4e1d-a461-8949797d7930" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" Strided = "5e0ebb24-38b0-5f93-81fe-25c709ecae67" TensorKitSectors = "13a9c161-d5da-41f0-bcbd-e1a08ae0647f" TensorOperations = "6aa20fa7-93e2-5fca-9bc0-fbd0db3c71a2" @@ -33,7 +32,6 @@ LRUCache = "1.0.2" LinearAlgebra = "1" PackageExtensionCompat = "1" Random = "1" -SparseArrays = "1" Strided = "2" TensorKitSectors = "0.1" TensorOperations = "5.1" diff --git a/benchmark/TensorKitBenchmarks/TensorKitBenchmarks.jl b/benchmark/TensorKitBenchmarks/TensorKitBenchmarks.jl index e276c1704..f93fd9996 100644 --- a/benchmark/TensorKitBenchmarks/TensorKitBenchmarks.jl +++ b/benchmark/TensorKitBenchmarks/TensorKitBenchmarks.jl @@ -4,7 +4,7 @@ using BenchmarkTools using TensorKit using TOML -BenchmarkTools.DEFAULT_PARAMETERS.seconds = 1.0 +BenchmarkTools.DEFAULT_PARAMETERS.seconds = 20.0 BenchmarkTools.DEFAULT_PARAMETERS.samples = 10000 BenchmarkTools.DEFAULT_PARAMETERS.time_tolerance = 0.15 BenchmarkTools.DEFAULT_PARAMETERS.memory_tolerance = 0.01 diff --git a/benchmark/TensorKitBenchmarks/tensornetworks/TensorNetworkBenchmarks.jl b/benchmark/TensorKitBenchmarks/tensornetworks/TensorNetworkBenchmarks.jl index becf3935c..4fcd27b22 100644 --- a/benchmark/TensorKitBenchmarks/tensornetworks/TensorNetworkBenchmarks.jl +++ b/benchmark/TensorKitBenchmarks/tensornetworks/TensorNetworkBenchmarks.jl @@ -45,9 +45,8 @@ function benchmark_mpo!(bench; sigmas=nothing, T="Float64", I="Trivial", dims) end if haskey(all_parameters, "mpo") - g = addgroup!(SUITE, "mpo") for params in all_parameters["mpo"] - benchmark_mpo!(g, params) + benchmark_mpo!(SUITE, params) end end @@ -90,9 +89,8 @@ function benchmark_pepo!(bench; sigmas=nothing, T="Float64", I="Trivial", dims) end if haskey(all_parameters, "pepo") - g = addgroup!(SUITE, "pepo") for params in all_parameters["pepo"] - benchmark_pepo!(g, params) + benchmark_pepo!(SUITE, params) end end @@ -136,9 +134,8 @@ function benchmark_mera!(bench; sigmas=nothing, T="Float64", I="Trivial", dims) end if haskey(all_parameters, "mera") - g = addgroup!(SUITE, "mera") for params in all_parameters["mera"] - benchmark_mera!(g, params) + benchmark_mera!(SUITE, params) end end diff --git a/benchmark/TensorKitBenchmarks/tensornetworks/benchparams.toml b/benchmark/TensorKitBenchmarks/tensornetworks/benchparams.toml index 8d10eb109..da20145cd 100644 --- a/benchmark/TensorKitBenchmarks/tensornetworks/benchparams.toml +++ b/benchmark/TensorKitBenchmarks/tensornetworks/benchparams.toml @@ -19,6 +19,12 @@ I = "U1Irrep" dims = [[40, 5, 3], [160, 5, 3], [640, 5, 3], [2560, 5, 3], [6120, 5, 3], [200, 20, 20], [400, 20, 20], [400, 40, 40]] sigmas = [0.5, 0.5, 0.5] +[[mpo]] +T = ["Float64"] +I = "SU2Irrep" +dims = [[40, 5, 3], [160, 5, 3], [640, 5, 3], [2560, 5, 3], [6120, 5, 3], [200, 20, 20], [400, 20, 20], [400, 40, 40]] +sigmas = [2, 2, 2] + # PEPO # ---- # dims = [peps, pepo, phys, env] @@ -40,6 +46,12 @@ I = "U1Irrep" dims = [[4, 2, 2, 100], [4, 4, 4, 200], [6, 2, 2, 100], [6, 3, 4, 200], [8, 2, 2, 100], [8, 2, 4, 200], [10, 2, 2, 50], [10, 3, 2, 100]] sigmas = [0.5, 0.5, 0.5, 0.5] +[[pepo]] +T = ["Float64"] +I = "SU2Irrep" +dims = [[4, 2, 2, 100], [4, 4, 4, 200], [6, 2, 2, 100], [6, 3, 4, 200], [8, 2, 2, 100], [8, 2, 4, 200], [10, 2, 2, 50], [10, 3, 2, 100]] +sigmas = [2.0, 2.0, 2.0, 2.0] + # MERA # ---- # dims = mera @@ -60,3 +72,9 @@ T = ["Float64"] I = "U1Irrep" dims = [4, 8, 12, 16, 22, 28] sigmas = [0.5] + +[[mera]] +T = ["Float64"] +I = "SU2Irrep" +dims = [4, 8, 12, 16, 22, 28] +sigmas = [2.0] diff --git a/benchmark/TensorKitBenchmarks/utils/BenchUtils.jl b/benchmark/TensorKitBenchmarks/utils/BenchUtils.jl index cefb02c98..ce22c763a 100644 --- a/benchmark/TensorKitBenchmarks/utils/BenchUtils.jl +++ b/benchmark/TensorKitBenchmarks/utils/BenchUtils.jl @@ -55,13 +55,15 @@ function generate_space(::Type{U1Irrep}, D::Int, sigma::Real=0.5) return U1Space((s => d for (s, d) in zip(sectors, dims))...) end function generate_space(::Type{SU2Irrep}, D::Int, sigma::Real=0.5) - poisson_pdf(x) = ceil(Int, D * exp(-sigma) * sigma^x / factorial(x + 1)) + normal_pdf = let D = D + x -> D * exp(-0.5 * (x / sigma)^2) / (sigma * sqrt(2π)) + end sectors = SU2Irrep[] dims = Int[] for sector in values(SU2Irrep) - d = poisson_pdf(Int(sector.j * 2)) + d = ceil(Int, normal_pdf(sector.j) / dim(sector)) push!(sectors, sector) push!(dims, d) D -= d * dim(sector) diff --git a/src/TensorKit.jl b/src/TensorKit.jl index d94245530..255d6ba3a 100644 --- a/src/TensorKit.jl +++ b/src/TensorKit.jl @@ -126,8 +126,6 @@ using LinearAlgebra: norm, dot, normalize, normalize!, tr, isposdef, isposdef!, ishermitian, rank, cond, Diagonal, Hermitian -using SparseArrays: SparseMatrixCSC, sparse, nzrange, rowvals, nonzeros - import Base.Meta using Random: Random, rand!, randn! @@ -185,6 +183,21 @@ include("fusiontrees/fusiontrees.jl") #------------------------------------------- include("spaces/vectorspaces.jl") +# Multithreading settings +#------------------------- +const TRANSFORMER_THREADS = Ref(1) + +get_num_transformer_threads() = TRANSFORMER_THREADS[] + +function set_num_transformer_threads(n::Int) + N = Base.Threads.nthreads() + if n > N + n = N + Strided._set_num_threads_warn(n) + end + return TRANSFORMER_THREADS[] = n +end + # Definitions and methods for tensors #------------------------------------- # general definitions diff --git a/src/auxiliary/auxiliary.jl b/src/auxiliary/auxiliary.jl index bda29d554..cb90589e0 100644 --- a/src/auxiliary/auxiliary.jl +++ b/src/auxiliary/auxiliary.jl @@ -55,3 +55,27 @@ _interleave(::Tuple{}, ::Tuple{}) = () function _interleave(a::NTuple{N}, b::NTuple{N}) where {N} return (a[1], b[1], _interleave(tail(a), tail(b))...) end + +# Low-overhead implementation of `copyto!` for specific case of `stride(B, 1) < stride(B, 2)` +# used in indexmanipulations: avoids the overhead of Strided.jl +function _copyto!(A::StridedView{<:Any,1}, B::StridedView{<:Any,2}) + length(A) == length(B) || throw(DimensionMismatch()) + + Adata = parent(A) + Astr = stride(A, 1) + IA = A.offset + + Bdata = parent(B) + Bstr = strides(B) + + IB_1 = B.offset + @inbounds for _ in axes(B, 2) + IB = IB_1 + for _ in axes(B, 1) + Adata[IA += Astr] = Bdata[IB += Bstr[1]] + end + IB_1 += Bstr[2] + end + + return A +end diff --git a/src/spaces/homspace.jl b/src/spaces/homspace.jl index bdb534663..b2ac5f894 100644 --- a/src/spaces/homspace.jl +++ b/src/spaces/homspace.jl @@ -225,11 +225,15 @@ end # Block and fusion tree ranges: structure information for building tensors #-------------------------------------------------------------------------- + +# sizes, strides, offset +const StridedStructure{N} = Tuple{NTuple{N,Int},NTuple{N,Int},Int} + struct FusionBlockStructure{I,N,F₁,F₂} totaldim::Int blockstructure::SectorDict{I,Tuple{Tuple{Int,Int},UnitRange{Int}}} fusiontreelist::Vector{Tuple{F₁,F₂}} - fusiontreestructure::Vector{Tuple{NTuple{N,Int},NTuple{N,Int},Int}} + fusiontreestructure::Vector{StridedStructure{N}} fusiontreeindices::FusionTreeDict{Tuple{F₁,F₂},Int} end diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl index 24478e3f7..c4e2b9228 100644 --- a/src/tensors/indexmanipulations.jl +++ b/src/tensors/indexmanipulations.jl @@ -453,177 +453,239 @@ end function add_transform!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, - (p₁, p₂)::Index2Tuple, + p::Index2Tuple, transformer, α::Number, β::Number, backend::AbstractBackend...) @boundscheck begin - permute(space(tsrc), (p₁, p₂)) == space(tdst) || + permute(space(tsrc), p) == space(tdst) || throw(SpaceMismatch("source = $(codomain(tsrc))←$(domain(tsrc)), - dest = $(codomain(tdst))←$(domain(tdst)), p₁ = $(p₁), p₂ = $(p₂)")) + dest = $(codomain(tdst))←$(domain(tdst)), p₁ = $(p[1]), p₂ = $(p[2])")) end - if p₁ === codomainind(tsrc) && p₂ === domainind(tsrc) + if p[1] === codomainind(tsrc) && p[2] === domainind(tsrc) add!(tdst, tsrc, α, β) else - add_transform_kernel!(tdst, tsrc, (p₁, p₂), transformer, α, β, backend...) + I = sectortype(tdst) + if I === Trivial + _add_trivial_kernel!(tdst, tsrc, p, transformer, α, β, backend...) + elseif FusionStyle(I) === UniqueFusion() + if use_threaded_transform(tdst, transformer) + _add_abelian_kernel_threaded!(tdst, tsrc, p, transformer, α, β, backend...) + else + _add_abelian_kernel_nonthreaded!(tdst, tsrc, p, transformer, α, β, + backend...) + end + else + if use_threaded_transform(tdst, transformer) + _add_general_kernel_threaded!(tdst, tsrc, p, transformer, α, β, backend...) + else + _add_general_kernel_nonthreaded!(tdst, tsrc, p, transformer, α, β, + backend...) + end + end end return tdst end -function add_transform_kernel!(tdst::TensorMap, - tsrc::TensorMap, - (p₁, p₂)::Index2Tuple, - ::TrivialTreeTransformer, - α::Number, - β::Number, - backend::AbstractBackend...) - return TO.tensoradd!(tdst[], tsrc[], (p₁, p₂), false, α, β, backend...) +function use_threaded_transform(t::TensorMap, transformer) + return get_num_transformer_threads() > 1 && length(t.data) > Strided.MINTHREADLENGTH +end +function use_threaded_transform(t::AbstractTensorMap, transformer) + return get_num_transformer_threads() > 1 && dim(space(t)) > Strided.MINTHREADLENGTH end -function add_transform_kernel!(tdst::TensorMap, - tsrc::TensorMap, - (p₁, p₂)::Index2Tuple, - transformer::AbelianTreeTransformer, - α::Number, - β::Number, - backend::AbstractBackend...) - structure_dst = transformer.structure_dst.fusiontreestructure - structure_src = transformer.structure_src.fusiontreestructure - - # TODO: this could be multithreaded - for (row, col, val) in zip(transformer.rows, transformer.cols, transformer.vals) - sz_dst, str_dst, offset_dst = structure_dst[col] - subblock_dst = StridedView(tdst.data, sz_dst, str_dst, offset_dst) - - sz_src, str_src, offset_src = structure_src[row] - subblock_src = StridedView(tsrc.data, sz_src, str_src, offset_src) +# Trivial implementations +# ----------------------- +function _add_trivial_kernel!(tdst, tsrc, p, transformer, α, β, backend...) + TO.tensoradd!(tdst[], tsrc[], p, false, α, β, backend...) + return nothing +end - TO.tensoradd!(subblock_dst, subblock_src, (p₁, p₂), false, α * val, β, backend...) +# Abelian implementations +# ----------------------- +function _add_abelian_kernel_nonthreaded!(tdst, tsrc, p, + transformer::AbelianTreeTransformer, + α, β, backend...) + for subtransformer in transformer.data + _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend...) end - return nothing end -function add_transform_kernel!(tdst::TensorMap, - tsrc::TensorMap, - (p₁, p₂)::Index2Tuple, - transformer::GenericTreeTransformer, - α::Number, - β::Number, - backend::AbstractBackend...) - structure_dst = transformer.structure_dst.fusiontreestructure - structure_src = transformer.structure_src.fusiontreestructure - - rows = rowvals(transformer.matrix) - vals = nonzeros(transformer.matrix) - - # TODO: this could be multithreaded - for j in axes(transformer.matrix, 2) - sz_dst, str_dst, offset_dst = structure_dst[j] - subblock_dst = StridedView(tdst.data, sz_dst, str_dst, offset_dst) - nzrows = nzrange(transformer.matrix, j) - - # treat first entry - sz_src, str_src, offset_src = structure_src[rows[first(nzrows)]] - subblock_src = StridedView(tsrc.data, sz_src, str_src, offset_src) - TO.tensoradd!(subblock_dst, subblock_src, (p₁, p₂), false, α * vals[first(nzrows)], - β, - backend...) - - # treat remaining entries - for i in @view(nzrows[2:end]) - sz_src, str_src, offset_src = structure_src[rows[i]] - subblock_src = StridedView(tsrc.data, sz_src, str_src, offset_src) - TO.tensoradd!(subblock_dst, subblock_src, (p₁, p₂), false, α * vals[i], One(), - backend...) +function _add_abelian_kernel_threaded!(tdst, tsrc, p, transformer::AbelianTreeTransformer, + α, β, backend...; + ntasks::Int=get_num_transformer_threads()) + nblocks = length(transformer.data) + counter = Threads.Atomic{Int}(1) + Threads.@sync for _ in 1:min(ntasks, nblocks) + Threads.@spawn begin + while true + local_counter = Threads.atomic_add!(counter, 1) + local_counter > nblocks && break + @inbounds subtransformer = transformer.data[local_counter] + _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend...) + end end end + return nothing +end - return tdst +function _add_transform_single!(tdst, tsrc, p, + (coeff, struct_dst, struct_src)::_AbelianTransformerData, + α, β, backend...) + subblock_dst = StridedView(tdst.data, struct_dst...) + subblock_src = StridedView(tsrc.data, struct_src...) + TO.tensoradd!(subblock_dst, subblock_src, p, false, α * coeff, β, backend...) + return nothing end -function add_transform_kernel!(tdst::AbstractTensorMap, - tsrc::AbstractTensorMap, - (p₁, p₂)::Index2Tuple, - fusiontreetransform::Function, - α::Number, - β::Number, - backend::AbstractBackend...) - I = sectortype(spacetype(tdst)) +function _add_abelian_kernel_nonthreaded!(tdst, tsrc, p, transformer, α, β, backend...) + for (f₁, f₂) in fusiontrees(tsrc) + _add_abelian_block!(tdst, tsrc, p, transformer, f₁, f₂, α, β, backend...) + end + return nothing +end - if I === Trivial - _add_trivial_kernel!(tdst, tsrc, (p₁, p₂), fusiontreetransform, α, β, backend...) - elseif FusionStyle(I) isa UniqueFusion - _add_abelian_kernel!(tdst, tsrc, (p₁, p₂), fusiontreetransform, α, β, backend...) - else - _add_general_kernel!(tdst, tsrc, (p₁, p₂), fusiontreetransform, α, β, backend...) +function _add_abelian_kernel_threaded!(tdst, tsrc, p, transformer, α, β, backend...) + Threads.@sync for (f₁, f₂) in fusiontrees(tsrc) + Threads.@spawn _add_abelian_block!(tdst, tsrc, p, transformer, f₁, f₂, α, β, + backend...) end + return nothing +end +function _add_abelian_block!(tdst, tsrc, p, transformer, f₁, f₂, α, β, backend...) + (f₁′, f₂′), coeff = first(transformer(f₁, f₂)) + @inbounds TO.tensoradd!(tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * coeff, β, + backend...) return nothing end -# internal methods: no argument types -function _add_trivial_kernel!(tdst, tsrc, p, fusiontreetransform, α, β, backend...) - TO.tensoradd!(tdst[], tsrc[], p, false, α, β, backend...) +# Non-abelian implementations +# --------------------------- +function _add_general_kernel_nonthreaded!(tdst, tsrc, p, + transformer::GenericTreeTransformer, + α, β, backend...) + # preallocate buffers + buffers = allocate_buffers(tdst, tsrc, transformer) + + for subtransformer in transformer.data + # Special case without intermediate buffers whenever there is only a single block + if length(subtransformer[1]) == 1 + _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend...) + else + _add_transform_multi!(tdst, tsrc, p, subtransformer, buffers, α, β, backend...) + end + end return nothing end -function _add_abelian_kernel!(tdst, tsrc, p, fusiontreetransform, α, β, backend...) - if Threads.nthreads() > 1 - Threads.@sync for (f₁, f₂) in fusiontrees(tsrc) - Threads.@spawn _add_abelian_block!(tdst, tsrc, p, fusiontreetransform, - f₁, f₂, α, β, backend...) +function _add_general_kernel_threaded!(tdst, tsrc, p, transformer::GenericTreeTransformer, + α, β, backend...; + ntasks::Int=get_num_transformer_threads()) + nblocks = length(transformer.data) + + counter = Threads.Atomic{Int}(1) + Threads.@sync for _ in 1:min(ntasks, nblocks) + Threads.@spawn begin + # preallocate buffers for each task + buffers = allocate_buffers(tdst, tsrc, transformer) + + while true + local_counter = Threads.atomic_add!(counter, 1) + local_counter > nblocks && break + @inbounds subtransformer = transformer.data[local_counter] + if length(subtransformer[1]) == 1 + _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend...) + else + _add_transform_multi!(tdst, tsrc, p, subtransformer, buffers, + α, β, backend...) + end + end end - else - for (f₁, f₂) in fusiontrees(tsrc) - _add_abelian_block!(tdst, tsrc, p, fusiontreetransform, - f₁, f₂, α, β, backend...) + end + + return nothing +end + +function _add_general_kernel_nonthreaded!(tdst, tsrc, p, transformer, α, β, backend...) + if iszero(β) + tdst = zerovector!(tdst) + elseif !isone(β) + tdst = scale!(tdst, β) + end + for (f₁, f₂) in fusiontrees(tsrc) + for ((f₁′, f₂′), coeff) in transformer(f₁, f₂) + @inbounds TO.tensoradd!(tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * coeff, + One(), backend...) end end return nothing end -function _add_abelian_block!(tdst, tsrc, p, fusiontreetransform, f₁, f₂, α, β, backend...) - (f₁′, f₂′), coeff = first(fusiontreetransform(f₁, f₂)) - @inbounds TO.tensoradd!(tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * coeff, β, - backend...) +function _add_transform_single!(tdst, tsrc, p, + (basistransform, structs_dst, + structs_src)::_GenericTransformerData, + α, β, backend...) + struct_dst = (structs_dst[1], only(structs_dst[2])...) + struct_src = (structs_src[1], only(structs_src[2])...) + coeff = only(basistransform) + _add_transform_single!(tdst, tsrc, p, (coeff, struct_dst, struct_src), α, β, backend...) return nothing end -function _add_general_kernel!(tdst, tsrc, p, fusiontreetransform, α, β, backend...) +function _add_transform_multi!(tdst, tsrc, p, + (basistransform, (sz_dst, structs_dst), + (sz_src, structs_src)), + (buffer1, buffer2), α, β, backend...) + rows, cols = size(basistransform) + blocksize = prod(sz_src) + matsize = (prod(TupleTools.getindices(sz_src, codomainind(tsrc))), + prod(TupleTools.getindices(sz_src, domainind(tsrc)))) + + # Filling up a buffer with contiguous data + buffer_src = StridedView(buffer2, (blocksize, cols), (1, blocksize), 0) + for (i, struct_src) in enumerate(structs_src) + subblock_src = sreshape(StridedView(tsrc.data, sz_src, struct_src...), matsize) + _copyto!(buffer_src[:, i], subblock_src) + end + + # Resummation into a second buffer using BLAS + buffer_dst = StridedView(buffer1, (blocksize, rows), (1, blocksize), 0) + mul!(buffer_dst, buffer_src, basistransform, α, Zero()) + + # Filling up the output + for (i, struct_dst) in enumerate(structs_dst) + subblock_dst = StridedView(tdst.data, sz_dst, struct_dst...) + bufblock_dst = sreshape(buffer_dst[:, i], sz_src) + TO.tensoradd!(subblock_dst, bufblock_dst, p, false, One(), β, backend...) + end + + return nothing +end + +function _add_general_kernel_threaded!(tdst, tsrc, p, transformer, α, β, backend...) if iszero(β) tdst = zerovector!(tdst) - elseif β != 1 + elseif !isone(β) tdst = scale!(tdst, β) end - β′ = One() - if Threads.nthreads() > 1 - Threads.@sync for s₁ in sectors(codomain(tsrc)), s₂ in sectors(domain(tsrc)) - Threads.@spawn _add_nonabelian_sector!(tdst, tsrc, p, fusiontreetransform, s₁, - s₂, α, β′, backend...) - end - else - for (f₁, f₂) in fusiontrees(tsrc) - for ((f₁′, f₂′), coeff) in fusiontreetransform(f₁, f₂) - @inbounds TO.tensoradd!(tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * coeff, - β′, backend...) - end - end + Threads.@sync for s₁ in sectors(codomain(tsrc)), s₂ in sectors(domain(tsrc)) + Threads.@spawn _add_nonabelian_sector!(tdst, tsrc, p, transformer, s₁, s₂, α, + backend...) end return nothing end -# TODO: β argument is weird here because it has to be 1 -function _add_nonabelian_sector!(tdst, tsrc, p, fusiontreetransform, s₁, s₂, α, β, - backend...) +function _add_nonabelian_sector!(tdst, tsrc, p, fusiontreetransform, s₁, s₂, α, backend...) for (f₁, f₂) in fusiontrees(tsrc) (f₁.uncoupled == s₁ && f₂.uncoupled == s₂) || continue for ((f₁′, f₂′), coeff) in fusiontreetransform(f₁, f₂) - @inbounds TO.tensoradd!(tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * coeff, β, - backend...) + @inbounds TO.tensoradd!(tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * coeff, + One(), backend...) end end return nothing diff --git a/src/tensors/treetransformers.jl b/src/tensors/treetransformers.jl index 053c5f3ba..7c349c885 100644 --- a/src/tensors/treetransformers.jl +++ b/src/tensors/treetransformers.jl @@ -7,68 +7,158 @@ abstract type TreeTransformer end struct TrivialTreeTransformer <: TreeTransformer end -struct AbelianTreeTransformer{T,I,N,F1,F2,F3,F4} <: TreeTransformer - rows::Vector{Int} - cols::Vector{Int} - vals::Vector{T} - structure_dst::FusionBlockStructure{I,N,F1,F2} - structure_src::FusionBlockStructure{I,N,F3,F4} -end +const _AbelianTransformerData{T,N} = Tuple{T,StridedStructure{N},StridedStructure{N}} -struct GenericTreeTransformer{T,I,N,F1,F2,F3,F4} <: TreeTransformer - matrix::SparseMatrixCSC{T,Int} - structure_dst::FusionBlockStructure{I,N,F1,F2} - structure_src::FusionBlockStructure{I,N,F3,F4} +struct AbelianTreeTransformer{T,N} <: TreeTransformer + data::Vector{_AbelianTransformerData{T,N}} end -function treetransformertype(Vdst, Vsrc) - I = sectortype(Vdst) - I === Trivial && return TrivialTreeTransformer +function AbelianTreeTransformer(transform, p, Vdst, Vsrc) + t₀ = Base.time() + permute(Vsrc, p) == Vdst || throw(SpaceMismatch("Incompatible spaces for permuting.")) + structure_dst = fusionblockstructure(Vdst) + structure_src = fusionblockstructure(Vsrc) - N = numind(Vdst) - F1 = fusiontreetype(I, numout(Vdst)) - F2 = fusiontreetype(I, numin(Vdst)) - F3 = fusiontreetype(I, numout(Vsrc)) - F4 = fusiontreetype(I, numin(Vsrc)) - - if FusionStyle(I) isa UniqueFusion - return AbelianTreeTransformer{sectorscalartype(I),I,N,F1,F2,F3,F4} - else - return GenericTreeTransformer{sectorscalartype(I),I,N,F1,F2,F3,F4} + L = length(structure_src.fusiontreelist) + T = sectorscalartype(sectortype(Vdst)) + N = numind(Vsrc) + data = Vector{Tuple{T,StridedStructure{N},StridedStructure{N}}}(undef, L) + + for i in 1:L + f₁, f₂ = structure_src.fusiontreelist[i] + (f₃, f₄), coeff = only(transform(f₁, f₂)) + j = structure_dst.fusiontreeindices[(f₃, f₄)] + stridestructure_dst = structure_dst.fusiontreestructure[j] + stridestructure_src = structure_src.fusiontreestructure[i] + data[i] = (coeff, stridestructure_dst, stridestructure_src) end + + transformer = AbelianTreeTransformer(data) + + # sort by (approximate) weight to facilitate multi-threading strategies + # sort!(transformer) + + Δt = Base.time() - t₀ + + @debug("Treetransformer for $Vsrc to $Vdst via $p", nblocks = L, Δt) + + return transformer end -function TreeTransformer(transform::Function, Vdst::HomSpace{S}, - Vsrc::HomSpace{S}) where {S} - I = sectortype(Vdst) - I === Trivial && return TrivialTreeTransformer() +const _GenericTransformerData{T,N} = Tuple{Matrix{T}, + Tuple{NTuple{N,Int}, + Vector{Tuple{NTuple{N,Int},Int}}}, + Tuple{NTuple{N,Int}, + Vector{Tuple{NTuple{N,Int},Int}}}} + +struct GenericTreeTransformer{T,N} <: TreeTransformer + data::Vector{_GenericTransformerData{T,N}} +end +function GenericTreeTransformer(transform, p, Vdst, Vsrc) + t₀ = Base.time() + permute(Vsrc, p) == Vdst || throw(SpaceMismatch("Incompatible spaces for permuting.")) structure_dst = fusionblockstructure(Vdst) structure_src = fusionblockstructure(Vsrc) + I = sectortype(Vsrc) + + uncoupleds_src = map(structure_src.fusiontreelist) do (f₁, f₂) + return TupleTools.vcat(f₁.uncoupled, f₂.uncoupled) + end + uncoupleds_src_unique = unique(uncoupleds_src) + + uncoupleds_dst = map(structure_dst.fusiontreelist) do (f₁, f₂) + return TupleTools.vcat(f₁.uncoupled, f₂.uncoupled) + end - rows = Int[] - cols = Int[] - vals = sectorscalartype(sectortype(Vdst))[] + T = sectorscalartype(I) + N = numind(Vdst) + L = length(uncoupleds_src_unique) + data = Vector{_GenericTransformerData{T,N}}(undef, L) + + # TODO: this can be multithreaded + for (i, uncoupled) in enumerate(uncoupleds_src_unique) + ids_src = findall(==(uncoupled), uncoupleds_src) + fusiontrees_outer_src = structure_src.fusiontreelist[ids_src] + + uncoupled_dst = TupleTools.getindices(uncoupled, (p[1]..., p[2]...)) + ids_dst = findall(==(uncoupled_dst), uncoupleds_dst) + + fusiontrees_outer_dst = structure_dst.fusiontreelist[ids_dst] - for (row, (f1, f2)) in enumerate(structure_src.fusiontreelist) - for ((f3, f4), coeff) in transform(f1, f2) - col = structure_dst.fusiontreeindices[(f3, f4)] - push!(rows, row) - push!(cols, col) - push!(vals, coeff) + matrix = zeros(sectorscalartype(I), length(ids_dst), length(ids_src)) + for (row, (f₁, f₂)) in enumerate(fusiontrees_outer_src) + for ((f₃, f₄), coeff) in transform(f₁, f₂) + col = findfirst(==((f₃, f₄)), fusiontrees_outer_dst)::Int + matrix[row, col] = coeff + end end + + structs_src = structure_src.fusiontreestructure[ids_src] + sz_src = structs_src[1][1] + newstructs_src = map(x -> (x[2], x[3]), structs_src) + + structs_dst = structure_dst.fusiontreestructure[ids_dst] + sz_dst = structs_dst[1][1] + newstructs_dst = map(x -> (x[2], x[3]), structs_dst) + + @debug("Created recoupling block for uncoupled: $uncoupled", + sz = size(matrix), sparsity = count(!iszero, matrix) / length(matrix)) + + data[i] = (matrix, (sz_dst, newstructs_dst), (sz_src, newstructs_src)) end - if FusionStyle(I) isa UniqueFusion - return AbelianTreeTransformer(rows, cols, vals, structure_dst, structure_src) - else - ldst = length(structure_dst.fusiontreelist) - lsrc = length(structure_src.fusiontreelist) - matrix = sparse(rows, cols, vals, ldst, lsrc) - return GenericTreeTransformer(matrix, structure_dst, structure_src) + transformer = GenericTreeTransformer{T,N}(data) + + # sort by (approximate) weight to facilitate multi-threading strategies + sort!(transformer) + + Δt = Base.time() - t₀ + + @debug("TreeTransformer for $Vsrc to $Vdst via $p", + nblocks = length(data), + sz_median = size(data[end ÷ 2][1], 1), + sz_max = size(data[1][1], 1), + Δt) + + return transformer +end + +function buffersize(transformer::GenericTreeTransformer) + return maximum(transformer.data; init=0) do (basistransform, structures_dst, _) + return prod(structures_dst[1]) * size(basistransform, 1) end end +function allocate_buffers(tdst::TensorMap, tsrc::TensorMap, + transformer::GenericTreeTransformer) + sz = buffersize(transformer) + return similar(tdst.data, sz), similar(tsrc.data, sz) +end + +function treetransformertype(Vdst, Vsrc) + I = sectortype(Vdst) + I === Trivial && return TrivialTreeTransformer + + T = sectorscalartype(I) + N = numind(Vdst) + return FusionStyle(I) == UniqueFusion() ? AbelianTreeTransformer{T,N} : + GenericTreeTransformer{T,N} +end + +function TreeTransformer(transform::Function, p, Vdst::HomSpace{S}, + Vsrc::HomSpace{S}) where {S} + permute(Vsrc, p) == Vdst || + throw(SpaceMismatch("Incompatible spaces for permuting")) + + I = sectortype(Vdst) + I === Trivial && return TrivialTreeTransformer() + + return FusionStyle(I) == UniqueFusion() ? + AbelianTreeTransformer(transform, p, Vdst, Vsrc) : + GenericTreeTransformer(transform, p, Vdst, Vsrc) +end + # braid is special because it has levels function treebraider(::AbstractTensorMap, ::AbstractTensorMap, p::Index2Tuple, levels) return fusiontreetransform(f1, f2) = braid(f1, f2, levels..., p...) @@ -79,7 +169,7 @@ end @cached function treebraider(Vdst::TensorMapSpace, Vsrc::TensorMapSpace, p::Index2Tuple, levels)::treetransformertype(Vdst, Vsrc) fusiontreebraider(f1, f2) = braid(f1, f2, levels..., p...) - return TreeTransformer(fusiontreebraider, Vdst, Vsrc) + return TreeTransformer(fusiontreebraider, p, Vdst, Vsrc) end for (transform, treetransformer) in @@ -94,9 +184,30 @@ for (transform, treetransformer) in @cached function $treetransformer(Vdst::TensorMapSpace, Vsrc::TensorMapSpace, p::Index2Tuple)::treetransformertype(Vdst, Vsrc) fusiontreetransform(f1, f2) = $transform(f1, f2, p...) - return TreeTransformer(fusiontreetransform, Vdst, Vsrc) + return TreeTransformer(fusiontreetransform, p, Vdst, Vsrc) end end end # default cachestyle is GlobalLRUCache + +# Sorting based on cost model +# --------------------------- +function Base.sort!(transformer::Union{AbelianTreeTransformer,GenericTreeTransformer}; + by=_transformer_weight, rev::Bool=true) + sort!(transformer.data; by, rev) + return transformer +end + +function _transformer_weight((coeff, struct_dst, struct_src)::_AbelianTransformerData) + return prod(struct_dst[1]) +end + +# Cost model for transforming a set of subblocks with fixed uncoupled sectors: +# L x L x length(subblock) where L is the number of subblocks +# this is L input blocks each going to L output blocks of given length +# Note that it might be the case that the permutations are dominant, in which case the +# actual cost model would scale like L x length(subblock) +function _transformer_weight((mat, structs_dst, structs_src)::_GenericTransformerData) + return length(mat) * prod(structs_dst[1]) +end