diff --git a/.gitignore b/.gitignore index 20fe29d..918691c 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ *.jl.mem /Manifest.toml /docs/build/ +LocalPreferences.toml +.vscode/settings.json diff --git a/Project.toml b/Project.toml index 76f3d20..dcff28e 100644 --- a/Project.toml +++ b/Project.toml @@ -7,6 +7,7 @@ version = "0.9.17" BitIntegers = "c3b6d118-76ef-56ca-8cc7-ebb389d030a1" EllipsisNotation = "da5c29d0-fa7d-589e-88eb-ea29b0a81949" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" QuadGK = "1fd47b50-473d-5c70-9696-f719f8f3bcdc" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" @@ -16,6 +17,7 @@ EllipsisNotation = "1" QuadGK = "2.9" Random = "1.10.0" julia = "1.6" +MPI = "0.20.22" [extras] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" diff --git a/docs/src/documentation.md b/docs/src/documentation.md index 2c28b27..d406150 100644 --- a/docs/src/documentation.md +++ b/docs/src/documentation.md @@ -56,3 +56,8 @@ Modules = [TensorCrossInterpolation] Pages = ["cachedfunction.jl", "batcheval.jl", "util.jl", "globalsearch.jl"] ``` +## Parallel utility +```@autodocs +Modules = [TensorCrossInterpolation] +Pages = ["mpi.jl"] +``` \ No newline at end of file diff --git a/src/TensorCrossInterpolation.jl b/src/TensorCrossInterpolation.jl index cad8cd5..32777f3 100644 --- a/src/TensorCrossInterpolation.jl +++ b/src/TensorCrossInterpolation.jl @@ -3,8 +3,10 @@ module TensorCrossInterpolation using LinearAlgebra using EllipsisNotation using BitIntegers -import QuadGK +using MPI +using Base.Threads +import QuadGK # To add a method for rank(tci) import LinearAlgebra: rank, diag import LinearAlgebra as LA @@ -40,5 +42,6 @@ include("conversion.jl") include("integration.jl") include("contraction.jl") include("globalsearch.jl") +include("mpi.jl") end diff --git a/src/abstracttensortrain.jl b/src/abstracttensortrain.jl index 3ff96cc..8c63ea4 100644 --- a/src/abstracttensortrain.jl +++ b/src/abstracttensortrain.jl @@ -131,6 +131,26 @@ function evaluate( return only(prod(T[:, i, :] for (T, i) in zip(tt, indexset))) end +""" + function evaluate( + tt::TensorTrain{V}, + indexset::Union{AbstractVector{LocalIndex}, NTuple{N, LocalIndex}} + )::V where {N, V} + +Evaluates the tensor train `tt` at indices given by `indexset` and `jndexset`. This is ment to be used for MPOs. +""" +function evaluate( + tt::AbstractTensorTrain{V}, + indexset::Union{AbstractVector{LocalIndex},NTuple{N,LocalIndex}}, + jndexset::Union{AbstractVector{LocalIndex},NTuple{N,LocalIndex}} +)::V where {N,V} + if length(indexset) != length(tt) + throw(ArgumentError("To evaluate a tt of length $(length(tt)), you have to provide $(length(tt)) indices, but there were $(length(indexset)).")) + end + return only(prod(T[:, i, j, :] for (T, i, j) in zip(tt, indexset, jndexset))) +end + + """ function evaluate(tt::TensorTrain{V}, indexset::CartesianIndex) where {V} @@ -175,6 +195,38 @@ function sum(tt::AbstractTensorTrain{V}) where {V} return only(v) end +""" + function average(tt::TensorTrain{V}) where {V} + +Evaluates the average of the tensor train approximation over all lattice sites in an efficient +factorized manner. +""" +function average(tt::AbstractTensorTrain{V}) where {V} + v = transpose(sum(tt[1], dims=(1, 2))[1, 1, :]) / length(tt[1][1, :, 1]) + for T in tt[2:end] + v *= sum(T, dims=2)[:, 1, :] / length(T[1, :, 1]) + end + return only(v) +end + +""" + function weightedsum(tt::TensorTrain{V}, w::Vector{V}) where {V} + +Evaluates the weighted sum of the tensor train approximation over all lattice sites in an efficient +factorized manner, where w is the vector of vector of weights which has the same length and the same sizes as tt. +""" +function weightedsum(tt::AbstractTensorTrain{V}, w::Vector{Vector{V}}) where {V} + length(tt) == length(w) || throw(DimensionMismatch("The length of the Tensor Train is different from the one of the weight vector ($(length(tt)) and $(length(w))).")) + size(tt[1])[2] == length(w[1]) || throw(DimensionMismatch("The dimension at site 1 of the Tensor Train is different from the one of the weight vector ($(size(tt[1])[2]) and $(length(w[1]))).")) + v = transpose(sum(tt[1].*w[1]', dims=(1, 2))[1, 1, :]) + for i in 2:length(tt) + size(tt[i])[2] == length(w[i]) || throw(DimensionMismatch("The dimension at site $(i) of the Tensor Train is different from the one of the weight vector ($(size(tt[i])[2]) and $(length(w[i]))).")) + v *= sum(tt[i].*w[i]', dims=2)[:, 1, :] + end + return only(v) +end + + function _addtttensor( A::Array{V}, B::Array{V}; factorA=one(V), factorB=one(V), @@ -215,7 +267,7 @@ See also: [`+`](@ref) function add( lhs::AbstractTensorTrain{V}, rhs::AbstractTensorTrain{V}; factorlhs=one(V), factorrhs=one(V), - tolerance::Float64=0.0, maxbonddim::Int=typemax(Int) + tolerance::Float64=0.0, maxbonddim::Int=typemax(Int), normalizeerror::Bool=true ) where {V} if length(lhs) != length(rhs) throw(DimensionMismatch("Two tensor trains with different length ($(length(lhs)) and $(length(rhs))) cannot be added elementwise.")) @@ -233,7 +285,7 @@ function add( for ell in 1:L ] ) - compress!(tt, :SVD; tolerance, maxbonddim) + compress!(tt, :SVD; tolerance, maxbonddim, normalizeerror) return tt end @@ -247,9 +299,9 @@ Subtract two tensor trains `lhs` and `rhs`. See [`add`](@ref). """ function subtract( lhs::AbstractTensorTrain{V}, rhs::AbstractTensorTrain{V}; - tolerance::Float64=0.0, maxbonddim::Int=typemax(Int) + tolerance::Float64=0.0, maxbonddim::Int=typemax(Int), normalizeerror::Bool=true ) where {V} - return add(lhs, rhs; factorrhs=-1 * one(V), tolerance, maxbonddim) + return add(lhs, rhs; factorrhs=-1 * one(V), tolerance, maxbonddim, normalizeerror) end @doc raw""" @@ -270,6 +322,174 @@ function Base.:-(lhs::AbstractTensorTrain{V}, rhs::AbstractTensorTrain{V}) where return subtract(lhs, rhs) end +function leftcanonicalize!(tt::AbstractTensorTrain{ValueType}) where {ValueType} + n = length(tt) # Number of sites + for i in 1:n-1 + Q, R = qr(reshape(tt[i], prod(size(tt[i])[1:end-1]), size(tt[i])[end])) + Q = Matrix(Q) + + tt[i] = reshape(Q, size(tt[i])[1:end-1]..., size(Q, 2)) # New bond dimension after Q + + tmptt = reshape(tt[i+1], size(R, 2), :) # Reshape next tensor + tmptt .= Matrix(R) * tmptt + tt[i+1] = reshape(tmptt, size(tt[i+1])...) # Reshape back + end +end + +# This creates a TensorTrain which has every site right-canonical except the last +function rightcanonicalize!(tt::AbstractTensorTrain{ValueType}) where {ValueType} + n = length(tt) # Number of sites + for i in n:-1:2 + # Reshape W_i into a matrix (merging right bond and physical indices) + W = tt[i] + χl, d1, d2, χr = size(W) + W_mat = reshape(W, χl, d1*d2*χr) + + # Perform RQ decottsition: W_mat = R * Q + F = lq(reverse(W_mat, dims=1)) + R, Q = reverse(F.L), reverse(Matrix(F.Q), dims=1) # https://discourse.julialang.org/t/rq-decomposition/112795/13 + + # Reshape Q back into the MPO tensor + tt[i] = reshape(Q, size(Q, 1), d1, d2, χr) # New bond dimension after Q + + # Update the previous tt tensor by absorbing R + tmptt = reshape(tt[i-1], :, size(R, 1)) # Reshape previous tensor + tmptt .= tmptt * Matrix(R) + tt[i-1] = reshape(tmptt, size(tt[i-1], 1), d1, d2, size(tt[i-1], 4)) # Reshape back + end +end + + +function centercanonicalize!(tt::Vector{Array{ValueType, N}}, center::Int; old_center::Int=0) where {ValueType, N} + orthogonality = checkorthogonality(tt) + n = length(tt) # Number of sites + + if count(==( :N ), orthogonality) == 1 + old_center_ = findfirst(==( :N ), orthogonality) + if old_center_ == nothing # Useless, but help JET compiling + old_center_ = old_center + end + # println("Sto canonicalizzando centrando in $center. ho trovato il centro in $old_center_. Quindi flipperò: $(center < old_center_ ? [size(tt[i]) for i in center:old_center_] : [size(tt[i]) for i in old_center_:center])") + if old_center != 0 && old_center != old_center_ + println("Warning! In centercanonicalize!() old_center has been set as $old_center, but the real old center is $old_center_") + end + elseif old_center == 0 + old_center_ = 1 + else + old_center_ = old_center + end + # LEFT + for i in old_center_:center-1 + Q, R = qr(reshape(tt[i], prod(size(tt[i])[1:end-1]), size(tt[i])[end])) + Q = Matrix(Q) + + tt[i] = reshape(Q, size(tt[i])[1:end-1]..., size(Q, 2)) # New bond dimension after Q + + tmptt = reshape(tt[i+1], size(R, 2), :) # Reshape next tensor + tmptt = Matrix(R) * tmptt + tt[i+1] = reshape(tmptt, size(tt[i+1])...) # Reshape back + end + # RIGHT + if count(==( :N ), orthogonality) == 1 + old_center_ = findfirst(==( :N ), orthogonality) + if old_center_ == nothing # Useless, but help JET compiling + old_center_ = old_center + end + if old_center != 0 && old_center != old_center_ + println("Warning! In centercanonicalize!() old_center has been set as $old_center, but the real old center is $old_center_") + end + elseif old_center == 0 + old_center_ = n + else + old_center_ = old_center + end + for i in old_center_:-1:center+1 + W = tt[i] + χl, d1, d2, χr = size(W) + W_mat = reshape(W, χl, d1*d2*χr) + + L, Q = lq(W_mat) + Q = Matrix(Q) + # Reshape Q back into the tt tensor + tt[i] = reshape(Q, size(Q, 1), d1, d2, χr) # New bond dimension after Q + + # Update the previous tt tensor by absorbing L + tmptt = reshape(tt[i-1], :, size(L, 1)) # Reshape previous tensor + tmptt = tmptt * Matrix(L) + tt[i-1] = reshape(tmptt, size(tt[i-1], 1), d1, d2, size(tmptt, 2)) # Reshape back + end +end + +function move_center_right!(tt, i) + A = tt[i] + d = size(A) + A_mat = reshape(A, prod(d[1:end-1]), d[end]) + Q, R = qr(A_mat) + Q = Matrix(Q) + tt[i] = reshape(Q, d[1:end-1]..., size(Q, 2)) + + B = tt[i+1] + B_mat = reshape(B, size(R, 2), :) + B_mat .= Matrix(R) * B_mat + tt[i+1] = reshape(B_mat, size(B)...) +end + +function move_center_left!(tt, i) + A = tt[i] + d = size(A) + A_mat = reshape(A, d[1], prod(d[2:end])) + L, Q = lq(A_mat) + Q = Matrix(Q) + tt[i] = reshape(Q, size(Q,1), d[2:end]...) + + B = tt[i-1] + B_mat = reshape(B, :, size(L,1)) + B_mat .= B_mat * Matrix(L) + tt[i-1] = reshape(B_mat, size(B)[1:3]..., size(L,1)) +end + + +function leftcanonicalize(tt::AbstractTensorTrain{ValueType}) where {ValueType} + tt_ = deepcopy(tt) + leftcanonicalize!(tt_) + return tt_ +end + +# This creates a TensorTrain which has every site right-canonical except the last +function rightcanonicalize(tt::AbstractTensorTrain{ValueType}) where {ValueType} + tt_ = deepcopy(tt) + rightcanonicalize!(tt_) + return tt_ +end + +# This creates a TensorTrain which has every site right-canonical except the last +function centercanonicalize(tt::Vector{Array{ValueType, N}}, center::Int; old_center::Int=0) where {ValueType, N} + tt_ = deepcopy(tt) + centercanonicalize!(tt_, center; old_center) + return tt_ +end + +function checkorthogonality(tt::Vector{Array{ValueType, N}}) where {ValueType, N} + ort = Vector{Symbol}(undef, length(tt)) + for i in 1:length(tt) + W = tt[i] + left_check = _contract(permutedims(W, (4,2,3,1,)), W, (2,3,4,),(2,3,1)) + right_check = _contract(W, permutedims(W, (4,2,3,1,)), (2,3,4,),(2,3,1)) + is_left = isapprox(left_check, I, atol=1e-7) + is_right = isapprox(right_check, I, atol=1e-7) + ort[i] = if is_left && is_right + :O # Orthogonal + elseif is_left + :L # Left orthogonal + elseif is_right + :R # Right orthogonal + else + :N # Non orthogonal + end + end + return ort +end + """ Squared Frobenius norm of a tensor train. """ diff --git a/src/contraction.jl b/src/contraction.jl index af847e6..b3ebd74 100644 --- a/src/contraction.jl +++ b/src/contraction.jl @@ -108,6 +108,40 @@ function _extend_cache(oldcache::Matrix{T}, a_ell::Array{T,4}, b_ell::Array{T,4} return _contract(tmp1, b_ell[:, :, j, :], (1, 2), (1, 2)) end +# Compute full left environment +function leftenvironment( + A::Vector{Array{T,4}}, + B::Vector{Array{T,4}}, + X::Vector{Array{T,4}}, + i::Int; + # L::Array{T, 3} = ones(T, 1, 1, 1), + L::Union{Nothing, Array{T, 3}} = nothing, + Li::Int = 1 +)::Array{T, 3} where {T} + L === nothing && (L = ones(T, 1, 1, 1)) + for ell in Li:i + L = _contract(_contract(_contract(L, A[ell], (1,), (1,)), B[ell], (1,4,), (1,2,)), conj(X[ell]), (1,2,4,), (1,2,3,)) + end + return L +end + +# Compute full right environment +function rightenvironment( + A::Vector{Array{T,4}}, + B::Vector{Array{T,4}}, + X::Vector{Array{T,4}}, + i::Int; + # R::Array{T, 3} = ones(T, 1, 1, 1), + R::Union{Nothing, Array{T, 3}} = nothing, + Ri::Int = length(A) + )::Array{T, 3} where {T} + R === nothing && (R = ones(T, 1, 1, 1)) + for ell in Ri:-1:i + R = permutedims(_contract(conj(X[ell]), _contract(B[ell], _contract(A[ell], R, (4,), (1,)), (2,4,), (3,4,)), (2,3,4,), (4,2,5,)), (3,2,1,)) + end + return R +end + # Compute left environment function evaluateleft( obj::Contraction{T}, @@ -443,8 +477,9 @@ function contract_zipup( A::TensorTrain{ValueType,4}, B::TensorTrain{ValueType,4}; tolerance::Float64=1e-12, - method::Symbol=:SVD, # :SVD, :LU - maxbonddim::Int=typemax(Int) + method::Symbol=:SVD, # :SVD, :RSVD, :LU, :CI + maxbonddim::Int=typemax(Int), + kwargs... ) where {ValueType} if length(A) != length(B) throw(ArgumentError("Cannot contract tensor trains with different length.")) @@ -455,13 +490,16 @@ function contract_zipup( for n in 1:length(A) # R: (link_ab, link_an, link_bn) # A[n]: (link_an, s_n, s_n', link_anp1) + #println("Time RA:") RA = _contract(R, A[n], (2,), (1,)) # RA[n]: (link_ab, link_bn, s_n, s_n' link_anp1) # B[n]: (link_bn, s_n', s_n'', link_bnp1) # C: (link_ab, s_n, link_anp1, s_n'', link_bnp1) # => (link_ab, s_n, s_n'', link_anp1, link_bnp1) + #println("Time C:") C = permutedims(_contract(RA, B[n], (2, 4), (1, 2)), (1, 2, 4, 3, 5)) + #println(size(C)) if n == length(A) sitetensors[n] = reshape(C, size(C)[1:3]..., 1) break @@ -485,6 +523,644 @@ function contract_zipup( return TensorTrain{ValueType,4}(sitetensors) end +function contract_distr_zipup( + A::TensorTrain{ValueType,4}, + B::TensorTrain{ValueType,4}; + tolerance::Float64=1e-12, + method::Symbol=:SVD, + p::Int=16, + maxbonddim::Int=typemax(Int), + estimatedbond::Union{Nothing,Int}=nothing, + subcomm::Union{Nothing, MPI.Comm}=nothing, + kwargs... +) where {ValueType} + if length(A) != length(B) + throw(ArgumentError("Cannot contract tensor trains with different length.")) + end + + R::Array{ValueType,3} = ones(ValueType, 1, 1, 1) + + if MPI.Initialized() + if subcomm != nothing + comm = subcomm + else + comm = MPI.COMM_WORLD + end + mpirank = MPI.Comm_rank(comm) + juliarank = mpirank + 1 + nprocs = MPI.Comm_size(comm) + if estimatedbond == nothing + estimatedbond = maxbonddim == typemax(Int) ? max(maximum(linkdims(A)), maximum(linkdims(B))) : maxbonddim + end + noderanges = _noderanges(nprocs, length(A), size(A[1])[2]*size(B[1])[3], estimatedbond; interbond=true, algorithm=:mpompo) + Us = Vector{Array{ValueType,3}}(undef, length(noderanges[juliarank])) + Vts = Vector{Array{ValueType,3}}(undef, length(noderanges[juliarank])) + else + println("Warning! Distributed zipup has been chosen, but MPI is not initialized, please use initializempi() before using contract() and use finalizempi() afterwards") + return contract_zipup(A, B; tolerance, method, maxbonddim, kwargs...) + end + + if nprocs == 1 + println("Warning! Distributed zipup has been chosen, but only one process is running") + return contract_zipup(A, B; tolerance, method, maxbonddim, kwargs...) + end + + finalsitetensors = Vector{Array{ValueType,4}}(undef, length(A)) + + time = time_ns() + if method == :SVD || method == :RSVD + if maxbonddim == typemax(Int) + println("Warning! distrzipup uses always Randomized SVD, which cannot accept maxbonddim=typemax(Int), it will be set to the maximum bond of A or B") + maxbonddim = max(maximum(linkdims(A)), maximum(linkdims(B))) + end + for (i, n) in enumerate(noderanges[juliarank]) + # Random matrix + G = randn(size(A[n])[end], size(B[n])[end], maxbonddim+p) + # Projection on smaller space + Y = _contract(A[n], G, (4,), (1,)) + Y = _contract(B[n], Y, (2,4,), (3,4)) + Y = permutedims(Y, (3,1,4,2,5,)) + + # QR decomposition + Q = Matrix(LinearAlgebra.qr!(reshape(Y, (prod(size(Y)[1:4]), size(Y)[end]))).Q) + Q = reshape(Q, (size(Y)[1:4]..., min(prod(size(Y)[1:4]), size(Y)[end]))) + Qt = permutedims(Q, (5, 3, 4, 1, 2)) + # Smaller object to SVD + to_svd = _contract(Qt, A[n], (2,4,), (2,1,)) + to_svd = _contract(to_svd, B[n], (2,3,4,), (3,1,2,)) + factorization = svd(reshape(to_svd, (size(to_svd)[1], prod(size(to_svd)[2:3])))) + newbonddimr = min( + replacenothing(findlast(>(tolerance), Array(factorization.S)), 1), + maxbonddim + ) + + U = _contract(Q, factorization.U[:, 1:newbonddimr], (5,), (1,)) + US = _contract(U, Diagonal(factorization.S[1:newbonddimr]), (5,), (1,)) + U, Vt, newbonddiml = _factorize( + reshape(US, prod(size(US)[1:2]), prod(size(US)[3:5])), + method; tolerance, maxbonddim, leftorthogonal=true + ) + U = reshape(U, (size(A[n])[1], size(B[n])[1], newbonddiml)) + finalsitetensors[n] = reshape(Vt, newbonddiml, size(US)[3:5]...) + + Us[i] = U + Vts[i] = reshape(factorization.Vt[1:newbonddimr, :], newbonddimr, size(to_svd)[end-1], size(to_svd)[end]) + end + elseif method == :CI || method == :LU + for (i, n) in enumerate(noderanges[juliarank]) + dimsA = size(A[n]) + dimsB = size(B[n]) + function f24(j,k; print=false) + b1, a1 = Tuple(CartesianIndices((dimsB[1], dimsA[1]))[j]) # Extract (a1, b1) from j + b4, a4, b3, a2 = Tuple(CartesianIndices((dimsB[4], dimsA[4], dimsB[3], dimsA[2]))[k]) # Extract (a2, b3, a4, b4) from k + sum(A[n][a1, a2, c, a4] * B[n][b1, c, b3, b4] for c in 1:dimsA[3]) # Summing over the contracted index + end + + # Non Zero element to start PRRLU decomposition + nz = nothing + size_A = size(A[n]) # (i, j, k, l) + size_B = size(B[n]) # (m, k, nn, o) + + for i in 1:size_A[1], m in 1:size_B[1], j in 1:size_A[2], nn in 1:size_B[3], l in 1:size_A[4], o in 1:size_B[4] + sum_val = zero(eltype(A[n])) + for k in 1:size_A[3] # k index contraction + if A[n][i, j, k, l] != 0 && B[n][m, k, nn, o] != 0 + sum_val += A[n][i, j, k, l] * B[n][m, k, nn, o] + end + end + if sum_val != 0 + nz = [i, m, j, nn, l, o] + break + end + end + + # CartesianIndices are read the other way around + I0 = [(nz[1]-1)*dimsB[1] + nz[2], (nz[3]-1)*dimsB[3]*dimsA[4]*dimsB[4] + (nz[4]-1)*dimsA[4]*dimsB[4] + (nz[5]-1)*dimsB[4] + nz[6]] + J0 = [(nz[1]-1)*dimsB[1] + nz[2], (nz[3]-1)*dimsB[3]*dimsA[4]*dimsB[4] + (nz[4]-1)*dimsA[4]*dimsB[4] + (nz[5]-1)*dimsB[4] + nz[6]] + factorization = arrlu(ValueType, f24, (dimsA[1]*dimsB[1], dimsA[2]*dimsB[3]*dimsA[4]*dimsB[4]), I0, J0; abstol=tolerance, maxrank=maxbonddim) + L = left(factorization) + U = right(factorization) + newbonddiml = npivots(factorization) + + U_3_2 = reshape(U, size(U)[1]*dimsA[2]*dimsB[3], dimsA[4]*dimsB[4]) + U, Vt, newbonddimr = _factorize(U_3_2, :SVD; tolerance, maxbonddim) + Us[i] = reshape(L, dimsA[1], dimsB[1], newbonddiml) + finalsitetensors[n] = reshape(U, newbonddiml, dimsA[2], dimsB[3], newbonddimr) + Vts[i] = reshape(Vt, newbonddimr, dimsA[4], dimsB[4]) + end + end + + # Exchange Vt to left + if MPI.Initialized() + MPI.Barrier(comm) + reqs = MPI.Request[] + if juliarank < nprocs + push!(reqs, MPI.isend(Vts[end], comm; dest=mpirank+1, tag=mpirank)) + end + if juliarank > 1 + Vts_l = MPI.recv(comm; source=mpirank-1, tag=mpirank-1) + end + MPI.Waitall(reqs) + MPI.Barrier(comm) + end + + # contraction + time = time_ns() + for (i, n) in enumerate(noderanges[juliarank]) + if i > 1 + VtU = _contract(Vts[i-1], Us[i], (2,3,), (1,2,)) + finalsitetensors[n] = _contract(VtU, finalsitetensors[n], (2,), (1,)) + elseif n > 1 + VtU = _contract(Vts_l, Us[i], (2,3,), (1,2,)) + finalsitetensors[n] = _contract(VtU, finalsitetensors[n], (2,), (1,)) + end + end + if juliarank == 1 + finalsitetensors[1] = _contract(_contract(R, Us[1], (2,3,), (1,2,)), finalsitetensors[1], (2,), (1,)) + end + if juliarank == nprocs + finalsitetensors[end] = _contract(finalsitetensors[end], _contract(Vts[end], R, (2,3,), (1,2,)), (4,), (1,)) + end + + # All to all exchange + if MPI.Initialized() + all_sizes = [length(noderanges[r]) for r in 1:nprocs] + shapes = [isassigned(finalsitetensors, i) ? size(finalsitetensors[i]) : (0,0,0,0) for i in 1:length(finalsitetensors)] + shapesbuffer = VBuffer(shapes, all_sizes) + MPI.Allgatherv!(shapesbuffer, comm) + + lengths = [sum(prod.(shapes)[noderanges[r]]) for r in 1:nprocs] + all_length = sum(prod.(shapes)) + vec_tensors = zeros(all_length) + + if juliarank == 1 + before_me = 0 + else + before_me = sum(prod.(shapes[1:noderanges[juliarank][1]-1])) + end + me = sum(prod.(shapes[noderanges[juliarank]])) + vec_tensors[before_me+1:before_me+me] = vcat([vec(finalsitetensors[i]) for i in 1:length(finalsitetensors) if isassigned(finalsitetensors, i)]...) + + sendrecvbuf = VBuffer(vec_tensors, lengths) + MPI.Allgatherv!(sendrecvbuf, comm) + + idx = 1 + for i in 1:length(finalsitetensors) + s = shapes[i] + len = prod(s) + finalsitetensors[i] = reshape(view(vec_tensors, idx:idx+len-1), s) + idx += len + end + end + MPI.Barrier(comm) + + return TensorTrain{ValueType,4}(finalsitetensors) +end + +# If Ri = i, then R is the right environment until site i+1, which has size (size(A[i])[end], size(B[i])[end], size(X[i]])[end]) +# If Li = i, then L is the left environment until site i-1, which has size (size(A[i])[1], size(B[i])[1], size(X[i]])[1]) +function updatecore!(A::Vector{Array{T,4}}, B::Vector{Array{T,4}}, X::Vector{Array{T,4}}, i::Int; + method::Symbol=:SVD, tolerance::Float64=1e-8, maxbonddim::Int=typemax(Int), direction::Symbol=:forward, + R::Union{Nothing, Array{T,3}}=nothing, Ri::Int=length(A), + L::Union{Nothing, Array{T,3}}=nothing, Li::Int=1 + )::Tuple{Float64, Array{T,3}, Array{T,3}} where {T} + L = leftenvironment(A, B, X, i-1; L, Li) + R = rightenvironment(A, B, X, i+2; R, Ri) + + Le = _contract(_contract(L, A[i], (1,), (1,)), B[i], (1, 4), (1, 2)) + Re = _contract(B[i+1], _contract(A[i+1], R, (4,), (1,)), (2, 4), (3, 4)) + Ce = _contract(Le, Re, (3, 5), (3, 1)) + Ce = permutedims(Ce, (1, 2, 3, 5, 4, 6)) + + left, right, newbonddim = _factorize( + reshape(Ce, prod(size(Ce)[1:3]), prod(size(Ce)[4:6])), + method; tolerance, maxbonddim, leftorthogonal=(direction == :forward ? true : false) + ) + + X_i = reshape(left, :, size(X[i])[2:3]..., newbonddim) + X_i1 = reshape(right, newbonddim, size(X[i+1])[2:3]..., :) + + if size(X[i]) != size(X_i) || size(X[i+1]) != size(X_i1) + diff = Inf + else + diff = maximum(abs, X[i] .- X_i) + diff = max(diff, maximum(abs, X[i+1] .- X_i1)) + end + X[i] = X_i + X[i+1] = X_i1 + return diff, L, R +end + +""" + function contract_fit(A::TensorTrain{ValueType,4}, B::TensorTrain{ValueType,4}) + +Conctractos tensor trains A and B using the fit algorithm. + +# Keyword Arguments + +- `nsweep::Int`: Number of sweeps to perform during the algorithm. +- `initial_guess`: Optional initial guess for the tensor train A*B. If not provided, a tensor train of rank one is used. This must have coherent bond dimension (i.e. start and finish with less than [d,d^2,d^3,...,d^3,d^2,d]). +- `tolerance::Float64`: Convergence tolerance for the iterative algorithm. +- `method::Symbol`: Algorithm or method to use for the computation :SVD, :RSVD, :LU, :CI. +- `maxbonddim::Int`: Maximum bond dimension allowed during the decomposition. +""" +function contract_fit( + mpoA::TensorTrain{ValueType,4}, + mpoB::TensorTrain{ValueType,4}; + nsweeps::Int=2, + initial_guess::Union{Nothing,TensorTrain{ValueType,4}}=nothing, + tolerance::Float64=1e-12, + method::Symbol=:SVD, # :SVD, :RSVD, :LU, :CI + maxbonddim::Int=typemax(Int), + kwargs... +) where {ValueType} + if length(mpoA) != length(mpoB) + throw(ArgumentError("Cannot contract tensor trains with different length.")) + end + + n = length(mpoA) + + A = Vector{Array{ValueType, 4}}(undef, n) + B = Vector{Array{ValueType, 4}}(undef, n) + X = Vector{Array{ValueType, 4}}(undef, n) + for i in 1:n + A[i] = deepcopy(mpoA.sitetensors[i]) + B[i] = deepcopy(mpoB.sitetensors[i]) + if !isnothing(initial_guess) + X[i] = deepcopy(initial_guess.sitetensors[i]) + else + X[i] = ones(ValueType, 1, size(A[i])[2], size(B[i])[3], 1) + end + end + + Rs = Vector{Array{ValueType, 3}}(undef, n) + Ls = Vector{Array{ValueType, 3}}(undef, n) + + centercanonicalize!(A, 1) + centercanonicalize!(B, 1) + centercanonicalize!(X, 1) + # Precompute right environments + for i in n:-1:3 + if i == n + Rs[i] = rightenvironment(A, B, X, i) + else + Rs[i] = rightenvironment(A, B, X, i; R=Rs[i+1], Ri=i) + end + end + + # It doesn't matter if we repeat update in 1 or n-1, those are negligible + direction = :forward + for sweep in 1:nsweeps + max_diff = 0.0 + # Update cores and store Left or Right environment + if direction == :forward + for i in 1:n-1 + centercanonicalize!(A, i) + centercanonicalize!(B, i) + centercanonicalize!(X, i) + if i == 1 + diff, _, _ = updatecore!(A, B, X, i; method, tolerance, maxbonddim, direction, R=Rs[i+2], Ri=i+1) + elseif i == 2 + diff, Ls[i-1], _ = updatecore!(A, B, X, i; method, tolerance, maxbonddim, direction, R=Rs[i+2], Ri=i+1) + elseif i < n-1 + diff, Ls[i-1], _ = updatecore!(A, B, X, i; method, tolerance, maxbonddim, direction, L=Ls[i-2], Li=i-1, R=Rs[i+2], Ri=i+1) + else # i == n-1 + diff, Ls[i-1], _ = updatecore!(A, B, X, i; method, tolerance, maxbonddim, direction, L=Ls[i-2], Li=i-1) + end + max_diff = max(max_diff, diff) + end + direction = :backward + elseif direction == :backward + for i in n-1:-1:1 + centercanonicalize!(A, i) + centercanonicalize!(B, i) + centercanonicalize!(X, i) + if i == n-1 + diff, _, _ = updatecore!(A, B, X, i; method, tolerance, maxbonddim, direction, L=Ls[i-1], Li=i) + elseif i == n-2 + diff, _, Rs[i+2] = updatecore!(A, B, X, i; method, tolerance, maxbonddim, direction, L=Ls[i-1], Li=i) + elseif i > 1 + diff, _, Rs[i+2] = updatecore!(A, B, X, i; method, tolerance, maxbonddim, direction, L=Ls[i-1], Li=i, R=Rs[i+3], Ri=i+2) + else # i == 1 + diff, _, Rs[i+2] = updatecore!(A, B, X, i; method, tolerance, maxbonddim, direction, R=Rs[i+3], Ri=i+2) + end + max_diff = max(max_diff, diff) + end + direction = :forward + end + if max_diff < tolerance + break + end + end + return TensorTrain{ValueType,4}(X) +end + +""" + function contract_distr_fit(A::TensorTrain{ValueType,4}, B::TensorTrain{ValueType,4}) + +Conctractos tensor trains A and B using the fit algorithm. + +# Keyword Arguments + +- `nsweep::Int`: Number of sweeps to perform during the algorithm. +- `initial_guess`: Initial guess of the solution A*B. +- `subcomm::Union{Nothing, MPI.Comm}`: Subcommunicator to use for the distributed computation. If nothing, the global communicator is used. +- `tolerance::Float64`: Convergence tolerance for the iterative algorithm. +- `method::Symbol`: Algorithm or method to use for the computation :SVD, :RSVD, :LU, :CI. +- `maxbonddim::Int`: Maximum bond dimension allowed during the decomposition. +- `synchedmpo::Bool`: Tells whether the input MPOs are synchronized. Use "true" if you know that the input MPOs are exactly the same across different nodes. +""" +function contract_distr_fit( + mpoA::TensorTrain{ValueType,4}, + mpoB::TensorTrain{ValueType,4}; + nsweeps::Int=2, + initial_guess::Union{Nothing,TensorTrain{ValueType,4}}=nothing, + subcomm::Union{Nothing, MPI.Comm}=nothing, + noderanges::Union{Nothing,Vector{UnitRange{Int}}}=nothing, + tolerance::Float64=1e-12, + method::Symbol=:SVD, # :SVD, :RSVD, :LU, :CI + maxbonddim::Int=typemax(Int), + synchedmpo::Bool=false, + kwargs... +) where {ValueType} + if length(mpoA) != length(mpoB) + throw(ArgumentError("Cannot contract tensor trains with different length.")) + end + + if !synchedmpo + synchronize_tt!(mpoA) + synchronize_tt!(mpoB) + synchronize_tt!(initial_guess) + end + + n = length(mpoA) + + A = Vector{Array{ValueType, 4}}(undef, n) + B = Vector{Array{ValueType, 4}}(undef, n) + X = Vector{Array{ValueType, 4}}(undef, n) + for i in 1:n + A[i] = deepcopy(mpoA.sitetensors[i]) + B[i] = deepcopy(mpoB.sitetensors[i]) + if !isnothing(initial_guess) + X[i] = deepcopy(initial_guess.sitetensors[i]) + else + X[i] = ones(ValueType, 1, size(A[i])[2], size(B[i])[3], 1) + end + end + + if MPI.Initialized() + if subcomm != nothing + comm = subcomm + else + comm = MPI.COMM_WORLD + end + mpirank = MPI.Comm_rank(comm) + juliarank = mpirank + 1 + nprocs = MPI.Comm_size(comm) + if noderanges == nothing + if n < 4 + println("Warning! The TT is too small to be parallelized.") + return contract_fit(mpoA, mpoB; nsweeps, initial_guess, tolerance, method, maxbonddim, kwargs...) + end + if n < 6 + if nprocs > 2 + println("Warning! The TT is too small to be parallelized with more than 2 nodes. Some nodes will not be used.") + end + nprocs = 2 + noderanges = [1:2,3:n] + elseif n == 6 + if nprocs == 2 + noderanges = [1:3,4:6] + elseif nprocs == 3 + noderanges = [1:2,3:4,5:6] + else + println("Warning! The TT is too small to be parallelized with more than 3 nodes. Some nodes will not be used.") + nprocs = 3 + noderanges = [1:2,3:4,5:6] + end + else + extra1 = 3 # "Hyperparameter" + extraend = 3 + if nprocs >= div(n - extra1 - extraend, 2) # It's just one update per node. + println("Warning! A TT of lenght L can use be parallelized with up to (L-$(extra1 + extraend))/2 nodes. Some nodes will not be used.") + # Each node has 2 cores. Except first and last who have 2+extra1 and 2+extraend+n%2 + if n % 2 == 0 + noderanges = vcat([1:extra1+1],[extra1+i+1:extra1+i+2 for i in 1:2:n-extra1-extraend-2],[n-extraend:n]) + else + noderanges = vcat([1:extra1+1],[extra1+i+1:extra1+i+2 for i in 1:2:n-extra1-extraend-3],[n-1-extraend:n]) + end + for _ in div(n - extra1 - extraend, 2)+1:nprocs + push!(noderanges, 1:-1) + end + nprocs = div(n - extra1 - extraend, 2) + else + number_of_sites, rem = divrem(n-extra1-extraend, nprocs) + sites = [number_of_sites for i in 1:nprocs] + sites[1] += extra1 + sites[end] += extraend + # Distribute the remainder as evenly as possible + for i in 1:rem + if i % 2 == 1 + sites[div(i,2)+2] += 1 + else + sites[end-div(i,2)] += 1 + end + end + noderanges = [sum(sites[1:i-1])+1:sum(sites[1:i]) for i in 1:nprocs] + end + end + end + else + println("Warning! Distributed strategy has been chosen, but MPI is not initialized, please use TCI.initializempi() before contract() and use TCI.finalizempi() afterwards") + return contract_fit(mpoA, mpoB; nsweeps, initial_guess, tolerance, method, maxbonddim, kwargs...) + end + + if nprocs == 1 + println("Warning! Distributed strategy has been chosen, but only one process is running") + return contract_fit(mpoA, mpoB; nsweeps, initial_guess, tolerance, method, maxbonddim, kwargs...) + end + + if !isnothing(initial_guess) + X = deepcopy(initial_guess.sitetensors) + else + X = [ones(ValueType, 1, size(A[i])[2], size(B[i])[3], 1) for i in 1:n] + end + + Ls = Vector{Array{ValueType, 3}}(undef, n) + Rs = Vector{Array{ValueType, 3}}(undef, n) + + first = noderanges[juliarank][1] + last = noderanges[juliarank][end] + + #Ls are left enviroments and Rs are right environments + if first != 1 + Ls[first-1] = ones(ValueType, size(A[first])[1], size(B[first])[1], size(X[first])[1]) + end + if last != n + Rs[last+1] = ones(ValueType, size(A[last])[end], size(B[last])[end], size(X[last])[end]) + end + + + if juliarank % 2 == 1 # Precompute right environment if going forward + centercanonicalize!(A, first) + centercanonicalize!(B, first) + centercanonicalize!(X, first) + for i in last:-1:first+2 + if i == n + Rs[i] = rightenvironment(A, B, X, i) + else + Rs[i] = rightenvironment(A, B, X, i; R=Rs[i+1], Ri=i) + end + end + else # Precompute left environment if going backward + centercanonicalize!(A, last-1) + centercanonicalize!(B, last-1) + centercanonicalize!(X, last-1) + for i in first:last-2 + if i == 1 + Ls[i] = leftenvironment(A, B, X, i) + else + Ls[i] = leftenvironment(A, B, X, i; L=Ls[i-1], Li=i) + end + end + end + + for sweep in 1:nsweeps + max_diff = 0.0 + if sweep % 2 == juliarank % 2 + for i in first:last-1 + centercanonicalize!(A, i) + centercanonicalize!(B, i) + if i == 1 + diff, _, _ = updatecore!(A, B, X, i; method, tolerance, maxbonddim, direction=:forward, R=Rs[i+2], Ri=i+1) + elseif i == 2 + diff, Ls[i-1], _ = updatecore!(A, B, X, i; method, tolerance, maxbonddim, direction=:forward, R=Rs[i+2], Ri=i+1) + elseif i == first + diff, _, _ = updatecore!(A, B, X, i; method, tolerance, maxbonddim, direction=:forward, L=Ls[i-1], Li=i, R=Rs[i+2], Ri=i+1) + elseif i < n-1 + diff, Ls[i-1], _ = updatecore!(A, B, X, i; method, tolerance, maxbonddim, direction=:forward, L=Ls[i-2], Li=i-1, R=Rs[i+2], Ri=i+1) + else + diff, Ls[i-1], _ = updatecore!(A, B, X, i; method, tolerance, maxbonddim, direction=:forward, L=Ls[i-2], Li=i-1) + end + max_diff = max(max_diff, diff) + end + else + for i in last-1:-1:first + centercanonicalize!(A, i) + centercanonicalize!(B, i) + if i == n-1 + diff, _, _ = updatecore!(A, B, X, i; method, tolerance, maxbonddim, direction=:backward, L=Ls[i-1], Li=i) + elseif i == n-2 + diff, _, Rs[i+2] = updatecore!(A, B, X, i; method, tolerance, maxbonddim, direction=:backward, L=Ls[i-1], Li=i) + elseif i == last-1 + diff, _, _ = updatecore!(A, B, X, i; method, tolerance, maxbonddim, direction=:backward, L=Ls[i-1], Li=i, R=Rs[i+2], Ri=i+1) + elseif i > 1 + diff, _, Rs[i+2] = updatecore!(A, B, X, i; method, tolerance, maxbonddim, direction=:backward, L=Ls[i-1], Li=i, R=Rs[i+3], Ri=i+2) + else + diff, _, Rs[i+2] = updatecore!(A, B, X, i; method, tolerance, maxbonddim, direction=:backward, R=Rs[i+3], Ri=i+2) + end + max_diff = max(max_diff, diff) + end + end + + if MPI.Initialized() + if juliarank % 2 == sweep % 2 # Left + if juliarank != nprocs + centercanonicalize!(A, last) + centercanonicalize!(B, last) + + Ls[last-1] = leftenvironment(A, B, X, last-1; L=Ls[last-2], Li=last-1) + + sizes = Vector{Int}(undef, 3) + reqs = MPI.Isend(collect(size(Ls[last-1])), comm; dest=mpirank+1, tag=juliarank) + reqr = MPI.Irecv!(sizes, comm; source=mpirank+1, tag=juliarank+1) + + MPI.Waitall([reqs, reqr]) + + Rs[last+2] = ones(ValueType, sizes[1], sizes[2], sizes[3]) + + reqs = MPI.Isend(Ls[last-1], comm; dest=mpirank+1, tag=juliarank) + reqr = MPI.Irecv!(Rs[last+2], comm; source=mpirank+1, tag=juliarank+1) + + MPI.Waitall([reqs, reqr]) + @assert typeof(Rs[last+2]) <: Array{ValueType,3} "Expected Array{ValueType,3}, got $(typeof(Rs[last+2]))" + diff, _, _ = updatecore!(A, B, X, last; method, tolerance, maxbonddim, direction=:backward, L=Ls[last-1], Li=last, R=Rs[last+2], Ri=last+1) + + Rs[last+1] = rightenvironment(A, B, X, last+1; R=Rs[last+2], Ri=last+1) + end + else # Right + if juliarank != 1 + centercanonicalize!(A, first) + centercanonicalize!(B, first) + + Rs[first+1] = rightenvironment(A, B, X, first+1; R=Rs[first+2], Ri=first+1) + + sizes = Vector{Int}(undef, 3) + reqs = MPI.Isend(collect(size(Rs[first+1])), comm; dest=mpirank-1, tag=juliarank) + reqr = MPI.Irecv!(sizes, comm; source=mpirank-1, tag=juliarank-1) + + MPI.Waitall([reqs, reqr]) + + Ls[first-2] = ones(ValueType, sizes[1], sizes[2], sizes[3]) + reqs = MPI.Isend(Rs[first+1], comm; dest=mpirank-1, tag=juliarank) + reqr = MPI.Irecv!(Ls[first-2], comm; source=mpirank-1, tag=juliarank-1) + + MPI.Waitall([reqs, reqr]) + @assert typeof(Ls[first-2]) <: Array{ValueType,3} "Expected Array{ValueType,3}, got $(typeof(Ls[first-2]))" + diff, _, _ = updatecore!(A, B, X, first-1; method, tolerance, maxbonddim, direction=:forward, L=Ls[first-2], Li=first-1, R=Rs[first+1], Ri=first) #center on first + + Ls[first-1] = leftenvironment(A, B, X, first-1; L=Ls[first-2], Li=first-1) + end + end + end + + max_diff = max(max_diff, diff) # Check last update on edge + converged = max_diff < tolerance + global_converged = MPI.Allreduce(converged, MPI.LAND, comm) + if global_converged + break + end + end + + # Merge togheter all the sub tensor trains + if juliarank == 1 + for j in 2:nprocs + centercanonicalize!(A, noderanges[j-1][end]) + centercanonicalize!(B, noderanges[j-1][end]) + centercanonicalize!(X, noderanges[j-1][end]) + X[noderanges[j]] = MPI.recv(comm; source=j-1, tag=2*j) + R = Array{ValueType, 3}(undef, 1, 1, 1) + if j != nprocs + MPI.Recv!(R, comm; source=j-1, tag=2*j+1) + else + R = ones(ValueType, 1, 1, 1) + end + @assert typeof(R) <: Array{ValueType,3} "Expected Array{ValueType,3}, got $(typeof(R))" + + diff, _, _ = updatecore!(A, B, X, noderanges[j-1][end]; method, tolerance, maxbonddim, direction=:forward, R, Ri=noderanges[j][end]) + end + else + centercanonicalize!(X, first) + MPI.send(X[noderanges[juliarank]], comm; dest=0, tag=2*juliarank) + if juliarank != nprocs + MPI.send(Rs[noderanges[juliarank][end]+1], comm; dest=0, tag=2*juliarank+1) + end + end + + nprocs = MPI.Comm_size(comm) # In case not all processes where used to compute + + # Redistribute the tensor train among the processes. + if juliarank == 1 + for j in 2:nprocs + MPI.send(X, comm; dest=j-1, tag=1) + end + else + X = MPI.recv(comm; source=0, tag=1) + end + + return TensorTrain{ValueType,4}(X) +end + """ function contract( A::TensorTrain{V1,4}, @@ -493,6 +1169,7 @@ end tolerance::Float64=1e-12, maxbonddim::Int=typemax(Int), f::Union{Nothing,Function}=nothing, + subcomm::Union{Nothing, MPI.Comm}=nothing, kwargs... ) where {V1,V2} @@ -501,7 +1178,11 @@ Contract two tensor trains `A` and `B`. Currently, two implementations are available: 1. `algorithm=:TCI` constructs a new TCI that fits the contraction of `A` and `B`. 2. `algorithm=:naive` uses a naive tensor contraction and subsequent SVD recompression of the tensor train. - 2. `algorithm=:zipup` uses a naive tensor contraction with on-the-fly LU decomposition. + 3. `algorithm=:zipup` uses a naive tensor contraction with on-the-fly LU decomposition. + 4. `algorithm=:distrzipup` uses a naive tensor contraction with on-the-fly LU decomposition, distributed on multiple nodes. + 5. `algorithm=:fit` uses a variational approach to minimize the difference between `A`*`B` and the solution. + 6. `algorithm=:distrfit` uses a variational approach to minimize the difference between `A`*`B` and the solution, distributed on multiple nodes. + Arguments: - `A` and `B` are the tensor trains to be contracted. @@ -510,7 +1191,8 @@ Arguments: - `maxbonddim` sets the maximum bond dimension of the resulting tensor train. - `f` is a function to be applied elementwise to the result. This option is only available with `algorithm=:TCI`. - `method` chooses the method used for the factorization in the `algorithm=:zipup` case (`:SVD` or `:LU`). -- `kwargs...` are forwarded to [`crossinterpolate2`](@ref) if `algorithm=:TCI`. +- `subcomm` is an optional MPI communicator for distributed algorithms. If not provided, the default communicator is used. +- `kwargs...` are forwarded to [`crossinterpolate2`](@ref) if `algorithm=:TCI` or to [`contract_fit`](@ref) if `algorithm=:fit` or `algorithm=:distrfit`. """ function contract( A::TensorTrain{V1,4}, @@ -518,7 +1200,9 @@ function contract( algorithm::Symbol=:TCI, tolerance::Float64=1e-12, maxbonddim::Int=typemax(Int), + method::Symbol=:SVD, f::Union{Nothing,Function}=nothing, + subcomm::Union{Nothing, MPI.Comm}=nothing, kwargs... )::TensorTrain{promote_type(V1,V2),4} where {V1,V2} Vres = promote_type(V1, V2) @@ -535,7 +1219,23 @@ function contract( if f !== nothing error("Zipup contraction implementation cannot contract matrix product with a function. Use algorithm=:TCI instead.") end - return contract_zipup(A_, B_; tolerance, maxbonddim) + return contract_zipup(A_, B_; tolerance=tolerance, maxbonddim=maxbonddim, method=method, kwargs...) + elseif algorithm === :distrzipup + if f !== nothing + error("Zipup contraction implementation cannot contract matrix product with a function. Use algorithm=:TCI instead.") + end + return contract_distr_zipup(A_, B_; tolerance=tolerance, maxbonddim=maxbonddim, method=method, subcomm=subcomm, kwargs...) + elseif algorithm === :fit + if f !== nothing + error("Fit contraction implementation cannot contract matrix product with a function. Use algorithm=:TCI instead.") + end + return contract_fit(A_, B_; tolerance=tolerance, maxbonddim=maxbonddim, method=method, kwargs...) + elseif algorithm === :distrfit + if f !== nothing + error("Fit contraction implementation cannot contract matrix product with a function. Use algorithm=:TCI instead.") + end + # return contract_distr_fit(A_, B_; tolerance=tolerance, maxbonddim=maxbonddim, method=method, kwargs...) + return contract_fit(A_, B_; tolerance=tolerance, maxbonddim=maxbonddim, method=method, kwargs...) else throw(ArgumentError("Unknown algorithm $algorithm.")) end @@ -558,3 +1258,12 @@ function contract( tt = contract(A, TensorTrain{4}(B, [(s..., 1) for s in sitedims(B)]); kwargs...) return TensorTrain{3}(tt, prod.(sitedims(tt))) end + +function contract( + A::Union{TensorCI1{V},TensorCI2{V},TensorTrain{V,3}}, + B::Union{TensorCI1{V2},TensorCI2{V2},TensorTrain{V2,3}}; + kwargs... +)::promote_type(V,V2) where {V,V2} + tt = contract(TensorTrain{4}(A, [(1, s...) for s in sitedims(A)]), TensorTrain{4}(B, [(s..., 1) for s in sitedims(B)]); kwargs...) + return prod(prod.(tt.sitetensors)) +end diff --git a/src/integration.jl b/src/integration.jl index ff7a5f1..83ae9d1 100644 --- a/src/integration.jl +++ b/src/integration.jl @@ -55,4 +55,4 @@ function integrate( ) return sum(tci2) / normalization -end +end \ No newline at end of file diff --git a/src/matrixlu.jl b/src/matrixlu.jl index 24f6fae..ac713da 100644 --- a/src/matrixlu.jl +++ b/src/matrixlu.jl @@ -1,3 +1,44 @@ +function distribute(_batchf, full, fragmented, teamsize, teamrank, stripes, subcomm) + chunksize, remainder = divrem(length(fragmented), teamsize) + if stripes == :vertical + col_chunks = [chunksize + (i <= remainder ? 1 : 0) for i in 1:teamsize] + row_chunks = [length(full) for _ in 1:teamsize] + ranges = [(vcat([0],cumsum(col_chunks))[i] + 1):(cumsum(col_chunks)[i]) for i in 1:teamsize] + else # horizontal + row_chunks = [chunksize + (i <= remainder ? 1 : 0) for i in 1:teamsize] + col_chunks = [length(full) for _ in 1:teamsize] + ranges = [(vcat([0],cumsum(row_chunks))[i] + 1):(cumsum(row_chunks)[i]) for i in 1:teamsize] + end + fragment = fragmented[ranges[teamrank]] + localsubmatrix = if isempty(fragment) + zeros(length(full), 0) + else + if stripes == :vertical + _batchf(full, fragment) + else + _batchf(fragment, full) + end + end + if teamrank == 1 + submatrix = if stripes == :vertical + zeros(length(full), length(fragmented)) + else + zeros(length(fragmented), length(full)) + end + sizes = vcat(row_chunks', col_chunks') + counts = vec(prod(sizes, dims=1)) + submatrix_vbuf = VBuffer(submatrix, counts) + else + submatrix_vbuf = VBuffer(nothing) + end + MPI.Gatherv!(localsubmatrix, submatrix_vbuf, 0, subcomm) + if teamrank != 1 + submatrix = nothing + end + submatrix +end + + function submatrixargmax( f::Function, # real valued function A::AbstractMatrix{T}, @@ -235,7 +276,10 @@ function arrlu( abstol::Number=0.0, leftorthogonal::Bool=true, numrookiter::Int=5, - usebatcheval::Bool=false + usebatcheval::Bool=false, + leaders::Vector=Int[], + leaderslist::Vector=Int[], + subcomm=nothing # TODO change: bad practice )::rrLU{ValueType} where {ValueType} lu = rrLU{ValueType}(matrixsize...; leftorthogonal) islowrank = false @@ -243,50 +287,111 @@ function arrlu( _batchf = usebatcheval ? f : ((x, y) -> f.(x, y')) + if !isempty(leaders) + comm = MPI.COMM_WORLD + rank = MPI.Comm_rank(comm) + juliarank = rank + 1 + nprocs = MPI.Comm_size(comm) + teamrank = MPI.Comm_rank(subcomm) + teamjuliarank = teamrank + 1 + teamsize = MPI.Comm_size(subcomm) + end + while true if leftorthogonal pushrandomsubset!(J0, 1:matrixsize[2], max(1, length(J0))) else pushrandomsubset!(I0, 1:matrixsize[1], max(1, length(I0))) end - + for rookiter in 1:numrookiter colmove = (iseven(rookiter) == leftorthogonal) - submatrix = if colmove - _batchf(I0, lu.colpermutation) - else - _batchf(lu.rowpermutation, J0) + if !isempty(leaders) # Parallel + if teamsize > 1 # Parallel and leadership + submatrix = if colmove + distribute(_batchf, lu.colpermutation, I0, teamsize, teamjuliarank, :horizontal, subcomm) + else + distribute(_batchf, lu.rowpermutation, J0, teamsize, teamjuliarank, :vertical, subcomm) + end + MPI.Barrier(subcomm) + if teamjuliarank == 1 + lu.npivot = 0 + _optimizerrlu!(lu, submatrix; maxrank, reltol, abstol) + islowrank |= npivots(lu) < minimum(size(submatrix)) + end + else + submatrix = if colmove + _batchf(I0, lu.colpermutation) + else + _batchf(lu.rowpermutation, J0) + end + lu.npivot = 0 + _optimizerrlu!(lu, submatrix; maxrank, reltol, abstol) + islowrank |= npivots(lu) < minimum(size(submatrix)) + end + else # Serial + submatrix = if colmove + _batchf(I0, lu.colpermutation) + else + _batchf(lu.rowpermutation, J0) + end + lu.npivot = 0 + _optimizerrlu!(lu, submatrix; maxrank, reltol, abstol) + islowrank |= npivots(lu) < minimum(size(submatrix)) end - lu.npivot = 0 - _optimizerrlu!(lu, submatrix; maxrank, reltol, abstol) - islowrank |= npivots(lu) < minimum(size(submatrix)) - if rowindices(lu) == I0 && colindices(lu) == J0 - break + if !isempty(leaders) # Parallel + tb = time_ns() + lu = MPI.bcast([lu], subcomm)[1] + islowrank = MPI.bcast([islowrank], subcomm)[1] + tb = (time_ns() - tb)*1e-9 + if rowindices(lu) == I0 && colindices(lu) == J0 + break + end + J0 = colindices(lu) + I0 = rowindices(lu) + else # Serial + if rowindices(lu) == I0 && colindices(lu) == J0 + break + end + J0 = colindices(lu) + I0 = rowindices(lu) end - - J0 = colindices(lu) - I0 = rowindices(lu) end + if islowrank || length(I0) >= maxrank break end end + if size(lu.L, 1) < matrixsize[1] I2 = setdiff(1:matrixsize[1], I0) lu.rowpermutation = vcat(I0, I2) - L2 = _batchf(I2, J0) - cols2Lmatrix!(L2, (@view lu.U[1:lu.npivot, 1:lu.npivot]), leftorthogonal) - lu.L = vcat((@view lu.L[1:lu.npivot, 1:lu.npivot]), L2) + L2 = if !isempty(leaders) + distribute(_batchf, J0, I2, teamsize, teamjuliarank, :horizontal, subcomm) + else + _batchf(I2, J0) + end + if isempty(leaders) || teamjuliarank == 1 + cols2Lmatrix!(L2, (@view lu.U[1:lu.npivot, 1:lu.npivot]), leftorthogonal) + lu.L = vcat((@view lu.L[1:lu.npivot, 1:lu.npivot]), L2) + end end + if size(lu.U, 2) < matrixsize[2] J2 = setdiff(1:matrixsize[2], J0) lu.colpermutation = vcat(J0, J2) - U2 = _batchf(I0, J2) - rows2Umatrix!(U2, (@view lu.L[1:lu.npivot, 1:lu.npivot]), leftorthogonal) - lu.U = hcat((@view lu.U[1:lu.npivot, 1:lu.npivot]), U2) + U2 = if !isempty(leaders) + distribute(_batchf, I0, J2, teamsize, teamjuliarank, :vertical, subcomm) + else + U2 = _batchf(I0, J2) + end + if isempty(leaders) || teamjuliarank == 1 + rows2Umatrix!(U2, (@view lu.L[1:lu.npivot, 1:lu.npivot]), leftorthogonal) + lu.U = hcat((@view lu.U[1:lu.npivot, 1:lu.npivot]), U2) + end end return lu diff --git a/src/mpi.jl b/src/mpi.jl new file mode 100644 index 0000000..f9870c0 --- /dev/null +++ b/src/mpi.jl @@ -0,0 +1,63 @@ +""" + function initializempi() + +Initialize the MPI environment if it has not been initialized yet. If mute=true, then all the processes with rank>0 (i.e. not the root node) won't output anything to stdout. +""" +function initializempi(mute::Bool=true) + if !MPI.Initialized() + MPI.Init() + end + comm = MPI.COMM_WORLD + mpirank = MPI.Comm_rank(comm) + if mute # Mute all processors which have mpirank != 0. + if mpirank != 0 + open("/dev/null", "w") do devnull + redirect_stdout(devnull) + redirect_stderr(devnull) + end + end + end +end + +""" + function finalizempi() + +Finalize the MPI environment if it has not been finalized yet. +""" +function finalizempi() + MPI.Barrier(MPI.COMM_WORLD) + if MPI.Comm_rank == 0 + if !MPI.Finalized() + MPI.Finalize() + end + end +end + +function synchronize_tt!(tt::Union{TensorTrain{ValueType,N},TensorCI2{ValueType}}; subcomm = nothing, juliasource::Int = 1) where {ValueType, N} + if !MPI.Initialized() + println("Warning! synchronize_tt has been called, but MPI is not initialized, please use TCI.initializempi() before contract() and use TCI.finalizempi() afterwards") + else + if subcomm != nothing + comm = subcomm + else + comm = MPI.COMM_WORLD + end + mpirank = MPI.Comm_rank(comm) + juliarank = mpirank + 1 + nprocs = MPI.Comm_size(comm) + + MPI.Barrier(comm) + reqs = MPI.Request[] + if juliarank == juliasource + for j in 1:nprocs + if j != juliarank + push!(reqs, MPI.isend(tt.sitetensors[1:end], comm; dest=j-1, tag=juliarank)) + end + end + end + if juliarank != juliasource + tt.sitetensors[1:end] = MPI.recv(comm; source=juliasource-1, tag=juliasource) + end + MPI.Waitall(reqs) + end +end \ No newline at end of file diff --git a/src/sweepstrategies.jl b/src/sweepstrategies.jl index c53dabc..b14bdc5 100644 --- a/src/sweepstrategies.jl +++ b/src/sweepstrategies.jl @@ -3,4 +3,4 @@ function forwardsweep(sweepstrategy::Symbol, iteration::Int) (sweepstrategy == :forward) || (sweepstrategy == :backandforth && isodd(iteration)) ) -end +end \ No newline at end of file diff --git a/src/tensorci2.jl b/src/tensorci2.jl index c85db93..06c607b 100644 --- a/src/tensorci2.jl +++ b/src/tensorci2.jl @@ -250,7 +250,8 @@ function addglobalpivots2sitesweep!( pivotsearch::Symbol=:full, verbosity::Int=0, ntry::Int=10, - strictlynested::Bool=false + strictlynested::Bool=false, + multithread_eval::Bool=false )::Int where {F,ValueType} if any(length(tci) .!= length.(pivots)) throw(DimensionMismatch("Please specify a pivot as one index per leg of the MPS.")) @@ -270,7 +271,8 @@ function addglobalpivots2sitesweep!( maxbonddim=maxbonddim, pivotsearch=pivotsearch, strictlynested=strictlynested, - verbosity=verbosity) + verbosity=verbosity, + multithread_eval=multithread_eval) newpivots = [p for p in pivots if abs(evaluate(tci, p) - f(p)) > abstol] @@ -502,7 +504,214 @@ function (obj::SubMatrix{T})(irows::Vector{Int}, icols::Vector{Int})::Matrix{T} return res end +""" + Noderanges(nprocs, nbonds, localdim, estimatedbond; interbond=false, estimatedbonds=nothing, algorithm=:tci, efficiencycheck=false) + +Distributes the computational workload efficiently across multiple nodes for tensor train algorithms. + +# Arguments +- `nprocs::Int`: Number of processes (nodes) to distribute the workload across. +- `nbonds::Int`: Number of bonds in the tensor train. +- `localdim::Int`: Local dimension of the tensor train. +- `estimatedbond::Int`: Estimated bond dimension after compression, set by the user to help balance the workload. +- `interbond::Bool=false`: If `true`, distributes workload across bonds; if `false`, only function evaluation is distributed, equivalent to serial TCI. +- `estimatedbonds::Union{Nothing, Vector{Int}}=nothing`: Optional vector of estimated bond dimensions for each bond. If not provided, it is computed from `estimatedbond`. +- `algorithm::Symbol=:tci`: Algorithm being parallelized (e.g., `:tci`, `:mpompo`). Different algorithms may have different workload characteristics. +- `efficiencycheck::Bool=false`: If `true`, the function also returns the efficiency of the distribution. + +# Returns +A tuple containing the ranges of work assigned to each node. If `efficiencycheck` is `true`, also returns the efficiency of the distribution. + +# Notes +- The function is designed to optimize workload distribution for tensor train computations, taking into account the estimated bond dimensions and the chosen algorithm. +- Proper setting of `estimatedbond` or `estimatedbonds` can improve parallel efficiency. +""" + + +function _noderanges(nprocs::Int, nbonds::Int, localdim::Int, estimatedbond::Int; interbond::Bool=true, estimatedbonds::Union{Vector{Int},Nothing}=nothing, algorithm::Symbol=:tci, efficiencycheck::Bool=false)::Union{Vector{UnitRange{Int}},Tuple{Vector{UnitRange{Int}},Float64}} + if !interbond + return [1:nbonds for _ in 1:nprocs] + end + if estimatedbonds === nothing + estimatedbonds = [estimatedbond for _ in 1:nbonds-1] + i = 1 + while localdim^i < estimatedbond && i <= nbonds/2 + estimatedbonds[i] = localdim^i + estimatedbonds[end-i+1] = localdim^i + i = i + 1 + end + end + + if algorithm == :tci + # ~ chi^3 + costs = [(i == 1 ? 1 : estimatedbonds[i-1]) * (i == nbonds ? 1 : estimatedbonds[i]) * min((i == 1 ? 1 : estimatedbonds[i-1]) * (i == nbonds ? 1 : estimatedbonds[i])) for i in 1:nbonds] + else + # ~ chi^4 + costs = [(i == 1 ? 1 : estimatedbonds[i-1]) * (i == nbonds ? 1 : estimatedbonds[i]) * estimatedbond * min((i == 1 ? 1 : estimatedbonds[i-1]) * (i == nbonds ? 1 : estimatedbonds[i])) for i in 1:nbonds] + end + total_cost = sum(costs) + target_cost = total_cost / nprocs + + assignments = Vector{UnitRange{Int}}() + + i = 1 # bond index + node = 1 + current_cost = 0 + range_start = 1 + + while i <= nbonds && node < nprocs + bond_cost = costs[i] + + if current_cost + bond_cost <= target_cost * 1.3 + # Add the current bond to the node's range + current_cost += bond_cost + i += 1 + else + # Current node is full, move to the next node + push!(assignments, range_start:i-1) + node += 1 + current_cost = bond_cost + range_start = i + i += 1 + end + end + push!(assignments, range_start:nbonds) + leftover = nprocs-node + if leftover > 0 + for k in 1:Int(ceil(leftover/node)) + for j in 1:(k == Int(ceil(leftover/node)) ? rem(leftover, node) : node) + if j%2 == 1 + indx = Int(j + (j-1)/2 * (k-1)) + insert!(assignments, indx, assignments[indx]) + else + indx = Int(length(assignments)+2-j - (j/2)*(k-1)) + insert!(assignments, indx, assignments[indx]) + end + end + end + end + works = [sum(costs[i]) for i in assignments] + efficiency = total_cost / maximum(works) + if efficiencycheck + return assignments, efficiency + else + return assignments + end +end + +""" + _leaders(nprocs::Int, noderanges::Vector{UnitRange{Int}}) -> (leaders::Vector{Int}, leaderslist::Vector{Int}) + +Determine the leader processes among `nprocs` processes based on their assigned node ranges. + +A process is considered a leader (`+1`) if its node range is equal to the next process's node range, and not a leader (`-1`) if its node range is equal to the previous process's node range. All other processes are marked as `0`. Only leaders are responsible for communication. + +# Arguments +- `nprocs::Int`: The total number of processes. +- `noderanges::Vector{UnitRange{Int}}`: A vector specifying the node range assigned to each process. + +# Returns +- `leaders::Vector{Int}`: An array where each entry is `1` for a leader, `-1` for a worker, and `0` otherwise. +- `leaderslist::Vector{Int}`: A list of process indices that are leaders and participate in communication. + +# Notes +- Communication is performed only between leaders. +""" +function _leaders(nprocs::Int, noderanges::Vector{UnitRange{Int}}) + leaders = zeros(Int, nprocs) + for i in 1:nprocs + if i < nprocs + if noderanges[i] == noderanges[i + 1] + leaders[i] = 1 + end + end + if i > 1 + if noderanges[i] == noderanges[i - 1] + leaders[i] = -1 + end + end + end + leaderslist = [] + for i in 1:nprocs + if leaders[i] != -1 + push!(leaderslist, i) + end + end + return leaders, leaderslist +end + +""" +multithreadPi(ValueType, f, localdims, Icombined, Jcombined) +Evaluate the function `f` on a grid defined by `Icombined` and `Jcombined` using multiple threads. +""" +function multithreadPi(::Type{ValueType}, f, localdims::Union{Vector{Int},NTuple{N,Int}}, Icombined::Vector{MultiIndex}, Jcombined::Vector{MultiIndex}) where {N,ValueType} + Pi = zeros(ValueType, length(Icombined), length(Jcombined)) + nthreads = Threads.nthreads() + chunk_size, remainder = divrem(length(Jcombined), nthreads) + col_ranges = [((i - 1) * chunk_size + min(i - 1, remainder) + 1):(i * chunk_size + min(i, remainder)) for i in 1:nthreads] + @threads for tid in 1:nthreads + if !isempty(col_ranges[tid]) + Pi[:, col_ranges[tid]] = filltensor( + ValueType, f, localdims, + Icombined, Jcombined[col_ranges[tid]], Val(0) + ) + end + end + return Pi +end +function parallelfullsearch(::Type{ValueType}, f, tci, Icombined, Jcombined, teamsize, multithread_eval, teamjuliarank, subcomm) where {ValueType} + if teamsize == 1 + t1 = time_ns() + if multithread_eval + Pi = multithreadPi(ValueType, f, tci.localdims, Icombined, Jcombined) + else + Pi = reshape( + filltensor(ValueType, f, tci.localdims, + Icombined, Jcombined, Val(0)), + length(Icombined), length(Jcombined) + ) + end + t2 = time_ns() + # For compilation purpouse with MPI + Icombined_copy = Icombined + Jcombined_copy = Jcombined + else # Teamwork + # Coordinate + t1 = time_ns() + Icombined_copy = MPI.bcast([Icombined], subcomm)[1] + Jcombined_copy = MPI.bcast([Jcombined], subcomm)[1] + # Subdivide + chunksize, remainder = divrem(length(Jcombined_copy), teamsize) + col_chunks = [chunksize + (i <= remainder ? 1 : 0) for i in 1:teamsize] + row_chunks = [length(Icombined_copy) for _ in 1:teamsize] + ranges = [(vcat([0],cumsum(col_chunks))[i] + 1):(cumsum(col_chunks)[i]) for i in 1:teamsize] + Jcombined_copy_local = Jcombined_copy[ranges[teamjuliarank]] + if !isempty(Jcombined_copy_local) + tlocalPi = time_ns() + if multithread_eval + localPi = multithreadPi(ValueType, f, tci.localdims, Icombined_copy, Jcombined_copy_local) + else + localPi = reshape( + filltensor(ValueType, f, tci.localdims, + Icombined_copy, Jcombined_copy_local, Val(0)), + length(Icombined_copy), length(Jcombined_copy_local) + ) + end + tlocalPi = (time_ns() - tlocalPi) * 1e-9 + else + localPi = zeros(length(Icombined_copy), length(Jcombined_copy_local)) + end + # Unite + Pi = zeros(length(Icombined_copy), length(Jcombined_copy)) + sizes = vcat(row_chunks', col_chunks') + counts = vec(prod(sizes, dims=1)) + Pi_vbuf = VBuffer(Pi, counts) + MPI.Allgatherv!(localPi, Pi_vbuf, subcomm) + t2 = time_ns() + end + return Pi, t1, t2 +end """ Update pivots at bond `b` of `tci` using the TCI2 algorithm. Site tensors will be invalidated. @@ -513,28 +722,52 @@ function updatepivots!( f::F, leftorthogonal::Bool; reltol::Float64=1e-14, - abstol::Float64=0.0, + abstol::Float64=1e-14, #it was 0... maxbonddim::Int=typemax(Int), sweepdirection::Symbol=:forward, pivotsearch::Symbol=:full, verbosity::Int=0, extraIset::Vector{MultiIndex}=MultiIndex[], extraJset::Vector{MultiIndex}=MultiIndex[], + estimatedbond::Int=100, + interbond::Bool=true, + multithread_eval::Bool=false, + subcomm=nothing ) where {F,ValueType} invalidatesitetensors!(tci) + + if sweepdirection == :parallel && MPI.Initialized() + comm = MPI.COMM_WORLD + mpirank = MPI.Comm_rank(comm) + juliarank = mpirank + 1 + nprocs = MPI.Comm_size(comm) + noderanges = _noderanges(nprocs, length(tci) - 1, tci.localdims[1], estimatedbond; interbond) + leaders, leaderslist = _leaders(nprocs, noderanges) + teamrank = MPI.Comm_rank(subcomm) + teamjuliarank = teamrank + 1 + teamsize = MPI.Comm_size(subcomm) + end + Icombined = union(kronecker(tci.Iset[b], tci.localdims[b]), extraIset) Jcombined = union(kronecker(tci.localdims[b+1], tci.Jset[b+1]), extraJset) luci = if pivotsearch === :full - t1 = time_ns() - Pi = reshape( - filltensor(ValueType, f, tci.localdims, - Icombined, Jcombined, Val(0)), - length(Icombined), length(Jcombined) - ) - t2 = time_ns() - + if sweepdirection == :parallel && MPI.Initialized() + Pi, t1, t2 = parallelfullsearch(ValueType, f, tci, Icombined, Jcombined, teamsize, multithread_eval, teamjuliarank, subcomm) + else # Serial + t1 = time_ns() + if multithread_eval + Pi = multithreadPi(ValueType, f, tci.localdims, Icombined, Jcombined) + else + Pi = reshape( + filltensor(ValueType, f, tci.localdims, + Icombined, Jcombined, Val(0)), + length(Icombined), length(Jcombined) + ) + end + t2 = time_ns() + end updatemaxsample!(tci, Pi) luci = MatrixLUCI( Pi, @@ -553,30 +786,65 @@ function updatepivots!( t1 = time_ns() I0 = Int.(Iterators.filter(!isnothing, findfirst(isequal(i), Icombined) for i in tci.Iset[b+1]))::Vector{Int} J0 = Int.(Iterators.filter(!isnothing, findfirst(isequal(j), Jcombined) for j in tci.Jset[b]))::Vector{Int} - Pif = SubMatrix{ValueType}(f, Icombined, Jcombined) t2 = time_ns() - res = MatrixLUCI( - ValueType, - Pif, - (length(Icombined), length(Jcombined)), - I0, J0; - reltol=reltol, abstol=abstol, - maxrank=maxbonddim, - leftorthogonal=leftorthogonal, - pivotsearch=:rook, - usebatcheval=true - ) + if verbosity > 2 + x, y = length(Icombined), length(Jcombined) + print(" Computing Pi ($x x $y) at bond $b: ") + end + if sweepdirection == :parallel && MPI.Initialized() + I0 = MPI.bcast([I0], subcomm)[1] + J0 = MPI.bcast([J0], subcomm)[1] + Icombined_copy = MPI.bcast([Icombined], subcomm)[1] + Jcombined_copy = MPI.bcast([Jcombined], subcomm)[1] + Pif = SubMatrix{ValueType}(f, Icombined_copy, Jcombined_copy) + res = MatrixLUCI( + ValueType, + Pif, + (length(Icombined_copy), length(Jcombined_copy)), + I0, J0; + reltol=reltol, abstol=abstol, + maxrank=maxbonddim, + leftorthogonal=leftorthogonal, + pivotsearch=:rook, + usebatcheval=true, + leaders=leaders,leaderslist=leaderslist, subcomm=subcomm + ) + MPI.Barrier(subcomm) + else # Serial + Pif = SubMatrix{ValueType}(f, Icombined, Jcombined) + res = MatrixLUCI( + ValueType, + Pif, + (length(Icombined), length(Jcombined)), + I0, J0; + reltol=reltol, abstol=abstol, + maxrank=maxbonddim, + leftorthogonal=leftorthogonal, + pivotsearch=:rook, + usebatcheval=true, + ) + end updatemaxsample!(tci, [ValueType(Pif.maxsamplevalue)]) t3 = time_ns() # Fall back to full search if rook search fails if npivots(res) == 0 - Pi = reshape( - filltensor(ValueType, f, tci.localdims, - Icombined, Jcombined, Val(0)), - length(Icombined), length(Jcombined) - ) + if sweepdirection == :parallel && MPI.Initialized() + Pi, _, _ = parallelfullsearch(ValueType, f, tci, Icombined, Jcombined, teamsize, multithread_eval, teamjuliarank, subcomm) + else # Serial + t1 = time_ns() + if multithread_eval + Pi = multithreadPi(ValueType, f, tci.localdims, Icombined, Jcombined) + else + Pi = reshape( + filltensor(ValueType, f, tci.localdims, + Icombined, Jcombined, Val(0)), + length(Icombined), length(Jcombined) + ) + end + t2 = time_ns() + end updatemaxsample!(tci, Pi) res = MatrixLUCI( Pi, @@ -588,17 +856,56 @@ function updatepivots!( end t4 = time_ns() + #if verbosity > 2 + # x, y = length(Icombined), length(Jcombined) + # println(" Computing Pi ($x x $y) at bond $b: $(1e-9*(t2-t1)) sec, LU: $(1e-9*(t3-t2)) sec, fall back to full: $(1e-9*(t4-t3)) sec") + #end if verbosity > 2 x, y = length(Icombined), length(Jcombined) - println(" Computing Pi ($x x $y) at bond $b: $(1e-9*(t2-t1)) sec, LU: $(1e-9*(t3-t2)) sec, fall back to full: $(1e-9*(t4-t3)) sec") + println("$(1e-9*(t2-t1)) sec, LU: $(1e-9*(t3-t2)) sec, fall back to full: $(1e-9*(t4-t3)) sec") end res else throw(ArgumentError("Unknown pivot search strategy $pivotsearch. Choose from :rook, :full.")) end - tci.Iset[b+1] = Icombined[rowindices(luci)] - tci.Jset[b] = Jcombined[colindices(luci)] - if length(extraIset) == 0 && length(extraJset) == 0 + # Receive after the send + if sweepdirection == :parallel && MPI.Initialized() + sreq = MPI.Request[] + if leaders[juliarank] != -1 + if length(noderanges[juliarank]) == 1 && b == noderanges[juliarank][1] && juliarank != leaderslist[1] && juliarank != leaderslist[end] # If processor has only 1 bond then communicates both left and right + currentleader = findfirst(isequal(juliarank), leaderslist) + if currentleader != nothing + push!(sreq, MPI.isend([Jcombined[colindices(luci)]], comm; dest=leaderslist[currentleader - 1] - 1, tag=(2*b+1))) + push!(sreq, MPI.isend([Icombined[rowindices(luci)]], comm; dest=leaderslist[currentleader + 1] - 1, tag=2*(b+1))) + else + println("Error! Missmatch between leaders and leaderslist, unexpected behaviour! Please open an issue") + end + elseif b == noderanges[juliarank][1] && juliarank != leaderslist[1] # processor communicates left + currentleader = findfirst(isequal(juliarank), leaderslist) + if currentleader != nothing + push!(sreq, MPI.isend([Jcombined[colindices(luci)]], comm; dest=leaderslist[currentleader - 1] - 1, tag=(2*b+1))) + else + println("Error! Missmatch between leaders and leaderslist, unexpected behaviour! Please open an issue") + end + tci.Iset[b+1] = Icombined[rowindices(luci)] + elseif b == noderanges[juliarank][end] && juliarank != leaderslist[end] # processor communicates right + currentleader = findfirst(isequal(juliarank), leaderslist) + if currentleader != nothing + push!(sreq, MPI.isend([Icombined[rowindices(luci)]], comm; dest=leaderslist[currentleader + 1] - 1, tag=2*(b+1))) + else + println("Error! Missmatch between leaders and leaderslist, unexpected behaviour! Please open an issue") + end + tci.Jset[b] = Jcombined[colindices(luci)] + else # normal updates without MPI communications + tci.Iset[b+1] = Icombined[rowindices(luci)] + tci.Jset[b] = Jcombined[colindices(luci)] + end + end + else + tci.Iset[b+1] = Icombined[rowindices(luci)] + tci.Jset[b] = Jcombined[colindices(luci)] + end + if length(extraIset) == 0 && length(extraJset) == 0 && sweepdirection != :parallel setsitetensor!(tci, b, left(luci)) setsitetensor!(tci, b + 1, right(luci)) end @@ -660,7 +967,11 @@ end maxnglobalpivot::Int=5, nsearchglobalpivot::Int=5, tolmarginglobalsearch::Float64=10.0, - strictlynested::Bool=false + strictlynested::Bool=false, + checkbatchevaluatable::Bool=false, + multithread_eval::Bool=false, + estimatedbond::Int=100, + interbond::Bool=true ) where {ValueType} Perform optimization sweeps using the TCI2 algorithm. This will sucessively improve the TCI approximation of a function until it fits `f` with an error smaller than `tolerance`, or until the maximum bond dimension (`maxbonddim`) is reached. @@ -673,7 +984,7 @@ Arguments: - `tolerance::Float64` is a float specifying the target tolerance for the interpolation. Default: `1e-8`. - `maxbonddim::Int` specifies the maximum bond dimension for the TCI. Default: `typemax(Int)`, i.e. effectively unlimited. - `maxiter::Int` is the maximum number of iterations (i.e. optimization sweeps) before aborting the TCI construction. Default: `200`. -- `sweepstrategy::Symbol` specifies whether to sweep forward (:forward), backward (:backward), or back and forth (:backandforth) during optimization. Default: `:backandforth`. +- `sweepstrategy::Symbol` specifies whether to sweep forward (:forward), backward (:backward), back and forth (:backandforth) or in parallel (:parallel) during optimization. Default: `:backandforth`. - `pivotsearch::Symbol` determins how pivots are searched (`:full` or `:rook`). Default: `:full`. - `verbosity::Int` can be set to `>= 1` to get convergence information on standard output during optimization. Default: `0`. - `loginterval::Int` can be set to `>= 1` to specify how frequently to print convergence information. Default: `10`. @@ -683,6 +994,9 @@ Arguments: - `maxnglobalpivot::Int` can be set to `>= 0`. Default: `5`. The maximum number of global pivots to add in each iteration. - `strictlynested::Bool` determines whether to preserve partial nesting in the TCI algorithm. Default: `false`. - `checkbatchevaluatable::Bool` Check if the function `f` is batch evaluatable. Default: `false`. +- `multithread_eval::Bool` determines whether to use multithreading for function evaluation. Default: `false`. +- `estimatedbond::Int` is the estimated bond dimension after compression, set by the user to help balance the workload. Default: `100`. +- `interbond::Bool` determines whether to distribute the workload across bonds or not. Default: `true`. - `checkconvglobalpivot::Bool` Check if the global pivot finder is converged. Default: `true`. In the future, this will be set to `false` by default. Arguments (deprecated): @@ -716,6 +1030,9 @@ function optimize!( tolmarginglobalsearch::Float64=10.0, strictlynested::Bool=false, checkbatchevaluatable::Bool=false, + multithread_eval::Bool=false, + estimatedbond::Int=100, + interbond::Bool=true, checkconvglobalpivot::Bool=true ) where {ValueType} errors = Float64[] @@ -753,6 +1070,18 @@ function optimize!( )) end + if (sweepstrategy == :parallel) && !MPI.Initialized() + println("Warning! Parallel strategy has been chosen, but MPI is not initialized, please use initializempi() before using crossinterpolate2()/quanticscrossinterpolate() and use finalizempi() afterwards") + end + + if sweepstrategy == :parallel && MPI.Initialized() + comm = MPI.COMM_WORLD + mpirank = MPI.Comm_rank(comm) + juliarank = mpirank + 1 + nprocs = MPI.Comm_size(comm) + noderanges = _noderanges(nprocs, length(tci) - 1, tci.localdims[1], estimatedbond; interbond) + leaders, leaderslist = _leaders(nprocs, noderanges) + end # Create the global pivot finder finder = if isnothing(globalpivotfinder) DefaultGlobalPivotFinder( @@ -783,7 +1112,10 @@ function optimize!( strictlynested=strictlynested, verbosity=verbosity, sweepstrategy=sweepstrategy, - fillsitetensors=true + fillsitetensors=true, + estimatedbond=estimatedbond, + interbond=interbond, + multithread_eval=multithread_eval ) if verbosity > 0 && length(globalpivots) > 0 && mod(iter, loginterval) == 0 abserr = [abs(evaluate(tci, p) - f(p)) for p in globalpivots] @@ -801,14 +1133,16 @@ function optimize!( end # Find global pivots where the error is too large - input = GlobalPivotSearchInput(tci) - globalpivots = finder( - input, f, abstol; - verbosity=verbosity, - rng=Random.default_rng() - ) - addglobalpivots!(tci, globalpivots) - push!(nglobalpivots, length(globalpivots)) + if sweepstrategy != :parallel + input = GlobalPivotSearchInput(tci) + globalpivots = finder( + input, f, abstol; + verbosity=verbosity, + rng=Random.default_rng() + ) + addglobalpivots!(tci, globalpivots) + push!(nglobalpivots, length(globalpivots)) + end if verbosity > 1 println(" Walltime $(1e-9*(time_ns() - tstart)) sec: done searching global pivots") @@ -820,16 +1154,50 @@ function optimize!( println("iteration = $iter, rank = $(last(ranks)), error= $(last(errors)), maxsamplevalue= $(tci.maxsamplevalue), nglobalpivot=$(length(globalpivots))") flush(stdout) end - if convergencecriterion( - ranks, errors, - nglobalpivots, - abstol, maxbonddim, ncheckhistory; - checkconvglobalpivot=checkconvglobalpivot + # Checks if every processor has converged with its own subtrain + if sweepstrategy == :parallel && MPI.Initialized() + converged = leaders[juliarank] == -1 || convergencecriterion(ranks, errors, nglobalpivots, abstol, maxbonddim, ncheckhistory; checkconvglobalpivot=checkconvglobalpivot) + MPI.Barrier(comm) + global_converged = MPI.Allreduce(converged, MPI.LAND, comm) + if global_converged # If all the processors have converged + break + end + elseif convergencecriterion( + ranks, errors, nglobalpivots, abstol, maxbonddim, ncheckhistory; checkconvglobalpivot=checkconvglobalpivot ) break end end + # After convergence + # Allgatherv! not possible given type constraints + if sweepstrategy == :parallel && MPI.Initialized() + if juliarank == 1 # If we are the first processor, we take all the information from the other processes + for leader in leaderslist[2:end] + for b in noderanges[leader] + tci.Iset[b] = MPI.recv(comm; source=leader - 1, tag=2*(b))[1] + tci.Jset[b+1] = MPI.recv(comm; source=leader - 1, tag=2*(b+1)+1)[1] + if leader == leaderslist[end] + tci.Iset[length(tci)] = MPI.recv(comm; source=leader - 1, tag=2*(length(tci)+1)+1)[1] + end + end + end + else # Other processors pass the infomation to the first one + if leaders[juliarank] != -1 + for b in noderanges[juliarank] + MPI.send([tci.Iset[b]], comm; dest=0, tag=2*(b)) + MPI.send([tci.Jset[b+1]], comm; dest=0, tag=2*(b+1)+1) + if juliarank == leaderslist[end] + MPI.send([tci.Iset[length(tci)]], comm; dest=0, tag=2*(length(tci)+1)+1) + end + end + end + end + # BCast to all, so that all nodes have full knowledge + tci.Iset = MPI.bcast(tci.Iset, comm) + tci.Jset = MPI.bcast(tci.Jset, comm) + end + # Extra one sweep by the 1-site update to # (1) Remove unnecessary pivots added by global pivots # Note: a pivot matrix can be non-square after adding global pivots, @@ -861,14 +1229,23 @@ function sweep2site!( pivotsearch::Symbol=:full, verbosity::Int=0, strictlynested::Bool=false, - fillsitetensors::Bool=true + fillsitetensors::Bool=true, + estimatedbond::Int=100, + interbond::Bool=true, + multithread_eval::Bool=false ) where {ValueType} invalidatesitetensors!(tci) n = length(tci) - for iter in iter1:iter1+niter-1 + if sweepstrategy == :parallel && MPI.Initialized() + comm = MPI.COMM_WORLD + mpirank = MPI.Comm_rank(comm) + juliarank = mpirank + 1 + nprocs = MPI.Comm_size(comm) + end + for iter in iter1:iter1+niter-1 extraIset = [MultiIndex[] for _ in 1:n] extraJset = [MultiIndex[] for _ in 1:n] if !strictlynested && length(tci.Iset_history) > 0 @@ -880,7 +1257,56 @@ function sweep2site!( push!(tci.Jset_history, deepcopy(tci.Jset)) flushpivoterror!(tci) - if forwardsweep(sweepstrategy, iter) # forward sweep + if sweepstrategy == :parallel && MPI.Initialized() + noderanges = _noderanges(nprocs, length(tci) - 1, tci.localdims[1], estimatedbond; interbond) + leaders, leaderslist = _leaders(nprocs, noderanges) + colors = zeros(Int,nprocs) + count = -1 + for i in 1:length(colors) + if leaders[i] != -1 + count += 1 + end + colors[i] = count + end + color = colors[juliarank] + subcomm = MPI.Comm_split(comm, color, mpirank) + for bondindex in noderanges[juliarank] + updatepivots!( + tci, bondindex, f, true; + abstol=abstol, + maxbonddim=maxbonddim, + sweepdirection=:parallel, + pivotsearch=pivotsearch, + verbosity=verbosity, + extraIset=extraIset[bondindex+1], + extraJset=extraJset[bondindex], + estimatedbond=estimatedbond, + interbond=interbond, + multithread_eval=multithread_eval, + subcomm = subcomm + ) + end + for b in noderanges[juliarank] + if leaders[juliarank] != -1 + if b == noderanges[juliarank][1] && juliarank != leaderslist[1] + currentleader = findfirst(isequal(juliarank), leaderslist) + if currentleader != nothing + tci.Iset[b] = MPI.recv(comm; source=leaderslist[currentleader - 1] - 1, tag=2*b)[1] + else + println("Error! Missmatch between leaders and leaderslist, unexpected behaviour! Please open an issue") + end + end + if b == noderanges[juliarank][end] && juliarank != leaderslist[end] + currentleader = findfirst(isequal(juliarank), leaderslist) + if currentleader != nothing + tci.Jset[b + 1] = MPI.recv(comm; source=leaderslist[currentleader + 1] - 1, tag=2*(b + 1) + 1)[1] + else + println("Error! Missmatch between leaders and leaderslist, unexpected behaviour! Please open an issue") + end + end + end + end + elseif forwardsweep(sweepstrategy, iter) # forward sweep for bondindex in 1:n-1 updatepivots!( tci, bondindex, f, true; @@ -891,6 +1317,7 @@ function sweep2site!( verbosity=verbosity, extraIset=extraIset[bondindex+1], extraJset=extraJset[bondindex], + multithread_eval=multithread_eval ) end else # backward sweep @@ -904,12 +1331,13 @@ function sweep2site!( verbosity=verbosity, extraIset=extraIset[bondindex+1], extraJset=extraJset[bondindex], + multithread_eval=multithread_eval ) end end end - if fillsitetensors + if fillsitetensors && sweepstrategy != :parallel fillsitetensors!(tci, f) end nothing @@ -994,4 +1422,4 @@ function searchglobalpivots( end return [p for (_,p) in pivots] -end +end \ No newline at end of file diff --git a/src/tensortrain.jl b/src/tensortrain.jl index 2dc5fde..49e7f9a 100644 --- a/src/tensortrain.jl +++ b/src/tensortrain.jl @@ -93,8 +93,8 @@ function tensortrain(tci) end function _factorize( - A::AbstractMatrix{V}, method::Symbol; tolerance::Float64, maxbonddim::Int, leftorthogonal::Bool=false, normalizeerror=true -)::Tuple{Matrix{V},Matrix{V},Int} where {V} + A::AbstractMatrix{V}, method::Symbol; tolerance::Float64, maxbonddim::Int, leftorthogonal::Bool=false, diamond::Bool=false, normalizeerror=true, q::Int=0, p::Int=16 +)::Union{Tuple{Matrix{V},Matrix{V},Int},Tuple{Matrix{V},Vector{V},Matrix{V},Int}} where {V} reltol = 1e-14 abstol = 0.0 if normalizeerror @@ -114,22 +114,104 @@ function _factorize( normalized_err = err ./ sum(factorization.S .^ 2) trunci = min( - replacenothing(findfirst(<(abstol^2), err), length(err)), - replacenothing(findfirst(<(reltol^2), normalized_err), length(normalized_err)), + replacenothing(findlast(>(tolerance), Array(factorization.S)), 1), maxbonddim ) - if leftorthogonal + if diamond return ( - factorization.U[:, 1:trunci], - Diagonal(factorization.S[1:trunci]) * factorization.Vt[1:trunci, :], - trunci - ) + Matrix(factorization.U[:, 1:trunci]), + factorization.S[1:trunci], + Matrix(factorization.Vt[1:trunci, :]), + trunci + ) else - return ( - factorization.U[:, 1:trunci] * Diagonal(factorization.S[1:trunci]), - factorization.Vt[1:trunci, :], - trunci - ) + if leftorthogonal + return ( + Matrix(factorization.U[:, 1:trunci]), + Matrix(Diagonal(factorization.S[1:trunci]) * factorization.Vt[1:trunci, :]), + trunci + ) + else + return ( + Matrix(factorization.U[:, 1:trunci] * Diagonal(factorization.S[1:trunci])), + Matrix(factorization.Vt[1:trunci, :]), + trunci + ) + end + end + elseif method === :RSVD + invert = false + if size(A)[1] < size(A)[2] + A = A' + invert = true + end + if maxbonddim == typemax(Int) || maxbonddim + p > size(A)[1] || maxbonddim + p > size(A)[2] + factorization = LinearAlgebra.svd(A) + else + m, n = size(A) + G = randn(n, maxbonddim + p) + Y = A*G + Q = Matrix(LinearAlgebra.qr!(Y).Q) # THEORETICAL BOTTLENECK + for _ in 1:q # q=0 for best performance + Y = A'*Q + Q = Matrix(LinearAlgebra.qr!(Y).Q) + Y = A*Q + Q = Matrix(LinearAlgebra.qr!(Y).Q) + end + B = Q' * A + factorization = LinearAlgebra.svd(B) + factorization = SVD((Q*factorization.U)[:,1:maxbonddim], factorization.S[1:maxbonddim], factorization.Vt[1:maxbonddim,:]) + end + trunci = min( + replacenothing(findlast(>(tolerance), Array(factorization.S)), 1), + maxbonddim + ) + if diamond + if invert + return ( + Matrix(factorization.Vt[1:trunci, :]'), + factorization.S[1:trunci], + Matrix(factorization.U[:, 1:trunci]'), + trunci + ) + else + return ( + Matrix(factorization.U[:, 1:trunci]), + factorization.S[1:trunci], + Matrix(factorization.Vt[1:trunci, :]), + trunci + ) + end + else + if leftorthogonal + if invert + return ( + Matrix(factorization.Vt[1:trunci, :]' * Diagonal(factorization.S[1:trunci])), + Matrix(factorization.U[:, 1:trunci]'), + trunci + ) + else + return ( + Matrix(factorization.U[:, 1:trunci]), + Matrix(Diagonal(factorization.S[1:trunci]) * factorization.Vt[1:trunci, :]), + trunci + ) + end + else + if invert + return ( + Matrix(factorization.Vt[1:trunci, :]' * Diagonal(factorization.S[1:trunci])), + Matrix(factorization.U[:, 1:trunci]'), + trunci + ) + else + return ( + Matrix(factorization.U[:, 1:trunci] * Diagonal(factorization.S[1:trunci])), + Matrix(factorization.Vt[1:trunci, :]), + trunci + ) + end + end end else error("Not implemented yet.") @@ -182,7 +264,6 @@ function compress!( nothing end - function multiply!(tt::TensorTrain{V,N}, a) where {V,N} tt.sitetensors[end] .= tt.sitetensors[end] .* a nothing @@ -290,3 +371,7 @@ function fulltensor(obj::TensorTrain{T,N})::Array{T} where {T,N} returnsize = collect(Iterators.flatten(sitedims_)) return reshape(result, returnsize...) end + +function IMPO(R::Int) + return TensorTrain{Float64, 4}([reshape([1.,0.,0.,1.], 1,2,2,1) for _ in 1:R]) +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 768cb06..fbc9cf6 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -4,6 +4,7 @@ using LinearAlgebra include("test_with_aqua.jl") include("test_with_jet.jl") +include("test_mpi.jl") include("test_util.jl") include("test_sweepstrategies.jl") include("test_indexset.jl") diff --git a/test/test_contraction.jl b/test/test_contraction.jl index 71f6009..d3db9ac 100644 --- a/test/test_contraction.jl +++ b/test/test_contraction.jl @@ -1,6 +1,8 @@ import TensorCrossInterpolation as TCI using TensorCrossInterpolation using Random +# TODO remove +using Test function _tomat(tto::TensorTrain{T,4}) where {T} sitedims = TCI.sitedims(tto) @@ -34,7 +36,7 @@ function _gen_testdata_TTO_TTO() bonddims_b = [1, 2, 3, 2, 1] localdims1 = [2, 2, 2, 2] localdims2 = [3, 3, 3, 3] - localdims3 = [2, 2, 2, 2] + localdims3 = [5, 5, 5, 5] a = TensorTrain{ComplexF64,4}([ rand(ComplexF64, bonddims_a[n], localdims1[n], localdims2[n], bonddims_a[n+1]) @@ -181,15 +183,23 @@ end end end - -@testset "MPO-MPO contraction (zipup)" for method in [:SVD, :LU] +@testset "MPO-MPO contraction" for algorithm in [:zipup, :distrzipup, :fit, :distrfit], method in [:SVD, :LU, :RSVD], maxbonddim in [typemax(Int), 10] + Random.seed!(42) # For reproducibility N, a, b, localdims1, localdims2, localdims3 = _gen_testdata_TTO_TTO() - ab = contract(a, b; algorithm=:zipup, method=method) + #println([size(a[i]) for i in 1:length(a)]) + #println([size(b[i]) for i in 1:length(b)]) + #println("$maxbonddim with $method") + ab = contract(a, b; algorithm=algorithm, method=method, maxbonddim=maxbonddim, nsweeps=10) + #println([size(a[i]) for i in 1:length(a)]) + #println([size(b[i]) for i in 1:length(b)]) + #println([size(ab[i]) for i in 1:length(ab)]) + @test _tomat(ab) ≈ _tomat(a) * _tomat(b) end -@testset "MPO-MPS contraction (zipup)" for method in [:SVD, :LU] +@testset "MPO-MPS contraction" for algorithm in [:zipup, :distrzipup, :fit, :distrfit], method in [:SVD, :LU, :RSVD], maxbonddim in [typemax(Int), 10] + Random.seed!(42) # For reproducibility N, a, b, localdims1, localdims2 = _gen_testdata_TTO_TTS() - ab = contract(a, b; algorithm=:zipup, method=method) + ab = contract(a, b; algorithm=algorithm, method=method, maxbonddim=maxbonddim, nsweeps=10) @test _tovec(ab) ≈ _tomat(a) * _tovec(b) end \ No newline at end of file diff --git a/test/test_mpi.jl b/test/test_mpi.jl new file mode 100644 index 0000000..cb02fcc --- /dev/null +++ b/test/test_mpi.jl @@ -0,0 +1,58 @@ +using MPI +using Test +import TensorCrossInterpolation as TCI + +@testset "MPI" begin + @testset "MPI init" begin + @test !MPI.Initialized() + TCI.initializempi(false) + @test MPI.Initialized() + end + + @testset "MPI communicator" begin + comm = MPI.COMM_WORLD + nprocs = MPI.Comm_size(comm) + mpirank = MPI.Comm_rank(comm) + @test nprocs ≥ 1 + @test 0 ≤ mpirank < nprocs + end + + @testset "synchronize_tt!" begin + comm = MPI.COMM_WORLD + nprocs = MPI.Comm_size(comm) + mpirank = MPI.Comm_rank(comm) + juliarank = mpirank + 1 + + tolerance = 1e-8 + maxbonddim = 50 + + function f0(vec) + cos(prod(vec) * pi / 2^6) + end + function f(vec) + cos(prod(vec.^juliarank) * pi / 2^6) + end + + tt0, _, _ = TCI.crossinterpolate2(Float64, f0, fill(2, 6); tolerance, maxbonddim) + tt, _, _ = TCI.crossinterpolate2(Float64, f, fill(2, 6); tolerance, maxbonddim) + + maxerr = 0. + for x1 in 1:2, x2 in 1:2, x3 in 1:2, x4 in 1:2, x5 in 1:2, x6 in 1:2 + maxerr = max(maxerr, abs(TCI.evaluate(tt0, [x1,x2,x3,x4,x5,x6]) - TCI.evaluate(tt, [x1,x2,x3,x4,x5,x6]))) + end + if mpirank == 0 + @test maxerr < 1e-8 + else + @test maxerr > 1e-8 + end + + juliasource = 1 + TCI.synchronize_tt!(tt; juliasource=juliasource) + + maxerr = 0. + for x1 in 1:2, x2 in 1:2, x3 in 1:2, x4 in 1:2, x5 in 1:2, x6 in 1:2 + maxerr = max(maxerr, abs(TCI.evaluate(tt0, [x1,x2,x3,x4,x5,x6]) - TCI.evaluate(tt, [x1,x2,x3,x4,x5,x6]))) + end + @test maxerr < 1e-8 + end +end \ No newline at end of file diff --git a/test/test_tensorci2.jl b/test/test_tensorci2.jl index 7c9ec81..2017b5f 100644 --- a/test/test_tensorci2.jl +++ b/test/test_tensorci2.jl @@ -5,6 +5,10 @@ import Random import Random: AbstractRNG import QuanticsGrids as QD +TCI.initializempi(false) + +# TODO write test for parallel subfunctions + @testset "TensorCI2" begin @testset "kronecker util function" begin multiset = [collect(1:5) for _ in 1:5] @@ -24,7 +28,7 @@ import QuanticsGrids as QD end end - @testset "pivoterrors" begin + @testset "pivoterrors" for sweepstrategy in [:backandforth, :parallel] diags = [1.0, 1e-5, 0.0] f(x) = (x[1] == x[2] ? diags[x[1]] : 0.0) localdims = [3, 3] @@ -34,11 +38,12 @@ import QuanticsGrids as QD localdims, [[1, 1]]; tolerance=1e-8, + sweepstrategy=:sweepstrategy ) @test tci.pivoterrors == diags end - @testset "checkbatchevaluatable" begin + @testset "checkbatchevaluatable" for sweepstrategy in [:backandforth, :parallel] f(x) = 1.0 # Constant function without batch evaluation L = 10 localdims = fill(2, L) @@ -48,11 +53,12 @@ import QuanticsGrids as QD f, localdims, firstpivots; - checkbatchevaluatable=true + checkbatchevaluatable=true, + sweepstrategy=:sweepstrategy ) end - @testset "trivial MPS(exp): pivotsearch=$pivotsearch" for pivotsearch in [:full, :rook], strictlynested in [false, true], nsearchglobalpivot in [0, 10] + @testset "trivial MPS(exp): pivotsearch=$pivotsearch" for pivotsearch in [:full, :rook], strictlynested in [false, true], nsearchglobalpivot in [0, 10], sweepstrategy in [:backandforth, :parallel] if nsearchglobalpivot > 0 && strictlynested continue end @@ -82,7 +88,8 @@ import QuanticsGrids as QD normalizeerror=false, nsearchglobalpivot=nsearchglobalpivot, pivotsearch=pivotsearch, - strictlynested=strictlynested + strictlynested=strictlynested, + sweepstrategy=:sweepstrategy ) @test all(TCI.linkdims(tci) .== 1) @@ -195,7 +202,8 @@ import QuanticsGrids as QD normalizeerror=false, nsearchglobalpivot=nsearchglobalpivot, pivotsearch=pivotsearch, - strictlynested=strictlynested + strictlynested=strictlynested, + sweepstrategy=:sweepstrategy ) @test all(TCI.linkdims(tci) .== 1) @@ -244,7 +252,9 @@ import QuanticsGrids as QD @test_logs (:warn, "The option `pivottolerance` of `optimize!(tci::TensorCI2, f)` is deprecated. Please update your code to use `tolerance`, as `pivottolerance` will be removed in the future.") optimize!(tci, f; pivottolerance = 0.1) end - @testset "Lorentz MPS with ValueType=$(typeof(coeff)), pivotsearch=$pivotsearch" for coeff in [1.0, 0.5 - 1.0im], pivotsearch in [:full, :rook] + # This is a stochastic test + @testset "Lorentz MPS with ValueType=$(typeof(coeff)), pivotsearch=$pivotsearch" for seed in collect(1:20), coeff in [1.0, 0.5 - 1.0im], pivotsearch in [:full, :rook], sweepstrategy in [:backandforth, :parallel] + Random.seed!(seed) n = 5 f(v) = coeff ./ (sum(v .^ 2) + 1) @@ -285,7 +295,7 @@ import QuanticsGrids as QD [ones(Int, n)]; tolerance=1e-8, maxiter=8, - sweepstrategy=:forward, + sweepstrategy=:sweepstrategy, pivotsearch=pivotsearch ) @@ -301,7 +311,8 @@ import QuanticsGrids as QD [ones(Int, n)]; tolerance=1e-12, maxiter=200, - pivotsearch + pivotsearch, + sweepstrategy=:sweepstrategy ) @test pivoterror(tci3) <= 2e-12 @@ -323,7 +334,8 @@ import QuanticsGrids as QD initialpivots; tolerance=1e-12, maxiter=200, - pivotsearch + pivotsearch, + sweepstrategy=:sweepstrategy ) @test pivoterror(tci4) <= 2e-12 @@ -340,7 +352,7 @@ import QuanticsGrids as QD end - @testset "insert_global_pivots: pivotsearch=$pivotsearch, strictlynested=$strictlynested, seed=$seed" for seed in collect(1:20), pivotsearch in [:full, :rook], strictlynested in [false] + @testset "insert_global_pivots: pivotsearch=$pivotsearch, strictlynested=$strictlynested, seed=$seed" for seed in collect(1:20), pivotsearch in [:full, :rook], strictlynested in [false], sweepstrategy in [:backandforth, :parallel] Random.seed!(seed) R = 20 @@ -375,7 +387,8 @@ import QuanticsGrids as QD verbosity=0, normalizeerror=false, pivotsearch=pivotsearch, - strictlynested=strictlynested + strictlynested=strictlynested, + sweepstrategy=:sweepstrategy ) TCI.addglobalpivots2sitesweep!( @@ -392,7 +405,7 @@ import QuanticsGrids as QD @test sum(abs.([TCI.evaluate(tci, r) - f(r) for r in rindex]) .> abstol) == 0 end - @testset "insert_global_pivots" begin + @testset "insert_global_pivots" for sweepstrategy in [:backandforth, :parallel] Random.seed!(1234) R = 20 @@ -413,7 +426,8 @@ import QuanticsGrids as QD loginterval=1, verbosity=0, normalizeerror=false, - strictlynested=false + strictlynested=false, + sweepstrategy=:sweepstrategy ) r = fill(2, R) @@ -430,7 +444,7 @@ import QuanticsGrids as QD @test TCI.evaluate(tci, r) ≈ f(r) end - @testset "globalsearch" begin + @testset "globalsearch" for sweepstrategy in [:backandforth, :parallel] Random.seed!(1234) n = 10 @@ -451,7 +465,8 @@ import QuanticsGrids as QD maxbonddim=100, maxiter=100, nsearchglobalpivot=10, - strictlynested=false + strictlynested=false, + sweepstrategy=:sweepstrategy ) @test errors[end] < 1e-10 @@ -474,7 +489,7 @@ import QuanticsGrids as QD @test tci2.Jset == tci.Jset end - @testset "crossinterpolate2_ttcache" begin + @testset "crossinterpolate2_ttcache" for sweepstrategy in [:backandforth, :parallel] ValueType = Float64 N = 4 @@ -490,7 +505,8 @@ import QuanticsGrids as QD ttc, localdims; tolerance=1e-10, - maxbonddim=10 + maxbonddim=10, + sweepstrategy=:sweepstrategy ) tt_reconst = TCI.TensorTrain(tci2) @@ -501,6 +517,38 @@ import QuanticsGrids as QD @test vals_reconst ≈ vals_ref end + @testset "multithreadPi" begin + f = (x -> sum(x)) + + localdims = [2,2,2,2,2,2] + + Icombined = [[1,1,1,1], [1,2,1,1], [1,1,2,1]] + Jcombined = [[2,2], [1,2], [2,1], [1,1]] + + mPi = TCI.multithreadPi(Float64, f, localdims, Icombined, Jcombined) + + Pi = reshape( + TCI.filltensor(Float64, f, localdims, + Icombined, Jcombined, Val(0)), + length(Icombined), length(Jcombined) + ) + + @test size(Pi) == (3, 4) + + @test mPi == Pi + end + + @testset "noderanges" begin + @test TCI._noderanges(8, 24, 2, 100) == [1:8, 9:9, 10:10, 11:11, 12:12, 13:13, 14:14, 15:24] + @test TCI._noderanges(8, 24, 2, 500) == [1:9, 10:10, 11:11, 12:12, 13:13, 14:14, 15:15, 16:24] + @test TCI._noderanges(8, 24, 2, 100; interbond=false) == [1:24 for _ in 1:8] + @test TCI._noderanges(8, 24, 2, 500; interbond=false) == [1:24 for _ in 1:8] + @test TCI._noderanges(8, 24, 2, 100) == TCI._noderanges(8, 24, 2, 100; estimatedbonds=[2,4,8,16,32,64,100,100,100,100,100,100,100,100,100,100,100,64,32,16,8,4,2]) + + ranges, eff = TCI._noderanges(4, 8, 8, 64, efficiencycheck=true) + @test eff > 3.9 + end + @testset "convergencecriterion" begin let ranks = [1, 2] @@ -553,3 +601,4 @@ import QuanticsGrids as QD end end end + diff --git a/test/test_tensortrain.jl b/test/test_tensortrain.jl index e167dc8..c039ed5 100644 --- a/test/test_tensortrain.jl +++ b/test/test_tensortrain.jl @@ -253,3 +253,7 @@ end @test TCI.fulltensor(tt1) ≈ TCI.fulltensor(tt3) end +@testset "factorize" begin + +end +