diff --git a/.gitignore b/.gitignore
index 20fe29d..918691c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,5 @@
 *.jl.mem
 /Manifest.toml
 /docs/build/
+LocalPreferences.toml
+.vscode/settings.json
diff --git a/Project.toml b/Project.toml
index 76f3d20..dcff28e 100644
--- a/Project.toml
+++ b/Project.toml
@@ -7,6 +7,7 @@ version = "0.9.17"
 BitIntegers = "c3b6d118-76ef-56ca-8cc7-ebb389d030a1"
 EllipsisNotation = "da5c29d0-fa7d-589e-88eb-ea29b0a81949"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
 QuadGK = "1fd47b50-473d-5c70-9696-f719f8f3bcdc"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 
@@ -16,6 +17,7 @@ EllipsisNotation = "1"
 QuadGK = "2.9"
 Random = "1.10.0"
 julia = "1.6"
+MPI = "0.20.22"
 
 [extras]
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
diff --git a/docs/src/documentation.md b/docs/src/documentation.md
index 2c28b27..d406150 100644
--- a/docs/src/documentation.md
+++ b/docs/src/documentation.md
@@ -56,3 +56,8 @@ Modules = [TensorCrossInterpolation]
 Pages = ["cachedfunction.jl", "batcheval.jl", "util.jl", "globalsearch.jl"]
 ```
 
+## Parallel utility
+```@autodocs
+Modules = [TensorCrossInterpolation]
+Pages = ["mpi.jl"]
+```
\ No newline at end of file
diff --git a/src/TensorCrossInterpolation.jl b/src/TensorCrossInterpolation.jl
index cad8cd5..32777f3 100644
--- a/src/TensorCrossInterpolation.jl
+++ b/src/TensorCrossInterpolation.jl
@@ -3,8 +3,10 @@ module TensorCrossInterpolation
 using LinearAlgebra
 using EllipsisNotation
 using BitIntegers
-import QuadGK
+using MPI
+using Base.Threads
 
+import QuadGK
 # To add a method for rank(tci)
 import LinearAlgebra: rank, diag
 import LinearAlgebra as LA
@@ -40,5 +42,6 @@ include("conversion.jl")
 include("integration.jl")
 include("contraction.jl")
 include("globalsearch.jl")
+include("mpi.jl")
 
 end
diff --git a/src/abstracttensortrain.jl b/src/abstracttensortrain.jl
index 3ff96cc..8c63ea4 100644
--- a/src/abstracttensortrain.jl
+++ b/src/abstracttensortrain.jl
@@ -131,6 +131,26 @@ function evaluate(
     return only(prod(T[:, i, :] for (T, i) in zip(tt, indexset)))
 end
 
+"""
+    function evaluate(
+        tt::TensorTrain{V},
+        indexset::Union{AbstractVector{LocalIndex}, NTuple{N, LocalIndex}}
+    )::V where {N, V}
+
+Evaluates the tensor train `tt` at indices given by `indexset` and `jndexset`. This is ment to be used for MPOs.
+"""
+function evaluate(
+    tt::AbstractTensorTrain{V},
+    indexset::Union{AbstractVector{LocalIndex},NTuple{N,LocalIndex}},
+    jndexset::Union{AbstractVector{LocalIndex},NTuple{N,LocalIndex}}
+)::V where {N,V}
+    if length(indexset) != length(tt)
+        throw(ArgumentError("To evaluate a tt of length $(length(tt)), you have to provide $(length(tt)) indices, but there were $(length(indexset))."))
+    end
+    return only(prod(T[:, i, j, :] for (T, i, j) in zip(tt, indexset, jndexset)))
+end
+
+
 """
     function evaluate(tt::TensorTrain{V}, indexset::CartesianIndex) where {V}
 
@@ -175,6 +195,38 @@ function sum(tt::AbstractTensorTrain{V}) where {V}
     return only(v)
 end
 
+"""
+    function average(tt::TensorTrain{V}) where {V}
+
+Evaluates the average of the tensor train approximation over all lattice sites in an efficient
+factorized manner.
+"""
+function average(tt::AbstractTensorTrain{V}) where {V}
+    v = transpose(sum(tt[1], dims=(1, 2))[1, 1, :]) / length(tt[1][1, :, 1])
+    for T in tt[2:end]
+        v *= sum(T, dims=2)[:, 1, :] / length(T[1, :, 1])
+    end
+    return only(v)
+end
+
+"""
+    function weightedsum(tt::TensorTrain{V}, w::Vector{V}) where {V}
+
+Evaluates the weighted sum of the tensor train approximation over all lattice sites in an efficient
+factorized manner, where w is the vector of vector of weights which has the same length and the same sizes as tt.
+"""
+function weightedsum(tt::AbstractTensorTrain{V}, w::Vector{Vector{V}}) where {V}
+    length(tt) == length(w) || throw(DimensionMismatch("The length of the Tensor Train is different from the one of the weight vector ($(length(tt)) and $(length(w)))."))
+    size(tt[1])[2] == length(w[1]) || throw(DimensionMismatch("The dimension at site 1 of the Tensor Train is different from the one of the weight vector ($(size(tt[1])[2]) and $(length(w[1])))."))
+    v = transpose(sum(tt[1].*w[1]', dims=(1, 2))[1, 1, :])
+    for i in 2:length(tt)
+        size(tt[i])[2] == length(w[i]) || throw(DimensionMismatch("The dimension at site $(i) of the Tensor Train is different from the one of the weight vector ($(size(tt[i])[2]) and $(length(w[i])))."))
+        v *= sum(tt[i].*w[i]', dims=2)[:, 1, :]
+    end
+    return only(v)
+end
+
+
 function _addtttensor(
     A::Array{V}, B::Array{V};
     factorA=one(V), factorB=one(V),
@@ -215,7 +267,7 @@ See also: [`+`](@ref)
 function add(
     lhs::AbstractTensorTrain{V}, rhs::AbstractTensorTrain{V};
     factorlhs=one(V), factorrhs=one(V),
-    tolerance::Float64=0.0, maxbonddim::Int=typemax(Int)
+    tolerance::Float64=0.0, maxbonddim::Int=typemax(Int), normalizeerror::Bool=true
 ) where {V}
     if length(lhs) != length(rhs)
         throw(DimensionMismatch("Two tensor trains with different length ($(length(lhs)) and $(length(rhs))) cannot be added elementwise."))
@@ -233,7 +285,7 @@ function add(
             for ell in 1:L
         ]
     )
-    compress!(tt, :SVD; tolerance, maxbonddim)
+    compress!(tt, :SVD; tolerance, maxbonddim, normalizeerror)
     return tt
 end
 
@@ -247,9 +299,9 @@ Subtract two tensor trains `lhs` and `rhs`. See [`add`](@ref).
 """
 function subtract(
     lhs::AbstractTensorTrain{V}, rhs::AbstractTensorTrain{V};
-    tolerance::Float64=0.0, maxbonddim::Int=typemax(Int)
+    tolerance::Float64=0.0, maxbonddim::Int=typemax(Int), normalizeerror::Bool=true
 ) where {V}
-    return add(lhs, rhs; factorrhs=-1 * one(V), tolerance, maxbonddim)
+    return add(lhs, rhs; factorrhs=-1 * one(V), tolerance, maxbonddim, normalizeerror)
 end
 
 @doc raw"""
@@ -270,6 +322,174 @@ function Base.:-(lhs::AbstractTensorTrain{V}, rhs::AbstractTensorTrain{V}) where
     return subtract(lhs, rhs)
 end
 
+function leftcanonicalize!(tt::AbstractTensorTrain{ValueType}) where {ValueType}
+    n = length(tt)  # Number of sites
+    for i in 1:n-1
+        Q, R = qr(reshape(tt[i], prod(size(tt[i])[1:end-1]), size(tt[i])[end]))
+        Q = Matrix(Q)
+
+        tt[i] = reshape(Q, size(tt[i])[1:end-1]..., size(Q, 2))  # New bond dimension after Q
+
+        tmptt = reshape(tt[i+1], size(R, 2), :)  # Reshape next tensor
+        tmptt .= Matrix(R) * tmptt
+        tt[i+1] = reshape(tmptt, size(tt[i+1])...)  # Reshape back
+    end
+end
+
+# This creates a TensorTrain which has every site right-canonical except the last
+function rightcanonicalize!(tt::AbstractTensorTrain{ValueType}) where {ValueType}
+    n = length(tt)  # Number of sites
+    for i in n:-1:2
+        # Reshape W_i into a matrix (merging right bond and physical indices)
+        W = tt[i]
+        χl, d1, d2, χr = size(W)
+        W_mat = reshape(W, χl, d1*d2*χr)
+
+        # Perform RQ decottsition: W_mat = R * Q
+        F = lq(reverse(W_mat, dims=1))
+        R, Q = reverse(F.L), reverse(Matrix(F.Q), dims=1) # https://discourse.julialang.org/t/rq-decomposition/112795/13
+
+        # Reshape Q back into the MPO tensor
+        tt[i] = reshape(Q, size(Q, 1), d1, d2, χr)  # New bond dimension after Q
+
+        # Update the previous tt tensor by absorbing R
+        tmptt = reshape(tt[i-1], :, size(R, 1))  # Reshape previous tensor
+        tmptt .= tmptt * Matrix(R)
+        tt[i-1] = reshape(tmptt, size(tt[i-1], 1), d1, d2, size(tt[i-1], 4))  # Reshape back
+    end
+end
+
+
+function centercanonicalize!(tt::Vector{Array{ValueType, N}}, center::Int; old_center::Int=0) where {ValueType, N}
+    orthogonality = checkorthogonality(tt)
+    n = length(tt)  # Number of sites
+    
+    if count(==( :N ), orthogonality) == 1
+        old_center_ = findfirst(==( :N ), orthogonality)
+        if old_center_ == nothing # Useless, but help JET compiling
+            old_center_ = old_center 
+        end
+        # println("Sto canonicalizzando centrando in $center. ho trovato il centro in $old_center_. Quindi flipperò: $(center < old_center_ ? [size(tt[i]) for i in center:old_center_] : [size(tt[i]) for i in old_center_:center])")
+        if old_center != 0 && old_center != old_center_
+            println("Warning! In centercanonicalize!() old_center has been set as $old_center, but the real old center is $old_center_")
+        end
+    elseif old_center == 0
+        old_center_ = 1
+    else
+        old_center_ = old_center
+    end
+    # LEFT
+    for i in old_center_:center-1
+        Q, R = qr(reshape(tt[i], prod(size(tt[i])[1:end-1]), size(tt[i])[end]))
+        Q = Matrix(Q)
+
+        tt[i] = reshape(Q, size(tt[i])[1:end-1]..., size(Q, 2))  # New bond dimension after Q
+
+        tmptt = reshape(tt[i+1], size(R, 2), :)  # Reshape next tensor
+        tmptt = Matrix(R) * tmptt
+        tt[i+1] = reshape(tmptt, size(tt[i+1])...)  # Reshape back
+    end
+    # RIGHT
+    if count(==( :N ), orthogonality) == 1
+        old_center_ = findfirst(==( :N ), orthogonality)
+        if old_center_ == nothing # Useless, but help JET compiling
+            old_center_ = old_center 
+        end
+        if old_center != 0 && old_center != old_center_
+            println("Warning! In centercanonicalize!() old_center has been set as $old_center, but the real old center is $old_center_")
+        end
+    elseif old_center == 0
+        old_center_ = n
+    else
+        old_center_ = old_center
+    end
+    for i in old_center_:-1:center+1
+        W = tt[i]
+        χl, d1, d2, χr = size(W)
+        W_mat = reshape(W, χl, d1*d2*χr)
+
+        L, Q = lq(W_mat)
+        Q = Matrix(Q)
+        # Reshape Q back into the tt tensor
+        tt[i] = reshape(Q, size(Q, 1), d1, d2, χr)  # New bond dimension after Q
+
+        # Update the previous tt tensor by absorbing L
+        tmptt = reshape(tt[i-1], :, size(L, 1))  # Reshape previous tensor
+        tmptt = tmptt * Matrix(L)
+        tt[i-1] = reshape(tmptt, size(tt[i-1], 1), d1, d2, size(tmptt, 2))  # Reshape back
+    end
+end
+
+function move_center_right!(tt, i)
+    A = tt[i]
+    d = size(A)
+    A_mat = reshape(A, prod(d[1:end-1]), d[end])
+    Q, R = qr(A_mat)
+    Q = Matrix(Q)
+    tt[i] = reshape(Q, d[1:end-1]..., size(Q, 2))
+
+    B = tt[i+1]
+    B_mat = reshape(B, size(R, 2), :)
+    B_mat .= Matrix(R) * B_mat
+    tt[i+1] = reshape(B_mat, size(B)...)
+end
+
+function move_center_left!(tt, i)
+    A = tt[i]
+    d = size(A)
+    A_mat = reshape(A, d[1], prod(d[2:end]))
+    L, Q = lq(A_mat)
+    Q = Matrix(Q)
+    tt[i] = reshape(Q, size(Q,1), d[2:end]...)
+
+    B = tt[i-1]
+    B_mat = reshape(B, :, size(L,1))
+    B_mat .= B_mat * Matrix(L)
+    tt[i-1] = reshape(B_mat, size(B)[1:3]..., size(L,1))
+end
+
+
+function leftcanonicalize(tt::AbstractTensorTrain{ValueType}) where {ValueType}
+    tt_ = deepcopy(tt)
+    leftcanonicalize!(tt_)
+    return tt_
+end
+
+# This creates a TensorTrain which has every site right-canonical except the last
+function rightcanonicalize(tt::AbstractTensorTrain{ValueType}) where {ValueType}
+    tt_ = deepcopy(tt)
+    rightcanonicalize!(tt_)
+    return tt_
+end
+
+# This creates a TensorTrain which has every site right-canonical except the last
+function centercanonicalize(tt::Vector{Array{ValueType, N}}, center::Int; old_center::Int=0) where {ValueType, N}
+    tt_ = deepcopy(tt)
+    centercanonicalize!(tt_, center; old_center)
+    return tt_
+end
+
+function checkorthogonality(tt::Vector{Array{ValueType, N}}) where {ValueType, N}
+    ort = Vector{Symbol}(undef, length(tt))
+    for i in 1:length(tt)
+        W = tt[i]
+        left_check = _contract(permutedims(W, (4,2,3,1,)), W, (2,3,4,),(2,3,1))
+        right_check = _contract(W, permutedims(W, (4,2,3,1,)), (2,3,4,),(2,3,1))
+        is_left = isapprox(left_check, I, atol=1e-7)
+        is_right = isapprox(right_check, I, atol=1e-7)
+        ort[i] = if is_left && is_right
+            :O # Orthogonal
+        elseif is_left
+            :L # Left orthogonal
+        elseif is_right
+            :R # Right orthogonal
+        else
+            :N # Non orthogonal
+        end
+    end
+    return ort
+end
+
 """
 Squared Frobenius norm of a tensor train.
 """
diff --git a/src/contraction.jl b/src/contraction.jl
index af847e6..b3ebd74 100644
--- a/src/contraction.jl
+++ b/src/contraction.jl
@@ -108,6 +108,40 @@ function _extend_cache(oldcache::Matrix{T}, a_ell::Array{T,4}, b_ell::Array{T,4}
     return _contract(tmp1, b_ell[:, :, j, :], (1, 2), (1, 2))
 end
 
+# Compute full left environment
+function leftenvironment(
+    A::Vector{Array{T,4}},
+    B::Vector{Array{T,4}},
+    X::Vector{Array{T,4}},
+    i::Int;
+    # L::Array{T, 3} = ones(T, 1, 1, 1),
+    L::Union{Nothing, Array{T, 3}} = nothing,
+    Li::Int = 1
+)::Array{T, 3} where {T}
+    L === nothing && (L = ones(T, 1, 1, 1))
+    for ell in Li:i
+        L = _contract(_contract(_contract(L, A[ell], (1,), (1,)), B[ell], (1,4,), (1,2,)), conj(X[ell]), (1,2,4,), (1,2,3,))
+    end
+    return L
+end
+
+# Compute full right environment
+function rightenvironment(
+    A::Vector{Array{T,4}},
+    B::Vector{Array{T,4}},
+    X::Vector{Array{T,4}},
+    i::Int;
+    # R::Array{T, 3} = ones(T, 1, 1, 1),
+    R::Union{Nothing, Array{T, 3}} = nothing,
+    Ri::Int = length(A)
+    )::Array{T, 3} where {T}
+    R === nothing && (R = ones(T, 1, 1, 1))
+    for ell in Ri:-1:i
+        R = permutedims(_contract(conj(X[ell]), _contract(B[ell], _contract(A[ell], R, (4,), (1,)), (2,4,), (3,4,)), (2,3,4,), (4,2,5,)), (3,2,1,))
+    end
+    return R
+end
+
 # Compute left environment
 function evaluateleft(
     obj::Contraction{T},
@@ -443,8 +477,9 @@ function contract_zipup(
     A::TensorTrain{ValueType,4},
     B::TensorTrain{ValueType,4};
     tolerance::Float64=1e-12,
-    method::Symbol=:SVD, # :SVD, :LU
-    maxbonddim::Int=typemax(Int)
+    method::Symbol=:SVD, # :SVD, :RSVD, :LU, :CI
+    maxbonddim::Int=typemax(Int),
+    kwargs...
 ) where {ValueType}
     if length(A) != length(B)
         throw(ArgumentError("Cannot contract tensor trains with different length."))
@@ -455,13 +490,16 @@ function contract_zipup(
     for n in 1:length(A)
         # R:     (link_ab, link_an, link_bn)
         # A[n]:  (link_an, s_n, s_n', link_anp1)
+        #println("Time RA:")
         RA = _contract(R, A[n], (2,), (1,))
 
         # RA[n]: (link_ab, link_bn, s_n, s_n' link_anp1)
         # B[n]:  (link_bn, s_n', s_n'', link_bnp1)
         # C:     (link_ab, s_n, link_anp1, s_n'', link_bnp1)
         #  =>    (link_ab, s_n, s_n'', link_anp1, link_bnp1)
+        #println("Time C:")
         C = permutedims(_contract(RA, B[n], (2, 4), (1, 2)), (1, 2, 4, 3, 5))
+        #println(size(C))
         if n == length(A)
             sitetensors[n] = reshape(C, size(C)[1:3]..., 1)
             break
@@ -485,6 +523,644 @@ function contract_zipup(
     return TensorTrain{ValueType,4}(sitetensors)
 end
 
+function contract_distr_zipup(
+    A::TensorTrain{ValueType,4},
+    B::TensorTrain{ValueType,4};
+    tolerance::Float64=1e-12,
+    method::Symbol=:SVD,
+    p::Int=16,
+    maxbonddim::Int=typemax(Int),
+    estimatedbond::Union{Nothing,Int}=nothing,
+    subcomm::Union{Nothing, MPI.Comm}=nothing,
+    kwargs...
+) where {ValueType}
+    if length(A) != length(B)
+        throw(ArgumentError("Cannot contract tensor trains with different length."))
+    end
+
+    R::Array{ValueType,3} = ones(ValueType, 1, 1, 1)
+
+    if MPI.Initialized()
+        if subcomm != nothing
+            comm = subcomm
+        else
+            comm = MPI.COMM_WORLD
+        end
+        mpirank = MPI.Comm_rank(comm)
+        juliarank = mpirank + 1
+	    nprocs = MPI.Comm_size(comm)
+        if estimatedbond == nothing
+            estimatedbond = maxbonddim == typemax(Int) ? max(maximum(linkdims(A)), maximum(linkdims(B))) : maxbonddim
+        end
+        noderanges = _noderanges(nprocs, length(A), size(A[1])[2]*size(B[1])[3], estimatedbond; interbond=true, algorithm=:mpompo)
+        Us = Vector{Array{ValueType,3}}(undef, length(noderanges[juliarank]))
+        Vts = Vector{Array{ValueType,3}}(undef, length(noderanges[juliarank]))
+    else
+        println("Warning! Distributed zipup has been chosen, but MPI is not initialized, please use initializempi() before using contract() and use finalizempi() afterwards")
+        return contract_zipup(A, B; tolerance, method, maxbonddim, kwargs...)
+    end
+
+    if nprocs == 1
+        println("Warning! Distributed zipup has been chosen, but only one process is running")
+        return contract_zipup(A, B; tolerance, method, maxbonddim, kwargs...)
+    end
+    
+    finalsitetensors =  Vector{Array{ValueType,4}}(undef, length(A))
+
+    time = time_ns()
+    if method == :SVD || method == :RSVD
+        if maxbonddim == typemax(Int)
+            println("Warning! distrzipup uses always Randomized SVD, which cannot accept maxbonddim=typemax(Int), it will be set to the maximum bond of A or B")
+            maxbonddim = max(maximum(linkdims(A)), maximum(linkdims(B)))
+        end
+        for (i, n) in enumerate(noderanges[juliarank])
+            # Random matrix
+            G = randn(size(A[n])[end], size(B[n])[end], maxbonddim+p)
+            # Projection on smaller space
+            Y = _contract(A[n], G, (4,), (1,))
+            Y = _contract(B[n], Y, (2,4,), (3,4))
+            Y = permutedims(Y, (3,1,4,2,5,))
+
+            # QR decomposition
+            Q = Matrix(LinearAlgebra.qr!(reshape(Y, (prod(size(Y)[1:4]), size(Y)[end]))).Q)
+            Q = reshape(Q, (size(Y)[1:4]..., min(prod(size(Y)[1:4]), size(Y)[end])))
+            Qt = permutedims(Q, (5, 3, 4, 1, 2))
+            # Smaller object to SVD
+            to_svd = _contract(Qt, A[n], (2,4,), (2,1,))
+            to_svd = _contract(to_svd, B[n], (2,3,4,), (3,1,2,))
+            factorization = svd(reshape(to_svd, (size(to_svd)[1], prod(size(to_svd)[2:3]))))
+            newbonddimr = min(
+                replacenothing(findlast(>(tolerance), Array(factorization.S)), 1),
+                maxbonddim
+            )
+
+            U = _contract(Q, factorization.U[:, 1:newbonddimr], (5,), (1,))
+            US = _contract(U, Diagonal(factorization.S[1:newbonddimr]), (5,), (1,))
+            U, Vt, newbonddiml = _factorize(
+                reshape(US, prod(size(US)[1:2]), prod(size(US)[3:5])),
+                method; tolerance, maxbonddim, leftorthogonal=true
+            )
+            U = reshape(U, (size(A[n])[1], size(B[n])[1], newbonddiml))
+            finalsitetensors[n] = reshape(Vt, newbonddiml, size(US)[3:5]...)
+
+            Us[i] = U
+            Vts[i] = reshape(factorization.Vt[1:newbonddimr, :], newbonddimr, size(to_svd)[end-1], size(to_svd)[end])
+        end
+    elseif method == :CI || method == :LU
+        for (i, n) in enumerate(noderanges[juliarank])
+            dimsA = size(A[n])
+            dimsB = size(B[n])
+            function f24(j,k; print=false)
+                b1, a1 = Tuple(CartesianIndices((dimsB[1], dimsA[1]))[j])  # Extract (a1, b1) from j
+                b4, a4, b3, a2 = Tuple(CartesianIndices((dimsB[4], dimsA[4], dimsB[3], dimsA[2]))[k])  # Extract (a2, b3, a4, b4) from k
+                sum(A[n][a1, a2, c, a4] * B[n][b1, c, b3, b4] for c in 1:dimsA[3])  # Summing over the contracted index
+            end
+            
+            # Non Zero element to start PRRLU decomposition
+            nz = nothing
+            size_A = size(A[n])  # (i, j, k, l)
+            size_B = size(B[n])  # (m, k, nn, o)
+            
+            for i in 1:size_A[1], m in 1:size_B[1], j in 1:size_A[2], nn in 1:size_B[3], l in 1:size_A[4], o in 1:size_B[4]
+                sum_val = zero(eltype(A[n]))
+                for k in 1:size_A[3]  # k index contraction
+                    if A[n][i, j, k, l] != 0 && B[n][m, k, nn, o] != 0
+                        sum_val += A[n][i, j, k, l] * B[n][m, k, nn, o]
+                    end
+                end
+                if sum_val != 0
+                    nz = [i, m, j, nn, l, o]
+                    break
+                end
+            end
+            
+            # CartesianIndices are read the other way around
+            I0 = [(nz[1]-1)*dimsB[1] + nz[2], (nz[3]-1)*dimsB[3]*dimsA[4]*dimsB[4] + (nz[4]-1)*dimsA[4]*dimsB[4] + (nz[5]-1)*dimsB[4] + nz[6]]
+            J0 = [(nz[1]-1)*dimsB[1] + nz[2], (nz[3]-1)*dimsB[3]*dimsA[4]*dimsB[4] + (nz[4]-1)*dimsA[4]*dimsB[4] + (nz[5]-1)*dimsB[4] + nz[6]]
+            factorization = arrlu(ValueType, f24, (dimsA[1]*dimsB[1], dimsA[2]*dimsB[3]*dimsA[4]*dimsB[4]), I0, J0; abstol=tolerance, maxrank=maxbonddim)
+            L = left(factorization)
+            U = right(factorization)
+            newbonddiml = npivots(factorization)
+            
+            U_3_2 = reshape(U, size(U)[1]*dimsA[2]*dimsB[3], dimsA[4]*dimsB[4])
+            U, Vt, newbonddimr = _factorize(U_3_2, :SVD; tolerance, maxbonddim)
+            Us[i] = reshape(L, dimsA[1], dimsB[1], newbonddiml)
+            finalsitetensors[n] = reshape(U, newbonddiml, dimsA[2], dimsB[3], newbonddimr)
+            Vts[i] = reshape(Vt, newbonddimr, dimsA[4], dimsB[4])
+        end
+    end
+    
+    # Exchange Vt to left
+    if MPI.Initialized()
+        MPI.Barrier(comm)
+        reqs = MPI.Request[]
+        if juliarank < nprocs
+            push!(reqs, MPI.isend(Vts[end], comm; dest=mpirank+1, tag=mpirank))
+        end
+        if juliarank > 1
+            Vts_l = MPI.recv(comm; source=mpirank-1, tag=mpirank-1)
+        end
+        MPI.Waitall(reqs)
+        MPI.Barrier(comm)
+    end
+
+    # contraction
+    time = time_ns()
+    for (i, n) in enumerate(noderanges[juliarank])
+        if i > 1
+            VtU = _contract(Vts[i-1], Us[i], (2,3,), (1,2,))
+            finalsitetensors[n] = _contract(VtU, finalsitetensors[n], (2,), (1,))
+        elseif n > 1
+            VtU = _contract(Vts_l, Us[i], (2,3,), (1,2,))
+            finalsitetensors[n] = _contract(VtU, finalsitetensors[n], (2,), (1,))
+        end
+    end
+    if juliarank == 1
+        finalsitetensors[1] = _contract(_contract(R, Us[1], (2,3,), (1,2,)), finalsitetensors[1], (2,), (1,))
+    end
+    if juliarank == nprocs
+        finalsitetensors[end] = _contract(finalsitetensors[end], _contract(Vts[end], R, (2,3,), (1,2,)), (4,), (1,))
+    end
+
+    # All to all exchange
+    if MPI.Initialized()
+        all_sizes = [length(noderanges[r]) for r in 1:nprocs]
+        shapes = [isassigned(finalsitetensors, i) ? size(finalsitetensors[i]) : (0,0,0,0) for i in 1:length(finalsitetensors)]
+        shapesbuffer = VBuffer(shapes, all_sizes)
+        MPI.Allgatherv!(shapesbuffer, comm)
+
+        lengths = [sum(prod.(shapes)[noderanges[r]]) for r in 1:nprocs]
+        all_length = sum(prod.(shapes))
+        vec_tensors = zeros(all_length)
+
+        if juliarank == 1
+            before_me = 0
+        else
+            before_me = sum(prod.(shapes[1:noderanges[juliarank][1]-1]))
+        end
+        me = sum(prod.(shapes[noderanges[juliarank]]))
+        vec_tensors[before_me+1:before_me+me] = vcat([vec(finalsitetensors[i]) for i in 1:length(finalsitetensors) if isassigned(finalsitetensors, i)]...)
+
+        sendrecvbuf = VBuffer(vec_tensors, lengths)
+        MPI.Allgatherv!(sendrecvbuf, comm)
+
+        idx = 1
+        for i in 1:length(finalsitetensors)
+            s = shapes[i]
+            len = prod(s)
+            finalsitetensors[i] = reshape(view(vec_tensors, idx:idx+len-1), s)
+            idx += len
+        end
+    end
+    MPI.Barrier(comm)
+
+    return TensorTrain{ValueType,4}(finalsitetensors)
+end
+
+# If Ri = i, then R is the right environment until site i+1, which has size (size(A[i])[end], size(B[i])[end], size(X[i]])[end])
+# If Li = i, then L is the left environment until site i-1, which has size (size(A[i])[1], size(B[i])[1], size(X[i]])[1])
+function updatecore!(A::Vector{Array{T,4}}, B::Vector{Array{T,4}}, X::Vector{Array{T,4}}, i::Int;
+    method::Symbol=:SVD, tolerance::Float64=1e-8, maxbonddim::Int=typemax(Int), direction::Symbol=:forward,
+    R::Union{Nothing, Array{T,3}}=nothing, Ri::Int=length(A),
+    L::Union{Nothing, Array{T,3}}=nothing, Li::Int=1
+    )::Tuple{Float64, Array{T,3}, Array{T,3}} where {T}
+    L = leftenvironment(A, B, X, i-1; L, Li)
+    R = rightenvironment(A, B, X, i+2; R, Ri)
+
+    Le = _contract(_contract(L, A[i], (1,), (1,)), B[i], (1, 4), (1, 2))
+    Re = _contract(B[i+1], _contract(A[i+1], R, (4,), (1,)), (2, 4), (3, 4))
+    Ce = _contract(Le, Re, (3, 5), (3, 1))
+    Ce = permutedims(Ce, (1, 2, 3, 5, 4, 6))
+
+    left, right, newbonddim = _factorize(
+        reshape(Ce, prod(size(Ce)[1:3]), prod(size(Ce)[4:6])),
+        method; tolerance, maxbonddim, leftorthogonal=(direction == :forward ? true : false)
+    )
+
+    X_i = reshape(left, :, size(X[i])[2:3]..., newbonddim)
+    X_i1 = reshape(right, newbonddim, size(X[i+1])[2:3]..., :)
+
+    if size(X[i]) != size(X_i) || size(X[i+1]) != size(X_i1)
+       diff = Inf
+    else
+        diff = maximum(abs, X[i] .- X_i)
+        diff = max(diff, maximum(abs, X[i+1] .- X_i1))
+    end
+    X[i] = X_i
+    X[i+1] = X_i1
+    return diff, L, R
+end
+
+"""
+    function contract_fit(A::TensorTrain{ValueType,4}, B::TensorTrain{ValueType,4})
+
+Conctractos tensor trains A and B using the fit algorithm.
+
+# Keyword Arguments
+
+- `nsweep::Int`: Number of sweeps to perform during the algorithm.
+- `initial_guess`: Optional initial guess for the tensor train A*B. If not provided, a tensor train of rank one is used. This must have coherent bond dimension (i.e. start and finish with less than [d,d^2,d^3,...,d^3,d^2,d]).
+- `tolerance::Float64`: Convergence tolerance for the iterative algorithm.
+- `method::Symbol`: Algorithm or method to use for the computation :SVD, :RSVD, :LU, :CI.
+- `maxbonddim::Int`: Maximum bond dimension allowed during the decomposition.
+"""
+function contract_fit(
+    mpoA::TensorTrain{ValueType,4},
+    mpoB::TensorTrain{ValueType,4};
+    nsweeps::Int=2,
+    initial_guess::Union{Nothing,TensorTrain{ValueType,4}}=nothing,
+    tolerance::Float64=1e-12,
+    method::Symbol=:SVD, # :SVD, :RSVD, :LU, :CI
+    maxbonddim::Int=typemax(Int),
+    kwargs...
+) where {ValueType}
+    if length(mpoA) != length(mpoB)
+        throw(ArgumentError("Cannot contract tensor trains with different length."))
+    end
+
+    n = length(mpoA)
+
+    A = Vector{Array{ValueType, 4}}(undef, n)
+    B = Vector{Array{ValueType, 4}}(undef, n)
+    X = Vector{Array{ValueType, 4}}(undef, n)
+    for i in 1:n
+        A[i] = deepcopy(mpoA.sitetensors[i])
+        B[i] = deepcopy(mpoB.sitetensors[i])
+        if !isnothing(initial_guess)
+            X[i] = deepcopy(initial_guess.sitetensors[i])
+        else
+            X[i] = ones(ValueType, 1, size(A[i])[2], size(B[i])[3], 1)
+        end
+    end
+    
+    Rs = Vector{Array{ValueType, 3}}(undef, n)
+    Ls = Vector{Array{ValueType, 3}}(undef, n)
+    
+    centercanonicalize!(A, 1)
+    centercanonicalize!(B, 1)
+    centercanonicalize!(X, 1)
+    # Precompute right environments
+    for i in n:-1:3
+        if i == n
+            Rs[i] = rightenvironment(A, B, X, i)
+        else
+            Rs[i] = rightenvironment(A, B, X, i; R=Rs[i+1], Ri=i)
+        end
+    end
+    
+    # It doesn't matter if we repeat update in 1 or n-1, those are negligible
+    direction = :forward
+    for sweep in 1:nsweeps
+        max_diff = 0.0
+        # Update cores and store Left or Right environment
+        if direction == :forward
+            for i in 1:n-1
+                centercanonicalize!(A, i)
+                centercanonicalize!(B, i)
+                centercanonicalize!(X, i)
+                if i == 1
+                    diff, _, _ = updatecore!(A, B, X, i; method, tolerance, maxbonddim, direction, R=Rs[i+2], Ri=i+1)
+                elseif i == 2
+                    diff, Ls[i-1], _ = updatecore!(A, B, X, i; method, tolerance, maxbonddim, direction, R=Rs[i+2], Ri=i+1)
+                elseif i < n-1
+                    diff, Ls[i-1], _ = updatecore!(A, B, X, i; method, tolerance, maxbonddim, direction, L=Ls[i-2], Li=i-1, R=Rs[i+2], Ri=i+1)
+                else # i == n-1
+                    diff, Ls[i-1], _ = updatecore!(A, B, X, i; method, tolerance, maxbonddim, direction, L=Ls[i-2], Li=i-1)
+                end
+                max_diff = max(max_diff, diff)
+            end
+            direction = :backward
+        elseif direction == :backward
+            for i in n-1:-1:1
+                centercanonicalize!(A, i)
+                centercanonicalize!(B, i)
+                centercanonicalize!(X, i)
+                if i == n-1
+                    diff, _, _ = updatecore!(A, B, X, i; method, tolerance, maxbonddim, direction, L=Ls[i-1], Li=i)
+                elseif i == n-2
+                    diff, _, Rs[i+2] = updatecore!(A, B, X, i; method, tolerance, maxbonddim, direction, L=Ls[i-1], Li=i)
+                elseif i > 1
+                    diff, _, Rs[i+2] = updatecore!(A, B, X, i; method, tolerance, maxbonddim, direction, L=Ls[i-1], Li=i, R=Rs[i+3], Ri=i+2)
+                else # i == 1
+                    diff, _, Rs[i+2] = updatecore!(A, B, X, i; method, tolerance, maxbonddim, direction, R=Rs[i+3], Ri=i+2)
+                end
+                max_diff = max(max_diff, diff)
+            end
+            direction = :forward
+        end
+        if max_diff < tolerance
+            break
+        end
+    end
+    return TensorTrain{ValueType,4}(X)
+end
+
+"""
+    function contract_distr_fit(A::TensorTrain{ValueType,4}, B::TensorTrain{ValueType,4})
+
+Conctractos tensor trains A and B using the fit algorithm.
+
+# Keyword Arguments
+
+- `nsweep::Int`: Number of sweeps to perform during the algorithm.
+- `initial_guess`: Initial guess of the solution A*B.
+- `subcomm::Union{Nothing, MPI.Comm}`: Subcommunicator to use for the distributed computation. If nothing, the global communicator is used.
+- `tolerance::Float64`: Convergence tolerance for the iterative algorithm.
+- `method::Symbol`: Algorithm or method to use for the computation :SVD, :RSVD, :LU, :CI.
+- `maxbonddim::Int`: Maximum bond dimension allowed during the decomposition.
+- `synchedmpo::Bool`: Tells whether the input MPOs are synchronized. Use "true" if you know that the input MPOs are exactly the same across different nodes.
+"""
+function contract_distr_fit(
+    mpoA::TensorTrain{ValueType,4},
+    mpoB::TensorTrain{ValueType,4};
+    nsweeps::Int=2,
+    initial_guess::Union{Nothing,TensorTrain{ValueType,4}}=nothing,
+    subcomm::Union{Nothing, MPI.Comm}=nothing,
+    noderanges::Union{Nothing,Vector{UnitRange{Int}}}=nothing,
+    tolerance::Float64=1e-12,
+    method::Symbol=:SVD, # :SVD, :RSVD, :LU, :CI
+    maxbonddim::Int=typemax(Int),
+    synchedmpo::Bool=false,
+    kwargs...
+) where {ValueType}
+    if length(mpoA) != length(mpoB)
+        throw(ArgumentError("Cannot contract tensor trains with different length."))
+    end
+
+    if !synchedmpo
+        synchronize_tt!(mpoA)
+        synchronize_tt!(mpoB)
+        synchronize_tt!(initial_guess)
+    end
+
+    n = length(mpoA)
+
+    A = Vector{Array{ValueType, 4}}(undef, n)
+    B = Vector{Array{ValueType, 4}}(undef, n)
+    X = Vector{Array{ValueType, 4}}(undef, n)
+    for i in 1:n
+        A[i] = deepcopy(mpoA.sitetensors[i])
+        B[i] = deepcopy(mpoB.sitetensors[i])
+        if !isnothing(initial_guess)
+            X[i] = deepcopy(initial_guess.sitetensors[i])
+        else
+            X[i] = ones(ValueType, 1, size(A[i])[2], size(B[i])[3], 1)
+        end
+    end
+
+    if MPI.Initialized()
+        if subcomm != nothing
+            comm = subcomm
+        else
+            comm = MPI.COMM_WORLD
+        end
+        mpirank = MPI.Comm_rank(comm)
+        juliarank = mpirank + 1
+        nprocs = MPI.Comm_size(comm)
+        if noderanges == nothing
+            if n < 4
+                println("Warning! The TT is too small to be parallelized.")
+                return contract_fit(mpoA, mpoB; nsweeps, initial_guess, tolerance, method, maxbonddim, kwargs...)
+            end
+            if n < 6
+                if nprocs > 2
+                    println("Warning! The TT is too small to be parallelized with more than 2 nodes. Some nodes will not be used.")
+                end
+                nprocs = 2
+                noderanges = [1:2,3:n]
+            elseif n == 6
+                if nprocs == 2
+                    noderanges = [1:3,4:6]
+                elseif nprocs == 3
+                    noderanges = [1:2,3:4,5:6]
+                else
+                    println("Warning! The TT is too small to be parallelized with more than 3 nodes. Some nodes will not be used.")
+                    nprocs = 3
+                    noderanges = [1:2,3:4,5:6]
+                end
+            else 
+                extra1 = 3 # "Hyperparameter"
+                extraend = 3
+                if nprocs >= div(n - extra1 - extraend, 2) # It's just one update per node.
+                    println("Warning! A TT of lenght L can use be parallelized with up to (L-$(extra1 + extraend))/2 nodes. Some nodes will not be used.")
+                    # Each node has 2 cores. Except first and last who have 2+extra1 and 2+extraend+n%2
+                    if n % 2 == 0
+                        noderanges = vcat([1:extra1+1],[extra1+i+1:extra1+i+2 for i in 1:2:n-extra1-extraend-2],[n-extraend:n])
+                    else
+                        noderanges = vcat([1:extra1+1],[extra1+i+1:extra1+i+2 for i in 1:2:n-extra1-extraend-3],[n-1-extraend:n])
+                    end
+                    for _ in div(n - extra1 - extraend, 2)+1:nprocs
+                        push!(noderanges, 1:-1)
+                    end
+                    nprocs = div(n - extra1 - extraend, 2)
+                else
+                    number_of_sites, rem = divrem(n-extra1-extraend, nprocs)
+                    sites = [number_of_sites for i in 1:nprocs]
+                    sites[1] += extra1
+                    sites[end] += extraend
+                    # Distribute the remainder as evenly as possible
+                    for i in 1:rem
+                        if i % 2 == 1
+                            sites[div(i,2)+2] += 1
+                        else
+                            sites[end-div(i,2)] += 1
+                        end
+                    end
+                    noderanges = [sum(sites[1:i-1])+1:sum(sites[1:i]) for i in 1:nprocs]
+                end
+            end
+        end
+    else
+        println("Warning! Distributed strategy has been chosen, but MPI is not initialized, please use TCI.initializempi() before contract() and use TCI.finalizempi() afterwards")
+        return contract_fit(mpoA, mpoB; nsweeps, initial_guess, tolerance, method, maxbonddim, kwargs...)
+    end
+
+    if nprocs == 1
+        println("Warning! Distributed strategy has been chosen, but only one process is running")
+        return contract_fit(mpoA, mpoB; nsweeps, initial_guess, tolerance, method, maxbonddim, kwargs...)
+    end
+
+    if !isnothing(initial_guess)
+        X = deepcopy(initial_guess.sitetensors)
+    else
+        X = [ones(ValueType, 1, size(A[i])[2], size(B[i])[3], 1) for i in 1:n]
+    end
+
+    Ls = Vector{Array{ValueType, 3}}(undef, n)
+    Rs = Vector{Array{ValueType, 3}}(undef, n)
+
+    first = noderanges[juliarank][1]
+    last = noderanges[juliarank][end]
+
+    #Ls are left enviroments and Rs are right environments
+    if first != 1
+        Ls[first-1] = ones(ValueType, size(A[first])[1], size(B[first])[1], size(X[first])[1])
+    end
+    if last != n
+        Rs[last+1] = ones(ValueType, size(A[last])[end], size(B[last])[end], size(X[last])[end])
+    end
+
+    
+    if juliarank % 2 == 1 # Precompute right environment if going forward
+        centercanonicalize!(A, first)
+        centercanonicalize!(B, first)
+        centercanonicalize!(X, first)
+        for i in last:-1:first+2
+            if i == n
+                Rs[i] = rightenvironment(A, B, X, i)
+            else
+                Rs[i] = rightenvironment(A, B, X, i; R=Rs[i+1], Ri=i)
+            end
+        end
+    else # Precompute left environment if going backward
+        centercanonicalize!(A, last-1)
+        centercanonicalize!(B, last-1)
+        centercanonicalize!(X, last-1)
+        for i in first:last-2
+            if i == 1
+                Ls[i] = leftenvironment(A, B, X, i)
+            else
+                Ls[i] = leftenvironment(A, B, X, i; L=Ls[i-1], Li=i)
+            end
+        end
+    end
+
+    for sweep in 1:nsweeps
+        max_diff = 0.0
+        if sweep % 2 == juliarank % 2
+            for i in first:last-1
+                centercanonicalize!(A, i)
+                centercanonicalize!(B, i)
+                if i == 1
+                    diff, _, _ = updatecore!(A, B, X, i; method, tolerance, maxbonddim, direction=:forward, R=Rs[i+2], Ri=i+1)
+                elseif i == 2
+                    diff, Ls[i-1], _ = updatecore!(A, B, X, i; method, tolerance, maxbonddim, direction=:forward, R=Rs[i+2], Ri=i+1)
+                elseif i == first
+                    diff, _, _ = updatecore!(A, B, X, i; method, tolerance, maxbonddim, direction=:forward, L=Ls[i-1], Li=i, R=Rs[i+2], Ri=i+1)
+                elseif i < n-1
+                    diff, Ls[i-1], _ = updatecore!(A, B, X, i; method, tolerance, maxbonddim, direction=:forward, L=Ls[i-2], Li=i-1, R=Rs[i+2], Ri=i+1)
+                else
+                    diff, Ls[i-1], _ = updatecore!(A, B, X, i; method, tolerance, maxbonddim, direction=:forward, L=Ls[i-2], Li=i-1)
+                end
+                max_diff = max(max_diff, diff)
+            end
+        else
+            for i in last-1:-1:first
+                centercanonicalize!(A, i)
+                centercanonicalize!(B, i)
+                if i == n-1
+                    diff, _, _ = updatecore!(A, B, X, i; method, tolerance, maxbonddim, direction=:backward, L=Ls[i-1], Li=i)
+                elseif i == n-2
+                    diff, _, Rs[i+2] = updatecore!(A, B, X, i; method, tolerance, maxbonddim, direction=:backward, L=Ls[i-1], Li=i)
+                elseif i == last-1
+                    diff, _, _ = updatecore!(A, B, X, i; method, tolerance, maxbonddim, direction=:backward, L=Ls[i-1], Li=i, R=Rs[i+2], Ri=i+1)
+                elseif i > 1
+                    diff, _, Rs[i+2] = updatecore!(A, B, X, i; method, tolerance, maxbonddim, direction=:backward, L=Ls[i-1], Li=i, R=Rs[i+3], Ri=i+2)
+                else
+                    diff, _, Rs[i+2] = updatecore!(A, B, X, i; method, tolerance, maxbonddim, direction=:backward, R=Rs[i+3], Ri=i+2)
+                end
+                max_diff = max(max_diff, diff)
+            end
+        end
+
+        if MPI.Initialized()
+            if juliarank % 2 == sweep % 2 # Left
+                if juliarank != nprocs
+                    centercanonicalize!(A, last)
+                    centercanonicalize!(B, last)
+
+                    Ls[last-1] = leftenvironment(A, B, X, last-1; L=Ls[last-2], Li=last-1)
+
+                    sizes = Vector{Int}(undef, 3)
+                    reqs = MPI.Isend(collect(size(Ls[last-1])), comm; dest=mpirank+1, tag=juliarank)
+                    reqr = MPI.Irecv!(sizes, comm; source=mpirank+1, tag=juliarank+1)
+
+                    MPI.Waitall([reqs, reqr])
+
+                    Rs[last+2] = ones(ValueType, sizes[1], sizes[2], sizes[3])
+                    
+                    reqs = MPI.Isend(Ls[last-1], comm; dest=mpirank+1, tag=juliarank)
+                    reqr = MPI.Irecv!(Rs[last+2], comm; source=mpirank+1, tag=juliarank+1)
+
+                    MPI.Waitall([reqs, reqr])
+                    @assert typeof(Rs[last+2]) <: Array{ValueType,3} "Expected Array{ValueType,3}, got $(typeof(Rs[last+2]))"
+                    diff, _, _ = updatecore!(A, B, X, last; method, tolerance, maxbonddim, direction=:backward, L=Ls[last-1], Li=last, R=Rs[last+2], Ri=last+1)
+                    
+                    Rs[last+1] = rightenvironment(A, B, X, last+1; R=Rs[last+2], Ri=last+1)
+                end
+            else # Right
+                if juliarank != 1
+                    centercanonicalize!(A, first)
+                    centercanonicalize!(B, first)
+
+                    Rs[first+1] = rightenvironment(A, B, X, first+1; R=Rs[first+2], Ri=first+1)
+
+                    sizes = Vector{Int}(undef, 3)
+                    reqs = MPI.Isend(collect(size(Rs[first+1])), comm; dest=mpirank-1, tag=juliarank)
+                    reqr = MPI.Irecv!(sizes, comm; source=mpirank-1, tag=juliarank-1)
+                    
+                    MPI.Waitall([reqs, reqr])
+
+                    Ls[first-2] = ones(ValueType, sizes[1], sizes[2], sizes[3])
+                    reqs = MPI.Isend(Rs[first+1], comm; dest=mpirank-1, tag=juliarank)
+                    reqr = MPI.Irecv!(Ls[first-2], comm; source=mpirank-1, tag=juliarank-1)
+                    
+                    MPI.Waitall([reqs, reqr])
+                    @assert typeof(Ls[first-2]) <: Array{ValueType,3} "Expected Array{ValueType,3}, got $(typeof(Ls[first-2]))"
+                    diff, _, _ = updatecore!(A, B, X, first-1; method, tolerance, maxbonddim, direction=:forward, L=Ls[first-2], Li=first-1, R=Rs[first+1], Ri=first) #center on first
+                    
+                    Ls[first-1] = leftenvironment(A, B, X, first-1; L=Ls[first-2], Li=first-1)
+                end
+            end
+        end
+
+        max_diff = max(max_diff, diff) # Check last update on edge
+        converged = max_diff < tolerance
+        global_converged = MPI.Allreduce(converged, MPI.LAND, comm)
+        if global_converged
+            break
+        end
+    end
+
+    # Merge togheter all the sub tensor trains
+    if juliarank == 1
+        for j in 2:nprocs
+            centercanonicalize!(A, noderanges[j-1][end])
+            centercanonicalize!(B, noderanges[j-1][end])
+            centercanonicalize!(X, noderanges[j-1][end])
+            X[noderanges[j]] = MPI.recv(comm; source=j-1, tag=2*j)
+            R = Array{ValueType, 3}(undef, 1, 1, 1)
+            if j != nprocs
+                MPI.Recv!(R, comm; source=j-1, tag=2*j+1)
+            else
+                R = ones(ValueType, 1, 1, 1)
+            end
+            @assert typeof(R) <: Array{ValueType,3} "Expected Array{ValueType,3}, got $(typeof(R))"
+
+            diff, _, _ = updatecore!(A, B, X, noderanges[j-1][end]; method, tolerance, maxbonddim, direction=:forward, R, Ri=noderanges[j][end])
+        end
+    else
+        centercanonicalize!(X, first)
+        MPI.send(X[noderanges[juliarank]], comm; dest=0, tag=2*juliarank)
+        if juliarank != nprocs
+            MPI.send(Rs[noderanges[juliarank][end]+1], comm; dest=0, tag=2*juliarank+1)
+        end
+    end
+
+    nprocs = MPI.Comm_size(comm) # In case not all processes where used to compute
+
+    # Redistribute the tensor train among the processes.
+    if juliarank == 1
+        for j in 2:nprocs
+            MPI.send(X, comm; dest=j-1, tag=1)
+        end
+    else
+        X = MPI.recv(comm; source=0, tag=1)
+    end
+
+    return TensorTrain{ValueType,4}(X)
+end
+
 """
     function contract(
         A::TensorTrain{V1,4},
@@ -493,6 +1169,7 @@ end
         tolerance::Float64=1e-12,
         maxbonddim::Int=typemax(Int),
         f::Union{Nothing,Function}=nothing,
+        subcomm::Union{Nothing, MPI.Comm}=nothing,
         kwargs...
     ) where {V1,V2}
 
@@ -501,7 +1178,11 @@ Contract two tensor trains `A` and `B`.
 Currently, two implementations are available:
  1. `algorithm=:TCI` constructs a new TCI that fits the contraction of `A` and `B`.
  2. `algorithm=:naive` uses a naive tensor contraction and subsequent SVD recompression of the tensor train.
- 2. `algorithm=:zipup` uses a naive tensor contraction with on-the-fly LU decomposition.
+ 3. `algorithm=:zipup` uses a naive tensor contraction with on-the-fly LU decomposition.
+ 4. `algorithm=:distrzipup` uses a naive tensor contraction with on-the-fly LU decomposition, distributed on multiple nodes.
+ 5. `algorithm=:fit` uses a variational approach to minimize the difference between `A`*`B` and the solution.
+ 6. `algorithm=:distrfit` uses a variational approach to minimize the difference between `A`*`B` and the solution, distributed on multiple nodes.
+
 
 Arguments:
 - `A` and `B` are the tensor trains to be contracted.
@@ -510,7 +1191,8 @@ Arguments:
 - `maxbonddim` sets the maximum bond dimension of the resulting tensor train.
 - `f` is a function to be applied elementwise to the result. This option is only available with `algorithm=:TCI`.
 - `method` chooses the method used for the factorization in the `algorithm=:zipup` case (`:SVD` or `:LU`).
-- `kwargs...` are forwarded to [`crossinterpolate2`](@ref) if `algorithm=:TCI`.
+- `subcomm` is an optional MPI communicator for distributed algorithms. If not provided, the default communicator is used.
+- `kwargs...` are forwarded to [`crossinterpolate2`](@ref) if `algorithm=:TCI` or to [`contract_fit`](@ref) if `algorithm=:fit` or `algorithm=:distrfit`.
 """
 function contract(
     A::TensorTrain{V1,4},
@@ -518,7 +1200,9 @@ function contract(
     algorithm::Symbol=:TCI,
     tolerance::Float64=1e-12,
     maxbonddim::Int=typemax(Int),
+    method::Symbol=:SVD,
     f::Union{Nothing,Function}=nothing,
+    subcomm::Union{Nothing, MPI.Comm}=nothing,
     kwargs...
 )::TensorTrain{promote_type(V1,V2),4} where {V1,V2}
     Vres = promote_type(V1, V2)
@@ -535,7 +1219,23 @@ function contract(
         if f !== nothing
             error("Zipup contraction implementation cannot contract matrix product with a function. Use algorithm=:TCI instead.")
         end
-        return contract_zipup(A_, B_; tolerance, maxbonddim)
+        return contract_zipup(A_, B_; tolerance=tolerance, maxbonddim=maxbonddim, method=method, kwargs...)
+    elseif algorithm === :distrzipup
+        if f !== nothing
+            error("Zipup contraction implementation cannot contract matrix product with a function. Use algorithm=:TCI instead.")
+        end
+        return contract_distr_zipup(A_, B_; tolerance=tolerance, maxbonddim=maxbonddim, method=method, subcomm=subcomm, kwargs...)
+    elseif algorithm === :fit
+        if f !== nothing
+            error("Fit contraction implementation cannot contract matrix product with a function. Use algorithm=:TCI instead.")
+        end
+        return contract_fit(A_, B_; tolerance=tolerance, maxbonddim=maxbonddim, method=method, kwargs...)
+    elseif algorithm === :distrfit
+        if f !== nothing
+            error("Fit contraction implementation cannot contract matrix product with a function. Use algorithm=:TCI instead.")
+        end
+        # return contract_distr_fit(A_, B_; tolerance=tolerance, maxbonddim=maxbonddim, method=method, kwargs...)
+        return contract_fit(A_, B_; tolerance=tolerance, maxbonddim=maxbonddim, method=method, kwargs...)
     else
         throw(ArgumentError("Unknown algorithm $algorithm."))
     end
@@ -558,3 +1258,12 @@ function contract(
     tt = contract(A, TensorTrain{4}(B, [(s..., 1) for s in sitedims(B)]); kwargs...)
     return TensorTrain{3}(tt, prod.(sitedims(tt)))
 end
+
+function contract(
+    A::Union{TensorCI1{V},TensorCI2{V},TensorTrain{V,3}},
+    B::Union{TensorCI1{V2},TensorCI2{V2},TensorTrain{V2,3}};
+    kwargs...
+)::promote_type(V,V2) where {V,V2}
+    tt = contract(TensorTrain{4}(A, [(1, s...) for s in sitedims(A)]), TensorTrain{4}(B, [(s..., 1) for s in sitedims(B)]); kwargs...)
+    return prod(prod.(tt.sitetensors))
+end
diff --git a/src/integration.jl b/src/integration.jl
index ff7a5f1..83ae9d1 100644
--- a/src/integration.jl
+++ b/src/integration.jl
@@ -55,4 +55,4 @@ function integrate(
     )
 
     return sum(tci2) / normalization
-end
+end
\ No newline at end of file
diff --git a/src/matrixlu.jl b/src/matrixlu.jl
index 24f6fae..ac713da 100644
--- a/src/matrixlu.jl
+++ b/src/matrixlu.jl
@@ -1,3 +1,44 @@
+function distribute(_batchf, full, fragmented, teamsize, teamrank, stripes, subcomm)
+    chunksize, remainder = divrem(length(fragmented), teamsize)
+    if stripes == :vertical
+        col_chunks = [chunksize + (i <= remainder ? 1 : 0) for i in 1:teamsize]
+        row_chunks = [length(full) for _ in 1:teamsize]
+        ranges = [(vcat([0],cumsum(col_chunks))[i] + 1):(cumsum(col_chunks)[i]) for i in 1:teamsize]
+    else # horizontal
+        row_chunks = [chunksize + (i <= remainder ? 1 : 0) for i in 1:teamsize]
+        col_chunks = [length(full) for _ in 1:teamsize]
+        ranges = [(vcat([0],cumsum(row_chunks))[i] + 1):(cumsum(row_chunks)[i]) for i in 1:teamsize]
+    end
+    fragment = fragmented[ranges[teamrank]]
+    localsubmatrix = if isempty(fragment)
+        zeros(length(full), 0)
+    else
+        if stripes == :vertical
+            _batchf(full, fragment)
+        else
+            _batchf(fragment, full)
+        end
+    end
+    if teamrank == 1
+        submatrix = if stripes == :vertical
+            zeros(length(full), length(fragmented))
+        else
+            zeros(length(fragmented), length(full))
+        end
+        sizes = vcat(row_chunks', col_chunks')
+        counts = vec(prod(sizes, dims=1))
+        submatrix_vbuf = VBuffer(submatrix, counts)
+    else
+        submatrix_vbuf = VBuffer(nothing)
+    end
+    MPI.Gatherv!(localsubmatrix, submatrix_vbuf, 0, subcomm)
+    if teamrank != 1
+        submatrix = nothing
+    end
+    submatrix
+end
+
+
 function submatrixargmax(
     f::Function, # real valued function
     A::AbstractMatrix{T},
@@ -235,7 +276,10 @@ function arrlu(
     abstol::Number=0.0,
     leftorthogonal::Bool=true,
     numrookiter::Int=5,
-    usebatcheval::Bool=false
+    usebatcheval::Bool=false,
+    leaders::Vector=Int[],
+    leaderslist::Vector=Int[],
+    subcomm=nothing # TODO change: bad practice
 )::rrLU{ValueType} where {ValueType}
     lu = rrLU{ValueType}(matrixsize...; leftorthogonal)
     islowrank = false
@@ -243,50 +287,111 @@ function arrlu(
 
     _batchf = usebatcheval ? f : ((x, y) -> f.(x, y'))
 
+    if !isempty(leaders)
+        comm = MPI.COMM_WORLD
+        rank = MPI.Comm_rank(comm)
+        juliarank = rank + 1
+        nprocs = MPI.Comm_size(comm)
+        teamrank = MPI.Comm_rank(subcomm)
+        teamjuliarank = teamrank + 1
+        teamsize = MPI.Comm_size(subcomm)
+    end
+
     while true
         if leftorthogonal
             pushrandomsubset!(J0, 1:matrixsize[2], max(1, length(J0)))
         else
             pushrandomsubset!(I0, 1:matrixsize[1], max(1, length(I0)))
         end
-
+        
         for rookiter in 1:numrookiter
             colmove = (iseven(rookiter) == leftorthogonal)
-            submatrix = if colmove
-                _batchf(I0, lu.colpermutation)
-            else
-                _batchf(lu.rowpermutation, J0)
+            if !isempty(leaders) # Parallel
+                if teamsize > 1 # Parallel and leadership
+                    submatrix = if colmove
+                        distribute(_batchf, lu.colpermutation, I0, teamsize, teamjuliarank, :horizontal, subcomm)
+                    else
+                        distribute(_batchf, lu.rowpermutation, J0, teamsize, teamjuliarank, :vertical, subcomm)
+                    end
+                    MPI.Barrier(subcomm)
+                    if teamjuliarank == 1
+                        lu.npivot = 0
+                        _optimizerrlu!(lu, submatrix; maxrank, reltol, abstol)
+                        islowrank |= npivots(lu) < minimum(size(submatrix))
+                    end
+                else
+                    submatrix = if colmove
+                        _batchf(I0, lu.colpermutation)
+                    else
+                        _batchf(lu.rowpermutation, J0)
+                    end
+                    lu.npivot = 0
+                    _optimizerrlu!(lu, submatrix; maxrank, reltol, abstol)
+                    islowrank |= npivots(lu) < minimum(size(submatrix))
+                end    
+            else # Serial
+                submatrix = if colmove
+                    _batchf(I0, lu.colpermutation)
+                else
+                    _batchf(lu.rowpermutation, J0)
+                end
+                lu.npivot = 0
+                _optimizerrlu!(lu, submatrix; maxrank, reltol, abstol)
+                islowrank |= npivots(lu) < minimum(size(submatrix))
             end
-            lu.npivot = 0
-            _optimizerrlu!(lu, submatrix; maxrank, reltol, abstol)
-            islowrank |= npivots(lu) < minimum(size(submatrix))
-            if rowindices(lu) == I0 && colindices(lu) == J0
-                break
+            if !isempty(leaders) # Parallel
+                tb = time_ns()
+                lu = MPI.bcast([lu], subcomm)[1]
+                islowrank = MPI.bcast([islowrank], subcomm)[1]
+                tb = (time_ns() - tb)*1e-9
+                if rowindices(lu) == I0 && colindices(lu) == J0
+                    break
+                end
+                J0 = colindices(lu)
+                I0 = rowindices(lu)
+            else # Serial
+                if rowindices(lu) == I0 && colindices(lu) == J0
+                    break
+                end
+                J0 = colindices(lu)
+                I0 = rowindices(lu)
             end
-
-            J0 = colindices(lu)
-            I0 = rowindices(lu)
         end
 
+        
         if islowrank || length(I0) >= maxrank
             break
         end
     end
 
+    
     if size(lu.L, 1) < matrixsize[1]
         I2 = setdiff(1:matrixsize[1], I0)
         lu.rowpermutation = vcat(I0, I2)
-        L2 = _batchf(I2, J0)
-        cols2Lmatrix!(L2, (@view lu.U[1:lu.npivot, 1:lu.npivot]), leftorthogonal)
-        lu.L = vcat((@view lu.L[1:lu.npivot, 1:lu.npivot]), L2)
+        L2 = if !isempty(leaders)
+            distribute(_batchf, J0, I2, teamsize, teamjuliarank, :horizontal, subcomm)
+        else
+            _batchf(I2, J0)
+        end
+        if isempty(leaders) || teamjuliarank == 1
+            cols2Lmatrix!(L2, (@view lu.U[1:lu.npivot, 1:lu.npivot]), leftorthogonal)
+            lu.L = vcat((@view lu.L[1:lu.npivot, 1:lu.npivot]), L2)
+        end
     end
 
+
     if size(lu.U, 2) < matrixsize[2]
         J2 = setdiff(1:matrixsize[2], J0)
         lu.colpermutation = vcat(J0, J2)
-        U2 = _batchf(I0, J2)
-        rows2Umatrix!(U2, (@view lu.L[1:lu.npivot, 1:lu.npivot]), leftorthogonal)
-        lu.U = hcat((@view lu.U[1:lu.npivot, 1:lu.npivot]), U2)
+        U2 = if !isempty(leaders)
+            distribute(_batchf, I0, J2, teamsize, teamjuliarank, :vertical, subcomm)
+        else
+            U2 = _batchf(I0, J2)
+        end
+        if isempty(leaders) || teamjuliarank == 1
+            rows2Umatrix!(U2, (@view lu.L[1:lu.npivot, 1:lu.npivot]), leftorthogonal)
+            lu.U = hcat((@view lu.U[1:lu.npivot, 1:lu.npivot]), U2)
+        end
     end
 
     return lu
diff --git a/src/mpi.jl b/src/mpi.jl
new file mode 100644
index 0000000..f9870c0
--- /dev/null
+++ b/src/mpi.jl
@@ -0,0 +1,63 @@
+"""
+    function initializempi()
+
+Initialize the MPI environment if it has not been initialized yet. If mute=true, then all the processes with rank>0 (i.e. not the root node) won't output anything to stdout.
+"""
+function initializempi(mute::Bool=true)
+    if !MPI.Initialized()
+        MPI.Init()
+    end
+    comm = MPI.COMM_WORLD
+    mpirank = MPI.Comm_rank(comm)
+    if mute # Mute all processors which have mpirank != 0.
+        if mpirank != 0
+            open("/dev/null", "w") do devnull
+                redirect_stdout(devnull)
+                redirect_stderr(devnull)
+            end
+        end
+    end
+end
+
+"""
+    function finalizempi()
+
+Finalize the MPI environment if it has not been finalized yet.
+"""
+function finalizempi()
+    MPI.Barrier(MPI.COMM_WORLD)
+    if MPI.Comm_rank == 0
+        if !MPI.Finalized()
+            MPI.Finalize()
+        end
+    end
+end
+
+function synchronize_tt!(tt::Union{TensorTrain{ValueType,N},TensorCI2{ValueType}}; subcomm = nothing, juliasource::Int = 1) where {ValueType, N}
+    if !MPI.Initialized()
+        println("Warning! synchronize_tt has been called, but MPI is not initialized, please use TCI.initializempi() before contract() and use TCI.finalizempi() afterwards")
+    else
+        if subcomm != nothing
+            comm = subcomm
+        else
+            comm = MPI.COMM_WORLD
+        end
+        mpirank = MPI.Comm_rank(comm)
+        juliarank = mpirank + 1
+        nprocs = MPI.Comm_size(comm)
+
+        MPI.Barrier(comm)
+        reqs = MPI.Request[]
+        if juliarank == juliasource
+            for j in 1:nprocs
+                if j != juliarank
+                    push!(reqs, MPI.isend(tt.sitetensors[1:end], comm; dest=j-1, tag=juliarank))
+                end
+            end
+        end
+        if juliarank != juliasource
+            tt.sitetensors[1:end] = MPI.recv(comm; source=juliasource-1, tag=juliasource)
+        end
+        MPI.Waitall(reqs)
+    end
+end
\ No newline at end of file
diff --git a/src/sweepstrategies.jl b/src/sweepstrategies.jl
index c53dabc..b14bdc5 100644
--- a/src/sweepstrategies.jl
+++ b/src/sweepstrategies.jl
@@ -3,4 +3,4 @@ function forwardsweep(sweepstrategy::Symbol, iteration::Int)
         (sweepstrategy == :forward) ||
         (sweepstrategy == :backandforth && isodd(iteration))
     )
-end
+end
\ No newline at end of file
diff --git a/src/tensorci2.jl b/src/tensorci2.jl
index c85db93..06c607b 100644
--- a/src/tensorci2.jl
+++ b/src/tensorci2.jl
@@ -250,7 +250,8 @@ function addglobalpivots2sitesweep!(
     pivotsearch::Symbol=:full,
     verbosity::Int=0,
     ntry::Int=10,
-    strictlynested::Bool=false
+    strictlynested::Bool=false,
+    multithread_eval::Bool=false
 )::Int where {F,ValueType}
     if any(length(tci) .!= length.(pivots))
         throw(DimensionMismatch("Please specify a pivot as one index per leg of the MPS."))
@@ -270,7 +271,8 @@ function addglobalpivots2sitesweep!(
             maxbonddim=maxbonddim,
             pivotsearch=pivotsearch,
             strictlynested=strictlynested,
-            verbosity=verbosity)
+            verbosity=verbosity,
+            multithread_eval=multithread_eval)
 
         newpivots = [p for p in pivots if abs(evaluate(tci, p) - f(p)) > abstol]
 
@@ -502,7 +504,214 @@ function (obj::SubMatrix{T})(irows::Vector{Int}, icols::Vector{Int})::Matrix{T}
     return res
 end
 
+"""
+    Noderanges(nprocs, nbonds, localdim, estimatedbond; interbond=false, estimatedbonds=nothing, algorithm=:tci, efficiencycheck=false)
+
+Distributes the computational workload efficiently across multiple nodes for tensor train algorithms.
+
+# Arguments
+- `nprocs::Int`: Number of processes (nodes) to distribute the workload across.
+- `nbonds::Int`: Number of bonds in the tensor train.
+- `localdim::Int`: Local dimension of the tensor train.
+- `estimatedbond::Int`: Estimated bond dimension after compression, set by the user to help balance the workload.
+- `interbond::Bool=false`: If `true`, distributes workload across bonds; if `false`, only function evaluation is distributed, equivalent to serial TCI.
+- `estimatedbonds::Union{Nothing, Vector{Int}}=nothing`: Optional vector of estimated bond dimensions for each bond. If not provided, it is computed from `estimatedbond`.
+- `algorithm::Symbol=:tci`: Algorithm being parallelized (e.g., `:tci`, `:mpompo`). Different algorithms may have different workload characteristics.
+- `efficiencycheck::Bool=false`: If `true`, the function also returns the efficiency of the distribution.
+
+# Returns
+A tuple containing the ranges of work assigned to each node. If `efficiencycheck` is `true`, also returns the efficiency of the distribution.
+
+# Notes
+- The function is designed to optimize workload distribution for tensor train computations, taking into account the estimated bond dimensions and the chosen algorithm.
+- Proper setting of `estimatedbond` or `estimatedbonds` can improve parallel efficiency.
+"""
+
+
+function _noderanges(nprocs::Int, nbonds::Int, localdim::Int, estimatedbond::Int; interbond::Bool=true, estimatedbonds::Union{Vector{Int},Nothing}=nothing, algorithm::Symbol=:tci, efficiencycheck::Bool=false)::Union{Vector{UnitRange{Int}},Tuple{Vector{UnitRange{Int}},Float64}}
+    if !interbond
+        return [1:nbonds for _ in 1:nprocs]
+    end
+    if estimatedbonds === nothing
+        estimatedbonds = [estimatedbond for _ in 1:nbonds-1]
+        i = 1
+        while localdim^i < estimatedbond && i <= nbonds/2
+            estimatedbonds[i] = localdim^i
+            estimatedbonds[end-i+1] = localdim^i
+            i = i + 1
+        end 
+    end
+
+    if algorithm == :tci
+        # ~ chi^3
+        costs = [(i == 1 ? 1 : estimatedbonds[i-1]) * (i == nbonds ? 1 : estimatedbonds[i]) * min((i == 1 ? 1 : estimatedbonds[i-1]) * (i == nbonds ? 1 : estimatedbonds[i])) for i in 1:nbonds]
+    else
+        # ~ chi^4
+        costs = [(i == 1 ? 1 : estimatedbonds[i-1]) * (i == nbonds ? 1 : estimatedbonds[i]) * estimatedbond * min((i == 1 ? 1 : estimatedbonds[i-1]) * (i == nbonds ? 1 : estimatedbonds[i])) for i in 1:nbonds]
+    end
+    total_cost = sum(costs)
+    target_cost = total_cost / nprocs
+
+    assignments = Vector{UnitRange{Int}}()
+
+    i = 1  # bond index
+    node = 1
+    current_cost = 0
+    range_start = 1
+
+    while i <= nbonds && node < nprocs
+        bond_cost = costs[i]
+
+        if current_cost + bond_cost <= target_cost * 1.3
+            # Add the current bond to the node's range
+            current_cost += bond_cost
+            i += 1
+        else
+            # Current node is full, move to the next node
+            push!(assignments, range_start:i-1)
+            node += 1
+            current_cost = bond_cost
+            range_start = i
+            i += 1
+        end
+    end
+    push!(assignments, range_start:nbonds)
+    leftover = nprocs-node
+    if leftover > 0 
+        for k in 1:Int(ceil(leftover/node))
+            for j in 1:(k == Int(ceil(leftover/node)) ? rem(leftover, node) : node)
+                if j%2 == 1
+                    indx = Int(j + (j-1)/2 * (k-1))
+                    insert!(assignments, indx, assignments[indx])
+                else
+                    indx = Int(length(assignments)+2-j - (j/2)*(k-1))
+                    insert!(assignments, indx, assignments[indx])
+                end
+            end
+        end
+    end
+    works = [sum(costs[i]) for i in assignments]
+    efficiency = total_cost / maximum(works)
+    if efficiencycheck 
+        return assignments, efficiency
+    else
+        return assignments
+    end
+end
+
+"""
+    _leaders(nprocs::Int, noderanges::Vector{UnitRange{Int}}) -> (leaders::Vector{Int}, leaderslist::Vector{Int})
+
+Determine the leader processes among `nprocs` processes based on their assigned node ranges.
+
+A process is considered a leader (`+1`) if its node range is equal to the next process's node range, and not a leader (`-1`) if its node range is equal to the previous process's node range. All other processes are marked as `0`. Only leaders are responsible for communication.
+
+# Arguments
+- `nprocs::Int`: The total number of processes.
+- `noderanges::Vector{UnitRange{Int}}`: A vector specifying the node range assigned to each process.
+
+# Returns
+- `leaders::Vector{Int}`: An array where each entry is `1` for a leader, `-1` for a worker, and `0` otherwise.
+- `leaderslist::Vector{Int}`: A list of process indices that are leaders and participate in communication.
+
+# Notes
+- Communication is performed only between leaders.
+"""
+function _leaders(nprocs::Int, noderanges::Vector{UnitRange{Int}})
+    leaders = zeros(Int, nprocs)
+    for i in 1:nprocs
+        if i < nprocs
+            if noderanges[i] == noderanges[i + 1]
+                leaders[i] = 1
+            end
+        end
+        if i > 1
+            if noderanges[i] == noderanges[i - 1]
+                leaders[i] = -1
+            end
+        end
+    end
+    leaderslist = []
+    for i in 1:nprocs
+        if leaders[i] != -1
+            push!(leaderslist, i)
+        end
+    end
+    return leaders, leaderslist
+end
+
+"""
+multithreadPi(ValueType, f, localdims, Icombined, Jcombined)
+Evaluate the function `f` on a grid defined by `Icombined` and `Jcombined` using multiple threads.
+"""
+function multithreadPi(::Type{ValueType}, f, localdims::Union{Vector{Int},NTuple{N,Int}}, Icombined::Vector{MultiIndex}, Jcombined::Vector{MultiIndex}) where {N,ValueType}
+    Pi = zeros(ValueType, length(Icombined), length(Jcombined))
+    nthreads = Threads.nthreads()
+    chunk_size, remainder = divrem(length(Jcombined), nthreads)
+    col_ranges = [((i - 1) * chunk_size + min(i - 1, remainder) + 1):(i * chunk_size + min(i, remainder)) for i in 1:nthreads]
+    @threads for tid in 1:nthreads
+        if !isempty(col_ranges[tid])
+            Pi[:, col_ranges[tid]] = filltensor(
+                ValueType, f, localdims,
+                Icombined, Jcombined[col_ranges[tid]], Val(0)
+            )
+        end
+    end
+    return Pi
+end
 
+function parallelfullsearch(::Type{ValueType}, f, tci, Icombined, Jcombined, teamsize, multithread_eval, teamjuliarank, subcomm) where {ValueType}
+    if teamsize == 1
+        t1 = time_ns()
+        if multithread_eval
+            Pi = multithreadPi(ValueType, f, tci.localdims, Icombined, Jcombined)
+        else
+            Pi = reshape(
+                filltensor(ValueType, f, tci.localdims,
+                Icombined, Jcombined, Val(0)),
+                length(Icombined), length(Jcombined)
+            )
+        end
+        t2 = time_ns()
+        # For compilation purpouse with MPI
+        Icombined_copy = Icombined
+        Jcombined_copy = Jcombined
+    else # Teamwork
+        # Coordinate
+        t1 = time_ns()
+        Icombined_copy = MPI.bcast([Icombined], subcomm)[1]
+        Jcombined_copy = MPI.bcast([Jcombined], subcomm)[1]
+        # Subdivide
+        chunksize, remainder = divrem(length(Jcombined_copy), teamsize)
+        col_chunks = [chunksize + (i <= remainder ? 1 : 0) for i in 1:teamsize]
+        row_chunks = [length(Icombined_copy) for _ in 1:teamsize]
+        ranges = [(vcat([0],cumsum(col_chunks))[i] + 1):(cumsum(col_chunks)[i]) for i in 1:teamsize]
+        Jcombined_copy_local = Jcombined_copy[ranges[teamjuliarank]]
+        if !isempty(Jcombined_copy_local)
+            tlocalPi = time_ns()
+            if multithread_eval
+                localPi = multithreadPi(ValueType, f, tci.localdims, Icombined_copy, Jcombined_copy_local)
+            else
+                localPi = reshape(
+                    filltensor(ValueType, f, tci.localdims,
+                    Icombined_copy, Jcombined_copy_local, Val(0)),
+                    length(Icombined_copy), length(Jcombined_copy_local)
+                )
+            end
+            tlocalPi = (time_ns() - tlocalPi) * 1e-9
+        else
+            localPi = zeros(length(Icombined_copy), length(Jcombined_copy_local))
+        end
+        # Unite
+        Pi = zeros(length(Icombined_copy), length(Jcombined_copy))
+        sizes = vcat(row_chunks', col_chunks')
+        counts = vec(prod(sizes, dims=1))
+        Pi_vbuf = VBuffer(Pi, counts)
+        MPI.Allgatherv!(localPi, Pi_vbuf, subcomm)
+        t2 = time_ns()
+    end
+    return Pi, t1, t2
+end
 """
 Update pivots at bond `b` of `tci` using the TCI2 algorithm.
 Site tensors will be invalidated.
@@ -513,28 +722,52 @@ function updatepivots!(
     f::F,
     leftorthogonal::Bool;
     reltol::Float64=1e-14,
-    abstol::Float64=0.0,
+    abstol::Float64=1e-14, #it was 0...
     maxbonddim::Int=typemax(Int),
     sweepdirection::Symbol=:forward,
     pivotsearch::Symbol=:full,
     verbosity::Int=0,
     extraIset::Vector{MultiIndex}=MultiIndex[],
     extraJset::Vector{MultiIndex}=MultiIndex[],
+    estimatedbond::Int=100,
+    interbond::Bool=true,
+    multithread_eval::Bool=false,
+    subcomm=nothing
 ) where {F,ValueType}
     invalidatesitetensors!(tci)
 
+
+    if sweepdirection == :parallel && MPI.Initialized()
+        comm = MPI.COMM_WORLD
+        mpirank = MPI.Comm_rank(comm)
+        juliarank = mpirank + 1
+        nprocs = MPI.Comm_size(comm)
+        noderanges = _noderanges(nprocs, length(tci) - 1, tci.localdims[1], estimatedbond; interbond)
+        leaders, leaderslist = _leaders(nprocs, noderanges)
+        teamrank = MPI.Comm_rank(subcomm)
+        teamjuliarank = teamrank + 1
+        teamsize = MPI.Comm_size(subcomm)
+    end
+
     Icombined = union(kronecker(tci.Iset[b], tci.localdims[b]), extraIset)
     Jcombined = union(kronecker(tci.localdims[b+1], tci.Jset[b+1]), extraJset)
 
     luci = if pivotsearch === :full
-        t1 = time_ns()
-        Pi = reshape(
-            filltensor(ValueType, f, tci.localdims,
-            Icombined, Jcombined, Val(0)),
-            length(Icombined), length(Jcombined)
-        )
-        t2 = time_ns()
-
+        if sweepdirection == :parallel && MPI.Initialized()
+            Pi, t1, t2 = parallelfullsearch(ValueType, f, tci, Icombined, Jcombined, teamsize, multithread_eval, teamjuliarank, subcomm)
+        else # Serial
+            t1 = time_ns()
+            if multithread_eval
+                Pi = multithreadPi(ValueType, f, tci.localdims, Icombined, Jcombined)
+            else
+                Pi = reshape(
+                    filltensor(ValueType, f, tci.localdims,
+                    Icombined, Jcombined, Val(0)),
+                    length(Icombined), length(Jcombined)
+                )
+            end
+            t2 = time_ns()
+        end
         updatemaxsample!(tci, Pi)
         luci = MatrixLUCI(
             Pi,
@@ -553,30 +786,65 @@ function updatepivots!(
         t1 = time_ns()
         I0 = Int.(Iterators.filter(!isnothing, findfirst(isequal(i), Icombined) for i in tci.Iset[b+1]))::Vector{Int}
         J0 = Int.(Iterators.filter(!isnothing, findfirst(isequal(j), Jcombined) for j in tci.Jset[b]))::Vector{Int}
-        Pif = SubMatrix{ValueType}(f, Icombined, Jcombined)
         t2 = time_ns()
-        res = MatrixLUCI(
-            ValueType,
-            Pif,
-            (length(Icombined), length(Jcombined)),
-            I0, J0;
-            reltol=reltol, abstol=abstol,
-            maxrank=maxbonddim,
-            leftorthogonal=leftorthogonal,
-            pivotsearch=:rook,
-            usebatcheval=true
-        )
+        if verbosity > 2
+            x, y = length(Icombined), length(Jcombined)
+            print("    Computing Pi ($x x $y) at bond $b: ")
+        end
+        if sweepdirection == :parallel && MPI.Initialized()
+            I0 = MPI.bcast([I0], subcomm)[1]
+            J0 = MPI.bcast([J0], subcomm)[1]
+            Icombined_copy = MPI.bcast([Icombined], subcomm)[1]
+            Jcombined_copy = MPI.bcast([Jcombined], subcomm)[1]
+            Pif = SubMatrix{ValueType}(f, Icombined_copy, Jcombined_copy)
+            res = MatrixLUCI(
+                ValueType,
+                Pif,
+                (length(Icombined_copy), length(Jcombined_copy)),
+                I0, J0;
+                reltol=reltol, abstol=abstol,
+                maxrank=maxbonddim,
+                leftorthogonal=leftorthogonal,
+                pivotsearch=:rook,
+                usebatcheval=true,
+                leaders=leaders,leaderslist=leaderslist, subcomm=subcomm
+            )
+            MPI.Barrier(subcomm)
+        else # Serial
+            Pif = SubMatrix{ValueType}(f, Icombined, Jcombined)
+            res = MatrixLUCI(
+                ValueType,
+                Pif,
+                (length(Icombined), length(Jcombined)),
+                I0, J0;
+                reltol=reltol, abstol=abstol,
+                maxrank=maxbonddim,
+                leftorthogonal=leftorthogonal,
+                pivotsearch=:rook,
+                usebatcheval=true,
+            )
+        end
         updatemaxsample!(tci, [ValueType(Pif.maxsamplevalue)])
 
         t3 = time_ns()
 
         # Fall back to full search if rook search fails
         if npivots(res) == 0
-            Pi = reshape(
-                filltensor(ValueType, f, tci.localdims,
-                Icombined, Jcombined, Val(0)),
-                length(Icombined), length(Jcombined)
-            )
+            if sweepdirection == :parallel && MPI.Initialized()
+                Pi, _, _ = parallelfullsearch(ValueType, f, tci, Icombined, Jcombined, teamsize, multithread_eval, teamjuliarank, subcomm)
+            else # Serial
+                t1 = time_ns()
+                if multithread_eval
+                    Pi = multithreadPi(ValueType, f, tci.localdims, Icombined, Jcombined)
+                else
+                    Pi = reshape(
+                        filltensor(ValueType, f, tci.localdims,
+                        Icombined, Jcombined, Val(0)),
+                        length(Icombined), length(Jcombined)
+                    )
+                end
+                t2 = time_ns()
+            end
             updatemaxsample!(tci, Pi)
             res = MatrixLUCI(
                 Pi,
@@ -588,17 +856,56 @@ function updatepivots!(
         end
 
         t4 = time_ns()
+        #if verbosity > 2
+        #    x, y = length(Icombined), length(Jcombined)
+        #    println("    Computing Pi ($x x $y) at bond $b: $(1e-9*(t2-t1)) sec, LU: $(1e-9*(t3-t2)) sec, fall back to full: $(1e-9*(t4-t3)) sec")
+        #end
         if verbosity > 2
             x, y = length(Icombined), length(Jcombined)
-            println("    Computing Pi ($x x $y) at bond $b: $(1e-9*(t2-t1)) sec, LU: $(1e-9*(t3-t2)) sec, fall back to full: $(1e-9*(t4-t3)) sec")
+            println("$(1e-9*(t2-t1)) sec, LU: $(1e-9*(t3-t2)) sec, fall back to full: $(1e-9*(t4-t3)) sec")
         end
         res
     else
         throw(ArgumentError("Unknown pivot search strategy $pivotsearch. Choose from :rook, :full."))
     end
-    tci.Iset[b+1] = Icombined[rowindices(luci)]
-    tci.Jset[b] = Jcombined[colindices(luci)]
-    if length(extraIset) == 0 && length(extraJset) == 0
+    # Receive after the send
+    if sweepdirection == :parallel && MPI.Initialized()
+        sreq = MPI.Request[]
+        if leaders[juliarank] != -1
+            if length(noderanges[juliarank]) == 1 && b == noderanges[juliarank][1] && juliarank != leaderslist[1] && juliarank != leaderslist[end] # If processor has only 1 bond then communicates both left and right
+                currentleader = findfirst(isequal(juliarank), leaderslist)
+                if currentleader != nothing
+                    push!(sreq, MPI.isend([Jcombined[colindices(luci)]], comm; dest=leaderslist[currentleader - 1] - 1, tag=(2*b+1)))
+                    push!(sreq, MPI.isend([Icombined[rowindices(luci)]], comm; dest=leaderslist[currentleader + 1] - 1, tag=2*(b+1)))
+                else
+                    println("Error! Missmatch between leaders and leaderslist, unexpected behaviour! Please open an issue")
+                end
+            elseif b == noderanges[juliarank][1] && juliarank != leaderslist[1] # processor communicates left
+                currentleader = findfirst(isequal(juliarank), leaderslist)
+                if currentleader != nothing
+                    push!(sreq, MPI.isend([Jcombined[colindices(luci)]], comm; dest=leaderslist[currentleader - 1] - 1, tag=(2*b+1)))
+                else
+                    println("Error! Missmatch between leaders and leaderslist, unexpected behaviour! Please open an issue")
+                end
+                tci.Iset[b+1] = Icombined[rowindices(luci)]
+            elseif b == noderanges[juliarank][end] && juliarank != leaderslist[end] # processor communicates right
+                currentleader = findfirst(isequal(juliarank), leaderslist)
+                if currentleader != nothing
+                    push!(sreq, MPI.isend([Icombined[rowindices(luci)]], comm; dest=leaderslist[currentleader + 1] - 1, tag=2*(b+1)))
+                else
+                    println("Error! Missmatch between leaders and leaderslist, unexpected behaviour! Please open an issue")
+                end
+                tci.Jset[b] = Jcombined[colindices(luci)]
+            else # normal updates without MPI communications
+                tci.Iset[b+1] = Icombined[rowindices(luci)]
+                tci.Jset[b] = Jcombined[colindices(luci)]
+            end
+        end
+    else
+        tci.Iset[b+1] = Icombined[rowindices(luci)]
+        tci.Jset[b] = Jcombined[colindices(luci)]
+    end
+    if length(extraIset) == 0 && length(extraJset) == 0 && sweepdirection != :parallel
         setsitetensor!(tci, b, left(luci))
         setsitetensor!(tci, b + 1, right(luci))
     end
@@ -660,7 +967,11 @@ end
         maxnglobalpivot::Int=5,
         nsearchglobalpivot::Int=5,
         tolmarginglobalsearch::Float64=10.0,
-        strictlynested::Bool=false
+        strictlynested::Bool=false,
+        checkbatchevaluatable::Bool=false,
+        multithread_eval::Bool=false,
+        estimatedbond::Int=100,
+        interbond::Bool=true
     ) where {ValueType}
 
 Perform optimization sweeps using the TCI2 algorithm. This will sucessively improve the TCI approximation of a function until it fits `f` with an error smaller than `tolerance`, or until the maximum bond dimension (`maxbonddim`) is reached.
@@ -673,7 +984,7 @@ Arguments:
 - `tolerance::Float64` is a float specifying the target tolerance for the interpolation. Default: `1e-8`.
 - `maxbonddim::Int` specifies the maximum bond dimension for the TCI. Default: `typemax(Int)`, i.e. effectively unlimited.
 - `maxiter::Int` is the maximum number of iterations (i.e. optimization sweeps) before aborting the TCI construction. Default: `200`.
-- `sweepstrategy::Symbol` specifies whether to sweep forward (:forward), backward (:backward), or back and forth (:backandforth) during optimization. Default: `:backandforth`.
+- `sweepstrategy::Symbol` specifies whether to sweep forward (:forward), backward (:backward), back and forth (:backandforth) or in parallel (:parallel) during optimization. Default: `:backandforth`.
 - `pivotsearch::Symbol` determins how pivots are searched (`:full` or `:rook`). Default: `:full`.
 - `verbosity::Int` can be set to `>= 1` to get convergence information on standard output during optimization. Default: `0`.
 - `loginterval::Int` can be set to `>= 1` to specify how frequently to print convergence information. Default: `10`.
@@ -683,6 +994,9 @@ Arguments:
 - `maxnglobalpivot::Int` can be set to `>= 0`. Default: `5`. The maximum number of global pivots to add in each iteration.
 - `strictlynested::Bool` determines whether to preserve partial nesting in the TCI algorithm. Default: `false`.
 - `checkbatchevaluatable::Bool` Check if the function `f` is batch evaluatable. Default: `false`.
+- `multithread_eval::Bool` determines whether to use multithreading for function evaluation. Default: `false`.
+- `estimatedbond::Int` is the estimated bond dimension after compression, set by the user to help balance the workload. Default: `100`.
+- `interbond::Bool` determines whether to distribute the workload across bonds or not. Default: `true`.
 - `checkconvglobalpivot::Bool` Check if the global pivot finder is converged. Default: `true`. In the future, this will be set to `false` by default.
 
 Arguments (deprecated):
@@ -716,6 +1030,9 @@ function optimize!(
     tolmarginglobalsearch::Float64=10.0,
     strictlynested::Bool=false,
     checkbatchevaluatable::Bool=false,
+    multithread_eval::Bool=false,
+    estimatedbond::Int=100,
+    interbond::Bool=true,
     checkconvglobalpivot::Bool=true
 ) where {ValueType}
     errors = Float64[]
@@ -753,6 +1070,18 @@ function optimize!(
         ))
     end
 
+    if (sweepstrategy == :parallel) && !MPI.Initialized()
+        println("Warning! Parallel strategy has been chosen, but MPI is not initialized, please use initializempi() before using crossinterpolate2()/quanticscrossinterpolate() and use finalizempi() afterwards")
+    end
+
+    if sweepstrategy == :parallel && MPI.Initialized()
+        comm = MPI.COMM_WORLD
+        mpirank = MPI.Comm_rank(comm)
+        juliarank = mpirank + 1
+	    nprocs = MPI.Comm_size(comm)
+        noderanges = _noderanges(nprocs, length(tci) - 1, tci.localdims[1], estimatedbond; interbond)
+        leaders, leaderslist = _leaders(nprocs, noderanges)
+    end
     # Create the global pivot finder
     finder = if isnothing(globalpivotfinder)
         DefaultGlobalPivotFinder(
@@ -783,7 +1112,10 @@ function optimize!(
             strictlynested=strictlynested,
             verbosity=verbosity,
             sweepstrategy=sweepstrategy,
-            fillsitetensors=true
+            fillsitetensors=true,
+            estimatedbond=estimatedbond,
+            interbond=interbond,
+            multithread_eval=multithread_eval
             )
         if verbosity > 0 && length(globalpivots) > 0 && mod(iter, loginterval) == 0
             abserr = [abs(evaluate(tci, p) - f(p)) for p in globalpivots]
@@ -801,14 +1133,16 @@ function optimize!(
         end
 
         # Find global pivots where the error is too large
-        input = GlobalPivotSearchInput(tci)
-        globalpivots = finder(
-            input, f, abstol;
-            verbosity=verbosity,
-            rng=Random.default_rng()
-        )
-        addglobalpivots!(tci, globalpivots)
-        push!(nglobalpivots, length(globalpivots))
+        if sweepstrategy != :parallel
+            input = GlobalPivotSearchInput(tci)
+            globalpivots = finder(
+                input, f, abstol;
+                verbosity=verbosity,
+                rng=Random.default_rng()
+            )
+            addglobalpivots!(tci, globalpivots)
+            push!(nglobalpivots, length(globalpivots))
+        end
 
         if verbosity > 1
             println("  Walltime $(1e-9*(time_ns() - tstart)) sec: done searching global pivots")
@@ -820,16 +1154,50 @@ function optimize!(
             println("iteration = $iter, rank = $(last(ranks)), error= $(last(errors)), maxsamplevalue= $(tci.maxsamplevalue), nglobalpivot=$(length(globalpivots))")
             flush(stdout)
         end
-        if convergencecriterion(
-            ranks, errors,
-            nglobalpivots,
-            abstol, maxbonddim, ncheckhistory;
-            checkconvglobalpivot=checkconvglobalpivot
+        # Checks if every processor has converged with its own subtrain
+        if sweepstrategy == :parallel && MPI.Initialized()
+            converged = leaders[juliarank] == -1 || convergencecriterion(ranks, errors, nglobalpivots, abstol, maxbonddim, ncheckhistory; checkconvglobalpivot=checkconvglobalpivot)
+            MPI.Barrier(comm)
+            global_converged = MPI.Allreduce(converged, MPI.LAND, comm)
+            if global_converged # If all the processors have converged
+                break
+            end
+        elseif convergencecriterion(
+            ranks, errors, nglobalpivots, abstol, maxbonddim, ncheckhistory; checkconvglobalpivot=checkconvglobalpivot
         )
             break
         end
     end
 
+    # After convergence
+    # Allgatherv! not possible given type constraints
+    if sweepstrategy == :parallel && MPI.Initialized()
+        if juliarank == 1 # If we are the first processor, we take all the information from the other processes
+            for leader in leaderslist[2:end]
+                for b in noderanges[leader]
+                    tci.Iset[b] = MPI.recv(comm; source=leader - 1, tag=2*(b))[1]
+                    tci.Jset[b+1] = MPI.recv(comm; source=leader - 1, tag=2*(b+1)+1)[1]
+                    if leader == leaderslist[end]
+                        tci.Iset[length(tci)] = MPI.recv(comm; source=leader - 1, tag=2*(length(tci)+1)+1)[1]
+                    end
+                end
+            end
+        else # Other processors pass the infomation to the first one
+            if leaders[juliarank] != -1
+                for b in noderanges[juliarank]
+                    MPI.send([tci.Iset[b]], comm; dest=0, tag=2*(b))
+                    MPI.send([tci.Jset[b+1]], comm; dest=0, tag=2*(b+1)+1)
+                    if juliarank == leaderslist[end]
+                        MPI.send([tci.Iset[length(tci)]], comm; dest=0, tag=2*(length(tci)+1)+1)
+                    end
+                end
+            end
+        end
+        # BCast to all, so that all nodes have full knowledge
+        tci.Iset = MPI.bcast(tci.Iset, comm)
+        tci.Jset = MPI.bcast(tci.Jset, comm)
+    end
+
     # Extra one sweep by the 1-site update to
     #  (1) Remove unnecessary pivots added by global pivots
     #      Note: a pivot matrix can be non-square after adding global pivots,
@@ -861,14 +1229,23 @@ function sweep2site!(
     pivotsearch::Symbol=:full,
     verbosity::Int=0,
     strictlynested::Bool=false,
-    fillsitetensors::Bool=true
+    fillsitetensors::Bool=true,
+    estimatedbond::Int=100,
+    interbond::Bool=true,
+    multithread_eval::Bool=false
 ) where {ValueType}
     invalidatesitetensors!(tci)
 
     n = length(tci)
 
-    for iter in iter1:iter1+niter-1
+    if sweepstrategy == :parallel && MPI.Initialized()
+        comm = MPI.COMM_WORLD
+        mpirank = MPI.Comm_rank(comm)
+        juliarank = mpirank + 1
+        nprocs = MPI.Comm_size(comm)
+    end
 
+    for iter in iter1:iter1+niter-1
         extraIset = [MultiIndex[] for _ in 1:n]
         extraJset = [MultiIndex[] for _ in 1:n]
         if !strictlynested && length(tci.Iset_history) > 0
@@ -880,7 +1257,56 @@ function sweep2site!(
         push!(tci.Jset_history, deepcopy(tci.Jset))
 
         flushpivoterror!(tci)
-        if forwardsweep(sweepstrategy, iter) # forward sweep
+        if sweepstrategy == :parallel && MPI.Initialized()
+            noderanges = _noderanges(nprocs, length(tci) - 1, tci.localdims[1], estimatedbond; interbond)
+            leaders, leaderslist = _leaders(nprocs, noderanges)
+            colors = zeros(Int,nprocs)
+            count = -1
+            for i in 1:length(colors)
+                if leaders[i] != -1
+                    count += 1
+                end
+                colors[i] = count
+            end
+            color = colors[juliarank]
+            subcomm = MPI.Comm_split(comm, color, mpirank)
+            for bondindex in noderanges[juliarank]
+                updatepivots!(
+                    tci, bondindex, f, true;
+                    abstol=abstol,
+                    maxbonddim=maxbonddim,
+                    sweepdirection=:parallel,
+                    pivotsearch=pivotsearch,
+                    verbosity=verbosity,
+                    extraIset=extraIset[bondindex+1],
+                    extraJset=extraJset[bondindex],
+                    estimatedbond=estimatedbond,
+                    interbond=interbond,
+                    multithread_eval=multithread_eval,
+                    subcomm = subcomm
+                )
+            end
+            for b in noderanges[juliarank]
+                if leaders[juliarank] != -1
+                    if b == noderanges[juliarank][1] && juliarank != leaderslist[1]
+                        currentleader = findfirst(isequal(juliarank), leaderslist)
+                        if currentleader != nothing
+                            tci.Iset[b] = MPI.recv(comm; source=leaderslist[currentleader - 1] - 1, tag=2*b)[1]
+                        else
+                            println("Error! Missmatch between leaders and leaderslist, unexpected behaviour! Please open an issue")
+                        end
+                    end
+                    if b == noderanges[juliarank][end] && juliarank != leaderslist[end]
+                        currentleader = findfirst(isequal(juliarank), leaderslist)
+                        if currentleader != nothing
+                            tci.Jset[b + 1] = MPI.recv(comm; source=leaderslist[currentleader + 1] - 1, tag=2*(b + 1) + 1)[1]
+                        else
+                            println("Error! Missmatch between leaders and leaderslist, unexpected behaviour! Please open an issue")
+                        end
+                    end
+                end
+            end
+        elseif forwardsweep(sweepstrategy, iter) # forward sweep
             for bondindex in 1:n-1
                 updatepivots!(
                     tci, bondindex, f, true;
@@ -891,6 +1317,7 @@ function sweep2site!(
                     verbosity=verbosity,
                     extraIset=extraIset[bondindex+1],
                     extraJset=extraJset[bondindex],
+                    multithread_eval=multithread_eval
                 )
             end
         else # backward sweep
@@ -904,12 +1331,13 @@ function sweep2site!(
                     verbosity=verbosity,
                     extraIset=extraIset[bondindex+1],
                     extraJset=extraJset[bondindex],
+                    multithread_eval=multithread_eval
                 )
             end
         end
     end
 
-    if fillsitetensors
+    if fillsitetensors && sweepstrategy != :parallel
         fillsitetensors!(tci, f)
     end
     nothing
@@ -994,4 +1422,4 @@ function searchglobalpivots(
     end
 
     return [p for (_,p) in pivots]
-end
+end
\ No newline at end of file
diff --git a/src/tensortrain.jl b/src/tensortrain.jl
index 2dc5fde..49e7f9a 100644
--- a/src/tensortrain.jl
+++ b/src/tensortrain.jl
@@ -93,8 +93,8 @@ function tensortrain(tci)
 end
 
 function _factorize(
-    A::AbstractMatrix{V}, method::Symbol; tolerance::Float64, maxbonddim::Int, leftorthogonal::Bool=false, normalizeerror=true
-)::Tuple{Matrix{V},Matrix{V},Int} where {V}
+    A::AbstractMatrix{V}, method::Symbol; tolerance::Float64, maxbonddim::Int, leftorthogonal::Bool=false, diamond::Bool=false, normalizeerror=true, q::Int=0, p::Int=16
+)::Union{Tuple{Matrix{V},Matrix{V},Int},Tuple{Matrix{V},Vector{V},Matrix{V},Int}} where {V}
     reltol = 1e-14
     abstol = 0.0
     if normalizeerror
@@ -114,22 +114,104 @@ function _factorize(
         normalized_err = err ./ sum(factorization.S .^ 2)
 
         trunci = min(
-            replacenothing(findfirst(<(abstol^2), err), length(err)),
-            replacenothing(findfirst(<(reltol^2), normalized_err), length(normalized_err)),
+            replacenothing(findlast(>(tolerance), Array(factorization.S)), 1),
             maxbonddim
         )
-        if leftorthogonal
+        if diamond
             return (
-                factorization.U[:, 1:trunci],
-                Diagonal(factorization.S[1:trunci]) * factorization.Vt[1:trunci, :],
-                trunci
-            )
+                    Matrix(factorization.U[:, 1:trunci]),
+                    factorization.S[1:trunci],
+                    Matrix(factorization.Vt[1:trunci, :]),
+                    trunci
+                )
         else
-            return (
-                factorization.U[:, 1:trunci] * Diagonal(factorization.S[1:trunci]),
-                factorization.Vt[1:trunci, :],
-                trunci
-            )
+            if leftorthogonal
+                return (
+                    Matrix(factorization.U[:, 1:trunci]),
+                    Matrix(Diagonal(factorization.S[1:trunci]) * factorization.Vt[1:trunci, :]),
+                    trunci
+                )
+            else
+                return (
+                    Matrix(factorization.U[:, 1:trunci] * Diagonal(factorization.S[1:trunci])),
+                    Matrix(factorization.Vt[1:trunci, :]),
+                    trunci
+                )
+            end
+        end
+    elseif method === :RSVD
+        invert = false
+        if size(A)[1] < size(A)[2]
+            A = A'
+            invert = true
+        end
+        if maxbonddim == typemax(Int) || maxbonddim + p > size(A)[1] || maxbonddim + p > size(A)[2]
+            factorization = LinearAlgebra.svd(A)
+        else
+            m, n = size(A)
+            G = randn(n, maxbonddim + p)
+            Y = A*G 
+            Q = Matrix(LinearAlgebra.qr!(Y).Q) # THEORETICAL BOTTLENECK
+            for _ in 1:q # q=0 for best performance
+                Y = A'*Q
+                Q = Matrix(LinearAlgebra.qr!(Y).Q)
+                Y = A*Q
+                Q = Matrix(LinearAlgebra.qr!(Y).Q)
+            end
+            B = Q' * A
+            factorization = LinearAlgebra.svd(B)
+            factorization = SVD((Q*factorization.U)[:,1:maxbonddim], factorization.S[1:maxbonddim], factorization.Vt[1:maxbonddim,:])
+        end
+        trunci = min(
+            replacenothing(findlast(>(tolerance), Array(factorization.S)), 1),
+            maxbonddim
+        )
+        if diamond
+            if invert
+                return (
+                    Matrix(factorization.Vt[1:trunci, :]'),
+                    factorization.S[1:trunci],
+                    Matrix(factorization.U[:, 1:trunci]'),
+                    trunci
+                )
+            else
+                return (
+                    Matrix(factorization.U[:, 1:trunci]),
+                    factorization.S[1:trunci],
+                    Matrix(factorization.Vt[1:trunci, :]),
+                    trunci
+                )
+            end
+        else
+            if leftorthogonal
+                if invert
+                    return (
+                        Matrix(factorization.Vt[1:trunci, :]' * Diagonal(factorization.S[1:trunci])),
+                        Matrix(factorization.U[:, 1:trunci]'),
+                        trunci
+                    )
+                else
+                    return (
+                        Matrix(factorization.U[:, 1:trunci]),
+                        Matrix(Diagonal(factorization.S[1:trunci]) * factorization.Vt[1:trunci, :]),
+                        trunci
+                    )
+                end
+            else
+                if invert
+                    return (
+                        Matrix(factorization.Vt[1:trunci, :]' * Diagonal(factorization.S[1:trunci])),
+                        Matrix(factorization.U[:, 1:trunci]'),
+                        trunci
+                    )
+                else
+                    return (
+                        Matrix(factorization.U[:, 1:trunci] * Diagonal(factorization.S[1:trunci])),
+                        Matrix(factorization.Vt[1:trunci, :]),
+                        trunci
+                    )
+                end
+            end
         end
     else
         error("Not implemented yet.")
@@ -182,7 +264,6 @@ function compress!(
     nothing
 end
 
-
 function multiply!(tt::TensorTrain{V,N}, a) where {V,N}
     tt.sitetensors[end] .= tt.sitetensors[end] .* a
     nothing
@@ -290,3 +371,7 @@ function fulltensor(obj::TensorTrain{T,N})::Array{T} where {T,N}
     returnsize = collect(Iterators.flatten(sitedims_))
     return reshape(result, returnsize...)
 end
+
+function IMPO(R::Int)
+    return TensorTrain{Float64, 4}([reshape([1.,0.,0.,1.], 1,2,2,1) for _ in 1:R])
+end
\ No newline at end of file
diff --git a/test/runtests.jl b/test/runtests.jl
index 768cb06..fbc9cf6 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -4,6 +4,7 @@ using LinearAlgebra
 
 include("test_with_aqua.jl")
 include("test_with_jet.jl")
+include("test_mpi.jl")
 include("test_util.jl")
 include("test_sweepstrategies.jl")
 include("test_indexset.jl")
diff --git a/test/test_contraction.jl b/test/test_contraction.jl
index 71f6009..d3db9ac 100644
--- a/test/test_contraction.jl
+++ b/test/test_contraction.jl
@@ -1,6 +1,8 @@
 import TensorCrossInterpolation as TCI
 using TensorCrossInterpolation
 using Random
+# TODO remove
+using Test
 
 function _tomat(tto::TensorTrain{T,4}) where {T}
     sitedims = TCI.sitedims(tto)
@@ -34,7 +36,7 @@ function _gen_testdata_TTO_TTO()
     bonddims_b = [1, 2, 3, 2, 1]
     localdims1 = [2, 2, 2, 2]
     localdims2 = [3, 3, 3, 3]
-    localdims3 = [2, 2, 2, 2]
+    localdims3 = [5, 5, 5, 5]
 
     a = TensorTrain{ComplexF64,4}([
         rand(ComplexF64, bonddims_a[n], localdims1[n], localdims2[n], bonddims_a[n+1])
@@ -181,15 +183,23 @@ end
     end
 end
 
-
-@testset "MPO-MPO contraction (zipup)" for method in [:SVD, :LU]
+@testset "MPO-MPO contraction" for algorithm in [:zipup, :distrzipup, :fit, :distrfit], method in [:SVD, :LU, :RSVD], maxbonddim in [typemax(Int), 10]
+    Random.seed!(42) # For reproducibility
     N, a, b, localdims1, localdims2, localdims3 = _gen_testdata_TTO_TTO()
-    ab = contract(a, b; algorithm=:zipup, method=method)
+    #println([size(a[i]) for i in 1:length(a)])
+    #println([size(b[i]) for i in 1:length(b)])
+    #println("$maxbonddim with $method")
+    ab = contract(a, b; algorithm=algorithm, method=method, maxbonddim=maxbonddim, nsweeps=10)
+    #println([size(a[i]) for i in 1:length(a)])
+    #println([size(b[i]) for i in 1:length(b)])
+    #println([size(ab[i]) for i in 1:length(ab)])
+
     @test _tomat(ab) ≈ _tomat(a) * _tomat(b)
 end
 
-@testset "MPO-MPS contraction (zipup)" for method in [:SVD, :LU]
+@testset "MPO-MPS contraction" for algorithm in [:zipup, :distrzipup, :fit, :distrfit], method in [:SVD, :LU, :RSVD], maxbonddim in [typemax(Int), 10]
+    Random.seed!(42) # For reproducibility
     N, a, b, localdims1, localdims2 = _gen_testdata_TTO_TTS()
-    ab = contract(a, b; algorithm=:zipup, method=method)
+    ab = contract(a, b; algorithm=algorithm, method=method, maxbonddim=maxbonddim, nsweeps=10)
     @test _tovec(ab) ≈ _tomat(a) * _tovec(b)
 end
\ No newline at end of file
diff --git a/test/test_mpi.jl b/test/test_mpi.jl
new file mode 100644
index 0000000..cb02fcc
--- /dev/null
+++ b/test/test_mpi.jl
@@ -0,0 +1,58 @@
+using MPI
+using Test
+import TensorCrossInterpolation as TCI
+
+@testset "MPI" begin
+    @testset "MPI init" begin
+        @test !MPI.Initialized()
+        TCI.initializempi(false)
+        @test MPI.Initialized()
+    end
+
+    @testset "MPI communicator" begin
+        comm = MPI.COMM_WORLD
+        nprocs = MPI.Comm_size(comm)
+        mpirank = MPI.Comm_rank(comm)
+        @test nprocs ≥ 1
+        @test 0 ≤ mpirank < nprocs
+    end
+
+    @testset "synchronize_tt!" begin
+        comm = MPI.COMM_WORLD
+        nprocs = MPI.Comm_size(comm)
+        mpirank = MPI.Comm_rank(comm)
+        juliarank = mpirank + 1
+        
+        tolerance = 1e-8
+        maxbonddim = 50
+
+        function f0(vec)
+            cos(prod(vec) * pi / 2^6)
+        end
+        function f(vec)
+            cos(prod(vec.^juliarank) * pi / 2^6)
+        end
+
+        tt0, _, _ = TCI.crossinterpolate2(Float64, f0, fill(2, 6); tolerance, maxbonddim)
+        tt, _, _ = TCI.crossinterpolate2(Float64, f, fill(2, 6); tolerance, maxbonddim)
+
+        maxerr = 0.
+        for x1 in 1:2, x2 in 1:2, x3 in 1:2, x4 in 1:2, x5 in 1:2, x6 in 1:2
+            maxerr = max(maxerr, abs(TCI.evaluate(tt0, [x1,x2,x3,x4,x5,x6]) - TCI.evaluate(tt, [x1,x2,x3,x4,x5,x6])))
+        end
+        if mpirank == 0
+            @test maxerr < 1e-8
+        else
+            @test maxerr > 1e-8
+        end
+
+        juliasource = 1
+        TCI.synchronize_tt!(tt; juliasource=juliasource)
+
+        maxerr = 0.
+        for x1 in 1:2, x2 in 1:2, x3 in 1:2, x4 in 1:2, x5 in 1:2, x6 in 1:2
+            maxerr = max(maxerr, abs(TCI.evaluate(tt0, [x1,x2,x3,x4,x5,x6]) - TCI.evaluate(tt, [x1,x2,x3,x4,x5,x6])))
+        end
+        @test maxerr < 1e-8
+    end
+end
\ No newline at end of file
diff --git a/test/test_tensorci2.jl b/test/test_tensorci2.jl
index 7c9ec81..2017b5f 100644
--- a/test/test_tensorci2.jl
+++ b/test/test_tensorci2.jl
@@ -5,6 +5,10 @@ import Random
 import Random: AbstractRNG
 import QuanticsGrids as QD
 
+TCI.initializempi(false)
+
+# TODO write test for parallel subfunctions
+
 @testset "TensorCI2" begin
     @testset "kronecker util function" begin
         multiset = [collect(1:5) for _ in 1:5]
@@ -24,7 +28,7 @@ import QuanticsGrids as QD
         end
     end
 
-    @testset "pivoterrors" begin
+    @testset "pivoterrors" for sweepstrategy in [:backandforth, :parallel]
         diags = [1.0, 1e-5, 0.0]
         f(x) = (x[1] == x[2] ? diags[x[1]] : 0.0)
         localdims = [3, 3]
@@ -34,11 +38,12 @@ import QuanticsGrids as QD
             localdims,
             [[1, 1]];
             tolerance=1e-8,
+            sweepstrategy=:sweepstrategy
         )
         @test tci.pivoterrors == diags
     end
 
-    @testset "checkbatchevaluatable" begin
+    @testset "checkbatchevaluatable" for sweepstrategy in [:backandforth, :parallel]
         f(x) = 1.0 # Constant function without batch evaluation
         L = 10
         localdims = fill(2, L)
@@ -48,11 +53,12 @@ import QuanticsGrids as QD
             f,
             localdims,
             firstpivots;
-            checkbatchevaluatable=true
+            checkbatchevaluatable=true,
+            sweepstrategy=:sweepstrategy
         )
     end
 
-    @testset "trivial MPS(exp): pivotsearch=$pivotsearch" for pivotsearch in [:full, :rook], strictlynested in [false, true], nsearchglobalpivot in [0, 10]
+    @testset "trivial MPS(exp): pivotsearch=$pivotsearch" for pivotsearch in [:full, :rook], strictlynested in [false, true], nsearchglobalpivot in [0, 10], sweepstrategy in [:backandforth, :parallel]
         if nsearchglobalpivot > 0 && strictlynested
             continue
         end
@@ -82,7 +88,8 @@ import QuanticsGrids as QD
             normalizeerror=false,
             nsearchglobalpivot=nsearchglobalpivot,
             pivotsearch=pivotsearch,
-            strictlynested=strictlynested
+            strictlynested=strictlynested,
+            sweepstrategy=:sweepstrategy
         )
 
         @test all(TCI.linkdims(tci) .== 1)
@@ -195,7 +202,8 @@ import QuanticsGrids as QD
             normalizeerror=false,
             nsearchglobalpivot=nsearchglobalpivot,
             pivotsearch=pivotsearch,
-            strictlynested=strictlynested
+            strictlynested=strictlynested,
+            sweepstrategy=:sweepstrategy
         )
 
         @test all(TCI.linkdims(tci) .== 1)
@@ -244,7 +252,9 @@ import QuanticsGrids as QD
         @test_logs (:warn, "The option `pivottolerance` of `optimize!(tci::TensorCI2, f)` is deprecated. Please update your code to use `tolerance`, as `pivottolerance` will be removed in the future.") optimize!(tci, f; pivottolerance = 0.1)
     end
 
-    @testset "Lorentz MPS with ValueType=$(typeof(coeff)), pivotsearch=$pivotsearch" for coeff in [1.0, 0.5 - 1.0im], pivotsearch in [:full, :rook]
+    # This is a stochastic test
+    @testset "Lorentz MPS with ValueType=$(typeof(coeff)), pivotsearch=$pivotsearch" for seed in collect(1:20), coeff in [1.0, 0.5 - 1.0im], pivotsearch in [:full, :rook], sweepstrategy in [:backandforth, :parallel]
+        Random.seed!(seed)
         n = 5
         f(v) = coeff ./ (sum(v .^ 2) + 1)
 
@@ -285,7 +295,7 @@ import QuanticsGrids as QD
             [ones(Int, n)];
             tolerance=1e-8,
             maxiter=8,
-            sweepstrategy=:forward,
+            sweepstrategy=:sweepstrategy,
             pivotsearch=pivotsearch
         )
 
@@ -301,7 +311,8 @@ import QuanticsGrids as QD
             [ones(Int, n)];
             tolerance=1e-12,
             maxiter=200,
-            pivotsearch
+            pivotsearch,
+            sweepstrategy=:sweepstrategy
         )
 
         @test pivoterror(tci3) <= 2e-12
@@ -323,7 +334,8 @@ import QuanticsGrids as QD
             initialpivots;
             tolerance=1e-12,
             maxiter=200,
-            pivotsearch
+            pivotsearch,
+            sweepstrategy=:sweepstrategy
         )
 
         @test pivoterror(tci4) <= 2e-12
@@ -340,7 +352,7 @@ import QuanticsGrids as QD
     end
 
 
-    @testset "insert_global_pivots: pivotsearch=$pivotsearch, strictlynested=$strictlynested, seed=$seed" for seed in collect(1:20), pivotsearch in [:full, :rook], strictlynested in [false]
+    @testset "insert_global_pivots: pivotsearch=$pivotsearch, strictlynested=$strictlynested, seed=$seed" for seed in collect(1:20), pivotsearch in [:full, :rook], strictlynested in [false], sweepstrategy in [:backandforth, :parallel]
         Random.seed!(seed)
 
         R = 20
@@ -375,7 +387,8 @@ import QuanticsGrids as QD
             verbosity=0,
             normalizeerror=false,
             pivotsearch=pivotsearch,
-            strictlynested=strictlynested
+            strictlynested=strictlynested,
+            sweepstrategy=:sweepstrategy
         )
 
         TCI.addglobalpivots2sitesweep!(
@@ -392,7 +405,7 @@ import QuanticsGrids as QD
         @test sum(abs.([TCI.evaluate(tci, r) - f(r) for r in rindex]) .> abstol) == 0
     end
 
-    @testset "insert_global_pivots" begin
+    @testset "insert_global_pivots" for sweepstrategy in [:backandforth, :parallel]
         Random.seed!(1234)
 
         R = 20
@@ -413,7 +426,8 @@ import QuanticsGrids as QD
             loginterval=1,
             verbosity=0,
             normalizeerror=false,
-            strictlynested=false
+            strictlynested=false,
+            sweepstrategy=:sweepstrategy
         )
 
         r = fill(2, R)
@@ -430,7 +444,7 @@ import QuanticsGrids as QD
         @test TCI.evaluate(tci, r) ≈ f(r)
     end
 
-    @testset "globalsearch" begin
+    @testset "globalsearch" for sweepstrategy in [:backandforth, :parallel]
         Random.seed!(1234)
 
         n = 10
@@ -451,7 +465,8 @@ import QuanticsGrids as QD
             maxbonddim=100,
             maxiter=100,
             nsearchglobalpivot=10,
-            strictlynested=false
+            strictlynested=false,
+            sweepstrategy=:sweepstrategy
         )
 
         @test errors[end] < 1e-10
@@ -474,7 +489,7 @@ import QuanticsGrids as QD
         @test tci2.Jset == tci.Jset
     end
 
-    @testset "crossinterpolate2_ttcache" begin
+    @testset "crossinterpolate2_ttcache" for sweepstrategy in [:backandforth, :parallel]
         ValueType = Float64
 
         N = 4
@@ -490,7 +505,8 @@ import QuanticsGrids as QD
             ttc,
             localdims;
             tolerance=1e-10,
-            maxbonddim=10
+            maxbonddim=10,
+            sweepstrategy=:sweepstrategy
         )
 
         tt_reconst = TCI.TensorTrain(tci2)
@@ -501,6 +517,38 @@ import QuanticsGrids as QD
         @test vals_reconst ≈ vals_ref
     end
 
+    @testset "multithreadPi" begin
+        f = (x -> sum(x))
+
+        localdims = [2,2,2,2,2,2]
+
+        Icombined = [[1,1,1,1], [1,2,1,1], [1,1,2,1]]
+        Jcombined = [[2,2], [1,2], [2,1], [1,1]]
+
+        mPi = TCI.multithreadPi(Float64, f, localdims, Icombined, Jcombined)
+
+        Pi = reshape(
+            TCI.filltensor(Float64, f, localdims,
+            Icombined, Jcombined, Val(0)),
+            length(Icombined), length(Jcombined)
+        )
+
+        @test size(Pi) == (3, 4)
+
+        @test mPi == Pi
+    end
+
+    @testset "noderanges" begin
+        @test TCI._noderanges(8, 24, 2, 100) == [1:8, 9:9, 10:10, 11:11, 12:12, 13:13, 14:14, 15:24]
+        @test TCI._noderanges(8, 24, 2, 500) == [1:9, 10:10, 11:11, 12:12, 13:13, 14:14, 15:15, 16:24]
+        @test TCI._noderanges(8, 24, 2, 100; interbond=false) == [1:24 for _ in 1:8]
+        @test TCI._noderanges(8, 24, 2, 500; interbond=false) == [1:24 for _ in 1:8]
+        @test TCI._noderanges(8, 24, 2, 100) == TCI._noderanges(8, 24, 2, 100; estimatedbonds=[2,4,8,16,32,64,100,100,100,100,100,100,100,100,100,100,100,64,32,16,8,4,2])
+
+        ranges, eff = TCI._noderanges(4, 8, 8, 64, efficiencycheck=true)
+        @test eff > 3.9
+    end
+
     @testset "convergencecriterion" begin
         let
             ranks = [1, 2]
@@ -553,3 +601,4 @@ import QuanticsGrids as QD
         end
     end
 end
+
diff --git a/test/test_tensortrain.jl b/test/test_tensortrain.jl
index e167dc8..c039ed5 100644
--- a/test/test_tensortrain.jl
+++ b/test/test_tensortrain.jl
@@ -253,3 +253,7 @@ end
     @test TCI.fulltensor(tt1) ≈ TCI.fulltensor(tt3)
 end
 
+@testset "factorize" begin
+
+end
+