# Demo script: one `mul!` task per worker, pinned to that worker, launched
# inside a Dagger datadeps region.

# Add two worker processes for the tasks below to run on.
using Distributed
addprocs(2)
# NOTE(review): this global shadows the `nworkers` function exported by
# Distributed and is not read anywhere in the visible script; call
# `Distributed.nworkers()` if the function is needed later.
nworkers = length(workers())
@everywhere using Dagger
@everywhere using LinearAlgebra

# A is 16×4, partitioned column-wise into 16×2 blocks (two chunks), so that
# chunk `i` of A is conformable with chunk `i` of B (length 2) and with a
# 16-element output vector. An 8×4 blocking would make every `mul!` below
# non-conformable (8×4 matrix times length-2 vector into a length-16 vector).
A = rand(Blocks(4*4, 2), 4*4, 4)
# B has length 4, partitioned into two length-2 chunks.
B = rand(Blocks(2), 4)
# Shard built from `rand(4*4)`, i.e. 16-element vectors.
# NOTE(review): presumably one independent vector per worker — confirm against
# the Dagger shard documentation.
C = Dagger.@shard rand(4*4)

Dagger.spawn_datadeps() do
    # Chunks are paired with workers by position, so this assumes A and B each
    # have at least as many chunks as there are workers (two here).
    for (idx, w) in enumerate(workers())
        # `Out(C)` marks C as written by the task; the scope pins it to worker `w`.
        Dagger.@spawn scope=Dagger.scope(worker=w) mul!(Out(C), A.chunks[idx], B.chunks[idx])
    end
    # further operations that use C
end