From dc2dd3bc0b05530dc38618634b6bbb3ffb4d5d7c Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Mon, 2 Dec 2024 10:56:09 +0100
Subject: [PATCH 1/7] add WIP for MPI example

---
 examples/Project.toml |   7 ++-
 examples/bratu_mpi.jl | 111 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 116 insertions(+), 2 deletions(-)
 create mode 100644 examples/bratu_mpi.jl

diff --git a/examples/Project.toml b/examples/Project.toml
index 3671d5c..6272520 100644
--- a/examples/Project.toml
+++ b/examples/Project.toml
@@ -1,10 +1,13 @@
 [deps]
 CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
+EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
 Krylov = "ba0b0d4f-ebba-5204-a429-3ac8c609bfb7"
 KrylovPreconditioners = "45d422c2-293f-44ce-8315-2cb988662dec"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
 NewtonKrylov = "0be81120-40bf-4f8b-adf0-26103efb66f1"
+OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 
-[sources]
-NewtonKrylov = {path = ".."}
+[sources.NewtonKrylov]
+path = ".."
diff --git a/examples/bratu_mpi.jl b/examples/bratu_mpi.jl
new file mode 100644
index 0000000..01fcd17
--- /dev/null
+++ b/examples/bratu_mpi.jl
@@ -0,0 +1,111 @@
+# # 1D bratu equation from (Kan2022-ko)[@cite]
+
+# ## Necessary packages
+using NewtonKrylov, Krylov
+using KrylovPreconditioners
+using SparseArrays, LinearAlgebra
+using OffsetArrays
+using MPI
+
+using EnzymeCore
+import EnzymeCore.EnzymeRules: forward
+using EnzymeCore.EnzymeRules
+
+function forward(::FwdConfig, func::Const{MPI.Request}, ::Type{<:Duplicated})
+    # Enzyme: unhandled forward for jl_f_finalizer
+    return Duplicated(MPI.Request(), MPI.Request())
+end
+
+MPI.Init()
+
+nranks = MPI.Comm_size(MPI.COMM_WORLD)
+myid   = MPI.Comm_rank(MPI.COMM_WORLD)
+
+# We are going to split a domain of size N
+# into equal chunks.
+
+function localdomain(N, nranks, myrank)
+    n = cld(N, nranks)
+    first = (n * myrank) + 1
+    last = min(n * (myrank + 1), N)
+    first:last 
+end
+
+# ## Choice of parameters
+const N = 10_000
+const λ = 3.51382
+const dx = 1 / (N + 1) # Grid-spacing
+
+# ### Domain and Inital condition
+LI = localdomain(N, 1, 0) # TODO
+
+x = LinRange(0.0, 1.0, N+2) # Global
+u₀ = sin.(x .* π)
+
+# ## 1D Bratu equations
+# $y′′ + λ * exp(y) = 0$
+
+
+
+function bratu!(_res, _y, Δx, λ, N)
+    res = OffsetArray(_res, 0:(N+1))
+    y  = OffsetArray(_y, 0:(N+1))
+
+    # Set boundary conditions
+    nranks = MPI.Comm_size(MPI.COMM_WORLD)
+    myid   = MPI.Comm_rank(MPI.COMM_WORLD)
+   
+    ## BVP boundary condition
+    if myid == 0
+        y[0] = 0
+    end
+    if myid == (nranks - 1)
+        y[N+1] = 0
+    end
+
+    ## domain splitting
+    # reqs = MPI.MultiRequest(4)  Enzyme strugles with this
+    reqs = [MPI.Request() for _ in 1:4]
+    if myid != (nranks - 1)
+        # Send & data to the right
+        MPI.Isend(view(y, N), MPI.COMM_WORLD, reqs[1]; dest = myid + 1)
+        MPI.Irecv!(view(y, N+1), MPI.COMM_WORLD, reqs[2]; src = myid + 1)
+    end
+    if myid != 0
+        # Send data to the left
+        MPI.Isend(view(y, 1), MPI.COMM_WORLD, reqs[3]; dest = myid - 1)
+        MPI.Irecv!(view(y, 0), MPI.COMM_WORLD, reqs[4]; src = myid - 1)
+    end
+    MPI.Waitall(reqs)
+
+    ## Calculate residual
+    for i in 1:N
+        y_l = y[i - 1]
+        y_r = y[i + 1]
+        y′′ = (y_r - 2y[i] + y_l) / Δx^2
+
+        res[i] = y′′ + λ * exp(y[i]) # = 0
+    end
+    return nothing
+end
+
+function bratu(y, dx, λ, N)
+    res = similar(y)
+    bratu!(res, y, dx, λ, N)
+    return res
+end
+
+# ## Reference solution
+function true_sol_bratu(x)
+    ## for λ = 3.51382, 2nd sol θ = 4.8057
+    θ = 4.79173
+    return -2 * log(cosh(θ * (x - 0.5) / 2) / (cosh(θ / 4)))
+end
+
+# ## Solving using inplace variant and CG
+uₖ, _ = newton_krylov!(
+    (res, u) -> bratu!(res, u, dx, λ, N),
+    copy(u₀), similar(u₀);
+    Solver = CgSolver,
+)
+

From fae43ebe72731d6cdcb524342331aede3f6654dd Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Mon, 2 Dec 2024 14:35:50 +0100
Subject: [PATCH 2/7] Update BC outside krylov

---
 examples/bratu_mpi.jl | 28 ++++++++++++----------------
 src/NewtonKrylov.jl   |  5 +++++
 2 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/examples/bratu_mpi.jl b/examples/bratu_mpi.jl
index 01fcd17..765fe53 100644
--- a/examples/bratu_mpi.jl
+++ b/examples/bratu_mpi.jl
@@ -7,15 +7,6 @@ using SparseArrays, LinearAlgebra
 using OffsetArrays
 using MPI
 
-using EnzymeCore
-import EnzymeCore.EnzymeRules: forward
-using EnzymeCore.EnzymeRules
-
-function forward(::FwdConfig, func::Const{MPI.Request}, ::Type{<:Duplicated})
-    # Enzyme: unhandled forward for jl_f_finalizer
-    return Duplicated(MPI.Request(), MPI.Request())
-end
-
 MPI.Init()
 
 nranks = MPI.Comm_size(MPI.COMM_WORLD)
@@ -45,16 +36,13 @@ u₀ = sin.(x .* π)
 # ## 1D Bratu equations
 # $y′′ + λ * exp(y) = 0$
 
-
-
-function bratu!(_res, _y, Δx, λ, N)
-    res = OffsetArray(_res, 0:(N+1))
+function update!(_y, N)
     y  = OffsetArray(_y, 0:(N+1))
 
     # Set boundary conditions
     nranks = MPI.Comm_size(MPI.COMM_WORLD)
     myid   = MPI.Comm_rank(MPI.COMM_WORLD)
-   
+    
     ## BVP boundary condition
     if myid == 0
         y[0] = 0
@@ -69,14 +57,20 @@ function bratu!(_res, _y, Δx, λ, N)
     if myid != (nranks - 1)
         # Send & data to the right
         MPI.Isend(view(y, N), MPI.COMM_WORLD, reqs[1]; dest = myid + 1)
-        MPI.Irecv!(view(y, N+1), MPI.COMM_WORLD, reqs[2]; src = myid + 1)
+        MPI.Irecv!(view(y, N+1), MPI.COMM_WORLD, reqs[2]; source = myid + 1)
     end
     if myid != 0
         # Send data to the left
         MPI.Isend(view(y, 1), MPI.COMM_WORLD, reqs[3]; dest = myid - 1)
-        MPI.Irecv!(view(y, 0), MPI.COMM_WORLD, reqs[4]; src = myid - 1)
+        MPI.Irecv!(view(y, 0), MPI.COMM_WORLD, reqs[4]; source = myid - 1)
     end
     MPI.Waitall(reqs)
+    return nothing
+end
+
+function bratu!(_res, _y, Δx, λ, N)
+    res = OffsetArray(_res, 0:(N+1))
+    y  = OffsetArray(_y, 0:(N+1))
 
     ## Calculate residual
     for i in 1:N
@@ -107,5 +101,7 @@ uₖ, _ = newton_krylov!(
     (res, u) -> bratu!(res, u, dx, λ, N),
     copy(u₀), similar(u₀);
     Solver = CgSolver,
+    update! = (u)->update!(u, N),
+    norm = u->sqrt(MPI.Allreduce(sum(abs2, u), +, MPI.COMM_WORLD))
 )
 
diff --git a/src/NewtonKrylov.jl b/src/NewtonKrylov.jl
index db993aa..66ecdba 100644
--- a/src/NewtonKrylov.jl
+++ b/src/NewtonKrylov.jl
@@ -167,8 +167,12 @@ function newton_krylov!(
         N = nothing,
         krylov_kwargs = (;),
         callback = (args...) -> nothing,
+        norm = norm,
+        update! = (u) -> nothing
     )
     t₀ = time_ns()
+    
+    update!(u) # Impose BC
     F!(res, u) # res = F(u)
     n_res = norm(res)
     callback(u, res, n_res)
@@ -213,6 +217,7 @@ function newton_krylov!(
         # Update residual and norm
         n_res_prior = n_res
 
+        update!(u) # Impose BC
         F!(res, u) # res = F(u)
         n_res = norm(res)
         callback(u, res, n_res)

From ec4d70e9ce24c2cd7d8364c28b931b1b6b21a6e0 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Mon, 2 Dec 2024 16:25:18 +0100
Subject: [PATCH 3/7] save not working model

---
 examples/bratu_mpi.jl | 64 +++++++++++++++++++++++++++---------
 examples/mpiexecjl    | 76 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 124 insertions(+), 16 deletions(-)
 create mode 100755 examples/mpiexecjl

diff --git a/examples/bratu_mpi.jl b/examples/bratu_mpi.jl
index 765fe53..a635f51 100644
--- a/examples/bratu_mpi.jl
+++ b/examples/bratu_mpi.jl
@@ -1,5 +1,7 @@
 # # 1D bratu equation from (Kan2022-ko)[@cite]
 
+# # Run with `JULIAUP_CHANNEL="1.10" ./mpiexecjl --project=. -np 2 julia bratu_mpi.jl`
+
 # ## Necessary packages
 using NewtonKrylov, Krylov
 using KrylovPreconditioners
@@ -9,8 +11,8 @@ using MPI
 
 MPI.Init()
 
-nranks = MPI.Comm_size(MPI.COMM_WORLD)
-myid   = MPI.Comm_rank(MPI.COMM_WORLD)
+const nranks = MPI.Comm_size(MPI.COMM_WORLD)
+const myid = MPI.Comm_rank(MPI.COMM_WORLD)
 
 # We are going to split a domain of size N
 # into equal chunks.
@@ -19,7 +21,7 @@ function localdomain(N, nranks, myrank)
     n = cld(N, nranks)
     first = (n * myrank) + 1
     last = min(n * (myrank + 1), N)
-    first:last 
+    return first:last
 end
 
 # ## Choice of parameters
@@ -28,27 +30,33 @@ const λ = 3.51382
 const dx = 1 / (N + 1) # Grid-spacing
 
 # ### Domain and Inital condition
-LI = localdomain(N, 1, 0) # TODO
+LI = localdomain(N, nranks, myid)
+const l_N = length(LI)
+@show l_N
+u₀ = Vector{Float64}(undef, l_N + 2)
+
+X = LinRange(0.0, 1.0, N) # Global
+x = view(X, LI)
 
-x = LinRange(0.0, 1.0, N+2) # Global
-u₀ = sin.(x .* π)
+u₀[2:(l_N + 1)] = sin.(x .* π)
+U₀ = MPI.Gather(view(u₀, 2:(l_N + 1)), MPI.COMM_WORLD)
 
 # ## 1D Bratu equations
 # $y′′ + λ * exp(y) = 0$
 
 function update!(_y, N)
-    y  = OffsetArray(_y, 0:(N+1))
+    y = OffsetArray(_y, 0:(N + 1))
 
     # Set boundary conditions
     nranks = MPI.Comm_size(MPI.COMM_WORLD)
-    myid   = MPI.Comm_rank(MPI.COMM_WORLD)
-    
+    myid = MPI.Comm_rank(MPI.COMM_WORLD)
+
     ## BVP boundary condition
     if myid == 0
         y[0] = 0
     end
     if myid == (nranks - 1)
-        y[N+1] = 0
+        y[N + 1] = 0
     end
 
     ## domain splitting
@@ -57,7 +65,7 @@ function update!(_y, N)
     if myid != (nranks - 1)
         # Send & data to the right
         MPI.Isend(view(y, N), MPI.COMM_WORLD, reqs[1]; dest = myid + 1)
-        MPI.Irecv!(view(y, N+1), MPI.COMM_WORLD, reqs[2]; source = myid + 1)
+        MPI.Irecv!(view(y, N + 1), MPI.COMM_WORLD, reqs[2]; source = myid + 1)
     end
     if myid != 0
         # Send data to the left
@@ -69,8 +77,8 @@ function update!(_y, N)
 end
 
 function bratu!(_res, _y, Δx, λ, N)
-    res = OffsetArray(_res, 0:(N+1))
-    y  = OffsetArray(_y, 0:(N+1))
+    res = OffsetArray(_res, 0:(N + 1))
+    y = OffsetArray(_y, 0:(N + 1))
 
     ## Calculate residual
     for i in 1:N
@@ -96,12 +104,36 @@ function true_sol_bratu(x)
     return -2 * log(cosh(θ * (x - 0.5) / 2) / (cosh(θ / 4)))
 end
 
+u₀ = OffsetArray(u₀, 0:(l_N + 1))
+update!(u₀, l_N)
+
 # ## Solving using inplace variant and CG
 uₖ, _ = newton_krylov!(
-    (res, u) -> bratu!(res, u, dx, λ, N),
+    (res, u) -> bratu!(res, u, dx, λ, l_N),
     copy(u₀), similar(u₀);
     Solver = CgSolver,
-    update! = (u)->update!(u, N),
-    norm = u->sqrt(MPI.Allreduce(sum(abs2, u), +, MPI.COMM_WORLD))
+    update! = (u) -> update!(u, l_N),
+    norm = u -> sqrt(MPI.Allreduce(sum(abs2, u), +, MPI.COMM_WORLD))
 )
 
+Uₖ = MPI.Gather(view(uₖ, 2:(l_N + 1)), MPI.COMM_WORLD)
+if myid == 0
+    using CairoMakie
+
+    reference = true_sol_bratu.(X)
+    ϵ = abs2.(Uₖ .- reference)
+
+    fig = Figure(size = (800, 800))
+    ax = Axis(fig[1, 1], title = "", ylabel = "", xlabel = "")
+
+    lines!(ax, X, reference, label = "True solution")
+    lines!(ax, X, U₀, label = "Initial guess")
+    lines!(ax, X, Uₖ, label = "Newton-Krylov solution")
+
+    axislegend(ax, position = :cb)
+
+    ax = Axis(fig[1, 2], title = "Error", ylabel = "abs2 err", xlabel = "")
+    lines!(ax, ϵ)
+
+    save("bratu_mpi.png", fig)
+end
diff --git a/examples/mpiexecjl b/examples/mpiexecjl
new file mode 100755
index 0000000..137c9b4
--- /dev/null
+++ b/examples/mpiexecjl
@@ -0,0 +1,76 @@
+#!/bin/sh
+#
+# Copyright (C) 2020 Simon Byrne, Mosè Giordano
+# License is MIT "Expat"
+#
+### Commentary:
+#
+# Command line utility to call the `mpiexec` binary used by the `MPI.jl` version
+# in the given Julia project.  It has the same syntax as the `mpiexec` binary
+# that would be called, with the additional `--project=...` flag to select a
+# different Julia project.
+#
+# Examples of usage (the MPI flags available depend on the MPI implementation
+# called):
+#
+#   $ mpiexecjl --version
+#   $ mpiexecjl -n 40 julia mpi-script.jl
+#   $ mpiexecjl --project=my_experiment -n 80 --oversubscribe julia mpi-script.jl
+#
+### Code:
+
+usage () {
+    echo "Usage: ${0} [--project=...] MPIEXEC_ARGUMENTS..."
+    echo "Call the mpiexec binary in the Julia environment specified by the --project option."
+    echo "If no project is specified, the MPI associated with the global Julia environment will be used."
+    echo "All other arguments are forwarded to mpiexec."
+}
+
+for arg; do
+    shift
+    case "${arg}" in
+        --project | --project=*)
+            PROJECT_ARG="${arg}"
+            ;;
+        -h | --help)
+            usage
+            echo "Below is the help of the current mpiexec."
+            echo
+            set -- "${@}" "${arg}"
+            ;;
+        *)
+            set -- "${@}" "${arg}"
+            ;;
+    esac
+done
+
+if [ -z "${*}" ]; then
+    echo "ERROR: no arguments specified." 1>&2
+    echo
+    usage
+    exit 1
+fi
+
+if [ -n "${JULIA_BINDIR}" ]; then
+    JULIA_CMD="${JULIA_BINDIR}/julia"
+else
+    JULIA_CMD="julia"
+fi
+
+# shellcheck disable=SC2016
+SCRIPT='
+using MPI
+ENV["JULIA_PROJECT"] = dirname(Base.active_project())
+proc = run(pipeline(`$(mpiexec()) $(ARGS)`; stdout, stderr); wait=false)
+wait(proc)
+if !iszero(proc.exitcode)
+   @error "The MPI process failed" proc
+end
+exit(proc.exitcode)
+'
+
+if [ -n "${PROJECT_ARG}" ]; then
+    "${JULIA_CMD}" "${PROJECT_ARG}" --color=yes --startup-file=no -q --compile=min -O0 -e "${SCRIPT}" -- "${@}"
+else
+    "${JULIA_CMD}" --color=yes --startup-file=no -q --compile=min -O0 -e "${SCRIPT}" -- "${@}"
+fi

From af4767a3f3219a6c7e9bd115e626a86d0f1836bf Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Tue, 3 Dec 2024 10:46:18 +0100
Subject: [PATCH 4/7] use a custom array

---
 examples/bratu_mpi.jl | 112 ++++++++++++++++++++++++++++++++----------
 src/NewtonKrylov.jl   |   6 ++-
 2 files changed, 91 insertions(+), 27 deletions(-)

diff --git a/examples/bratu_mpi.jl b/examples/bratu_mpi.jl
index a635f51..4d69c8d 100644
--- a/examples/bratu_mpi.jl
+++ b/examples/bratu_mpi.jl
@@ -9,10 +9,48 @@ using SparseArrays, LinearAlgebra
 using OffsetArrays
 using MPI
 
-MPI.Init()
+struct LocalData{FC, D} <: AbstractVector{FC}
+    data::D
 
-const nranks = MPI.Comm_size(MPI.COMM_WORLD)
-const myid = MPI.Comm_rank(MPI.COMM_WORLD)
+    function LocalData(data::D) where {D}
+        FC = eltype(data)
+        return new{FC, D}(data)
+    end
+end
+
+function LocalData{FC, D}(::UndefInitializer, l::Int64) where {FC, D}
+    return LocalData(D(undef, l))
+end
+
+Base.length(v::LocalData) = return length(v.data)
+
+Base.size(v::LocalData) = return size(v.data)
+
+Base.@propagate_inbounds function Base.getindex(v::LocalData, idx)
+    return getindex(v.data, idx)
+end
+
+Base.@propagate_inbounds function Base.setindex!(v::LocalData, x, idx)
+    return setindex!(v.data, x, idx)
+end
+
+Base.similar(v::LocalData) = LocalData(similar(v.data))
+Base.copy(v::LocalData) = LocalData(copy(v.data))
+function Base.cconvert(::Type{MPI.MPIPtr}, buf::LocalData)
+    return buf.data
+end
+
+using Krylov
+import Krylov.FloatOrComplex
+
+function Krylov.kdot(n::Integer, x::LocalData{T}, y::LocalData{T}) where {T <: FloatOrComplex}
+    return MPI.Allreduce(dot(x.data, y.data), +, MPI.COMM_WORLD)
+end
+
+function Krylov.knorm(n::Integer, x::LocalData{T}) where {T <: FloatOrComplex}
+    # TODO: We need to not double count the boundary
+    return sqrt(MPI.Allreduce(sum(abs2, x.data), +, MPI.COMM_WORLD))
+end
 
 # We are going to split a domain of size N
 # into equal chunks.
@@ -24,28 +62,12 @@ function localdomain(N, nranks, myrank)
     return first:last
 end
 
-# ## Choice of parameters
-const N = 10_000
-const λ = 3.51382
-const dx = 1 / (N + 1) # Grid-spacing
-
-# ### Domain and Inital condition
-LI = localdomain(N, nranks, myid)
-const l_N = length(LI)
-@show l_N
-u₀ = Vector{Float64}(undef, l_N + 2)
-
-X = LinRange(0.0, 1.0, N) # Global
-x = view(X, LI)
-
-u₀[2:(l_N + 1)] = sin.(x .* π)
-U₀ = MPI.Gather(view(u₀, 2:(l_N + 1)), MPI.COMM_WORLD)
 
 # ## 1D Bratu equations
 # $y′′ + λ * exp(y) = 0$
 
-function update!(_y, N)
-    y = OffsetArray(_y, 0:(N + 1))
+function update!(_y::LocalData, N)
+    y = OffsetArray(_y.data, 0:(N + 1))
 
     # Set boundary conditions
     nranks = MPI.Comm_size(MPI.COMM_WORLD)
@@ -80,6 +102,9 @@ function bratu!(_res, _y, Δx, λ, N)
     res = OffsetArray(_res, 0:(N + 1))
     y = OffsetArray(_y, 0:(N + 1))
 
+    # res is N+2 and knorm doesn't know that
+    res[N + 1] = 0
+    res[0] = 0
     ## Calculate residual
     for i in 1:N
         y_l = y[i - 1]
@@ -104,19 +129,56 @@ function true_sol_bratu(x)
     return -2 * log(cosh(θ * (x - 0.5) / 2) / (cosh(θ / 4)))
 end
 
-u₀ = OffsetArray(u₀, 0:(l_N + 1))
+# # Setup
+MPI.Init()
+
+const nranks = MPI.Comm_size(MPI.COMM_WORLD)
+const myid = MPI.Comm_rank(MPI.COMM_WORLD)
+
+# ## Choice of parameters
+const N = 10_000
+const λ = 3.51382
+const dx = 1 / (N + 1) # Grid-spacing
+
+# ### Domain and Inital condition
+LI = localdomain(N, nranks, myid)
+const l_N = length(LI)
+u₀ = Vector{Float64}(undef, l_N + 2)
+
+X = LinRange(0.0 + dx, 1.0 - dx, N) # Global
+x = view(X, LI)
+
+u₀[2:(l_N + 1)] = sin.(x .* π)
+
+u₀ = LocalData(u₀) #, 0:(l_N + 1))
 update!(u₀, l_N)
 
+U₀ = MPI.Gather(view(u₀, 2:(l_N + 1)), MPI.COMM_WORLD)
+
+@show Krylov.knorm(length(u₀), u₀)
+if myid == 0
+    @show Krylov.knorm(length(U₀), U₀)
+end
+@show typeof(similar(u₀))
+@show typeof(copy(u₀))
+
 # ## Solving using inplace variant and CG
 uₖ, _ = newton_krylov!(
     (res, u) -> bratu!(res, u, dx, λ, l_N),
     copy(u₀), similar(u₀);
     Solver = CgSolver,
     update! = (u) -> update!(u, l_N),
-    norm = u -> sqrt(MPI.Allreduce(sum(abs2, u), +, MPI.COMM_WORLD))
+    norm = u -> Krylov.knorm(length(u), u),
+    verbose = myid == 0 ? 1 : 0
 )
 
 Uₖ = MPI.Gather(view(uₖ, 2:(l_N + 1)), MPI.COMM_WORLD)
+
+@show Krylov.knorm(length(uₖ), uₖ)
+if myid == 0
+    @show Krylov.knorm(length(Uₖ), Uₖ)
+end
+
 if myid == 0
     using CairoMakie
 
@@ -133,7 +195,7 @@ if myid == 0
     axislegend(ax, position = :cb)
 
     ax = Axis(fig[1, 2], title = "Error", ylabel = "abs2 err", xlabel = "")
-    lines!(ax, ϵ)
+    lines!(ax, X, ϵ)
 
-    save("bratu_mpi.png", fig)
+    save("bratu_mpi_n$(nranks).png", fig)
 end
diff --git a/src/NewtonKrylov.jl b/src/NewtonKrylov.jl
index 66ecdba..76f7ea7 100644
--- a/src/NewtonKrylov.jl
+++ b/src/NewtonKrylov.jl
@@ -171,7 +171,7 @@ function newton_krylov!(
         update! = (u) -> nothing
     )
     t₀ = time_ns()
-    
+
     update!(u) # Impose BC
     F!(res, u) # res = F(u)
     n_res = norm(res)
@@ -206,7 +206,9 @@ function newton_krylov!(
         # Solve: Jx = -res
         # res is modifyed by J, so we create a copy `-res`
         # TODO: provide a temporary storage for `-res`
-        solve!(solver, J, -res; kwargs...)
+        b = copy(res)
+        b .*= - 1
+        solve!(solver, J, b; kwargs...)
 
         d = solver.x # Newton direction
         s = 1        # Newton step TODO: LineSearch

From de64ea898b70ce7418f261e8107300042c26369d Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Tue, 3 Dec 2024 15:24:29 +0100
Subject: [PATCH 5/7] WIP: MPI krylov

---
 examples/bratu_mpi.jl | 105 ++++++++++++++++++++++--------------------
 examples/simple.jl    |   5 ++
 src/NewtonKrylov.jl   |  15 +++---
 3 files changed, 68 insertions(+), 57 deletions(-)

diff --git a/examples/bratu_mpi.jl b/examples/bratu_mpi.jl
index 4d69c8d..5c9e02f 100644
--- a/examples/bratu_mpi.jl
+++ b/examples/bratu_mpi.jl
@@ -19,37 +19,43 @@ struct LocalData{FC, D} <: AbstractVector{FC}
 end
 
 function LocalData{FC, D}(::UndefInitializer, l::Int64) where {FC, D}
-    return LocalData(D(undef, l))
+    return LocalData(D(undef, l + 2))
 end
 
-Base.length(v::LocalData) = return length(v.data)
-
-Base.size(v::LocalData) = return size(v.data)
+Base.length(v::LocalData) = return length(v.data) - 2
+Base.size(v::LocalData) = return (length(v),)
 
 Base.@propagate_inbounds function Base.getindex(v::LocalData, idx)
-    return getindex(v.data, idx)
+    return getindex(v.data, idx + 1)
 end
 
 Base.@propagate_inbounds function Base.setindex!(v::LocalData, x, idx)
-    return setindex!(v.data, x, idx)
+    return setindex!(v.data, x, idx + 1)
 end
 
 Base.similar(v::LocalData) = LocalData(similar(v.data))
 Base.copy(v::LocalData) = LocalData(copy(v.data))
+Base.zero(v::LocalData) = LocalData(zero(v.data))
+
 function Base.cconvert(::Type{MPI.MPIPtr}, buf::LocalData)
-    return buf.data
+    return view(buf.data, 2:(length(buf) - 1))
+end
+function MPI.Buffer(v::LocalData)
+    return MPI.Buffer(v, Cint(length(v)), MPI.Datatype(eltype(v)))
 end
 
 using Krylov
 import Krylov.FloatOrComplex
 
 function Krylov.kdot(n::Integer, x::LocalData{T}, y::LocalData{T}) where {T <: FloatOrComplex}
-    return MPI.Allreduce(dot(x.data, y.data), +, MPI.COMM_WORLD)
+    return @views MPI.Allreduce(
+        dot(x.data[2:(length(x) + 1)], y.data[2:(length(y) + 1)]),
+        +, MPI.COMM_WORLD
+    )
 end
 
 function Krylov.knorm(n::Integer, x::LocalData{T}) where {T <: FloatOrComplex}
-    # TODO: We need to not double count the boundary
-    return sqrt(MPI.Allreduce(sum(abs2, x.data), +, MPI.COMM_WORLD))
+    return @views sqrt(MPI.Allreduce(sum(abs2, x.data[2:(length(x) + 1)]), +, MPI.COMM_WORLD))
 end
 
 # We are going to split a domain of size N
@@ -66,9 +72,8 @@ end
 # ## 1D Bratu equations
 # $y′′ + λ * exp(y) = 0$
 
-function update!(_y::LocalData, N)
-    y = OffsetArray(_y.data, 0:(N + 1))
-
+function update!(y::LocalData)
+    N = length(y)
     # Set boundary conditions
     nranks = MPI.Comm_size(MPI.COMM_WORLD)
     myid = MPI.Comm_rank(MPI.COMM_WORLD)
@@ -83,28 +88,38 @@ function update!(_y::LocalData, N)
 
     ## domain splitting
     # reqs = MPI.MultiRequest(4)  Enzyme strugles with this
-    reqs = [MPI.Request() for _ in 1:4]
+    # reqs = [MPI.Request() for _ in 1:4]
+    # if myid != (nranks - 1)
+    #     # Send & data to the right
+    #     MPI.Isend(view(y.data, N + 1), MPI.COMM_WORLD, reqs[1]; dest = myid + 1)
+    #     MPI.Irecv!(view(y.data, N + 2), MPI.COMM_WORLD, reqs[2]; source = myid + 1)
+    # end
+    # if myid != 0
+    #     # Send data to the left
+    #     MPI.Isend(view(y.data, 2), MPI.COMM_WORLD, reqs[3]; dest = myid - 1)
+    #     MPI.Irecv!(view(y.data, 1), MPI.COMM_WORLD, reqs[4]; source = myid - 1)
+    # end
+    # MPI.Waitall(reqs)
+
     if myid != (nranks - 1)
         # Send & data to the right
-        MPI.Isend(view(y, N), MPI.COMM_WORLD, reqs[1]; dest = myid + 1)
-        MPI.Irecv!(view(y, N + 1), MPI.COMM_WORLD, reqs[2]; source = myid + 1)
+        MPI.Send(view(y.data, N + 1), MPI.COMM_WORLD; dest = myid + 1)
+        MPI.Recv!(view(y.data, N + 2), MPI.COMM_WORLD; source = myid + 1)
     end
     if myid != 0
         # Send data to the left
-        MPI.Isend(view(y, 1), MPI.COMM_WORLD, reqs[3]; dest = myid - 1)
-        MPI.Irecv!(view(y, 0), MPI.COMM_WORLD, reqs[4]; source = myid - 1)
+        MPI.Recv!(view(y.data, 1), MPI.COMM_WORLD; source = myid - 1)
+        MPI.Send(view(y.data, 2), MPI.COMM_WORLD; dest = myid - 1)
     end
-    MPI.Waitall(reqs)
+
     return nothing
 end
 
-function bratu!(_res, _y, Δx, λ, N)
-    res = OffsetArray(_res, 0:(N + 1))
-    y = OffsetArray(_y, 0:(N + 1))
-
-    # res is N+2 and knorm doesn't know that
-    res[N + 1] = 0
-    res[0] = 0
+function bratu!(res::LocalData, y::LocalData, Δx, λ)
+    update!(y)
+    # Must be part of the equation since otherwise the krylov solver
+    # Will not update the boundary conditions for the inner iterations
+    N = length(y)
     ## Calculate residual
     for i in 1:N
         y_l = y[i - 1]
@@ -116,9 +131,9 @@ function bratu!(_res, _y, Δx, λ, N)
     return nothing
 end
 
-function bratu(y, dx, λ, N)
+function bratu(y, dx, λ)
     res = similar(y)
-    bratu!(res, y, dx, λ, N)
+    bratu!(res, y, dx, λ)
     return res
 end
 
@@ -148,36 +163,28 @@ u₀ = Vector{Float64}(undef, l_N + 2)
 X = LinRange(0.0 + dx, 1.0 - dx, N) # Global
 x = view(X, LI)
 
-u₀[2:(l_N + 1)] = sin.(x .* π)
+u₀ = LocalData(u₀)
+u₀ .= sin.(x .* π)
+update!(u₀)
 
-u₀ = LocalData(u₀) #, 0:(l_N + 1))
-update!(u₀, l_N)
+# JOp = NewtonKrylov.JacobianOperator((res, u) -> bratu!(res, u, dx, λ), similar(u₀), u₀)
+# J = collect(JOp)
+# display(J)
 
-U₀ = MPI.Gather(view(u₀, 2:(l_N + 1)), MPI.COMM_WORLD)
-
-@show Krylov.knorm(length(u₀), u₀)
-if myid == 0
-    @show Krylov.knorm(length(U₀), U₀)
-end
-@show typeof(similar(u₀))
-@show typeof(copy(u₀))
+U₀ = MPI.Gather(u₀, MPI.COMM_WORLD)
 
 # ## Solving using inplace variant and CG
-uₖ, _ = newton_krylov!(
-    (res, u) -> bratu!(res, u, dx, λ, l_N),
+uₖ, stats = newton_krylov!(
+    (res, u) -> bratu!(res, u, dx, λ),
     copy(u₀), similar(u₀);
     Solver = CgSolver,
-    update! = (u) -> update!(u, l_N),
-    norm = u -> Krylov.knorm(length(u), u),
+    # update! = (u) -> update!(u),
+    norm = v -> Krylov.knorm(length(v), v),
     verbose = myid == 0 ? 1 : 0
 )
+@show stats
 
-Uₖ = MPI.Gather(view(uₖ, 2:(l_N + 1)), MPI.COMM_WORLD)
-
-@show Krylov.knorm(length(uₖ), uₖ)
-if myid == 0
-    @show Krylov.knorm(length(Uₖ), Uₖ)
-end
+Uₖ = MPI.Gather(uₖ, MPI.COMM_WORLD)
 
 if myid == 0
     using CairoMakie
diff --git a/examples/simple.jl b/examples/simple.jl
index 5347cab..05d862f 100644
--- a/examples/simple.jl
+++ b/examples/simple.jl
@@ -48,3 +48,8 @@ end
 lines!(ax, trace_3)
 
 fig
+
+# ## Extract Jacobian
+
+JOp = NewtonKrylov.JacobianOperator(F!, zeros(2), [3.0, 4.0])
+J = collect(JOp)
diff --git a/src/NewtonKrylov.jl b/src/NewtonKrylov.jl
index 76f7ea7..ab9cfa2 100644
--- a/src/NewtonKrylov.jl
+++ b/src/NewtonKrylov.jl
@@ -67,12 +67,12 @@ end
 
 function Base.collect(JOp::JacobianOperator)
     N, M = size(JOp)
-    v = zeros(eltype(JOp), M)
-    out = zeros(eltype(JOp), N)
+    v = zero(JOp.u)
+    out = zero(JOp.res)
     J = SparseMatrixCSC{eltype(v), Int}(undef, size(JOp)...)
     for j in 1:M
-        out .= 0.0
-        v .= 0.0
+        out .= 0
+        v .= 0
         v[j] = 1.0
         mul!(out, JOp, v)
         for i in 1:N
@@ -187,6 +187,7 @@ function newton_krylov!(
 
     J = JacobianOperator(F!, res, u)
     solver = Solver(J, res)
+    b = similar(res)
 
     stats = Stats(0, 0)
     while n_res > tol && stats.outer_iterations <= max_niter
@@ -205,9 +206,7 @@ function newton_krylov!(
 
         # Solve: Jx = -res
         # res is modifyed by J, so we create a copy `-res`
-        # TODO: provide a temporary storage for `-res`
-        b = copy(res)
-        b .*= - 1
+        b .= .- res
         solve!(solver, J, b; kwargs...)
 
         d = solver.x # Newton direction
@@ -233,7 +232,7 @@ function newton_krylov!(
             η = forcing(η, tol, n_res, n_res_prior)
         end
 
-        verbose > 0 && @info "Newton" iter = n_res η stats
+        verbose > 0 && @info "Newton" resᵢ = n_res η stats
         stats = update(stats, solver.stats.niter)
     end
     t = (time_ns() - t₀) / 1.0e9

From c4ffbde54a78492df7e953c4f549e81b7100874b Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Tue, 17 Dec 2024 12:42:58 +0100
Subject: [PATCH 6/7] add FDJacobainOperator

---
 src/NewtonKrylov.jl | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/src/NewtonKrylov.jl b/src/NewtonKrylov.jl
index ab9cfa2..09aeab4 100644
--- a/src/NewtonKrylov.jl
+++ b/src/NewtonKrylov.jl
@@ -19,7 +19,9 @@ function maybe_duplicated(f, df)
     end
 end
 
-struct JacobianOperator{F, A}
+abstract type AbstractJacobianOperator end
+
+struct JacobianOperator{F, A} <: AbstractJacobianOperator
     f::F # F!(res, u)
     f_cache::F
     res::A
@@ -65,7 +67,7 @@ function mul!(out, J′::Union{Adjoint{<:Any, <:JacobianOperator}, Transpose{<:A
     return nothing
 end
 
-function Base.collect(JOp::JacobianOperator)
+function Base.collect(JOp::AbstractJacobianOperator)
     N, M = size(JOp)
     v = zero(JOp.u)
     out = zero(JOp.res)
@@ -84,6 +86,29 @@ function Base.collect(JOp::JacobianOperator)
     return J
 end
 
+struct FDJacobianOperator{F, A} <: AbstractJacobianOperator
+    f::F # F!(res, u)
+    res::A
+    u::A
+    function FDJacobianOperator(f::F, res, u) where {F}
+        return new{F, typeof(u)}(f, res, u)
+    end
+end
+
+Base.size(J::FDJacobianOperator) = (length(J.res), length(J.u))
+Base.eltype(J::FDJacobianOperator) = eltype(J.u)
+
+function mul!(out, J::FDJacobianOperator, v)
+    ϵ = cbrt(eps()/2)
+    # (F(u + ϵ .* v) - F(u - ϵ .* v)) ./ 2ϵ
+
+    J.f(J.res, J.u .+ ϵ .* v)
+    out .= J.res
+    J.f(J.res, J.u .- ϵ .* v)
+    out .= (out .- J.res) ./ 2ϵ 
+    return nothing
+end
+
 ##
 # Newton-Krylov
 ##

From 4cd3ac3c1f6576a94c1c68a59d72b886cd10f05c Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Tue, 17 Dec 2024 12:43:41 +0100
Subject: [PATCH 7/7] fixup! add FDJacobainOperator

---
 src/NewtonKrylov.jl | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/NewtonKrylov.jl b/src/NewtonKrylov.jl
index 09aeab4..c999f4c 100644
--- a/src/NewtonKrylov.jl
+++ b/src/NewtonKrylov.jl
@@ -193,7 +193,8 @@ function newton_krylov!(
         krylov_kwargs = (;),
         callback = (args...) -> nothing,
         norm = norm,
-        update! = (u) -> nothing
+        update! = (u) -> nothing,
+        Operator = JacobianOperator,
     )
     t₀ = time_ns()
 
@@ -210,7 +211,7 @@ function newton_krylov!(
 
     verbose > 0 && @info "Jacobian-Free Newton-Krylov" Solver res₀ = n_res tol tol_rel tol_abs η
 
-    J = JacobianOperator(F!, res, u)
+    J = Operator(F!, res, u)
     solver = Solver(J, res)
     b = similar(res)