diff --git a/examples/Project.toml b/examples/Project.toml
index 3671d5c..6272520 100644
--- a/examples/Project.toml
+++ b/examples/Project.toml
@@ -1,10 +1,13 @@
 [deps]
 CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
+EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
 Krylov = "ba0b0d4f-ebba-5204-a429-3ac8c609bfb7"
 KrylovPreconditioners = "45d422c2-293f-44ce-8315-2cb988662dec"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
 NewtonKrylov = "0be81120-40bf-4f8b-adf0-26103efb66f1"
+OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 
-[sources]
-NewtonKrylov = {path = ".."}
+[sources.NewtonKrylov]
+path = ".."
diff --git a/examples/bratu_mpi.jl b/examples/bratu_mpi.jl
new file mode 100644
index 0000000..5c9e02f
--- /dev/null
+++ b/examples/bratu_mpi.jl
@@ -0,0 +1,208 @@
+# # 1D bratu equation from (Kan2022-ko)[@cite]
+
+# # Run with `JULIAUP_CHANNEL="1.10" ./mpiexecjl --project=. -np 2 julia bratu_mpi.jl`
+
+# ## Necessary packages
+using NewtonKrylov, Krylov
+using KrylovPreconditioners
+using SparseArrays, LinearAlgebra
+using OffsetArrays
+using MPI
+
+struct LocalData{FC, D} <: AbstractVector{FC}
+    data::D
+
+    function LocalData(data::D) where {D}
+        FC = eltype(data)
+        return new{FC, D}(data)
+    end
+end
+
+function LocalData{FC, D}(::UndefInitializer, l::Int64) where {FC, D}
+    return LocalData(D(undef, l + 2))
+end
+
+Base.length(v::LocalData) = return length(v.data) - 2
+Base.size(v::LocalData) = return (length(v),)
+
+Base.@propagate_inbounds function Base.getindex(v::LocalData, idx)
+    return getindex(v.data, idx + 1)
+end
+
+Base.@propagate_inbounds function Base.setindex!(v::LocalData, x, idx)
+    return setindex!(v.data, x, idx + 1)
+end
+
+Base.similar(v::LocalData) = LocalData(similar(v.data))
+Base.copy(v::LocalData) = LocalData(copy(v.data))
+Base.zero(v::LocalData) = LocalData(zero(v.data))
+
+function Base.cconvert(::Type{MPI.MPIPtr}, buf::LocalData)
+    return view(buf.data, 2:(length(buf) - 1))
+end
+function MPI.Buffer(v::LocalData)
+    return MPI.Buffer(v, Cint(length(v)), MPI.Datatype(eltype(v)))
+end
+
+using Krylov
+import Krylov.FloatOrComplex
+
+function Krylov.kdot(n::Integer, x::LocalData{T}, y::LocalData{T}) where {T <: FloatOrComplex}
+    return @views MPI.Allreduce(
+        dot(x.data[2:(length(x) + 1)], y.data[2:(length(y) + 1)]),
+        +, MPI.COMM_WORLD
+    )
+end
+
+function Krylov.knorm(n::Integer, x::LocalData{T}) where {T <: FloatOrComplex}
+    return @views sqrt(MPI.Allreduce(sum(abs2, x.data[2:(length(x) + 1)]), +, MPI.COMM_WORLD))
+end
+
+# We are going to split a domain of size N
+# into equal chunks.
+
+function localdomain(N, nranks, myrank)
+    n = cld(N, nranks)
+    first = (n * myrank) + 1
+    last = min(n * (myrank + 1), N)
+    return first:last
+end
+
+
+# ## 1D Bratu equations
+# $y′′ + λ * exp(y) = 0$
+
+function update!(y::LocalData)
+    N = length(y)
+    # Set boundary conditions
+    nranks = MPI.Comm_size(MPI.COMM_WORLD)
+    myid = MPI.Comm_rank(MPI.COMM_WORLD)
+
+    ## BVP boundary condition
+    if myid == 0
+        y[0] = 0
+    end
+    if myid == (nranks - 1)
+        y[N + 1] = 0
+    end
+
+    ## domain splitting
+    # reqs = MPI.MultiRequest(4)  Enzyme strugles with this
+    # reqs = [MPI.Request() for _ in 1:4]
+    # if myid != (nranks - 1)
+    #     # Send & data to the right
+    #     MPI.Isend(view(y.data, N + 1), MPI.COMM_WORLD, reqs[1]; dest = myid + 1)
+    #     MPI.Irecv!(view(y.data, N + 2), MPI.COMM_WORLD, reqs[2]; source = myid + 1)
+    # end
+    # if myid != 0
+    #     # Send data to the left
+    #     MPI.Isend(view(y.data, 2), MPI.COMM_WORLD, reqs[3]; dest = myid - 1)
+    #     MPI.Irecv!(view(y.data, 1), MPI.COMM_WORLD, reqs[4]; source = myid - 1)
+    # end
+    # MPI.Waitall(reqs)
+
+    if myid != (nranks - 1)
+        # Send & data to the right
+        MPI.Send(view(y.data, N + 1), MPI.COMM_WORLD; dest = myid + 1)
+        MPI.Recv!(view(y.data, N + 2), MPI.COMM_WORLD; source = myid + 1)
+    end
+    if myid != 0
+        # Send data to the left
+        MPI.Recv!(view(y.data, 1), MPI.COMM_WORLD; source = myid - 1)
+        MPI.Send(view(y.data, 2), MPI.COMM_WORLD; dest = myid - 1)
+    end
+
+    return nothing
+end
+
+function bratu!(res::LocalData, y::LocalData, Δx, λ)
+    update!(y)
+    # Must be part of the equation since otherwise the krylov solver
+    # Will not update the boundary conditions for the inner iterations
+    N = length(y)
+    ## Calculate residual
+    for i in 1:N
+        y_l = y[i - 1]
+        y_r = y[i + 1]
+        y′′ = (y_r - 2y[i] + y_l) / Δx^2
+
+        res[i] = y′′ + λ * exp(y[i]) # = 0
+    end
+    return nothing
+end
+
+function bratu(y, dx, λ)
+    res = similar(y)
+    bratu!(res, y, dx, λ)
+    return res
+end
+
+# ## Reference solution
+function true_sol_bratu(x)
+    ## for λ = 3.51382, 2nd sol θ = 4.8057
+    θ = 4.79173
+    return -2 * log(cosh(θ * (x - 0.5) / 2) / (cosh(θ / 4)))
+end
+
+# # Setup
+MPI.Init()
+
+const nranks = MPI.Comm_size(MPI.COMM_WORLD)
+const myid = MPI.Comm_rank(MPI.COMM_WORLD)
+
+# ## Choice of parameters
+const N = 10_000
+const λ = 3.51382
+const dx = 1 / (N + 1) # Grid-spacing
+
+# ### Domain and Inital condition
+LI = localdomain(N, nranks, myid)
+const l_N = length(LI)
+u₀ = Vector{Float64}(undef, l_N + 2)
+
+X = LinRange(0.0 + dx, 1.0 - dx, N) # Global
+x = view(X, LI)
+
+u₀ = LocalData(u₀)
+u₀ .= sin.(x .* π)
+update!(u₀)
+
+# JOp = NewtonKrylov.JacobianOperator((res, u) -> bratu!(res, u, dx, λ), similar(u₀), u₀)
+# J = collect(JOp)
+# display(J)
+
+U₀ = MPI.Gather(u₀, MPI.COMM_WORLD)
+
+# ## Solving using inplace variant and CG
+uₖ, stats = newton_krylov!(
+    (res, u) -> bratu!(res, u, dx, λ),
+    copy(u₀), similar(u₀);
+    Solver = CgSolver,
+    # update! = (u) -> update!(u),
+    norm = v -> Krylov.knorm(length(v), v),
+    verbose = myid == 0 ? 1 : 0
+)
+@show stats
+
+Uₖ = MPI.Gather(uₖ, MPI.COMM_WORLD)
+
+if myid == 0
+    using CairoMakie
+
+    reference = true_sol_bratu.(X)
+    ϵ = abs2.(Uₖ .- reference)
+
+    fig = Figure(size = (800, 800))
+    ax = Axis(fig[1, 1], title = "", ylabel = "", xlabel = "")
+
+    lines!(ax, X, reference, label = "True solution")
+    lines!(ax, X, U₀, label = "Initial guess")
+    lines!(ax, X, Uₖ, label = "Newton-Krylov solution")
+
+    axislegend(ax, position = :cb)
+
+    ax = Axis(fig[1, 2], title = "Error", ylabel = "abs2 err", xlabel = "")
+    lines!(ax, X, ϵ)
+
+    save("bratu_mpi_n$(nranks).png", fig)
+end
diff --git a/examples/mpiexecjl b/examples/mpiexecjl
new file mode 100755
index 0000000..137c9b4
--- /dev/null
+++ b/examples/mpiexecjl
@@ -0,0 +1,76 @@
+#!/bin/sh
+#
+# Copyright (C) 2020 Simon Byrne, Mosè Giordano
+# License is MIT "Expat"
+#
+### Commentary:
+#
+# Command line utility to call the `mpiexec` binary used by the `MPI.jl` version
+# in the given Julia project.  It has the same syntax as the `mpiexec` binary
+# that would be called, with the additional `--project=...` flag to select a
+# different Julia project.
+#
+# Examples of usage (the MPI flags available depend on the MPI implementation
+# called):
+#
+#   $ mpiexecjl --version
+#   $ mpiexecjl -n 40 julia mpi-script.jl
+#   $ mpiexecjl --project=my_experiment -n 80 --oversubscribe julia mpi-script.jl
+#
+### Code:
+
+usage () {
+    echo "Usage: ${0} [--project=...] MPIEXEC_ARGUMENTS..."
+    echo "Call the mpiexec binary in the Julia environment specified by the --project option."
+    echo "If no project is specified, the MPI associated with the global Julia environment will be used."
+    echo "All other arguments are forwarded to mpiexec."
+}
+
+for arg; do
+    shift
+    case "${arg}" in
+        --project | --project=*)
+            PROJECT_ARG="${arg}"
+            ;;
+        -h | --help)
+            usage
+            echo "Below is the help of the current mpiexec."
+            echo
+            set -- "${@}" "${arg}"
+            ;;
+        *)
+            set -- "${@}" "${arg}"
+            ;;
+    esac
+done
+
+if [ -z "${*}" ]; then
+    echo "ERROR: no arguments specified." 1>&2
+    echo
+    usage
+    exit 1
+fi
+
+if [ -n "${JULIA_BINDIR}" ]; then
+    JULIA_CMD="${JULIA_BINDIR}/julia"
+else
+    JULIA_CMD="julia"
+fi
+
+# shellcheck disable=SC2016
+SCRIPT='
+using MPI
+ENV["JULIA_PROJECT"] = dirname(Base.active_project())
+proc = run(pipeline(`$(mpiexec()) $(ARGS)`; stdout, stderr); wait=false)
+wait(proc)
+if !iszero(proc.exitcode)
+   @error "The MPI process failed" proc
+end
+exit(proc.exitcode)
+'
+
+if [ -n "${PROJECT_ARG}" ]; then
+    "${JULIA_CMD}" "${PROJECT_ARG}" --color=yes --startup-file=no -q --compile=min -O0 -e "${SCRIPT}" -- "${@}"
+else
+    "${JULIA_CMD}" --color=yes --startup-file=no -q --compile=min -O0 -e "${SCRIPT}" -- "${@}"
+fi
diff --git a/examples/simple.jl b/examples/simple.jl
index 5347cab..05d862f 100644
--- a/examples/simple.jl
+++ b/examples/simple.jl
@@ -48,3 +48,8 @@ end
 lines!(ax, trace_3)
 
 fig
+
+# ## Extract Jacobian
+
+JOp = NewtonKrylov.JacobianOperator(F!, zeros(2), [3.0, 4.0])
+J = collect(JOp)
diff --git a/src/NewtonKrylov.jl b/src/NewtonKrylov.jl
index db993aa..c999f4c 100644
--- a/src/NewtonKrylov.jl
+++ b/src/NewtonKrylov.jl
@@ -19,7 +19,9 @@ function maybe_duplicated(f, df)
     end
 end
 
-struct JacobianOperator{F, A}
+abstract type AbstractJacobianOperator end
+
+struct JacobianOperator{F, A} <: AbstractJacobianOperator
     f::F # F!(res, u)
     f_cache::F
     res::A
@@ -65,14 +67,14 @@ function mul!(out, J′::Union{Adjoint{<:Any, <:JacobianOperator}, Transpose{<:A
     return nothing
 end
 
-function Base.collect(JOp::JacobianOperator)
+function Base.collect(JOp::AbstractJacobianOperator)
     N, M = size(JOp)
-    v = zeros(eltype(JOp), M)
-    out = zeros(eltype(JOp), N)
+    v = zero(JOp.u)
+    out = zero(JOp.res)
     J = SparseMatrixCSC{eltype(v), Int}(undef, size(JOp)...)
     for j in 1:M
-        out .= 0.0
-        v .= 0.0
+        out .= 0
+        v .= 0
         v[j] = 1.0
         mul!(out, JOp, v)
         for i in 1:N
@@ -84,6 +86,29 @@ function Base.collect(JOp::JacobianOperator)
     return J
 end
 
+struct FDJacobianOperator{F, A} <: AbstractJacobianOperator
+    f::F # F!(res, u)
+    res::A
+    u::A
+    function FDJacobianOperator(f::F, res, u) where {F}
+        return new{F, typeof(u)}(f, res, u)
+    end
+end
+
+Base.size(J::FDJacobianOperator) = (length(J.res), length(J.u))
+Base.eltype(J::FDJacobianOperator) = eltype(J.u)
+
+function mul!(out, J::FDJacobianOperator, v)
+    ϵ = cbrt(eps()/2)
+    # (F(u + ϵ .* v) - F(u - ϵ .* v)) ./ 2ϵ
+
+    J.f(J.res, J.u .+ ϵ .* v)
+    out .= J.res
+    J.f(J.res, J.u .- ϵ .* v)
+    out .= (out .- J.res) ./ 2ϵ 
+    return nothing
+end
+
 ##
 # Newton-Krylov
 ##
@@ -167,8 +192,13 @@ function newton_krylov!(
         N = nothing,
         krylov_kwargs = (;),
         callback = (args...) -> nothing,
+        norm = norm,
+        update! = (u) -> nothing,
+        Operator = JacobianOperator,
     )
     t₀ = time_ns()
+
+    update!(u) # Impose BC
     F!(res, u) # res = F(u)
     n_res = norm(res)
     callback(u, res, n_res)
@@ -181,8 +211,9 @@ function newton_krylov!(
 
     verbose > 0 && @info "Jacobian-Free Newton-Krylov" Solver res₀ = n_res tol tol_rel tol_abs η
 
-    J = JacobianOperator(F!, res, u)
+    J = Operator(F!, res, u)
     solver = Solver(J, res)
+    b = similar(res)
 
     stats = Stats(0, 0)
     while n_res > tol && stats.outer_iterations <= max_niter
@@ -201,8 +232,8 @@ function newton_krylov!(
 
         # Solve: Jx = -res
         # res is modifyed by J, so we create a copy `-res`
-        # TODO: provide a temporary storage for `-res`
-        solve!(solver, J, -res; kwargs...)
+        b .= .- res
+        solve!(solver, J, b; kwargs...)
 
         d = solver.x # Newton direction
         s = 1        # Newton step TODO: LineSearch
@@ -213,6 +244,7 @@ function newton_krylov!(
         # Update residual and norm
         n_res_prior = n_res
 
+        update!(u) # Impose BC
         F!(res, u) # res = F(u)
         n_res = norm(res)
         callback(u, res, n_res)
@@ -226,7 +258,7 @@ function newton_krylov!(
             η = forcing(η, tol, n_res, n_res_prior)
         end
 
-        verbose > 0 && @info "Newton" iter = n_res η stats
+        verbose > 0 && @info "Newton" resᵢ = n_res η stats
         stats = update(stats, solver.stats.niter)
     end
     t = (time_ns() - t₀) / 1.0e9