SciML · ChrisRackauckas-Claude · Jan 7, 2026
diff --git a/src/MCSample.jl b/src/MCSample.jl
@@ -17,9 +17,15 @@ end
 
 function (mc_sample::UniformSampling{T})(x_mc, kwargs...) where {T}
     Tel = eltype(T)
+    a = mc_sample.a
+    b = mc_sample.b
     rand!(x_mc)
-    m = (mc_sample.b + mc_sample.a) ./ convert(Tel, 2)
-    return x_mc .= (x_mc .- convert(Tel, 0.5)) .* (mc_sample.b - mc_sample.a) .+ m
+    # Fuse all operations to avoid temporaries: x_mc = a + (b - a) * x_mc
+    # This is equivalent to uniform sampling in [a, b]
+    @inbounds for i in eachindex(x_mc)
+        x_mc[i] = a[i] + (b[i] - a[i]) * x_mc[i]
+    end
+    return x_mc
 end
 
 """

diff --git a/src/MLP.jl b/src/MLP.jl
@@ -109,6 +109,8 @@ function _ml_picard(
 
     x2 = similar(x)
     _integrate(mc_sample!) ? x3 = similar(x) : x3 = nothing
+    # Pre-allocate temp buffer for in-place reflection when Neumann BC is used
+    x_temp = isnothing(neumann_bc) ? nothing : similar(x)
     p = prob.p
 
     a = zero(elxType)
@@ -119,7 +121,7 @@ function _ml_picard(
         for k in 1:num
             verbose && println("loop k")
             r = s + (t - s) * rand(tType)
-            _mlt_sde_loop!(x2, x, s, r, prob, neumann_bc)
+            _mlt_sde_loop!(x2, x, s, r, prob, neumann_bc, x_temp)
             b2 = _ml_picard(M, l, K, x2, r, t, mc_sample!, g, f, verbose, prob, neumann_bc)
             b3 = zero(elxType)
             # non local integration
@@ -144,7 +146,7 @@ function _ml_picard(
         num = M^(L - l)
         for k in 1:num
             r = s + (t - s) * rand(tType)
-            _mlt_sde_loop!(x2, x, s, r, prob, neumann_bc)
+            _mlt_sde_loop!(x2, x, s, r, prob, neumann_bc, x_temp)
             b2 = _ml_picard(M, l, K, x2, r, t, mc_sample!, g, f, verbose, prob, neumann_bc)
             b4 = _ml_picard(
                 M,
@@ -188,7 +190,7 @@ function _ml_picard(
     a2 = zero(elxType)
     for k in 1:num
         verbose && println("loop k3")
-        _mlt_sde_loop!(x2, x, s, t, prob, neumann_bc)
+        _mlt_sde_loop!(x2, x, s, t, prob, neumann_bc, x_temp)
         a2 += g(x2)
     end
     a2 /= num
@@ -234,10 +236,12 @@ function _ml_picard_mlt(
     # first level
     num = M^(L)
     x2 = similar(x)
+    # Pre-allocate temp buffer for in-place reflection when Neumann BC is used
+    x_temp = isnothing(neumann_bc) ? nothing : similar(x)
     a2 = zero(elxType)
     for k in 1:num
         verbose && println("loop k3")
-        _mlt_sde_loop!(x2, x, s, t, prob, neumann_bc)
+        _mlt_sde_loop!(x2, x, s, t, prob, neumann_bc, x_temp)
         a2 += g(x2)
     end
     a2 /= num
@@ -266,6 +270,8 @@ function _ml_picard_call(
     ) where {xType, tType}
     x2 = similar(x)
     _integrate(mc_sample!) ? x3 = similar(x) : x3 = nothing
+    # Pre-allocate temp buffer for in-place reflection when Neumann BC is used
+    x_temp = isnothing(neumann_bc) ? nothing : similar(x)
     p = prob.p
     elxType = eltype(xType)
 
@@ -277,7 +283,7 @@ function _ml_picard_call(
         for k in 1:loop_num
             verbose && println("loop k")
             r = s + (t - s) * rand(tType)
-            _mlt_sde_loop!(x2, x, s, r, prob, neumann_bc)
+            _mlt_sde_loop!(x2, x, s, r, prob, neumann_bc, x_temp)
             b2 = _ml_picard(M, l, K, x2, r, t, mc_sample!, g, f, verbose, prob, neumann_bc)
             b3 = zero(elxType)
             for h in 1:K # non local integration
@@ -302,7 +308,7 @@ function _ml_picard_call(
         loop_num = _get_loop_num(M, num, thread_id, NUM_THREADS)
         for k in 1:loop_num
             r = s + (t - s) * rand(tType)
-            _mlt_sde_loop!(x2, x, s, r, prob, neumann_bc)
+            _mlt_sde_loop!(x2, x, s, r, prob, neumann_bc, x_temp)
             b2 = _ml_picard(M, l, K, x2, r, t, mc_sample!, g, f, verbose, prob, neumann_bc)
             b4 = _ml_picard(
                 M,
@@ -398,13 +404,21 @@ function _mlt_sde_loop!(
         s,
         t,
         prob,
-        neumann_bc
+        neumann_bc,
+        x_temp = nothing
     )
     #randn! allows to save one allocation
     dt = t - s
     randn!(x2)
     x2 .= x + (prob.μ(x, prob.p, t) .* dt .+ prob.σ(x, prob.p, t) .* sqrt(dt) .* x2)
     return if !isnothing(neumann_bc)
-        x2 .= _reflect(x, x2, neumann_bc...)
+        if isnothing(x_temp)
+            # Non-mutating version when no temp buffer provided
+            x2 .= _reflect(x, x2, neumann_bc...)
+        else
+            # Use in-place version: x_temp is modified as temporary storage, x2 contains result
+            x_temp .= x
+            _reflect!(x_temp, x2, neumann_bc...)
+        end
     end
 end
diff --git a/src/reflect.jl b/src/reflect.jl
@@ -1,52 +1,88 @@
 """
     _reflect(a,b,s,e)
 
-reflection of the Brownian motion `B` where `B_{t-1} = a` and  `B_{t} = b` 
+reflection of the Brownian motion `B` where `B_{t-1} = a` and  `B_{t} = b`
 on the hypercube `[s,e]^d` where `d = size(a,1)`.
 """
 # Used by `MLP` algorithm.
 function _reflect(a::T, b::T, s::T, e::T) where {T <: Vector}
-    r = 2
-    n = zeros(size(a))
-    # first checking if b is in the hypercube
-    all((a .>= s) .& (a .<= e)) ? nothing : error("a = $a not in hypercube")
-    size(a) == size(b) ? nothing : error("a not same dim as b")
-    for i in 1:length(a)
+    # Use copies to avoid mutating inputs
+    a_curr = copy(a)
+    b_curr = copy(b)
+    _reflect!(a_curr, b_curr, s, e)
+    return b_curr
+end
+
+"""
+    _reflect!(a, b, s, e)
+
+In-place version of `_reflect` that mutates both `a` and `b`.
+`a` is used as working storage and `b` contains the result.
+"""
+function _reflect!(a::T, b::T, s::T, e::T) where {T <: Vector}
+    elT = eltype(T)
+    r = elT(2)
+    n_idx = 0  # Track which index has the normal, 0 means no reflection needed
+    n_val = zero(elT)  # The normal value at that index (+1 or -1)
+
+    # first checking if a is in the hypercube
+    @inbounds for i in eachindex(a)
+        if a[i] < s[i] || a[i] > e[i]
+            error("a = $a not in hypercube")
+        end
+    end
+    length(a) == length(b) || error("a not same dim as b")
+
+    @inbounds for i in eachindex(a)
         if b[i] < s[i]
             rtemp = (a[i] - s[i]) / (a[i] - b[i])
             if rtemp < r
                 r = rtemp
-                n .= 0
-                n[i] = -1
+                n_idx = i
+                n_val = -one(elT)
             end
         elseif b[i] > e[i]
             rtemp = (e[i] - a[i]) / (b[i] - a[i])
             if rtemp < r
                 r = rtemp
-                n .= 0
-                n[i] = 1
+                n_idx = i
+                n_val = one(elT)
             end
         end
     end
+
     while r < 1
-        c = a + r * (b - a)
-        a = c
-        b = b - 2 * n * (dot(b - c, n))
-        r = 2
-        for i in 1:length(a)
+        # c = a + r * (b - a), but we'll compute in-place
+        # a = c
+        @inbounds for i in eachindex(a)
+            a[i] = a[i] + r * (b[i] - a[i])
+        end
+
+        # b = b - 2 * n * dot(b - c, n)
+        # Since n is a unit vector with only one non-zero element at n_idx,
+        # dot(b - c, n) = (b[n_idx] - c[n_idx]) * n_val = (b[n_idx] - a[n_idx]) * n_val
+        # (after c is stored in a)
+        dot_val = (b[n_idx] - a[n_idx]) * n_val
+        b[n_idx] = b[n_idx] - 2 * n_val * dot_val
+
+        r = elT(2)
+        n_idx = 0
+        n_val = zero(elT)
+
+        @inbounds for i in eachindex(a)
             if b[i] < s[i]
                 rtemp = (a[i] - s[i]) / (a[i] - b[i])
                 if rtemp < r
                     r = rtemp
-                    n .= 0
-                    n[i] = -1
+                    n_idx = i
+                    n_val = -one(elT)
                 end
             elseif b[i] > e[i]
                 rtemp = (e[i] - a[i]) / (b[i] - a[i])
                 if rtemp < r
                     r = rtemp
-                    n .= 0
-                    n[i] = 1
+                    n_idx = i
+                    n_val = one(elT)
                 end
             end
         end