move apply_process_model to PBMApplicator.apply_model

bgctw · bgctw · commit 599612893dbb · 2025-08-21T11:21:51.000+02:00
keep stack in the non-GPU call, because mapreduce is very slow (even though it is then not compatible with Zygote)

for GPUArrays use mapreduce, because stack results in scalar indexing error
diff --git a/dev/doubleMM.jl b/dev/doubleMM.jl
@@ -572,7 +572,7 @@ f_allsites = get_hybridproblem_PBmodel(prob0; scenario, use_all_sites = true)
 trans_mP=StackedArray(transP, size(ζsP, 2))
 trans_mMs=StackedArray(transM, size(ζsMs, 1) * size(ζsMs, 3))
 θsP, θsMs = transform_ζs(ζsP, ζsMs; trans_mP, trans_mMs)
-y = apply_process_model(θsP, θsMs, f, xP) 
+y = f(θsP, θsMs, xP) # the applicator f is called directly and is no longer passed as an argument
 #(; y, θsP, θsMs) = HVI.apply_f_trans(ζsP, ζsMs, f_allsites, xP; transP, transM);
 (y_hmc, θsP_hmc, θsMs_hmc) = (; y, θsP, θsMs);
 
diff --git a/src/HybridVariationalInference.jl b/src/HybridVariationalInference.jl
@@ -93,7 +93,7 @@ include("logden_normal.jl")
 export get_ca_starts, get_ca_ends, get_cor_count
 include("cholesky.jl")
 
-export neg_elbo_gtf, sample_posterior, apply_process_model, predict_hvi
+export neg_elbo_gtf, sample_posterior, predict_hvi
 include("elbo.jl")
 
 export init_hybrid_params, init_hybrid_ϕunc
diff --git a/src/PBMApplicator.jl b/src/PBMApplicator.jl
@@ -1,25 +1,67 @@
 """
-    AbstractPBMApplicator(θP::AbstractVector, θMs::AbstractMatrix, xP::AbstractMatrix)
-
 Abstraction of applying a process-based model with 
-global parameters, `x`, site-specific parameters, `θMs` (sites in columns), 
+global parameters, `θP`, site-specific parameters, `θMs` (sites in columns), 
 and site-specific model drivers, `xP` (sites in columns),
 It returns a matrix of predictions sites in columns.    
 
-Specific implementations need to implement function `apply_model(app, θP, θMs, xP)`.
+Specific implementations need to provide function `apply_model(app, θP, θMs, xP)`.
+where
+- `θsP` and `θsMs` are shaped according to the output of `generate_ζ`, i.e.
+`(n_site_pred x n_par x n_MC)`.
+- Results are of shape `(n_obs x n_site_pred x n_MC)`.
+
+They may also provide function `apply_model(app, θP, θMs, xP)` for a sample
+of parameters, i.e. where an additional dimension is added to both `θP` and `θMs`.
+However, there is a default implementation that maps over this additional dimension and combines the per-sample results.
+
 Provided are implementations
-- `NullPBMApplicator`: returning its input `θMs` for testing
 - `PBMSiteApplicator`: based on a function that computes predictions per site
 - `PBMPopulationApplicator`: based on a function that computes predictions for entire population
+- `NullPBMApplicator`: returning its input `θMs` for testing
+- `PlainPBMApplicator`: based on a function that takes the same arguments as `apply_model`
 """
 abstract type AbstractPBMApplicator end
 
 # function apply_model end  # already defined in ModelApplicator.jl for ML model
 
-function (app::AbstractPBMApplicator)(θP::AbstractVector, θMs::AbstractMatrix, xP::AbstractMatrix) 
+function (app::AbstractPBMApplicator)(θP::AbstractArray, θMs::AbstractArray, xP::AbstractMatrix) 
     apply_model(app, θP, θMs, xP)
 end
 
+"""
+    apply_model(app::AbstractPBMApplicator, θsP::AbstractVector, θsMs::AbstractMatrix, xP::AbstractMatrix) 
+    apply_model(app::AbstractPBMApplicator, θsP::AbstractMatrix, θsMs::AbstractArray{ET,3}, xP) 
+
+The first variant calls the PBM for one batch of sites.
+
+The second variant calls the PBM for a sample of batches and stacks the results.
+The default implementation maps over the last dimension of `θsP` and `θsMs`, calling the 
+first variant of `apply_model` for each sample.
+"""
+# docu in struct
+function apply_model(app::AbstractPBMApplicator, θsP::AbstractMatrix, θsMs::AbstractArray{ET,3}, xP) where ET
+    # call app once per sample (column of θsP, dims=3 slice of θsMs) and stack the second return value; stack does not work on GPU, see specialized method for GPUArrays below
+    y_pred = stack(
+     map(eachcol(CA.getdata(θsP)), eachslice(CA.getdata(θsMs), dims=3)) do θP, θMs
+        y_global, y_pred_i = app(θP, θMs, xP)
+        y_pred_i
+    end)
+end
+function apply_model(app::AbstractPBMApplicator, θsP::GPUArraysCore.AbstractGPUMatrix, θsMs::GPUArraysCore.AbstractGPUArray{ET,3}, xP) where ET
+    # stack does not work on GPU arrays (scalar indexing error), so per-sample results are concatenated with the slower mapreduce
+    # for type stability, apply the model to the first sample and pass its result as init to mapreduce
+    P1, Pit = Iterators.peel(eachcol(CA.getdata(θsP)));
+    Ms1, Msit = Iterators.peel(eachslice(CA.getdata(θsMs), dims=3));
+    y1 = apply_model(app, P1, Ms1, xP)[2]
+    y1a = reshape(y1, size(y1)..., 1) # append a trailing singleton dimension so that cat can extend along it
+    y_pred = mapreduce((a,b) -> cat(a,b; dims=3), Pit, Msit; init=y1a) do θP, θMs
+        y_global, y_pred_i = app(θP, θMs, xP)
+        y_pred_i
+    end
+end
+
+
+
 
 """
     NullPBMApplicator()
@@ -119,8 +161,8 @@ struct PBMPopulationApplicator{MFT, IPT, IT, IXT, F} <: AbstractPBMApplicator
     int_xP::IXT
 end
 
-# let fmap not descend into isP
-# @functor PBMPopulationApplicator (θFixm, )
+# let fmap not descend into isP, because indexing with isP on cpu is faster
+@functor PBMPopulationApplicator (θFixm, )
 
 """
     PBMPopulationApplicator(fθpop, n_batch; θP, θM, θFix, xPvec)
@@ -167,7 +209,13 @@ function apply_model(app::PBMPopulationApplicator, θP::AbstractVector, θMs::Ab
         "or compute PBM on CPU.")
     end
     # repeat θP and concatenate with 
+    # Main.@infiltrate_main
+    # repeat is 2x slower for Vector and 100 times slower (with allocation) on GPU
+    # app.isP on CPU is slightly faster than app.isP on GPU
+    #@benchmark CA.getdata(θP[app.isP])  
+    #@benchmark CA.getdata(repeat(θP', size(θMs,1))) 
     local θ = hcat(CA.getdata(θP[app.isP]), CA.getdata(θMs), app.θFixm)
+    #local θ = hcat(CA.getdata(repeat(θP', size(θMs,1))), CA.getdata(θMs), app.θFixm)
     local θc = app.intθ(CA.getdata(θ))
     local xPc = app.int_xP(CA.getdata(xP))
     local pred_sites = app.fθpop(θc, xPc)
diff --git a/src/elbo.jl b/src/elbo.jl
@@ -177,7 +177,8 @@ function predict_hvi(rng, prob::AbstractHybridProblem; scenario=Val(()),
     else
         f_dev = f
     end
-    y = apply_process_model(θsP, θsMs, f_dev, xP)
+    #y = apply_process_model(θsP, θsMs, f_dev, xP)
+    y = f_dev(θsP, θsMs, xP)
     (; y, θsP, θsMs, entropy_ζ)
 end
 
@@ -312,31 +313,33 @@ end
 #     (; y, θP, θMs)
 # end
 
-"""
-    apply_process_model(θsP::AbstractMatrix, θsMs::AbstractArray{ET,3}, f, xP)
+# """
+#     apply_process_model(θsP::AbstractMatrix, θsMs::AbstractArray{ET,3}, f, xP)
 
-Call a PBM applicator for a sample of parameters of each site, and stack results
+# Call a PBM applicator for a sample of parameters of each site, and stack results
 
-`θsP` and `θsMs` are shaped according to the output of `generate_ζ`, i.e.
-`(n_site_pred x n_par x n_MC)`.
-Results are of shape `(n_obs x n_site_pred x n_MC)`.
-"""
-function apply_process_model(θsP::AbstractMatrix, θsMs::AbstractArray{ET,3}, f, xP) where ET
-    # stack does not work on GPU
-    # y_pred = stack(map(eachcol(θsP), eachslice(θsMs, dims=3)) do θP, θMs
-    #     y_global, y_pred_i = f(θP, θMs, xP)
-    #     y_pred_i
-    # end)
-    #Main.@infiltrate_main
-    # for type stability, apply f at first sample before mapreduce
-    P1, Pit = Iterators.peel(eachcol(θsP));
-    Ms1, Msit = Iterators.peel(eachslice(θsMs, dims=3));
-    y1 = f(P1, Ms1, xP)[2]
-    y_pred = mapreduce((a,b) -> cat(a,b;dims=3), Pit, Msit; init=y1) do θP, θMs
-        y_global, y_pred_i = f(θP, θMs, xP)
-        y_pred_i
-    end
-end
+# `θsP` and `θsMs` are shaped according to the output of `generate_ζ`, i.e.
+# `(n_site_pred x n_par x n_MC)`.
+# Results are of shape `(n_obs x n_site_pred x n_MC)`.
+# """
+# function apply_process_model(θsP::AbstractMatrix, θsMs::AbstractArray{ET,3}, f, xP) where ET
+#     error("deprecated, use f(θsP, θsMs, xP)")
+#     # stack does not work on GPU
+#     # y_pred = stack(
+#     #  map(eachcol(CA.getdata(θsP)), eachslice(CA.getdata(θsMs), dims=3)) do θP, θMs
+#     #     y_global, y_pred_i = f(θP, θMs, xP)
+#     #     y_pred_i
+#     # end)
+#     # for type stability, apply f at first iterate to supply init to mapreduce
+#     P1, Pit = Iterators.peel(eachcol(CA.getdata(θsP)));
+#     Ms1, Msit = Iterators.peel(eachslice(CA.getdata(θsMs), dims=3));
+#     y1 = f(P1, Ms1, xP)[2]
+#     y1a = reshape(y1, size(y1)..., 1) # add one dimension
+#     y_pred = mapreduce((a,b) -> cat(a,b; dims=3), Pit, Msit; init=y1a) do θP, θMs
+#         y_global, y_pred_i = f(θP, θMs, xP)
+#         y_pred_i
+#     end
+# end
 
 """
 Generate samples of (inv-transformed) model parameters, ζ, 
diff --git a/test/test_HybridProblem.jl b/test/test_HybridProblem.jl
@@ -316,10 +316,11 @@ test_with_flux_gpu = (scenario) -> begin
             rng = StableRNG(111)
             probg = HybridProblem(DoubleMM.DoubleMMCase(); scenario = scenf);
             # put Applicator to gpu (θFix)
-            probg = HybridProblem(
-                probg, 
-                f_batch = fmap(gdev, probg.f_batch), 
-                f_allsites = fmap(gdev, probg.f_allsites))
+            # moved to solve and predict_hvi
+            # probg = HybridProblem(
+            #     probg, 
+            #     f_batch = fmap(gdev, probg.f_batch), 
+            #     f_allsites = fmap(gdev, probg.f_allsites))
             #prob = CP.update(probg, transM = identity, transP = identity);
             solver = HybridPosteriorSolver(; alg=Adam(0.02), n_MC=3)
             n_site, n_batch = get_hybridproblem_n_site_and_batch(probg; scenario = scenf)
diff --git a/test/test_elbo.jl b/test/test_elbo.jl
@@ -18,9 +18,10 @@ using MLDataDevices
 # setup g as FluxNN on gpu
 using Flux
 
+#CUDA.device!(4)
+
 ggdev = gpu_device()
 
-#CUDA.device!(4)
 rng = StableRNG(111)
 
 const prob = DoubleMM.DoubleMMCase()
@@ -143,7 +144,7 @@ test_scenario = (scenario) -> begin
             _ϕ = vcat(ϕ_ini.μP, probc.ϕg, probd.ϕunc)
             #hcat(ϕ_ini, ϕ, _ϕ)[1:4,:]
             #hcat(ϕ_ini, ϕ, _ϕ)[(end-20):end,:]
-            n_predict = 80000
+            n_predict = 8000
             xM_batch = xM[:, 1:n_batch]
             _ζsP, _ζsMs, _σ = @inferred (
                 # @descend_code_warntype (
@@ -196,18 +197,18 @@ test_scenario = (scenario) -> begin
                     reshape(residMst, size(residMst,1)*size(residMst,2), size(residMst,3)))
                 cor_PMs = cor(residPMst')
                 @test cor_PMs[1,2] ≈ ρsP_true[1] atol=0.02
-                @test all(.≈(cor_PMs[1:2,3:end], 0.0, atol=0.02)) # no correlations P,M
+                @test all(.≈(cor_PMs[1:2,3:end], 0.0, atol=0.1)) # no correlations P,M
                 @test cor_PMs[3,4] ≈ ρsM_true[1] atol=0.02
-                @test all(.≈(cor_PMs[3:4,5:end], 0.0, atol=0.02)) # no correlations M1, M2
+                @test all(.≈(cor_PMs[3:4,5:end], 0.0, atol=0.1)) # no correlations M1, M2
                 @test cor_PMs[5,6] ≈ ρsM_true[1] atol=0.02
-                @test all(.≈(cor_PMs[5:6,7:end], 0.0, atol=0.02)) # no correlations M1, M2
+                @test all(.≈(cor_PMs[5:6,7:end], 0.0, atol=0.1)) # no correlations M1, M2
             end
             test_distζ(_ζsP, _ζsMs, ϕunc_true, ζMs_g)
             @testset "predict_hvi check sd" begin
                 # test if uncertainty and reshaping is propagated
                 # here inverse the predicted θs and then test distribution 
                 probcu = HybridProblem(probc, ϕunc=ϕunc_true);
-                n_sample_pred = 24_000
+                n_sample_pred = 2_400
                 (; y, θsP, θsMs, entropy_ζ) = predict_hvi(rng, probcu; scenario, n_sample_pred);
                 #size(_ζsMs), size(θsMs)
                 #size(_ζsP), size(θsP)
@@ -221,7 +222,7 @@ test_scenario = (scenario) -> begin
                 test_distζ(_ζsP2, _ζsMs2, ϕunc_true, ζMs_g2)
             end;
         end;
-    end # if covar in scenario
+    end # if covarK2 in scenario
 
     if ggdev isa MLDataDevices.AbstractGPUDevice
         @testset "generate_ζ gpu $(last(CP._val_value(scenario)))" begin
@@ -390,14 +391,13 @@ test_scenario = (scenario) -> begin
         θsPc = int_mP(θsP)
         @test all(θsPc[:r0, :] .> 0)
         #
-        y = apply_process_model(θsP, θsMs, f_pred, xP)
+        y = @inferred f_pred(θsP, θsMs, xP)
         @test y isa Array
         @test size(y) == (size(y_o)..., n_sample_pred)
     end
 
     if ggdev isa MLDataDevices.AbstractGPUDevice
         @testset "predict_hvi gpu $(last(CP._val_value(scenario)))" begin
-            n_sample_pred = 32
             ϕ_ini_g = ggdev(CA.getdata(ϕ_ini))
             xMg = ggdev(xM)
             n_sample_pred = 30
@@ -407,17 +407,20 @@ test_scenario = (scenario) -> begin
                     sample_posterior(rng, g_gpu, ϕ_ini_g, xMg;
                     int_μP_ϕg_unc, int_unc,
                     transP, transM,
-                    cdev = cpu_device(),
+                    #cdev = cpu_device(),
+                    cdev = identity, # do not transfer to CPU
                     n_sample_pred, cor_ends, pbm_covar_indices)
                 )
+            # this variant without the problem, does not attach axes
             @test θsP isa AbstractMatrix
             @test θsMs isa AbstractArray{T,3} where {T}
             int_mP = ComponentArrayInterpreter(int_P, (size(θsP, 2),))
-            θsPc = int_mP(θsP)
-            @test all(θsPc[:r0, :] .> 0)
+            @test all(int_mP(θsP)[:r0, :] .> 0)
             #
-            y = apply_process_model(θsP, θsMs, f_pred, xP)
-            @test y isa Array
+            xP_dev = ggdev(xP);
+            f_pred_dev = fmap(ggdev, f_pred)
+            y = @inferred f_pred_dev(θsP, θsMs, xP_dev)
+            @test y isa GPUArraysCore.AbstractGPUArray
             @test size(y) == (size(y_o)..., n_sample_pred)
         end
         # @testset "predict_hvi also f on gpu" begin