Commit 7ada9be (parent: e5cf1f9)

feat: implement step size adaptation with adam

9 files changed: +422 −48 lines

Cargo.toml

Lines changed: 2 additions & 2 deletions
@@ -22,7 +22,7 @@ rand = { version = "0.9.0", features = ["small_rng"] }
 rand_distr = "0.5.0"
 itertools = "0.14.0"
 thiserror = "2.0.3"
-arrow = { version = "55.1.0", default-features = false, features = ["ffi"] }
+arrow = { version = "56.1.0", default-features = false, features = ["ffi"] }
 rand_chacha = "0.9.0"
 anyhow = "1.0.72"
 faer = { version = "0.22.6", default-features = false, features = ["linalg"] }
@@ -32,7 +32,7 @@ rayon = "1.10.0"
 [dev-dependencies]
 proptest = "1.6.0"
 pretty_assertions = "1.4.0"
-criterion = "0.6.0"
+criterion = "0.7.0"
 nix = { version = "0.30.0", features = ["sched"] }
 approx = "0.5.1"
 ndarray = "0.16.1"

examples/adam_adaptation.rs

Lines changed: 157 additions & 0 deletions
//! Example demonstrating the Adam optimizer for step size adaptation.
//!
//! This example shows how to use the Adam optimizer instead of dual averaging
//! for adapting the step size in NUTS.

use nuts_rs::{
    AdamOptions, Chain, CpuLogpFunc, CpuMath, DiagGradNutsSettings, LogpError, Settings,
    StepSizeAdaptMethod,
};
use thiserror::Error;

// Define a function that computes the unnormalized posterior density
// and its gradient.
#[derive(Debug)]
struct PosteriorDensity {}

// The density might fail in a recoverable or non-recoverable manner...
#[derive(Debug, Error)]
enum PosteriorLogpError {}
impl LogpError for PosteriorLogpError {
    fn is_recoverable(&self) -> bool {
        false
    }
}

impl CpuLogpFunc for PosteriorDensity {
    type LogpError = PosteriorLogpError;

    // Only used for transforming adaptation.
    type TransformParams = ();

    // We define a 10 dimensional normal distribution
    fn dim(&self) -> usize {
        10
    }

    // The normal likelihood with mean 3 and its gradient.
    fn logp(&mut self, position: &[f64], grad: &mut [f64]) -> Result<f64, Self::LogpError> {
        let mu = 3f64;
        let logp = position
            .iter()
            .copied()
            .zip(grad.iter_mut())
            .map(|(x, grad)| {
                let diff = x - mu;
                *grad = -diff;
                -diff * diff / 2f64
            })
            .sum();
        Ok(logp)
    }
}

fn main() {
    println!("Running NUTS with Adam step size adaptation...");

    // Create sampler settings with Adam optimizer
    let mut settings = DiagGradNutsSettings::default();

    // Configure for Adam adaptation
    settings
        .adapt_options
        .step_size_settings
        .adapt_options
        .method = StepSizeAdaptMethod::Adam;

    // Set Adam options
    let adam_options = AdamOptions {
        beta1: 0.9,
        beta2: 0.999,
        epsilon: 1e-8,
        learning_rate: 0.05,
    };

    settings.adapt_options.step_size_settings.adapt_options.adam = adam_options;

    // Standard MCMC settings
    settings.num_tune = 1000;
    settings.num_draws = 1000;
    settings.maxdepth = 10;

    // Create the posterior density function
    let logp_func = PosteriorDensity {};
    let math = CpuMath::new(logp_func);

    // Initialize the sampler
    let chain = 0;
    let mut rng = rand::rng();
    let mut sampler = settings.new_chain(chain, math, &mut rng);

    // Set initial position
    let initial_position = vec![0f64; 10];
    sampler
        .set_position(&initial_position)
        .expect("Unrecoverable error during init");

    // Collect samples
    let mut trace = vec![];
    let mut stats = vec![];

    // Sampling with progress reporting
    println!("Warmup phase:");
    for i in 0..settings.num_tune {
        if i % 100 == 0 {
            println!("\rWarmup: {}/{}", i, settings.num_tune);
        }

        let (draw, info) = sampler.draw().expect("Unrecoverable error during sampling");
        println!("{:?}", info.step_size);
        trace.push(draw);
        stats.push(info);
    }
    println!("\rWarmup: {}/{}", settings.num_tune, settings.num_tune);

    println!("\nSampling phase:");
    for i in 0..settings.num_draws {
        if i % 100 == 0 {
            print!("\rSampling: {}/{}", i, settings.num_draws);
        }

        let (draw, info) = sampler.draw().expect("Unrecoverable error during sampling");
        trace.push(draw);
        stats.push(info);
    }
    println!("\rSampling: {}/{}", settings.num_draws, settings.num_draws);

    // Calculate mean of samples (post-warmup)
    let warmup_samples = settings.num_tune as usize;
    let mut means = vec![0.0; 10];

    for i in warmup_samples..trace.len() {
        for (j, mean) in means.iter_mut().enumerate() {
            *mean += trace[i][j];
        }
    }

    for mean in &mut means {
        *mean /= settings.num_draws as f64;
    }

    // Print results
    println!("\nResults after {} samples:", settings.num_draws);
    println!("Target mean: 3.0 for all dimensions");
    println!("Estimated means:");
    for (i, mean) in means.iter().enumerate() {
        println!("Dimension {}: {:.4}", i, mean);
    }

    // Print adaptation statistics
    let last_stats = &stats[stats.len() - 1];
    println!("\nFinal adaptation statistics:");
    println!("Step size: {:.6}", last_stats.step_size);
    // Note: the full acceptance stats are in the Progress struct, but we don't
    // have direct access to mean_tree_accept
    println!("Number of steps: {}", last_stats.num_steps);

    println!("\nSampling completed successfully!");
}
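
Since this file lives under examples/, it can be run with cargo run --example adam_adaptation. Switching the same program back to the previous behavior should only require changing the method field. A minimal sketch, assuming the dual-averaging variant of StepSizeAdaptMethod is named DualAverage (that name does not appear in this diff):

// Hypothetical: select dual averaging instead of Adam. Only the variant
// name is assumed; the field path is the one used in main() above.
settings
    .adapt_options
    .step_size_settings
    .adapt_options
    .method = StepSizeAdaptMethod::DualAverage;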

src/adapt_strategy.rs

Lines changed: 5 additions & 5 deletions
@@ -14,10 +14,10 @@ use crate::{
     sampler::Settings,
     sampler_stats::{SamplerStats, StatTraceBuilder},
     state::State,
-    stepsize::AcceptanceRateCollector,
     stepsize_adapt::{
-        DualAverageSettings, StatsBuilder as StepSizeStatsBuilder, Strategy as StepSizeStrategy,
+        StatsBuilder as StepSizeStatsBuilder, StepSizeSettings, Strategy as StepSizeStrategy,
     },
+    stepsize_dual_avg::AcceptanceRateCollector,
     NutsError,
 };
@@ -38,7 +38,7 @@ pub struct GlobalStrategy<M: Math, A: MassMatrixAdaptStrategy<M>> {
 
 #[derive(Debug, Clone, Copy)]
 pub struct EuclideanAdaptOptions<S: Debug + Default> {
-    pub dual_average_options: DualAverageSettings,
+    pub step_size_settings: StepSizeSettings,
     pub mass_matrix_options: S,
     pub early_window: f64,
     pub step_size_window: f64,
@@ -50,7 +50,7 @@ impl<S: Debug + Default> Default for EuclideanAdaptOptions<S> {
 impl<S: Debug + Default> Default for EuclideanAdaptOptions<S> {
     fn default() -> Self {
         Self {
-            dual_average_options: DualAverageSettings::default(),
+            step_size_settings: StepSizeSettings::default(),
             mass_matrix_options: S::default(),
             early_window: 0.3,
             step_size_window: 0.15,
@@ -97,7 +97,7 @@ impl<M: Math, A: MassMatrixAdaptStrategy<M>> AdaptStrategy<M> for GlobalStrategy
         assert!(early_end < num_tune);
 
         Self {
-            step_size: StepSizeStrategy::new(options.dual_average_options),
+            step_size: StepSizeStrategy::new(options.step_size_settings),
             mass_matrix: A::new(math, options.mass_matrix_options, num_tune, chain),
             options,
             num_tune,
src/lib.rs

Lines changed: 4 additions & 2 deletions
@@ -98,8 +98,9 @@ mod nuts;
 mod sampler;
 mod sampler_stats;
 mod state;
-mod stepsize;
+mod stepsize_adam;
 mod stepsize_adapt;
+mod stepsize_dual_avg;
 mod transform_adapt_strategy;
 mod transformed_hamiltonian;
@@ -117,5 +118,6 @@ pub use sampler::{
 
 pub use low_rank_mass_matrix::LowRankSettings;
 pub use mass_matrix_adapt::DiagAdaptExpSettings;
-pub use stepsize_adapt::DualAverageSettings;
+pub use stepsize_adam::AdamOptions;
+pub use stepsize_adapt::{StepSizeAdaptMethod, StepSizeAdaptOptions, StepSizeSettings};
 pub use transform_adapt_strategy::TransformedSettings;
src/mass_matrix.rs

Lines changed: 0 additions & 4 deletions
@@ -24,10 +24,6 @@ pub trait MassMatrix<M: Math>: SamplerStats<M> {
     );
 }
 
-pub struct NullCollector {}
-
-impl<M: Math, P: Point<M>> Collector<M, P> for NullCollector {}
-
 #[derive(Debug)]
 pub struct DiagMassMatrix<M: Math> {
     inv_stds: M::Vector,
src/stepsize_adam.rs

Lines changed: 110 additions & 0 deletions
//! Adam optimizer for step size adaptation.
//!
//! This implements a single-parameter version of the Adam optimizer
//! for adapting the step size in the NUTS algorithm. Unlike dual averaging,
//! Adam maintains both first and second moment estimates of gradients,
//! which can potentially lead to better adaptation in some scenarios.

use std::f64;

/// Settings for Adam step size adaptation
#[derive(Debug, Clone, Copy)]
pub struct AdamOptions {
    /// First moment decay rate (default: 0.9)
    pub beta1: f64,
    /// Second moment decay rate (default: 0.999)
    pub beta2: f64,
    /// Small constant for numerical stability (default: 1e-8)
    pub epsilon: f64,
    /// Learning rate (default: 0.05)
    pub learning_rate: f64,
}

impl Default for AdamOptions {
    fn default() -> Self {
        Self {
            beta1: 0.9,
            beta2: 0.999,
            epsilon: 1e-8,
            learning_rate: 0.05,
        }
    }
}

/// Adam optimizer for step size adaptation.
///
/// This implements the Adam optimizer for a single parameter (the step size).
/// The adaptation takes the acceptance probability statistic and adjusts
/// the step size to reach the target acceptance rate.
#[derive(Clone)]
pub struct Adam {
    /// Current log step size
    log_step: f64,
    /// First moment estimate
    m: f64,
    /// Second moment estimate
    v: f64,
    /// Iteration counter
    t: u64,
    /// Adam settings
    settings: AdamOptions,
}

impl Adam {
    /// Create a new Adam optimizer with given settings and initial step size
    pub fn new(settings: AdamOptions, initial_step: f64) -> Self {
        Self {
            log_step: initial_step.ln(),
            m: 0.0,
            v: 0.0,
            t: 0,
            settings,
        }
    }

    /// Advance the optimizer by one step using the current acceptance statistic
    ///
    /// This updates the step size to move towards the target acceptance rate.
    /// The error signal is the difference between the current and target
    /// acceptance rates.
    pub fn advance(&mut self, accept_stat: f64, target: f64) {
        // We want to minimize (target - accept_stat)², whose gradient with
        // respect to the acceptance rate is -2 * (target - accept_stat).
        // Dropping the constant factor, we use (accept_stat - target) as the
        // update signal.
        let gradient = accept_stat - target;

        // Increment timestep
        self.t += 1;

        // Update biased first moment estimate
        self.m = self.settings.beta1 * self.m + (1.0 - self.settings.beta1) * gradient;

        // Update biased second moment estimate
        self.v = self.settings.beta2 * self.v + (1.0 - self.settings.beta2) * gradient * gradient;

        // Compute bias-corrected first moment estimate
        let m_hat = self.m / (1.0 - self.settings.beta1.powi(self.t as i32));

        // Compute bias-corrected second moment estimate
        let v_hat = self.v / (1.0 - self.settings.beta2.powi(self.t as i32));

        // Update log step size.
        // If accept_stat > target the signal is positive and the step size
        // grows (we can afford larger steps); if accept_stat < target the
        // step size shrinks.
        self.log_step +=
            self.settings.learning_rate * m_hat / (v_hat.sqrt() + self.settings.epsilon);
    }

    /// Get the current step size
    pub fn current_step_size(&self) -> f64 {
        self.log_step.exp()
    }

    /// Reset the optimizer with a new initial step size and bias factor
    #[allow(dead_code)]
    pub fn reset(&mut self, initial_step: f64, _bias_factor: f64) {
        self.log_step = initial_step.ln();
        self.m = 0.0;
        self.v = 0.0;
        self.t = 0;
    }
}
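
In math form, advance() performs the textbook Adam update on the log step size, with signal $g_t = \text{accept\_stat} - \text{target}$, learning rate $\alpha$, and stability constant $\varepsilon$ (read directly off the code above):

    $m_t = \beta_1 m_{t-1} + (1 - \beta_1) g_t$
    $v_t = \beta_2 v_{t-1} + (1 - \beta_2) g_t^2$
    $\hat m_t = m_t / (1 - \beta_1^t), \quad \hat v_t = v_t / (1 - \beta_2^t)$
    $\log \epsilon_t = \log \epsilon_{t-1} + \alpha \, \hat m_t / (\sqrt{\hat v_t} + \varepsilon)$

A self-contained toy demo (not part of nuts-rs) that replays this update against a made-up acceptance model, where larger step sizes accept less, to show the step size settling near the target rate:

// Standalone sketch of the update in Adam::advance. The acceptance model
// accept = exp(-step) is invented for illustration only.
fn main() {
    let (beta1, beta2, eps, lr) = (0.9, 0.999, 1e-8, 0.05);
    let target = 0.8;
    let (mut log_step, mut m, mut v) = (0.1f64.ln(), 0.0, 0.0);
    for t in 1..=500u64 {
        // Toy stand-in for NUTS: acceptance falls as the step size grows.
        let accept = (-log_step.exp()).exp();
        let g = accept - target;
        // Same update as Adam::advance above.
        m = beta1 * m + (1.0 - beta1) * g;
        v = beta2 * v + (1.0 - beta2) * g * g;
        let m_hat = m / (1.0 - beta1.powi(t as i32));
        let v_hat = v / (1.0 - beta2.powi(t as i32));
        log_step += lr * m_hat / (v_hat.sqrt() + eps);
    }
    // Fixed point of the toy model: exp(-step) = 0.8, i.e. step ≈ 0.223.
    println!("final step size: {:.4}", log_step.exp());
}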
