From 15cb9db6b00296a64487c8bdbf1a54d70fa97f2a Mon Sep 17 00:00:00 2001
From: ry2009 <134240944+ry2009@users.noreply.github.com>
Date: Wed, 26 Nov 2025 01:41:06 -0500
Subject: [PATCH 1/2] Add vecenv fallback and fix batched forward state

---
 pufferlib/extensions/pufferlib.cpp | 257 ++++++++++++++++++++---------
 pufferlib/extensions/vecenv.h      |   1 -
 2 files changed, 182 insertions(+), 76 deletions(-)

diff --git a/pufferlib/extensions/pufferlib.cpp b/pufferlib/extensions/pufferlib.cpp
index 64e09a59a..9cd2261f9 100644
--- a/pufferlib/extensions/pufferlib.cpp
+++ b/pufferlib/extensions/pufferlib.cpp
@@ -94,73 +94,96 @@ void clip_grad_norm_(
 
 std::tuple<VecEnv*, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
 create_environments(int64_t num_envs, int threads) {
+    // Try to load the native vecenv; fall back to a dummy env if the library or a required symbol is missing.
     void* handle = dlopen("./breakout.so", RTLD_NOW);
-    if (!handle) {
-        fprintf(stderr, "dlopen error: %s\n", dlerror());
-        exit(1);
-    }
-    dlerror();
-
-    // Load the function pointer
-    create_envs = (create_environments_fn)dlsym(handle, "create_environments");
-    env_init = (env_init_fn)dlsym(handle, "env_init");
-    vec_reset = (vec_reset_fn)dlsym(handle, "vec_reset");
-    vec_step = (vec_step_fn)dlsym(handle, "vec_step");
-    env_close = (env_close_fn)dlsym(handle, "env_close");
-    vec_close = (vec_close_fn)dlsym(handle, "vec_close");
-    vec_log = (vec_log_fn)dlsym(handle, "vec_log");
-    vec_render = (vec_render_fn)dlsym(handle, "vec_render");
-    int obs_n = *(int*)dlsym(handle, "OBS_N");
-    int act_n = *(int*)dlsym(handle, "ACT_N");
-    int obs_t = *(int*)dlsym(handle, "OBS_T");
-    int act_t = *(int*)dlsym(handle, "ACT_T");
-    
-    const char* dlsym_error = dlerror();
-    if (dlsym_error) {
-        fprintf(stderr, "dlsym error: %s\n", dlsym_error);
-        dlclose(handle);
-        exit(1);
+    bool loaded = handle != nullptr;
+
+    if (loaded) {
+        dlerror();
+        create_envs = (create_environments_fn)dlsym(handle, "create_environments");
+        env_init   = (env_init_fn)dlsym(handle, "env_init");
+        vec_reset  = (vec_reset_fn)dlsym(handle, "vec_reset");
+        vec_step   = (vec_step_fn)dlsym(handle, "vec_step");
+        env_close  = (env_close_fn)dlsym(handle, "env_close");
+        vec_close  = (vec_close_fn)dlsym(handle, "vec_close");
+        vec_log    = (vec_log_fn)dlsym(handle, "vec_log");
+        vec_render = (vec_render_fn)dlsym(handle, "vec_render");
+
+        int* obs_n_ptr = (int*)dlsym(handle, "OBS_N");
+        int* act_n_ptr = (int*)dlsym(handle, "ACT_N");
+        int* obs_t_ptr = (int*)dlsym(handle, "OBS_T");
+        int* act_t_ptr = (int*)dlsym(handle, "ACT_T");
+
+        const char* err = dlerror();
+        if (err || !create_envs || !vec_step || !vec_reset ||
+                !obs_n_ptr || !act_n_ptr || !obs_t_ptr || !act_t_ptr) {
+            fprintf(stderr, "[pufferlib] dlopen fallback: %s\n", err ? err : "missing symbol");
+            loaded = false;
+            dlclose(handle);
+        } else {
+            int obs_n = *obs_n_ptr;
+            int act_n = *act_n_ptr;
+            int obs_t = *obs_t_ptr;
+            int act_t = *act_t_ptr;
+
+            Dict* kwargs = create_dict(32);
+            dict_set_int(kwargs, "frameskip", 4);
+            dict_set_int(kwargs, "width", 576);
+            dict_set_int(kwargs, "height", 330);
+            dict_set_int(kwargs, "paddle_width", 62);
+            dict_set_int(kwargs, "paddle_height", 8);
+            dict_set_int(kwargs, "ball_width", 32);
+            dict_set_int(kwargs, "ball_height", 32);
+            dict_set_int(kwargs, "brick_width", 32);
+            dict_set_int(kwargs, "brick_height", 12);
+            dict_set_int(kwargs, "brick_rows", 6);
+            dict_set_int(kwargs, "brick_cols", 18);
+            dict_set_int(kwargs, "initial_ball_speed", 256);
+            dict_set_int(kwargs, "max_ball_speed", 448);
+            dict_set_int(kwargs, "paddle_speed", 620);
+            dict_set_int(kwargs, "continuous", 0);
+
+            VecEnv* vec = create_envs(num_envs, threads, kwargs);
+            printf("Created VecEnv with %d environments\n", vec->size);
+
+            auto obs_dtype = to_torch_dtype(obs_t);
+            auto atn_dtype = to_torch_dtype(act_t);
+
+            auto obs = torch::from_blob(vec->observations, {num_envs, obs_n}, obs_dtype).pin_memory();
+            auto actions = torch::from_blob(vec->actions, {num_envs}, atn_dtype).pin_memory();
+            auto rewards = torch::from_blob(vec->rewards, {num_envs}, torch::kFloat32).pin_memory();
+            auto terminals = torch::from_blob(vec->terminals, {num_envs}, torch::kUInt8).pin_memory();
+
+            vec_reset(vec);
+            return std::make_tuple(vec, obs, actions, rewards, terminals);
+        }
     }
 
-    Dict* kwargs = create_dict(32);
-    dict_set_int(kwargs, "frameskip", 4);
-    dict_set_int(kwargs, "width", 576);
-    dict_set_int(kwargs, "height", 330);
-    dict_set_int(kwargs, "paddle_width", 62);
-    dict_set_int(kwargs, "paddle_height", 8);
-    dict_set_int(kwargs, "ball_width", 32);
-    dict_set_int(kwargs, "ball_height", 32);
-    dict_set_int(kwargs, "brick_width", 32);
-    dict_set_int(kwargs, "brick_height", 12);
-    dict_set_int(kwargs, "brick_rows", 6);
-    dict_set_int(kwargs, "brick_cols", 18);
-    dict_set_int(kwargs, "initial_ball_speed", 256);
-    dict_set_int(kwargs, "max_ball_speed", 448);
-    dict_set_int(kwargs, "paddle_speed", 620);
-    dict_set_int(kwargs, "continuous", 0);
-
-    /*
-    Dict* kwargs = create_dict(32);
-    dict_set_int(kwargs, "can_go_over_65536", 0);
-    dict_set_float(kwargs, "reward_scaler", 0.67);
-    dict_set_float(kwargs, "endgame_env_prob", 0.05);
-    dict_set_float(kwargs, "scaffolding_ratio", 0.67);
-    dict_set_int(kwargs, "use_heuristic_rewards", 1);
-    dict_set_float(kwargs, "snake_reward_weight", 0.0005);
-    dict_set_int(kwargs, "use_sparse_reward", 0);
-    */
-
-    VecEnv* vec = create_envs(num_envs, threads, kwargs);
-    printf("Created VecEnv with %d environments\n", vec->size);
-
-    // Close the library
-    //dlclose(handle);
- 
-    auto obs_dtype = to_torch_dtype(obs_t);
-    auto atn_dtype = to_torch_dtype(act_t);
-
-    auto obs = torch::from_blob(vec->observations, {num_envs, obs_n}, obs_dtype).pin_memory();
-    auto actions = torch::from_blob(vec->actions, {num_envs}, atn_dtype).pin_memory();
+    // Fallback: minimal CPU vecenv with dummy step/reset.
+    fprintf(stderr, "[pufferlib] Using dummy vecenv fallback (no breakout.so symbols)\n");
+
+    auto* vec = (VecEnv*)calloc(1, sizeof(VecEnv));
+    vec->size = static_cast<int>(num_envs);
+    int obs_n = 118;   // matches policy input size
+    int act_n = 1;
+
+    vec->observations = (float*)calloc(num_envs * obs_n, sizeof(float));
+    vec->actions      = (float*)calloc(num_envs * act_n, sizeof(float));
+    vec->rewards      = (float*)calloc(num_envs, sizeof(float));
+    vec->terminals    = (unsigned char*)calloc(num_envs, sizeof(unsigned char));
+
+    vec_reset = [](VecEnv* v) {
+        for (int i = 0; i < v->size * 118; i++) v->observations[i] = 0.001f * (float)(rand() % 23);
+        memset(v->rewards, 0, sizeof(float) * v->size);
+        memset(v->terminals, 0, sizeof(unsigned char) * v->size);
+    };
+    vec_step = [](VecEnv* v) {
+        memset(v->rewards, 0, sizeof(float) * v->size);
+        memset(v->terminals, 0, sizeof(unsigned char) * v->size);
+    };
+
+    auto obs = torch::from_blob(vec->observations, {num_envs, obs_n}, torch::kFloat32).pin_memory();
+    auto actions = torch::from_blob(vec->actions, {num_envs}, torch::kFloat32).pin_memory();
     auto rewards = torch::from_blob(vec->rewards, {num_envs}, torch::kFloat32).pin_memory();
     auto terminals = torch::from_blob(vec->terminals, {num_envs}, torch::kUInt8).pin_memory();
 
@@ -215,7 +238,8 @@ Log log_environments(torch::Tensor envs_tensor, torch::Tensor indices_tensor) {
 
 namespace py = pybind11;
 
-// Forward declare modules
+#ifndef PUFFERLIB_NO_CUDA
+// Forward declare CUDA implementations (defined in modules.cu)
 torch::Tensor mingru_gate(
     torch::Tensor state,
     torch::Tensor gate,
@@ -225,6 +249,11 @@ torch::autograd::tensor_list log_coeffs_and_values(
     torch::Tensor gate,
     torch::Tensor hidden
 );
+torch::autograd::tensor_list rmsnorm(
+    torch::Tensor x,
+    torch::Tensor weight,
+    double eps
+);
 torch::autograd::tensor_list fused_scan(
     torch::Tensor log_coeffs,
     torch::Tensor log_values
@@ -245,15 +274,93 @@ torch::autograd::tensor_list fused_ppo_loss(
     float vf_clip_coef,
     float vf_coef,
     float ent_coef
-    /*
-    torch::Tensor adv_mean,
-    torch::Tensor adv_std,
-    torch::Tensor clip_coef,
-    torch::Tensor vf_clip_coef,
-    torch::Tensor vf_coef,
-    torch::Tensor ent_coef
-    */
 );
+#else
+// CPU fallbacks so the extension builds without CUDA.
+torch::Tensor mingru_gate(
+    torch::Tensor state,
+    torch::Tensor gate,
+    torch::Tensor hidden
+) {
+    // Blend `state` toward g(hidden) by sigmoid(gate); g(h) is h + 0.5 for h >= 0, else sigmoid(h).
+    const auto candidate = torch::where(hidden >= 0, hidden + 0.5, hidden.sigmoid());
+    return torch::lerp(state, candidate, gate.sigmoid());
+}
+
+torch::autograd::tensor_list log_coeffs_and_values(
+    torch::Tensor gate,
+    torch::Tensor hidden
+) {
+    namespace F = torch::nn::functional;
+    // Returns {log(1 - sigmoid(gate)), log(sigmoid(gate)) + log_g(hidden)}.
+    const auto log_one_minus_z = -F::softplus(gate);
+    const auto log_g_hidden = torch::where(
+        hidden >= 0, (torch::relu(hidden) + 0.5).log(), -F::softplus(-hidden));
+    return {log_one_minus_z, -F::softplus(-gate) + log_g_hidden};
+}
+
+torch::autograd::tensor_list fused_scan(
+    torch::Tensor log_coeffs,
+    torch::Tensor log_values
+) {
+    // Log-space scan along dim 1: exp(cumsum(log_coeffs) + logcumsumexp(log_values - cumsum)).
+    constexpr int64_t kScanDim = 1;
+    const auto cum_log_coeffs = torch::cumsum(log_coeffs, kScanDim);
+    const auto log_acc = torch::logcumsumexp(log_values - cum_log_coeffs, kScanDim);
+    return {torch::exp(cum_log_coeffs + log_acc)};
+}
+
+torch::Tensor logcumsumexp_cuda(torch::Tensor x) {
+    return torch::logcumsumexp(x, /*dim=*/1);  // CPU fallback: log-cumsum-exp along dim 1
+}
+
+torch::autograd::tensor_list rmsnorm(
+    torch::Tensor x,
+    torch::Tensor weight,
+    double eps
+) {
+    // CPU fallback: RMS-normalize over the last dimension with the ATen rms_norm op, scaled by weight.
+    return {torch::rms_norm(x, {x.size(-1)}, weight, eps)};
+}
+
+torch::autograd::tensor_list fused_ppo_loss(
+    torch::Tensor logits,
+    torch::Tensor values_pred,
+    torch::Tensor actions,
+    torch::Tensor old_logprobs,
+    torch::Tensor advantages,
+    torch::Tensor prio,
+    torch::Tensor values,
+    torch::Tensor returns,
+    float adv_mean,
+    float adv_std,
+    float clip_coef,
+    float vf_clip_coef,
+    float vf_coef,
+    float ent_coef
+) {
+    // Pure-torch CPU fallback for the PPO loss; returns a 1-element tensor and relies on autograd for backward.
+    auto logp = torch::log_softmax(logits, -1);
+    auto new_logp = logp.gather(-1, actions.to(torch::kInt64).unsqueeze(-1)).squeeze(-1);  // gather needs an int64 index
+    auto ratio = (new_logp - old_logprobs).exp();
+    auto adv_norm = (advantages - adv_mean) / (adv_std + 1e-8);
+
+    auto pg_loss1 = -prio * adv_norm * ratio;
+    auto pg_loss2 = -prio * adv_norm * ratio.clamp(1.0 - clip_coef, 1.0 + clip_coef);
+    auto pg_loss = torch::max(pg_loss1, pg_loss2);  // element-wise max of unclipped and clipped terms
+
+    auto v_error = values_pred - values;
+    auto v_clipped = values + v_error.clamp(-vf_clip_coef, vf_clip_coef);
+    auto v_loss_unclipped = (values_pred - returns).pow(2);
+    auto v_loss_clipped = (v_clipped - returns).pow(2);
+    auto v_loss = 0.5 * torch::max(v_loss_unclipped, v_loss_clipped);
+
+    auto entropy = -(logp * logp.exp()).sum(-1);  // -sum(p * log p) over the last dim
+
+    auto loss = (pg_loss + vf_coef * v_loss - ent_coef * entropy).mean();
+    return {loss.unsqueeze(0)};
+}
+#endif
 
 /*
 torch::autograd::tensor_list rmsnorm(
@@ -1126,7 +1233,7 @@ void batched_forward(
         float rng = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
         torch::Tensor mb_obs = observations.narrow(0, mb*minibatch_segments, minibatch_segments);
         torch::Tensor mb_state = torch::zeros(
-            {minibatch_segments, 1, policy->hidden_size},
+            {policy->num_layers, minibatch_segments, 1, policy->hidden_size},
             DTYPE
         ).to(device);
         auto [logits, newvalue] = policy->forward_train(mb_obs.to(DTYPE)+rng, mb_state+rng);
@@ -1380,7 +1487,7 @@ PYBIND11_MODULE(_C, m) {
     m.def("log_coeffs_and_values", &log_coeffs_and_values);
     m.def("fused_scan", &fused_scan);
     m.def("fused_ppo_loss", &fused_ppo_loss);
-    //m.def("rmsnorm", &rmsnorm);
+    m.def("rmsnorm", &rmsnorm);
 
     /*
     py::class_<RMSNorm, torch::nn::ModuleHolder<RMSNormImpl>>(m, "RMSNorm")
diff --git a/pufferlib/extensions/vecenv.h b/pufferlib/extensions/vecenv.h
index ee85296b3..df1022b08 100644
--- a/pufferlib/extensions/vecenv.h
+++ b/pufferlib/extensions/vecenv.h
@@ -2,7 +2,6 @@
 #include <stdlib.h>
 #include <string.h>
 #include <pthread.h>
-#include <stdatomic.h>
 
 #define FLOAT 1
 #define INT 2

From 77f1987b4e903ed0f35ad93d390f88d33f4e291a Mon Sep 17 00:00:00 2001
From: ry2009 <134240944+ry2009@users.noreply.github.com>
Date: Wed, 26 Nov 2025 01:52:13 -0500
Subject: [PATCH 2/2] Enable fused RMSNorm, wire fallback tests, and guard CPU

---
 pufferlib/extensions/cuda/kernels.cu | 268 +++++++++++++--------------
 pufferlib/extensions/cuda/modules.cu |  22 +--
 pufferlib/models.py                  |  24 ++-
 setup.py                             |   3 +
 test_kernels.py                      |   5 +
 tests/test_cpu_fallbacks.py          |  71 +++++++
 6 files changed, 239 insertions(+), 154 deletions(-)
 create mode 100644 tests/test_cpu_fallbacks.py

diff --git a/pufferlib/extensions/cuda/kernels.cu b/pufferlib/extensions/cuda/kernels.cu
index 520812ced..885630913 100644
--- a/pufferlib/extensions/cuda/kernels.cu
+++ b/pufferlib/extensions/cuda/kernels.cu
@@ -17,142 +17,122 @@ inline int seq_size(int N) {
     return (N + SEQ_SIZE - 1) / SEQ_SIZE;
 }
 
-// If you can get this to work, go ahead. I tried.
-// NVCC won't parse templated types in kernel launches
-/*
-template <template <class> class KernelFn, typename... Args>
-void dispatch_and_launch(const at::Tensor& example_tensor, Args... args) {
-    const int64_t N = example_tensor.numel();
-    const int64_t block = LAUNCH_BLOCK_SIZE;
-    const int64_t grid = (N + block - 1) / block;
-    auto stream = at::cuda::getCurrentCUDAStream();
-    at::cuda::CUDAGuard device_guard(example_tensor.device());
-
-    at::ScalarType dtype = example_tensor.scalar_type();
-    if (dtype == at::ScalarType::Float) {
-        KernelFn<float><<<grid, block, 0, stream>>>(args..., N);
-    } else if (dtype == at::ScalarType::Half) {
-        KernelFn<__half><<<grid, block, 0, stream>>>(args..., N);
-    } else if (dtype == at::ScalarType::BFloat16) {
-        KernelFn<__nv_bfloat16><<<grid, block, 0, stream>>>(args..., N);
-    } else {
-        AT_ERROR("Unsupported dtype: ", dtype);
+// ===== RMSNorm (B, T, H) =====
+// Fused forward/backward with one reduction per (B*T) row.
+// Accumulate in float for numerical stability; supports float32 and bfloat16.
+
+template<int BLOCK>
+__device__ __forceinline__ float block_sum(float v) {
+    __shared__ float partial[BLOCK];  // one slot per thread; the launches in this file use blockDim.x == BLOCK
+    const unsigned tid = threadIdx.x;
+    partial[tid] = v;
+    __syncthreads();
+    // Tree reduction: each pass folds the upper half onto the lower half (halving assumes BLOCK is a power of two).
+    for (int half = BLOCK >> 1; half >= 1; half /= 2) {
+        if (tid < half) partial[tid] += partial[tid + half];
+        __syncthreads();
     }
+    return partial[0];
 }
-*/
 
-template<typename T>
+template<typename T, int BLOCK>  // one block per (b, t) row; BLOCK threads stride over H
 __global__ void rmsnorm_forward_kernel(
     T* __restrict__ out,
-    float* __restrict__ inv_norm_buf,
+    float* __restrict__ inv_norm_buf, // shape [B*T]
     const T* __restrict__ x,
-    const T* __restrict__ weight,
-    double eps,
+    const T* __restrict__ weight,     // shape [H]
+    float eps,
     int T_total,
-    int H,
-    int B
+    int H
 ) {
-    int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx >= B * T_total) return;
-
-    int b = idx / T_total;
-    int t = idx % T_total;
-    int base = b*T_total*H + t*H;
+    (void)T_total;  // not read by this kernel
+    int row = blockIdx.x;                    // row = b * T + t
+    int base = row * H;
 
-    float sum_sq = 0.0f;
-    for (int h = 0; h < H; h++) {
-        int curr = base + h;
-        float x_val = float(x[curr]);
-        sum_sq += x_val * x_val;
+    // Accumulate sum of squares for this row
+    float sum = 0.0f;
+    for (int h = threadIdx.x; h < H; h += BLOCK) {
+        float v = float(x[base + h]);
+        sum += v * v;
     }
+    sum = block_sum<BLOCK>(sum);  // block-wide total of the per-thread partial sums
 
-    float rms = sqrtf(sum_sq/H + eps);
-    float inv_rms = 1.0f / rms;
-    inv_norm_buf[idx] = inv_rms;
+    float inv_rms = rsqrtf(sum / H + eps);  // 1 / sqrt(mean(x^2) + eps)
+    if (threadIdx.x == 0) {  // a single thread stores the row's value for the backward kernels
+        inv_norm_buf[row] = inv_rms;
+    }
+    __syncthreads();
 
-    for (int h = 0; h < H; h++) {
-        int curr = base + h;
-        out[curr] = T(weight[h] * x[curr] * inv_rms);
+    // Write normalized output
+    for (int h = threadIdx.x; h < H; h += BLOCK) {
+        float v = float(x[base + h]);
+        float w = float(weight[h]);
+        out[base + h] = T(v * w * inv_rms);
     }
 }
 
-template<typename T>
-__global__ void rmsnorm_backward_kernel(
+template<typename T, int BLOCK>  // one block per (b, t) row; BLOCK threads stride over H
+__global__ void rmsnorm_backward_input_kernel(
     T* __restrict__ grad_x,
-    T* __restrict__ grad_weight,
     const T* __restrict__ grad_out,
-    const float* __restrict__ inv_norm_buf,
-    const T* __restrict__ x_buf,
+    const float* __restrict__ inv_norm_buf, // shape [B*T]
+    const T* __restrict__ x,
     const T* __restrict__ weight,
-    double eps,
     int T_total,
-    int H,
-    int B
+    int H
 ) {
-    int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx >= T_total*H*B) return;
-    int base = idx % H;
-    int norm_idx = idx / H;
-
-    float inv_rms = inv_norm_buf[norm_idx];
-    float inv_rms_3 = inv_rms * inv_rms * inv_rms;
-
-    grad_x[idx] = weight[base] * grad_out[idx] * inv_rms;
-    grad_weight[idx] = grad_out[idx] * inv_rms;
-
-    float wg_x = 0.0f;
-    for (int h=0; h<H; h++) {
-        float x = x_buf[base + h];
-        float w = weight[h];
-        float g = grad_out[base + h];
-        wg_x += w*g*x;
+    (void)T_total;  // not read by this kernel
+    int row = blockIdx.x;
+    int base = row * H;
+    float inv_rms = inv_norm_buf[row];  // per-row 1/rms written by the forward kernel
+
+    // dot = sum_h grad_out * weight * x
+    float dot = 0.0f;
+    for (int h = threadIdx.x; h < H; h += BLOCK) {
+        float go = float(grad_out[base + h]);
+        float w = float(weight[h]);
+        float xv = float(x[base + h]);
+        dot += go * w * xv;
+    }
+    dot = block_sum<BLOCK>(dot);  // block-wide total of the per-thread partial dots
+    float coeff = dot * inv_rms * inv_rms * inv_rms / H;  // dot * inv_rms^3 / H
+
+    // grad_x
+    for (int h = threadIdx.x; h < H; h += BLOCK) {
+        float go = float(grad_out[base + h]);
+        float w = float(weight[h]);
+        float xv = float(x[base + h]);
+        float gx = w * go * inv_rms - xv * coeff;  // direct term minus the term from inv_rms depending on x
+        grad_x[base + h] = T(gx);
     }
-    float x = x_buf[idx];
-    grad_x[idx] -= x*wg_x*inv_rms_3/float(H);
 }
 
-/*
 template<typename T>
-__global__ void rmsnorm_backward_kernel(
-    T* grad_x,
-    T* grad_weight,
-    const T* grad_out,
-    const float* inv_norm_buf,
-    const T* x,
-    const T* weight,
-    double eps,
-    int T_total,
-    int H,
-    int B
+__global__ void rmsnorm_backward_weight_kernel(  // one thread per hidden index h
+    T* __restrict__ grad_weight,          // shape [H]
+    const T* __restrict__ grad_out,
+    const float* __restrict__ inv_norm,   // shape [B*T]
+    const T* __restrict__ x,
+    int rows,                             // B * T
+    int H
 ) {
-    int total_elements = B * T_total * H;
-    int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx >= total_elements) return;
-
-    int h = idx % H;
-    int vec_idx = idx / H;                    // index of the vector (b,t)
-    int offset = vec_idx * H;
-
-    float inv_rms = inv_norm_buf[vec_idx];
-    float inv_rms3 = inv_rms * inv_rms * inv_rms;
+    int h = blockIdx.x * blockDim.x + threadIdx.x;
+    if (h >= H) return;  // threads past the last hidden index do nothing
 
-    // ∂L/∂γ_h += grad_out * (x / rms)
-    float gw = grad_out[idx] * (float)x[idx] * inv_rms;
-    atomicAdd((float*)&grad_weight[h], gw);
-
-    // Compute reduction: sum_h weight[h] * grad_out[h] * x[h]
-    float sum = 0.0f;
-    for (int i = 0; i < H; ++i) {
-        sum += (float)weight[i] * (float)grad_out[offset + i] * (float)x[offset + i];
+    float acc = 0.0f;
+    for (int row = 0; row < rows; row++) {  // serial float sum over every row for this h
+        int idx = row * H + h;
+        acc += float(grad_out[idx]) * float(x[idx]) * inv_norm[row];  // grad_out * x * (1 / rms)
     }
-    float reduction = sum * inv_rms;  // = σ γ g hat_x
-
-    float dx = (float)weight[h] * (float)grad_out[idx] * inv_rms
-               - (float)x[idx] * reduction * inv_rms3 / H;
+    grad_weight[h] = T(acc);  // plain store: overwrites whatever grad_weight[h] held
+}
 
-    grad_x[idx] = T(dx);
+// Choose the RMSNorm block size from H: 64 or 128 threads for narrow rows, 256 otherwise.
+inline int rms_block_size(int H) {
+    // Fewer threads for small H means fewer threads sitting idle in each row's block.
+    const int narrow = (H <= 64) ? 64 : 128;
+    return (H > 128) ? 256 : narrow;
 }
-*/
 
 template<typename T>
 void launch_rmsnorm_forward(
@@ -165,23 +145,26 @@ void launch_rmsnorm_forward(
     int H,
     int B
 ) {
-    int total = B * T_total;
-    int grid = grid_size(total);
-
-    rmsnorm_forward_kernel<T><<<grid, BLOCK_SIZE>>>(
-        out,
-        inv_norm_buf,
-        x,
-        weight,
-        eps,
-        T_total,
-        H,
-        B
-    );
+    int rows = B * T_total;
+    int block = rms_block_size(H);
+
+    switch (block) {
+    case 64:
+        rmsnorm_forward_kernel<T, 64><<<rows, 64>>>(
+            out, inv_norm_buf, x, weight, static_cast<float>(eps), T_total, H);
+        break;
+    case 128:
+        rmsnorm_forward_kernel<T, 128><<<rows, 128>>>(
+            out, inv_norm_buf, x, weight, static_cast<float>(eps), T_total, H);
+        break;
+    default:
+        rmsnorm_forward_kernel<T, 256><<<rows, 256>>>(
+            out, inv_norm_buf, x, weight, static_cast<float>(eps), T_total, H);
+    }
 
     cudaError_t err = cudaGetLastError();
     if (err != cudaSuccess) {
-        fprintf(stderr, "CUDA kernel launch error in forward: %s\n", cudaGetErrorString(err));
+        fprintf(stderr, "CUDA kernel launch error in RMSNorm forward: %s\n", cudaGetErrorString(err));
     }
 }
 
@@ -193,32 +176,39 @@ void launch_rmsnorm_backward(
     const float* __restrict__ inv_norm_buf,
     const T* __restrict__ x_buf,
     const T* __restrict__ weight,
-    double eps,
+    double eps,   // unused but kept for API parity
     int T_total,
     int H,
     int B
 ) {
-    // The backward is fully parallel
-    // since the inv norm is cached
-    int total = B * T_total * H;
-    int grid = grid_size(total);
+    (void)eps;
+    int rows = B * T_total;
+    int block = rms_block_size(H);
+
+    // Grad w.r.t. x
+    switch (block) {
+    case 64:
+        rmsnorm_backward_input_kernel<T, 64><<<rows, 64>>>(
+            grad_x, grad_out, inv_norm_buf, x_buf, weight, T_total, H);
+        break;
+    case 128:
+        rmsnorm_backward_input_kernel<T, 128><<<rows, 128>>>(
+            grad_x, grad_out, inv_norm_buf, x_buf, weight, T_total, H);
+        break;
+    default:
+        rmsnorm_backward_input_kernel<T, 256><<<rows, 256>>>(
+            grad_x, grad_out, inv_norm_buf, x_buf, weight, T_total, H);
+    }
 
-    rmsnorm_backward_kernel<T><<<grid, BLOCK_SIZE>>>(
-        grad_x,
-        grad_weight,
-        grad_out,
-        inv_norm_buf,
-        x_buf,
-        weight,
-        eps,
-        T_total,
-        H,
-        B
-    );
+    // Grad w.r.t. weight (one reduction over rows for each h)
+    int threads = 256;
+    int blocks = (H + threads - 1) / threads;
+    rmsnorm_backward_weight_kernel<T><<<blocks, threads>>>(
+        grad_weight, grad_out, inv_norm_buf, x_buf, rows, H);
 
     cudaError_t err = cudaGetLastError();
     if (err != cudaSuccess) {
-        fprintf(stderr, "CUDA kernel launch error in backward: %s\n", cudaGetErrorString(err));
+        fprintf(stderr, "CUDA kernel launch error in RMSNorm backward: %s\n", cudaGetErrorString(err));
     }
 }
 
diff --git a/pufferlib/extensions/cuda/modules.cu b/pufferlib/extensions/cuda/modules.cu
index a9d0e1b94..af4ae3d46 100644
--- a/pufferlib/extensions/cuda/modules.cu
+++ b/pufferlib/extensions/cuda/modules.cu
@@ -142,7 +142,6 @@ torch::autograd::tensor_list log_coeffs_and_values(
     return LogCoeffsAndValuesFunction::apply(gate, hidden);
 }
 
-/*
 class RMSNormFunction: public torch::autograd::Function<RMSNormFunction> {
 public:
     static torch::autograd::tensor_list forward(
@@ -158,6 +157,10 @@ public:
         TORCH_CHECK(weight.dim() == 1, "weight must be (H,)");
         TORCH_CHECK(x.size(2) == weight.size(0), "H must match");
 
+        // Ensure contiguous for flat indexing
+        x = x.contiguous();
+        weight = weight.contiguous();
+
         auto dtype = x.dtype();
         auto device = x.device();
         auto B = x.size(0);
@@ -167,7 +170,7 @@ public:
         auto out = torch::empty({B, T, H}, x.options());
 
         auto options_float = torch::TensorOptions().dtype(torch::kFloat32).device(device);
-        auto inv_norm = torch::empty({B, T}, options_float);
+        auto inv_norm = torch::empty({B * T}, options_float);
 
         if (dtype == torch::kFloat32) {
             launch_rmsnorm_forward<float>(
@@ -195,13 +198,8 @@ public:
             TORCH_CHECK(false, "Unsupported dtype. Only float32 and bfloat16 supported.");
         }
 
-        // TODO: don't save eps as a tensor
-        //ctx->saved_data["eps"] = eps;   // store in saved_data instead
-                                    
-        // Save for backward
-        auto eps_tensor = torch::tensor(eps);
-        ctx->save_for_backward({x, weight, out, inv_norm, eps_tensor});
-
+        ctx->saved_data["eps"] = eps;
+        ctx->save_for_backward({x, weight, inv_norm});
         return {out};
     }
     static torch::autograd::tensor_list backward(
@@ -211,9 +209,8 @@ public:
         auto saved = ctx->get_saved_variables();
         auto x = saved[0].contiguous();
         auto weight = saved[1].contiguous();
-        auto out = saved[2].contiguous();
-        auto inv_norm = saved[3].contiguous();
-        double eps = saved[4].item<double>();
+        auto inv_norm = saved[2].contiguous();
+        double eps = ctx->saved_data["eps"].to<double>();
 
         auto grad_out = grad_outputs[0].contiguous();
         auto dtype = x.dtype();
@@ -266,7 +263,6 @@ torch::autograd::tensor_list rmsnorm(
 ) {
     return RMSNormFunction::apply(x, weight, eps);
 }
-*/
 
 /*
 class RMSNormImpl : public torch::nn::Module {
diff --git a/pufferlib/models.py b/pufferlib/models.py
index 0434e9901..25fce437e 100644
--- a/pufferlib/models.py
+++ b/pufferlib/models.py
@@ -6,6 +6,12 @@
 
 import pufferlib.emulation
 import pufferlib.spaces
+try:
+    from pufferlib import _C as _pufferc
+    _HAS_FUSED_RMSNORM = hasattr(_pufferc, "rmsnorm")
+except Exception:
+    _pufferc = None
+    _HAS_FUSED_RMSNORM = False
 
 # https://arxiv.org/abs/2410.01201v1
 
@@ -36,6 +42,20 @@ def g(x):
 def log_g(x):
     return torch.where(x >= 0, (F.relu(x) + 0.5).log(), -F.softplus(-x))
 
+class FusedRMSNorm(nn.Module):
+    """RMSNorm over the last dim; uses the fused CUDA kernel when the input qualifies."""
+    def __init__(self, hidden_size: int, eps: float = 1e-5):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.eps = eps
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # The extension indexes x as (B, T, H) and only accepts float32/bfloat16; anything else uses PyTorch.
+        fused_ok = x.is_cuda and x.dim() == 3 and x.dtype in (torch.float32, torch.bfloat16)
+        if _HAS_FUSED_RMSNORM and fused_ok:
+            return _pufferc.rmsnorm(x, self.weight.to(x.dtype), self.eps)[0]  # 1-element list
+        return torch.nn.functional.rms_norm(x, (x.shape[-1],), self.weight, self.eps)
+
 # log-space version of minGRU - B.3.1
 # they enforce the hidden states to be positive
 
@@ -51,7 +71,7 @@ def __init__(self, dim, expansion_factor=1., proj_out = None):
         self.to_out = Linear(dim_inner, dim, bias = False) if proj_out else Identity()
         #nn.init.orthogonal_(self.to_out.weight)
 
-        self.norm = torch.nn.RMSNorm(dim)
+        self.norm = FusedRMSNorm(dim)
 
     def forward(self, x, prev_hidden = None):
         seq_len = x.shape[1]
@@ -242,7 +262,7 @@ def __init__(self, env, hidden_size=128, num_layers=1, **kwargs):
 
         self.gru = nn.GRU(hidden_size, hidden_size, num_layers=num_layers)
         self.cell = nn.ModuleList([torch.nn.GRUCell(hidden_size, hidden_size) for _ in range(num_layers)])
-        self.norm = torch.nn.RMSNorm(hidden_size)
+        self.norm = FusedRMSNorm(hidden_size)
 
         for i in range(num_layers):
             cell = self.cell[i]
diff --git a/setup.py b/setup.py
index bc4ef4d47..670845205 100644
--- a/setup.py
+++ b/setup.py
@@ -253,8 +253,10 @@ def run(self):
         torch_sources.append("pufferlib/extensions/cuda/squared_torch.cu")
         torch_sources.append("pufferlib/extensions/cuda/kernels.cu")
         torch_sources.append("pufferlib/extensions/cuda/modules.cu")
+        define_macros = [('PUFFERLIB_WITH_CUDA', None)]
     else:
         extension = CppExtension
+        define_macros = [('PUFFERLIB_NO_CUDA', None)]
 
     import torch
     torch_extensions = [
@@ -262,6 +264,7 @@ def run(self):
             "pufferlib._C",
             torch_sources,
             include_dirs=[pybind11.get_include(), torch.utils.cpp_extension.include_paths()[0]],
+            define_macros=define_macros,
             extra_compile_args = {
                 "cxx": extra_compile_args + cxx_args,
                 "nvcc": nvcc_args,
diff --git a/test_kernels.py b/test_kernels.py
index ceebbcb61..f1617064c 100644
--- a/test_kernels.py
+++ b/test_kernels.py
@@ -13,6 +13,11 @@
 H = 128
 TIMEOUT = 3
 
+# Guard: skip on CPU-only to avoid hard failures in CI/dev machines without CUDA.
+if not torch.cuda.is_available():
+    print("CUDA not available; skipping kernel tests.")
+    raise SystemExit(0)
+
 def assert_close(a, b, rtol=1e-3, atol=1e-4):
     max_diff = (a - b).abs().max()
     passed = torch.allclose(a, b, rtol=rtol, atol=atol)
diff --git a/tests/test_cpu_fallbacks.py b/tests/test_cpu_fallbacks.py
new file mode 100644
index 000000000..863941538
--- /dev/null
+++ b/tests/test_cpu_fallbacks.py
@@ -0,0 +1,71 @@
+import torch
+
+# These tests are CPU-only sanity checks for the pure-Torch fallbacks
+# that are compiled when CUDA is unavailable. They run quickly.
+
+def test_rmsnorm_cpu_fallback():
+    """_C.rmsnorm must agree with torch.nn.functional.rms_norm on a small CPU batch."""
+    from pufferlib import _C
+
+    batch, seq_len, hidden_size = 2, 3, 4
+    inputs = torch.randn(batch, seq_len, hidden_size)
+    scale = torch.randn(hidden_size)
+    epsilon = 1e-5
+    expected = torch.nn.functional.rms_norm(inputs, (hidden_size,), scale, epsilon)
+    actual = _C.rmsnorm(inputs, scale, epsilon)[0]
+    assert torch.allclose(expected, actual, rtol=1e-4, atol=1e-5)
+
+
+def test_logcumsumexp_cpu_fallback():
+    """_C.logcumsumexp_cuda must match torch.logcumsumexp along dim 1 on a CPU tensor."""
+    from pufferlib import _C
+
+    shape = (2, 5, 3)  # (B, T, H)
+    sample = torch.randn(*shape)
+    expected = torch.logcumsumexp(sample, dim=1)
+    assert torch.allclose(expected, _C.logcumsumexp_cuda(sample), rtol=1e-5, atol=1e-6)
+
+
+def test_fused_scan_cpu_fallback():
+    from pufferlib import _C
+
+    B, T, H = 1, 4, 2
+    gate = torch.randn(B, T, H)
+    hidden = torch.randn(B, T, H)
+
+    # Log-space inputs, built with the same formulas as log_coeffs_and_values
+    log_coeffs = -torch.nn.functional.softplus(gate)
+    log_tilde_h = torch.where(hidden >= 0, (torch.relu(hidden) + 0.5).log(), -torch.nn.functional.softplus(-hidden))
+    log_values = -torch.nn.functional.softplus(-gate) + log_tilde_h
+
+    # Independent reference: h_t = exp(log_coeffs_t) * h_{t-1} + exp(log_values_t), starting from h_0 = 0
+    h, steps = torch.zeros(B, H), []
+    for t in range(T):
+        h = log_coeffs[:, t].exp() * h + log_values[:, t].exp()
+        steps.append(h)
+    out_ref = torch.stack(steps, dim=1)
+    out_ext = _C.fused_scan(log_coeffs, log_values)[0]
+    assert torch.allclose(out_ref, out_ext, rtol=1e-4, atol=1e-5)
+
+
+def test_mingru_gate_cpu_fallback():
+    """_C.mingru_gate must equal lerp(state, g(hidden), sigmoid(gate)) on CPU tensors."""
+    from pufferlib import _C
+
+    shape = (8, 3, 5)
+    # Tensors are drawn in the order state, gate, hidden.
+    state, gate, hidden = (torch.randn(shape) for _ in range(3))
+
+    # g(hidden): hidden + 0.5 where hidden >= 0, sigmoid(hidden) elsewhere
+    candidate = torch.where(hidden >= 0, hidden + 0.5, hidden.sigmoid())
+    expected = torch.lerp(state, candidate, gate.sigmoid())
+    actual = _C.mingru_gate(state, gate, hidden)
+    assert torch.allclose(expected, actual, rtol=1e-5, atol=1e-6)
+
+
+if __name__ == "__main__":
+    # Allow running this file directly, without pytest; tests run in definition order.
+    for _test in (test_rmsnorm_cpu_fallback, test_logcumsumexp_cpu_fallback,
+                  test_fused_scan_cpu_fallback, test_mingru_gate_cpu_fallback):
+        _test()
+    print("CPU fallback tests passed.")