7 changes: 6 additions & 1 deletion opacus/__init__.py
@@ -14,13 +14,18 @@
# limitations under the License.

from . import utils
from .grad_sample import GradSampleModule, GradSampleModuleFastGradientClipping
from .grad_sample import (
GradSampleController,
GradSampleModule,
GradSampleModuleFastGradientClipping,
)
from .privacy_engine import PrivacyEngine
from .version import __version__


__all__ = [
"PrivacyEngine",
"GradSampleController",
"GradSampleModule",
"GradSampleModuleFastGradientClipping",
"utils",
105 changes: 69 additions & 36 deletions opacus/grad_sample/README.md
@@ -3,25 +3,54 @@
Computing per sample gradients is an integral part of the Opacus framework. We strive to provide out-of-the-box support for
a wide range of models, while keeping computations efficient.

We currently provide two independent approaches for computing per sample gradients: hooks-based ``GradSampleModule``
(stable implementation, exists since the very first version of Opacus) and ``GradSampleModuleExpandedWeights``
(based on a beta functionality available in PyTorch 1.12).

Each of the two implementations comes with its own set of limitations, and we leave the choice of
which one to use up to the client.

``GradSampleModuleExpandedWeights`` is currently in early beta and can produce unexpected errors, but potentially
improves upon ``GradSampleModule`` on performance and functionality.

**TL;DR:** If you want a stable implementation, use ``GradSampleModule`` (`grad_sample_mode="hooks"`).
If you want to experiment with the new functionality, you have two options. Try
``GradSampleModuleExpandedWeights`` (`grad_sample_mode="ew"`) for better performance, or `grad_sample_mode="functorch"`
if your model is not supported by ``GradSampleModule``.

Please switch back to ``GradSampleModule`` (`grad_sample_mode="hooks"`) if you encounter strange errors or unexpected behaviour.
We'd also appreciate it if you report these to us.

## Hooks-based approach
We currently provide three independent approaches for computing per sample gradients:

1. **Hooks-based `GradSampleModule`** (stable, wraps the model)
2. **`GradSampleController`** (stable, no model wrapping - recommended for transformers)
3. **`GradSampleModuleExpandedWeights`** (beta, based on PyTorch 1.12+ functionality)

Each implementation comes with its own set of limitations and benefits.

**TL;DR** (see the selection sketch below):
- Use `GradSampleModule` (`grad_sample_mode="hooks"`) for a stable implementation with standard models (default)
- Use controller mode (`return_controller=True`) for transformer models and when you need direct model access without wrapping
- Use `GradSampleModuleExpandedWeights` (`grad_sample_mode="ew"`) if you want to experiment with better performance
- Use `grad_sample_mode="functorch"` if your model has unsupported layers

Please report any strange errors or unexpected behaviour to us!
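
To make the choice concrete, here is a minimal selection sketch using `PrivacyEngine.make_private()`. The toy model, optimizer, and data loader below are illustrative assumptions; only the `grad_sample_mode` values come from the list above.

```python
import torch
import torch.nn as nn
from opacus import PrivacyEngine

# Toy training setup (illustrative assumptions)
model = nn.Linear(16, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
dataloader = torch.utils.data.DataLoader(
    torch.utils.data.TensorDataset(torch.randn(64, 16), torch.randint(0, 2, (64,))),
    batch_size=8,
)

privacy_engine = PrivacyEngine()
model, optimizer, dataloader = privacy_engine.make_private(
    module=model,
    optimizer=optimizer,
    data_loader=dataloader,
    noise_multiplier=1.0,
    max_grad_norm=1.0,
    grad_sample_mode="hooks",  # or "ew" / "functorch", as discussed above
)
```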

## Controller-Based Approach (No Model Wrapping)
- Usage: Set `return_controller=True` in `PrivacyEngine.make_private()`
- Controller class: ``opacus.grad_sample.GradSampleController``

**Recommended for transformer models and when model wrapping causes issues.**

Computes per-sample gradients by attaching hooks directly to model parameters without wrapping the model in a
`GradSampleModule`. This approach:

- ✅ Preserves model type (e.g., `isinstance(model, BertModel)` remains `True`)
- ✅ No `_module.` prefix in state_dict
- ✅ Direct access to model attributes (no attribute forwarding needed)
- ✅ Better compatibility with HuggingFace transformers and models with custom `__getattr__`
- ✅ Same grad sampler methods as `GradSampleModule`

**Example:**
```python
from opacus import PrivacyEngine

privacy_engine = PrivacyEngine()
model, optimizer, dataloader = privacy_engine.make_private(
module=model,
optimizer=optimizer,
data_loader=dataloader,
noise_multiplier=1.0,
max_grad_norm=1.0,
return_controller=True, # ← Enable controller mode
)
# model is now unwrapped with hooks attached directly
```
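
As a follow-up to the checklist above, the snippet below spot-checks two of the listed properties on the returned `model`. The `BertModel` import is only an example of a concrete model class (mirroring the `isinstance` bullet above); substitute whatever model you actually passed in.

```python
from transformers import BertModel  # assuming `model` above was a BertModel

# Type is preserved: no wrapper class is inserted around the model
assert isinstance(model, BertModel)

# State dict keys are unchanged: no `_module.` prefix is added
assert not any(key.startswith("_module.") for key in model.state_dict())
```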

## Hooks-based approach (Model Wrapping)
- Model wrapping class: ``opacus.grad_sample.grad_sample_module.GradSampleModule``
- Keyword argument for ``PrivacyEngine.make_private()``: `grad_sample_mode="hooks"`

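For reference, a minimal sketch of using the wrapper class directly, outside of `PrivacyEngine` (the toy `nn.Linear` model and random batch are illustrative assumptions):

```python
import torch
import torch.nn as nn
from opacus import GradSampleModule

# Wrap the model; forward/backward now also populates per-sample gradients
model = GradSampleModule(nn.Linear(4, 2), batch_first=True, loss_reduction="sum")

x = torch.randn(8, 4)  # batch of 8 samples
loss = model(x).sum()
loss.backward()

for p in model.parameters():
    # p.grad_sample holds one gradient per sample: shape [batch_size, *p.shape]
    print(p.grad_sample.shape)
```
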
@@ -62,23 +91,27 @@ is roughly the same.
Please note that these are known limitations and we plan to improve Expanded Weights and bridge the gap in feature completeness.


| xxx | Hooks | Expanded Weights | Functorch |
|:----------------------------:|:-------------------------------:|:----------------:|:------------:|
| Required PyTorch version | 1.8+ | 1.13+ | 1.12 (to be updated) |
| Development status | Underlying mechanism deprecated | Beta | Beta |
| Runtime Performance† | baseline | ✅ ~25% faster | 🟨 0-50% slower |
| Any DP-allowed†† layers | Not supported | Not supported | ✅ Supported |
| Most popular nn.* layers | ✅ Supported | ✅ Supported | ✅ Supported |
| torchscripted models | Not supported | ✅ Supported | Not supported |
| Client-provided grad sampler | ✅ Supported | Not supported | ✅ Not needed |
| `batch_first=False` | ✅ Supported | Not supported | ✅ Supported |
| Recurrent networks | ✅ Supported | Not supported | ✅ Supported |
| Padding `same` in Conv | ✅ Supported | Not supported | ✅ Supported |
| Empty poisson batches | ✅ Supported | Not supported | Not supported |

† Note that performance differences are unstable and can vary a lot depending on the exact model and batch size.
Numbers above are averaged over benchmarks with small models consisting of convolutional and linear layers.
Also note that performance differences are only observed in GPU training; CPU performance seems to be almost identical
| Feature                       | GradSampleModule (Hooks)        | GradSampleController | Expanded Weights | Functorch |
|:-----------------------------:|:-------------------------------:|:--------------------:|:----------------:|:------------:|
| Required PyTorch version      | 1.8+                            | 1.8+                 | 1.13+            | 1.12 (to be updated) |
| Development status            | Underlying mechanism deprecated | Beta                 | Beta             | Beta |
| Model wrapping                | Wraps model                     | ✅ No wrapping       | Wraps model      | Wraps model |
| Runtime Performance†          | baseline                        | baseline             | ✅ ~25% faster   | 🟨 0-50% slower |
| Transformer compatibility     | 🟨 May have issues              | ✅ Excellent         | 🟨 May have issues | 🟨 May have issues |
| State dict compatibility      | 🟨 `_module.` prefix            | ✅ Clean keys        | 🟨 `_module.` prefix | 🟨 `_module.` prefix |
| Type preservation             | ❌ Model wrapped                | ✅ Model unchanged   | ❌ Model wrapped | ❌ Model wrapped |
| Any DP-allowed†† layers       | Not supported                   | Not supported        | Not supported    | ✅ Supported |
| Most popular nn.* layers      | ✅ Supported                    | ✅ Supported         | ✅ Supported     | ✅ Supported |
| torchscripted models          | Not supported                   | Not supported        | ✅ Supported     | Not supported |
| Client-provided grad sampler  | ✅ Supported                    | ✅ Supported         | Not supported    | ✅ Not needed |
| `batch_first=False`           | ✅ Supported                    | ✅ Supported         | Not supported    | ✅ Supported |
| Recurrent networks            | ✅ Supported                    | ✅ Supported         | Not supported    | ✅ Supported |
| Padding `same` in Conv        | ✅ Supported                    | ✅ Supported         | Not supported    | ✅ Supported |
| Empty poisson batches         | ✅ Supported                    | ✅ Supported         | Not supported    | Not supported |

† Note that performance differences are unstable and can vary a lot depending on the exact model and batch size.
Numbers above are averaged over benchmarks with small models consisting of convolutional and linear layers.
Also note that performance differences are only observed in GPU training; CPU performance seems to be almost identical
for all approaches.

†† Layers that produce joint computations on batch samples (e.g. BatchNorm) are not allowed under any approach
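
The "Client-provided grad sampler" row above refers to registering your own per-sample gradient computation for a custom layer. Below is a hedged sketch: `MyScale` and its grad sampler are purely illustrative, and the exact hook signature (in particular, whether activations arrive as a list) may differ between Opacus versions.

```python
from typing import Dict, List

import torch
import torch.nn as nn
from opacus.grad_sample import register_grad_sampler


class MyScale(nn.Module):
    """Elementwise learnable scaling layer (illustrative)."""

    def __init__(self, dim: int):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x * self.weight


@register_grad_sampler(MyScale)
def compute_myscale_grad_sample(
    layer: MyScale, activations: List[torch.Tensor], backprops: torch.Tensor
) -> Dict[nn.Parameter, torch.Tensor]:
    # Per-sample gradient of an elementwise scale is input * upstream gradient,
    # keeping the batch dimension: shape [batch_size, dim] for input [batch_size, dim].
    return {layer.weight: activations[0] * backprops}
```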
6 changes: 6 additions & 0 deletions opacus/grad_sample/__init__.py
@@ -18,6 +18,10 @@
from .dp_rnn import compute_rnn_linear_grad_sample # noqa
from .embedding import compute_embedding_grad_sample # noqa
from .embedding_norm_sample import compute_embedding_norm_sample # noqa
from .grad_sample_controller import GradSampleController # noqa
from .grad_sample_controller_fast_gradient_clipping import ( # noqa
GradSampleControllerFastGradientClipping,
)
from .grad_sample_module import GradSampleModule, create_or_accumulate_grad_sample
from .grad_sample_module_fast_gradient_clipping import ( # noqa
GradSampleModuleFastGradientClipping,
@@ -45,6 +49,8 @@


__all__ = [
"GradSampleController",
"GradSampleControllerFastGradientClipping",
"GradSampleModule",
"GradSampleModuleFastGradientClipping",
"GradSampleModuleFastGradientClippingFSDP",
180 changes: 180 additions & 0 deletions opacus/grad_sample/grad_sample_controller.py
@@ -0,0 +1,180 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
GradSampleController: Manages privacy hooks on models without wrapping them.

This module provides a GradSampleModule-less approach to attaching hooks
directly to model parameters for computing per-sample gradients.
"""

import logging
from functools import partial
from typing import Iterable, List, Tuple

import torch
import torch.nn as nn
from opacus.grad_sample.functorch import ft_compute_per_sample_gradient, prepare_layer
from opacus.grad_sample.grad_sample_hooks_mixin import GradSampleHooksMixin
from opacus.grad_sample.grad_sample_module import (
_get_batch_size,
create_or_accumulate_grad_sample,
promote_current_grad_sample,
)
from opacus.layers.dp_rnn import DPGRU, DPLSTM, DPRNN, RNNLinear
from opacus.utils.module_utils import (
has_trainable_params,
requires_grad,
trainable_modules,
trainable_parameters,
)
from opacus.validators.errors import UnsupportedModuleError
from torch.utils.hooks import RemovableHandle


logger = logging.getLogger(__name__)
logger.disabled = True


OPACUS_PARAM_MONKEYPATCH_ATTRS = [
"grad_sample",
"_forward_counter",
"_current_grad_sample",
"_norm_sample",
]


# GradSampleHooksMixin is now imported from grad_sample_hooks_mixin.py to avoid circular imports


class GradSampleController(GradSampleHooksMixin):
"""
Controller for managing privacy hooks on models without wrapping them

Computes per-sample gradients using custom-written methods for each layer.
See README.md for more details

This class attaches hooks directly to model modules and manages their lifecycle,
providing an alternative to GradSampleModule wrapping that's more compatible
with transformers and other complex models.
"""

def __init__(
self,
m: nn.Module,
*,
batch_first=True,
loss_reduction="mean",
strict: bool = True,
force_functorch=False,
):
"""

Args:
m: nn.Module to attach hooks to
batch_first: Flag to indicate if the input tensor to the corresponding module
has the first dimension representing the batch. If set to True, dimensions on
input tensor are expected to be ``[batch_size, ...]``, otherwise
``[K, batch_size, ...]``
loss_reduction: Indicates if the loss reduction (for aggregating the gradients)
is a sum or a mean operation. Can take values "sum" or "mean"
strict: If set to ``True``, the input module will be validated to make sure that none of its submodules includes buffers,
which is not currently supported by Opacus.
If set to ``False``, per sample gradients will
be computed on "best effort" basis - they will be available where
possible and set to None otherwise. This is not recommended, because
some unsupported modules (e.g. BatchNorm) affect other parameters and
invalidate the concept of per sample gradients for the entire model.
force_functorch: If set to ``True``, will use functorch to compute
all per sample gradients. Otherwise, functorch will be used only
for layers without registered grad sampler methods.

Raises:
NotImplementedError
If ``strict`` is set to ``True`` and module ``m`` (or any of its
submodules) includes a buffer.
"""
errors = self.validate(module=m, strict=strict)
if errors and not strict:
logger.info(
f"GradSampleController found the following errors: {errors}. "
"Using non-strict mode, continuing"
)

self.module = m
self.hooks_enabled = False
self.grad_accumulation_allowed = True
self.batch_first = batch_first
self.loss_reduction = loss_reduction
self.force_functorch = force_functorch

self.autograd_grad_sample_hooks: List[RemovableHandle] = []

# Initialize parameters with required attributes
for _, p in trainable_parameters(self.module):
p.grad_sample = None
p._forward_counter = 0

# Add the hooks
self.add_hooks()

def _get_target_module(self) -> nn.Module:
"""Return the module to attach hooks to."""
return self.module

def add_hooks(self) -> None:
"""
Adds hooks to model to save activations and backprop values.
The hooks will
1. save activations into param.activations during forward pass
2. compute per-sample gradients in params.grad_sample during backward pass.
Call ``remove_hooks(model)`` to disable this.
"""
self._add_hooks_impl(
target_module=self.module,
hooks_list=self.autograd_grad_sample_hooks,
batch_first=self.batch_first,
loss_reduction=self.loss_reduction,
force_functorch=self.force_functorch,
)

def remove_hooks(self) -> None:
"""
Removes hooks added by ``add_hooks()``
"""
self.disable_hooks()

while self.autograd_grad_sample_hooks:
handle = self.autograd_grad_sample_hooks.pop()
handle.remove()

# Remove functorch hooks
for _module_name, module in trainable_modules(self.module):
if hasattr(module, "ft_compute_sample_grad"):
delattr(module, "ft_compute_sample_grad")
if hasattr(module, "activations"):
delattr(module, "activations")

def cleanup(self):
"""
Clean up all hooks and attributes added to the model.
"""
self.remove_hooks()

# Clean up parameter attributes
for attr in OPACUS_PARAM_MONKEYPATCH_ATTRS:
for p in self.module.parameters():
if hasattr(p, attr):
delattr(p, attr)