
Commit 9874e84

Change freqs_cis from persistent buffer to non-persistent buffer (#1600)
## Context

Since PP no longer needs `freqs_cis` to be a persistent buffer, and `torch.compile` now works with non-persistent buffers, change `freqs_cis` from a persistent buffer to a non-persistent one. This way, the checkpointer no longer needs to explicitly exclude `freqs_cis` when loading.

## Test

1. llama3 model with torch.compile ✅
2. llama4 model with torch.compile ✅
3. deepseek-v3 model with torch.compile ✅
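As a minimal standalone sketch (toy module and names, not torchtitan code) of the PyTorch behavior this change relies on: a buffer registered with `persistent=False` never appears in `state_dict()`, so nothing has to be popped before saving.

```python
import torch
import torch.nn as nn


class Toy(nn.Module):
    def __init__(self):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(4, 4))
        # persistent=False: the buffer can be recomputed, so keep it out of checkpoints
        self.register_buffer("freqs_cis", torch.randn(8, 2), persistent=False)


sd = Toy().state_dict()
assert "weight" in sd             # parameters are saved as usual
assert "freqs_cis" not in sd      # non-persistent buffers are skipped automatically
```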
1 parent 084d307

File tree (7 files changed: +15 −32 lines)

- scripts/generate/test_generate.py
- tests/unit_tests/test_checkpoint.py
- torchtitan/components/checkpoint.py
- torchtitan/experiments/llama4/model/model.py
- torchtitan/models/deepseek_v3/infra/parallelize.py
- torchtitan/models/deepseek_v3/model/model.py
- torchtitan/models/llama3/model/model.py
scripts/generate/test_generate.py

Lines changed: 0 additions & 3 deletions

@@ -24,7 +24,6 @@
     parallelize_module,
     RowwiseParallel,
 )
-from torchtitan.components.checkpoint import excluded_parameters_for_model_only
 from torchtitan.components.metrics import build_device_memory_monitor
 from torchtitan.config import ConfigManager
 from torchtitan.distributed import ParallelDims, utils as dist_utils
@@ -143,8 +142,6 @@ def test_generate(
     model.eval()

     state_dict = model.state_dict()
-    for k in excluded_parameters_for_model_only:
-        state_dict.pop(k, None)

     # Checkpoint Loading
     begin = time.monotonic()

tests/unit_tests/test_checkpoint.py

Lines changed: 2 additions & 2 deletions

@@ -562,7 +562,7 @@ def test_enable_first_step_checkpoint(self, mock_save, mock_rank):

     @mock.patch("torch.distributed.get_rank", return_value=0)
     @mock.patch("torchtitan.components.checkpoint.dcp.save")
-    def test_excluded_parameters_not_saved(self, mock_save, mock_rank):
+    def test_non_persist_buffer_not_saved(self, mock_save, mock_rank):
         """Test that freqs_cis is not saved"""

         # Create a fake model with freqs_cis and other parameters
@@ -572,7 +572,7 @@ def __init__(self):
                 self.weight = nn.Parameter(torch.randn(2, 2))
                 self.bias = nn.Parameter(torch.randn(2))
                 # Register freqs_cis as a buffer (common pattern in transformer models)
-                self.register_buffer("freqs_cis", torch.randn(10, 5))
+                self.register_buffer("freqs_cis", torch.randn(10, 5), persistent=False)
                 self.other_param = nn.Parameter(torch.randn(3, 3))

         fake_model = FakeModelWithFreqsCis()

torchtitan/components/checkpoint.py

Lines changed: 0 additions & 9 deletions

@@ -55,12 +55,6 @@ class AsyncMode(str, enum.Enum):
     ASYNC_WITH_PINNED_MEM = "async_with_pinned_mem"


-# For now, we will manually pop the freqs_cis buffer, as we made this permanent
-# temporarily and we don't want to include it in the exported state_dict.
-# Context: https://github.com/pytorch/torchtitan/blob/main/torchtitan/models/llama3/model.py#L404
-excluded_parameters_for_model_only = {"freqs_cis"}
-
-
 class ModelWrapper(Stateful):
     def __init__(self, model: nn.Module | list[nn.Module]) -> None:
         self.model = [model] if isinstance(model, nn.Module) else model
@@ -70,9 +64,6 @@ def _get_state_dict(self) -> dict[str, Any]:
         state_dict = {
             k: v for sd in map(get_model_state_dict, self.model) for k, v in sd.items()
         }
-        # Exclude parameters that should not be saved
-        for excluded_key in excluded_parameters_for_model_only:
-            state_dict.pop(excluded_key, None)
         return state_dict

     def state_dict(self) -> dict[str, Any]:

torchtitan/experiments/llama4/model/model.py

Lines changed: 3 additions & 8 deletions

@@ -391,14 +391,9 @@ def __init__(self, model_args: TransformerModelArgs):

         self.tok_embeddings = nn.Embedding(model_args.vocab_size, model_args.dim)

-        # TODO persistent should be set to false, since this buffer can be recomputed.
-        # however, we set it to true for 2 reasons. (1) due to pytorch/pytorch#123411,
-        # compile or pipeline-tracer will not correctly handle non-persistent buffers,
-        # so we need to fix that. (2) if we initialize pipeline-parallel models from
-        # a seed checkpoint rather than calling init_weights, we need freqs_cis to be
-        # initialized by the checkpoint, or we need to add a separate initializer for
-        # just the non-persistent buffers that is called after loading checkpoints.
-        self.register_buffer("freqs_cis", self._precompute_freqs_cis(), persistent=True)
+        self.register_buffer(
+            "freqs_cis", self._precompute_freqs_cis(), persistent=False
+        )

         self.layers = torch.nn.ModuleDict()
         for layer_id in range(model_args.n_layers):
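A hedged standalone sketch (toy module, not the llama4 code above) of the commit message's claim that `torch.compile` now handles non-persistent buffers: compiling a forward pass that slices the registered buffer.

```python
import torch
import torch.nn as nn


class Rope(nn.Module):
    def __init__(self):
        super().__init__()
        # same registration pattern as the model above, with a toy buffer
        self.register_buffer("freqs_cis", torch.randn(16, 8), persistent=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # read the cached buffer inside the compiled region
        return x + self.freqs_cis[: x.shape[0]]


compiled = torch.compile(Rope())
out = compiled(torch.randn(4, 8))  # compiles and runs with the non-persistent buffer
```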

torchtitan/models/deepseek_v3/infra/parallelize.py

Lines changed: 6 additions & 1 deletion

@@ -4,6 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

+import torch
 import torch.nn as nn
 from torch.distributed.device_mesh import DeviceMesh
 from torch.distributed.tensor import Replicate, Shard
@@ -18,7 +19,11 @@
 from torchtitan.config import JobConfig, TORCH_DTYPE_MAP
 from torchtitan.distributed import ParallelDims
 from torchtitan.distributed.expert_parallel import NoParallel
-from torchtitan.experiments.llama4.infra.parallelize import apply_fsdp, apply_moe_ep_tp
+from torchtitan.experiments.llama4.infra.parallelize import (
+    apply_compile,
+    apply_fsdp,
+    apply_moe_ep_tp,
+)
 from torchtitan.models.llama3.infra.parallelize import apply_ac, apply_ddp
 from torchtitan.tools.logging import logger
torchtitan/models/deepseek_v3/model/model.py

Lines changed: 1 addition & 1 deletion

@@ -322,7 +322,7 @@ def __init__(self, model_args: DeepSeekV3ModelArgs):
         self.max_seq_len = model_args.max_seq_len
         self.tok_embeddings = nn.Embedding(model_args.vocab_size, model_args.dim)
         self.register_buffer(
-            "freqs_cis", precompute_freqs_cis(model_args), persistent=True
+            "freqs_cis", precompute_freqs_cis(model_args), persistent=False
         )

         self.layers = torch.nn.ModuleDict()

torchtitan/models/llama3/model/model.py

Lines changed: 3 additions & 8 deletions

@@ -335,14 +335,9 @@ def __init__(self, model_args: TransformerModelArgs):

         self.tok_embeddings = nn.Embedding(model_args.vocab_size, model_args.dim)

-        # TODO persistent should be set to false, since this buffer can be recomputed.
-        # however, we set it to true for 2 reasons. (1) due to pytorch/pytorch#123411,
-        # compile or pipeline-tracer will not correctly handle non-persistent buffers,
-        # so we need to fix that. (2) if we initialize pipeline-parallel models from
-        # a seed checkpoint rather than calling init_weights, we need freqs_cis to be
-        # initialized by the checkpoint, or we need to add a separate initializer for
-        # just the non-persistent buffers that is called after loading checkpoints.
-        self.register_buffer("freqs_cis", self._precompute_freqs_cis(), persistent=True)
+        self.register_buffer(
+            "freqs_cis", self._precompute_freqs_cis(), persistent=False
+        )

         self.layers = torch.nn.ModuleDict()
         for layer_id in range(model_args.n_layers):
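On the loading side, a small standalone sketch (toy module, not the torchtitan `Transformer`) of why checkpoints written without `freqs_cis` still load strictly: non-persistent buffers are not expected keys in `load_state_dict`, and the value computed in `__init__` is simply kept.

```python
import torch
import torch.nn as nn


class TinyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(8, 8)
        # recomputable at construction time, so persistent=False as above
        self.register_buffer("freqs_cis", torch.randn(16, 4), persistent=False)


saved = TinyModel().state_dict()              # holds only proj.weight / proj.bias
restored = TinyModel()
restored.load_state_dict(saved, strict=True)  # no "missing key: freqs_cis" error
```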
