Skip to content

Commit f819e9a

Browse files
[FIX] non-persistent buffer was saved incorrectly (#2242)
* The `alias_from_turtle_for_submodule()` function needs to check if the buffer is persistent. Signed-off-by: ZX-ModelCloud <zx@modelcloud.ai> * get_state_dict_for_save() needs to skip non-persistent buffers. Signed-off-by: ZX-ModelCloud <zx@modelcloud.ai> * add test_model_save.py Signed-off-by: ZX-ModelCloud <zx@modelcloud.ai> * format Signed-off-by: ZX-ModelCloud <zx@modelcloud.ai> * Potential fix for pull request finding 'First parameter of a class method is not named 'cls'' Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com> * add comment Signed-off-by: ZX-ModelCloud <zx@modelcloud.ai> --------- Signed-off-by: ZX-ModelCloud <zx@modelcloud.ai> Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com>
1 parent 5cc0c02 commit f819e9a

File tree

3 files changed

+97
-6
lines changed

3 files changed

+97
-6
lines changed

gptqmodel/utils/model.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,14 @@ def find_modules(module: nn.Module, layers=None, name: str="") -> Dict[str, nn.M
196196
return res
197197

198198

199+
def get_module_by_name(module, child_name):
    """Return the (possibly nested) submodule of *module* whose qualified
    name — as reported by ``module.named_modules()`` — equals *child_name*.

    Note: ``named_modules()`` yields ``("", module)`` first, so an empty
    *child_name* returns *module* itself.

    Raises:
        ValueError: if no submodule is registered under that name.
    """
    found = next(
        (m for qualified_name, m in module.named_modules() if qualified_name == child_name),
        None,
    )
    if found is None:
        raise ValueError(f"Cannot find child_name {child_name} in module {module}")
    return found
205+
206+
199207
def get_module_by_name_prefix(model, module_name: Union[List[str], str]):
200208
module_name_list = module_name if isinstance(module_name, list) else [module_name]
201209
for name, module in model.named_modules():
@@ -1467,7 +1475,13 @@ def _collect_state_dict_with_offload(model: nn.Module, offload_root: str) -> Dic
14671475
for name, buf in model.named_buffers():
14681476
if name in state_dict:
14691477
continue
1478+
1479+
# If the buffer is non-persistent, it does not need to be written to state_dict.
14701480
module_path, leaf = _split_parameter_path(name)
1481+
module = get_module_by_name(model, module_path)
1482+
if hasattr(module, "_non_persistent_buffers_set") and leaf in module._non_persistent_buffers_set:
1483+
continue
1484+
14711485
if getattr(buf, "is_meta", False) or buf.device.type == "meta":
14721486
source = _resolve_offload_entry(
14731487
offload_root,
@@ -1504,6 +1518,13 @@ def get_state_dict_for_save(model: nn.Module, offload_root: Optional[str] = None
15041518
for name, buf in model.named_buffers():
15051519
if name in state_dict:
15061520
continue
1521+
1522+
# If the buffer is non-persistent, it does not need to be written to state_dict.
1523+
module_path, leaf = _split_parameter_path(name)
1524+
module = get_module_by_name(model, module_path)
1525+
if hasattr(module, "_non_persistent_buffers_set") and leaf in module._non_persistent_buffers_set:
1526+
continue
1527+
15071528
state_dict[name] = TensorSource(name=name, torch_dtype=buf.dtype, shape=tuple(buf.shape), source=buf)
15081529

15091530
ptrs = collections.defaultdict(list)

gptqmodel/utils/structure.py

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -529,7 +529,7 @@ def alias_from_turtle_for_submodule(
529529

530530
# Resolve path & source submodule (on CPU/mmap)
531531
path = _get_qualified_name(target_model, target_submodule)
532-
src_map = dict(turtle_model.named_modules())
532+
src_map: Dict[str, nn.Module] = dict(turtle_model.named_modules())
533533
if path not in src_map:
534534
raise KeyError(f"Path '{path}' not found in turtle_model.")
535535
src_sub = src_map[path]
@@ -544,25 +544,33 @@ def alias_from_turtle_for_submodule(
544544
continue
545545
t_p_new = _ensure_target_storage_on_device_(t_p, device)
546546
if t_p_new is not t_p:
547-
parent, leaf = _get_parent_and_leaf_by_path(target_submodule, name)
548-
setattr(parent, leaf, t_p_new)
547+
t_parent, leaf = _get_parent_and_leaf_by_path(target_submodule, name)
548+
setattr(t_parent, leaf, t_p_new)
549549
t_p = t_p_new
550550
t_p.detach().copy_(s_p.detach(), non_blocking=(non_blocking and s_p.is_pinned()))
551551

552552
t_bufs = dict(target_submodule.named_buffers(recurse=True))
553553
s_bufs = dict(src_sub.named_buffers(recurse=True))
554554
for name, s_b in s_bufs.items():
555555
tb = t_bufs.get(name)
556-
parent, leaf = _get_parent_and_leaf_by_path(target_submodule, name)
556+
t_parent, leaf = _get_parent_and_leaf_by_path(target_submodule, name)
557+
s_parent, _ = _get_parent_and_leaf_by_path(src_sub, name)
558+
559+
# nn.Module decides buffer persistence using `_non_persistent_buffers_set`:
560+
# the buffer is persistent unless its name is in this set.
561+
persistent = True
562+
if hasattr(s_parent, "_non_persistent_buffers_set"):
563+
persistent = leaf not in s_parent._non_persistent_buffers_set
564+
557565
if tb is None or getattr(tb, "is_meta", False) or tb.device.type == "meta":
558566
new_b = torch.empty_like(s_b, device=device)
559567
new_b.copy_(s_b.detach(), non_blocking=(non_blocking and s_b.is_pinned()))
560-
parent.register_buffer(leaf, new_b, persistent=True)
568+
t_parent.register_buffer(leaf, new_b, persistent=persistent)
561569
else:
562570
if tb.device != device:
563571
new_tb = torch.empty_like(s_b, device=device)
564572
new_tb.copy_(s_b.detach(), non_blocking=(non_blocking and s_b.is_pinned()))
565-
parent.register_buffer(leaf, new_tb, persistent=True)
573+
t_parent.register_buffer(leaf, new_tb, persistent=persistent)
566574
else:
567575
tb.copy_(s_b.detach(), non_blocking=(non_blocking and s_b.is_pinned()))
568576

tests/test_model_save.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai
2+
# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai
3+
# SPDX-License-Identifier: Apache-2.0
4+
# Contact: qubitium@modelcloud.ai, x.com/qubitium
5+
# -- do not touch
6+
import os
7+
import tempfile
8+
9+
from datasets import load_dataset
10+
from transformers import AutoTokenizer
11+
12+
from gptqmodel.utils.torch import torch_empty_cache
13+
14+
15+
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
16+
# -- end do not touch
17+
18+
import unittest # noqa: E402
19+
20+
# isort: off
21+
# isort: on
22+
from parameterized import parameterized # noqa: E402
23+
from safetensors import safe_open
24+
25+
from gptqmodel import GPTQModel, QuantizeConfig # noqa: E402
26+
27+
28+
class TestModelSave(unittest.TestCase):
    """Regression test: non-persistent buffers (e.g. rotary-embedding
    ``inv_freq``) must not be written into the saved safetensors file."""

    @classmethod
    def setUpClass(cls):
        # Local snapshot of "meta-llama/Llama-3.2-1B-Instruct"
        cls.pretrained_model_id = "/monster/data/model/Llama-3.2-1B-Instruct"  # "meta-llama/Llama-3.2-1B-Instruct"
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.pretrained_model_id, use_fast=True)
        # A single calibration sample keeps the quantization step fast.
        raw = load_dataset(path="/monster/data/model/dataset/nm-calibration", name="LLM", split="train")
        cls.calibration_dataset = raw.select(range(1))

    @parameterized.expand([True, False])
    def test_model_save_with_non_persistent_buffer(self, offload_to_disk):
        cfg = QuantizeConfig(bits=4, offload_to_disk=offload_to_disk)

        model = GPTQModel.load(self.pretrained_model_id, quantize_config=cfg)
        model.quantize(self.calibration_dataset, batch_size=1)

        with tempfile.TemporaryDirectory() as tmp_dir_name:
            model.save(tmp_dir_name)

            del model
            torch_empty_cache()

            # The non-persistent rotary buffer must be absent from the
            # serialized state dict.
            with safe_open(tmp_dir_name + "/model.safetensors", framework="pt") as f:
                print("weight_map", f.keys())
                self.assertNotIn('model.rotary_emb.inv_freq', f.keys())

0 commit comments

Comments
 (0)