5 changes: 4 additions & 1 deletion modelopt/torch/export/unified_export_megatron.py
@@ -1085,7 +1085,10 @@ def _get_state_dict(self):
self.rules["k_layernorm"](layer.self_attention.k_layernorm, layer_id)
self.rules["linear_qkv"](layer.self_attention.linear_qkv, layer_id)
self.rules["linear_proj"](layer.self_attention.linear_proj, layer_id)
if hasattr(layer.self_attention.core_attention, "softmax_offset"):
if (
getattr(layer.self_attention.core_attention, "softmax_offset", None)
is not None
):
self.rules["softmax_offset"](
layer.self_attention.core_attention.softmax_offset, layer_id
)
Comment on lines +1088 to 1094

⚠️ Potential issue

❓ Verification inconclusive

Guard both core_attention and softmax_offset to avoid AttributeError

getattr(layer.self_attention.core_attention, ...) still dereferences core_attention, so if self_attention lacks core_attention this raises AttributeError before getattr's default can apply. Also, cache the looked-up value instead of re-accessing the attribute after the check.
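
For illustration, a minimal repro with a hypothetical stand-in object (not the real Megatron module):

from types import SimpleNamespace

# Hypothetical stand-in for layer.self_attention that has no core_attention attribute.
self_attention = SimpleNamespace()

# getattr's default only protects the final lookup; the chained access raises first:
#   getattr(self_attention.core_attention, "softmax_offset", None)  # AttributeError

# Chaining getattr (or caching the intermediate object) never raises:
core_attn = getattr(self_attention, "core_attention", None)
softmax_offset = getattr(core_attn, "softmax_offset", None)  # None, no exception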

Apply this diff:

-                        if (
-                            getattr(layer.self_attention.core_attention, "softmax_offset", None)
-                            is not None
-                        ):
-                            self.rules["softmax_offset"](
-                                layer.self_attention.core_attention.softmax_offset, layer_id
-                            )
+                        core_attn = getattr(layer.self_attention, "core_attention", None)
+                        softmax_offset = getattr(core_attn, "softmax_offset", None)
+                        if softmax_offset is not None:
+                            self.rules["softmax_offset"](softmax_offset, layer_id)

Would you add a quick unit/integration test covering:

  • core_attention is None
  • core_attention present but softmax_offset is None
  • core_attention + softmax_offset tensor present (rule invoked once)?

I can draft a minimal test double for layer.self_attention to exercise these branches if helpful.
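
A rough sketch of such doubles: the _run_guard helper, the SimpleNamespace fakes, and the MagicMock-based rules dict are all hypothetical, and _run_guard re-implements the guarded lookup from the suggested diff rather than driving the real exporter:

from types import SimpleNamespace
from unittest.mock import MagicMock

import torch


def _run_guard(self_attention, rules, layer_id=0):
    # Mirrors the suggested guard: cache core_attention, then getattr the offset.
    core_attn = getattr(self_attention, "core_attention", None)
    softmax_offset = getattr(core_attn, "softmax_offset", None)
    if softmax_offset is not None:
        rules["softmax_offset"](softmax_offset, layer_id)


def test_no_core_attention():
    rules = {"softmax_offset": MagicMock()}
    _run_guard(SimpleNamespace(), rules)  # self_attention without core_attention
    rules["softmax_offset"].assert_not_called()


def test_core_attention_without_softmax_offset():
    rules = {"softmax_offset": MagicMock()}
    sa = SimpleNamespace(core_attention=SimpleNamespace(softmax_offset=None))
    _run_guard(sa, rules)
    rules["softmax_offset"].assert_not_called()


def test_softmax_offset_invokes_rule_once():
    rules = {"softmax_offset": MagicMock()}
    offset = torch.zeros(4)
    sa = SimpleNamespace(core_attention=SimpleNamespace(softmax_offset=offset))
    _run_guard(sa, rules, layer_id=3)
    rules["softmax_offset"].assert_called_once_with(offset, 3)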


📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
-                        if (
-                            getattr(layer.self_attention.core_attention, "softmax_offset", None)
-                            is not None
-                        ):
-                            self.rules["softmax_offset"](
-                                layer.self_attention.core_attention.softmax_offset, layer_id
-                            )
+                        core_attn = getattr(layer.self_attention, "core_attention", None)
+                        softmax_offset = getattr(core_attn, "softmax_offset", None)
+                        if softmax_offset is not None:
+                            self.rules["softmax_offset"](softmax_offset, layer_id)
🤖 Prompt for AI Agents
In modelopt/torch/export/unified_export_megatron.py around lines 1088-1094, the
code accesses layer.self_attention.core_attention directly inside getattr, which
can raise AttributeError if core_attention is missing; cache core_attention with
getattr(layer.self_attention, "core_attention", None) in a local variable, use
getattr(core_attention, "softmax_offset", None) on the cached value, and only
invoke self.rules["softmax_offset"] when the cached softmax_offset is not None;
also add unit tests for three cases: core_attention is None, core_attention
present but softmax_offset is None, and core_attention with a softmax_offset
tensor (assert the rule is invoked exactly once).
