diff --git a/modelopt/torch/export/unified_export_megatron.py b/modelopt/torch/export/unified_export_megatron.py
index e28a165f..586745a1 100644
--- a/modelopt/torch/export/unified_export_megatron.py
+++ b/modelopt/torch/export/unified_export_megatron.py
@@ -1085,7 +1085,10 @@ def _get_state_dict(self):
                 self.rules["k_layernorm"](layer.self_attention.k_layernorm, layer_id)
                 self.rules["linear_qkv"](layer.self_attention.linear_qkv, layer_id)
                 self.rules["linear_proj"](layer.self_attention.linear_proj, layer_id)
-                if hasattr(layer.self_attention.core_attention, "softmax_offset"):
+                if (
+                    getattr(layer.self_attention.core_attention, "softmax_offset", None)
+                    is not None
+                ):
                     self.rules["softmax_offset"](
                         layer.self_attention.core_attention.softmax_offset, layer_id
                     )
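
A minimal standalone sketch of why the check was tightened (the bare-bones
CoreAttention class below is a hypothetical stand-in, not code from the patch):
hasattr() returns True even when the attribute exists but holds None, whereas
the getattr(...) is not None form only passes when an actual softmax_offset
value is present.

    class CoreAttention:
        """Hypothetical stand-in for the core attention module."""

        def __init__(self):
            # Attribute is defined but unset, e.g. no learned softmax offset.
            self.softmax_offset = None

    attn = CoreAttention()

    # Old check: True even though the value is None, so the export rule
    # would have been invoked with a None value.
    print(hasattr(attn, "softmax_offset"))  # True

    # New check: True only when the attribute exists AND is non-None.
    print(getattr(attn, "softmax_offset", None) is not None)  # False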