diff --git a/modelopt/torch/export/plugins/megatron_importer.py b/modelopt/torch/export/plugins/megatron_importer.py
index 696af632..4c805dc0 100644
--- a/modelopt/torch/export/plugins/megatron_importer.py
+++ b/modelopt/torch/export/plugins/megatron_importer.py
@@ -512,7 +512,7 @@ def _import_state_dict(self):
                 self.rules["k_layernorm"](attention.k_layernorm, layer_id)
                 self.rules["linear_qkv"](attention.linear_qkv, layer_id)
                 self.rules["linear_proj"](attention.linear_proj, layer_id)
-                if hasattr(attention.core_attention, "softmax_offset"):
+                if getattr(attention.core_attention, "softmax_offset", None) is not None:
                     self.rules["softmax_offset"](
                         attention.core_attention.softmax_offset, layer_id
                     )