@@ -1113,7 +1113,6 @@ def megatron_fsdp_strategy_parallelize(
         preserve_fp32_weights=preserve_fp32_weights,
         overlap_grad_reduce=overlap_grad_reduce,
         overlap_param_gather=overlap_param_gather,
-        sync_grads_each_step=False,  # For better performance, avoid sync every step
         check_for_nan_in_grad=check_for_nan_in_grad,
         average_in_collective=average_in_collective,
         disable_bucketing=disable_bucketing,
@@ -1123,26 +1122,7 @@ def megatron_fsdp_strategy_parallelize(
         fsdp_double_buffer=fsdp_double_buffer,
     )

-    # Compatibility: older `megatron_fsdp.fully_shard` versions don't accept
-    # `sync_grads_each_step`. Prefer filtering by signature, but also retry on
-    # TypeError for cases where the callable's signature can't be inspected.
-    try:
-        sig = inspect.signature(megatron_fsdp_fully_shard)
-    except (TypeError, ValueError):
-        sig = None
-    if sig is not None and "sync_grads_each_step" not in sig.parameters:
-        fsdp_kwargs.pop("sync_grads_each_step", None)
-
-    try:
-        model, optimizer = megatron_fsdp_fully_shard(**fsdp_kwargs)
-    except TypeError as e:
-        # Example: "fully_shard() got an unexpected keyword argument 'sync_grads_each_step'"
-        if "sync_grads_each_step" in str(e) and "unexpected keyword argument" in str(e):
-            fsdp_kwargs.pop("sync_grads_each_step", None)
-            model, optimizer = megatron_fsdp_fully_shard(**fsdp_kwargs)
-        else:
-            raise
-
+    model, optimizer = megatron_fsdp_fully_shard(**fsdp_kwargs)
     return model, optimizer


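The removed shim filtered `fsdp_kwargs` against the callee's signature and, as a fallback, retried after parsing the `TypeError` message. That kwarg-filtering idea generalizes to a small helper for anyone who still needs to span API versions on their side. The sketch below is illustrative only, not part of this commit or of megatron_fsdp; `call_with_supported_kwargs` and `old_fully_shard` are hypothetical names.

```python
import inspect

def call_with_supported_kwargs(fn, **kwargs):
    """Call `fn`, dropping keyword arguments it does not accept."""
    try:
        sig = inspect.signature(fn)
    except (TypeError, ValueError):
        # Signature can't be inspected (e.g. some C extensions):
        # pass everything through and let `fn` raise if it must.
        return fn(**kwargs)
    # If `fn` takes **kwargs, every keyword is safe to forward.
    if any(p.kind is inspect.Parameter.VAR_KEYWORD for p in sig.parameters.values()):
        return fn(**kwargs)
    supported = {k: v for k, v in kwargs.items() if k in sig.parameters}
    return fn(**supported)

# Hypothetical older API that predates `sync_grads_each_step`:
def old_fully_shard(module, overlap_grad_reduce=False):
    return module

# The unknown keyword is silently filtered out instead of raising TypeError.
call_with_supported_kwargs(old_fully_shard, module=object(), sync_grads_each_step=False)
```

Filtering by signature up front also avoids the fragile string matching on `TypeError` messages that the removed fallback relied on.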