Skip to content

Commit ab880fc

Browse files
authored
fix model saving bug in megatron (#1230)
1 parent 4f66fbb commit ab880fc

File tree

2 files changed

+11
-2
lines changed

2 files changed

+11
-2
lines changed

docker/patch/latest/megatron.patch

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -356,7 +356,7 @@ index a1230568c..1fd52f65a 100644
356356
},
357357
)
358358
diff --git a/megatron/core/optimizer/distrib_optimizer.py b/megatron/core/optimizer/distrib_optimizer.py
359-
index 6e093f96f..c1dfe205b 100644
359+
index 6e093f96f..eac21a3ea 100644
360360
--- a/megatron/core/optimizer/distrib_optimizer.py
361361
+++ b/megatron/core/optimizer/distrib_optimizer.py
362362
@@ -677,6 +677,8 @@ class DistributedOptimizer(MixedPrecisionOptimizer):
@@ -368,6 +368,15 @@ index 6e093f96f..c1dfe205b 100644
368368

369369
# Grad scaler state.
370370
if self.grad_scaler:
371+
@@ -1646,6 +1648,8 @@ class DistributedOptimizer(MixedPrecisionOptimizer):
372+
if key == 'padding':
373+
tensors[key] = LocalNonpersistentObject(tensors[key])
374+
continue
375+
+ if key == 'step':
376+
+ continue
377+
assert tensors[key].shape == (gbuf_local_end - gbuf_local_start,), (
378+
tensors[key].shape,
379+
gbuf_local_start,
371380
diff --git a/megatron/core/parallel_state.py b/megatron/core/parallel_state.py
372381
index a273002b9..4f821cfd5 100644
373382
--- a/megatron/core/parallel_state.py

docker/version.txt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1 +1 @@
1-
nightly-dev-20251222a
1+
nightly-dev-20251226b

0 commit comments

Comments
 (0)