Skip to content

Commit 6632f23

Browse files
committed
updates
1 parent a112e22 commit 6632f23

File tree

1 file changed

+17
-7
lines changed

1 file changed

+17
-7
lines changed

train/tr11-176B-ml/backup-schedule.md

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ gsutil rsync -r $six_ALL_CCFRSCRATCH/checkpoints/tr11-176B-ml/checkpoints/main/g
3030
gsutil rsync -r $six_ALL_CCFRSCRATCH/checkpoints/tr11-176B-ml/checkpoints/main/global_step40000 gs://bigscience-backups/tr11-176B-ml/checkpoints/global_step40000
3131
gsutil rsync -r $six_ALL_CCFRSCRATCH/checkpoints/tr11-176B-ml/checkpoints/main/global_step50000 gs://bigscience-backups/tr11-176B-ml/checkpoints/global_step50000
3232
gsutil rsync -r $six_ALL_CCFRSCRATCH/checkpoints/tr11-176B-ml/checkpoints/main/global_step60000 gs://bigscience-backups/tr11-176B-ml/checkpoints/global_step60000
33+
gsutil rsync -r $six_ALL_CCFRSCRATCH/checkpoints/tr11-176B-ml/checkpoints/main/global_step70000 gs://bigscience-backups/tr11-176B-ml/checkpoints/global_step70000
3334
3435
3536
# in-progress
@@ -39,7 +40,6 @@ gsutil rsync -r $six_ALL_CCFRSCRATCH/checkpoints/tr11-176B-ml/checkpoints/main/g
3940
# todo:
4041
4142
42-
gsutil rsync -r $six_ALL_CCFRSCRATCH/checkpoints/tr11-176B-ml/checkpoints/main/global_step70000 gs://bigscience-backups/tr11-176B-ml/checkpoints/global_step70000
4343
gsutil rsync -r $six_ALL_CCFRSCRATCH/checkpoints/tr11-176B-ml/checkpoints/main/global_step80000 gs://bigscience-backups/tr11-176B-ml/checkpoints/global_step80000
4444
gsutil rsync -r $six_ALL_CCFRSCRATCH/checkpoints/tr11-176B-ml/checkpoints/main/global_step90000 gs://bigscience-backups/tr11-176B-ml/checkpoints/global_step90000
4545
gsutil rsync -r $six_ALL_CCFRSCRATCH/checkpoints/tr11-176B-ml/checkpoints/main/global_step100000 gs://bigscience-backups/tr11-176B-ml/checkpoints/global_step100000
@@ -79,20 +79,18 @@ gsutil rsync -x "bf16.*" -r $six_ALL_CCFRSCRATCH/checkpoints/tr11-176B-ml/checkp
7979
gsutil rsync -x "bf16.*" -r $six_ALL_CCFRSCRATCH/checkpoints/tr11-176B-ml/checkpoints/main/global_step51000 gs://bigscience-backups/tr11-176B-ml/checkpoints-weights/global_step51000
8080
gsutil rsync -x "bf16.*" -r $six_ALL_CCFRSCRATCH/checkpoints/tr11-176B-ml/checkpoints/main/global_step54000 gs://bigscience-backups/tr11-176B-ml/checkpoints-weights/global_step54000
8181
gsutil rsync -x "bf16.*" -r $six_ALL_CCFRSCRATCH/checkpoints/tr11-176B-ml/checkpoints/main/global_step57000 gs://bigscience-backups/tr11-176B-ml/checkpoints-weights/global_step57000
82+
gsutil rsync -x "bf16.*" -r $six_ALL_CCFRSCRATCH/checkpoints/tr11-176B-ml/checkpoints/main/global_step63000 gs://bigscience-backups/tr11-176B-ml/checkpoints-weights/global_step63000
83+
gsutil rsync -x "bf16.*" -r $six_ALL_CCFRSCRATCH/checkpoints/tr11-176B-ml/checkpoints/main/global_step66000 gs://bigscience-backups/tr11-176B-ml/checkpoints-weights/global_step66000
84+
gsutil rsync -x "bf16.*" -r $six_ALL_CCFRSCRATCH/checkpoints/tr11-176B-ml/checkpoints/main/global_step69000 gs://bigscience-backups/tr11-176B-ml/checkpoints-weights/global_step69000
8285
8386
8487
# in-progress
8588
86-
87-
89+
gsutil rsync -x "bf16.*" -r $six_ALL_CCFRSCRATCH/checkpoints/tr11-176B-ml/checkpoints/main/global_step72000 gs://bigscience-backups/tr11-176B-ml/checkpoints-weights/global_step72000
8890
8991
9092
# todo:
9193
92-
gsutil rsync -x "bf16.*" -r $six_ALL_CCFRSCRATCH/checkpoints/tr11-176B-ml/checkpoints/main/global_step63000 gs://bigscience-backups/tr11-176B-ml/checkpoints-weights/global_step63000
93-
gsutil rsync -x "bf16.*" -r $six_ALL_CCFRSCRATCH/checkpoints/tr11-176B-ml/checkpoints/main/global_step66000 gs://bigscience-backups/tr11-176B-ml/checkpoints-weights/global_step66000
94-
gsutil rsync -x "bf16.*" -r $six_ALL_CCFRSCRATCH/checkpoints/tr11-176B-ml/checkpoints/main/global_step69000 gs://bigscience-backups/tr11-176B-ml/checkpoints-weights/global_step69000
95-
gsutil rsync -x "bf16.*" -r $six_ALL_CCFRSCRATCH/checkpoints/tr11-176B-ml/checkpoints/main/global_step72000 gs://bigscience-backups/tr11-176B-ml/checkpoints-weights/global_step72000
9694
gsutil rsync -x "bf16.*" -r $six_ALL_CCFRSCRATCH/checkpoints/tr11-176B-ml/checkpoints/main/global_step75000 gs://bigscience-backups/tr11-176B-ml/checkpoints-weights/global_step75000
9795
gsutil rsync -x "bf16.*" -r $six_ALL_CCFRSCRATCH/checkpoints/tr11-176B-ml/checkpoints/main/global_step78000 gs://bigscience-backups/tr11-176B-ml/checkpoints-weights/global_step78000
9896
gsutil rsync -x "bf16.*" -r $six_ALL_CCFRSCRATCH/checkpoints/tr11-176B-ml/checkpoints/main/global_step81000 gs://bigscience-backups/tr11-176B-ml/checkpoints-weights/global_step81000
@@ -133,3 +131,15 @@ gsutil rsync -x "bf16.*" -r $six_ALL_CCFRSTORE/checkpoints/tr11-176B-ml/checkpoi
133131
# todo:
134132
135133
```
134+
135+
136+
## Tarring the checkpoints in STORE
137+
138+
Since we don't have too many inodes in STORE we ought to tar the checkpoints
139+
140+
```
141+
cd /gpfsdsstore/projects/rech/six/commun/checkpoints/tr11-176B-ml/checkpoints
142+
cd 1
143+
find * -maxdepth 0 -type d -exec tar cvf {}.tar {} \;
144+
145+
```

0 commit comments

Comments
 (0)