Skip to content

Commit f6900d7

Browse files
committed
added model tarring script for first n checkpoints
1 parent e85bdb3 commit f6900d7

File tree

1 file changed

+45
-0
lines changed

1 file changed

+45
-0
lines changed
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
#!/bin/bash
2+
#SBATCH --job-name=move_first_checkpoints_to_tar # job name
3+
#SBATCH --ntasks=1 # number of MP tasks
4+
#SBATCH --nodes=1
5+
#SBATCH --cpus-per-task=4 # number of cores per tasks
6+
#SBATCH --hint=nomultithread # we get physical cores not logical
7+
#SBATCH --time=20:00:00 # maximum execution time (HH:MM:SS)
8+
#SBATCH --output=logs/%x-%j.out # output file name
9+
#SBATCH --account=six@cpu
10+
#SBATCH --partition=cpu_p1
11+
12+
# DEBUG
13+
# SLURM_ARRAY_TASK_ID=0 # 0-149
14+
15+
16+
# You must also pass --array=0-<desired_number - 1>%1 as an sbatch flag to compress everything. $1 is the experiment directory name and $2 is the number of checkpoints, e.g. sbatch --array=0-149%1 move_first_150_checkpoints_to_store.slurm tr11b-1B3-ml 150
17+
18+
# Select the checkpoint this array task is responsible for.
#
# Positional arguments:
#   $1 - experiment directory name under $six_ALL_CCFRWORK/checkpoints
#   $2 - how many of the first checkpoints (version-sorted) are candidates
# Environment:
#   six_ALL_CCFRWORK     - root that contains the checkpoints/ directory
#   SLURM_ARRAY_TASK_ID  - index into the candidate list for this task
# Result:
#   CHECKPOINT_TO_TAR is set to ./<$1>/checkpoints/main/<selected entry>
if (( $# < 2 )); then
    echo "Usage: sbatch --array=0-<N-1>%1 <this_script> <experiment_name> <N>" >&2
    exit 1
fi
if [[ ! "${SLURM_ARRAY_TASK_ID:-}" =~ ^[0-9]+$ ]]; then
    echo "SLURM_ARRAY_TASK_ID must be a non-negative integer (submit with sbatch --array=...)" >&2
    exit 1
fi

# Abort instead of continuing in the wrong directory if the root is unset or
# the directory change fails.
pushd "${six_ALL_CCFRWORK:?six_ALL_CCFRWORK must be set}/checkpoints" || exit 1
# readarray CHECKPOINTS < <(find . -regex '\./tr11[a-z].*/global_step[0-9]*')
# DEBUG regex to test out only on tr11e-350
# readarray CHECKPOINTS < <(find . -regex '\./tr11e-350M-ml/.*/global_step[0-9]*')
# batch size 512 -> first 150 checkpoints for 39B tokens, batch size 256 -> 300
# `ls -v` is kept on purpose: it sorts numerically inside names, so "first N"
# follows the numeric order of the entries. -t strips the trailing newline
# from every entry so the selected name can be used safely when quoted.
readarray -t CHECKPOINTS < <(ls -v "./${1}/checkpoints/main/" | head -n "${2}")

echo "Total number of checkpoints to tar: ${#CHECKPOINTS[@]}"

# An out-of-range index would yield an empty entry, and the resulting path
# "./<exp>/checkpoints/main/" would make the later dirname/basename step
# archive the entire "main" directory. Refuse to continue in that case.
if [[ "${SLURM_ARRAY_TASK_ID}" -ge "${#CHECKPOINTS[@]}" ]]; then
    echo "SLURM_ARRAY_TASK_ID=${SLURM_ARRAY_TASK_ID} is out of range: only ${#CHECKPOINTS[@]} checkpoint(s) listed" >&2
    exit 1
fi

CHECKPOINT_TO_TAR="./${1}/checkpoints/main/${CHECKPOINTS[${SLURM_ARRAY_TASK_ID}]}"
echo "Checkpoint to tar: $CHECKPOINT_TO_TAR"
29+
30+
# Archive the selected checkpoint into the store area, mirroring its relative
# path: <cwd>/<dir>/<name>  ->  $six_ALL_CCFRSTORE/checkpoints/<dir>/<name>.tar
#
# Inputs:
#   CHECKPOINT_TO_TAR   - path of the checkpoint, relative to the current
#                         directory and normally prefixed with "./"
#   six_ALL_CCFRSTORE   - root of the destination storage area

# The selected entry may still carry a trailing newline if it was read
# without `readarray -t`; drop it so the quoted expansions below stay exact.
CHECKPOINT_TO_TAR=${CHECKPOINT_TO_TAR%$'\n'}

# A path ending in "/" means no entry was selected; basename would then
# resolve to the parent directory and the whole tree would be archived.
if [[ -z "${CHECKPOINT_TO_TAR}" || "${CHECKPOINT_TO_TAR}" == */ || ! -e "${CHECKPOINT_TO_TAR}" ]]; then
    echo "No valid checkpoint to tar: '${CHECKPOINT_TO_TAR}'" >&2
    exit 1
fi

TEMPNAME=$(dirname "$CHECKPOINT_TO_TAR")
# Strip the leading "./" only when it is actually present.
DIRNAME=${TEMPNAME#./}
BASENAME=$(basename "$CHECKPOINT_TO_TAR")

CHECKPOINT_TO_TAR=$DIRNAME/$BASENAME
# Fail loudly if the store root is unset rather than writing under /checkpoints.
CHECKPOINT_TAR_TO_FOLDER=${six_ALL_CCFRSTORE:?six_ALL_CCFRSTORE must be set}/checkpoints/$DIRNAME
CHECKPOINT_TAR_TO=$CHECKPOINT_TAR_TO_FOLDER/$BASENAME.tar

mkdir -p "$CHECKPOINT_TAR_TO_FOLDER" || exit 1
echo "$CHECKPOINT_TO_TAR"
echo "$CHECKPOINT_TAR_TO_FOLDER"

# cvfj for bz2 compression; won't change much
# Propagate a tar failure so the job is reported as failed instead of ending
# with the exit status of the final popd.
if ! tar cvf "$CHECKPOINT_TAR_TO" "$CHECKPOINT_TO_TAR"; then
    echo "tar failed for $CHECKPOINT_TO_TAR" >&2
    exit 1
fi

popd

0 commit comments

Comments
 (0)