Skip to content

Commit e85bdb3

Browse files
committed
added model tarring scripts
1 parent bcb1768 commit e85bdb3

File tree

5 files changed

+219
-0
lines changed

5 files changed

+219
-0
lines changed
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
#!/bin/bash
#SBATCH --job-name=tr11b_move_to_tar # job name
#SBATCH --ntasks=1 # number of MP tasks
#SBATCH --nodes=1
#SBATCH --cpus-per-task=4 # number of cores per tasks
#SBATCH --hint=nomultithread # we get physical cores not logical
#SBATCH --time=20:00:00 # maximum execution time (HH:MM:SS)
#SBATCH --output=logs/%x-%j.out # output file name
#SBATCH --account=six@cpu
#SBATCH --array=0-1362%1
#SBATCH --partition=cpu_p1

# Archive one tr11b-1B3-ml checkpoint per Slurm array task.
#
# Each task lists the matching global_step* paths under
# $six_ALL_CCFRWORK/checkpoints, picks the one at index SLURM_ARRAY_TASK_ID
# and writes it as an uncompressed tar to the same relative location under
# $six_ALL_CCFRSTORE/checkpoints.
#
# Required environment: six_ALL_CCFRWORK, six_ALL_CCFRSTORE and
# SLURM_ARRAY_TASK_ID.

# Fail the job when any step (notably tar) fails, instead of reporting the
# exit status of the trailing popd.
set -euo pipefail

# DEBUG
# SLURM_ARRAY_TASK_ID=0 # 0-6549

: "${six_ALL_CCFRWORK:?six_ALL_CCFRWORK must be set}"
: "${six_ALL_CCFRSTORE:?six_ALL_CCFRSTORE must be set}"
: "${SLURM_ARRAY_TASK_ID:?SLURM_ARRAY_TASK_ID must be set}"

# Stop if the checkpoint root is missing; otherwise find would scan whatever
# directory the job happened to start in.
pushd "$six_ALL_CCFRWORK/checkpoints" || exit 1

# readarray CHECKPOINTS < <(find . -regex '\./tr11[a-z].*/global_step[0-9]*')
# DEBUG regex to test out only on tr11e-350
# readarray CHECKPOINTS < <(find . -regex '\./tr11e-350M-ml/.*/global_step[0-9]*')
# batch size 512 -> one out of 4 checkpoints for 1B tokens
# -t strips the trailing newline from each element; sorting keeps the
# index -> checkpoint mapping identical across array tasks, since every task
# re-runs find independently.
readarray -t CHECKPOINTS < <(
  find . -regex '\./tr11b-1B3-ml/.*/global_step[0-9]*000' | LC_ALL=C sort
)

echo "Total number of checkpoints to tar: ${#CHECKPOINTS[@]}"

# The --array range above is hard-coded. An out-of-range index yields an empty
# path, which would turn the tar operand below into "/".
if (( SLURM_ARRAY_TASK_ID >= ${#CHECKPOINTS[@]} )); then
  echo "No checkpoint at index $SLURM_ARRAY_TASK_ID (found ${#CHECKPOINTS[@]}); aborting" >&2
  exit 1
fi

CHECKPOINT_TO_TAR=${CHECKPOINTS[$SLURM_ARRAY_TASK_ID]}
echo "Checkpoint to tar: $CHECKPOINT_TO_TAR"

# Drop the leading "./" that find prepends so the path is usable both as the
# tar member name and as the sub-path under the destination root.
TEMPNAME=$(dirname -- "$CHECKPOINT_TO_TAR")
DIRNAME=${TEMPNAME#./}
BASENAME=$(basename -- "$CHECKPOINT_TO_TAR")

CHECKPOINT_TO_TAR=$DIRNAME/$BASENAME
CHECKPOINT_TAR_TO_FOLDER=$six_ALL_CCFRSTORE/checkpoints/$DIRNAME
CHECKPOINT_TAR_TO=$CHECKPOINT_TAR_TO_FOLDER/$BASENAME.tar

mkdir -p -- "$CHECKPOINT_TAR_TO_FOLDER"
echo "$CHECKPOINT_TO_TAR"
# Log the full archive path, as the sibling tr11c/d/e/f scripts do.
echo "$CHECKPOINT_TAR_TO"

# cvfj for bz2 compression; won't change much
tar cvf "$CHECKPOINT_TAR_TO" "$CHECKPOINT_TO_TAR"

popd
44+
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
#!/bin/bash
#SBATCH --job-name=tr11c_move_to_tar # job name
#SBATCH --ntasks=1 # number of MP tasks
#SBATCH --nodes=1
#SBATCH --cpus-per-task=4 # number of cores per tasks
#SBATCH --hint=nomultithread # we get physical cores not logical
#SBATCH --time=20:00:00 # maximum execution time (HH:MM:SS)
#SBATCH --output=logs/%x-%j.out # output file name
#SBATCH --account=six@cpu
#SBATCH --array=0-239%1
#SBATCH --partition=cpu_p1

# Archive one tr11c-2B5-ml checkpoint per Slurm array task.
#
# Each task lists the matching global_step* paths under
# $six_ALL_CCFRWORK/checkpoints, picks the one at index SLURM_ARRAY_TASK_ID
# and writes it as an uncompressed tar to the same relative location under
# $six_ALL_CCFRSTORE/checkpoints.
#
# Required environment: six_ALL_CCFRWORK, six_ALL_CCFRSTORE and
# SLURM_ARRAY_TASK_ID.

# Fail the job when any step (notably tar) fails, instead of reporting the
# exit status of the trailing popd.
set -euo pipefail

# DEBUG
# SLURM_ARRAY_TASK_ID=0 # 0-6549

: "${six_ALL_CCFRWORK:?six_ALL_CCFRWORK must be set}"
: "${six_ALL_CCFRSTORE:?six_ALL_CCFRSTORE must be set}"
: "${SLURM_ARRAY_TASK_ID:?SLURM_ARRAY_TASK_ID must be set}"

# Stop if the checkpoint root is missing; otherwise find would scan whatever
# directory the job happened to start in.
pushd "$six_ALL_CCFRWORK/checkpoints" || exit 1

# readarray CHECKPOINTS < <(find . -regex '\./tr11[a-z].*/global_step[0-9]*')
# DEBUG regex to test out only on tr11e-350
# readarray CHECKPOINTS < <(find . -regex '\./tr11e-350M-ml/.*/global_step[0-9]*')
# batch size 512 -> one out of 4 checkpoints for 1B tokens
# -t strips the trailing newline from each element; sorting keeps the
# index -> checkpoint mapping identical across array tasks, since every task
# re-runs find independently.
readarray -t CHECKPOINTS < <(
  find . -regex '\./tr11c-2B5-ml/.*/global_step[0-9]*000' | LC_ALL=C sort
)

echo "Total number of checkpoints to tar: ${#CHECKPOINTS[@]}"

# The --array range above is hard-coded. An out-of-range index yields an empty
# path, which would turn the tar operand below into "/".
if (( SLURM_ARRAY_TASK_ID >= ${#CHECKPOINTS[@]} )); then
  echo "No checkpoint at index $SLURM_ARRAY_TASK_ID (found ${#CHECKPOINTS[@]}); aborting" >&2
  exit 1
fi

CHECKPOINT_TO_TAR=${CHECKPOINTS[$SLURM_ARRAY_TASK_ID]}
echo "Checkpoint to tar: $CHECKPOINT_TO_TAR"

# Drop the leading "./" that find prepends so the path is usable both as the
# tar member name and as the sub-path under the destination root.
TEMPNAME=$(dirname -- "$CHECKPOINT_TO_TAR")
DIRNAME=${TEMPNAME#./}
BASENAME=$(basename -- "$CHECKPOINT_TO_TAR")

CHECKPOINT_TO_TAR=$DIRNAME/$BASENAME
CHECKPOINT_TAR_TO_FOLDER=$six_ALL_CCFRSTORE/checkpoints/$DIRNAME
CHECKPOINT_TAR_TO=$CHECKPOINT_TAR_TO_FOLDER/$BASENAME.tar

mkdir -p -- "$CHECKPOINT_TAR_TO_FOLDER"
echo "$CHECKPOINT_TO_TAR"
echo "$CHECKPOINT_TAR_TO"

# cvfj for bz2 compression; won't change much
tar cvf "$CHECKPOINT_TAR_TO" "$CHECKPOINT_TO_TAR"

popd
44+
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
#!/bin/bash
#SBATCH --job-name=tr11d_move_to_tar # job name
#SBATCH --ntasks=1 # number of MP tasks
#SBATCH --nodes=1
#SBATCH --cpus-per-task=4 # number of cores per tasks
#SBATCH --hint=nomultithread # we get physical cores not logical
#SBATCH --time=20:00:00 # maximum execution time (HH:MM:SS)
#SBATCH --output=logs/%x-%j.out # output file name
#SBATCH --account=six@cpu
#SBATCH --array=0-166%1
#SBATCH --partition=cpu_p1

# Archive one tr11d-760M-ml checkpoint per Slurm array task.
#
# Each task lists the matching global_step* paths under
# $six_ALL_CCFRWORK/checkpoints, picks the one at index SLURM_ARRAY_TASK_ID
# and writes it as an uncompressed tar to the same relative location under
# $six_ALL_CCFRSTORE/checkpoints.
#
# Required environment: six_ALL_CCFRWORK, six_ALL_CCFRSTORE and
# SLURM_ARRAY_TASK_ID.

# Fail the job when any step (notably tar) fails, instead of reporting the
# exit status of the trailing popd.
set -euo pipefail

# DEBUG
# SLURM_ARRAY_TASK_ID=0 # 0-6549

: "${six_ALL_CCFRWORK:?six_ALL_CCFRWORK must be set}"
: "${six_ALL_CCFRSTORE:?six_ALL_CCFRSTORE must be set}"
: "${SLURM_ARRAY_TASK_ID:?SLURM_ARRAY_TASK_ID must be set}"

# Stop if the checkpoint root is missing; otherwise find would scan whatever
# directory the job happened to start in.
pushd "$six_ALL_CCFRWORK/checkpoints" || exit 1

# readarray CHECKPOINTS < <(find . -regex '\./tr11[a-z].*/global_step[0-9]*')
# DEBUG regex to test out only on tr11e-350
# readarray CHECKPOINTS < <(find . -regex '\./tr11e-350M-ml/.*/global_step[0-9]*')
# batch size 256 -> one out of 8 checkpoints for 1B tokens
# -t strips the trailing newline from each element; sorting keeps the
# index -> checkpoint mapping identical across array tasks, since every task
# re-runs find independently.
readarray -t CHECKPOINTS < <(
  find . -regex '\./tr11d-760M-ml/.*/global_step[0-9]*[02468]000' | LC_ALL=C sort
)

echo "Total number of checkpoints to tar: ${#CHECKPOINTS[@]}"

# The --array range above is hard-coded. An out-of-range index yields an empty
# path, which would turn the tar operand below into "/".
if (( SLURM_ARRAY_TASK_ID >= ${#CHECKPOINTS[@]} )); then
  echo "No checkpoint at index $SLURM_ARRAY_TASK_ID (found ${#CHECKPOINTS[@]}); aborting" >&2
  exit 1
fi

CHECKPOINT_TO_TAR=${CHECKPOINTS[$SLURM_ARRAY_TASK_ID]}
echo "Checkpoint to tar: $CHECKPOINT_TO_TAR"

# Drop the leading "./" that find prepends so the path is usable both as the
# tar member name and as the sub-path under the destination root.
TEMPNAME=$(dirname -- "$CHECKPOINT_TO_TAR")
DIRNAME=${TEMPNAME#./}
BASENAME=$(basename -- "$CHECKPOINT_TO_TAR")

CHECKPOINT_TO_TAR=$DIRNAME/$BASENAME
CHECKPOINT_TAR_TO_FOLDER=$six_ALL_CCFRSTORE/checkpoints/$DIRNAME
CHECKPOINT_TAR_TO=$CHECKPOINT_TAR_TO_FOLDER/$BASENAME.tar

mkdir -p -- "$CHECKPOINT_TAR_TO_FOLDER"
echo "$CHECKPOINT_TO_TAR"
echo "$CHECKPOINT_TAR_TO"

# cvfj for bz2 compression; won't change much
tar cvf "$CHECKPOINT_TAR_TO" "$CHECKPOINT_TO_TAR"

popd
44+
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
#!/bin/bash
#SBATCH --job-name=move_to_tar # job name
#SBATCH --ntasks=1 # number of MP tasks
#SBATCH --nodes=1
#SBATCH --cpus-per-task=4 # number of cores per tasks
#SBATCH --hint=nomultithread # we get physical cores not logical
#SBATCH --time=20:00:00 # maximum execution time (HH:MM:SS)
#SBATCH --output=logs/%x-%j.out # output file name
#SBATCH --account=six@cpu
#SBATCH --array=0-276%1
#SBATCH --partition=cpu_p1

# Archive one tr11e-350M-ml checkpoint per Slurm array task.
#
# Each task lists the matching global_step* paths under
# $six_ALL_CCFRWORK/checkpoints, picks the one at index SLURM_ARRAY_TASK_ID
# and writes it as an uncompressed tar to the same relative location under
# $six_ALL_CCFRSTORE/checkpoints.
#
# Required environment: six_ALL_CCFRWORK, six_ALL_CCFRSTORE and
# SLURM_ARRAY_TASK_ID.
#
# NOTE(review): the job name has no "tr11e_" prefix, unlike the tr11b/c/d/f
# variants of this script. It is left unchanged here because it also sets the
# log file name through %x - confirm before renaming.

# Fail the job when any step (notably tar) fails, instead of reporting the
# exit status of the trailing popd.
set -euo pipefail

# DEBUG
# SLURM_ARRAY_TASK_ID=0 # 0-6549

: "${six_ALL_CCFRWORK:?six_ALL_CCFRWORK must be set}"
: "${six_ALL_CCFRSTORE:?six_ALL_CCFRSTORE must be set}"
: "${SLURM_ARRAY_TASK_ID:?SLURM_ARRAY_TASK_ID must be set}"

# Stop if the checkpoint root is missing; otherwise find would scan whatever
# directory the job happened to start in.
pushd "$six_ALL_CCFRWORK/checkpoints" || exit 1

# readarray CHECKPOINTS < <(find . -regex '\./tr11[a-z].*/global_step[0-9]*')
# DEBUG regex to test out only on tr11e-350
# readarray CHECKPOINTS < <(find . -regex '\./tr11e-350M-ml/.*/global_step[0-9]*')
# batch size 256 -> one out of 8 checkpoints for 1B tokens
# -t strips the trailing newline from each element; sorting keeps the
# index -> checkpoint mapping identical across array tasks, since every task
# re-runs find independently.
readarray -t CHECKPOINTS < <(
  find . -regex '\./tr11e-350M-ml/.*/global_step[0-9]*[02468]000' | LC_ALL=C sort
)

echo "Total number of checkpoints to tar: ${#CHECKPOINTS[@]}"

# The --array range above is hard-coded. An out-of-range index yields an empty
# path, which would turn the tar operand below into "/".
if (( SLURM_ARRAY_TASK_ID >= ${#CHECKPOINTS[@]} )); then
  echo "No checkpoint at index $SLURM_ARRAY_TASK_ID (found ${#CHECKPOINTS[@]}); aborting" >&2
  exit 1
fi

CHECKPOINT_TO_TAR=${CHECKPOINTS[$SLURM_ARRAY_TASK_ID]}
echo "Checkpoint to tar: $CHECKPOINT_TO_TAR"

# Drop the leading "./" that find prepends so the path is usable both as the
# tar member name and as the sub-path under the destination root.
TEMPNAME=$(dirname -- "$CHECKPOINT_TO_TAR")
DIRNAME=${TEMPNAME#./}
BASENAME=$(basename -- "$CHECKPOINT_TO_TAR")

CHECKPOINT_TO_TAR=$DIRNAME/$BASENAME
CHECKPOINT_TAR_TO_FOLDER=$six_ALL_CCFRSTORE/checkpoints/$DIRNAME
CHECKPOINT_TAR_TO=$CHECKPOINT_TAR_TO_FOLDER/$BASENAME.tar

mkdir -p -- "$CHECKPOINT_TAR_TO_FOLDER"
echo "$CHECKPOINT_TO_TAR"
echo "$CHECKPOINT_TAR_TO"

# cvfj for bz2 compression; won't change much
tar cvf "$CHECKPOINT_TAR_TO" "$CHECKPOINT_TO_TAR"

popd
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
#!/bin/bash
2+
#SBATCH --job-name=tr11f_move_to_tar # job name
3+
#SBATCH --ntasks=1 # number of MP tasks
4+
#SBATCH --nodes=1
5+
#SBATCH --cpus-per-task=4 # number of cores per tasks
6+
#SBATCH --hint=nomultithread # we get physical cores not logical
7+
#SBATCH --time=20:00:00 # maximum execution time (HH:MM:SS)
8+
#SBATCH --output=logs/%x-%j.out # output file name
9+
#SBATCH --account=six@cpu
10+
#SBATCH --array=0-155%1
11+
#SBATCH --partition=cpu_p1
12+
13+
# DEBUG
14+
# SLURM_ARRAY_TASK_ID=0 # 0-6549
15+
16+
pushd $six_ALL_CCFRWORK/checkpoints
17+
# readarray CHECKPOINTS < <(find . -regex '\./tr11[a-z].*/global_step[0-9]*')
18+
# DEBUG regex to test out only on tr11e-350
19+
# readarray CHECKPOINTS < <(find . -regex '\./tr11e-350M-ml/.*/global_step[0-9]*')
20+
# batch size 512 -> one out of 4 checkpoints for 1B tokens
21+
readarray CHECKPOINTS < <(find . -regex '\./tr11f-6B3-ml/.*/global_step[0-9]*000')
22+
23+
echo "Total number of checkpoints to tar: ${#CHECKPOINTS[@]}"
24+
25+
CHECKPOINT_TO_TAR=${CHECKPOINTS[$SLURM_ARRAY_TASK_ID]}
26+
echo "Checkpoint to tar: $CHECKPOINT_TO_TAR"
27+
28+
TEMPNAME=$(dirname $CHECKPOINT_TO_TAR)
29+
DIRNAME=${TEMPNAME:2}
30+
BASENAME=$(basename $CHECKPOINT_TO_TAR)
31+
32+
CHECKPOINT_TO_TAR=$DIRNAME/$BASENAME
33+
CHECKPOINT_TAR_TO_FOLDER=$six_ALL_CCFRSTORE/checkpoints/$DIRNAME
34+
CHECKPOINT_TAR_TO=$CHECKPOINT_TAR_TO_FOLDER/$BASENAME.tar
35+
36+
mkdir -p $CHECKPOINT_TAR_TO_FOLDER
37+
echo $CHECKPOINT_TO_TAR
38+
echo $CHECKPOINT_TAR_TO
39+
40+
# cvfj for bz2 compression; won't change much
41+
tar cvf $CHECKPOINT_TAR_TO $CHECKPOINT_TO_TAR
42+
43+
popd
44+

0 commit comments

Comments
 (0)