#!/bin/bash
#SBATCH --job-name=move_first_checkpoints_to_tar # job name
#SBATCH --ntasks=1 # number of MP tasks
#SBATCH --nodes=1
#SBATCH --cpus-per-task=4 # number of cores per tasks
#SBATCH --hint=nomultithread # we get physical cores not logical
#SBATCH --time=20:00:00 # maximum execution time (HH:MM:SS)
#SBATCH --output=logs/%x-%j.out # output file name
#SBATCH --account=six@cpu
#SBATCH --partition=cpu_p1

# Archive ONE checkpoint of an experiment into an uncompressed tar file.
#
# The checkpoint is read from
#   $six_ALL_CCFRWORK/checkpoints/<experiment>/checkpoints/main/<checkpoint>
# and written to
#   $six_ALL_CCFRSTORE/checkpoints/<experiment>/checkpoints/main/<checkpoint>.tar
#
# Which checkpoint is archived is selected by SLURM_ARRAY_TASK_ID: it is used
# as a 0-based index into the first <count> entries of the version-sorted
# listing of the experiment's checkpoints/main directory.
#
# Usage:
#   To archive everything, the job must be submitted as an array via the
#   --array=0-<count - 1>%1 sbatch flag, e.g.:
#     sbatch --array=0-149%1 move_first_150_checkpoints_to_store.slurm tr11b-1B3-ml 150
#
# Arguments:
#   $1  experiment directory name under $six_ALL_CCFRWORK/checkpoints
#   $2  number of leading checkpoints to consider (positive integer)
#       (original note: batch size 512 -> first 150 checkpoints for 39B
#       tokens, batch size 256 -> 300)
#
# Environment:
#   six_ALL_CCFRWORK     root holding the source checkpoints
#   six_ALL_CCFRSTORE    root receiving the tar files
#   SLURM_ARRAY_TASK_ID  index of the checkpoint handled by this array task

# DEBUG
# SLURM_ARRAY_TASK_ID=0 # 0-149

# Abort on the first failing command so that a failed tar makes the job exit
# non-zero instead of being reported as successful.
set -euo pipefail

die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

(( $# >= 2 )) || die "usage: sbatch --array=0-<count - 1>%1 <script> <experiment> <count>"

experiment=$1
count=$2

[[ -n "$experiment" ]] || die "experiment name (\$1) must not be empty"
[[ "$count" =~ ^[1-9][0-9]*$ ]] || die "count (\$2) must be a positive integer, got '${count}'"

: "${six_ALL_CCFRWORK:?must be set}"
: "${six_ALL_CCFRSTORE:?must be set}"
: "${SLURM_ARRAY_TASK_ID:?must be set (submit with sbatch --array=...)}"
[[ "$SLURM_ARRAY_TASK_ID" =~ ^(0|[1-9][0-9]*)$ ]] \
  || die "SLURM_ARRAY_TASK_ID must be a non-negative integer, got '${SLURM_ARRAY_TASK_ID}'"

# Work from the checkpoints root so that the paths stored inside the archive
# are relative (<experiment>/checkpoints/main/<checkpoint>).
cd -- "${six_ALL_CCFRWORK}/checkpoints" \
  || die "cannot cd to ${six_ALL_CCFRWORK}/checkpoints"

DIRNAME="${experiment}/checkpoints/main"
[[ -d "$DIRNAME" ]] || die "no such directory: ${PWD}/${DIRNAME}"

# Earlier alternatives, kept for reference:
# readarray CHECKPOINTS < <(find . -regex '\./tr11[a-z].*/global_step[0-9]*')
# DEBUG regex to test out only on tr11e-350
# readarray CHECKPOINTS < <(find . -regex '\./tr11e-350M-ml/.*/global_step[0-9]*')

# `ls -v` gives a natural (version) sort of the entry names, so numeric
# suffixes are ordered by value rather than lexically.
# -t strips the trailing newline from every element; without it each array
# entry would carry a newline into the paths built below.
readarray -t CHECKPOINTS < <(ls -v -- "$DIRNAME" | head -n "$count")

echo "Total number of checkpoints to tar: ${#CHECKPOINTS[@]}"

# An out-of-range index would expand to an empty name, and the whole
# checkpoints/main directory would then be archived by mistake.
(( SLURM_ARRAY_TASK_ID < ${#CHECKPOINTS[@]} )) \
  || die "SLURM_ARRAY_TASK_ID=${SLURM_ARRAY_TASK_ID} is out of range (only ${#CHECKPOINTS[@]} checkpoints listed)"

BASENAME=${CHECKPOINTS[SLURM_ARRAY_TASK_ID]}
[[ -n "$BASENAME" ]] || die "empty checkpoint name at index ${SLURM_ARRAY_TASK_ID}"

CHECKPOINT_TO_TAR="${DIRNAME}/${BASENAME}"
echo "Checkpoint to tar: ./${CHECKPOINT_TO_TAR}"

CHECKPOINT_TAR_TO_FOLDER="${six_ALL_CCFRSTORE}/checkpoints/${DIRNAME}"
CHECKPOINT_TAR_TO="${CHECKPOINT_TAR_TO_FOLDER}/${BASENAME}.tar"

mkdir -p -- "$CHECKPOINT_TAR_TO_FOLDER"
echo "$CHECKPOINT_TO_TAR"
echo "$CHECKPOINT_TAR_TO_FOLDER"

# Plain, uncompressed tar. `cvfj` would enable bz2 compression; per the
# original author's note it won't change much.
tar -cvf "$CHECKPOINT_TAR_TO" -- "$CHECKPOINT_TO_TAR"