|
#!/bin/bash
#SBATCH --job-name=tr11b_move_to_tar # job name
#SBATCH --ntasks=1 # number of MP tasks
#SBATCH --nodes=1
#SBATCH --cpus-per-task=4 # number of cores per tasks
#SBATCH --hint=nomultithread # we get physical cores not logical
#SBATCH --time=20:00:00 # maximum execution time (HH:MM:SS)
#SBATCH --output=logs/%x-%j.out # output file name
#SBATCH --account=six@cpu
#SBATCH --array=0-1362%1
#SBATCH --partition=cpu_p1

# Purpose: each SLURM array task picks ONE checkpoint path under
#   $six_ALL_CCFRWORK/checkpoints  (selected by SLURM_ARRAY_TASK_ID)
# and archives it, uncompressed, to
#   $six_ALL_CCFRSTORE/checkpoints/<same relative dir>/<checkpoint name>.tar
#
# Required environment: six_ALL_CCFRWORK, six_ALL_CCFRSTORE, SLURM_ARRAY_TASK_ID.
#
# NOTE: keep all executable commands below the #SBATCH block above.

# Abort on any failed command, unset variable, or failed pipeline stage so a
# broken run is reported as a failed task instead of continuing silently.
set -euo pipefail

# DEBUG
# SLURM_ARRAY_TASK_ID=0 # 0-6549

# Fail early with a clear message; an empty value here would otherwise turn
# the source/destination paths into "/checkpoints".
: "${six_ALL_CCFRWORK:?six_ALL_CCFRWORK must be set}"
: "${six_ALL_CCFRSTORE:?six_ALL_CCFRSTORE must be set}"
: "${SLURM_ARRAY_TASK_ID:?SLURM_ARRAY_TASK_ID must be set (run as an array job or use the DEBUG line)}"

if ! [[ "$SLURM_ARRAY_TASK_ID" =~ ^[0-9]+$ ]]; then
  echo "SLURM_ARRAY_TASK_ID is not a non-negative integer: $SLURM_ARRAY_TASK_ID" >&2
  exit 1
fi
# Force base 10 so an id with leading zeros is not parsed as octal.
TASK_INDEX=$((10#$SLURM_ARRAY_TASK_ID))

pushd "$six_ALL_CCFRWORK/checkpoints" || {
  echo "Cannot enter $six_ALL_CCFRWORK/checkpoints" >&2
  exit 1
}

# readarray CHECKPOINTS < <(find . -regex '\./tr11[a-z].*/global_step[0-9]*')
# DEBUG regex to test out only on tr11e-350
# readarray CHECKPOINTS < <(find . -regex '\./tr11e-350M-ml/.*/global_step[0-9]*')
# batch size 512 -> one out of 4 checkpoints for 1B tokens
#
# -t strips the trailing newline from every entry.
# The listing is sorted because every array task runs its own `find`; a stable
# order is what guarantees that index i names the same checkpoint in each task.
readarray -t CHECKPOINTS < <(
  find . -regex '\./tr11b-1B3-ml/.*/global_step[0-9]*000' | LC_ALL=C sort
)

echo "Total number of checkpoints to tar: ${#CHECKPOINTS[@]}"

# The --array range in the header is hard-coded and may be larger than the
# number of matches. Without this guard an out-of-range index yields an empty
# entry, which later expands to the path "/" and makes tar archive the root
# filesystem.
if (( TASK_INDEX >= ${#CHECKPOINTS[@]} )); then
  echo "No checkpoint at index $TASK_INDEX (found ${#CHECKPOINTS[@]}); nothing to do." >&2
  popd
  exit 0
fi

CHECKPOINT_TO_TAR=${CHECKPOINTS[$TASK_INDEX]}
echo "Checkpoint to tar: $CHECKPOINT_TO_TAR"

# Drop the leading "./" that find prepends, then split into the relative
# parent directory and the checkpoint's own name.
CHECKPOINT_TO_TAR=${CHECKPOINT_TO_TAR#./}
DIRNAME=$(dirname -- "$CHECKPOINT_TO_TAR")
BASENAME=$(basename -- "$CHECKPOINT_TO_TAR")

# Mirror the relative directory layout on the destination side.
CHECKPOINT_TO_TAR=$DIRNAME/$BASENAME
CHECKPOINT_TAR_TO_FOLDER=$six_ALL_CCFRSTORE/checkpoints/$DIRNAME
CHECKPOINT_TAR_TO=$CHECKPOINT_TAR_TO_FOLDER/$BASENAME.tar

mkdir -p -- "$CHECKPOINT_TAR_TO_FOLDER"
echo "$CHECKPOINT_TO_TAR"
echo "$CHECKPOINT_TAR_TO_FOLDER"

# cvfj for bz2 compression; won't change much
# The member path is relative to the pushd directory, so the archive stores
# "<DIRNAME>/<BASENAME>/..." rather than an absolute path.
tar cvf "$CHECKPOINT_TAR_TO" "$CHECKPOINT_TO_TAR"

popd
| 44 | + |
0 commit comments