Skip to content

Commit 6a324e8

Browse files
authored
ci: tests now run with HF_DATASETS_CACHE to speed up e2e time (#41)
Signed-off-by: Terry Kong <terryk@nvidia.com>
1 parent 4d62783 commit 6a324e8

File tree

5 files changed

+45
-5
lines changed

5 files changed

+45
-5
lines changed

.github/workflows/_run_test.yml

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,8 @@ jobs:
6868
# NOTE: under certain circumstances, the checkout action cannot clean up the workspace properly, so
6969
# this workaround is needed to ensure that the workspace is clean by removing all files created by root.
7070
#
71+
# Tracking issue: https://github.com/NVIDIA/reinforcer/issues/76
72+
#
7173
# The error observed looked like this from the checkout action:
7274
# Run actions/checkout@v4
7375
# ...
@@ -85,15 +87,32 @@ jobs:
8587

8688
- name: Start container
8789
run: |
90+
# TODO: re-enable caching (--env UV_CACHE_DIR=/uv_cache --volume /mnt/datadrive/TestData/reinforcer/uv_cache:/uv_cache); it is disabled
91+
# for now since it results in
92+
#
93+
# Using CPython 3.12.9 interpreter at: /home/ray/anaconda3/bin/python3
94+
# Creating virtual environment at: .venv
95+
# × Failed to download and build `antlr4-python3-runtime==4.9.3`
96+
# ├─▶ Failed to create temporary virtualenv
97+
# ╰─▶ Permission denied (os error 13)
98+
# help: `antlr4-python3-runtime` (v4.9.3) was included because
99+
# `nemo-reinforcer` (v0.0.1) depends on `math-verify` (v0.7.0) which
100+
# depends on `latex2sympy2-extended==1.10.1` (v1.10.1) which depends on
101+
# `antlr4-python3-runtime>=4.9.3, <=4.13.2`
102+
#
103+
# Something about our CI machines causes this issue since it is not reproducible locally.
104+
88105
docker run --rm -d --name nemo_container_${{ github.run_id }} --runtime=nvidia --gpus all --shm-size=64g \
89106
--env TRANSFORMERS_OFFLINE=0 \
90107
--env HYDRA_FULL_ERROR=1 \
91108
--env HF_HOME=/home/TestData/reinforcer/hf_home \
109+
--env HF_DATASETS_CACHE=/home/TestData/reinforcer/hf_datasets_cache \
92110
--env REINFORCER_REPO_DIR=/opt/reinforcer \
93111
--volume $PWD:/opt/reinforcer \
94112
--volume /mnt/datadrive/TestData/reinforcer/datasets:/opt/reinforcer/datasets:ro \
95113
--volume /mnt/datadrive/TestData/reinforcer/checkpoints:/home/TestData/reinforcer/checkpoints:ro \
96114
--volume /mnt/datadrive/TestData/reinforcer/hf_home/hub:/home/TestData/reinforcer/hf_home/hub \
115+
--volume /mnt/datadrive/TestData/reinforcer/hf_datasets_cache:/home/TestData/reinforcer/hf_datasets_cache \
97116
nemoci.azurecr.io/nemo_reinforcer_container:${{ github.run_id }} \
98117
bash -c "sleep $(( ${{ inputs.TIMEOUT }} * 60 + 60 ))"
99118

.github/workflows/cicd-main.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ jobs:
150150
if: ${{ needs.pre-flight.outputs.run_ci == 'true' }}
151151
with:
152152
RUNNER: self-hosted-azure
153-
TIMEOUT: 10
153+
TIMEOUT: 15
154154
SCRIPT: |
155155
cd ${REINFORCER_REPO_DIR}
156156
uv run --extra test bash -x ./tests/run_unit.sh
@@ -171,7 +171,7 @@ jobs:
171171
# TODO: For now, allow these to fail since the checks are not robust.
172172
IS_OPTIONAL: true
173173
RUNNER: self-hosted-azure
174-
TIMEOUT: 8
174+
TIMEOUT: 15
175175
SCRIPT: |
176176
cd ${REINFORCER_REPO_DIR}
177177
uv run bash ./tests/functional/${{ matrix.test_case }}

tests/functional/grpo.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,10 @@ LOG_DIR=$SCRIPT_DIR/$(basename $0 .sh)-logs
1111
JSON_METRICS=$LOG_DIR/$(basename $0 .sh).json
1212
RUN_LOG=$LOG_DIR/$(basename $0 .sh).log
1313
export RAY_DEDUP_LOGS=0
14-
export UV_CACHE_DIR=$PROJECT_ROOT/uv_cache
14+
export UV_CACHE_DIR=${UV_CACHE_DIR:-$PROJECT_ROOT/uv_cache}
1515
export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-}
1616

17+
rm -rf $LOG_DIR
1718
mkdir -p $LOG_DIR
1819

1920
cd $PROJECT_ROOT

tests/functional/sft.sh

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,15 +11,18 @@ LOG_DIR=$SCRIPT_DIR/$(basename $0 .sh)-logs
1111
JSON_METRICS=$LOG_DIR/$(basename $0 .sh).json
1212
RUN_LOG=$LOG_DIR/$(basename $0 .sh).log
1313
export RAY_DEDUP_LOGS=0
14-
export UV_CACHE_DIR=$PROJECT_ROOT/uv_cache
14+
export UV_CACHE_DIR=${UV_CACHE_DIR:-$PROJECT_ROOT/uv_cache}
1515
export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-}
1616

17+
rm -rf $LOG_DIR
1718
mkdir -p $LOG_DIR
1819

1920
cd $PROJECT_ROOT
2021
python -u $PROJECT_ROOT/examples/run_sft.py \
22+
policy.model_name=meta-llama/Llama-3.2-1B \
2123
cluster.gpus_per_node=2 \
2224
sft.max_num_steps=10 \
25+
sft.val_batches=1 \
2326
logger.tensorboard_enabled=true \
2427
logger.log_dir=$LOG_DIR \
2528
logger.wandb_enabled=false \

tests/run_functional_in_docker.sh

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,11 @@ TEST_SCRIPT=$(realpath $1)
2727
CONTAINER=${CONTAINER}
2828

2929
export HF_HOME=${HF_HOME:-$(realpath $SCRIPT_DIR/../hf_home)}
30+
export HF_DATASETS_CACHE=${HF_DATASETS_CACHE:-$(realpath $SCRIPT_DIR/../hf_datasets_cache)}
31+
export UV_CACHE_DIR=${UV_CACHE_DIR:-$(realpath $SCRIPT_DIR/../uv_cache)}
3032
mkdir -p $HF_HOME
33+
mkdir -p $HF_DATASETS_CACHE
34+
mkdir -p $UV_CACHE_DIR
3135

3236
# Check if running in GitLab CI
3337
INTERACTIVE_FLAG=""
@@ -44,4 +48,17 @@ fi
4448
# We have found that 111 does not always work and can leave the filesystem permissions in a bad state.
4549

4650
# Run the script inside the Docker container with GPU support
47-
docker run -u root $INTERACTIVE_FLAG --ulimit memlock=-1 --ulimit stack=67108864 --rm --gpus '"device=0,1"' -v "$PROJECT_ROOT:$PROJECT_ROOT" -v $HF_HOME:/hf_home -e WANDB_API_KEY -e HF_TOKEN -e HF_HOME=/hf_home -e HOME=/tmp/ -w $SCRIPT_DIR "$CONTAINER" -- bash -x -c "umask 000 && uv run bash -x $TEST_SCRIPT"
51+
docker run -u root $INTERACTIVE_FLAG --ulimit memlock=-1 --ulimit stack=67108864 --rm --gpus '"device=0,1"' \
52+
-v "$PROJECT_ROOT:$PROJECT_ROOT" \
53+
-v $HF_HOME:/hf_home \
54+
-v $HF_DATASETS_CACHE:/hf_datasets_cache \
55+
-v $UV_CACHE_DIR:/uv_cache \
56+
-e WANDB_API_KEY \
57+
-e HF_TOKEN \
58+
-e HF_HOME=/hf_home \
59+
-e HF_DATASETS_CACHE=/hf_datasets_cache \
60+
-e UV_CACHE_DIR=/uv_cache \
61+
-e HOME=/tmp/ \
62+
-w $SCRIPT_DIR \
63+
"$CONTAINER" -- \
64+
bash -x -c "umask 000 && uv run bash -x $TEST_SCRIPT"

0 commit comments

Comments
 (0)