6868 # NOTE: under certain circumstances, the checkout action cannot clean up the workspace properly, so
6969 # this workaround is needed to ensure that the workspace is clean by removing all files created by root.
7070 #
71+ # Tracking issue: https://github.com/NVIDIA/reinforcer/issues/76
72+ #
7173 # The error observed looked like this from the checkout action:
7274 # Run actions/checkout@v4
7375 # ...
@@ -85,15 +87,32 @@ jobs:
8587
# Launch the test container for this workflow run. It is started detached (-d),
# named after the run ID, and removed automatically once it exits (--rm).
- name: Start container
  run: |
    # TODO: re-enable caching (--env UV_CACHE_DIR=/uv_cache --volume /mnt/datadrive/TestData/reinforcer/uv_cache:/uv_cache).
    # It is disabled for now since it results in:
    #
    #   Using CPython 3.12.9 interpreter at: /home/ray/anaconda3/bin/python3
    #   Creating virtual environment at: .venv
    #     × Failed to download and build `antlr4-python3-runtime==4.9.3`
    #     ├─▶ Failed to create temporary virtualenv
    #     ╰─▶ Permission denied (os error 13)
    #     help: `antlr4-python3-runtime` (v4.9.3) was included because
    #           `nemo-reinforcer` (v0.0.1) depends on `math-verify` (v0.7.0) which
    #           depends on `latex2sympy2-extended==1.10.1` (v1.10.1) which depends on
    #           `antlr4-python3-runtime>=4.9.3, <=4.13.2`
    #
    # Something about our CI machines causes this issue since it is not reproducible locally.

    # Notes on the invocation below:
    #   * "$PWD" is quoted so a workspace path containing whitespace is not split
    #     into separate arguments.
    #   * The datasets and checkpoints mounts are read-only (:ro); the Hugging Face
    #     hub and datasets cache mounts are writable.
    #   * The $(( ... )) arithmetic sits inside double quotes, so it is expanded by
    #     this step's shell before the container starts; the container simply sleeps
    #     for inputs.TIMEOUT * 60 + 60 seconds (TIMEOUT is presumably in minutes -
    #     confirm against the workflow's input definition).
    docker run --rm -d --name nemo_container_${{ github.run_id }} --runtime=nvidia --gpus all --shm-size=64g \
      --env TRANSFORMERS_OFFLINE=0 \
      --env HYDRA_FULL_ERROR=1 \
      --env HF_HOME=/home/TestData/reinforcer/hf_home \
      --env HF_DATASETS_CACHE=/home/TestData/reinforcer/hf_datasets_cache \
      --env REINFORCER_REPO_DIR=/opt/reinforcer \
      --volume "$PWD:/opt/reinforcer" \
      --volume /mnt/datadrive/TestData/reinforcer/datasets:/opt/reinforcer/datasets:ro \
      --volume /mnt/datadrive/TestData/reinforcer/checkpoints:/home/TestData/reinforcer/checkpoints:ro \
      --volume /mnt/datadrive/TestData/reinforcer/hf_home/hub:/home/TestData/reinforcer/hf_home/hub \
      --volume /mnt/datadrive/TestData/reinforcer/hf_datasets_cache:/home/TestData/reinforcer/hf_datasets_cache \
      nemoci.azurecr.io/nemo_reinforcer_container:${{ github.run_id }} \
      bash -c "sleep $(( ${{ inputs.TIMEOUT }} * 60 + 60 ))"
99118
0 commit comments