fix(ci): Failures on Main: Compiler GPU Tests, Tokenizer, and JsonArgparse Issues (#2149)

bhimrazy · pre-commit-ci[bot] · web-flow · commit 9174da2cc4ec · 2025-11-04T10:30:57.000+01:00
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
diff --git a/.github/workflows/cpu-tests.yml b/.github/workflows/cpu-tests.yml
@@ -31,6 +31,7 @@ env:
   TRANSFORMERS_CACHE: .cache-HF/transformers
   DATASETS_CACHE: .cache-HF/datasets
   HF_DATASETS_CACHE: .cache-HF/datasets
+  TORCH_URL: "https://download.pytorch.org/whl/cpu/"
 
 jobs:
   testing-imports:
@@ -51,7 +52,7 @@ jobs:
 
       - name: Install minimal dependencies
         run: |
-          pip install . -U
+          pip install . -U --extra-index-url="${TORCH_URL}"
           pip list
 
       - name: Testing package imports
@@ -119,7 +120,7 @@ jobs:
           python -m lightning_utilities.cli requirements set-oldest --req_files=pyproject.toml
       - name: Install dependencies
         run: |
-          pip install '.[extra,compiler,test]' -U --upgrade-strategy eager
+          pip install '.[extra,compiler,test]' -U --upgrade-strategy eager --extra-index-url="${TORCH_URL}"
           pip list
 
       - name: Run tests
diff --git a/.lightning/workflows/tests.yaml b/.lightning/workflows/tests.yaml
@@ -4,7 +4,7 @@ trigger:
   pull_request:
     branches: ["main"]
 
-image: "pytorchlightning/lightning-thunder:ubuntu24.04-cuda12.6.3-cudnn-fe1.10.0-py3.10-pt_2.7.1-dev"
+image: "pytorchlightning/lightning-thunder:ubuntu24.04-cuda12.6.3-cudnn-fe1.10.0-py3.10-pt_2.8.0-dev"
 machine: "L4_X_2"
 interruptible: "true"
 timeout: "45" # minutes
@@ -19,7 +19,7 @@ env:
   NCCL_DEBUG: "INFO"
   CUBLAS_WORKSPACE_CONFIG: ":4096:8"
   NCCL_IGNORE_DISABLED_P2P: "1"
-  TORCH_VERSION: "2.7.1"
+  TORCH_VERSION: "2.8.0"
   RUN_ONLY_CUDA_TESTS: "1" # run CUDA tests only
 
 run: |
@@ -30,7 +30,7 @@ run: |
   pip list
   set -ex
 
-  pip install -q '.[extra,test]' "torch==${TORCH_VERSION}" cffi -U
+  pip install -q '.[extra,test]' "torch==${TORCH_VERSION}" cffi -U --upgrade-strategy eager
 
   if [ "${dependency}" == "compiler" ]; then
     pip uninstall -y torchvision torchaudio
@@ -41,17 +41,20 @@ run: |
 
   pip list
   python -c "import torch ; gpus = torch.cuda.device_count() ; assert gpus >= 2, f'GPU: {gpus}'"
-  python -c "from torch import __version__ as ver ; assert str(ver).split('+')[0] == '$TORCH_VERSION', f'PyTorch: installed {ver} but expected $TORCH_VERSION'"
+  python -c "from torch import __version__ as ver ; assert str(ver).split('+')[0] == '${TORCH_VERSION}', f'PyTorch: installed {ver} but expected ${TORCH_VERSION}'"
 
   pytest -v --durations=100
 
   wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh
   PL_RUN_STANDALONE_TESTS=1 bash run_standalone_tests.sh "tests"
 
   if [ "${dependency}" == "compiler" ]; then
-    pip uninstall -y lightning-thunder
+    pip uninstall -y lightning-thunder transformers
     # install thunder from source, so that, thunder.tests will be available
     pip install -U "lightning-thunder[test] @ git+https://github.com/Lightning-AI/lightning-thunder.git" "torch==${TORCH_VERSION}"
+    # Pin transformers to match thunder's test_networks.py requirements
+    # See: https://github.com/Lightning-AI/lightning-thunder/blob/main/requirements/test.txt
+    pip install transformers==4.52.4 # todo: find a more robust way
     # without env var, it filters out all tests
     RUN_ONLY_CUDA_TESTS=0 pytest tests/ext_thunder/test_thunder_networks.py -v
   fi
diff --git a/pyproject.toml b/pyproject.toml
@@ -28,7 +28,7 @@ dependencies = [
   # download models:
   "huggingface-hub>=0.30,<0.35",
   "jsonargparse[signatures]>=4.31,<=4.32.1; python_version<'3.10'", # 4.33 does not seem to be compatible with Python 3.9
-  "jsonargparse[signatures]>=4.37; python_version>'3.9'",           # required to work with python3.12+
+  "jsonargparse[signatures]>=4.37,<=4.41; python_version>'3.9'",    # required to work with python3.12+
   "lightning>=2.5",
   "psutil==7",
   "safetensors>=0.4.3",
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -67,6 +67,14 @@ def destroy_process_group():
         torch.distributed.destroy_process_group()
 
 
+@pytest.fixture
+def turn_off_tf32_and_set_seed(monkeypatch):  # fixture: sets NVIDIA_TF32_OVERRIDE=0 and seeds torch with 42 around a test
+    monkeypatch.setenv("NVIDIA_TF32_OVERRIDE", "0")  # set via monkeypatch, so pytest restores the previous value at teardown
+    torch.manual_seed(42)  # fixed seed while the test runs
+    yield  # the requesting test executes here
+    torch.seed()  # teardown: re-seed torch so the fixed seed 42 does not carry over to later tests
+
+
 class MockTokenizer:
     """A dummy tokenizer that encodes each character as its ASCII code."""
 
diff --git a/tests/test_serve.py b/tests/test_serve.py
@@ -35,6 +35,7 @@ def _wait_and_check_response(waiting: int = 30):
 
 
 # todo: try to resolve this issue
+@pytest.mark.flaky(reruns=2, reruns_delay=30)
 @pytest.mark.xfail(condition=platform.system() == "Darwin", reason="it passes locally but having some issues on CI")
 def test_simple(tmp_path):
     seed_everything(123)
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
@@ -16,6 +16,7 @@
 
 
 # @pytest.mark.flaky(reruns=3, rerun_except=["AssertionError", "assert", "TypeError"])
+@pytest.mark.flaky(reruns=3, reruns_delay=120)
 @pytest.mark.parametrize("config", config_module.configs, ids=[c["hf_config"]["name"] for c in config_module.configs])
 def test_tokenizer_against_hf(config, tmp_path):
     config = config_module.Config(**config)
@@ -34,14 +35,17 @@ def test_tokenizer_against_hf(config, tmp_path):
     if "tokenizer.json" not in hf_files and "tokenizer.model" not in hf_files:
         raise ConnectionError("Unable to download any tokenizer files from HF")
 
-    # we need to rename the dir to match the model name in testing as well
-    # since we use to it determine the model in tokenizer.py
-    tmp_path = tmp_path.rename(tmp_path.parent / config.hf_config["name"])
+    # Create a clean, model-specific subdirectory for this test run.
+    # This avoids errors if previous runs or retries left files behind, ensuring the directory is always ready for fresh downloads and comparisons.
+    model_dir = tmp_path / config.hf_config["name"]
+    if model_dir.exists():
+        shutil.rmtree(model_dir)
+    os.makedirs(model_dir, exist_ok=True)
 
     for filename, hf_file in hf_files.items():
-        shutil.copy(hf_file, str(tmp_path / filename))
+        shutil.copy(hf_file, model_dir / filename)
 
-    ours = Tokenizer(tmp_path)
+    ours = Tokenizer(model_dir)
 
     assert ours.vocab_size == theirs.vocab_size
     if config.name == "Mixtral-8x22B-v0.1":