Skip to content

Commit 9174da2

Browse files
fix(ci): Failures on Main: Compiler GPU Tests, Tokenizer, and JsonArgparse Issues (#2149)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 22c2a4f commit 9174da2

File tree

6 files changed

+30
-13
lines changed

6 files changed

+30
-13
lines changed

.github/workflows/cpu-tests.yml

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -31,6 +31,7 @@ env:
3131
TRANSFORMERS_CACHE: .cache-HF/transformers
3232
DATASETS_CACHE: .cache-HF/datasets
3333
HF_DATASETS_CACHE: .cache-HF/datasets
34+
TORCH_URL: "https://download.pytorch.org/whl/cpu/"
3435

3536
jobs:
3637
testing-imports:
@@ -51,7 +52,7 @@ jobs:
5152

5253
- name: Install minimal dependencies
5354
run: |
54-
pip install . -U
55+
pip install . -U --extra-index-url="${TORCH_URL}"
5556
pip list
5657
5758
- name: Testing package imports
@@ -119,7 +120,7 @@ jobs:
119120
python -m lightning_utilities.cli requirements set-oldest --req_files=pyproject.toml
120121
- name: Install dependencies
121122
run: |
122-
pip install '.[extra,compiler,test]' -U --upgrade-strategy eager
123+
pip install '.[extra,compiler,test]' -U --upgrade-strategy eager --extra-index-url="${TORCH_URL}"
123124
pip list
124125
125126
- name: Run tests

.lightning/workflows/tests.yaml

Lines changed: 8 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@ trigger:
44
pull_request:
55
branches: ["main"]
66

7-
image: "pytorchlightning/lightning-thunder:ubuntu24.04-cuda12.6.3-cudnn-fe1.10.0-py3.10-pt_2.7.1-dev"
7+
image: "pytorchlightning/lightning-thunder:ubuntu24.04-cuda12.6.3-cudnn-fe1.10.0-py3.10-pt_2.8.0-dev"
88
machine: "L4_X_2"
99
interruptible: "true"
1010
timeout: "45" # minutes
@@ -19,7 +19,7 @@ env:
1919
NCCL_DEBUG: "INFO"
2020
CUBLAS_WORKSPACE_CONFIG: ":4096:8"
2121
NCCL_IGNORE_DISABLED_P2P: "1"
22-
TORCH_VERSION: "2.7.1"
22+
TORCH_VERSION: "2.8.0"
2323
RUN_ONLY_CUDA_TESTS: "1" # run CUDA tests only
2424

2525
run: |
@@ -30,7 +30,7 @@ run: |
3030
pip list
3131
set -ex
3232
33-
pip install -q '.[extra,test]' "torch==${TORCH_VERSION}" cffi -U
33+
pip install -q '.[extra,test]' "torch==${TORCH_VERSION}" cffi -U --upgrade-strategy eager
3434
3535
if [ "${dependency}" == "compiler" ]; then
3636
pip uninstall -y torchvision torchaudio
@@ -41,17 +41,20 @@ run: |
4141
4242
pip list
4343
python -c "import torch ; gpus = torch.cuda.device_count() ; assert gpus >= 2, f'GPU: {gpus}'"
44-
python -c "from torch import __version__ as ver ; assert str(ver).split('+')[0] == '$TORCH_VERSION', f'PyTorch: installed {ver} but expected $TORCH_VERSION'"
44+
python -c "from torch import __version__ as ver ; assert str(ver).split('+')[0] == '${TORCH_VERSION}', f'PyTorch: installed {ver} but expected ${TORCH_VERSION}'"
4545
4646
pytest -v --durations=100
4747
4848
wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh
4949
PL_RUN_STANDALONE_TESTS=1 bash run_standalone_tests.sh "tests"
5050
5151
if [ "${dependency}" == "compiler" ]; then
52-
pip uninstall -y lightning-thunder
52+
pip uninstall -y lightning-thunder transformers
5353
# install thunder from source so that thunder.tests will be available
5454
pip install -U "lightning-thunder[test] @ git+https://github.com/Lightning-AI/lightning-thunder.git" "torch==${TORCH_VERSION}"
55+
# Pin transformers to match thunder's test_networks.py requirements
56+
# See: https://github.com/Lightning-AI/lightning-thunder/blob/main/requirements/test.txt
57+
pip install transformers==4.52.4 # todo: find more robust way
5558
# without env var, it filters out all tests
5659
RUN_ONLY_CUDA_TESTS=0 pytest tests/ext_thunder/test_thunder_networks.py -v
5760
fi

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,7 @@ dependencies = [
2828
# download models:
2929
"huggingface-hub>=0.30,<0.35",
3030
"jsonargparse[signatures]>=4.31,<=4.32.1; python_version<'3.10'", # 4.33 does not seem to be compatible with Python 3.9
31-
"jsonargparse[signatures]>=4.37; python_version>'3.9'", # required to work with python3.12+
31+
"jsonargparse[signatures]>=4.37,<=4.41; python_version>'3.9'", # required to work with python3.12+
3232
"lightning>=2.5",
3333
"psutil==7",
3434
"safetensors>=0.4.3",

tests/conftest.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -67,6 +67,14 @@ def destroy_process_group():
6767
torch.distributed.destroy_process_group()
6868

6969

70+
@pytest.fixture
71+
def turn_off_tf32_and_set_seed(monkeypatch):
72+
monkeypatch.setenv("NVIDIA_TF32_OVERRIDE", "0")
73+
torch.manual_seed(42)
74+
yield
75+
torch.seed()
76+
77+
7078
class MockTokenizer:
7179
"""A dummy tokenizer that encodes each character as its ASCII code."""
7280

tests/test_serve.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,7 @@ def _wait_and_check_response(waiting: int = 30):
3535

3636

3737
# todo: try to resolve this issue
38+
@pytest.mark.flaky(reruns=2, reruns_delay=30)
3839
@pytest.mark.xfail(condition=platform.system() == "Darwin", reason="it passes locally but having some issues on CI")
3940
def test_simple(tmp_path):
4041
seed_everything(123)

tests/test_tokenizer.py

Lines changed: 9 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,7 @@
1616

1717

1818
# @pytest.mark.flaky(reruns=3, rerun_except=["AssertionError", "assert", "TypeError"])
19+
@pytest.mark.flaky(reruns=3, reruns_delay=120)
1920
@pytest.mark.parametrize("config", config_module.configs, ids=[c["hf_config"]["name"] for c in config_module.configs])
2021
def test_tokenizer_against_hf(config, tmp_path):
2122
config = config_module.Config(**config)
@@ -34,14 +35,17 @@ def test_tokenizer_against_hf(config, tmp_path):
3435
if "tokenizer.json" not in hf_files and "tokenizer.model" not in hf_files:
3536
raise ConnectionError("Unable to download any tokenizer files from HF")
3637

37-
# we need to rename the dir to match the model name in testing as well
38-
# since we use it to determine the model in tokenizer.py
39-
tmp_path = tmp_path.rename(tmp_path.parent / config.hf_config["name"])
38+
# Create a clean, model-specific subdirectory for this test run.
39+
# This avoids errors if previous runs or retries left files behind, ensuring the directory is always ready for fresh downloads and comparisons.
40+
model_dir = tmp_path / config.hf_config["name"]
41+
if model_dir.exists():
42+
shutil.rmtree(model_dir)
43+
os.makedirs(model_dir, exist_ok=True)
4044

4145
for filename, hf_file in hf_files.items():
42-
shutil.copy(hf_file, str(tmp_path / filename))
46+
shutil.copy(hf_file, model_dir / filename)
4347

44-
ours = Tokenizer(tmp_path)
48+
ours = Tokenizer(model_dir)
4549

4650
assert ours.vocab_size == theirs.vocab_size
4751
if config.name == "Mixtral-8x22B-v0.1":

0 commit comments

Comments
 (0)