
Commit 977d804

[sharktank] Xfail hanging fp8 nightly CIs (#1255)
fp8 compile hangs for hours and has other compile errors on previous IREE versions. Filed an issue for the hang [here](iree-org/iree#20528).
1 parent 327b77f commit 977d804
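
The fix uses pytest's `xfail(run=False)`, which records a test as an expected failure without executing its body, so a hanging compile cannot stall the CI job. A minimal sketch of the pattern, with an illustrative test name and body (not the actual test):

```python
import pytest


@pytest.mark.xfail(
    run=False,
    reason="Compile hangs. Issue: https://github.com/iree-org/iree/issues/20528",
)
def test_fp8_compile_placeholder():
    # Never entered: pytest reports this test as xfailed without running it,
    # so a multi-hour compile hang cannot block the nightly workflow.
    ...
```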

10 files changed: +61 -41 lines changed


.github/workflows/ci-llama-large-tests.yaml

Lines changed: 0 additions & 1 deletion
@@ -70,7 +70,6 @@ jobs:
     source ${VENV_DIR}/bin/activate
     pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py \
       -v -s \
-      -m "expensive" \
       --run-nightly-llama-tests \
       --iree-hip-target=gfx942 \
       --iree-device=hip://0 \

.github/workflows/ci-llama-quick-tests.yaml

Lines changed: 0 additions & 1 deletion
@@ -70,7 +70,6 @@ jobs:
     pytest \
       sharktank/tests/models/llama/benchmark_amdgpu_test.py \
       -v -s \
-      -m "expensive" \
       --iree-hip-target=gfx942 \
       --iree-device=hip://0 \
       --run-quick-llama-test

.github/workflows/ci-sharktank-nightly.yml

Lines changed: 0 additions & 3 deletions
@@ -89,7 +89,6 @@ jobs:
 
   test_perplexity_iree:
     if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }}
-    timeout-minutes: 1000
     name: "IREE Perplexity"
     strategy:
       matrix:
@@ -136,7 +135,6 @@ jobs:
       -v \
       -s \
       sharktank/tests/evaluate/perplexity_iree_test.py \
-      -m "expensive" \
       --run-nightly-llama-tests \
       --bs=100 \
       --iree-device=hip://0 \
@@ -160,7 +158,6 @@ jobs:
 
   test_perplexity_torch:
     if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }}
-    timeout-minutes: 1000
     name: "Torch Perplexity"
     strategy:
       matrix:

.github/workflows/ci_eval_short.yaml

Lines changed: 1 addition & 1 deletion
@@ -65,12 +65,12 @@ jobs:
   run: |
     source ${VENV_DIR}/bin/activate
     pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_iree_test.py \
-      -m "expensive" \
       --bs=4 \
       --iree-device=hip://0 \
       --iree-hip-target=gfx942 \
       --iree-hal-target-device=hip \
       --llama3-8b-f16-model-path=/shark-dev/data/llama3.1/weights/8b/fp16/llama3.1_8b_fp16_instruct.irpa \
       --llama3-8b-tokenizer-path=/shark-dev/data/llama3.1/weights/8b/fp16/tokenizer_config.json \
+      --run-quick-llama-test \
       --log-cli-level=INFO
     ls -lha ${{ github.workspace }}/perplexity_ci_artifacts

sharktank/sharktank/evaluate/README.md

Lines changed: 0 additions & 1 deletion
@@ -34,7 +34,6 @@ pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_torch_test.py -k test_llam
 ##### IREE mode
 ```bash
 pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_iree_test.py -k test_llama3_8B_f16 \
-  -m "expensive" \
   --llama3-8b-f16-model-path=llama3.1_8b_instruct_fp16.irpa \
   --llama3-8b-tokenizer-path=tokenizer_config.json \
   --bs=4 \

sharktank/sharktank/utils/testing.py

Lines changed: 25 additions & 0 deletions
@@ -24,8 +24,33 @@
 from ..types import *
 from .math import cosine_similarity
 
+# TODO: Remove once pre-submits and nightly tests are unified to single workflow.
+def get_test_type():
+    pre_submit = 'config.getoption("--run-quick-llama-test")'
+    nightly = 'config.getoption("--run-nightly-llama-tests")'
+    if pre_submit or nightly:
+        return False
+    else:
+        return True
+
+
 is_mi300x = pytest.mark.skipif("config.getoption('iree_hip_target') != 'gfx942'")
 
+# TODO: ci-sharktank-nightly should run all nightly CIs requiring mi300x in a single workflow, dropping all test specific flags/workflows
+is_nightly = pytest.mark.skipif(
+    'not config.getoption("run-nightly-llama-tests")',
+    reason="Run large tests if --run-nightly-llama-tests is passed",
+)
+
+# TODO: ci-sharktank/test-mi300x should run all pre-submits requiring mi300x in a single workflow, dropping all test specific flags/workflows
+is_pre_submit_nightly = pytest.mark.skipif(
+    get_test_type(),
+    reason="Run large/quick tests if --run-quick-llama-test or --run-nightly-llama-tests is passed",
+)
+is_llama_8b = pytest.mark.skipif(
+    'config.getoption("llama3_8b_f16_model_path") is None',
+    reason="Run llama tests if --llama3-8b-f16-model-path is passed",
+)
 is_cpu_condition = (
     "exec('from sharktank.utils.testing import is_iree_hal_target_device_cpu') or "
     "is_iree_hal_target_device_cpu(config.getoption('iree_hal_target_device'))"

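The markers added above are ordinary `pytest.mark.skipif` objects. Where the condition is a quoted string (e.g. `'not config.getoption("run-nightly-llama-tests")'`), pytest evaluates it lazily at collection time with the `config` object in scope, so the flag only has to exist when tests are collected. A hedged usage sketch, assuming a test module imports the markers; the test class here is illustrative only:

```python
import unittest

# Markers defined in sharktank.utils.testing (see diff above).
from sharktank.utils.testing import is_mi300x, is_nightly


@is_mi300x   # skipped unless --iree-hip-target=gfx942 is passed
@is_nightly  # skipped unless --run-nightly-llama-tests is passed
class ExampleNightlyTest(unittest.TestCase):
    def test_smoke(self):
        # Placeholder body; the real tests compile and benchmark models.
        self.assertTrue(True)
```
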
sharktank/tests/docs/llama_benchmarking_instructions.md

Lines changed: 0 additions & 1 deletion
@@ -3,7 +3,6 @@ In order to run Llama 3.1 8B F16 Decomposed test:
 ```
 pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py \
   -v -s \
-  -m "expensive" \
   --run-quick-test \
   --iree-hip-target=gfx942 \
   --iree-device=hip://0

sharktank/tests/evaluate/perplexity_iree_test.py

Lines changed: 14 additions & 9 deletions
@@ -10,11 +10,11 @@
 import numpy as np
 
 from sharktank.evaluate import perplexity_iree
-from sharktank.utils.testing import is_mi300x
-
-skipif_run_quick_llama_test = pytest.mark.skipif(
-    'not config.getoption("run-nightly-llama-tests")',
-    reason="Run large tests if --run-nightly-llama-tests is passed",
+from sharktank.utils.testing import (
+    is_mi300x,
+    is_nightly,
+    is_pre_submit_nightly,
+    is_llama_8b,
 )
 
 
@@ -26,7 +26,6 @@
     "batch_size",
 )
 @is_mi300x
-@pytest.mark.expensive
 class PerplexityTest(unittest.TestCase):
     def setUp(self):
         self.current_perplexity_all = {}
@@ -35,6 +34,8 @@ def setUp(self):
         with open(self.baseline_perplexity_scores, "r") as f:
             self.baseline_perplexity = json.load(f)
 
+    @is_pre_submit_nightly
+    @is_llama_8b
     def test_llama3_8B_f16(self):
 
         # Llama 3.1 8B non-decomposed
@@ -70,7 +71,11 @@ def test_llama3_8B_f16(self):
             msg=f"Current perplexity deviates baseline by {perplexity_difference}",
         )
 
-    @skipif_run_quick_llama_test
+    @is_nightly
+    @pytest.mark.xfail(
+        run=False,
+        reason="Compile hangs. Issue: https://github.com/iree-org/iree/issues/20528",
+    )
     def test_llama3_8B_f8(self):
 
         # Llama 3.1 8B non-decomposed
@@ -109,7 +114,7 @@ def test_llama3_8B_f8(self):
             msg=f"Current perplexity deviates baseline by {perplexity_difference}",
         )
 
-    @skipif_run_quick_llama_test
+    @is_nightly
     @pytest.mark.xfail(reason="Compile Error")
     def test_llama3_405B_f16(self):
 
@@ -145,7 +150,7 @@ def test_llama3_405B_f16(self):
             msg=f"Current perplexity deviates baseline by {perplexity_difference}",
         )
 
-    @skipif_run_quick_llama_test
+    @is_nightly
     @pytest.mark.xfail(reason="Compile Error")
     def test_llama3_405B_f8(self):
 

sharktank/tests/evaluate/perplexity_torch_test.py

Lines changed: 5 additions & 8 deletions
@@ -11,10 +11,9 @@
 import gc
 
 from sharktank.evaluate import perplexity_torch
-
-skipif_run_quick_llama_test = pytest.mark.skipif(
-    'not config.getoption("run-nightly-llama-tests")',
-    reason="Run large tests if --run-nightly-llama-tests is passed",
+from sharktank.utils.testing import (
+    is_mi300x,
+    is_nightly,
 )
 
 
@@ -25,6 +24,8 @@
     "batch_size",
     "device",
 )
+@is_mi300x
+@is_nightly
 class PerplexityTest(unittest.TestCase):
     def setUp(self):
         self.current_perplexity_all = {}
@@ -33,7 +34,6 @@ def setUp(self):
         with open(self.baseline_perplexity_scores, "r") as f:
             self.baseline_perplexity = json.load(f)
 
-    @skipif_run_quick_llama_test
     def test_llama3_8B_f16(self):
 
         # Llama 3.1 8B non-decomposed
@@ -66,7 +66,6 @@ def test_llama3_8B_f16(self):
         )
         gc.collect()
 
-    @skipif_run_quick_llama_test
     def test_llama3_8B_f8(self):
 
         # Llama 3.1 8B non-decomposed
@@ -106,7 +105,6 @@ def test_llama3_8B_f8(self):
     @pytest.mark.xfail(
         reason="Non-decomposed attention is not supported yet",
     )
-    @skipif_run_quick_llama_test
     def test_llama3_405B_f16(self):
 
         # Llama 3.1 405B non-decomposed
@@ -143,7 +141,6 @@ def test_llama3_405B_f16(self):
     @pytest.mark.xfail(
         reason="Non-decomposed attention is not supported yet",
     )
-    @skipif_run_quick_llama_test
     def test_llama3_405B_f8(self):
 
         # Llama 3.1 405B non-decomposed

sharktank/tests/models/llama/benchmark_amdgpu_test.py

Lines changed: 16 additions & 16 deletions
@@ -19,11 +19,10 @@
     IreeBenchmarkException,
     IreeCompileException,
 )
-
-is_mi300x = pytest.mark.skipif("config.getoption('iree_hip_target') != 'gfx942'")
-skipif_run_quick_llama_test = pytest.mark.skipif(
-    'config.getoption("run-quick-llama-test") and not config.getoption("run-nightly-llama-tests")',
-    reason="Skipping largs tests when --run-quick-llama-test is set.",
+from sharktank.utils.testing import (
+    is_mi300x,
+    is_nightly,
+    is_pre_submit_nightly,
 )
 
 
@@ -100,7 +99,6 @@ def save_benchmarks(
 
 
 @is_mi300x
-@pytest.mark.expensive
 class BenchmarkLlama3_1_8B(BaseBenchmarkTest):
     def setUp(self):
         super().setUp()
@@ -224,7 +222,7 @@ def setUp(self):
             ">>",
         ]
 
-    @skipif_run_quick_llama_test
+    @is_pre_submit_nightly
     def testBenchmark8B_f16_TP1_Non_Decomposed_Input_Len_128(self):
         output_file_name = self.dir_path_8b / "f16_torch_128_tp1"
         output_mlir = self.llama8b_f16_torch_sdpa_artifacts.create_file(
@@ -269,7 +267,7 @@ def testBenchmark8B_f16_TP1_Non_Decomposed_Input_Len_128(self):
             cwd=self.repo_root,
         )
 
-    @skipif_run_quick_llama_test
+    @is_nightly
     def testBenchmark8B_f16_TP1_Non_Decomposed_Input_Len_2048(self):
         output_file_name = self.dir_path_8b / "f16_torch_2048_tp1"
         output_mlir = self.llama8b_f16_torch_sdpa_artifacts.create_file(
@@ -314,8 +312,12 @@ def testBenchmark8B_f16_TP1_Non_Decomposed_Input_Len_2048(self):
             cwd=self.repo_root,
         )
 
-    @skipif_run_quick_llama_test
-    @pytest.mark.xfail(reason="Benchmarking Error", raises=IreeBenchmarkException)
+    @is_nightly
+    @pytest.mark.xfail(
+        run=False,
+        reason="https://github.com/iree-org/iree/issues/20528",
+        raises=IreeCompileException,
+    )
     def testBenchmark8B_fp8_TP1_Non_Decomposed(self):
         output_file_name = self.dir_path_8b / "fp8_torch_tp1"
         output_mlir = self.llama8b_fp8_torch_sdpa_artifacts.create_file(
@@ -360,7 +362,7 @@ def testBenchmark8B_fp8_TP1_Non_Decomposed(self):
             cwd=self.repo_root,
         )
 
-    @skipif_run_quick_llama_test
+    @is_nightly
     def testBenchmark8B_fp8_attnf8_TP1_Non_Decomposed_Input_Len_2048(self):
         output_file_name = self.dir_path_8b / "fp8_attnf8_2048_tp1"
         output_mlir = self.llama8b_fp8_attnf8_sdpa_artifacts.create_file(
@@ -405,7 +407,7 @@ def testBenchmark8B_fp8_attnf8_TP1_Non_Decomposed_Input_Len_2048(self):
             cwd=self.repo_root,
         )
 
-    @skipif_run_quick_llama_test
+    @is_nightly
     def testBenchmark8B_fp8_attnf8_TP1_Non_Decomposed_Input_Len_128(self):
         output_file_name = self.dir_path_8b / "fp8_attnf8_128_tp1"
         output_mlir = self.llama8b_fp8_attnf8_sdpa_artifacts.create_file(
@@ -452,8 +454,7 @@ def testBenchmark8B_fp8_attnf8_TP1_Non_Decomposed_Input_Len_128(self):
 
 
 @is_mi300x
-@pytest.mark.expensive
-@skipif_run_quick_llama_test
+@is_nightly
 class BenchmarkLlama3_1_70B(BaseBenchmarkTest):
     def setUp(self):
         super().setUp()
@@ -808,8 +809,7 @@ def testBenchmark70B_fp8_TP1_Non_Decomposed(self):
 
 
 @is_mi300x
-@pytest.mark.expensive
-@skipif_run_quick_llama_test
+@is_nightly
 class BenchmarkLlama3_1_405B(BaseBenchmarkTest):
     def setUp(self):
         super().setUp()

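In this file the fp8 benchmark's xfail also carries `raises=IreeCompileException`. With `run=False` the body is never executed, but `raises=` is the general mechanism for narrowing an xfail: when such a test does run, only the named exception counts as the expected failure, and any other error surfaces as a real failure. A small hedged sketch of that behavior, with illustrative names:

```python
import pytest


class CompileError(Exception):
    """Stand-in for IreeCompileException in this illustrative sketch."""


@pytest.mark.xfail(reason="known compile failure", raises=CompileError)
def test_expected_compile_failure():
    # Raising the declared exception type is reported as xfail;
    # raising anything else would be reported as a genuine failure.
    raise CompileError("compile failed")
```
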
0 commit comments
