
Commit 977d804

[sharktank] Xfail hanging fp8 nightly CIs (#1255)
fp8 compile hangs for hours and has other compile errors on previous IREE versions. Filed an issue for the hang [here](iree-org/iree#20528).
1 parent 327b77f commit 977d804
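
The fix uses pytest's `xfail(run=False)`, which records a test as an expected failure without executing its body, so a hanging compile cannot stall the CI job. A minimal sketch of the pattern, with an illustrative test name and body (not the actual test):

```python
import pytest


@pytest.mark.xfail(
    run=False,
    reason="Compile hangs. Issue: https://github.com/iree-org/iree/issues/20528",
)
def test_fp8_compile_placeholder():
    # Never entered: pytest reports this test as xfailed without running it,
    # so a multi-hour compile hang cannot block the nightly workflow.
    ...
```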

10 files changed: +61 -41 lines changed


.github/workflows/ci-llama-large-tests.yaml

Lines changed: 0 additions & 1 deletion
@@ -70,7 +70,6 @@ jobs:
     source ${VENV_DIR}/bin/activate
     pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py \
       -v -s \
-      -m "expensive" \
       --run-nightly-llama-tests \
       --iree-hip-target=gfx942 \
       --iree-device=hip://0 \

.github/workflows/ci-llama-quick-tests.yaml

Lines changed: 0 additions & 1 deletion
@@ -70,7 +70,6 @@ jobs:
     pytest \
       sharktank/tests/models/llama/benchmark_amdgpu_test.py \
       -v -s \
-      -m "expensive" \
       --iree-hip-target=gfx942 \
       --iree-device=hip://0 \
       --run-quick-llama-test

.github/workflows/ci-sharktank-nightly.yml

Lines changed: 0 additions & 3 deletions
@@ -89,7 +89,6 @@ jobs:
 
   test_perplexity_iree:
     if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }}
-    timeout-minutes: 1000
     name: "IREE Perplexity"
     strategy:
       matrix:
@@ -136,7 +135,6 @@ jobs:
       -v \
       -s \
       sharktank/tests/evaluate/perplexity_iree_test.py \
-      -m "expensive" \
       --run-nightly-llama-tests \
       --bs=100 \
       --iree-device=hip://0 \
@@ -160,7 +158,6 @@ jobs:
 
   test_perplexity_torch:
     if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }}
-    timeout-minutes: 1000
     name: "Torch Perplexity"
     strategy:
       matrix:

.github/workflows/ci_eval_short.yaml

Lines changed: 1 addition & 1 deletion
@@ -65,12 +65,12 @@ jobs:
   run: |
     source ${VENV_DIR}/bin/activate
     pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_iree_test.py \
-      -m "expensive" \
       --bs=4 \
       --iree-device=hip://0 \
       --iree-hip-target=gfx942 \
       --iree-hal-target-device=hip \
       --llama3-8b-f16-model-path=/shark-dev/data/llama3.1/weights/8b/fp16/llama3.1_8b_fp16_instruct.irpa \
       --llama3-8b-tokenizer-path=/shark-dev/data/llama3.1/weights/8b/fp16/tokenizer_config.json \
+      --run-quick-llama-test \
       --log-cli-level=INFO
     ls -lha ${{ github.workspace }}/perplexity_ci_artifacts

sharktank/sharktank/evaluate/README.md

Lines changed: 0 additions & 1 deletion
@@ -34,7 +34,6 @@ pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_torch_test.py -k test_llam
 ##### IREE mode
 ```bash
 pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_iree_test.py -k test_llama3_8B_f16 \
-  -m "expensive" \
   --llama3-8b-f16-model-path=llama3.1_8b_instruct_fp16.irpa \
   --llama3-8b-tokenizer-path=tokenizer_config.json \
   --bs=4 \

sharktank/sharktank/utils/testing.py

Lines changed: 25 additions & 0 deletions
@@ -24,8 +24,33 @@
 from ..types import *
 from .math import cosine_similarity
 
+# TODO: Remove once pre-submits and nightly tests are unified to single workflow.
+def get_test_type():
+    pre_submit = 'config.getoption("--run-quick-llama-test")'
+    nightly = 'config.getoption("--run-nightly-llama-tests")'
+    if pre_submit or nightly:
+        return False
+    else:
+        return True
+
+
 is_mi300x = pytest.mark.skipif("config.getoption('iree_hip_target') != 'gfx942'")
 
+# TODO: ci-sharktank-nightly should run all nightly CIs requiring mi300x in a single workflow, dropping all test specific flags/workflows
+is_nightly = pytest.mark.skipif(
+    'not config.getoption("run-nightly-llama-tests")',
+    reason="Run large tests if --run-nightly-llama-tests is passed",
+)
+
+# TODO: ci-sharktank/test-mi300x should run all pre-submits requiring mi300x in a single workflow, dropping all test specific flags/workflows
+is_pre_submit_nightly = pytest.mark.skipif(
+    get_test_type(),
+    reason="Run large/quick tests if --run-quick-llama-test or --run-nightly-llama-tests is passed",
+)
+is_llama_8b = pytest.mark.skipif(
+    'config.getoption("llama3_8b_f16_model_path") is None',
+    reason="Run llama tests if --llama3-8b-f16-model-path is passed",
+)
 is_cpu_condition = (
     "exec('from sharktank.utils.testing import is_iree_hal_target_device_cpu') or "
     "is_iree_hal_target_device_cpu(config.getoption('iree_hal_target_device'))"

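The markers added above are ordinary `pytest.mark.skipif` objects. Where the condition is a quoted string (e.g. `'not config.getoption("run-nightly-llama-tests")'`), pytest evaluates it lazily at collection time with the `config` object in scope, so the flag only has to exist when tests are collected. A hedged usage sketch, assuming a test module imports the markers; the test class here is illustrative only:

```python
import unittest

# Markers defined in sharktank.utils.testing (see diff above).
from sharktank.utils.testing import is_mi300x, is_nightly


@is_mi300x   # skipped unless --iree-hip-target=gfx942 is passed
@is_nightly  # skipped unless --run-nightly-llama-tests is passed
class ExampleNightlyTest(unittest.TestCase):
    def test_smoke(self):
        # Placeholder body; the real tests compile and benchmark models.
        self.assertTrue(True)
```
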
sharktank/tests/docs/llama_benchmarking_instructions.md

Lines changed: 0 additions & 1 deletion
@@ -3,7 +3,6 @@ In order to run Llama 3.1 8B F16 Decomposed test:
 ```
 pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py \
   -v -s \
-  -m "expensive" \
   --run-quick-test \
   --iree-hip-target=gfx942 \
   --iree-device=hip://0

sharktank/tests/evaluate/perplexity_iree_test.py

Lines changed: 14 additions & 9 deletions
@@ -10,11 +10,11 @@
 import numpy as np
 
 from sharktank.evaluate import perplexity_iree
-from sharktank.utils.testing import is_mi300x
-
-skipif_run_quick_llama_test = pytest.mark.skipif(
-    'not config.getoption("run-nightly-llama-tests")',
-    reason="Run large tests if --run-nightly-llama-tests is passed",
+from sharktank.utils.testing import (
+    is_mi300x,
+    is_nightly,
+    is_pre_submit_nightly,
+    is_llama_8b,
 )
 
 
@@ -26,7 +26,6 @@
     "batch_size",
 )
 @is_mi300x
-@pytest.mark.expensive
 class PerplexityTest(unittest.TestCase):
     def setUp(self):
         self.current_perplexity_all = {}
@@ -35,6 +34,8 @@ def setUp(self):
         with open(self.baseline_perplexity_scores, "r") as f:
             self.baseline_perplexity = json.load(f)
 
+    @is_pre_submit_nightly
+    @is_llama_8b
     def test_llama3_8B_f16(self):
 
         # Llama 3.1 8B non-decomposed
@@ -70,7 +71,11 @@ def test_llama3_8B_f16(self):
             msg=f"Current perplexity deviates baseline by {perplexity_difference}",
         )
 
-    @skipif_run_quick_llama_test
+    @is_nightly
+    @pytest.mark.xfail(
+        run=False,
+        reason="Compile hangs. Issue: https://github.com/iree-org/iree/issues/20528",
+    )
     def test_llama3_8B_f8(self):
 
         # Llama 3.1 8B non-decomposed
@@ -109,7 +114,7 @@ def test_llama3_8B_f8(self):
             msg=f"Current perplexity deviates baseline by {perplexity_difference}",
         )
 
-    @skipif_run_quick_llama_test
+    @is_nightly
     @pytest.mark.xfail(reason="Compile Error")
     def test_llama3_405B_f16(self):
 
@@ -145,7 +150,7 @@ def test_llama3_405B_f16(self):
             msg=f"Current perplexity deviates baseline by {perplexity_difference}",
         )
 
-    @skipif_run_quick_llama_test
+    @is_nightly
     @pytest.mark.xfail(reason="Compile Error")
     def test_llama3_405B_f8(self):
 

sharktank/tests/evaluate/perplexity_torch_test.py

Lines changed: 5 additions & 8 deletions
@@ -11,10 +11,9 @@
 import gc
 
 from sharktank.evaluate import perplexity_torch
-
-skipif_run_quick_llama_test = pytest.mark.skipif(
-    'not config.getoption("run-nightly-llama-tests")',
-    reason="Run large tests if --run-nightly-llama-tests is passed",
+from sharktank.utils.testing import (
+    is_mi300x,
+    is_nightly,
 )
 
 
@@ -25,6 +24,8 @@
     "batch_size",
     "device",
 )
+@is_mi300x
+@is_nightly
 class PerplexityTest(unittest.TestCase):
     def setUp(self):
         self.current_perplexity_all = {}
@@ -33,7 +34,6 @@ def setUp(self):
         with open(self.baseline_perplexity_scores, "r") as f:
             self.baseline_perplexity = json.load(f)
 
-    @skipif_run_quick_llama_test
     def test_llama3_8B_f16(self):
 
         # Llama 3.1 8B non-decomposed
@@ -66,7 +66,6 @@ def test_llama3_8B_f16(self):
         )
         gc.collect()
 
-    @skipif_run_quick_llama_test
     def test_llama3_8B_f8(self):
 
         # Llama 3.1 8B non-decomposed
@@ -106,7 +105,6 @@ def test_llama3_8B_f8(self):
     @pytest.mark.xfail(
         reason="Non-decomposed attention is not supported yet",
     )
-    @skipif_run_quick_llama_test
     def test_llama3_405B_f16(self):
 
         # Llama 3.1 405B non-decomposed
@@ -143,7 +141,6 @@ def test_llama3_405B_f16(self):
     @pytest.mark.xfail(
         reason="Non-decomposed attention is not supported yet",
     )
-    @skipif_run_quick_llama_test
     def test_llama3_405B_f8(self):
 
         # Llama 3.1 405B non-decomposed

sharktank/tests/models/llama/benchmark_amdgpu_test.py

Lines changed: 16 additions & 16 deletions
@@ -19,11 +19,10 @@
     IreeBenchmarkException,
     IreeCompileException,
 )
-
-is_mi300x = pytest.mark.skipif("config.getoption('iree_hip_target') != 'gfx942'")
-skipif_run_quick_llama_test = pytest.mark.skipif(
-    'config.getoption("run-quick-llama-test") and not config.getoption("run-nightly-llama-tests")',
-    reason="Skipping largs tests when --run-quick-llama-test is set.",
+from sharktank.utils.testing import (
+    is_mi300x,
+    is_nightly,
+    is_pre_submit_nightly,
 )
 
 
@@ -100,7 +99,6 @@ def save_benchmarks(
 
 
 @is_mi300x
-@pytest.mark.expensive
 class BenchmarkLlama3_1_8B(BaseBenchmarkTest):
     def setUp(self):
         super().setUp()
@@ -224,7 +222,7 @@ def setUp(self):
             ">>",
         ]
 
-    @skipif_run_quick_llama_test
+    @is_pre_submit_nightly
     def testBenchmark8B_f16_TP1_Non_Decomposed_Input_Len_128(self):
         output_file_name = self.dir_path_8b / "f16_torch_128_tp1"
         output_mlir = self.llama8b_f16_torch_sdpa_artifacts.create_file(
@@ -269,7 +267,7 @@ def testBenchmark8B_f16_TP1_Non_Decomposed_Input_Len_128(self):
             cwd=self.repo_root,
         )
 
-    @skipif_run_quick_llama_test
+    @is_nightly
     def testBenchmark8B_f16_TP1_Non_Decomposed_Input_Len_2048(self):
         output_file_name = self.dir_path_8b / "f16_torch_2048_tp1"
         output_mlir = self.llama8b_f16_torch_sdpa_artifacts.create_file(
@@ -314,8 +312,12 @@ def testBenchmark8B_f16_TP1_Non_Decomposed_Input_Len_2048(self):
             cwd=self.repo_root,
         )
 
-    @skipif_run_quick_llama_test
-    @pytest.mark.xfail(reason="Benchmarking Error", raises=IreeBenchmarkException)
+    @is_nightly
+    @pytest.mark.xfail(
+        run=False,
+        reason="https://github.com/iree-org/iree/issues/20528",
+        raises=IreeCompileException,
+    )
     def testBenchmark8B_fp8_TP1_Non_Decomposed(self):
         output_file_name = self.dir_path_8b / "fp8_torch_tp1"
         output_mlir = self.llama8b_fp8_torch_sdpa_artifacts.create_file(
@@ -360,7 +362,7 @@ def testBenchmark8B_fp8_TP1_Non_Decomposed(self):
             cwd=self.repo_root,
         )
 
-    @skipif_run_quick_llama_test
+    @is_nightly
     def testBenchmark8B_fp8_attnf8_TP1_Non_Decomposed_Input_Len_2048(self):
         output_file_name = self.dir_path_8b / "fp8_attnf8_2048_tp1"
         output_mlir = self.llama8b_fp8_attnf8_sdpa_artifacts.create_file(
@@ -405,7 +407,7 @@ def testBenchmark8B_fp8_attnf8_TP1_Non_Decomposed_Input_Len_2048(self):
             cwd=self.repo_root,
         )
 
-    @skipif_run_quick_llama_test
+    @is_nightly
     def testBenchmark8B_fp8_attnf8_TP1_Non_Decomposed_Input_Len_128(self):
         output_file_name = self.dir_path_8b / "fp8_attnf8_128_tp1"
         output_mlir = self.llama8b_fp8_attnf8_sdpa_artifacts.create_file(
@@ -452,8 +454,7 @@ def testBenchmark8B_fp8_attnf8_TP1_Non_Decomposed_Input_Len_128(self):
 
 
 @is_mi300x
-@pytest.mark.expensive
-@skipif_run_quick_llama_test
+@is_nightly
 class BenchmarkLlama3_1_70B(BaseBenchmarkTest):
     def setUp(self):
         super().setUp()
@@ -808,8 +809,7 @@ def testBenchmark70B_fp8_TP1_Non_Decomposed(self):
 
 
 @is_mi300x
-@pytest.mark.expensive
-@skipif_run_quick_llama_test
+@is_nightly
 class BenchmarkLlama3_1_405B(BaseBenchmarkTest):
     def setUp(self):
         super().setUp()

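In this file the fp8 benchmark's xfail also carries `raises=IreeCompileException`. With `run=False` the body is never executed, but `raises=` is the general mechanism for narrowing an xfail: when such a test does run, only the named exception counts as the expected failure, and any other error surfaces as a real failure. A small hedged sketch of that behavior, with illustrative names:

```python
import pytest


class CompileError(Exception):
    """Stand-in for IreeCompileException in this illustrative sketch."""


@pytest.mark.xfail(reason="known compile failure", raises=CompileError)
def test_expected_compile_failure():
    # Raising the declared exception type is reported as xfail;
    # raising anything else would be reported as a genuine failure.
    raise CompileError("compile failed")
```
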
0 commit comments
