Skip to content

Commit ff5b150

Browse files
authored
Llama 8b f16 regression tests using random weights and inputs (#20487)
This PR adds regression tests for llama 8b f16 prefill and decode using randomized llama 8b f16 weights and inputs. We use random llama 8b f16 weights in order to download a public version of the weights (the original weights are gated and require access from the model authors from the huggingface repo). --------- Signed-off-by: aviator19941 <[email protected]>
1 parent a802470 commit ff5b150

File tree

5 files changed

+101
-1
lines changed

5 files changed

+101
-1
lines changed

.github/workflows/pkgci_test_sharktank.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ jobs:
4343
- name: amdgpu_rocm_mi250_gfx90a
4444
rocm-chip: gfx90a
4545
backend: rocm
46-
iree_test_files: /home/esaimana/iree_tests_cache
46+
iree_test_files: /groups/aig_sharks/iree-tests-cache
4747
sku: mi250
4848
target: target_hip
4949
runs-on: nodai-amdgpu-mi250-x86-64
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
{
2+
"inputs": [
3+
"4x1xi64",
4+
"4xi64",
5+
"4xi64",
6+
"4x5xi64",
7+
"34x2097152xf16"
8+
],
9+
"function_run": "decode_bs4",
10+
"benchmark_flags": [
11+
"--hip_use_streams=true",
12+
"--benchmark_repetitions=10",
13+
"--benchmark_min_warmup_time=3.0"
14+
],
15+
"device": "hip",
16+
"golden_time_tolerance_multiplier": {
17+
"mi300": 1.1
18+
},
19+
"golden_time_ms": {
20+
"mi300": 15.7
21+
}
22+
}
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
{
2+
"inputs": [
3+
"4x128xi64",
4+
"4xi64",
5+
"4x4xi64",
6+
"34x2097152xf16"
7+
],
8+
"function_run": "prefill_bs4",
9+
"benchmark_flags": [
10+
"--hip_use_streams=true",
11+
"--benchmark_repetitions=10",
12+
"--benchmark_min_warmup_time=3.0"
13+
],
14+
"device": "hip",
15+
"golden_time_tolerance_multiplier": {
16+
"mi300": 1.1
17+
},
18+
"golden_time_ms": {
19+
"mi300": 43.1
20+
}
21+
}
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
{
2+
"inputs": [
3+
{
4+
"source": "https://sharkpublic.blob.core.windows.net/sharkpublic/halo-models/llm-dev/llama3_8b_random/toy_llama_inputs/decode_next_tokens.npy"
5+
},
6+
{
7+
"source": "https://sharkpublic.blob.core.windows.net/sharkpublic/halo-models/llm-dev/llama3_8b_random/toy_llama_inputs/decode_seq_lens.npy"
8+
},
9+
{
10+
"source": "https://sharkpublic.blob.core.windows.net/sharkpublic/halo-models/llm-dev/llama3_8b_random/toy_llama_inputs/decode_start_positions.npy"
11+
},
12+
{
13+
"source": "https://sharkpublic.blob.core.windows.net/sharkpublic/halo-models/llm-dev/llama3_8b_random/toy_llama_inputs/decode_seq_block_ids.npy"
14+
},
15+
{
16+
"source": "https://sharkpublic.blob.core.windows.net/sharkpublic/halo-models/llm-dev/llama3_8b_random/toy_llama_inputs/decode_cache_state.npy"
17+
}
18+
],
19+
"device": "hip",
20+
"real_weights": "https://sharkpublic.blob.core.windows.net/sharkpublic/halo-models/llm-dev/llama3_8b_random/real_weights.irpa",
21+
"mlir": "https://sharkpublic.blob.core.windows.net/sharkpublic/halo-models/llm-dev/llama3_8b_random/8b_f16_random.mlir",
22+
"compiler_flags": [
23+
"--iree-hal-target-device=hip",
24+
"--iree-opt-level=O3",
25+
"--iree-hal-indirect-command-buffers=true",
26+
"--iree-stream-resource-memory-model=discrete",
27+
"--iree-hal-memoization=true"
28+
],
29+
"run_function": "decode_bs4"
30+
}
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
{
2+
"inputs": [
3+
{
4+
"source": "https://sharkpublic.blob.core.windows.net/sharkpublic/halo-models/llm-dev/llama3_8b_random/toy_llama_inputs/prefill_token_ids.npy"
5+
},
6+
{
7+
"source": "https://sharkpublic.blob.core.windows.net/sharkpublic/halo-models/llm-dev/llama3_8b_random/toy_llama_inputs/prefill_seq_lens.npy"
8+
},
9+
{
10+
"source": "https://sharkpublic.blob.core.windows.net/sharkpublic/halo-models/llm-dev/llama3_8b_random/toy_llama_inputs/prefill_seq_block_ids.npy"
11+
},
12+
{
13+
"source": "https://sharkpublic.blob.core.windows.net/sharkpublic/halo-models/llm-dev/llama3_8b_random/toy_llama_inputs/prefill_cache_state.npy"
14+
}
15+
],
16+
"device": "hip",
17+
"real_weights": "https://sharkpublic.blob.core.windows.net/sharkpublic/halo-models/llm-dev/llama3_8b_random/real_weights.irpa",
18+
"mlir": "https://sharkpublic.blob.core.windows.net/sharkpublic/halo-models/llm-dev/llama3_8b_random/8b_f16_random.mlir",
19+
"compiler_flags": [
20+
"--iree-hal-target-device=hip",
21+
"--iree-opt-level=O3",
22+
"--iree-hal-indirect-command-buffers=true",
23+
"--iree-stream-resource-memory-model=discrete",
24+
"--iree-hal-memoization=true"
25+
],
26+
"run_function": "prefill_bs4"
27+
}

0 commit comments

Comments (0)