From 9c5e19aaf1ca420eaa17735314f5026a029d2160 Mon Sep 17 00:00:00 2001
From: Rahul Tuli <rahul@neuralmagic.com>
Date: Thu, 23 Jan 2025 20:27:48 +0000
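Subject: [PATCH 1/2] dummy: rename 2of4 fp8 example dir, add pruned2.4 config.json, comment out hub upload and save_dir cleanup in vLLM e2e test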

Signed-off-by: Rahul Tuli <rahul@neuralmagic.com>
---
 .../README.md                                 |  0
 .../llama3_8b_2of4.py                         |  0
 .../config.json                               | 46 +++++++++++++++++++
 tests/e2e/vLLM/test_vllm.py                   | 27 +++++------
 4 files changed, 60 insertions(+), 13 deletions(-)
 rename examples/{sparse_2of4_quantization_fp8 => best_feature}/README.md (100%)
 rename examples/{sparse_2of4_quantization_fp8 => best_feature}/llama3_8b_2of4.py (100%)
 create mode 100644 nm-testing/llama2.c-stories42M-pruned2.4-compressed/config.json

diff --git a/examples/sparse_2of4_quantization_fp8/README.md b/examples/best_feature/README.md
similarity index 100%
rename from examples/sparse_2of4_quantization_fp8/README.md
rename to examples/best_feature/README.md
diff --git a/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py b/examples/best_feature/llama3_8b_2of4.py
similarity index 100%
rename from examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py
rename to examples/best_feature/llama3_8b_2of4.py
diff --git a/nm-testing/llama2.c-stories42M-pruned2.4-compressed/config.json b/nm-testing/llama2.c-stories42M-pruned2.4-compressed/config.json
new file mode 100644
index 0000000000..316d12f2cd
--- /dev/null
+++ b/nm-testing/llama2.c-stories42M-pruned2.4-compressed/config.json
@@ -0,0 +1,46 @@
+{
+  "_name_or_path": "nm-testing/llama2.c-stories42M-pruned2.4",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 512,
+  "initializer_range": 0.02,
+  "intermediate_size": 1376,
+  "max_position_embeddings": 1024,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 8,
+  "num_hidden_layers": 8,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "quantization_config": {
+    "quant_method": "compressed-tensors",
+    "sparsity_config": {
+      "format": "sparse-24-bitmask",
+      "global_sparsity": 0.21780030857394755,
+      "ignore": [
+        "lm_head"
+      ],
+      "registry_requires_subclass": false,
+      "sparsity_structure": "2:4",
+      "targets": [
+        "Linear"
+      ]
+    },
+    "version": "0.8.1"
+  },
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "tie_word_embeddings": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.1",
+  "use_cache": true,
+  "vocab_size": 32000
+}
\ No newline at end of file
diff --git a/tests/e2e/vLLM/test_vllm.py b/tests/e2e/vLLM/test_vllm.py
index 70a6a35e42..1539d71dd0 100644
--- a/tests/e2e/vLLM/test_vllm.py
+++ b/tests/e2e/vLLM/test_vllm.py
@@ -134,19 +134,19 @@ def test_vllm(self):
 
         logger.info("================= UPLOADING TO HUB ======================")
 
-        stub = f"{HF_MODEL_HUB_NAME}/{self.save_dir}-e2e"
+        # stub = f"{HF_MODEL_HUB_NAME}/{self.save_dir}-e2e"
 
-        self.api.create_repo(
-            repo_id=stub,
-            exist_ok=True,
-            repo_type="model",
-            private=False,
-        )
+        # self.api.create_repo(
+        #     repo_id=stub,
+        #     exist_ok=True,
+        #     repo_type="model",
+        #     private=False,
+        # )
 
-        self.api.upload_folder(
-            repo_id=stub,
-            folder_path=self.save_dir,
-        )
+        # self.api.upload_folder(
+        #     repo_id=stub,
+        #     folder_path=self.save_dir,
+        # )
 
         logger.info("================= RUNNING vLLM =========================")
 
@@ -172,8 +172,9 @@ def test_vllm(self):
         self.tear_down()
 
     def tear_down(self):
-        if self.save_dir is not None:
-            shutil.rmtree(self.save_dir)
+        # if self.save_dir is not None:
+        #     shutil.rmtree(self.save_dir)
+        pass
 
     def _check_session_contains_recipe(self) -> None:
         session = active_session()

From 6c5f1d229498ae60400c8a175d9fc65cee5673d2 Mon Sep 17 00:00:00 2001
From: Rahul Tuli <rahul@neuralmagic.com>
Date: Thu, 23 Jan 2025 20:37:55 +0000
Subject: [PATCH 2/2] bleh: add temp_config YAMLs for sparse 2of4 vLLM e2e runs, comment out shutil import

Signed-off-by: Rahul Tuli <rahul@neuralmagic.com>
---
 tests/e2e/vLLM/temp_config/sparse2of4_fp8_dynamic.yaml | 7 +++++++
 tests/e2e/vLLM/temp_config/sparse_24.yaml              | 8 ++++++++
 tests/e2e/vLLM/test_vllm.py                            | 3 ++-
 3 files changed, 17 insertions(+), 1 deletion(-)
 create mode 100644 tests/e2e/vLLM/temp_config/sparse2of4_fp8_dynamic.yaml
 create mode 100644 tests/e2e/vLLM/temp_config/sparse_24.yaml

diff --git a/tests/e2e/vLLM/temp_config/sparse2of4_fp8_dynamic.yaml b/tests/e2e/vLLM/temp_config/sparse2of4_fp8_dynamic.yaml
new file mode 100644
index 0000000000..e1785ce2c0
--- /dev/null
+++ b/tests/e2e/vLLM/temp_config/sparse2of4_fp8_dynamic.yaml
@@ -0,0 +1,7 @@
+cadence: "nightly"
+test_type: "regression"
+model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+recipe: tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4_fp8_dynamic.yaml
+scheme: sparse2of4_fp8_dynamic
+dataset_id: HuggingFaceH4/ultrachat_200k
+dataset_split: train_sft
\ No newline at end of file
diff --git a/tests/e2e/vLLM/temp_config/sparse_24.yaml b/tests/e2e/vLLM/temp_config/sparse_24.yaml
new file mode 100644
index 0000000000..653168b977
--- /dev/null
+++ b/tests/e2e/vLLM/temp_config/sparse_24.yaml
@@ -0,0 +1,8 @@
+cadence: "nightly"
+test_type: "regression"
+model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+recipe: tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4.yaml
+scheme: sparse2of4_only
+dataset_id: HuggingFaceH4/ultrachat_200k
+dataset_split: train_sft
+save_compressed: False
\ No newline at end of file
diff --git a/tests/e2e/vLLM/test_vllm.py b/tests/e2e/vLLM/test_vllm.py
index 1539d71dd0..5835b901ee 100644
--- a/tests/e2e/vLLM/test_vllm.py
+++ b/tests/e2e/vLLM/test_vllm.py
@@ -1,6 +1,7 @@
 import os
 import re
-import shutil
+
+# import shutil
 from pathlib import Path
 from typing import Callable