vllm-project · rahul-tuli · Jan 23, 2025 · Jan 23, 2025
diff --git a/...es/sparse_2of4_quantization_fp8/README.md → examples/best_feature/README.md b/...es/sparse_2of4_quantization_fp8/README.md → examples/best_feature/README.md
diff --git a/...e_2of4_quantization_fp8/llama3_8b_2of4.py → examples/best_feature/llama3_8b_2of4.py b/...e_2of4_quantization_fp8/llama3_8b_2of4.py → examples/best_feature/llama3_8b_2of4.py
diff --git a/nm-testing/llama2.c-stories42M-pruned2.4-compressed/config.json b/nm-testing/llama2.c-stories42M-pruned2.4-compressed/config.json
@@ -0,0 +1,46 @@
+{
+  "_name_or_path": "nm-testing/llama2.c-stories42M-pruned2.4",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 512,
+  "initializer_range": 0.02,
+  "intermediate_size": 1376,
+  "max_position_embeddings": 1024,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 8,
+  "num_hidden_layers": 8,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "quantization_config": {
+    "quant_method": "compressed-tensors",
+    "sparsity_config": {
+      "format": "sparse-24-bitmask",
+      "global_sparsity": 0.21780030857394755,
+      "ignore": [
+        "lm_head"
+      ],
+      "registry_requires_subclass": false,
+      "sparsity_structure": "2:4",
+      "targets": [
+        "Linear"
+      ]
+    },
+    "version": "0.8.1"
+  },
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "tie_word_embeddings": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.1",
+  "use_cache": true,
+  "vocab_size": 32000
+}
diff --git a/tests/e2e/vLLM/temp_config/sparse2of4_fp8_dynamic.yaml b/tests/e2e/vLLM/temp_config/sparse2of4_fp8_dynamic.yaml
@@ -0,0 +1,7 @@
+cadence: "nightly"
+test_type: "regression"
+model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+recipe: tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4_fp8_dynamic.yaml
+scheme: sparse2of4_fp8_dynamic
+dataset_id: HuggingFaceH4/ultrachat_200k
+dataset_split: train_sft
diff --git a/tests/e2e/vLLM/temp_config/sparse_24.yaml b/tests/e2e/vLLM/temp_config/sparse_24.yaml
@@ -0,0 +1,8 @@
+cadence: "nightly"
+test_type: "regression"
+model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+recipe: tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4.yaml
+scheme: sparse2of4_only
+dataset_id: HuggingFaceH4/ultrachat_200k
+dataset_split: train_sft
+save_compressed: False
diff --git a/tests/e2e/vLLM/test_vllm.py b/tests/e2e/vLLM/test_vllm.py
@@ -1,6 +1,7 @@
 import os
 import re
-import shutil
+
+# import shutil
 from pathlib import Path
 from typing import Callable
 
@@ -134,19 +135,19 @@ def test_vllm(self):
 
         logger.info("================= UPLOADING TO HUB ======================")
 
-        stub = f"{HF_MODEL_HUB_NAME}/{self.save_dir}-e2e"
+        # stub = f"{HF_MODEL_HUB_NAME}/{self.save_dir}-e2e"
 
-        self.api.create_repo(
-            repo_id=stub,
-            exist_ok=True,
-            repo_type="model",
-            private=False,
-        )
+        # self.api.create_repo(
+        #     repo_id=stub,
+        #     exist_ok=True,
+        #     repo_type="model",
+        #     private=False,
+        # )
 
-        self.api.upload_folder(
-            repo_id=stub,
-            folder_path=self.save_dir,
-        )
+        # self.api.upload_folder(
+        #     repo_id=stub,
+        #     folder_path=self.save_dir,
+        # )
 
         logger.info("================= RUNNING vLLM =========================")
 
@@ -172,8 +173,9 @@ def test_vllm(self):
         self.tear_down()
 
     def tear_down(self):
-        if self.save_dir is not None:
-            shutil.rmtree(self.save_dir)
+        # if self.save_dir is not None:
+        #     shutil.rmtree(self.save_dir)
+        pass  # NOTE(review): cleanup is disabled, so tear_down is a no-op — confirm this is temporary
 
     def _check_session_contains_recipe(self) -> None:
         session = active_session()