Replace nm-testing/llama2.c-stories15M with nm-testing/tinysmokellama-3.2 (#1833)

fynnsu · dsikka · web-flow · commit 07116307dca5 · 2025-09-22T22:18:20.000Z
SUMMARY: Created a new [model](https://huggingface.co/nm-testing/tinysmokellama-3.2), based on a `Llama3.2` model, for smoke testing; it replaces `nm-testing/llama2.c-stories15M` in the test configs and test files. Note: the model is not properly trained, so its outputs shouldn't be used, but its weights come from the distribution of a real model, "meta-llama/Llama-3.2-1B". The sparsity assertion in `test_obcq_sparsity.py` also has its `rel_tol` relaxed from 1e-4 to 1e-3. TEST PLAN: Ran the "transformers" checks locally; they will also be rerun by CI. --------- Signed-off-by: Fynn Schmitt-Ulms <fschmitt@redhat.com> Co-authored-by: Dipika Sikka <dipikasikka1@gmail.com>
diff --git a/tests/llmcompressor/transformers/compression/configs/channelwise_smoke.yaml b/tests/llmcompressor/transformers/compression/configs/channelwise_smoke.yaml
@@ -1,4 +1,4 @@
 cadence: "commit"
 test_type: "regression"
-model_stub: "nm-testing/llama2.c-stories15M"
+model_stub: "nm-testing/tinysmokellama-3.2"
 new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_channel.yaml"
diff --git a/tests/llmcompressor/transformers/compression/configs/fp8_smoke.yaml b/tests/llmcompressor/transformers/compression/configs/fp8_smoke.yaml
@@ -1,4 +1,4 @@
 cadence: "commit"
 test_type: "regression"
-model_stub: "nm-testing/llama2.c-stories15M"
+model_stub: "nm-testing/tinysmokellama-3.2"
 new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_fp8.yaml"
diff --git a/tests/llmcompressor/transformers/compression/configs/inputs_smoke.yaml b/tests/llmcompressor/transformers/compression/configs/inputs_smoke.yaml
@@ -1,4 +1,4 @@
 cadence: "commit"
 test_type: "regression"
-model_stub: "nm-testing/llama2.c-stories15M"
+model_stub: "nm-testing/tinysmokellama-3.2"
 new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_full.yaml"
diff --git a/tests/llmcompressor/transformers/compression/configs/weights_only_smoke.yaml b/tests/llmcompressor/transformers/compression/configs/weights_only_smoke.yaml
@@ -1,4 +1,4 @@
 cadence: "commit"
 test_type: "regression"
-model_stub: "nm-testing/llama2.c-stories15M"
+model_stub: "nm-testing/tinysmokellama-3.2"
 new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_weight.yaml"
diff --git a/tests/llmcompressor/transformers/compression/test_compress_tensor_utils.py b/tests/llmcompressor/transformers/compression/test_compress_tensor_utils.py
@@ -46,7 +46,7 @@
 def test_sparse_model_reload(compressed, config, dtype, tmp_path):
     recipe_str = "tests/llmcompressor/transformers/obcq/recipes/test_tiny2.yaml"
     expected_sparsity = 0.5
-    model_path = "nm-testing/llama2.c-stories15M"
+    model_path = "nm-testing/tinysmokellama-3.2"
     dataset = "open_platypus"
     concatenate_data = False
     num_calibration_samples = 64
@@ -124,7 +124,7 @@ def test_sparse_model_reload(compressed, config, dtype, tmp_path):
 def test_dense_model_save(tmp_path, skip_compression_stats, save_compressed):
     reset_session()
 
-    model_path = "nm-testing/llama2.c-stories15M"
+    model_path = "nm-testing/tinysmokellama-3.2"
     model = AutoModelForCausalLM.from_pretrained(model_path)
 
     inferred_global_sparsity = SparsityConfigMetadata.infer_global_sparsity(model)
@@ -161,7 +161,7 @@ def test_quant_model_reload(format, dtype, tmp_path):
     recipe_str = (
         "tests/llmcompressor/transformers/compression/recipes/new_quant_simple.yaml"
     )
-    model_path = "nm-testing/llama2.c-stories15M"
+    model_path = "nm-testing/tinysmokellama-3.2"
     device = "cuda:0" if not torch.cuda.is_available() else "cpu"
     dataset = "open_platypus"
     concatenate_data = False
@@ -244,7 +244,7 @@ def test_quant_model_reload(format, dtype, tmp_path):
     ],
 )
 def test_model_reload(offload, torch_dtype, tie_word_embeddings, device, tmp_path):
-    model_path = "nm-testing/llama2.c-stories15M"
+    model_path = "nm-testing/tinysmokellama-3.2"
     save_path = tmp_path / "save_path"
 
     model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch_dtype)
@@ -296,7 +296,7 @@ def test_model_reload_gpu(offload, torch_dtype, tie_word_embeddings, device, tmp
 )
 def test_model_shared_tensors(offload, torch_dtype, tie_word_embeddings, device):
     # load model
-    model_path = "nm-testing/llama2.c-stories15M"
+    model_path = "nm-testing/tinysmokellama-3.2"
     model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch_dtype)
     if offload:
         model = dispatch_model(model, {"": device}, force_hooks=True)
@@ -337,7 +337,7 @@ def test_model_shared_tensors_gpu(offload, torch_dtype, tie_word_embeddings, dev
     "model_stub, recipe, sparse_format, quant_format",
     [
         (
-            "nm-testing/llama2.c-stories15M",
+            "nm-testing/tinysmokellama-3.2",
             "tests/llmcompressor/transformers/compression/recipes/sparse_24_fp8.yaml",
             CompressionFormat.sparse_24_bitmask.value,
             CompressionFormat.float_quantized.value,
@@ -418,7 +418,7 @@ def test_compressor_stacking(model_stub, recipe, sparse_format, quant_format, tm
     "model_stub, recipe, sparse_format",
     [
         (
-            "nm-testing/llama2.c-stories15M",
+            "nm-testing/tinysmokellama-3.2",
             "tests/llmcompressor/transformers/compression/recipes/sparse_24.yaml",
             CompressionFormat.sparse_24_bitmask.value,
         ),
diff --git a/tests/llmcompressor/transformers/compression/test_recipe_parsing.py b/tests/llmcompressor/transformers/compression/test_recipe_parsing.py
@@ -17,7 +17,7 @@ def setup_model_and_config(tmp_path):
     Loads a test model and returns common arguments used in oneshot runs.
     """
     model = AutoModelForCausalLM.from_pretrained(
-        "nm-testing/llama2.c-stories15M",
+        "nm-testing/tinysmokellama-3.2",
         torch_dtype="auto",
     )
 
diff --git a/tests/llmcompressor/transformers/finetune/data/conftest.py b/tests/llmcompressor/transformers/finetune/data/conftest.py
@@ -6,7 +6,7 @@
 
 @pytest.fixture
 def tiny_llama_path():
-    return "nm-testing/llama2.c-stories15M"
+    return "nm-testing/tinysmokellama-3.2"
 
 
 @pytest.fixture
diff --git a/tests/llmcompressor/transformers/finetune/finetune_custom/config1.yaml b/tests/llmcompressor/transformers/finetune/finetune_custom/config1.yaml
@@ -1,5 +1,5 @@
 cadence: "commit"
 test_type: "sanity"
-model: "nm-testing/llama2.c-stories15M"
+model: "nm-testing/tinysmokellama-3.2"
 file_extension: json
 num_train_epochs: 1
diff --git a/tests/llmcompressor/transformers/finetune/finetune_custom/config2.yaml b/tests/llmcompressor/transformers/finetune/finetune_custom/config2.yaml
@@ -1,5 +1,5 @@
 cadence: "commit"
 test_type: "sanity"
-model: "nm-testing/llama2.c-stories15M"
+model: "nm-testing/tinysmokellama-3.2"
 file_extension: csv
 num_train_epochs: 1
diff --git a/tests/llmcompressor/transformers/finetune/finetune_generic/config1.yaml b/tests/llmcompressor/transformers/finetune/finetune_generic/config1.yaml
@@ -1,4 +1,4 @@
 cadence: "nightly"
 test_type: "regression"
-model: "nm-testing/llama2.c-stories15M"
+model: "nm-testing/tinysmokellama-3.2"
 dataset: open_platypus
diff --git a/tests/llmcompressor/transformers/finetune/finetune_oneshot_configs/config.yaml b/tests/llmcompressor/transformers/finetune/finetune_oneshot_configs/config.yaml
@@ -1,6 +1,6 @@
 cadence: "commit"
 test_type: "sanity"
-model: "nm-testing/llama2.c-stories15M"
+model: "nm-testing/tinysmokellama-3.2"
 dataset: wikitext
 dataset_config_name: "wikitext-2-raw-v1"
 recipe: "tests/llmcompressor/transformers/finetune/test_alternate_recipe.yaml"
diff --git a/tests/llmcompressor/transformers/finetune/test_oneshot_then_finetune.py b/tests/llmcompressor/transformers/finetune/test_oneshot_then_finetune.py
@@ -14,7 +14,7 @@ def test_oneshot_sparsification_then_finetune(tmp_path):
 
     recipe_str = "tests/llmcompressor/transformers/obcq/recipes/test_tiny2.yaml"
     model = AutoModelForCausalLM.from_pretrained(
-        "nm-testing/llama2.c-stories15M", torch_dtype="auto"
+        "nm-testing/tinysmokellama-3.2", torch_dtype="auto"
     )
     dataset = "open_platypus"
     concatenate_data = False
@@ -42,7 +42,7 @@ def test_oneshot_sparsification_then_finetune(tmp_path):
         quantization_config=quantization_config,
     )
     distill_teacher = AutoModelForCausalLM.from_pretrained(
-        "nm-testing/llama2.c-stories15M", torch_dtype="auto"
+        "nm-testing/tinysmokellama-3.2", torch_dtype="auto"
     )
     dataset = "open_platypus"
     concatenate_data = False
diff --git a/tests/llmcompressor/transformers/finetune/test_session_mixin.py b/tests/llmcompressor/transformers/finetune/test_session_mixin.py
@@ -32,7 +32,7 @@ def __init__(
 
 @pytest.mark.unit
 def test_mixin_init():
-    model_state_path = "nm-testing/llama2.c-stories15M"
+    model_state_path = "nm-testing/tinysmokellama-3.2"
     model = AutoModelForCausalLM.from_pretrained(model_state_path)
     recipe = "tests/llmcompressor/transformers/finetune/test_quantization.yaml"
 
@@ -45,7 +45,7 @@ def test_mixin_init():
 
 @pytest.fixture
 def mixin_trainer():
-    model_state_path = "nm-testing/llama2.c-stories15M"
+    model_state_path = "nm-testing/tinysmokellama-3.2"
     model = AutoModelForCausalLM.from_pretrained(model_state_path)
     recipe = "tests/llmcompressor/transformers/finetune/test_quantization.yaml"
     train_dataset = "open-platypus"
diff --git a/tests/llmcompressor/transformers/obcq/obcq_configs/completion/tiny_llama_quant.yaml b/tests/llmcompressor/transformers/obcq/obcq_configs/completion/tiny_llama_quant.yaml
@@ -1,6 +1,6 @@
 cadence: "nightly"
 test_type: "sanity"
-model: "nm-testing/llama2.c-stories15M"
+model: "nm-testing/tinysmokellama-3.2"
 dataset: open_platypus
 recipe: "tests/llmcompressor/transformers/obcq/recipes/quant.yaml"
 num_samples: 32
diff --git a/tests/llmcompressor/transformers/obcq/obcq_configs/completion/tiny_llama_quant_and_sparse.yaml b/tests/llmcompressor/transformers/obcq/obcq_configs/completion/tiny_llama_quant_and_sparse.yaml
@@ -1,6 +1,6 @@
 cadence: "nightly"
 test_type: "sanity"
-model: "nm-testing/llama2.c-stories15M"
+model: "nm-testing/tinysmokellama-3.2"
 dataset: open_platypus
 recipe: "tests/llmcompressor/transformers/obcq/recipes/quant_and_sparse.yaml"
 num_samples: 32
diff --git a/tests/llmcompressor/transformers/obcq/obcq_configs/consec_runs/tiny_llama_consec_runs.yaml b/tests/llmcompressor/transformers/obcq/obcq_configs/consec_runs/tiny_llama_consec_runs.yaml
@@ -1,6 +1,6 @@
 cadence: "commit"
 test_type: "sanity"
-model: "nm-testing/llama2.c-stories15M"
+model: "nm-testing/tinysmokellama-3.2"
 dataset: open_platypus
 first_recipe: "tests/llmcompressor/transformers/obcq/recipes/quant_and_sparse.yaml"
 second_recipe: "tests/llmcompressor/transformers/obcq/recipes/additional_sparsity.yaml"
diff --git a/tests/llmcompressor/transformers/obcq/obcq_configs/mask_structure/tiny_llama_mask_structure_preservation.yaml b/tests/llmcompressor/transformers/obcq/obcq_configs/mask_structure/tiny_llama_mask_structure_preservation.yaml
@@ -1,6 +1,6 @@
 cadence: "commit"
 test_type: "sanity"
-model: "nm-testing/llama2.c-stories15M"
+model: "nm-testing/tinysmokellama-3.2"
 dataset: open_platypus
 initial_pruning_only_recipe: "tests/llmcompressor/transformers/obcq/recipes/sparse_with_mask_structure.yaml"
 initial_sparsity: 0.5
diff --git a/tests/llmcompressor/transformers/obcq/obcq_configs/sparse/tiny_llama_sparse.yaml b/tests/llmcompressor/transformers/obcq/obcq_configs/sparse/tiny_llama_sparse.yaml
@@ -1,6 +1,6 @@
 cadence: "commit"
 test_type: "sanity"
-model: "nm-testing/llama2.c-stories15M"
+model: "nm-testing/tinysmokellama-3.2"
 dataset: open_platypus
 recipe: "tests/llmcompressor/transformers/obcq/recipes/sparse.yaml"
 sparsity: 0.3
diff --git a/tests/llmcompressor/transformers/obcq/obcq_configs/sparsity_generic/config.yaml b/tests/llmcompressor/transformers/obcq/obcq_configs/sparsity_generic/config.yaml
@@ -1,4 +1,4 @@
 cadence: "nightly"
 test_type: "regression"
-model: "nm-testing/llama2.c-stories15M"
+model: "nm-testing/tinysmokellama-3.2"
 dataset: open_platypus
diff --git a/tests/llmcompressor/transformers/obcq/test_obcq_infer_targets.py b/tests/llmcompressor/transformers/obcq/test_obcq_infer_targets.py
@@ -9,7 +9,7 @@
 def test_infer_targets():
     modifier = SparseGPTModifier(sparsity=0.0)
     with init_empty_weights():
-        model = AutoModelForCausalLM.from_pretrained("nm-testing/llama2.c-stories15M")
+        model = AutoModelForCausalLM.from_pretrained("nm-testing/tinysmokellama-3.2")
 
     inferred = modifier._infer_sequential_targets(model)
     assert inferred == ["LlamaDecoderLayer"]
diff --git a/tests/llmcompressor/transformers/obcq/test_obcq_lm_head.py b/tests/llmcompressor/transformers/obcq/test_obcq_lm_head.py
@@ -12,7 +12,7 @@
 def model():
     device = "cuda:0" if torch.cuda.is_available() else "cpu"
     return AutoModelForCausalLM.from_pretrained(
-        "nm-testing/llama2.c-stories15M", device_map=device
+        "nm-testing/tinysmokellama-3.2", device_map=device
     )
 
 
diff --git a/tests/llmcompressor/transformers/obcq/test_obcq_owl.py b/tests/llmcompressor/transformers/obcq/test_obcq_owl.py
@@ -21,7 +21,7 @@ def test_infer_owl_layer_sparsity():
         modifier = SparseGPTModifier(
             sparsity=0.7, sparsity_profile="owl", owl_m=5, owl_lmbda=0.05
         )
-        model = AutoModelForCausalLM.from_pretrained("nm-testing/llama2.c-stories15M")
+        model = AutoModelForCausalLM.from_pretrained("nm-testing/tinysmokellama-3.2")
 
         dataset = Dataset.from_dict(
             {"input_ids": torch.randint(0, vocab_size, (ds_size, seq_len))}
diff --git a/tests/llmcompressor/transformers/obcq/test_obcq_sparsity.py b/tests/llmcompressor/transformers/obcq/test_obcq_sparsity.py
@@ -34,7 +34,7 @@ def test_sparsities(tmp_path, config):
     )
 
     layer_1_sparse = tensor_sparsity(model.model.layers[1].self_attn.k_proj.weight)
-    assert math.isclose(layer_1_sparse.item(), config["sparsity"], rel_tol=1e-4)
+    assert math.isclose(layer_1_sparse.item(), config["sparsity"], rel_tol=1e-3)
     layer_2_dense = tensor_sparsity(model.model.layers[2].self_attn.k_proj.weight)
     assert math.isclose(layer_2_dense.item(), 0.0, rel_tol=1e-4)
 
diff --git a/tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_smoke_conf1.yaml b/tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_smoke_conf1.yaml
@@ -1,7 +1,7 @@
 cadence: "commit"
 test_type: "smoke"
 tokenize: False
-model: "nm-testing/llama2.c-stories15M"
+model: "nm-testing/tinysmokellama-3.2"
 dataset: open_platypus
 recipe: |
   test_stage:
diff --git a/tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_smoke_conf2.yaml b/tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_smoke_conf2.yaml
@@ -1,6 +1,6 @@
 cadence: "commit"
 test_type: "smoke"
 tokenize: False
-model: "nm-testing/llama2.c-stories15M"
+model: "nm-testing/tinysmokellama-3.2"
 dataset: open_platypus
 recipe: "tests/llmcompressor/transformers/oneshot/oneshot_configs/recipes/recipe.yaml"
diff --git a/tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_smoke_conf3.yaml b/tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_smoke_conf3.yaml
@@ -1,7 +1,7 @@
 cadence: "commit"
 test_type: "smoke"
 tokenize: False
-model: "nm-testing/llama2.c-stories15M"
+model: "nm-testing/tinysmokellama-3.2"
 dataset: "gsm8k"
 dataset_config_name: "main"
 recipe: "tests/llmcompressor/transformers/oneshot/oneshot_configs/recipes/recipe.yaml"
diff --git a/tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_smoke_conf4.yaml b/tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_smoke_conf4.yaml
@@ -1,7 +1,7 @@
 cadence: "commit"
 test_type: "smoke"
 tokenize: False
-model: "nm-testing/llama2.c-stories15M"
+model: "nm-testing/tinysmokellama-3.2"
 dataset: "gsm8k"
 dataset_config_name: "main"
 recipe: |
diff --git a/tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_smoke_conf5.yaml b/tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_smoke_conf5.yaml
@@ -1,6 +1,6 @@
 cadence: "commit"
 test_type: "smoke"
 tokenize: True
-model: "nm-testing/llama2.c-stories15M"
+model: "nm-testing/tinysmokellama-3.2"
 dataset: open_platypus
 recipe: "tests/llmcompressor/transformers/oneshot/oneshot_configs/recipes/recipe.yaml"
diff --git a/tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_smoke_conf6.yaml b/tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_smoke_conf6.yaml
@@ -1,6 +1,6 @@
 cadence: "commit"
 test_type: "smoke"
 tokenize: True
-model: "nm-testing/llama2.c-stories15M"
+model: "nm-testing/tinysmokellama-3.2"
 dataset: "gsm8k"
 recipe: "tests/llmcompressor/transformers/oneshot/oneshot_configs/recipes/recipe.yaml"
diff --git a/tests/llmcompressor/utils/test_helpers.py b/tests/llmcompressor/utils/test_helpers.py
@@ -189,7 +189,7 @@ def test_patch_attr():
     "model_cls,model_stub",
     [
         (MllamaForConditionalGeneration, "meta-llama/Llama-3.2-11B-Vision-Instruct"),
-        (AutoModelForCausalLM, "nm-testing/llama2.c-stories15M"),
+        (AutoModelForCausalLM, "nm-testing/tinysmokellama-3.2"),
     ],
 )
 def test_disable_cache(model_cls, model_stub):

Original file line number	Diff line number	Diff line change
`@@ -17,7 +17,7 @@ def setup_model_and_config(tmp_path):`
`17`	`17`	`Loads a test model and returns common arguments used in oneshot runs.`
`18`	`18`	`"""`
`19`	`19`	`model = AutoModelForCausalLM.from_pretrained(`
`20`		`- "nm-testing/llama2.c-stories15M",`
	`20`	`+ "nm-testing/tinysmokellama-3.2",`
`21`	`21`	`torch_dtype="auto",`
`22`	`22`	`)`
`23`	`23`