@@ -1536,7 +1536,9 @@ def norm_class(self) -> Type:
         name="CodeLlama-70b-Instruct-hf",
         hf_config=dict(org="codellama", name="CodeLlama-70b-Instruct-hf"),
         block_size=16384,
-        vocab_size=32016,
+        # 32016 is an added token, so not reported in vocab_size
+        # https://huggingface.co/codellama/CodeLlama-70b-Instruct-hf/blob/main/tokenizer_config.json
+        vocab_size=32015,
         padding_multiple=16,
         n_layer=80,
         n_head=64,
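Context for the CodeLlama change: with `padding_multiple=16` and no explicit `padded_vocab_size`, litgpt-style configs round the vocabulary up to the next multiple of 16, so the corrected `vocab_size=32015` still yields a 32016-row embedding table with room for the added token. A minimal sketch of that rounding, assuming the usual `find_multiple` helper (the helper name and padding rule are assumptions here, not quoted from this commit):

```python
# Minimal sketch (assumed behavior): how a litgpt-style Config pads the
# vocabulary when padded_vocab_size is not given explicitly.
def find_multiple(n: int, k: int) -> int:
    """Round n up to the nearest multiple of k."""
    return n if n % k == 0 else n + k - (n % k)

# vocab_size=32015 with padding_multiple=16 still pads to 32016 rows, so the
# embedding matrix keeps a slot for the added token; the old value 32016 was
# already a multiple of 16 and padded to the same size.
assert find_multiple(32015, 16) == 32016
assert find_multiple(32016, 16) == 32016
```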
@@ -2331,53 +2333,6 @@ def norm_class(self) -> Type:
     ),
 ]
 
-qwen_2_5_1m = [
-    # https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-1M/blob/main/config.json
-    dict(
-        name="Qwen2.5-7B-Instruct-1M",
-        hf_config=dict(org="Qwen", name="Qwen2.5-7B-Instruct-1M"),
-        block_size=1010000,
-        vocab_size=151643,
-        padded_vocab_size=152064,
-        n_layer=28,
-        n_head=28,
-        n_embd=3584,
-        n_query_groups=4,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        attn_bias=True,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=18944,
-        norm_eps=1e-5,
-        rope_base=10000000,
-    ),
-    # https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-1M/blob/main/config.json
-    dict(
-        name="Qwen2.5-14B-Instruct-1M",
-        hf_config=dict(org="Qwen", name="Qwen2.5-14B-Instruct-1M"),
-        block_size=1010000,
-        vocab_size=151643,
-        padded_vocab_size=152064,
-        n_layer=48,
-        n_head=40,
-        n_embd=5120,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        attn_bias=True,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=13824,
-        norm_eps=1e-5,
-        rope_base=10000000,
-    ),
-]
-
-qwen_2_5.extend(qwen_2_5_1m)
-
 qwen_2_5_coder = [
     # https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B/blob/main/config.json
     dict(
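An aside on the Qwen2.5-1M hyperparameters moved by this commit: `n_head=28` with `n_query_groups=4` (7B) and `n_head=40` with `n_query_groups=8` (14B) describe grouped-query attention, where several query heads share one KV head. A quick shape check for the 7B numbers; the fused qkv layout below is an assumption, not quoted from the repository:

```python
# GQA shape sketch for Qwen2.5-7B-Instruct-1M: 28 query heads share 4 KV
# heads, i.e. 7 query heads per KV group.
n_head, n_query_groups, n_embd = 28, 4, 3584
head_size = n_embd // n_head             # 128
q_per_kv = n_head // n_query_groups      # 7
# Fused qkv projection width in a litgpt-style layout (assumption):
qkv_size = (n_head + 2 * n_query_groups) * head_size
assert (head_size, q_per_kv, qkv_size) == (128, 7, 4608)
```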
@@ -2584,6 +2539,53 @@ def norm_class(self) -> Type:
         copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
         configs.append(copy)
 
+qwen_2_5_1m = [
+    # https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-1M/blob/main/config.json
+    dict(
+        name="Qwen2.5-7B-Instruct-1M",
+        hf_config=dict(org="Qwen", name="Qwen2.5-7B-Instruct-1M"),
+        block_size=1010000,
+        vocab_size=151643,
+        padded_vocab_size=152064,
+        n_layer=28,
+        n_head=28,
+        n_embd=3584,
+        n_query_groups=4,
+        rotary_percentage=1.0,
+        parallel_residual=False,
+        bias=False,
+        attn_bias=True,
+        norm_class_name="RMSNorm",
+        mlp_class_name="LLaMAMLP",
+        intermediate_size=18944,
+        norm_eps=1e-5,
+        rope_base=10000000,
+    ),
+    # https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-1M/blob/main/config.json
+    dict(
+        name="Qwen2.5-14B-Instruct-1M",
+        hf_config=dict(org="Qwen", name="Qwen2.5-14B-Instruct-1M"),
+        block_size=1010000,
+        vocab_size=151643,
+        padded_vocab_size=152064,
+        n_layer=48,
+        n_head=40,
+        n_embd=5120,
+        n_query_groups=8,
+        rotary_percentage=1.0,
+        parallel_residual=False,
+        bias=False,
+        attn_bias=True,
+        norm_class_name="RMSNorm",
+        mlp_class_name="LLaMAMLP",
+        intermediate_size=13824,
+        norm_eps=1e-5,
+        rope_base=10000000,
+    ),
+]
+
+configs.extend(qwen_2_5_1m)
+
 ##########
 # QwQ
 ##########
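Why the `qwen_2_5_1m` block moves below the formatting loop: that loop appends one copy of every `qwen_2_5` entry per `kind` suffix, and the `-1M` names contain no `{}` placeholder, so extending them into `qwen_2_5` beforehand registered each of them once per suffix. Appending via `configs.extend(qwen_2_5_1m)` registers each exactly once. A minimal sketch of the duplication; the `("", "-Instruct")` suffix tuple and the trimmed dicts are illustrative assumptions:

```python
from copy import deepcopy

qwen_2_5 = [dict(name="Qwen2.5-7B{}")]               # placeholder-style entry
qwen_2_5_1m = [dict(name="Qwen2.5-7B-Instruct-1M")]  # fixed name, no "{}"

# Pre-fix arrangement: the 1M entries go through the suffix loop too.
configs = []
for c in qwen_2_5 + qwen_2_5_1m:
    for kind in ("", "-Instruct"):
        copy = deepcopy(c)
        copy["name"] = c["name"].format(kind)
        configs.append(copy)
assert [c["name"] for c in configs].count("Qwen2.5-7B-Instruct-1M") == 2

# Post-fix arrangement: only placeholder entries are formatted, then the
# 1M entries are appended once.
configs = []
for c in qwen_2_5:
    for kind in ("", "-Instruct"):
        copy = deepcopy(c)
        copy["name"] = c["name"].format(kind)
        configs.append(copy)
configs.extend(qwen_2_5_1m)
assert [c["name"] for c in configs].count("Qwen2.5-7B-Instruct-1M") == 1
```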