Replace all torch_dtype with dtype (#881)

Tcc0403 · web-flow · commit 02d27648fd40 · 2025-09-16T16:11:23.000-07:00
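## Summary

Fix #880: replace the `torch_dtype` keyword argument with `dtype` in the docs, examples, docstring examples, and monkey-patch tests. Also removes a trailing space in `fused_linear_cross_entropy.py`.

## Testing Done

- Hardware Type: <BLANK>
- [ ] run `make test` to ensure correctness
- [x] run `make checkstyle` to ensure code style
- [ ] run `make test-convergence` to ensure convergence

---------

Signed-off-by: Tcc0403 <76503978+Tcc0403@users.noreply.github.com>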
diff --git a/docs/Examples.md b/docs/Examples.md
@@ -239,7 +239,7 @@ from liger_kernel.transformers.trainer import LigerORPOTrainer  # noqa: F401
 
 model = AutoModelForCausalLM.from_pretrained(
     "meta-llama/Llama-3.2-1B-Instruct",
-    torch_dtype=torch.bfloat16,
+    dtype=torch.bfloat16,
 )
 
 tokenizer = AutoTokenizer.from_pretrained(
diff --git a/examples/alignment/run_orpo.py b/examples/alignment/run_orpo.py
@@ -9,7 +9,7 @@
 
 model = AutoModelForCausalLM.from_pretrained(
     "meta-llama/Llama-3.2-1B-Instruct",
-    torch_dtype=torch.bfloat16,
+    dtype=torch.bfloat16,
 )
 
 tokenizer = AutoTokenizer.from_pretrained(
diff --git a/examples/huggingface/training.py b/examples/huggingface/training.py
@@ -48,7 +48,7 @@ def train():
             custom_args.model_name,
             trust_remote_code=True,
             use_cache=False,
-            torch_dtype=torch.bfloat16,
+            dtype=torch.bfloat16,
             # These args will get passed to the appropriate apply_liger_kernel_to_* function
             # to override the default settings
             # cross_entropy=True,
@@ -59,7 +59,7 @@ def train():
             custom_args.model_name,
             trust_remote_code=True,
             use_cache=False,
-            torch_dtype=torch.bfloat16,
+            dtype=torch.bfloat16,
         )
 
     trainer = SFTTrainer(
diff --git a/examples/huggingface/training_multimodal.py b/examples/huggingface/training_multimodal.py
@@ -56,7 +56,7 @@ def construct_model_and_processor(model_name: str, use_liger: bool) -> torch.nn.
         model = Qwen2VLForConditionalGeneration.from_pretrained(
             pretrained_model_name_or_path=model_name,
             use_cache=False,
-            torch_dtype=torch.bfloat16,
+            dtype=torch.bfloat16,
             low_cpu_mem_usage=True,
             attn_implementation="sdpa",
         )
diff --git a/examples/medusa/train.py b/examples/medusa/train.py
@@ -319,7 +319,7 @@ def _model_loader():
         model = model_builder(
             model_args.model_name_or_path,
             cache_dir=training_args.cache_dir,
-            torch_dtype=torch.bfloat16,
+            dtype=torch.bfloat16,
         )
 
         # Freeze the base model
diff --git a/src/liger_kernel/transformers/fused_linear_cross_entropy.py b/src/liger_kernel/transformers/fused_linear_cross_entropy.py
@@ -25,7 +25,7 @@ def __init__(
         assert reduction in {
             "mean",
             "sum",
-            "none", 
+            "none",
         }, f"reduction must be 'mean' or 'sum' or 'none'. Got: {reduction}"
         assert softcap is None or softcap > 0, f"softcap must greater than 0.0 or None. Got: {softcap}"
         self.ce_weight = ce_weight
diff --git a/src/liger_kernel/transformers/model/glm4v.py b/src/liger_kernel/transformers/model/glm4v.py
@@ -70,7 +70,7 @@ def lce_forward(
     >>> processor = AutoProcessor.from_pretrained(MODEL_PATH, use_fast=True)
     >>> model = Glm4vForConditionalGeneration.from_pretrained(
         pretrained_model_name_or_path=MODEL_PATH,
-        torch_dtype=torch.bfloat16,
+        dtype=torch.bfloat16,
         device_map="auto",
     )
     >>> inputs = processor.apply_chat_template(
diff --git a/src/liger_kernel/transformers/model/glm4v_moe.py b/src/liger_kernel/transformers/model/glm4v_moe.py
@@ -75,7 +75,7 @@ def lce_forward(
     >>> processor = AutoProcessor.from_pretrained(MODEL_PATH)
     >>> model = Glm4vMoeForConditionalGeneration.from_pretrained(
         pretrained_model_name_or_path=MODEL_PATH,
-        torch_dtype="auto",
+        dtype="auto",
         device_map="auto",
     )
     >>> inputs = processor.apply_chat_template(
diff --git a/test/transformers/test_monkey_patch.py b/test/transformers/test_monkey_patch.py
@@ -338,7 +338,7 @@ def test_apply_liger_kernel_to_instance_for_llama():
     with patch("transformers.models.llama.modeling_llama"):
         # Instantiate a dummy model
         config = transformers.models.llama.configuration_llama.LlamaConfig(
-            torch_dtype=torch.bfloat16,
+            dtype=torch.bfloat16,
             rms_norm_eps=1e-5,
             hidden_size=32,
             intermediate_size=64,
@@ -382,7 +382,7 @@ def test_apply_liger_kernel_to_instance_for_mllama_for_conditional_generation():
 
         # Instantiate a dummy model
         config = transformers.models.mllama.configuration_mllama.MllamaConfig(
-            torch_dtype=torch.bfloat16,
+            dtype=torch.bfloat16,
             text_config=transformers.models.mllama.configuration_mllama.MllamaTextConfig(
                 rms_norm_eps=1e-5,
                 hidden_size=32,
@@ -533,7 +533,7 @@ def test_apply_liger_kernel_to_instance_for_llama4_for_causal_lm():
 
         # Instantiate a dummy model
         config = transformers.models.llama4.configuration_llama4.Llama4TextConfig(
-            torch_dtype=torch.bfloat16,
+            dtype=torch.bfloat16,
             rms_norm_eps=1e-5,
             hidden_size=32,
             intermediate_size=64,
@@ -573,9 +573,9 @@ def test_apply_liger_kernel_to_instance_for_llama4_for_conditional_generation():
 
         # Instantiate a dummy model
         config = transformers.models.llama4.configuration_llama4.Llama4Config(
-            torch_dtype=torch.bfloat16,
+            dtype=torch.bfloat16,
             text_config=transformers.models.llama4.configuration_llama4.Llama4TextConfig(
-                torch_dtype=torch.bfloat16,
+                dtype=torch.bfloat16,
                 rms_norm_eps=1e-5,
                 hidden_size=32,
                 intermediate_size=64,
@@ -656,7 +656,7 @@ def test_apply_liger_kernel_to_instance_for_mistral():
     with patch("transformers.models.mistral.modeling_mistral"):
         # Instantiate a dummy model
         config = transformers.models.mistral.configuration_mistral.MistralConfig(
-            torch_dtype=torch.bfloat16,
+            dtype=torch.bfloat16,
             rms_norm_eps=1e-5,
             hidden_size=32,
             intermediate_size=64,
@@ -695,7 +695,7 @@ def test_apply_liger_kernel_to_instance_for_mixtral():
     with patch("transformers.models.mixtral.modeling_mixtral"):
         # Instantiate a dummy model
         config = transformers.models.mixtral.configuration_mixtral.MixtralConfig(
-            torch_dtype=torch.bfloat16,
+            dtype=torch.bfloat16,
             rms_norm_eps=1e-5,
             hidden_size=32,
             intermediate_size=64,
@@ -738,7 +738,7 @@ def test_apply_liger_kernel_to_instance_for_gemma():
     with patch("transformers.models.gemma.modeling_gemma"):
         # Instantiate a dummy model
         config = transformers.models.gemma.configuration_gemma.GemmaConfig(
-            torch_dtype=torch.bfloat16,
+            dtype=torch.bfloat16,
             rms_norm_eps=1e-5,
             hidden_size=32,
             intermediate_size=64,
@@ -777,7 +777,7 @@ def test_apply_liger_kernel_to_instance_for_gemma2():
     with patch("transformers.models.gemma2.modeling_gemma2"):
         # Instantiate a dummy model
         config = transformers.models.gemma2.configuration_gemma2.Gemma2Config(
-            torch_dtype=torch.bfloat16,
+            dtype=torch.bfloat16,
             rms_norm_eps=1e-5,
             hidden_size=32,
             intermediate_size=64,
@@ -827,7 +827,7 @@ def test_apply_liger_kernel_to_instance_for_paligemma():
 
         # Instantiate a dummy model
         config = transformers.models.paligemma.configuration_paligemma.PaliGemmaConfig(
-            torch_dtype=torch.bfloat16,
+            dtype=torch.bfloat16,
             text_config={
                 "num_hidden_layers": 2,
                 "rms_norm_eps": 1e-5,
@@ -883,7 +883,7 @@ def test_apply_liger_kernel_to_instance_for_gemma3_text():
 
         # Instantiate a dummy model
         config = transformers.models.gemma3.configuration_gemma3.Gemma3TextConfig(
-            torch_dtype=torch.bfloat16,
+            dtype=torch.bfloat16,
             rms_norm_eps=1e-5,
             hidden_size=32,
             intermediate_size=64,
@@ -939,7 +939,7 @@ def test_apply_liger_kernel_to_instance_for_gemma3_conditional_generation():
 
         # Instantiate a dummy model
         text_config = transformers.models.gemma3.configuration_gemma3.Gemma3TextConfig(
-            torch_dtype=torch.bfloat16,
+            dtype=torch.bfloat16,
             rms_norm_eps=1e-5,
             hidden_size=32,
             intermediate_size=64,
@@ -1026,7 +1026,7 @@ def test_apply_liger_kernel_to_instance_for_qwen2():
     with patch("transformers.models.qwen2.modeling_qwen2"):
         # Instantiate a dummy model
         config = transformers.models.qwen2.configuration_qwen2.Qwen2Config(
-            torch_dtype=torch.bfloat16,
+            dtype=torch.bfloat16,
             rms_norm_eps=1e-5,
             hidden_size=32,
             intermediate_size=64,
@@ -1068,7 +1068,7 @@ def test_apply_liger_kernel_to_instance_for_qwen3():
 
         # Instantiate a dummy model
         config = transformers.models.qwen3.configuration_qwen3.Qwen3Config(
-            torch_dtype=torch.bfloat16,
+            dtype=torch.bfloat16,
             rms_norm_eps=1e-5,
             hidden_size=32,
             intermediate_size=64,
@@ -1110,7 +1110,7 @@ def test_apply_liger_kernel_to_instance_for_qwen3_moe():
 
         # Instantiate a dummy model
         config = transformers.models.qwen3_moe.configuration_qwen3_moe.Qwen3MoeConfig(
-            torch_dtype=torch.bfloat16,
+            dtype=torch.bfloat16,
             rms_norm_eps=1e-5,
             hidden_size=32,
             intermediate_size=64,
@@ -1158,7 +1158,7 @@ def test_apply_liger_kernel_to_instance_for_qwen2_vl_for_conditional_generation(
 
         # Instantiate a dummy model
         config = transformers.models.qwen2_vl.configuration_qwen2_vl.Qwen2VLConfig(
-            torch_dtype=torch.bfloat16,
+            dtype=torch.bfloat16,
             rms_norm_eps=1e-5,
             hidden_size=32,
             intermediate_size=48,
@@ -1227,7 +1227,7 @@ def test_apply_liger_kernel_to_instance_for_qwen2_vl():
 
         # Instantiate a dummy model
         config = transformers.models.qwen2_vl.configuration_qwen2_vl.Qwen2VLConfig(
-            torch_dtype=torch.bfloat16,
+            dtype=torch.bfloat16,
             rms_norm_eps=1e-5,
             hidden_size=32,
             intermediate_size=48,
@@ -1294,7 +1294,7 @@ def test_apply_liger_kernel_to_instance_for_qwen2_vl_text():
 
         # Instantiate a dummy model
         config = transformers.models.qwen2_vl.configuration_qwen2_vl.Qwen2VLTextConfig(
-            torch_dtype=torch.bfloat16,
+            dtype=torch.bfloat16,
             rms_norm_eps=1e-5,
             hidden_size=32,
             intermediate_size=48,
@@ -1347,7 +1347,7 @@ def test_apply_liger_kernel_to_instance_for_qwen2_5_vl():
 
         # Instantiate a dummy model
         config = transformers.models.qwen2_5_vl.configuration_qwen2_5_vl.Qwen2_5_VLConfig(
-            torch_dtype=torch.bfloat16,
+            dtype=torch.bfloat16,
             rms_norm_eps=1e-5,
             hidden_size=32,
             intermediate_size=48,
@@ -1416,7 +1416,7 @@ def test_apply_liger_kernel_to_instance_for_qwen2_5_vl_for_conditional_generatio
 
         # Instantiate a dummy model
         config = transformers.models.qwen2_5_vl.configuration_qwen2_5_vl.Qwen2_5_VLConfig(
-            torch_dtype=torch.bfloat16,
+            dtype=torch.bfloat16,
             rms_norm_eps=1e-5,
             hidden_size=32,
             intermediate_size=48,
@@ -1483,7 +1483,7 @@ def test_apply_liger_kernel_to_instance_for_qwen2_5_vl_text():
 
         # Instantiate a dummy model
         config = transformers.models.qwen2_5_vl.configuration_qwen2_5_vl.Qwen2_5_VLTextConfig(
-            torch_dtype=torch.bfloat16,
+            dtype=torch.bfloat16,
             rms_norm_eps=1e-5,
             hidden_size=32,
             intermediate_size=48,
@@ -1528,7 +1528,7 @@ def test_apply_liger_kernel_to_instance_for_phi3():
     with patch("transformers.models.phi3.modeling_phi3"):
         # Instantiate a dummy model
         config = transformers.models.phi3.configuration_phi3.Phi3Config(
-            torch_dtype=torch.bfloat16,
+            dtype=torch.bfloat16,
             rms_norm_eps=1e-5,
             hidden_size=32,
             intermediate_size=64,
@@ -1570,7 +1570,7 @@ def test_apply_liger_kernel_to_instance_for_olmo2():
 
         # Instantiate a dummy model
         config = transformers.models.olmo2.configuration_olmo2.Olmo2Config(
-            torch_dtype=torch.bfloat16,
+            dtype=torch.bfloat16,
             rms_norm_eps=1e-5,
             hidden_size=32,
             intermediate_size=64,
@@ -1616,7 +1616,7 @@ def test_apply_liger_kernel_to_instance_for_glm4():
 
         # Instantiate a dummy model
         config = transformers.models.glm4.configuration_glm4.Glm4Config(
-            torch_dtype=torch.bfloat16,
+            dtype=torch.bfloat16,
             rms_norm_eps=1e-5,
             hidden_size=32,
             intermediate_size=64,
@@ -1664,7 +1664,7 @@ def test_apply_liger_kernel_to_instance_for_glm4v():
 
         # Instantiate a dummy model
         config = transformers.models.glm4v.configuration_glm4v.Glm4vConfig(
-            torch_dtype=torch.bfloat16,
+            dtype=torch.bfloat16,
             text_config={
                 "num_hidden_layers": 2,
                 "rms_norm_eps": 1e-5,
@@ -1734,7 +1734,7 @@ def test_apply_liger_kernel_to_instance_for_glm4v_moe():
 
         # Instantiate a dummy model
         config = transformers.models.glm4v_moe.configuration_glm4v_moe.Glm4vMoeConfig(
-            torch_dtype=torch.bfloat16,
+            dtype=torch.bfloat16,
             hidden_size=32,
             num_attention_heads=4,
             num_key_value_heads=2,
@@ -1837,7 +1837,7 @@ def test_apply_liger_kernel_to_instance_for_smollm3():
     with patch("transformers.models.smollm3.modeling_smollm3"):
         # Instantiate a dummy model
         config = transformers.models.smollm3.configuration_smollm3.SmolLM3Config(
-            torch_dtype=torch.bfloat16,
+            dtype=torch.bfloat16,
             rms_norm_eps=1e-5,
             hidden_size=32,
             intermediate_size=64,

Original file line number	Diff line number	Diff line change
`@@ -239,7 +239,7 @@ from liger_kernel.transformers.trainer import LigerORPOTrainer # noqa: F401`
`239`	`239`
`240`	`240`	`model = AutoModelForCausalLM.from_pretrained(`
`241`	`241`	`"meta-llama/Llama-3.2-1B-Instruct",`
`242`		`- torch_dtype=torch.bfloat16,`
	`242`	`+ dtype=torch.bfloat16,`
`243`	`243`	`)`
`244`	`244`
`245`	`245`	`tokenizer = AutoTokenizer.from_pretrained(`
Original file line number	Diff line number	Diff line change
`@@ -9,7 +9,7 @@`
`9`	`9`
`10`	`10`	`model = AutoModelForCausalLM.from_pretrained(`
`11`	`11`	`"meta-llama/Llama-3.2-1B-Instruct",`
`12`		`- torch_dtype=torch.bfloat16,`
	`12`	`+ dtype=torch.bfloat16,`
`13`	`13`	`)`
`14`	`14`
`15`	`15`	`tokenizer = AutoTokenizer.from_pretrained(`
Original file line number	Diff line number	Diff line change
`@@ -56,7 +56,7 @@ def construct_model_and_processor(model_name: str, use_liger: bool) -> torch.nn.`
`56`	`56`	`model = Qwen2VLForConditionalGeneration.from_pretrained(`
`57`	`57`	`pretrained_model_name_or_path=model_name,`
`58`	`58`	`use_cache=False,`
`59`		`- torch_dtype=torch.bfloat16,`
	`59`	`+ dtype=torch.bfloat16,`
`60`	`60`	`low_cpu_mem_usage=True,`
`61`	`61`	`attn_implementation="sdpa",`
`62`	`62`	`)`
Original file line number	Diff line number	Diff line change
`@@ -319,7 +319,7 @@ def _model_loader():`
`319`	`319`	`model = model_builder(`
`320`	`320`	`model_args.model_name_or_path,`
`321`	`321`	`cache_dir=training_args.cache_dir,`
`322`		`- torch_dtype=torch.bfloat16,`
	`322`	`+ dtype=torch.bfloat16,`
`323`	`323`	`)`
`324`	`324`
`325`	`325`	`# Freeze the base model`
Original file line number	Diff line number	Diff line change
`@@ -70,7 +70,7 @@ def lce_forward(`
`70`	`70`	`>>> processor = AutoProcessor.from_pretrained(MODEL_PATH, use_fast=True)`
`71`	`71`	`>>> model = Glm4vForConditionalGeneration.from_pretrained(`
`72`	`72`	`pretrained_model_name_or_path=MODEL_PATH,`
`73`		`- torch_dtype=torch.bfloat16,`
	`73`	`+ dtype=torch.bfloat16,`
`74`	`74`	`device_map="auto",`
`75`	`75`	`)`
`76`	`76`	`>>> inputs = processor.apply_chat_template(`
Original file line number	Diff line number	Diff line change
`@@ -75,7 +75,7 @@ def lce_forward(`
`75`	`75`	`>>> processor = AutoProcessor.from_pretrained(MODEL_PATH)`
`76`	`76`	`>>> model = Glm4vMoeForConditionalGeneration.from_pretrained(`
`77`	`77`	`pretrained_model_name_or_path=MODEL_PATH,`
`78`		`- torch_dtype="auto",`
	`78`	`+ dtype="auto",`
`79`	`79`	`device_map="auto",`
`80`	`80`	`)`
`81`	`81`	`>>> inputs = processor.apply_chat_template(`