Merge branch 'main' into main

shimizust · web-flow · commit 4adc234d2361 · 2025-09-21T13:18:17.000-07:00
diff --git a/.github/workflows/amd-ci.yml b/.github/workflows/amd-ci.yml
@@ -64,7 +64,8 @@ jobs:
       run: |
         rocm-smi
         python -m pip install --upgrade pip
-        pip install -e .[dev] --extra-index-url https://download.pytorch.org/whl/nightly/rocm${{ matrix.rocm_version }}
+        pip install -e .[dev]
+        pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm${{ matrix.rocm_version }}/
     
     - name: List Python Environments
       run: python -m pip list
diff --git a/.gitignore b/.gitignore
@@ -23,4 +23,5 @@ uv.lock
 
 # Benchmark images
 benchmark/visualizations
-.vscode/
+.vscode/
+.coverage
diff --git a/Makefile b/Makefile
@@ -5,7 +5,16 @@ all: checkstyle test test-convergence
 
 # Command to run pytest for correctness tests
 test:
-	python -m pytest --disable-warnings test/ --ignore=test/convergence
+	python -m pytest --disable-warnings \
+		-n auto \
+		--dist=load \
+		--cov=src/liger_kernel \
+		--cov-report=term-missing \
+		--ignore=test/convergence \
+		test/
+	coverage combine
+	coverage report -m
+	coverage html
 
 # Command to run ruff for linting and formatting code
 checkstyle:
diff --git a/README.md b/README.md
@@ -129,8 +129,8 @@ y = orpo_loss(lm_head.weight, x, target)
 - `triton >= 3.0.0` Install from pypi. (e.g. `pip install triton==3.0.0`)
 
 ```bash
-# Need to pass the url when installing
-pip install -e .[dev] --extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2
+pip install -e .[dev]
+pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.3/
 ```
 
 ### Optional Dependencies
@@ -164,6 +164,9 @@ pip install -e .
 
 # Setup Development Dependencies
 pip install -e ".[dev]"
+
+# NOTE: AMD (ROCm) users only - install the PyTorch nightly build from the ROCm wheel index
+pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.3/
 ```
 
 
diff --git a/docs/Examples.md b/docs/Examples.md
@@ -239,7 +239,7 @@ from liger_kernel.transformers.trainer import LigerORPOTrainer  # noqa: F401
 
 model = AutoModelForCausalLM.from_pretrained(
     "meta-llama/Llama-3.2-1B-Instruct",
-    torch_dtype=torch.bfloat16,
+    dtype=torch.bfloat16,
 )
 
 tokenizer = AutoTokenizer.from_pretrained(
diff --git a/examples/alignment/run_orpo.py b/examples/alignment/run_orpo.py
@@ -9,7 +9,7 @@
 
 model = AutoModelForCausalLM.from_pretrained(
     "meta-llama/Llama-3.2-1B-Instruct",
-    torch_dtype=torch.bfloat16,
+    dtype=torch.bfloat16,
 )
 
 tokenizer = AutoTokenizer.from_pretrained(
diff --git a/examples/huggingface/training.py b/examples/huggingface/training.py
@@ -48,7 +48,7 @@ def train():
             custom_args.model_name,
             trust_remote_code=True,
             use_cache=False,
-            torch_dtype=torch.bfloat16,
+            dtype=torch.bfloat16,
             # These args will get passed to the appropriate apply_liger_kernel_to_* function
             # to override the default settings
             # cross_entropy=True,
@@ -59,7 +59,7 @@ def train():
             custom_args.model_name,
             trust_remote_code=True,
             use_cache=False,
-            torch_dtype=torch.bfloat16,
+            dtype=torch.bfloat16,
         )
 
     trainer = SFTTrainer(
diff --git a/examples/huggingface/training_multimodal.py b/examples/huggingface/training_multimodal.py
@@ -56,7 +56,7 @@ def construct_model_and_processor(model_name: str, use_liger: bool) -> torch.nn.
         model = Qwen2VLForConditionalGeneration.from_pretrained(
             pretrained_model_name_or_path=model_name,
             use_cache=False,
-            torch_dtype=torch.bfloat16,
+            dtype=torch.bfloat16,
             low_cpu_mem_usage=True,
             attn_implementation="sdpa",
         )
diff --git a/examples/medusa/train.py b/examples/medusa/train.py
@@ -319,7 +319,7 @@ def _model_loader():
         model = model_builder(
             model_args.model_name_or_path,
             cache_dir=training_args.cache_dir,
-            torch_dtype=torch.bfloat16,
+            dtype=torch.bfloat16,
         )
 
         # Freeze the base model
diff --git a/pyproject.toml b/pyproject.toml
@@ -24,6 +24,36 @@ pythonpath = ["src", "."]
 asyncio_mode = "auto"
 log_cli = true
 log_cli_level = "INFO"
+addopts = [
+    "-n", "auto",
+    "--dist=load",                    # use "load" to distribute tests and let pytest-cov combine coverage
+    "--cov=src/liger_kernel",
+    "--cov-report=term-missing",
+    "--cov-report=html",
+    "--cov-config=pyproject.toml",
+    "--durations=0"
+]
+python_files = "test_*.py"
+testpaths = ["test/"]
+
+[tool.coverage.run]
+branch = true
+parallel = true
+source = ["src/liger_kernel"]
+# xdist uses subprocesses; "multiprocessing" is a safe concurrency choice
+concurrency = ["multiprocessing"]
+
+[tool.coverage.paths]
+liger_kernel = [
+  "src/liger_kernel",
+  "*/site-packages/liger_kernel"
+]
+
+[tool.coverage.report]
+omit = ["test/*"]
+show_missing = true
+skip_covered = false
+
 
 [tool.ruff]
 line-length = 120
diff --git a/setup.py b/setup.py
@@ -18,7 +18,6 @@ def get_default_dependencies():
         ]
     elif platform == "rocm":
         return [
-            "torch>=2.6.0.dev",
             "triton>=3.0.0",
         ]
     elif platform == "xpu":
@@ -33,15 +32,14 @@ def get_optional_dependencies():
         "dev": [
             "transformers>=4.49.0",
             "matplotlib>=3.7.2",
-            "flake8>=4.0.1.1",
-            "black>=24.4.2",
-            "isort>=5.13.2",
+            "ruff>=0.12.0",
             "pytest>=7.1.2",
             "pytest-xdist",
+            "pytest-cov",
+            "pytest-asyncio",
             "pytest-rerunfailures",
             "datasets>=2.19.2",
             "seaborn",
-            "mkdocs",
             "mkdocs-material",
             "torchvision>=0.20",
         ]
diff --git a/src/liger_kernel/transformers/fused_linear_cross_entropy.py b/src/liger_kernel/transformers/fused_linear_cross_entropy.py
@@ -25,7 +25,8 @@ def __init__(
         assert reduction in {
             "mean",
             "sum",
-        }, f"reduction must be 'mean' or 'sum'. Got: {reduction}"
+            "none",
+        }, f"reduction must be 'mean' or 'sum' or 'none'. Got: {reduction}"
         assert softcap is None or softcap > 0, f"softcap must greater than 0.0 or None. Got: {softcap}"
         self.ce_weight = ce_weight
         self.ignore_index = ignore_index
diff --git a/src/liger_kernel/transformers/model/glm4v.py b/src/liger_kernel/transformers/model/glm4v.py
@@ -70,7 +70,7 @@ def lce_forward(
     >>> processor = AutoProcessor.from_pretrained(MODEL_PATH, use_fast=True)
     >>> model = Glm4vForConditionalGeneration.from_pretrained(
         pretrained_model_name_or_path=MODEL_PATH,
-        torch_dtype=torch.bfloat16,
+        dtype=torch.bfloat16,
         device_map="auto",
     )
     >>> inputs = processor.apply_chat_template(
diff --git a/src/liger_kernel/transformers/model/glm4v_moe.py b/src/liger_kernel/transformers/model/glm4v_moe.py
@@ -75,7 +75,7 @@ def lce_forward(
     >>> processor = AutoProcessor.from_pretrained(MODEL_PATH)
     >>> model = Glm4vMoeForConditionalGeneration.from_pretrained(
         pretrained_model_name_or_path=MODEL_PATH,
-        torch_dtype="auto",
+        dtype="auto",
         device_map="auto",
     )
     >>> inputs = processor.apply_chat_template(
diff --git a/test/transformers/test_monkey_patch.py b/test/transformers/test_monkey_patch.py

Original file line number	Diff line number	Diff line change
`@@ -239,7 +239,7 @@ from liger_kernel.transformers.trainer import LigerORPOTrainer # noqa: F401`
`239`	`239`
`240`	`240`	`model = AutoModelForCausalLM.from_pretrained(`
`241`	`241`	`"meta-llama/Llama-3.2-1B-Instruct",`
`242`		`- torch_dtype=torch.bfloat16,`
	`242`	`+ dtype=torch.bfloat16,`
`243`	`243`	`)`
`244`	`244`
`245`	`245`	`tokenizer = AutoTokenizer.from_pretrained(`
Original file line number	Diff line number	Diff line change
`@@ -9,7 +9,7 @@`
`9`	`9`
`10`	`10`	`model = AutoModelForCausalLM.from_pretrained(`
`11`	`11`	`"meta-llama/Llama-3.2-1B-Instruct",`
`12`		`- torch_dtype=torch.bfloat16,`
	`12`	`+ dtype=torch.bfloat16,`
`13`	`13`	`)`
`14`	`14`
`15`	`15`	`tokenizer = AutoTokenizer.from_pretrained(`
Original file line number	Diff line number	Diff line change
`@@ -56,7 +56,7 @@ def construct_model_and_processor(model_name: str, use_liger: bool) -> torch.nn.`
`56`	`56`	`model = Qwen2VLForConditionalGeneration.from_pretrained(`
`57`	`57`	`pretrained_model_name_or_path=model_name,`
`58`	`58`	`use_cache=False,`
`59`		`- torch_dtype=torch.bfloat16,`
	`59`	`+ dtype=torch.bfloat16,`
`60`	`60`	`low_cpu_mem_usage=True,`
`61`	`61`	`attn_implementation="sdpa",`
`62`	`62`	`)`
Original file line number	Diff line number	Diff line change
`@@ -319,7 +319,7 @@ def _model_loader():`
`319`	`319`	`model = model_builder(`
`320`	`320`	`model_args.model_name_or_path,`
`321`	`321`	`cache_dir=training_args.cache_dir,`
`322`		`- torch_dtype=torch.bfloat16,`
	`322`	`+ dtype=torch.bfloat16,`
`323`	`323`	`)`
`324`	`324`
`325`	`325`	`# Freeze the base model`