diff --git a/dev/modal/benchmarks.py b/dev/modal/benchmarks.py
index 183fd6744..00ba3bf34 100644
--- a/dev/modal/benchmarks.py
+++ b/dev/modal/benchmarks.py
@@ -16,8 +16,8 @@
 @app.function(gpu="H100", image=repo, timeout=60 * 45)
 def liger_benchmarks():
-    import subprocess
     import os
+    import subprocess
 
     subprocess.run(
         ["uv pip install -e '.[dev]' --system"],
@@ -31,7 +31,7 @@ def liger_benchmarks():
     file_path = Path(REMOTE_ROOT_PATH) / "benchmark" / "data" / "all_benchmark_data.csv"
     print(f"Checking if file exists at: {file_path}")
     print(f"File exists: {os.path.exists(file_path)}")
-
+
     if not os.path.exists(file_path):
         print("Listing directory contents:")
         data_dir = file_path.parent
@@ -54,21 +54,21 @@ def main():
         # Run the benchmarks and get the data
         print("Starting benchmark run...")
         benchmark_data = liger_benchmarks.remote()
-
+
         if not benchmark_data:
             raise ValueError("No data received from remote function")
-
+
         # Save the data locally
         local_data_path = ROOT_PATH / "benchmark" / "data" / "all_benchmark_data.csv"
         print(f"Attempting to save data to: {local_data_path}")
-
+
         local_data_path.parent.mkdir(parents=True, exist_ok=True)
-
+
         with open(local_data_path, "wb") as f:
             f.write(benchmark_data)
-
+
         print(f"Successfully saved {len(benchmark_data)} bytes to: {local_data_path}")
-
+
     except Exception as e:
         print(f"Error occurred: {str(e)}")
         raise
diff --git a/test/convergence/bf16/test_mini_models.py b/test/convergence/bf16/test_mini_models.py
index 17a2a6219..fcda20a68 100644
--- a/test/convergence/bf16/test_mini_models.py
+++ b/test/convergence/bf16/test_mini_models.py
@@ -38,6 +38,8 @@
 from test.utils import DEFAULT_DATASET_PATH
 from test.utils import MiniModelConfig
 from test.utils import assert_verbose_allclose
+from test.utils import get_logprobs
+from test.utils import get_topk
 from test.utils import revert_liger_kernel_to_gemma
 from test.utils import revert_liger_kernel_to_gemma2
 from test.utils import revert_liger_kernel_to_gemma3_text
@@ -851,17 +853,17 @@ def run_mini_model(
         eval_output = model(**eval_batch)
     print(f"Eval Loss: {eval_output.loss.item()}")
     loss_list.append(eval_output.loss.item())
-
+    topk_logprobs = get_topk(get_logprobs(eval_output.logits))
     MINI_MODEL_SETUPS[model_name].liger_kernel_patch_revert_func(**revert_kwargs)
     return {
         "loss": loss_list,
-        "logits": eval_output.logits,
+        "topk_logprobs": topk_logprobs.values,
         "model": model,
     }
 
 
 @pytest.mark.parametrize(
-    "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logits_atol, logits_rtol, param_atol, param_rtol",
+    "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logprobs_atol, logprobs_rtol, param_atol, param_rtol",
     [
         pytest.param(
             "mini_llama3",
@@ -884,7 +886,7 @@ def run_mini_model(
             1e-3,
             1e-2,
             1e-1,
-            1e-2,
+            1e-1,
             1e-2,
             1e-2,
             marks=[
@@ -902,7 +904,7 @@ def run_mini_model(
             torch.bfloat16,
             1e-3,
             1e-2,
-            1,  # 1e-1
+            1e-1,  # 1e-1
             1e-1,  # 1e-2
             1e-2,
             1e-2,
@@ -972,7 +974,7 @@ def run_mini_model(
             torch.bfloat16,
             1e-3,
             1e-2,
-            1,  # 1e-1
+            1e-1,  # 1e-1
             1e-1,  # 1e-2
             1e-2,
             1e-2,
@@ -1111,8 +1113,8 @@ def run_mini_model(
             torch.bfloat16,
             1e-3,
             1e-2,
-            1e-1,
             1e-2,
+            1e-1,
             1e-2,
             1e-2,
             marks=pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"),
@@ -1124,8 +1126,8 @@ def run_mini_model(
             torch.bfloat16,
             1e-3,
             1e-2,
-            1e-1,
             1e-2,
+            1e-1,
             1e-2,
             1e-2,
             marks=pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"),
@@ -1153,8 +1155,8 @@ def run_mini_model(
             torch.bfloat16,
             1e-3,
             1e-2,
-            1e-1,
-            1e-2,
+            3e-1,
+            4e-1,
             1e-2,
             1e-2,
             marks=[
@@ -1174,8 +1176,8 @@ def test_mini_model(
     dtype,
     loss_atol,
     loss_rtol,
-    logits_atol,
-    logits_rtol,
+    logprobs_atol,
+    logprobs_rtol,
     param_atol,
     param_rtol,
 ):
@@ -1193,13 +1195,13 @@ def test_mini_model(
         rtol=loss_rtol,
     )
 
-    # Compare the logits from evaluation step
-    if expected_output["logits"] is not None and actual_output["logits"] is not None:
+    # Compare the topk logprobs from evaluation step
+    if expected_output["topk_logprobs"] is not None and actual_output["topk_logprobs"] is not None:
         assert_verbose_allclose(
-            expected_output["logits"],
-            actual_output["logits"],
-            atol=logits_atol,
-            rtol=logits_rtol,
+            expected_output["topk_logprobs"],
+            actual_output["topk_logprobs"],
+            atol=logprobs_atol,
+            rtol=logprobs_rtol,
         )
 
     # Compare the params from the last step
diff --git a/test/convergence/bf16/test_mini_models_multimodal.py b/test/convergence/bf16/test_mini_models_multimodal.py
index 04ffedcc8..e087b80bd 100644
--- a/test/convergence/bf16/test_mini_models_multimodal.py
+++ b/test/convergence/bf16/test_mini_models_multimodal.py
@@ -20,6 +20,8 @@
 from test.utils import UNTOKENIZED_DATASET_PATH
 from test.utils import MiniModelConfig
 from test.utils import assert_verbose_allclose
+from test.utils import get_logprobs
+from test.utils import get_topk
 from test.utils import is_torchvision_available
 from test.utils import load_image_processing_config
 from test.utils import load_processor_config
@@ -764,13 +766,17 @@ def run_mini_model_multimodal(
         print(f"Step {i}, Loss: {output.loss.item()}")
         loss_list.append(output.loss.item())
-
+    topk_logprobs = get_topk(get_logprobs(output.logits))
     MINI_MODEL_SETUPS[model_name].liger_kernel_patch_revert_func(**revert_kwargs)
-    return {"loss": loss_list, "logits": output.logits, "model": model}
+    return {
+        "loss": loss_list,
+        "topk_logprobs": topk_logprobs.values,
+        "model": model,
+    }
 
 
 @pytest.mark.parametrize(
-    "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logits_atol, logits_rtol, param_atol, param_rtol",
+    "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logprobs_atol, logprobs_rtol, param_atol, param_rtol",
     [
         pytest.param(
             "mini_qwen2_vl",
@@ -917,8 +923,8 @@ def test_mini_model_multimodal(
     dtype,
     loss_atol,
     loss_rtol,
-    logits_atol,
-    logits_rtol,
+    logprobs_atol,
+    logprobs_rtol,
     param_atol,
     param_rtol,
 ):
@@ -937,12 +943,12 @@ def test_mini_model_multimodal(
         rtol=loss_rtol,
     )
 
-    # Compare the logits from the last step
+    # Compare the topk logprobs from evaluation step
     assert_verbose_allclose(
-        expected_output["logits"],
-        actual_output["logits"],
-        atol=logits_atol,
-        rtol=logits_rtol,
+        expected_output["topk_logprobs"],
+        actual_output["topk_logprobs"],
+        atol=logprobs_atol,
+        rtol=logprobs_rtol,
     )
 
     # Compare the params from the last step
diff --git a/test/convergence/bf16/test_mini_models_with_logits.py b/test/convergence/bf16/test_mini_models_with_logits.py
index 0a6f61eb3..43aeda13c 100644
--- a/test/convergence/bf16/test_mini_models_with_logits.py
+++ b/test/convergence/bf16/test_mini_models_with_logits.py
@@ -38,6 +38,8 @@
 from test.utils import DEFAULT_DATASET_PATH
 from test.utils import MiniModelConfig
 from test.utils import assert_verbose_allclose
+from test.utils import get_logprobs
+from test.utils import get_topk
 from test.utils import revert_liger_kernel_to_gemma
 from test.utils import revert_liger_kernel_to_gemma2
 from test.utils import revert_liger_kernel_to_gemma3_text
@@ -842,12 +844,17 @@ def run_mini_model(
         print(f"Step {i}, Loss: {output.loss.item()}")
         loss_list.append(output.loss.item())
 
+    topk_logprobs = get_topk(get_logprobs(output.logits))
     MINI_MODEL_SETUPS[model_name].liger_kernel_patch_revert_func(**revert_kwargs)
-    return {"loss": loss_list, "logits": output.logits, "model": model}
+    return {
+        "loss": loss_list,
+        "topk_logprobs": topk_logprobs.values,
+        "model": model,
+    }
 
 
 @pytest.mark.parametrize(
-    "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logits_atol, logits_rtol, param_atol, param_rtol",
+    "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logprobs_atol, logprobs_rtol, param_atol, param_rtol",
     [
         pytest.param(
             "mini_llama3",
@@ -1058,8 +1065,8 @@ def run_mini_model(
             torch.bfloat16,
             1e-3,
             1e-2,
-            1e-1,
             1e-2,
+            1e-1,
             1e-2,
             1e-2,
             marks=pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"),
@@ -1071,8 +1078,8 @@ def run_mini_model(
             torch.bfloat16,
             1e-3,
             1e-2,
-            1e-1,
             1e-2,
+            1e-1,
             1e-2,
             1e-2,
             marks=pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"),
@@ -1159,8 +1166,8 @@ def test_mini_model(
     dtype,
     loss_atol,
     loss_rtol,
-    logits_atol,
-    logits_rtol,
+    logprobs_atol,
+    logprobs_rtol,
     param_atol,
     param_rtol,
 ):
@@ -1180,12 +1187,12 @@ def test_mini_model(
     # No logits are materialized
 
     # import pdb; pdb.set_trace()
-    # Compare the logits from the last step
+    # Compare the topk logprobs from evaluation step
     assert_verbose_allclose(
-        expected_output["logits"],
-        actual_output["logits"],
-        atol=logits_atol,
-        rtol=logits_rtol,
+        expected_output["topk_logprobs"],
+        actual_output["topk_logprobs"],
+        atol=logprobs_atol,
+        rtol=logprobs_rtol,
     )
 
     # Compare the params from the last step
diff --git a/test/convergence/fp32/test_mini_models.py b/test/convergence/fp32/test_mini_models.py
index 3a9c9e577..d311817dd 100644
--- a/test/convergence/fp32/test_mini_models.py
+++ b/test/convergence/fp32/test_mini_models.py
@@ -38,6 +38,8 @@
 from test.utils import DEFAULT_DATASET_PATH
 from test.utils import MiniModelConfig
 from test.utils import assert_verbose_allclose
+from test.utils import get_logprobs
+from test.utils import get_topk
 from test.utils import revert_liger_kernel_to_gemma
 from test.utils import revert_liger_kernel_to_gemma2
 from test.utils import revert_liger_kernel_to_gemma3_text
@@ -849,17 +851,17 @@ def run_mini_model(
         eval_output = model(**eval_batch)
     print(f"Eval Loss: {eval_output.loss.item()}")
     loss_list.append(eval_output.loss.item())
-
+    topk_logprobs = get_topk(get_logprobs(eval_output.logits))
     MINI_MODEL_SETUPS[model_name].liger_kernel_patch_revert_func(**revert_kwargs)
     return {
         "loss": loss_list,
-        "logits": eval_output.logits,
+        "topk_logprobs": topk_logprobs.values,
        "model": model,
     }
 
 
 @pytest.mark.parametrize(
-    "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logits_atol, logits_rtol, param_atol, param_rtol",
+    "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logprobs_atol, logprobs_rtol, param_atol, param_rtol",
     [
         ("mini_llama3", 32, 1e-4, torch.float32, 1e-8, 2e-5, 1e-4, 1e-5, 5e-3, 1e-5),
         pytest.param(
@@ -1013,7 +1015,7 @@ def run_mini_model(
         # TODO: mixtral is flaky so disable the test for now
         # ("mini_mixtral", 32, 1e-4, torch.float32, 5e-4, 1e-4, 5e-3, 1e-5, 1e-2, 1e-5),
         # Gemma 1.1 and 2 has more tolerance because currently, the kernel is not a perfect match (casts are not done the same way)
-        ("mini_gemma1", 32, 1e-4, torch.float32, 1e-8, 1e-4, 5e-3, 1e-5, 5e-3, 1e-5),
+        ("mini_gemma1", 32, 1e-4, torch.float32, 1e-8, 1e-4, 5e-3, 1e-2, 5e-3, 1e-5),
         ("mini_gemma1.1", 32, 1e-4, torch.float32, 1e-8, 1e-4, 5e-3, 1e-5, 5e-3, 1e-5),
         ("mini_gemma2", 32, 1e-4, torch.float32, 1e-8, 1e-4, 5e-3, 1e-5, 5e-3, 1e-5),
         pytest.param(
@@ -1041,8 +1043,8 @@ def test_mini_model(
     dtype,
     loss_atol,
     loss_rtol,
-    logits_atol,
-    logits_rtol,
+    logprobs_atol,
+    logprobs_rtol,
     param_atol,
     param_rtol,
 ):
@@ -1060,13 +1062,13 @@ def test_mini_model(
         rtol=loss_rtol,
     )
 
-    # Compare the logits from evaluation step
-    if expected_output["logits"] is not None and actual_output["logits"] is not None:
+    # Compare the topk logprobs from evaluation step
+    if expected_output["topk_logprobs"] is not None and actual_output["topk_logprobs"] is not None:
         assert_verbose_allclose(
-            expected_output["logits"],
-            actual_output["logits"],
-            atol=logits_atol,
-            rtol=logits_rtol,
+            expected_output["topk_logprobs"],
+            actual_output["topk_logprobs"],
+            atol=logprobs_atol,
+            rtol=logprobs_rtol,
         )
 
     # Compare the params from the last step
diff --git a/test/convergence/fp32/test_mini_models_multimodal.py b/test/convergence/fp32/test_mini_models_multimodal.py
index 52aaa967c..70bee83f6 100644
--- a/test/convergence/fp32/test_mini_models_multimodal.py
+++ b/test/convergence/fp32/test_mini_models_multimodal.py
@@ -20,6 +20,8 @@
 from test.utils import UNTOKENIZED_DATASET_PATH
 from test.utils import MiniModelConfig
 from test.utils import assert_verbose_allclose
+from test.utils import get_logprobs
+from test.utils import get_topk
 from test.utils import is_torchvision_available
 from test.utils import load_image_processing_config
 from test.utils import load_processor_config
@@ -762,11 +764,16 @@ def run_mini_model_multimodal(
         print(f"Step {i}, Loss: {output.loss.item()}")
         loss_list.append(output.loss.item())
 
-    return {"loss": loss_list, "logits": output.logits, "model": model}
+    topk_logprobs = get_topk(get_logprobs(output.logits))
+    return {
+        "loss": loss_list,
+        "topk_logprobs": topk_logprobs.values,
+        "model": model,
+    }
 
 
 @pytest.mark.parametrize(
-    "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logits_atol, logits_rtol, param_atol, param_rtol",
+    "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logprobs_atol, logprobs_rtol, param_atol, param_rtol",
     [
         pytest.param(
             "mini_qwen2_vl",
@@ -875,10 +882,10 @@ def run_mini_model_multimodal(
             32,
             1e-4,
             torch.float32,
-            1e-8,
-            1e-5,
-            5e-3,
-            1e-5,
+            1e-3,
+            1e-3,
+            1e-1,
+            1e-1,
             5e-3,
             1e-5,
             marks=[
@@ -898,8 +905,8 @@ def test_mini_model_multimodal(
     dtype,
     loss_atol,
     loss_rtol,
-    logits_atol,
-    logits_rtol,
+    logprobs_atol,
+    logprobs_rtol,
     param_atol,
     param_rtol,
 ):
@@ -920,10 +927,10 @@ def test_mini_model_multimodal(
 
     # Compare the logits from the last step
     assert_verbose_allclose(
-        expected_output["logits"],
-        actual_output["logits"],
-        atol=logits_atol,
-        rtol=logits_rtol,
+        expected_output["topk_logprobs"],
+        actual_output["topk_logprobs"],
+        atol=logprobs_atol,
+        rtol=logprobs_rtol,
     )
 
     # Compare the params from the last step
diff --git a/test/convergence/fp32/test_mini_models_with_logits.py b/test/convergence/fp32/test_mini_models_with_logits.py
index 457df66d6..f6a160453 100644
--- a/test/convergence/fp32/test_mini_models_with_logits.py
+++ b/test/convergence/fp32/test_mini_models_with_logits.py
@@ -38,6 +38,8 @@
 from test.utils import DEFAULT_DATASET_PATH
 from test.utils import MiniModelConfig
 from test.utils import assert_verbose_allclose
+from test.utils import get_logprobs
+from test.utils import get_topk
 from test.utils import revert_liger_kernel_to_gemma
 from test.utils import revert_liger_kernel_to_gemma2
 from test.utils import revert_liger_kernel_to_gemma3_text
@@ -841,12 +843,17 @@ def run_mini_model(
         print(f"Step {i}, Loss: {output.loss.item()}")
         loss_list.append(output.loss.item())
 
+    topk_logprobs = get_topk(get_logprobs(output.logits))
     MINI_MODEL_SETUPS[model_name].liger_kernel_patch_revert_func(**revert_kwargs)
-    return {"loss": loss_list, "logits": output.logits, "model": model}
+    return {
+        "loss": loss_list,
+        "topk_logprobs": topk_logprobs.values,
+        "model": model,
+    }
 
 
 @pytest.mark.parametrize(
-    "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logits_atol, logits_rtol, param_atol, param_rtol",
+    "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logprobs_atol, logprobs_rtol, param_atol, param_rtol",
     [
         ("mini_llama3", 32, 1e-4, torch.float32, 1e-8, 2e-5, 1e-4, 1e-5, 5e-3, 1e-5),
         pytest.param(
@@ -1027,8 +1034,8 @@ def test_mini_model(
     dtype,
     loss_atol,
     loss_rtol,
-    logits_atol,
-    logits_rtol,
+    logprobs_atol,
+    logprobs_rtol,
     param_atol,
     param_rtol,
 ):
@@ -1048,12 +1055,11 @@ def test_mini_model(
     # No logits are materialized
 
     # import pdb; pdb.set_trace()
-    # Compare the logits from the last step
     assert_verbose_allclose(
-        expected_output["logits"],
-        actual_output["logits"],
-        atol=logits_atol,
-        rtol=logits_rtol,
+        expected_output["topk_logprobs"],
+        actual_output["topk_logprobs"],
+        atol=logprobs_atol,
+        rtol=logprobs_rtol,
     )
 
     # Compare the params from the last step
diff --git a/test/utils.py b/test/utils.py
index ab993ca95..ed01b3a83 100644
--- a/test/utils.py
+++ b/test/utils.py
@@ -57,6 +57,17 @@ def set_seed(seed=42):
     os.environ["PYTHONHASHSEED"] = str(seed)
 
 
+@torch.no_grad
+def get_logprobs(tensor):
+    return torch.nn.functional.log_softmax(tensor, dim=-1, dtype=torch.float32)
+
+
+@torch.no_grad
+def get_topk(tensor, k=20):
+    topk = torch.topk(tensor, k, dim=-1)
+    return topk
+
+
 def assert_verbose_allclose(tensor1, tensor2, rtol=1e-05, atol=1e-08, max_print=5):
     """
     Assert that two tensors are element-wise equal within a tolerance, providing detailed information about mismatches.