diff --git a/dev/modal/benchmarks.py b/dev/modal/benchmarks.py
index 183fd6744..00ba3bf34 100644
--- a/dev/modal/benchmarks.py
+++ b/dev/modal/benchmarks.py
@@ -16,8 +16,8 @@
 @app.function(gpu="H100", image=repo, timeout=60 * 45)
 def liger_benchmarks():
-    import subprocess
     import os
+    import subprocess
 
     subprocess.run(
         ["uv pip install -e '.[dev]' --system"],
@@ -31,7 +31,7 @@ def liger_benchmarks():
     file_path = Path(REMOTE_ROOT_PATH) / "benchmark" / "data" / "all_benchmark_data.csv"
     print(f"Checking if file exists at: {file_path}")
     print(f"File exists: {os.path.exists(file_path)}")
-
+
     if not os.path.exists(file_path):
         print("Listing directory contents:")
         data_dir = file_path.parent
@@ -54,21 +54,21 @@ def main():
         # Run the benchmarks and get the data
         print("Starting benchmark run...")
         benchmark_data = liger_benchmarks.remote()
-
+
         if not benchmark_data:
             raise ValueError("No data received from remote function")
-
+
         # Save the data locally
         local_data_path = ROOT_PATH / "benchmark" / "data" / "all_benchmark_data.csv"
         print(f"Attempting to save data to: {local_data_path}")
-
+
         local_data_path.parent.mkdir(parents=True, exist_ok=True)
-
+
         with open(local_data_path, "wb") as f:
             f.write(benchmark_data)
-
+
         print(f"Successfully saved {len(benchmark_data)} bytes to: {local_data_path}")
-
+
     except Exception as e:
         print(f"Error occurred: {str(e)}")
         raise
diff --git a/test/convergence/bf16/test_mini_models.py b/test/convergence/bf16/test_mini_models.py
index 17a2a6219..fcda20a68 100644
--- a/test/convergence/bf16/test_mini_models.py
+++ b/test/convergence/bf16/test_mini_models.py
@@ -38,6 +38,8 @@
 from test.utils import DEFAULT_DATASET_PATH
 from test.utils import MiniModelConfig
 from test.utils import assert_verbose_allclose
+from test.utils import get_logprobs
+from test.utils import get_topk
 from test.utils import revert_liger_kernel_to_gemma
 from test.utils import revert_liger_kernel_to_gemma2
 from test.utils import revert_liger_kernel_to_gemma3_text
@@ -851,17 +853,17 @@ def run_mini_model(
         eval_output = model(**eval_batch)
     print(f"Eval Loss: {eval_output.loss.item()}")
     loss_list.append(eval_output.loss.item())
-
+    topk_logprobs = get_topk(get_logprobs(eval_output.logits))
     MINI_MODEL_SETUPS[model_name].liger_kernel_patch_revert_func(**revert_kwargs)
     return {
         "loss": loss_list,
-        "logits": eval_output.logits,
+        "topk_logprobs": topk_logprobs.values,
         "model": model,
     }
 
 
 @pytest.mark.parametrize(
-    "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logits_atol, logits_rtol, param_atol, param_rtol",
+    "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logprobs_atol, logprobs_rtol, param_atol, param_rtol",
     [
         pytest.param(
             "mini_llama3",
@@ -884,7 +886,7 @@ def run_mini_model(
             1e-3,
             1e-2,
             1e-1,
-            1e-2,
+            1e-1,
             1e-2,
             1e-2,
             marks=[
@@ -902,7 +904,7 @@ def run_mini_model(
             torch.bfloat16,
             1e-3,
             1e-2,
-            1,  # 1e-1
+            1e-1,  # 1e-1
             1e-1,  # 1e-2
             1e-2,
             1e-2,
@@ -972,7 +974,7 @@ def run_mini_model(
             torch.bfloat16,
             1e-3,
             1e-2,
-            1,  # 1e-1
+            1e-1,  # 1e-1
             1e-1,  # 1e-2
             1e-2,
             1e-2,
@@ -1111,8 +1113,8 @@ def run_mini_model(
             torch.bfloat16,
             1e-3,
             1e-2,
-            1e-1,
             1e-2,
+            1e-1,
             1e-2,
             1e-2,
             marks=pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"),
@@ -1124,8 +1126,8 @@ def run_mini_model(
             torch.bfloat16,
             1e-3,
             1e-2,
-            1e-1,
             1e-2,
+            1e-1,
             1e-2,
             1e-2,
             marks=pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"),
@@ -1153,8 +1155,8 @@ def run_mini_model(
             torch.bfloat16,
             1e-3,
             1e-2,
-            1e-1,
-            1e-2,
+            3e-1,
+            4e-1,
             1e-2,
             1e-2,
             marks=[
@@ -1174,8 +1176,8 @@ def test_mini_model(
     dtype,
     loss_atol,
     loss_rtol,
-    logits_atol,
-    logits_rtol,
+    logprobs_atol,
+    logprobs_rtol,
     param_atol,
     param_rtol,
 ):
@@ -1193,13 +1195,13 @@ def test_mini_model(
         rtol=loss_rtol,
     )
 
-    # Compare the logits from evaluation step
-    if expected_output["logits"] is not None and actual_output["logits"] is not None:
+    # Compare the topk logprobs from evaluation step
+    if expected_output["topk_logprobs"] is not None and actual_output["topk_logprobs"] is not None:
         assert_verbose_allclose(
-            expected_output["logits"],
-            actual_output["logits"],
-            atol=logits_atol,
-            rtol=logits_rtol,
+            expected_output["topk_logprobs"],
+            actual_output["topk_logprobs"],
+            atol=logprobs_atol,
+            rtol=logprobs_rtol,
         )
 
     # Compare the params from the last step
diff --git a/test/convergence/bf16/test_mini_models_multimodal.py b/test/convergence/bf16/test_mini_models_multimodal.py
index 04ffedcc8..e087b80bd 100644
--- a/test/convergence/bf16/test_mini_models_multimodal.py
+++ b/test/convergence/bf16/test_mini_models_multimodal.py
@@ -20,6 +20,8 @@
 from test.utils import UNTOKENIZED_DATASET_PATH
 from test.utils import MiniModelConfig
 from test.utils import assert_verbose_allclose
+from test.utils import get_logprobs
+from test.utils import get_topk
 from test.utils import is_torchvision_available
 from test.utils import load_image_processing_config
 from test.utils import load_processor_config
@@ -764,13 +766,17 @@ def run_mini_model_multimodal(
         print(f"Step {i}, Loss: {output.loss.item()}")
         loss_list.append(output.loss.item())
-
+    topk_logprobs = get_topk(get_logprobs(output.logits))
     MINI_MODEL_SETUPS[model_name].liger_kernel_patch_revert_func(**revert_kwargs)
-    return {"loss": loss_list, "logits": output.logits, "model": model}
+    return {
+        "loss": loss_list,
+        "topk_logprobs": topk_logprobs.values,
+        "model": model,
+    }
 
 
 @pytest.mark.parametrize(
-    "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logits_atol, logits_rtol, param_atol, param_rtol",
+    "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logprobs_atol, logprobs_rtol, param_atol, param_rtol",
     [
         pytest.param(
             "mini_qwen2_vl",
@@ -917,8 +923,8 @@ def test_mini_model_multimodal(
     dtype,
     loss_atol,
     loss_rtol,
-    logits_atol,
-    logits_rtol,
+    logprobs_atol,
+    logprobs_rtol,
     param_atol,
     param_rtol,
 ):
@@ -937,12 +943,12 @@ def test_mini_model_multimodal(
         rtol=loss_rtol,
     )
 
-    # Compare the logits from the last step
+    # Compare the topk logprobs from evaluation step
     assert_verbose_allclose(
-        expected_output["logits"],
-        actual_output["logits"],
-        atol=logits_atol,
-        rtol=logits_rtol,
+        expected_output["topk_logprobs"],
+        actual_output["topk_logprobs"],
+        atol=logprobs_atol,
+        rtol=logprobs_rtol,
     )
 
     # Compare the params from the last step
diff --git a/test/convergence/bf16/test_mini_models_with_logits.py b/test/convergence/bf16/test_mini_models_with_logits.py
index 0a6f61eb3..43aeda13c 100644
--- a/test/convergence/bf16/test_mini_models_with_logits.py
+++ b/test/convergence/bf16/test_mini_models_with_logits.py
@@ -38,6 +38,8 @@
 from test.utils import DEFAULT_DATASET_PATH
 from test.utils import MiniModelConfig
 from test.utils import assert_verbose_allclose
+from test.utils import get_logprobs
+from test.utils import get_topk
 from test.utils import revert_liger_kernel_to_gemma
 from test.utils import revert_liger_kernel_to_gemma2
 from test.utils import revert_liger_kernel_to_gemma3_text
@@ -842,12 +844,17 @@ def run_mini_model(
         print(f"Step {i}, Loss: {output.loss.item()}")
         loss_list.append(output.loss.item())
 
+    topk_logprobs = get_topk(get_logprobs(output.logits))
     MINI_MODEL_SETUPS[model_name].liger_kernel_patch_revert_func(**revert_kwargs)
-    return {"loss": loss_list, "logits": output.logits, "model": model}
+    return {
+        "loss": loss_list,
+        "topk_logprobs": topk_logprobs.values,
+        "model": model,
+    }
 
 
 @pytest.mark.parametrize(
-    "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logits_atol, logits_rtol, param_atol, param_rtol",
+    "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logprobs_atol, logprobs_rtol, param_atol, param_rtol",
     [
         pytest.param(
             "mini_llama3",
@@ -1058,8 +1065,8 @@ def run_mini_model(
             torch.bfloat16,
             1e-3,
             1e-2,
-            1e-1,
             1e-2,
+            1e-1,
             1e-2,
             1e-2,
             marks=pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"),
@@ -1071,8 +1078,8 @@ def run_mini_model(
             torch.bfloat16,
             1e-3,
             1e-2,
-            1e-1,
             1e-2,
+            1e-1,
             1e-2,
             1e-2,
             marks=pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"),
@@ -1159,8 +1166,8 @@ def test_mini_model(
     dtype,
     loss_atol,
     loss_rtol,
-    logits_atol,
-    logits_rtol,
+    logprobs_atol,
+    logprobs_rtol,
     param_atol,
     param_rtol,
 ):
@@ -1180,12 +1187,12 @@ def test_mini_model(
     # No logits are materialized
 
     # import pdb; pdb.set_trace()
-    # Compare the logits from the last step
+    # Compare the topk logprobs from evaluation step
     assert_verbose_allclose(
-        expected_output["logits"],
-        actual_output["logits"],
-        atol=logits_atol,
-        rtol=logits_rtol,
+        expected_output["topk_logprobs"],
+        actual_output["topk_logprobs"],
+        atol=logprobs_atol,
+        rtol=logprobs_rtol,
     )
 
     # Compare the params from the last step
diff --git a/test/convergence/fp32/test_mini_models.py b/test/convergence/fp32/test_mini_models.py
index 3a9c9e577..d311817dd 100644
--- a/test/convergence/fp32/test_mini_models.py
+++ b/test/convergence/fp32/test_mini_models.py
@@ -38,6 +38,8 @@
 from test.utils import DEFAULT_DATASET_PATH
 from test.utils import MiniModelConfig
 from test.utils import assert_verbose_allclose
+from test.utils import get_logprobs
+from test.utils import get_topk
 from test.utils import revert_liger_kernel_to_gemma
 from test.utils import revert_liger_kernel_to_gemma2
 from test.utils import revert_liger_kernel_to_gemma3_text
@@ -849,17 +851,17 @@ def run_mini_model(
         eval_output = model(**eval_batch)
     print(f"Eval Loss: {eval_output.loss.item()}")
     loss_list.append(eval_output.loss.item())
-
+    topk_logprobs = get_topk(get_logprobs(eval_output.logits))
     MINI_MODEL_SETUPS[model_name].liger_kernel_patch_revert_func(**revert_kwargs)
     return {
         "loss": loss_list,
-        "logits": eval_output.logits,
+        "topk_logprobs": topk_logprobs.values,
        "model": model,
     }
 
 
 @pytest.mark.parametrize(
-    "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logits_atol, logits_rtol, param_atol, param_rtol",
+    "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logprobs_atol, logprobs_rtol, param_atol, param_rtol",
     [
         ("mini_llama3", 32, 1e-4, torch.float32, 1e-8, 2e-5, 1e-4, 1e-5, 5e-3, 1e-5),
         pytest.param(
@@ -1013,7 +1015,7 @@ def run_mini_model(
         # TODO: mixtral is flaky so disable the test for now
         # ("mini_mixtral", 32, 1e-4, torch.float32, 5e-4, 1e-4, 5e-3, 1e-5, 1e-2, 1e-5),
         # Gemma 1.1 and 2 has more tolerance because currently, the kernel is not a perfect match (casts are not done the same way)
-        ("mini_gemma1", 32, 1e-4, torch.float32, 1e-8, 1e-4, 5e-3, 1e-5, 5e-3, 1e-5),
+        ("mini_gemma1", 32, 1e-4, torch.float32, 1e-8, 1e-4, 5e-3, 1e-2, 5e-3, 1e-5),
         ("mini_gemma1.1", 32, 1e-4, torch.float32, 1e-8, 1e-4, 5e-3, 1e-5, 5e-3, 1e-5),
         ("mini_gemma2", 32, 1e-4, torch.float32, 1e-8, 1e-4, 5e-3, 1e-5, 5e-3, 1e-5),
         pytest.param(
@@ -1041,8 +1043,8 @@ def test_mini_model(
     dtype,
     loss_atol,
     loss_rtol,
-    logits_atol,
-    logits_rtol,
+    logprobs_atol,
+    logprobs_rtol,
     param_atol,
     param_rtol,
 ):
@@ -1060,13 +1062,13 @@ def test_mini_model(
         rtol=loss_rtol,
     )
 
-    # Compare the logits from evaluation step
-    if expected_output["logits"] is not None and actual_output["logits"] is not None:
+    # Compare the topk logprobs from evaluation step
+    if expected_output["topk_logprobs"] is not None and actual_output["topk_logprobs"] is not None:
         assert_verbose_allclose(
-            expected_output["logits"],
-            actual_output["logits"],
-            atol=logits_atol,
-            rtol=logits_rtol,
+            expected_output["topk_logprobs"],
+            actual_output["topk_logprobs"],
+            atol=logprobs_atol,
+            rtol=logprobs_rtol,
         )
 
     # Compare the params from the last step
diff --git a/test/convergence/fp32/test_mini_models_multimodal.py b/test/convergence/fp32/test_mini_models_multimodal.py
index 52aaa967c..70bee83f6 100644
--- a/test/convergence/fp32/test_mini_models_multimodal.py
+++ b/test/convergence/fp32/test_mini_models_multimodal.py
@@ -20,6 +20,8 @@
 from test.utils import UNTOKENIZED_DATASET_PATH
 from test.utils import MiniModelConfig
 from test.utils import assert_verbose_allclose
+from test.utils import get_logprobs
+from test.utils import get_topk
 from test.utils import is_torchvision_available
 from test.utils import load_image_processing_config
 from test.utils import load_processor_config
@@ -762,11 +764,16 @@ def run_mini_model_multimodal(
         print(f"Step {i}, Loss: {output.loss.item()}")
         loss_list.append(output.loss.item())
 
-    return {"loss": loss_list, "logits": output.logits, "model": model}
+    topk_logprobs = get_topk(get_logprobs(output.logits))
+    return {
+        "loss": loss_list,
+        "topk_logprobs": topk_logprobs.values,
+        "model": model,
+    }
 
 
 @pytest.mark.parametrize(
-    "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logits_atol, logits_rtol, param_atol, param_rtol",
+    "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logprobs_atol, logprobs_rtol, param_atol, param_rtol",
     [
         pytest.param(
             "mini_qwen2_vl",
@@ -875,10 +882,10 @@ def run_mini_model_multimodal(
             32,
             1e-4,
             torch.float32,
-            1e-8,
-            1e-5,
-            5e-3,
-            1e-5,
+            1e-3,
+            1e-3,
+            1e-1,
+            1e-1,
             5e-3,
             1e-5,
             marks=[
@@ -898,8 +905,8 @@ def test_mini_model_multimodal(
     dtype,
     loss_atol,
     loss_rtol,
-    logits_atol,
-    logits_rtol,
+    logprobs_atol,
+    logprobs_rtol,
     param_atol,
     param_rtol,
 ):
@@ -920,10 +927,10 @@ def test_mini_model_multimodal(
 
     # Compare the logits from the last step
     assert_verbose_allclose(
-        expected_output["logits"],
-        actual_output["logits"],
-        atol=logits_atol,
-        rtol=logits_rtol,
+        expected_output["topk_logprobs"],
+        actual_output["topk_logprobs"],
+        atol=logprobs_atol,
+        rtol=logprobs_rtol,
     )
 
     # Compare the params from the last step
diff --git a/test/convergence/fp32/test_mini_models_with_logits.py b/test/convergence/fp32/test_mini_models_with_logits.py
index 457df66d6..f6a160453 100644
--- a/test/convergence/fp32/test_mini_models_with_logits.py
+++ b/test/convergence/fp32/test_mini_models_with_logits.py
@@ -38,6 +38,8 @@
 from test.utils import DEFAULT_DATASET_PATH
 from test.utils import MiniModelConfig
 from test.utils import assert_verbose_allclose
+from test.utils import get_logprobs
+from test.utils import get_topk
 from test.utils import revert_liger_kernel_to_gemma
 from test.utils import revert_liger_kernel_to_gemma2
 from test.utils import revert_liger_kernel_to_gemma3_text
@@ -841,12 +843,17 @@ def run_mini_model(
         print(f"Step {i}, Loss: {output.loss.item()}")
         loss_list.append(output.loss.item())
 
+    topk_logprobs = get_topk(get_logprobs(output.logits))
     MINI_MODEL_SETUPS[model_name].liger_kernel_patch_revert_func(**revert_kwargs)
-    return {"loss": loss_list, "logits": output.logits, "model": model}
+    return {
+        "loss": loss_list,
+        "topk_logprobs": topk_logprobs.values,
+        "model": model,
+    }
 
 
 @pytest.mark.parametrize(
-    "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logits_atol, logits_rtol, param_atol, param_rtol",
+    "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logprobs_atol, logprobs_rtol, param_atol, param_rtol",
     [
         ("mini_llama3", 32, 1e-4, torch.float32, 1e-8, 2e-5, 1e-4, 1e-5, 5e-3, 1e-5),
         pytest.param(
@@ -1027,8 +1034,8 @@ def test_mini_model(
     dtype,
     loss_atol,
     loss_rtol,
-    logits_atol,
-    logits_rtol,
+    logprobs_atol,
+    logprobs_rtol,
     param_atol,
     param_rtol,
 ):
@@ -1048,12 +1055,11 @@ def test_mini_model(
     # No logits are materialized
 
     # import pdb; pdb.set_trace()
-    # Compare the logits from the last step
     assert_verbose_allclose(
-        expected_output["logits"],
-        actual_output["logits"],
-        atol=logits_atol,
-        rtol=logits_rtol,
+        expected_output["topk_logprobs"],
+        actual_output["topk_logprobs"],
+        atol=logprobs_atol,
+        rtol=logprobs_rtol,
     )
 
     # Compare the params from the last step
diff --git a/test/utils.py b/test/utils.py
index ab993ca95..ed01b3a83 100644
--- a/test/utils.py
+++ b/test/utils.py
@@ -57,6 +57,17 @@ def set_seed(seed=42):
     os.environ["PYTHONHASHSEED"] = str(seed)
 
 
+@torch.no_grad
+def get_logprobs(tensor):
+    return torch.nn.functional.log_softmax(tensor, dim=-1, dtype=torch.float32)
+
+
+@torch.no_grad
+def get_topk(tensor, k=20):
+    topk = torch.topk(tensor, k, dim=-1)
+    return topk
+
+
 def assert_verbose_allclose(tensor1, tensor2, rtol=1e-05, atol=1e-08, max_print=5):
     """
     Assert that two tensors are element-wise equal within a tolerance, providing detailed information about mismatches.