Skip to content

Commit f7f8384

Browse files
authored
src directory polishing (#23)
Complete the following tasks before sending your PR, and replace `[ ]` with `[x]` to indicate you have done them. - [x] run `make test` to ensure correctness - [x] run `make checkstyle` to ensure code style - [x] run `make test-convergence` to ensure convergence ``` jobuser [ ~/Liger-Kernel ]$ make checkstyle test test-convergence flake8 .; flake8_status=$?; \ isort .; isort_status=$?; \ black .; black_status=$?; \ if [ $flake8_status -ne 0 ] || [ $isort_status -ne 0 ] || [ $black_status -ne 0 ]; then \ exit 1; \ fi Skipped 1 files All done! ✨ 🍰 ✨ 45 files left unchanged. pytest --disable-warnings test/ --ignore=test/convergence =============================================================================================================== test session starts ================================================================================================================ platform linux -- Python 3.10.14, pytest-7.1.2, pluggy-1.0.0 rootdir: /home/jobuser/Liger-Kernel plugins: lipy-config-base-30.6.1, lipy-fabric-35.2.3, lipy-test-8.0.52, datadir-1.3.1, lipy-mp-34.4.191 collected 111 items test/transformers/test_cross_entropy.py .......................................................... [ 52%] test/transformers/test_fused_linear_cross_entropy.py ...... [ 57%] test/transformers/test_geglu.py ........ [ 64%] test/transformers/test_rms_norm.py ................ [ 79%] test/transformers/test_rope.py ............ [ 90%] test/transformers/test_swiglu.py ........ [ 97%] test/transformers/test_transformers_monkey_patch.py . [ 98%] test/triton/test_triton_monkey_patch.py .. 
[100%] ========================================================================================================== 111 passed in 61.60s (0:01:01) ========================================================================================================== HF_DATASETS_OFFLINE=1 pytest --disable-warnings test/convergence =============================================================================================================== test session starts ================================================================================================================ platform linux -- Python 3.10.14, pytest-7.1.2, pluggy-1.0.0 rootdir: /home/jobuser/Liger-Kernel plugins: lipy-config-base-30.6.1, lipy-fabric-35.2.3, lipy-test-8.0.52, datadir-1.3.1, lipy-mp-34.4.191 collected 8 items test/convergence/test_mini_models.py ...... [ 75%] test/convergence/test_mini_models_no_logits.py .. [100%] =========================================================================================================== 8 passed in 97.43s (0:01:37) =========================================================================================================== ```
1 parent ee0ede6 commit f7f8384

File tree

6 files changed

+14
-42
lines changed

6 files changed

+14
-42
lines changed

docs/images/memory.png

-18.9 KB
Binary file not shown.

docs/images/speedup.png

-13.2 KB
Binary file not shown.

src/liger_kernel/ops/cross_entropy.py

Lines changed: 4 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ def liger_cross_entropy_kernel(
1717
BLOCK_SIZE: tl.constexpr,
1818
):
1919
"""
20-
This kernel computes both cross entropy loss and the gradient of the _input.
20+
This kernel computes both cross entropy loss and the gradient of the input.
2121
We only consider hard label + mean reduction for now. Please refer to https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html for the math.
2222
2323
Parameters:
@@ -34,7 +34,7 @@ def liger_cross_entropy_kernel(
3434
"""
3535

3636
# https://github.com/triton-lang/triton/issues/1058
37-
# Essentially if B*T*V is too large, program_id * stride will overflow out of int32
37+
# If B*T*V is too large, program_id * stride will overflow out of int32, so we convert to int64
3838
program_id = tl.program_id(0).to(tl.int64)
3939

4040
# 1. Load Y_ptr first because if the target is ignore_index, we can return right away
@@ -90,13 +90,7 @@ def liger_cross_entropy_kernel(
9090
tl.debug_barrier()
9191

9292
# 5. Calculate the loss
93-
# Old Approach: Problematic LogSoftmax
94-
# min of bfloat16 and float32 is 1e-38, so we set a value larger than that but small enough
95-
# This will overflow if X_y * n_non_ignore is too small. Even if we add a tiny epsilon, it will still overflow
96-
# loss = -tl.log(X_y * n_non_ignore)
9793

98-
# New Approach: Safe LogSoftmax
99-
# Therefore, we propose to use safe logsoftmax by reordering the formula.
10094
# loss = log(softmax(X_y)) = log(e ^ (X_y - max(X)) / sum(e ^ (X - max(X))))
10195
# = (X_y - max(X)) - log(sum(e ^ (X - max(X))))
10296
# sum(e ^ (X - max(X))) must be >= 1 because the max term is e ^ 0 = 1
@@ -114,7 +108,7 @@ def liger_cross_entropy_kernel(
114108
# The hard limit of TRITON_MAX_TENSOR_NUMEL is 1048576 https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/language/core.py#L19
115109
# However, setting limit as 65536 as in LayerNorm tutorial is faster because of less register spilling
116110
# The optimal maximum block size depends on your hardware, your kernel, and your dtype
117-
MAX_FUSED_SIZE = 65536 // 2 # manual tune a bit
111+
MAX_FUSED_SIZE = 65536 // 2 # the best size we found by manually tuning
118112

119113

120114
@triton.jit
@@ -184,28 +178,6 @@ def forward(ctx, _input, target, ignore_index):
184178
n_non_ignore = (target != ignore_index).sum().item()
185179

186180
# ensure _input and target are contiguous in the last dimension
187-
# there are examples that are NOT contiguous overall but contiguous in the last dimension
188-
####################################################################
189-
# tensor = torch.arange(1, 21).reshape(5, -1)
190-
# print(tensor)
191-
# tensor([[ 1, 2, 3, 4],
192-
# [ 5, 6, 7, 8],
193-
# [ 9, 10, 11, 12],
194-
# [13, 14, 15, 16],
195-
# [17, 18, 19, 20]])
196-
# print(tensor.is_contiguous())
197-
# True
198-
# slice = tensor[::2, :]
199-
# print(slice)
200-
# tensor([[ 1, 2, 3, 4],
201-
# [ 9, 10, 11, 12],
202-
# [17, 18, 19, 20]])
203-
# print(slice.is_contiguous())
204-
# False
205-
# print(slice.stride())
206-
# (8, 1)
207-
# slice is NOT a contiguous tensor but is contiguous in the last dimension, CE kernel can execute because the stride is 8, and each triton program will jump by 8
208-
####################################################################
209181
if _input.stride(-1) != 1:
210182
_input = _input.contiguous()
211183
if target.stride(-1) != 1:
@@ -252,10 +224,9 @@ def backward(ctx, grad_output):
252224
# If cross entropy is the last layer, grad_output is 1.0. Skip the mul to save time
253225
if torch.equal(grad_output, torch.tensor(1.0, device=grad_output.device)):
254226
pass
227+
255228
# We use a Triton kernel instead of a PyTorch operation because modifying inputs in-place
256229
# for gradient storage and running backward multiple times causes anomalies with PyTorch but not with Triton.
257-
# Although the Brew trainer should only perform backward once, it encounters this issue.
258-
# https://github.com/triton-lang/triton/issues/4004
259230
else:
260231
BT, V = _input.shape
261232
n_rows = BT

src/liger_kernel/ops/fused_linear_cross_entropy.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,3 @@
1-
"""Fusing the last linear layer with cross-entropy loss
2-
3-
Reference: https://github.com/mgmalek/efficient_cross_entropy
4-
"""
5-
61
import torch
72
import triton
83

@@ -11,13 +6,16 @@
116
# The hard limit of TRITON_MAX_TENSOR_NUMEL is 1048576 https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/language/core.py#L19
127
# However, setting limit as 65536 as in LayerNorm tutorial is faster because of less register spilling
138
# The optimal maximum block size depends on your hardware, your kernel, and your dtype
14-
MAX_FUSED_SIZE = 65536 // 2 # manual tune a bit
9+
MAX_FUSED_SIZE = 65536 // 2
1510

1611

1712
class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
1813
@staticmethod
1914
def forward(ctx, _input, linear, target, ignore_index):
2015
"""
16+
Fusing the last linear layer with cross-entropy loss
17+
Reference: https://github.com/mgmalek/efficient_cross_entropy
18+
2119
Handle the forward and backward pass of the final linear layer via cross-entropy loss by avoiding
2220
the materialization of the large logits tensor. Since Cross Entropy Loss is the last layer, we can
2321
compute the gradient at the forward pass. By doing so, we don't have to store the _input and target
@@ -54,6 +52,8 @@ def forward(ctx, _input, linear, target, ignore_index):
5452

5553
grad_linear = torch.zeros_like(linear, device=device)
5654
grad_input = torch.zeros_like(_input, device=device)
55+
56+
# we use fp32 for loss accumulator
5757
loss_1d = torch.zeros(BT, dtype=torch.float32, device=device)
5858

5959
total_n_non_ignore = (target != ignore_index).sum().item()

src/liger_kernel/transformers/model/llama.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,9 @@ def lce_forward(
3737
cache_position: Optional[torch.LongTensor] = None,
3838
) -> Union[Tuple, CausalLMOutputWithPast]:
3939
r"""
40+
Copy-paste of the Llama forward pass, with torch cross entropy replaced by Liger fused linear cross entropy.
41+
42+
4043
Args:
4144
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
4245
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,

src/liger_kernel/transformers/monkey_patch.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ def apply_liger_kernel_to_llama(
1414
) -> None:
1515
"""
1616
Apply Liger kernels to replace original implementation in HuggingFace Llama models (2 and 3)
17-
to make GPU go burrr.
1817
1918
Args:
2019
rope (bool): Whether to apply Liger's rotary position embedding. Default is True.
@@ -53,7 +52,6 @@ def apply_liger_kernel_to_mistral(
5352
) -> None:
5453
"""
5554
Apply Liger kernels to replace original implementation in HuggingFace Mistral models
56-
to make GPU go burrr.
5755
5856
Args:
5957
rope (bool): Whether to apply Liger's rotary position embedding. Default is True.
@@ -82,12 +80,12 @@ def apply_liger_kernel_to_mixtral(
8280
) -> None:
8381
"""
8482
Apply Liger kernels to replace original implementation in HuggingFace Mixtral models
85-
to make GPU go burrr.
8683
8784
Args:
8885
rope (bool): Whether to apply Liger's rotary position embedding. Default is True.
8986
cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is True.
9087
rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True.
88+
swiglu (bool): Whether to apply Liger's SwiGLU MLP. Default is True.
9189
"""
9290

9391
from transformers.models.mixtral import modeling_mixtral

0 commit comments

Comments
 (0)