Add backward operator support for vector_exp (#501)

aditvenk · Aditya Venkataraman · web-flow · commit 182b1032d942 · 2025-10-01T11:58:29.000-07:00
- Add Triton kernel for backward.

Testing:
Forward:
```
$ python run.py --op vector_exp --metrics accuracy
First-k mode: Selected 16 sequential inputs starting from index 0 (total available: 16)
Input IDs to run: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  6.34it/s]
    x_val    triton_exp-accuracy    torch_compile_exp-accuracy
---------  ---------------------  ----------------------------
     4096                      1                             1
     8192                      1                             1
    16384                      1                             1
    32768                      1                             1
    65536                      1                             1
   131072                      1                             1
   262144                      1                             1
   524288                      1                             1
  1048576                      1                             1
  2097152                      1                             1
  4194304                      1                             1
  8388608                      1                             1
 16777216                      1                             1
 33554432                      1                             1
 67108864                      1                             1
134217728                      1                             1
  average                      1                             1
  ```

Backward:
```
$ python run.py --op vector_exp --metrics accuracy --bwd
First-k mode: Selected 16 sequential inputs starting from index 0 (total available: 16)
Input IDs to run: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  5.49it/s]
    x_val    triton_exp-accuracy    torch_compile_exp-accuracy
---------  ---------------------  ----------------------------
     4096                      1                             1
     8192                      1                             1
    16384                      1                             1
    32768                      1                             1
    65536                      1                             1
   131072                      1                             1
   262144                      1                             1
   524288                      1                             1
  1048576                      1                             1
  2097152                      1                             1
  4194304                      1                             1
  8388608                      1                             1
 16777216                      1                             1
 33554432                      1                             1
 67108864                      1                             1
134217728                      1                             1
  average                      1                             1
  ```

Co-authored-by: Aditya Venkataraman <avenkataraman@fb.com>
diff --git a/tritonbench/operators/vector_exp/kernels.py b/tritonbench/operators/vector_exp/kernels.py
@@ -1,3 +1,4 @@
+import torch
 import triton
 import triton.language as tl
 
@@ -36,3 +37,90 @@ def triton_exp_kernel(
     if profile_mem is not None:
         end = time()
         tl.store(profile_mem + pid, end - start)
+
+
+@triton.jit
+def triton_exp_backward_kernel(
+    grad_output_ptr,  # *Pointer* to grad_output vector.
+    output_ptr,  # *Pointer* to forward pass output vector (exp(x)).
+    grad_input_ptr,  # *Pointer* to grad_input vector.
+    n_elements,  # Size of the vector.
+    BLOCK_SIZE: tl.constexpr,  # Number of elements each program should process.
+    profile_mem=None,  # Optional buffer for per-program timings; None disables it.
+):
+    """Backward of exp: store grad_output * output into grad_input, elementwise."""
+    if profile_mem is not None:
+        start = time()  # NOTE(review): `time` is defined outside this hunk.
+
+    # Each program instance handles one BLOCK_SIZE-wide slice; pid selects which one.
+    pid = tl.program_id(axis=0)  # We use a 1D launch grid so axis is 0.
+
+    # Element offsets covered by this program.
+    block_start = pid * BLOCK_SIZE
+    offsets = block_start + tl.arange(0, BLOCK_SIZE)
+    # Mask out offsets at or beyond n_elements so the tail block stays in bounds.
+    mask = offsets < n_elements
+
+    # Load this program's slice of grad_output and of the forward output.
+    grad_output = tl.load(grad_output_ptr + offsets, mask=mask)
+    output = tl.load(output_ptr + offsets, mask=mask)
+
+    # Compute grad_input = grad_output * output (since d/dx(exp(x)) = exp(x))
+    grad_input = grad_output * output
+
+    # Store the result under the same mask.
+    tl.store(grad_input_ptr + offsets, grad_input, mask=mask)
+
+    if profile_mem is not None:
+        end = time()
+        tl.store(profile_mem + pid, end - start)  # one slot per program
+
+
+class TritonExpFunction(torch.autograd.Function):
+    """Autograd wrapper pairing the Triton exp kernel with its backward kernel."""
+
+    @staticmethod
+    def forward(
+        ctx, x: torch.Tensor, block_size: int = 1024, profile_mem: torch.Tensor = None
+    ):
+        """Launch ``triton_exp_kernel`` on ``x`` and stash the result for backward."""
+        # The launch treats x as a flat run of n_elements values; make that
+        # layout explicit (no-op when x is already contiguous).
+        x = x.contiguous()
+        output = torch.empty_like(x)
+        n_elements = output.numel()
+
+        # 1D grid: one program per block_size-wide slice.
+        grid = (triton.cdiv(n_elements, block_size),)
+        triton_exp_kernel[grid](
+            x, output, n_elements, BLOCK_SIZE=block_size, profile_mem=profile_mem
+        )
+
+        # d/dx exp(x) = exp(x), so backward only needs the forward output.
+        ctx.save_for_backward(output)
+        ctx.block_size = block_size
+        ctx.profile_mem = profile_mem  # backward writes into the same buffer
+        return output
+
+    @staticmethod
+    def backward(ctx, grad_output: torch.Tensor):
+        """Compute the gradient for ``x``; the other forward args get ``None``."""
+        (output,) = ctx.saved_tensors
+
+        # Autograd may hand over a strided or expanded view (e.g. after
+        # ``sum()``); the kernel's flat pointer offsets need contiguous memory.
+        grad_output = grad_output.contiguous()
+        grad_input = torch.empty_like(grad_output)
+        n_elements = grad_output.numel()
+
+        grid = (triton.cdiv(n_elements, ctx.block_size),)
+        triton_exp_backward_kernel[grid](
+            grad_output,
+            output,
+            grad_input,
+            n_elements,
+            BLOCK_SIZE=ctx.block_size,
+            profile_mem=ctx.profile_mem,
+        )
+
+        return grad_input, None, None
diff --git a/tritonbench/operators/vector_exp/operator.py b/tritonbench/operators/vector_exp/operator.py
@@ -10,7 +10,7 @@
     register_metric,
 )
 
-from .kernels import triton_exp_kernel
+from .kernels import TritonExpFunction
 
 
 class Operator(BenchmarkOperator):
@@ -46,27 +46,14 @@ def duration(self, fn, example_inputs, metrics: BenchmarkOperatorMetrics):
 
     @register_benchmark()
     def triton_exp(self, x: torch.Tensor):
-        # We need to preallocate the output.
-        output = torch.empty_like(x)
-        n_elements = output.numel()
-        # The SPMD launch grid denotes the number of kernel instances that run in parallel.
-        # It is analogous to CUDA launch grids. It can be either Tuple[int], or Callable(metaparameters) -> Tuple[int].
-        # In this case, we use a 1D grid where the size is the number of blocks:
-        grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)
-        # NOTE:
-        #  - Each torch.tensor object is implicitly converted into a pointer to its first element.
-        #  - `triton.jit`'ed functions can be indexed with a launch grid to obtain a callable GPU kernel.
-        #  - Don't forget to pass meta-parameters as keywords arguments.
-
+        n_elements = x.numel()  # sizes the per-program profiling buffer below
         # Prepare a memory buffer to store the profiled data, with the size equal to the number of programs.
         BLOCK_SIZE = 1024
         n_programs = triton.cdiv(n_elements, BLOCK_SIZE)
         profile_mem = torch.empty(n_programs, dtype=torch.int64, device=self.device)
 
         def _inner():
-            triton_exp_kernel[grid](
-                x, output, n_elements, BLOCK_SIZE=1024, profile_mem=profile_mem
-            )
+            output = TritonExpFunction.apply(x, BLOCK_SIZE, profile_mem)  # via autograd
             return {"output": output, "profile_mem": profile_mem}
 
         return _inner
@@ -133,5 +120,23 @@ def _plot(size, provider):
 
     def get_input_iter(self) -> Generator:
         for size in self.get_x_vals():
-            x = torch.rand(size, device=self.device, dtype=self.dtype)
+            x = torch.rand(
+                size, device=self.device, dtype=self.dtype, requires_grad=True
+            )
             yield (x,)
+
+    def get_bwd_fn(self, fwd_fn: Callable) -> Callable:
+        """Return a closure that reruns ``fwd_fn`` and backpropagates its output."""
+        def _bwd():
+            inp = self.example_inputs[0]
+            inp.grad = None  # drop any gradient left over from an earlier call
+            fwd_out = fwd_fn()["output"]
+            # Backpropagate a random upstream gradient shaped like the output.
+            fwd_out.backward(torch.randn_like(fwd_out), retain_graph=True)
+            return {"output": inp.grad}
+
+        return _bwd
+
+    def get_grad_to_none(self, args) -> List[torch.Tensor]:
+        """Return a one-element list holding the first entry of ``args``."""
+        return [args[0]]