import math
import neuronxcc.nki as nki
import neuronxcc.nki.language as nl
import torch_xla.core.xla_model as xm

@nki.jit
def nki_softmax_kernel(a_tensor):
  # Compute out_tensor = softmax(a_tensor) along the last dimension,
  # where softmax(x) = exp(x - max(x)) / sum(exp(x - max(x)))
  out_tensor = nl.ndarray(a_tensor.shape, dtype=a_tensor.dtype,
                          buffer=nl.shared_hbm)

  # Generate tensor indices to index the input tensor
  ix = nl.arange(128)[:, None]
  iy = nl.arange(a_tensor.shape[1])[None, :]

  num_rows = a_tensor.shape[0]

  # Process 128 rows at a time due to the 128-partition tile size limitation.
  # Since we are not reducing across the first dimension,
  # the tiles can be processed independently.
  for i in nl.affine_range(math.ceil(a_tensor.shape[0] / 128)):

    # Load input data from external memory to on-chip memory
    a_tile = nl.load(a_tensor[i * 128 + ix, iy],
                     mask=(i * 128 + ix < num_rows))

    # Find the row max and subtract it from each value for numerical stability
    max_vals = nl.max(a_tile, axis=[1], keepdims=True,
                      mask=(i * 128 + ix < num_rows))
    shifted = nl.subtract(a_tile, max_vals,
                          mask=(i * 128 + ix < num_rows))

    # Compute the element-wise exponential of the shifted tile
    numerator = nl.exp(shifted)

    # Sum the exponentials along the last dimension
    denominator = nl.sum(numerator, axis=[1])

    # Normalize: divide each exponential by its row sum
    sm = numerator / denominator

    # Store the results back to external memory (out_tensor)
    nl.store(out_tensor[i * 128 + ix, iy], value=sm,
             mask=(i * 128 + ix < num_rows))

  return out_tensor
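

# A minimal usage sketch, not part of the kernel above: run the kernel on a
# Neuron device through torch_xla and compare it against the PyTorch softmax.
# The input shape (256, 512) and the comparison below are illustrative
# assumptions only.
if __name__ == "__main__":
  import torch
  from torch.nn import functional as F

  device = xm.xla_device()
  a = torch.rand((256, 512), dtype=torch.float32).to(device=device)

  out_nki = nki_softmax_kernel(a)   # NKI kernel on the Neuron device
  out_ref = F.softmax(a, dim=-1)    # PyTorch reference implementation

  # Move both results to host memory before comparing them
  max_diff = (out_nki.cpu() - out_ref.cpu()).abs().max().item()
  print(f"max abs diff vs torch softmax: {max_diff}")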