
Commit 838cd07

use atomics for embedding backward (tinygrad#14400)
* embedding is slow
* failing
* float is fine
* null
* it fails
* simplify embedding with broadcasting
* ATOMIC_ADD incoming
* min change
* simpler test
* better test
* fix test
* real test
* simpler
* cleanups
* types and names
* _zero_kernel
* grad multi
* hack
* none
* multi unshard
* more for call
* don't tag in call
* good
* call_multi
* call_multi wow claude is useless
* embedding backward multi test
* test passes
* fix as_param
* shape_to_shape_arg
* add clip
* before cast
* fix spec=2, use atomics

1 parent 1998e0b commit 838cd07

File tree

10 files changed: +134 -23 lines

examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama8b/implementations/tinybox_8xMI350X/dev_run.sh

Lines changed: 1 addition & 0 deletions
@@ -9,6 +9,7 @@ export REWRITE_STACK_LIMIT=5000000 HCQDEV_WAIT_TIMEOUT_MS=240000
 export DEBUG=${DEBUG:-0}
 export FLASH_ATTENTION=${FLASH_ATTENTION:-1}
 export ALL2ALL=${ALL2ALL:-1}
+export USE_ATOMICS=${USE_ATOMICS:-1}

 export DEFAULT_FLOAT="bfloat16" OPTIM_DTYPE="bfloat16"
 export DP=${DP:-8} BS=8 EVAL_BS=8 GRADIENT_ACC_STEPS=2

test/test_arange.py

Lines changed: 25 additions & 0 deletions
@@ -163,5 +163,30 @@ def test_llama_embedding(self, noopt=1, op_limit=65536):
   # at least the arange is being fused
   def test_llama_embedding_opt(self): self.test_llama_embedding(0, 1_736_704_000)

+  # NOTE: call doesn't work with SPEC=2
+  @unittest.skipIf(Device.DEFAULT not in ("CPU", "AMD"), "atomics only on AMD/CPU")
+  @Context(USE_ATOMICS=1, SPEC=1)
+  def test_llama_8b_embedding_backward(self):
+    from tinygrad.renderer.cstyle import CStyleLanguage
+    if Device.DEFAULT == "CPU" and not isinstance(Device["CPU"].renderer, CStyleLanguage): self.skipTest("CPU needs Clang renderer")
+    vocab_size, embed_size = 1000, 128
+    bs, seqlen = 4, 256
+    idx = Tensor.randint(bs, seqlen, high=vocab_size)
+    emb = nn.Embedding(vocab_size, embed_size)
+    emb.weight = Tensor.ones(vocab_size, embed_size, requires_grad=True)
+    gt = Tensor.zeros(bs, seqlen, embed_size)
+    Tensor.realize(idx, emb.weight, gt)
+    GlobalCounters.reset()
+    loss = (emb(idx)-gt).square().sum()
+    loss.backward()
+    emb.weight.grad.realize()
+    bwd_ops = GlobalCounters.global_ops
+    print(f"embedding bwd: {GlobalCounters.kernel_count} kernels, {bwd_ops:,} ops")
+    self.assertLess(bwd_ops, bs*seqlen*embed_size*20, f"backward ops {bwd_ops:,} should be less than 20 per element with atomic scatter-add")
+    # correctness check
+    expected_grad = np.zeros((vocab_size, embed_size), dtype=np.float32)
+    for i in idx.flatten().numpy(): expected_grad[i] += 2
+    np.testing.assert_allclose(emb.weight.grad.numpy(), expected_grad, rtol=1e-5, atol=1e-5)
+
 if __name__ == "__main__":
   unittest.main()
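
A note on the correctness check in this test: the weights are all ones and the target is zero, so each occurrence of a token contributes d/dw (w - 0)^2 = 2w = 2 to that token's gradient row, which is exactly what the expected_grad loop accumulates. A minimal numpy sanity check of that expectation (toy index values, not taken from the test):

import numpy as np

# loss = sum((emb(idx) - 0)**2) with weight == 1 everywhere, so
# d loss / d weight[v, :] = 2 * (number of times token v appears in idx)
idx = np.array([0, 2, 2, 1])            # toy index values
counts = np.bincount(idx, minlength=4)  # [1, 1, 2, 0]
print(2.0 * counts)                     # [2. 2. 4. 0.]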

test/test_multitensor.py

Lines changed: 22 additions & 0 deletions
@@ -409,6 +409,28 @@ def test_embedding(self):

     np.testing.assert_allclose(z.numpy(), z_shard.numpy(), atol=1e-6, rtol=1e-6)

+  def test_embedding_backward(self, shard_weight_axis=None):
+    B, T, embed_size, vocab_size = 4, 10, 20, 28
+
+    layer = nn.Embedding(vocab_size, embed_size)
+    layer.weight.requires_grad = True
+    x = Tensor(np.random.randint(0, vocab_size, (B, T), dtype=np.int32))
+    z = layer(x)
+    z.sum().backward()
+    grad = layer.weight.grad.numpy()
+
+    layer_sharded = nn.Embedding(vocab_size, embed_size)
+    layer_sharded.weight.replace(layer.weight.shard(devices_2, axis=shard_weight_axis)).realize()
+    layer_sharded.weight.requires_grad = True
+    x_sharded = x.shard(devices_2, axis=None)
+    z_shard = layer_sharded(x_sharded)
+    z_shard.sum().backward()
+    grad_shard = layer_sharded.weight.grad.numpy()
+
+    np.testing.assert_allclose(grad, grad_shard, atol=1e-6, rtol=1e-6)
+
+  def test_embedding_backward_shard_weight(self): self.test_embedding_backward(shard_weight_axis=1)
+
   def test_rmsnorm(self):
     B, T, embed_size = 4, 10, 20
test/unit/test_call.py

Lines changed: 10 additions & 2 deletions
@@ -30,18 +30,26 @@ def grad_fxn(grad:UOp, call:UOp): return (grad, grad)

     # we define a plus function
     plus_fxn = UOp.param(0, dtypes.float, (10,10)) + UOp.param(1, dtypes.float, (10,10))
-    c = Tensor.call(a, b, fxn=plus_fxn, arg=grad_fxn)
+    c = Tensor.call(a, b, fxn=plus_fxn, grad_fxn=grad_fxn)
     c.mean().backward()

     np.testing.assert_allclose(a.grad.numpy(), gt_a_grad, rtol=1e-5)
     np.testing.assert_allclose(b.grad.numpy(), gt_b_grad, rtol=1e-5)

-  @unittest.skip("needs GEMM on mixins")
   def test_call_gemm(self):
     M, K, N = 4, 8, 4
     a = Tensor.randn(M, K)
     b = Tensor.randn(K, N)
     Tensor.realize(a, b)
+    c = Tensor.call(a, b, fxn=a.as_param(0) @ b.as_param(1))
+    np.testing.assert_allclose(c.numpy(), a.numpy() @ b.numpy(), rtol=1e-5)
+
+  @unittest.skip("needs GEMM on mixins")
+  def test_call_gemm_uop(self):
+    M, K, N = 4, 8, 4
+    a = Tensor.randn(M, K)
+    b = Tensor.randn(K, N)
+    Tensor.realize(a, b)

     # we define a gemm function
     x = UOp.param(0, dtypes.float, shape=(M, K))

tinygrad/helpers.py

Lines changed: 2 additions & 0 deletions
@@ -204,6 +204,8 @@ def tolist(self, obj=None):
 ALLOW_TF32 = ContextVar("ALLOW_TF32", 0)
 # set to 0 to disable the scheduler cache
 SCACHE = ContextVar("SCACHE", 1)
+# allow use of atomics for embedding backward
+USE_ATOMICS = ContextVar("USE_ATOMICS", 0)

 @dataclass(frozen=True)
 class Metadata:
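
USE_ATOMICS is an ordinary ContextVar, so it can be enabled per process via the environment (USE_ATOMICS=1) or scoped with the Context helper, as the new test in test_arange.py does. A minimal sketch, assuming a backend with atomics support (CPU with the Clang renderer, or AMD):

from tinygrad import Tensor, nn
from tinygrad.helpers import Context

# inside this block, Embedding backward takes the atomic scatter-add path
with Context(USE_ATOMICS=1):
  emb = nn.Embedding(100, 16)
  emb.weight.requires_grad = True
  out = emb(Tensor.randint(4, 8, high=100))
  out.sum().backward()
  print(emb.weight.grad.shape)  # (100, 16)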

tinygrad/nn/__init__.py

Lines changed: 44 additions & 4 deletions
@@ -3,7 +3,7 @@
 from tinygrad.tensor import Tensor
 from tinygrad.dtype import dtypes
 from tinygrad.device import is_dtype_supported
-from tinygrad.helpers import prod, make_tuple, flatten
+from tinygrad.helpers import prod, make_tuple, flatten, USE_ATOMICS
 from tinygrad.nn import optim, state, datasets # noqa: F401

 class BatchNorm:
@@ -304,6 +304,46 @@ def __call__(self, x:Tensor) -> Tensor:
     x = self._norm(x.float()).cast(x.dtype)
     return x if self.weight is None else x * self.weight

+from tinygrad.uop.ops import UOp, KernelInfo, Ops
+def _embedding_bwd(grad_emb:UOp, call:UOp) -> tuple:
+  weight, idx = call.src[1:]
+  # for multi-device: unshard inputs to one device
+  if isinstance(weight.device, tuple):
+    assert weight.axis is None, "sharded weights on Embedding not supported with USE_ATOMICS"
+    grad_emb = grad_emb.copy_to_device(weight.device)
+    idx = idx.copy_to_device(weight.device)
+  # weight is replicated, grad_weight should match
+  grad_weight_uop = Tensor.empty(weight.shape, dtype=weight.dtype, device=weight.device).uop
+
+  # TODO: how do we remove this dumb kernel and use Tensor.zeros?
+  def _zero_kernel(out:UOp) -> UOp:
+    i = UOp.range(out.size, 0)
+    return out.flatten()[i].store(0).end(i).sink(arg=KernelInfo(name="zero"))
+  grad_weight_uop = grad_weight_uop.custom_kernel(fxn=_zero_kernel)[0]
+
+  # TODO: do we have a universal helper for this?
+  device = call.device.split(":")[0] if not isinstance(call.device, tuple) else call.device[0].split(":")[0]
+
+  # this is the real atomic kernel
+  def _embedding_bwd_kernel(grad_weight:UOp, grad_emb:UOp, idx:UOp) -> UOp:
+    idx_flat, grad_emb_flat = idx.flatten(), grad_emb.reshape((idx.size, grad_weight.shape[-1]))
+    i = UOp.range(grad_emb_flat.shape[0], 0) # batch_size * sequence_length
+    j = UOp.range(grad_emb_flat.shape[1], 1) # embed_size
+    token_id = idx_flat[i].clip(0, grad_weight.shape[0]-1).cast(dtypes.index)
+    # atomic scatter-add: grad_weight[token_id, j] += grad_emb_flat[i, j]
+    if device in ("CPU", "NULL"): atomic_arg = "__atomic_fetch_add({0}, {1}, __ATOMIC_RELAXED);"
+    elif device == "AMD": atomic_arg = "__hip_atomic_fetch_add({0}, {1}, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);"
+    else: raise NotImplementedError(f"no atomics for device {device}")
+    atomic = UOp(Ops.CUSTOM, dtypes.void, (grad_weight.index(token_id, j, ptr=True), grad_emb_flat[i, j]), arg=atomic_arg)
+    return atomic.end(i, j).sink(arg=KernelInfo(name="embedding_bwd", opts_to_apply=()))
+  grad_weight_uop = grad_weight_uop.custom_kernel(grad_emb, idx, fxn=_embedding_bwd_kernel)[0]
+
+  return (grad_weight_uop, None)
+
+def _embedding_fwd(weight:Tensor, idx:Tensor) -> Tensor:
+  arange = Tensor.arange(weight.shape[0], requires_grad=False, device=weight.device)
+  return (arange == idx.unsqueeze(-1)).unsqueeze(-1).where(weight, 0).sum(-2, dtype=weight.dtype)
+
 class Embedding:
   """
   A simple lookup table that stores embeddings of a fixed dictionary and size.
@@ -316,12 +356,12 @@ class Embedding:
   ```
   """
   def __init__(self, vocab_size:int, embed_size:int):
-    self.vocab_sz, self.embed_sz, self.weight = vocab_size, embed_size, Tensor.glorot_uniform(vocab_size, embed_size)
+    self.weight = Tensor.glorot_uniform(vocab_size, embed_size)

   def __call__(self, idx:Tensor) -> Tensor:
     if not dtypes.is_int(idx.dtype): raise TypeError(f"Expected integer dtype for index in embedding, got {idx.dtype}")
-    arange = Tensor.arange(self.weight.shape[0], requires_grad=False, device=self.weight.device)
-    return (arange == idx.unsqueeze(-1)).unsqueeze(-1).where(self.weight, 0).sum(-2, dtype=self.weight.dtype)
+    if USE_ATOMICS: return Tensor.call(self.weight, idx, fxn=_embedding_fwd(self.weight.as_param(0), idx.as_param(1)), grad_fxn=_embedding_bwd)
+    return _embedding_fwd(self.weight, idx)

 class LSTMCell:
   """

tinygrad/schedule/multi.py

Lines changed: 2 additions & 1 deletion
@@ -202,7 +202,7 @@ def assign_multi(dest:UOp, src:UOp):
   return dest.src[0].assign(src.src[0]).multi(src.axis)

 def passthrough_multi(root:UOp, multi:UOp):
-  return UOp(root.op, root.dtype, (multi.src[0],), root.arg).multi(multi.axis)
+  return UOp(root.op, root.dtype, (multi.src[0],)+tuple(x.src[0] if x.op is Ops.MULTI else x for x in root.src[1:]), root.arg).multi(multi.axis)

 # NOTE: this is the same pattern as Ops.UNROLL
 multi_pm = PatternMatcher([
@@ -218,6 +218,7 @@ def passthrough_multi(root:UOp, multi:UOp):
   (UPat(Ops.COPY, src=(UPat(Ops.MULTI, name="multi"), UPat(Ops.DEVICE, name="device"))), copy_multi),
   (UPat(Ops.ALLREDUCE, src=(UPat(Ops.MULTI, name="multi"), UPat(Ops.DEVICE, name="device")), name="red"),
    lambda multi,device,red: multi.src[0].allreduce(red.arg, device).multi(axis=multi.axis)),
+  (UPat(Ops.CALL, src=(UPat(Ops.MULTI, name="multi"), ), name="root", allow_any_len=True), passthrough_multi),
   (UPat((Ops.CAST, Ops.BITCAST, Ops.CONTIGUOUS, Ops.DETACH, Ops.CONTIGUOUS_BACKWARD),
     src=(UPat(Ops.MULTI, name="multi"), ), name="root"), passthrough_multi),
   # multi supports custom kernels with CUSTOM_KERNEL + AFTER

tinygrad/schedule/rangeify.py

Lines changed: 10 additions & 6 deletions
@@ -71,12 +71,13 @@ def resolve_custom_kernel(ck:UOp) -> UOp:
 def resolve_call(c:UOp) -> UOp:
   params = sorted([x for x in c.src[0].toposort() if x.op == Ops.PARAM], key=lambda x: x.arg)
   args = c.src[1:]
+  # TODO: this check belongs in spec, not here
   if [x.arg for x in params] != list(range(len(params))): raise RuntimeError(f"params not in order: {[x.arg for x in params]}")
   if len(params) != len(args): raise TypeError(f"expected {len(params)} args, got {len(args)}")
   for i, (p, a) in enumerate(zip(params, args)):
     if p.shape != a.shape: raise TypeError(f"arg {i} shape mismatch: expected {p.shape}, got {a.shape}")
     if p.dtype != a.dtype: raise TypeError(f"arg {i} dtype mismatch: expected {p.dtype}, got {a.dtype}")
-  return c.src[0].substitute(dict(zip(params, args)))
+  return c.src[0].substitute(dict(zip(params, args))).rtag(c.tag)

 earliest_rewrites = mop_cleanup+PatternMatcher([
   # just removing it works...
@@ -533,11 +534,14 @@ def split_store(ctx:list[UOp], x:UOp) -> UOp|None:
   (UPat((Ops.STORE, Ops.END), name="x"), split_store),
 ])

-def tag_uop(ctx:list[UOp], x:UOp):
-  if x.tag is not None: return None
+def tag_uop(ctx:tuple[list[UOp], set[UOp]], x:UOp):
+  if x.tag is not None or x in ctx[1]: return None
+  if x.tag is None and x.op is Ops.CALL:
+    # don't tag anything in a CALL
+    for u in x.src[0].toposort(): ctx[1].add(u)
   if x.dtype.scalar() == dtypes.index: return None
-  ctx.append(x)
-  return x.replace(tag=(len(ctx)-1,))
+  ctx[0].append(x)
+  return x.replace(tag=(len(ctx[0])-1,))
 add_tags = PatternMatcher([
   # don't tag BUFFERs, they are global
   (UPat(GroupOp.All-{Ops.BUFFER, Ops.CONST, Ops.DEVICE, Ops.UNIQUE, Ops.LUNIQUE, Ops.DEFINE_VAR, Ops.BIND, Ops.KERNEL, Ops.END,
@@ -563,7 +567,7 @@ def found_contiguous(ctx:dict[UOp, UOp], contig:UOp, src:UOp):
 def get_rangeify_map(sink:UOp) -> dict[UOp, UOp]:
   if getenv("VIZ"): graph_rewrite(sink, PatternMatcher([]), name="View Input Graph")
   uop_list: list[UOp] = []
-  tsink = graph_rewrite(sink, add_tags, ctx=uop_list, bottom_up=True, name="number the uops")
+  tsink = graph_rewrite(sink, add_tags, ctx=(uop_list, set()), bottom_up=True, name="number the uops")

   tsink = graph_rewrite(tsink, pm_mops+earliest_rewrites+replace_contiguous, ctx={}, name="earliest rewrites")
tinygrad/tensor.py

Lines changed: 9 additions & 2 deletions
@@ -232,8 +232,15 @@ def dtype(self) -> DType: return self.uop.dtype

   # ***** data handlers ****

-  def call(self, *lst:Tensor, fxn:UOp, arg:Any=None) -> Tensor:
-    return Tensor(UOp.call(*[t.uop for t in (self,)+lst], fxn=fxn, arg=arg))
+  def as_param(self, slot:int):
+    if self.uop.axis is not None:
+      multi_shape = tuple([s//len(self.device) if i==self.uop.axis else s for i,s in enumerate(self.shape)])
+      param = UOp.param(slot, self.dtype, multi_shape, self.device).multi(self.uop.axis)
+    else:
+      param = UOp.param(slot, self.dtype, self.shape, self.device)
+    return Tensor(param, device=self.device)
+  def call(self, *lst:Tensor, fxn:Tensor|UOp, grad_fxn:Callable|None=None) -> Tensor:
+    return Tensor(UOp.call(*[t.uop for t in (self,)+lst], fxn=fxn.uop if isinstance(fxn, Tensor) else fxn, arg=grad_fxn), device=self.device)

   def custom_kernel(self, *lst:Tensor, fxn:Callable, grad_fxn:Callable|None=None) -> list[Tensor]:
     """

tinygrad/uop/ops.py

Lines changed: 9 additions & 8 deletions
@@ -58,6 +58,11 @@ def multirange_str(rngs:Iterable[UOp], color=False, pad=None) -> str:
   if pad is not None: ret += " " * (pad-ansilen(ret))
   return ret

+def shape_to_shape_arg(arg:tuple[sint, ...]) -> UOp:
+  if len(arg) == 0: return UOp(Ops.VECTORIZE, dtypes.index.vec(0))
+  elif all(isinstance(x, int) for x in arg): return UOp.const(dtypes.index.vec(len(arg)), cast(tuple[int, ...], arg))
+  else: return UOp(Ops.VECTORIZE, dtypes.index.vec(len(arg)), tuple(UOp.const(dtypes.index, x) if isinstance(x, int) else x for x in arg))
+
 def consumer_map_from_toposort(lst:Iterable[UOp]):
   ret: dict[UOp, dict[UOp, None]] = {}
   for u in lst:
@@ -222,7 +227,7 @@ def _shape(self) -> tuple[sint, ...]|None:
       case Ops.DEFINE_GLOBAL | Ops.DEFINE_LOCAL | Ops.DEFINE_REG: return (self.ptrdtype.size,)
       case Ops.PARAM:
         # NOTE: copied from marg
-        if len(self.src) == 1: return tuple(self.src[0].sgep(i) for i in range(self.src[0].dtype.count))
+        if len(self.src) >= 1: return tuple(self.src[0].sgep(i) for i in range(self.src[0].dtype.count))
         return None

       # passthrough ops
@@ -558,11 +563,7 @@ def _mop(self, op:Ops, arg, same_shape_noop:bool=False) -> UOp:
       case Ops.PAD | Ops.SHRINK: src_args = list(zip(*arg))
       case Ops.PERMUTE | Ops.FLIP: src_args = []
       case _: raise RuntimeError(f"{op} is not a MovementOp")
-    usrcs = []
-    for arg in src_args:
-      if len(arg) == 0: usrcs.append(UOp(Ops.VECTORIZE, dtypes.index.vec(0)))
-      elif all(isinstance(x, int) for x in arg): usrcs.append(UOp.const(dtypes.index.vec(len(arg)), arg))
-      else: usrcs.append(UOp(Ops.VECTORIZE, dtypes.index.vec(len(arg)), tuple(UOp.const(dtypes.index, x) if isinstance(x, int) else x for x in arg)))
+    usrcs = [shape_to_shape_arg(arg) for arg in src_args]
     if len(usrcs) == 0: ret = UOp(op, self.dtype, (self,), arg)
     else: ret = UOp(op, self.dtype, (self,)+UOp.sink(*usrcs).simplify().src)
     # for all movement ops, we check shape property to validity check the movement op
@@ -826,8 +827,8 @@ def set(self:UOp, val:UOp|ConstType, end:UOp|tuple[UOp, ...]|list[UOp]=()) -> UOp:

   # TODO: this should replace placeholder
   @staticmethod
-  def param(slot:int, dtype:DType, shape:tuple[int, ...]|None=None):
-    src = () if shape is None else (UOp.const(dtypes.index.vec(len(shape)), shape),)
+  def param(slot:int, dtype:DType, shape:tuple[sint, ...]|None=None, device=None):
+    src = (UOp(Ops.NOOP) if shape is None else shape_to_shape_arg(shape),) + (() if device is None else (UOp(Ops.DEVICE, arg=device),))
     return UOp(Ops.PARAM, dtype, src, arg=slot)

   def call(*srcs:UOp, fxn:UOp, arg:Any|None) -> UOp: return UOp(Ops.CALL, fxn.dtype, (fxn,)+srcs, arg)
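
A small sketch of what this refactor does, assuming these internals are importable exactly as shown in the diff: shape_to_shape_arg folds an all-static shape into a single vectorized index constant (and otherwise VECTORIZEs per dimension), and PARAM can now carry an optional DEVICE source, which is what Tensor.as_param relies on:

from tinygrad.uop.ops import UOp, Ops, shape_to_shape_arg
from tinygrad.dtype import dtypes

# an all-int shape becomes one index-vector constant
print(shape_to_shape_arg((4, 8)).op)   # Ops.CONST with dtype dtypes.index.vec(2)

# PARAM with the new optional device src, as built by Tensor.as_param
p = UOp.param(0, dtypes.float, (4, 8), device="CPU")
print([s.op for s in p.src])           # [Ops.CONST, Ops.DEVICE]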
