 _flash_mha_fwd_hlo_sharded = custom_partitioning(_flash_mha_fwd_hlo, static_argnums=(3,4,5))
 _flash_mha_bwd_hlo_sharded = custom_partitioning(_flash_mha_bwd_hlo, static_argnums=(6,7,8))
 
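The two wrappers above are jax.experimental.custom_partitioning objects, and the partition_fwd rule changed below only takes effect once it is registered on the forward wrapper. A minimal sketch of that registration, assuming a companion infer_sharding_fwd rule that is not shown in this hunk:

    # Hook the SPMD rules into the custom_partitioning wrapper. `partition_fwd` is
    # the rule defined below; `infer_sharding_fwd` is a hypothetical name for the
    # output-sharding inference rule that lives elsewhere in this file.
    _flash_mha_fwd_hlo_sharded.def_partition(
        partition=partition_fwd,
        infer_sharding_from_operands=infer_sharding_fwd,
    )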
+from jax._src.ad_checkpoint import _optimization_barrier
+
+def ring_fwd(softmax_scale, is_causal, axis_name, axis_size, q,k,v):
+    [n,l,h,d] = q.shape
+
+    # Index of the query block owned by this device, and of the k/v block it
+    # currently holds (k_ix travels around the ring together with k and v).
+    q_ix = jax.lax.axis_index(axis_name)
+    k_ix = jax.lax.axis_index(axis_name)
+
+    o = jnp.zeros([n,l,h,d], jnp.float32)
+    lse = jnp.full([n,h,l], float('-inf'), jnp.float32)
+
+    # scan :: (c -> a -> (c, b)) -> c -> [a] -> (c, [b])
+    def f(c, a):
+        (k, v, o, lse, k_ix) = c
+
+        o1, lse1 = o, lse
+        if is_causal:
+            # Branch index: 0 if k_ix > q_ix (block fully masked out), 1 if
+            # k_ix == q_ix (diagonal block, causal mask), 2 if k_ix < q_ix (no mask).
+            o2, lse2 = jax.lax.switch((k_ix < q_ix).astype(jnp.int32) + (k_ix <= q_ix).astype(jnp.int32),
+                                      [
+                                          lambda q,k,v: (jnp.zeros([n,l,h,d], q.dtype), jnp.full([n,h,l], float('-inf'), jnp.float32)),
+                                          lambda q,k,v: _flash_mha_fwd_hlo(q,k,v, softmax_scale=softmax_scale, is_causal=True, window_size=(-1,-1)),
+                                          lambda q,k,v: _flash_mha_fwd_hlo(q,k,v, softmax_scale=softmax_scale, is_causal=False, window_size=(-1,-1)),
+                                      ], q, k, v)
+        else:
+            o2, lse2 = _flash_mha_fwd_hlo(q,k,v, softmax_scale=softmax_scale, is_causal=False, window_size=(-1,-1))
+        o2 = o2.astype(jnp.float32)
+
+        # Combine the running result with this block's result via a stable log-sum-exp.
+        mx = jnp.maximum(lse1,lse2)
+        mn = jnp.minimum(lse1,lse2)
+        lse = jnp.log1p(jnp.exp(mn-mx)) + mx
+
+        o = (o1 * rearrange(jnp.exp(lse1 - lse), 'n h l -> n l h 1') +
+             o2 * rearrange(jnp.exp(lse2 - lse), 'n h l -> n l h 1'))
+
+        # Rotate k/v (and the block index) one step around the ring.
+        k2 = jax.lax.ppermute(k, axis_name, [(i, (i+1)%axis_size) for i in range(axis_size)])
+        v2 = jax.lax.ppermute(v, axis_name, [(i, (i+1)%axis_size) for i in range(axis_size)])
+        k_ix = jax.lax.ppermute(k_ix, axis_name, [(i, (i+1)%axis_size) for i in range(axis_size)])
+
+        return ((k2, v2, o, lse, k_ix), None)
+    acc = (k,v,o,lse,k_ix)
+    # We sadly have to unroll this loop by hand: scan breaks the axis context,
+    # which prevents us from using ppermute (unroll=axis_size doesn't help either).
+    # The optimization barrier prevents instruction reordering, so that ppermute
+    # and flash_mha execute concurrently.
+    for _ in range(axis_size):
+        acc, _ = f(acc, None)
+        acc = _optimization_barrier(acc)
+    (_,_,o,lse,_) = acc
+    # (_,_,o,lse), _ = jax.lax.scan(f,init,None,axis_size)
+    return o.astype(q.dtype), lse
+
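The accumulation above merges the running result with each block's partial result using the per-query log-sum-exp statistics, so the blocks can be combined in any order and the final output matches full softmax attention. A minimal standalone sketch of the same combination rule (the merge_partials helper is hypothetical, written with plain jax.numpy and einops for illustration):

    import jax.numpy as jnp
    from einops import rearrange

    def merge_partials(o1, lse1, o2, lse2):
        # o1, o2: [n, l, h, d] partial attention outputs;
        # lse1, lse2: [n, h, l] log-sum-exp of the scores that produced them.
        mx = jnp.maximum(lse1, lse2)
        mn = jnp.minimum(lse1, lse2)
        lse = jnp.log1p(jnp.exp(mn - mx)) + mx  # log(exp(lse1) + exp(lse2)), computed stably
        w1 = rearrange(jnp.exp(lse1 - lse), 'n h l -> n l h 1')
        w2 = rearrange(jnp.exp(lse2 - lse), 'n h l -> n l h 1')
        return o1 * w1 + o2 * w2, lse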
 def partition_fwd(softmax_scale, is_causal, window_size, mesh, arg_shapes, result_shape):
     result_shardings = jax.tree_map(lambda x: x.sharding, result_shape)
     arg_shardings = jax.tree_map(lambda x: x.sharding, arg_shapes)
 
     q_sharding = arg_shardings[0]
     if isinstance(q_sharding, PositionalSharding):
-        if not is_causal and window_size == (-1,-1):
-            # We can handle Q that's sharded across the L dimension
-            # without replicating Q by executing it as a cross
-            # attention:
-            #
-            #    q : n [L/devices] h d
-            #   kv : n L h d
-            # -> o : n [L/devices] h d
-            #
-            # TODO: We could handle q sharded across L even with
-            # causal/local if we could communicate the slice offset
-            # (of q in kv) to the c++ driver. But it's unclear how to
-            # do that since the HLO has to be identical (SPMD).
-            q_sharding = q_sharding.replicate(3)
-            kv_sharding = q_sharding.replicate(1)
-            (n,l,h,d) = q_sharding.shape
-            result_shardings = q_sharding, q_sharding.reshape((n,l,h)).transpose(0,2,1) # n h l
-            arg_shardings = q_sharding, kv_sharding, kv_sharding
-        else:
-            # We need to replicate d always.
-            q_sharding = q_sharding.replicate((1,3))
-            (n,l,h,d) = q_sharding.shape # l=1, d=1
-            result_shardings = q_sharding, q_sharding.reshape((n,l,h)).transpose(0,2,1)
-            arg_shardings = q_sharding, q_sharding, q_sharding
+        (n,l,h,d) = q_sharding.shape
+        assert d == 1, "Sharding across `d` won't be efficient, so it's not supported."
+        assert l == 1, "For ring attention, use `with Mesh(...) as mesh` and NamedSharding."
+        result_shardings = q_sharding, q_sharding.reshape((n,h,1)) # n h l
+        arg_shardings = q_sharding, q_sharding, q_sharding
     elif isinstance(q_sharding, NamedSharding):
         mesh = q_sharding.mesh
         [n,l,h,d] = q_sharding.spec
-        if not is_causal and window_size == (-1,-1):
-            q_sharding = NamedSharding(mesh, P(n,l,h,None))
-            kv_sharding = NamedSharding(mesh, P(n,None,h,None))
-            lse_sharding = NamedSharding(mesh, P(n,h,l))
+        assert d == None, "Sharding across `d` won't be efficient, so it's not supported."
+        if l != None:
+            # assert not is_causal and window_size == (-1,-1), "Ring attention doesn't support causal or local masking yet."
+            assert window_size == (-1,-1), "Ring attention doesn't support local masking yet."
+            result_shardings = q_sharding, NamedSharding(mesh, P(n,h,l))
+            arg_shardings = q_sharding, q_sharding, q_sharding
+            axis_name = l
+            axis_size = mesh.shape[axis_name]
+            # ring attention
+            return mesh, partial(ring_fwd, softmax_scale, is_causal, axis_name, axis_size), result_shardings, arg_shardings
         else:
-            q_sharding = NamedSharding(mesh, P(n,None,h,None))
-            kv_sharding = q_sharding
-            lse_sharding = NamedSharding(mesh, P(n,h,None))
-        result_sharding = (q_sharding, lse_sharding)
-        arg_shardings = (q_sharding, kv_sharding, kv_sharding)
+            result_shardings = q_sharding, NamedSharding(mesh, P(n,h,l))
+            arg_shardings = q_sharding, q_sharding, q_sharding
     def fwd(q,k,v):
         return _flash_mha_fwd_hlo(q,k,v, softmax_scale=softmax_scale, is_causal=is_causal, window_size=window_size)
     return mesh, fwd, result_shardings, arg_shardings
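With these rules in place, ring attention is selected purely by how the inputs are sharded: a NamedSharding whose PartitionSpec names a mesh axis for the sequence dimension (l) routes the call through ring_fwd, while sharding only the batch dimension keeps the ordinary per-device kernel. A usage sketch under stated assumptions (the public wrapper is assumed to be called flash_mha, and the mesh and axis names are illustrative):

    import jax
    import jax.numpy as jnp
    from jax.sharding import Mesh, NamedSharding, PartitionSpec as P
    from flash_attn_jax import flash_mha  # assumed entry point

    with Mesh(jax.devices(), axis_names=('seq',)) as mesh:
        # Shard q/k/v along the sequence (L) axis: partition_fwd then sees
        # spec == P(None, 'seq', None, None) and dispatches to ring_fwd.
        sharding = NamedSharding(mesh, P(None, 'seq', None, None))
        q = jax.device_put(jnp.zeros((2, 4096, 8, 64), jnp.float16), sharding)
        k = jax.device_put(jnp.zeros((2, 4096, 8, 64), jnp.float16), sharding)
        v = jax.device_put(jnp.zeros((2, 4096, 8, 64), jnp.float16), sharding)
        out = jax.jit(lambda q, k, v: flash_mha(q, k, v, is_causal=True))(q, k, v)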