
Commit 4367317

Fully overlap communication with computation in the ring-attention backward pass. Well, in theory, anyway: it won't work reliably until #20884 is merged and we can use a scan instead of unrolling the loop.
1 parent: af4317a
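
The core idea of the change, per the diff to ring_attention.py below, is to start the ppermute for the next chunk before the local mha_bwd call and only consume its result afterwards, so XLA can run the collective concurrently with the kernel. A minimal sketch of that pattern; the names and shapes are illustrative, not the actual ring_bwd code, and the function is assumed to run inside a shard_map/pmap with the named axis:

    import jax
    import jax.numpy as jnp

    def ring_step(axis_name, axis_size, compute_grads, carry):
        """One step of the ring: start the permute, compute locally, then consume."""
        k2, v2, acc = carry
        # Start sending this device's current k/v chunk to its neighbour first...
        k2_next, v2_next = jax.lax.ppermute(
            (k2, v2), axis_name,
            [(i, (i + 1) % axis_size) for i in range(axis_size)])
        # ...then run the local backward computation on the chunk we already hold.
        acc = acc + compute_grads(k2, v2)
        # Nothing depends on k2_next/v2_next until the return, so the
        # collective-permute and the kernel are free to overlap.
        return (k2_next, v2_next, acc)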

6 files changed: +101 -46 lines

src/flash_attn_jax/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -1,2 +1,2 @@
 from .flash import flash_mha
-__version__ = 'v0.1.0'
+__version__ = 'v0.2.0'

src/flash_attn_jax/ring_attention.py

Lines changed: 25 additions & 13 deletions

@@ -97,26 +97,38 @@ def f(acc, _):
         # 0: ix < ix2
         # 1: ix = ix2
         # 2: ix > ix2
+        def skip():
+            return (jnp.zeros(q.shape, q.dtype), jnp.zeros(k.shape, k.dtype), jnp.zeros(v.shape, v.dtype))
+        def causal():
+            return mha_bwd(do,q,k2,v2,o,lse, softmax_scale=softmax_scale, is_causal=True, window_size=(-1,-1))
+        def non_causal():
+            return mha_bwd(do,q,k2,v2,o,lse, softmax_scale=softmax_scale, is_causal=False, window_size=(-1,-1))
+
+        (dk2_,dv2_) = jax.lax.ppermute((dk2,dv2), axis_name, [(i, (i+1)%axis_size) for i in range(axis_size)])
+        (k2_,v2_,ix2_) = jax.lax.ppermute((k2,v2,ix2), axis_name, [(i, (i+1)%axis_size) for i in range(axis_size)])
+
         if is_causal:
-            dqa, dka, dva = jax.lax.switch(cmp, (
-                lambda: (jnp.zeros(q.shape, q.dtype), jnp.zeros(k.shape, k.dtype), jnp.zeros(v.shape, v.dtype)),
-                lambda: mha_bwd(do,q,k2,v2,o,lse, softmax_scale=softmax_scale, is_causal=True, window_size=(-1,-1)),
-                lambda: mha_bwd(do,q,k2,v2,o,lse, softmax_scale=softmax_scale, is_causal=False, window_size=(-1,-1))
-            ))
+            (dqa, dka, dva) = jax.lax.switch(cmp, [skip, causal, non_causal])
         else:
-            dqa, dka, dva = mha_bwd(do,q,k2,v2,o,lse, softmax_scale=softmax_scale, is_causal=False, window_size=(-1,-1))
-
-        dq += dqa
-        dk2 += dka
-        dv2 += dva
-
-        (k2,v2,dk2,dv2,ix2) = jax.lax.ppermute((k2,v2,dk2,dv2,ix2), axis_name, [(i, (i+1)%axis_size) for i in range(axis_size)])
+            (dqa, dka, dva) = non_causal()
 
-        return ((k2,v2,dk2,dv2,ix2, dq), None)
+        # Send/receive of dk/dv retires here (because the following depends on it).
+        if is_causal:
+            (dq, dk2_, dv2_) = jax.lax.switch(cmp, [
+                lambda: (dq, dk2_, dv2_),
+                lambda: (dq+dqa, dk2_+dka, dv2_+dva),
+                lambda: (dq+dqa, dk2_+dka, dv2_+dva)
+            ])
+        else:
+            dq, dk2_, dv2_ = (dq+dqa, dk2_+dka, dv2_+dva)
+
+        return ((k2_,v2_,dk2_,dv2_,ix2_, dq), None)
     acc = (k,v,dk,dv,ix, dq)
     # See above (#20884).
     for _ in range(axis_size):
         acc, _ = f(acc, None)
         acc = _optimization_barrier(acc)
+    # acc, _ = jax.lax.scan(f,acc,None,axis_size)
     (k,v,dk,dv,ix2, dq) = acc
+    (dk,dv) = jax.lax.ppermute((dk,dv), axis_name, [(i, (i+1)%axis_size) for i in range(axis_size)])
     return dq.astype(q.dtype),dk.astype(q.dtype),dv.astype(q.dtype)
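
The commented-out jax.lax.scan line above records where this is headed: once the unrolling workaround is no longer needed, the Python loop plus _optimization_barrier collapses into a single scan over axis_size steps. A self-contained sketch of that pattern, with a dummy step body standing in for the real f:

    import jax
    import jax.numpy as jnp

    def f(acc, _):
        # Stand-in for the real step body in ring_bwd, which permutes the k/v
        # chunks around the ring and accumulates dq/dk/dv; here it just counts.
        return acc + 1, None

    axis_size = 4
    acc = jnp.zeros(())
    # Equivalent of:  for _ in range(axis_size): acc, _ = f(acc, None)
    acc, _ = jax.lax.scan(f, acc, None, length=axis_size)
    print(acc)  # 4.0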

tests/__init__.py

Lines changed: 0 additions & 1 deletion

@@ -1 +0,0 @@
-0

tests/ref_mha.py

Lines changed: 5 additions & 7 deletions

@@ -38,13 +38,11 @@ def ref_fwd(q,k,v, is_causal=False, window_size=(-1,-1), softmax_scale=None):
         lse = einops.rearrange(lse, 'n h x l -> n (h x) l')
         return o.astype(q.dtype), lse.astype(jnp.float32)
     else:
-        att = jnp.einsum('nlhd,nLhd->nhlL',q,k)*softmax_scale
-        [_, _, l, L] = att.shape
-        mask = make_mask(l,L,is_causal,window_size)
-        att = jnp.where(mask, att, float('-inf'))
-        lse = jax.nn.logsumexp(att, axis=-1) #nhl
-        att = jnp.exp(att - lse[...,None])
-        o = jnp.einsum('nhlL,nLhd->nlhd',att,v)
+        S = jnp.einsum('nlhd,nLhd->nhlL',q,k)
+        S = jnp.where(mask, S, float('-inf'))
+        lse = jax.nn.logsumexp(S*softmax_scale, axis=-1) #nhl
+        P = jax.nn.softmax(S*softmax_scale, axis=-1) #jnp.exp(att - lse[...,None])
+        o = jnp.einsum('nhlL,nLhd->nlhd',P,v)
         return o.astype(q.dtype), lse.astype(jnp.float32)
 
 def ref_bwd(do,q,k,v,o,lse, is_causal=False, window_size=(-1,-1), softmax_scale=None):
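
The rewritten branch leans on the identity softmax(S*scale) == exp(S*scale - lse[..., None]), which the inline comment also notes. A quick, self-contained numerical check of that identity; the shapes here are arbitrary, not taken from the tests:

    import jax
    import jax.numpy as jnp

    S = jax.random.normal(jax.random.PRNGKey(0), (2, 4, 8, 8))  # n h l L, illustrative
    scale = 0.125

    lse = jax.nn.logsumexp(S * scale, axis=-1)          # what ref_fwd returns as lse
    P_explicit = jnp.exp(S * scale - lse[..., None])    # the old formulation
    P_softmax = jax.nn.softmax(S * scale, axis=-1)      # the new one
    assert jnp.allclose(P_explicit, P_softmax, atol=1e-6)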

tests/test_ring.py

Lines changed: 11 additions & 5 deletions

@@ -17,6 +17,7 @@
 from jax.experimental.shard_map import shard_map
 from functools import partial
 import einops
+import math
 
 from flash_attn_jax.ring_attention import ring_fwd, ring_bwd
 from .ref_mha import ref_fwd, ref_bwd
@@ -87,15 +88,18 @@ def ring(q,k,v):
 
 @pytest.mark.parametrize("causal", ['causal',''])
 @pytest.mark.parametrize("m", [1,2])
-@pytest.mark.parametrize("d", [8])
-@pytest.mark.parametrize("h", [1])
-@pytest.mark.parametrize("seqlen", [2])
+@pytest.mark.parametrize("d", [32])
+@pytest.mark.parametrize("h", [4])
+@pytest.mark.parametrize("seqlen", [128])
 def test_ring_bwd(seqlen, h, d, m, causal):
     window_size = (-1,-1)
 
     devices = jax.devices(backend='cpu')
     n_device = len(devices)
 
+    n = 1
+    A = 1.0 / math.sqrt(n * seqlen * h * d)
+
     with Mesh(np.array(devices), axis_names=('x',)) as mesh:
         @jax.jit
         def ref(q,k,v,do):
@@ -114,11 +118,13 @@ def ring(q,k,v,do):
         q = jax.random.normal(jax.random.PRNGKey(0), [1, seqlen, h*m, d], dtype=jnp.float32)
         k = jax.random.normal(jax.random.PRNGKey(1), [1, seqlen, h, d], dtype=jnp.float32)
         v = jax.random.normal(jax.random.PRNGKey(2), [1, seqlen, h, d], dtype=jnp.float32)
-        do = jax.random.normal(jax.random.PRNGKey(3), [1, seqlen, h*m, d], dtype=jnp.float32)
+        do = jax.random.normal(jax.random.PRNGKey(3), [1, seqlen, h*m, d], dtype=jnp.float32) * A
         o_ref = ref(q,k,v,do)
         o_ring = ring(q,k,v,do)
+        # print(jnp.stack([o_ref[0], o_ring[0], o_ref[0] - o_ring[0]], axis=-1))
+        print(jnp.stack([o_ref[2], o_ring[2], o_ref[2] - o_ring[2]], axis=-1))
         for i in range(3):
-            assert jnp.allclose(o_ref[i], o_ring[i], rtol=1e-2, atol=1e-3)
+            assert jnp.allclose(o_ref[i], o_ring[i], rtol=1e-2, atol=1e-3), i
 
 if __name__ == '__main__':
     test_ref()
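
The new A = 1.0 / math.sqrt(n * seqlen * h * d) factor appears to normalize do so the contracted quantities stay roughly unit scale at the larger test sizes, keeping the fixed atol=1e-3 meaningful; the loss in these tests has the form (o * do).sum(), as shown explicitly in the test_sharding.py diff below. A small sketch of the effect; the interpretation and the numbers here are mine, not from the test:

    import math
    import jax
    import jax.numpy as jnp

    n, seqlen, h, d = 1, 128, 4, 32
    A = 1.0 / math.sqrt(n * seqlen * h * d)

    do_unscaled = jax.random.normal(jax.random.PRNGKey(3), [n, seqlen, h, d])
    do = do_unscaled * A
    o = jax.random.normal(jax.random.PRNGKey(4), [n, seqlen, h, d])

    # Without the scaling, the scalar (o * do).sum() has standard deviation
    # growing like sqrt(n*seqlen*h*d); with it, it stays O(1).
    print(jnp.abs((o * do_unscaled).sum()))  # typically ~1e2 for these sizes
    print(jnp.abs((o * do).sum()))           # typically O(1)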

tests/test_sharding.py

Lines changed: 59 additions & 19 deletions

@@ -29,9 +29,11 @@ def pretty(tensor):
     std = jnp.std(tensor)
     return f'[{shape}: {mn:.3g} | {mean:.3g}±{std:.3g} | {mx:.3g}]'
 
-def check(ref_out, jax_out, out):
+def check(ref_out, jax_out, out, eps=3):
     def check1(ref_out, jax_out, out):
-        assert jnp.max(jnp.abs(out - ref_out)).item() <= 3 * jnp.max(jnp.abs(jax_out - ref_out)).item(), (pretty(jnp.abs(out - ref_out)), 'vs', pretty(jnp.abs(jax_out - ref_out)))
+        out_diff = jnp.abs(out - ref_out)
+        jax_diff = jnp.abs(jax_out - ref_out)
+        assert jnp.max(out_diff) <= eps * jnp.max(jax_diff), (pretty(out_diff), 'vs', pretty(jax_diff))
     tree_map(check1, ref_out, jax_out, out)
 
 @pytest.mark.skipif(len(jax.local_devices()) < 2, reason='Requires >1 gpu device')
@@ -130,9 +132,11 @@ def with_sharding(sharding):
         assert 'dynamic-slice' not in hlo
         assert 'collective-permute' in hlo
         # Should always run concurrently, meaning custom-call is always between start and done.
-        import re
-        collectives = ''.join(re.findall(" collective-permute-start| collective-permute-done| custom-call", hlo))
-        assert 'collective-permute-start collective-permute-done' not in collectives, hlo
+        # import re
+        # collectives = ''.join(re.findall(" collective-permute-start| collective-permute-done| custom-call", hlo))
+        # assert 'collective-permute-start collective-permute-done' not in collectives, hlo
+        print(hlo)
+        assert 'collective-permute-start collective-permute-done' not in decode_hlo(hlo), decode_hlo(hlo)
 
 @pytest.mark.skipif(len(jax.local_devices()) < 2, reason='Requires >1 gpu device')
 @pytest.mark.parametrize("dtype", [jnp.float16, jnp.bfloat16])
@@ -162,7 +166,7 @@ def flash(qkv):
     q = q.astype(dtype)
     k = k.astype(dtype)
     v = v.astype(dtype)
-    ref16_out = flash((q,k,v))
+    ref16_out = ref((q,k,v))
 
     def check_sharding(sharding,q,k,v):
         (q,k,v) = jax.device_put((q,k,v), sharding)
@@ -193,37 +197,73 @@ def test_flash_bwd_sharded(seqlen, h, d, m, causal, local, dtype):
     devices = jax.local_devices()
     n = len(devices)
 
+    A = 1.0 / math.sqrt(n * seqlen * h * d)
+
     @jax.jit
     @jax.grad
-    def ref(qkv):
-        return ref_mha(*qkv, is_causal=bool(causal), window_size=window_size).sum()
+    def ref(qkv, do):
+        o = ref_mha(*qkv, is_causal=bool(causal), window_size=window_size)
+        return (o * do).sum()
     @jax.jit
     @jax.grad
-    def flash(qkv):
-        return flash_mha(*qkv, is_causal=bool(causal), window_size=window_size).sum()
+    def flash(qkv, do):
+        o = flash_mha(*qkv, is_causal=bool(causal), window_size=window_size)
+        return (o * do).sum()
     q = jax.random.normal(jax.random.PRNGKey(0), [n, seqlen, h*m, d], dtype=jnp.float32)
     k = jax.random.normal(jax.random.PRNGKey(1), [n, seqlen, h, d], dtype=jnp.float32)
     v = jax.random.normal(jax.random.PRNGKey(2), [n, seqlen, h, d], dtype=jnp.float32)
+    do = jax.random.normal(jax.random.PRNGKey(3), [n, seqlen, h*m, d], dtype=jnp.float32) * A
 
-    ref_out = ref((q,k,v))
+    ref_out = ref((q,k,v), do)
     q = q.astype(dtype)
     k = k.astype(dtype)
     v = v.astype(dtype)
-    ref16_out = flash((q,k,v))
+    do = do.astype(dtype)
+    ref16_out = ref((q,k,v), do)
 
-    def check_sharding(sharding,q,k,v):
-        (q,k,v) = jax.device_put((q,k,v), sharding)
-        out = flash((q,k,v))
-        check(ref_out,ref16_out,out)
+    def check_sharding(sharding):
+        (qs,ks,vs,dos) = jax.device_put((q,k,v,do), sharding)
+        out = flash((qs,ks,vs),dos)
+        check(ref_out,ref16_out,out, eps=4)
 
-    check_sharding(PositionalSharding(devices).reshape(n,1,1,1),q,k,v)
-    check_sharding(PositionalSharding(devices).reshape(1,1,n,1),q,k,v)
+    check_sharding(PositionalSharding(devices).reshape(n,1,1,1))
+    check_sharding(PositionalSharding(devices).reshape(1,1,n,1))
 
     if not local:
         # Ring attention
         with Mesh(np.array(devices), axis_names=('x',)) as mesh:
            sharding = NamedSharding(mesh, P(None,'x',None,None))
-            check_sharding(sharding,q,k,v)
+            check_sharding(sharding)
+
+def decode_hlo(hlo):
+    computations = {}
+    current_name = None
+    current_lines = []
+    for line in hlo.splitlines():
+        if line.startswith('%') or line.startswith('ENTRY'):
+            if current_name is not None:
+                computations[current_name] = current_lines
+            current_name = line.split()[0]
+            current_lines = []
+        elif line.lstrip().startswith('%') or line.lstrip().startswith('ROOT'):
+            current_lines.append(line)
+    if current_lines:
+        computations[current_name] = current_lines
+
+    def visit(name):
+        for line in computations[name]:
+            if 'custom-call(' in line:
+                yield 'custom-call'
+            elif any('calls='+target in line for target in computations.keys()):
+                target = [target for target in computations.keys() if 'calls='+target in line][0]
+                for item in visit(target):
+                    yield item
+            elif 'collective-permute-start(' in line:
+                yield 'collective-permute-start'
+            elif 'collective-permute-done(' in line:
+                yield 'collective-permute-done'
+
+    return ' '.join(visit('ENTRY'))
 
 if __name__ == '__main__':
     test_flash_fwd_sharded_hlo(128,4,32,False,False,jnp.float16)
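
For illustration, here is what decode_hlo yields on a toy HLO dump where the flash kernel call is correctly sandwiched between the permute's start and done ops. The HLO text is fabricated and the import path assumes the tests package is importable; neither is part of this commit:

    from tests.test_sharding import decode_hlo  # assumption: tests/ is on the import path

    # Fabricated, minimal HLO with an overlapped schedule.
    hlo = """ENTRY %main (p0: f32[4]) -> f32[4] {
      %cps = (f32[4], f32[4]) collective-permute-start(%p0)
      %cc = f32[4] custom-call(%p0), custom_call_target="flash_mha_fwd"
      %cpd = f32[4] collective-permute-done(%cps)
      ROOT %out = f32[4] add(%cc, %cpd)
    }"""

    tokens = decode_hlo(hlo)
    print(tokens)
    # -> 'collective-permute-start custom-call collective-permute-done'
    # The assertion added in the hunk above checks that the adjacent pair
    # 'collective-permute-start collective-permute-done' never occurs, i.e. some
    # kernel is always launched between starting and finishing each permute.
    assert 'collective-permute-start collective-permute-done' not in tokens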
