
Commit a224720

Fix the library for jax 0.7.0
1 parent 0eef5a9 commit a224720

File tree

12 files changed: +647 -132 lines changed


.github/workflows/publish.yml

Lines changed: 7 additions & 9 deletions
@@ -41,8 +41,8 @@ jobs:
       fail-fast: false
       matrix:
         os: [ubuntu-20.04]
-        python-version: ['cp39', 'cp310', 'cp311', 'cp312']
-        cuda-version: ['11.8', '12.3']
+        python-version: ['cp311', 'cp312']
+        cuda-version: ['12.8']

     steps:
     - name: Checkout
@@ -51,7 +51,7 @@ jobs:
     - name: Set up python
       uses: actions/setup-python@v4
       with:
-        python-version: '3.10'
+        python-version: '3.11'

     - name: Set CUDA and PyTorch versions
       run: |
@@ -76,7 +76,7 @@ jobs:
       uses: pypa/[email protected]
       env:
         CIBW_BUILD: ${{ matrix.python-version }}-manylinux_x86_64
-        CIBW_MANYLINUX_X86_64_IMAGE: sameli/manylinux2014_x86_64_cuda_${{ matrix.cuda-version }}
+        CIBW_BEFORE_ALL: bash scripts/install-cuda-linux.sh ${{ matrix.cuda-version }}
         CIBW_BUILD_VERBOSITY: 1

     - name: Log Built Wheels
@@ -128,17 +128,15 @@ jobs:

     - uses: actions/setup-python@v4
       with:
-        python-version: '3.10'
+        python-version: '3.11'

     - name: Install dependencies
       run: |
-        pip install setuptools==68.0.0
-        pip install git+https://github.com/nshepperd/setuptools-cuda-cpp
-        pip install ninja packaging wheel pybind11
+        pip install uv

     - name: Build core package
       run: |
-        CUDA_HOME=/ python setup.py sdist --dist-dir=dist
+        uv build --sdist

     - name: Retrieve release distributions
       uses: actions/download-artifact@v4

README.md

Lines changed: 9 additions & 23 deletions
@@ -1,19 +1,20 @@
 # FlashAttention JAX
 This repository provides a jax binding to <https://github.com/Dao-AILab/flash-attention>. To avoid depending on pytorch, since torch and jax installations often conflict, this is a fork of the official repo.

-Please see [Tri Dao's repo](https://github.com/Dao-AILab/flash-attention) for more information about flash attention.
+Please see [Tri Dao's repo](https://github.com/Dao-AILab/flash-attention) for more information about flash attention. Also check there for how to cite the authors if you used flash attention in your work.

 FlashAttention and FlashAttention-2 are free to use and modify (see LICENSE).
 Please cite (see below) and credit FlashAttention if you use it.

 ## Installation

 Requirements:
-- CUDA 11.8 and above.
+- CUDA 12.8 and above.
 - Linux. Same story as with the pytorch repo. I haven't tested compilation of the jax bindings on windows.
-- JAX >=`0.4.24`. The custom sharding used for ring attention requires some somewhat advanced features.
+- JAX >= `0.5.*`. The custom call api changed in this version.

-To install: `pip install flash-attn-jax` will get the latest release from pypi. This gives you the cuda 12.3 build. If you want to use the cuda 11.8 build, you can install from the releases page (but according to jax's documentation, 11.8 will stop being supported for newer versions of jax).
+To install: `pip install flash-attn-jax` will get the latest release from pypi. This gives you the cuda 12.8
+build. CUDA 11 isn't supported any more (since jax stopped supporting it).

 ### Installing from source

@@ -25,7 +26,7 @@ cd flash-attn-jax
 cibuildwheel --only cp312-manylinux_x86_64 # I think cibuildwheel needs superuser privileges on some systems because of docker reasons?
 ```

-This will create a wheel in the `wheelhouse` directory. You can then install it with `pip install wheelhouse/flash_attn_jax_0.2.0-cp312-cp312-manylinux_x86_64.whl`. Or you could use setup.py to build the wheel and install it. You need cuda toolkit installed in that case.
+This will create a wheel in the `wheelhouse` directory. You can then install it with `pip install wheelhouse/flash_attn_jax_*.whl`. Or you could build it without docker using `uv build --wheel`. You need cuda installed in that case.

 ## Usage

@@ -45,15 +46,16 @@ This supports multi-query and grouped-query attention (when hk != h). The `softm
 Use jax.Array and shard your tensors along the length dimension, and flash_mha will automatically use the ring attention algorithm (forward and backward).

 ```py
-os.environ["XLA_FLAGS"] = '--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_async_collectives=true'
+os.environ["XLA_FLAGS"] = '--xla_gpu_enable_latency_hiding_scheduler=true'
 #...
 with Mesh(devices, axis_names=('len',)) as mesh:
     sharding = NamedSharding(mesh, P(None,'len')) # n l
     tokens = jax.device_put(tokens, sharding)
     # invoke your jax.jit'd transformer.forward
 ```

-It's not entirely reliable at hiding the communication latency though, depending on the whims of the xla optimizer. I'm waiting https://github.com/google/jax/issues/20864 to be fixed, then I can make it better.
+The latency hiding seems to be reliable now that some bugs have been fixed, as long as you enable the
+latency hiding scheduler as above.

 ### GPU support

@@ -63,19 +65,3 @@ FlashAttention-2 currently supports:
    GPUs for now.
 2. Datatype fp16 and bf16 (bf16 requires Ampere, Ada, or Hopper GPUs).
 3. All head dimensions up to 256. ~~Head dim > 192 backward requires A100/A800 or H100/H800~~. Head dim 256 backward now works on consumer GPUs (if there's no dropout) as of flash-attn 2.5.5.
-
-## Citation
-If you use this codebase, or otherwise found our work valuable, please cite:
-```
-@inproceedings{dao2022flashattention,
-  title={Flash{A}ttention: Fast and Memory-Efficient Exact Attention with {IO}-Awareness},
-  author={Dao, Tri and Fu, Daniel Y. and Ermon, Stefano and Rudra, Atri and R{\'e}, Christopher},
-  booktitle={Advances in Neural Information Processing Systems},
-  year={2022}
-}
-@article{dao2023flashattention2,
-  title={Flash{A}ttention-2: Faster Attention with Better Parallelism and Work Partitioning},
-  author={Dao, Tri},
-  year={2023}
-}
-```
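
As a rough end-to-end illustration of the ring-attention snippet in the README hunk above (a sketch, not code from this commit): it assumes the `(n, l, h, d)` input layout and the `softmax_scale`/`is_causal`/`window_size` keywords visible in the abstract-eval signatures further down, and it needs several GPUs before the `'len'` axis actually shards anything.

```py
# Hypothetical usage sketch; `attend` and the shapes are illustrative only.
import os
os.environ["XLA_FLAGS"] = "--xla_gpu_enable_latency_hiding_scheduler=true"

import jax
import jax.numpy as jnp
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P
from flash_attn_jax import flash_mha

devices = jax.devices()                      # e.g. 8 GPUs
n, l, h, d = 1, 8192, 8, 64                  # batch, length, heads, head dim

with Mesh(devices, axis_names=('len',)) as mesh:
    sharding = NamedSharding(mesh, P(None, 'len'))   # shard the length axis
    q = jax.device_put(jnp.ones((n, l, h, d), jnp.float16), sharding)
    k = jax.device_put(jnp.ones((n, l, h, d), jnp.float16), sharding)
    v = jax.device_put(jnp.ones((n, l, h, d), jnp.float16), sharding)

    @jax.jit
    def attend(q, k, v):
        # With q/k/v sharded along 'len', flash_mha should take the
        # ring-attention path described in the README.
        return flash_mha(q, k, v, is_causal=True)

    out = attend(q, k, v)                    # same (n, l, h, d) shape as q
```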

pyproject.toml

Lines changed: 12 additions & 8 deletions
@@ -6,31 +6,35 @@ requires = [
     "packaging",
     "psutil",
     "pybind11>=2.11.0",
-    # "nvidia-cuda-runtime-cu12>=12.0",
-    # "nvidia-cuda-nvrtc-cu12",
-    # "nvidia-nvtx-cu12",
-    "torch>=2.0.0",
 ]
 build-backend = "scikit_build_core.build"

 [project]
 name = "flash_attn_jax"
 dynamic = ["version"]
-description = "Flash Attention: Fast and Memory-Efficient Exact Attention"
+description = "Flash Attention port for JAX"
 readme = "README.md"
-requires-python = ">=3.9"
+requires-python = ">=3.11"
 license = { text = "BSD-3-Clause" }
 authors = [
     { name = "Tri Dao", email = "[email protected]" },
     { name = "Emily Shepperd", email = "[email protected]" }
 ]
-dependencies = []
+dependencies = [
+    "jax>=0.5.0, <0.8.0"
+]
 classifiers = [
     "Programming Language :: Python :: 3",
     "License :: OSI Approved :: BSD License",
     "Operating System :: Unix",
 ]

+[dependency-groups]
+test = [
+    "pytest>=7.0.0",
+    "einops",
+    "jax[cuda12]",
+]
 [project.urls]
 Homepage = "https://github.com/nshepperd/flash_attn_jax"

@@ -59,7 +63,7 @@ input = "src/flash_attn_jax/__init__.py"
 manylinux-x86_64-image = "quay.io/pypa/manylinux_2_28_x86_64:latest"
 before-all = "bash scripts/install-cuda-linux.sh"
 build = "cp312-manylinux_x86_64"
-repair-wheel-command = "auditwheel repair --exclude=libcudart.so* --exclude libtorch.so* -w {dest_dir} {wheel}"
+repair-wheel-command = "auditwheel repair --exclude=libcudart.so* -w {dest_dir} {wheel}"

 [tool.cibuildwheel.environment]
 PATH="/opt/rh/gcc-toolset-13/root/usr/bin:/usr/local/cuda/bin:$PATH"

scripts/install-cuda-linux.sh

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 #!/bin/bash
 set -eux

-VER=${1:-12.4}
+VER=${1:-12.8}
 VER=${VER//./-} # Convert version to format used in package names

 dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo

src/flash_attn_jax/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,2 +1,2 @@
 from .flash import flash_mha
-__version__ = 'v0.2.2'
+__version__ = 'v0.3.0'

src/flash_attn_jax/flash.py

Lines changed: 7 additions & 7 deletions
@@ -12,11 +12,11 @@
 from jax.lib import xla_client
 from jaxlib.hlo_helpers import custom_call
 from jax.experimental.custom_partitioning import custom_partitioning
+from jax.extend.core import Primitive

 from jax.sharding import PartitionSpec as P
 from jax.sharding import Mesh
 from jax.sharding import NamedSharding
-from jax.sharding import PositionalSharding

 from einops import rearrange
 import einops
@@ -31,11 +31,11 @@
 # about sharding or padding, which will be handled when they are
 # lowered to hlo, using the physical "hlo" primitives, which directly
 # lower to XLA CustomCall.
-_flash_mha_fwd_p = core.Primitive("flash_mha_fwd")
+_flash_mha_fwd_p = Primitive("flash_mha_fwd")
 _flash_mha_fwd_p.multiple_results = True
 _flash_mha_fwd_p.def_impl(partial(xla.apply_primitive, _flash_mha_fwd_p))

-_flash_mha_bwd_p = core.Primitive("flash_mha_bwd")
+_flash_mha_bwd_p = Primitive("flash_mha_bwd")
 _flash_mha_bwd_p.multiple_results = True
 _flash_mha_bwd_p.def_impl(partial(xla.apply_primitive, _flash_mha_bwd_p))

@@ -79,7 +79,7 @@ def _flash_mha_fwd_abstract(q, k, v, softmax_scale=None, is_causal=None, window_
     assert q_dtype == k_dtype and q_dtype == v_dtype
     assert q_dtype in [jnp.bfloat16, jnp.float16]
     return (
-        ShapedArray(q.shape, q_dtype, named_shape=q.named_shape),
+        ShapedArray(q.shape, q_dtype),
         ShapedArray([n, h, l], jnp.float32)
     )
 _flash_mha_fwd_p.def_abstract_eval(_flash_mha_fwd_abstract)
@@ -96,9 +96,9 @@ def _flash_mha_bwd_abstract(dout, q, k, v, out, lse, softmax_scale=None, is_caus
     assert len(set([dout_dtype, q_dtype, k_dtype, v_dtype, out_dtype])) == 1
     assert q_dtype in [jnp.bfloat16, jnp.float16]
     return (
-        ShapedArray(q.shape, q_dtype, named_shape=q.named_shape),
-        ShapedArray(k.shape, k_dtype, named_shape=k.named_shape),
-        ShapedArray(v.shape, v_dtype, named_shape=v.named_shape),
+        ShapedArray(q.shape, q_dtype),
+        ShapedArray(k.shape, k_dtype),
+        ShapedArray(v.shape, v_dtype),
     )
 _flash_mha_bwd_p.def_abstract_eval(_flash_mha_bwd_abstract)
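
The flash.py hunks are mostly a mechanical migration: `core.Primitive` becomes `jax.extend.core.Primitive`, and `ShapedArray` drops the removed `named_shape=` argument. A standalone sketch of that registration pattern (a toy primitive standing in for `_flash_mha_fwd_p`, and assuming `ShapedArray` is still imported from `jax.core` as the unchanged lines appear to do) looks roughly like this:

```py
# Minimal sketch of the jax>=0.5 primitive pattern used above; the toy primitive
# and its two outputs are hypothetical, standing in for _flash_mha_fwd_p.
import jax.numpy as jnp
from jax.core import ShapedArray          # abstract values for def_abstract_eval
from jax.extend.core import Primitive     # public home of Primitive

_toy_fwd_p = Primitive("toy_fwd")
_toy_fwd_p.multiple_results = True        # returns an (out, lse)-style pair

def _toy_fwd_impl(q):
    # Eager fallback; the real primitive dispatches to an HLO lowering instead.
    return [q, jnp.zeros(q.shape[:-1], jnp.float32)]
_toy_fwd_p.def_impl(_toy_fwd_impl)

def _toy_fwd_abstract(q):
    # named_shape= is gone from ShapedArray, so only shape and dtype are passed.
    return (
        ShapedArray(q.shape, q.dtype),
        ShapedArray(q.shape[:-1], jnp.float32),
    )
_toy_fwd_p.def_abstract_eval(_toy_fwd_abstract)

out, lse = _toy_fwd_p.bind(jnp.ones((2, 128, 64), jnp.float16))
```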

src/flash_attn_jax/flash_hlo.py

Lines changed: 20 additions & 16 deletions
@@ -12,10 +12,7 @@
 from jax.lib import xla_client
 from jax.experimental.custom_partitioning import custom_partitioning

-from jax.sharding import PartitionSpec as P
-from jax.sharding import Mesh
-from jax.sharding import NamedSharding
-from jax.sharding import PositionalSharding
+from jax.extend.core import Primitive

 from einops import rearrange
 import einops
@@ -25,15 +22,15 @@

 # ==== Register primitives ====

-_flash_mha_fwd_hlo_p = core.Primitive("flash_mha_fwd_hlo")
+_flash_mha_fwd_hlo_p = Primitive("flash_mha_fwd_hlo")
 _flash_mha_fwd_hlo_p.multiple_results = True
 _flash_mha_fwd_hlo_p.def_impl(partial(xla.apply_primitive, _flash_mha_fwd_hlo_p))

-_flash_mha_bwd_hlo_p = core.Primitive("flash_mha_bwd_hlo")
+_flash_mha_bwd_hlo_p = Primitive("flash_mha_bwd_hlo")
 _flash_mha_bwd_hlo_p.multiple_results = True
 _flash_mha_bwd_hlo_p.def_impl(partial(xla.apply_primitive, _flash_mha_bwd_hlo_p))

-_custom_call_p = core.Primitive("custom_call")
+_custom_call_p = Primitive("custom_call")
 _custom_call_p.multiple_results = True
 _custom_call_p.def_impl(partial(xla.apply_primitive, _custom_call_p))

@@ -48,13 +45,18 @@ def _flash_mha_bwd_hlo(dout, q, k, v, out, lse, softmax_scale, is_causal, window
     return dq, dk, dv

 def custom_call(*args, call_target_name, result_types, backend_config, operand_layouts, result_layouts):
-    return _custom_call_p.bind(*args, call_target_name=call_target_name, result_types=result_types, backend_config=backend_config, operand_layouts=operand_layouts, result_layouts=result_layouts)
+    return _custom_call_p.bind(*args, call_target_name=call_target_name,
+                               result_types=tuple(result_types),
+                               backend_config=backend_config,
+                               operand_layouts=tuple(operand_layouts),
+                               result_layouts=tuple(result_layouts))

 # ==== HLO lowerings ====

 # Register functions defined in gpu_ops as custom call target for GPUs
 for _name, _value in flash_api.get_registrations().items():
-    xla_client.register_custom_call_target(_name, _value, platform="gpu")
+    # xla_client.register_custom_call_target(_name, _value, platform="gpu")
+    jax.ffi.register_ffi_target(_name, _value, platform="gpu", api_version=0)

 def default_layouts(*shapes):
     def row_major(shape):
@@ -85,6 +87,7 @@ def _flash_mha_fwd_hlo_lowering(ctx, q, k, v, softmax_scale=None, is_causal=Fals
     [nk, lk, hk, dk] = k_shape
     assert k_shape == v_shape, "K and V must have the same shape"
     assert [n, d] == [nk, dk], "Q and K must have the same batch size and head size"
+    assert isinstance(window_size, (tuple, list))

     opaque = flash_api.make_flash_mha_fwd_args(
         0.0, # p_dropout
@@ -164,6 +167,7 @@ def _flash_mha_bwd_hlo_lowering(ctx, dout, q, k, v, out, lse, softmax_scale=None
     [nk, lk, hk, dk] = k_shape
     assert n == nk
     assert d == dk
+    assert isinstance(window_size, (tuple, list))

     assert (list(map(list, [dout_shape, q_shape, k_shape, v_shape, out_shape, lse_shape])) ==
             [[n, lq, hq, d], [n, lq, hq, d], [n, lk, hk, d], [n, lk, hk, d],
@@ -238,7 +242,7 @@ def _flash_mha_fwd_abstract(q, k, v, softmax_scale=None, is_causal=None, window_
     assert q_dtype == k_dtype and q_dtype == v_dtype
     assert q_dtype in [jnp.bfloat16, jnp.float16]
     return (
-        ShapedArray(q.shape, q_dtype, named_shape=q.named_shape),
+        ShapedArray(q.shape, q_dtype),
         ShapedArray([n, h, l], jnp.float32)
     )
 _flash_mha_fwd_hlo_p.def_abstract_eval(_flash_mha_fwd_abstract)
@@ -255,9 +259,9 @@ def _flash_mha_bwd_abstract(dout, q, k, v, out, lse, softmax_scale=None, is_caus
     assert len(set([dout_dtype, q_dtype, k_dtype, v_dtype, out_dtype])) == 1
     assert q_dtype in [jnp.bfloat16, jnp.float16]
     return (
-        ShapedArray(q.shape, q_dtype, named_shape=q.named_shape),
-        ShapedArray(k.shape, k_dtype, named_shape=k.named_shape),
-        ShapedArray(v.shape, v_dtype, named_shape=v.named_shape),
+        ShapedArray(q.shape, q_dtype),
+        ShapedArray(k.shape, k_dtype),
+        ShapedArray(v.shape, v_dtype),
     )
 _flash_mha_bwd_hlo_p.def_abstract_eval(_flash_mha_bwd_abstract)

@@ -278,10 +282,10 @@ def _custom_call_hlo_lowering(ctx, *args, call_target_name, result_types, backen
     out = mlir.custom_call(
         call_target_name,
         operands=args,
-        result_types=result_types,
+        result_types=list(result_types),
         backend_config=backend_config,
-        operand_layouts=operand_layouts,
-        result_layouts=result_layouts,
+        operand_layouts=list(operand_layouts),
+        result_layouts=list(result_layouts),
     ).results
     return out
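
Two details in this file are easy to miss: the `custom_call` wrapper now passes layouts and result types as tuples because primitive parameters need to be hashable, and the lowering converts them back to lists for `mlir.custom_call`; the legacy GPU targets are meanwhile registered through `jax.ffi.register_ffi_target(..., api_version=0)` instead of the commented-out `xla_client.register_custom_call_target`. The `default_layouts` helper visible in the context lines presumably builds row-major layouts along these lines (a sketch, not the file's exact code):

```py
# Hedged sketch of a default_layouts-style helper: for a shape of rank r, the
# row-major (minor-to-major) layout is (r-1, ..., 1, 0). Returning tuples keeps
# the layouts hashable, so they can travel through _custom_call_p.bind() as
# primitive parameters.
def default_layouts(*shapes):
    def row_major(shape):
        return tuple(range(len(shape) - 1, -1, -1))
    return tuple(row_major(shape) for shape in shapes)

# Layouts for an (n, l, h, d) activation and an (n, h, l) LSE tensor:
print(default_layouts((2, 128, 8, 64), (2, 8, 128)))
# -> ((3, 2, 1, 0), (2, 1, 0))
```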
