
Commit 445e4ec

Copilot and mawad-amd committed
Implement Ring Attention (arxiv:2310.01889)
Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com>
1 parent d21bb20 commit 445e4ec

File tree

5 files changed: +800 −0 lines changed
Lines changed: 115 additions & 0 deletions
<!--
SPDX-License-Identifier: MIT
Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
-->

# Ring Attention

An implementation of **Ring Attention with Blockwise Transformers** for
near-infinite context on AMD GPUs using [Iris](../../README.md).

> Liu, H., Li, M., Hall, A., Dao, T., & Abbeel, P. (2023).
> *Ring Attention with Blockwise Transformers for Near-Infinite Context.*
> arXiv:2310.01889. <https://arxiv.org/pdf/2310.01889>

---

## Algorithm

Standard self-attention requires O(n²) memory in the sequence length n.
Ring Attention enables sequences far longer than what fits on a single device
by distributing them across a *ring* of GPUs:

1. The full sequence is split evenly across **N GPUs** along the sequence
   dimension. Each device holds a chunk of Q, K, and V of length
   `seq_total / N`.
2. **Q stays local**. K and V rotate around the ring one step at a time.
3. At each of the **N steps**, every device runs a local
   [Flash Attention](https://arxiv.org/abs/2205.14135) pass and accumulates
   the result using **online softmax**.
4. After all N steps the accumulator is normalised to yield the final output.

For **causal (autoregressive) attention** only the steps where the KV chunk
precedes or coincides with the Q chunk contribute, allowing early termination
for some ranks and reducing total compute.

```
Step 0:         rank r processes its own K_r, V_r (causal block diagonal)
Step 1:         rank r receives K_{r-1}, V_{r-1}  (full attention, past)
...
Step r:         rank r receives K_0, V_0          (full attention, past)
Step r+1..N-1:  all-future chunks – skipped       (causal mode only)
```
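Steps 3–4 above can be sketched as a plain-PyTorch reference (a numerical sketch only, not the repository's Triton kernel; `ring_attention_reference` and its arguments are illustrative names):

```python
import torch

def ring_attention_reference(q, chunks, scale):
    """Accumulate attention over KV chunks with online softmax.

    q: [s_q, d]; chunks: list of (k, v) pairs, each [s_kv, d].
    Equivalent to softmax((q @ K_all.T) * scale) @ V_all.
    """
    m = torch.full((q.shape[0], 1), float("-inf"))  # running row max
    l = torch.zeros(q.shape[0], 1)                  # running sum of exp
    o = torch.zeros_like(q)                         # unnormalised output

    for k, v in chunks:                             # one ring step per chunk
        s = (q @ k.T) * scale                       # local attention scores
        m_new = torch.maximum(m, s.max(dim=-1, keepdim=True).values)
        alpha = torch.exp(m - m_new)                # rescale old accumulators
        p = torch.exp(s - m_new)
        l = alpha * l + p.sum(dim=-1, keepdim=True)
        o = alpha * o + p @ v
        m = m_new

    return o / l                                    # final normalisation

# Agrees with full attention over the concatenated sequence
torch.manual_seed(0)
q = torch.randn(4, 8)
ks = [torch.randn(4, 8) for _ in range(3)]
vs = [torch.randn(4, 8) for _ in range(3)]
out = ring_attention_reference(q, list(zip(ks, vs)), scale=8 ** -0.5)
ref = torch.softmax((q @ torch.cat(ks).T) * 8 ** -0.5, dim=-1) @ torch.cat(vs)
assert torch.allclose(out, ref, atol=1e-5)
```

Because each chunk only rescales the running accumulators, the result is independent of the order in which KV chunks arrive around the ring.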
---

## Files

| File | Description |
|------|-------------|
| `ring_attention_kernels.py` | Triton flash-attention kernel + Python ring-rotation helper |
| `ring_attention_layer.py` | `RingAttention` – a `torch.nn.Module` wrapper |
| `example_run.py` | End-to-end demo with timing |

---

## Usage

### Quick demo

```bash
# 2 GPUs, causal attention (default)
python examples/32_ring_attention/example_run.py

# 4 GPUs, bidirectional
python examples/32_ring_attention/example_run.py --num_ranks 4 --no_causal

# Custom sizes
python examples/32_ring_attention/example_run.py \
    --num_ranks 8 \
    --total_seq_len 131072 \
    --num_heads 32 \
    --head_dim 128
```

### Validation

```bash
python tests/run_tests_distributed.py tests/examples/test_ring_attention.py --num_ranks 2 -v
```

---

## Python API

```python
import iris
from examples.ring_attention.ring_attention_layer import RingAttention

shmem = iris.iris()

# Each rank holds its local chunk
layer = RingAttention(
    shmem,
    num_heads=16,
    head_dim=64,
    causal=True,  # autoregressive masking
)

# q, k, v: [seq_local, num_heads, head_dim] (float16 or bfloat16)
output = layer(q, k, v)  # [seq_local, num_heads, head_dim]
```

---

## Design Notes

* **Communication**: KV rotation uses `torch.distributed.isend` / `irecv`
  (point-to-point), launching overlapping sends and receives to maximise
  throughput.
* **Online softmax**: The kernel maintains running max (`M`) and sum (`L`)
  accumulators in float32 for numerical stability. The final output is
  `O / L` after all ring steps.
* **Causal masking**: Handled entirely at the granularity of KV *chunks* –
  full attention, diagonal block attention, or skip – so the per-element mask
  is applied only in the same-block diagonal case.
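The chunk-level decision in the last bullet can be sketched as a small helper (a sketch only; `chunk_mode` is an illustrative name, not part of the repository):

```python
def chunk_mode(q_chunk: int, kv_chunk: int) -> str:
    """Classify a (Q-chunk, KV-chunk) pair under causal masking.

    Chunks are indexed by position in the global sequence: a KV chunk
    strictly before the Q chunk is fully visible, the same chunk needs the
    per-element diagonal mask, and a future chunk is skipped entirely.
    """
    if kv_chunk < q_chunk:
        return "full"      # entire chunk is in the past
    if kv_chunk == q_chunk:
        return "diagonal"  # apply the per-element causal mask
    return "skip"          # entirely in the future


# Rank 2 of 4 sees KV chunks 2, 1, 0, 3 (its own chunk first, then the ring)
modes = [chunk_mode(2, kv) for kv in (2, 1, 0, 3)]
# modes == ["diagonal", "full", "full", "skip"]
```

Only the `"diagonal"` case pays for per-element masking; `"skip"` steps do no attention work at all, which is where the causal-mode compute savings come from.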
Lines changed: 129 additions & 0 deletions
#!/usr/bin/env python3
# SPDX-License-Identifier: MIT
# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.

"""
Minimal example demonstrating ring attention using the RingAttention layer.

The sequence is split evenly across GPUs along the sequence dimension.
Each rank computes its share of the attention output. After the ring passes,
the per-step partial results are combined via online softmax, yielding the
same result as a single device running full attention on the entire sequence.

Usage::

    # Run on 2 GPUs (default)
    python examples/32_ring_attention/example_run.py

    # Run on 4 GPUs
    python examples/32_ring_attention/example_run.py --num_ranks 4

    # Non-causal (bidirectional) attention
    python examples/32_ring_attention/example_run.py --no_causal
"""

import argparse

import torch
import torch.distributed as dist
import torch.multiprocessing as mp

import iris
from ring_attention_layer import RingAttention


def parse_args():
    parser = argparse.ArgumentParser(description="Ring Attention example")
    parser.add_argument("--total_seq_len", type=int, default=4096, help="Total sequence length (split across GPUs)")
    parser.add_argument("--num_heads", type=int, default=16, help="Number of attention heads")
    parser.add_argument("--head_dim", type=int, default=64, help="Head dimension")
    parser.add_argument("--num_ranks", type=int, default=2, help="Number of GPUs")
    parser.add_argument("--no_causal", action="store_true", help="Use bidirectional (non-causal) attention")
    parser.add_argument(
        "--dtype",
        type=str,
        default="float16",
        choices=["float16", "bfloat16"],
        help="Input tensor dtype",
    )
    return parser.parse_args()


def run(rank: int, world_size: int, init_url: str, args):
    backend = "nccl" if torch.cuda.is_available() else "gloo"
    dist.init_process_group(
        backend=backend,
        init_method=init_url,
        world_size=world_size,
        rank=rank,
        device_id=torch.device(f"cuda:{rank}"),
    )

    shmem = iris.iris()
    torch.manual_seed(42)
    torch.set_default_device("cuda")

    dtype = getattr(torch, args.dtype)
    causal = not args.no_causal

    seq_local = args.total_seq_len // world_size
    num_heads = args.num_heads
    head_dim = args.head_dim

    if rank == 0:
        attn_type = "causal" if causal else "bidirectional"
        print(f"--- Ring Attention Example ({attn_type}) ---")
        print(f"  GPUs          : {world_size}")
        print(f"  Total seq len : {args.total_seq_len}")
        print(f"  Seq per GPU   : {seq_local}")
        print(f"  Heads × dim   : {num_heads} × {head_dim}")
        print(f"  dtype         : {dtype}")

    # Each rank creates its local Q, K, V chunk
    q = torch.randn(seq_local, num_heads, head_dim, dtype=dtype)
    k = torch.randn(seq_local, num_heads, head_dim, dtype=dtype)
    v = torch.randn(seq_local, num_heads, head_dim, dtype=dtype)

    shmem.barrier()

    layer = RingAttention(shmem, num_heads=num_heads, head_dim=head_dim, causal=causal)

    # Warm-up pass
    _ = layer(q, k, v)
    torch.cuda.synchronize()
    shmem.barrier()

    # Timed pass
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    start.record()
    output = layer(q, k, v)
    end.record()

    torch.cuda.synchronize()
    elapsed_ms = start.elapsed_time(end)

    if rank == 0:
        print(f"\nOutput shape : {output.shape}")
        print(f"Output dtype : {output.dtype}")
        print(f"Elapsed time : {elapsed_ms:.2f} ms")
        print(f"Output[0, 0, :4] = {output[0, 0, :4].float()}")

    shmem.barrier()
    dist.destroy_process_group()


def main():
    args = parse_args()
    init_url = "tcp://127.0.0.1:29500"
    mp.spawn(
        fn=run,
        args=(args.num_ranks, init_url, args),
        nprocs=args.num_ranks,
        join=True,
    )


if __name__ == "__main__":
    main()
