iree-org
diff --git a/‎docs/wave/gather_to_shared.rst‎
Lines changed: 41 additions & 0 deletions b/‎docs/wave/gather_to_shared.rst‎
Lines changed: 41 additions & 0 deletions
diff --git a/‎docs/wave/wave.rst‎
Lines changed: 1 addition & 0 deletions b/‎docs/wave/wave.rst‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎iree/turbine/kernel/lang/global_symbols.py‎
Lines changed: 1 addition & 0 deletions b/‎iree/turbine/kernel/lang/global_symbols.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎iree/turbine/kernel/ops/wave_ops.py‎
Lines changed: 34 additions & 2 deletions b/‎iree/turbine/kernel/ops/wave_ops.py‎
Lines changed: 34 additions & 2 deletions
diff --git a/‎iree/turbine/kernel/wave/barriers.py‎
Lines changed: 63 additions & 7 deletions b/‎iree/turbine/kernel/wave/barriers.py‎
Lines changed: 63 additions & 7 deletions
diff --git a/‎iree/turbine/kernel/wave/cache.py‎
Lines changed: 1 addition & 0 deletions b/‎iree/turbine/kernel/wave/cache.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎iree/turbine/kernel/wave/codegen/handlers.py‎
Lines changed: 27 additions & 1 deletion b/‎iree/turbine/kernel/wave/codegen/handlers.py‎
Lines changed: 27 additions & 1 deletion
diff --git a/‎iree/turbine/kernel/wave/codegen/read_write.py‎
Lines changed: 91 additions & 2 deletions b/‎iree/turbine/kernel/wave/codegen/read_write.py‎
Lines changed: 91 additions & 2 deletions
@@ -0,0 +1,41 @@
+.. _gather_to_shared:
+
+Gather to Shared Memory Optimization
+====================================
+
+Overview
+--------
+
+The ``gather_to_shared`` pass enables direct memory loads from global memory to the Local Data Share (LDS) without passing through registers, reducing data movement overhead.
+
+This instruction is supported only on specific AMD GPU architectures (gfx94* and gfx95*).
+
+Architecture Support
+--------------------
+
+- **gfx94**: Supports 32-bit load/store widths
+- **gfx95**: Supports 32-bit, 96-bit, and 128-bit load/store widths
+
+Both architectures also support 8-bit and 16-bit load widths, but the loaded values are zero/sign-extended to 32 bits before being stored, which is not very useful for us.
+
+Instruction Semantics
+---------------------
+
+``gather_to_shared`` is translated to the ``amdgpu.gather_to_lds`` MLIR op, which is lowered to ``global_load_lds_*`` instructions.
+
+Each thread reads 4, 12, or 16 bytes from arbitrary positions in global memory or a buffer and writes them contiguously to LDS, starting from the address specified by the first thread in the wave.
+Destination addresses in all other threads are ignored.
+
+The operation is asynchronous, and the AMDGPU backend currently doesn't enforce any dependencies with other LDS access operations (which may be fixed in the future). Users need to manually insert a ``waitcnt`` instruction to avoid data races.
+
+This is handled in the ``add_shared_memory_barriers`` pass. Currently, it inserts a ``waitcnt`` instruction right before the ``amdgpu.lds_barrier`` instruction if the barrier is preceded by any ``amdgpu.gather_to_lds`` instructions.
+
+
+Pass Description
+----------------
+
+The ``gather_to_shared`` pass works similarly to ``minimize_global_loads``: it takes the number of elements that need to be transferred and divides it by the total number of threads to determine the number of elements to be transferred per thread.
+
+Unlike ``minimize_global_loads``, it supports a very limited number of elements per thread and only supports a simple contiguous memory layout.
+
+Also, as LDS writes are always contiguous, it doesn't support padding if the number of elements per wave crosses a row boundary and will undo any LDS padding present in this case.
@@ -70,3 +70,4 @@ For more detailed information about Wave's architecture and optimization passes,
    schedule_modifier
    fused_softmax
    aplp
+   gather_to_shared
@@ -40,6 +40,7 @@ def get_workgroup_symbol(i: int):
 WRITE_SHARED_DELAY = index_symbol("$WRITE_SHARED_DELAY")
 READ_GLOBAL_DELAY = index_symbol("$READ_GLOBAL_DELAY")
 WRITE_GLOBAL_DELAY = index_symbol("$WRITE_GLOBAL_DELAY")
+GLOBAL_TO_SHARED_DELAY = index_symbol("$GLOBAL_TO_SHARED_DELAY")
 MMA_DELAY = index_symbol("$MMA_DELAY")
 VALU_DELAY = index_symbol("$VALU_DELAY")
 SHUFFLE_DELAY = index_symbol("$SHUFFLE_DELAY")
 
@@ -8,7 +8,6 @@
     TYPE_CHECKING,
     Any,
     Callable,
-    List,
     Optional,
     Sequence,
     Type,
@@ -74,7 +73,7 @@ def extract_slice(
 def set_wave_prio(priority: int): ...
 
 
-def shared_memory_barrier(): ...
+def shared_memory_barrier(wait_async_ops: bool = False): ...
 
 
 def workgroup_barrier(): ...
@@ -275,6 +274,18 @@ def select(
 ) -> "Register": ...
 
 
+# Stub signature for the ``gather_to_lds`` op. Its parameters mirror, in the
+# same order, the fields of the ``GatherToLDS`` class that is declared with
+# ``@define_op("gather_to_lds")``.
+def gather_to_lds(
+    src: Memory,
+    dst: Memory,
+    src_idx: dict[IndexSymbol, IndexSequence],
+    dst_idx: dict[IndexSymbol, IndexSequence],
+    dtype: DataType,
+    elements_per_thread: Optional[IndexExpr | int] = None,
+    src_mapping: Optional[IndexMapping] = None,
+    dst_mapping: Optional[IndexMapping] = None,
+): ...
+
+
 def define_op(op_name: str) -> Callable[[T], T]:
     def decorator(cls: T) -> T:
         cls.tkw_op_name = op_name
@@ -1218,6 +1229,8 @@ class SharedMemoryBarrier(CustomOp):
     Represents a shared memory barrier in the graph.
     """
 
+    wait_async_ops: bool = False
+
     @property
     def has_side_effects(self) -> bool:
         return True
@@ -2491,3 +2504,22 @@ def indexing_dims(self) -> list[IndexExpr]:
 
     def infer_type(self):
         self.type = get_custom(_to_sequence(self.args)[0]).type
+
+
+@define_op("gather_to_lds")
+@dataclass
+class GatherToLDS(CustomOp):
+    """
+    Represents an instruction that performs a direct load from global
+    memory to LDS. The source node points to the global memory to load
+    from and the destination node points to shared memory.
+    """
+
+    # Memory to load from (global memory, per the class description).
+    src: Memory
+    # Memory to store to (shared memory, per the class description).
+    dst: Memory
+    # Index for the source access, keyed by dimension symbol.
+    src_idx: dict[IndexSymbol, IndexSequence]
+    # Index for the destination access, keyed by dimension symbol.
+    dst_idx: dict[IndexSymbol, IndexSequence]
+    # Element data type of the transfer.
+    dtype: DataType
+    # Number of elements each thread transfers.
+    # NOTE(review): the `gather_to_lds` stub gives this and the two mapping
+    # fields a default of None, while no defaults are declared here -- confirm
+    # whether that difference is intentional.
+    elements_per_thread: Optional[IndexExpr | int]
+    # Optional index mapping for the source access.
+    src_mapping: Optional[IndexMapping]
+    # Optional index mapping for the destination access.
+    dst_mapping: Optional[IndexMapping]
@@ -6,10 +6,68 @@
 
 from .utils.graph_utils import is_reduction_subgraph, is_barrier_between
 from .._support.tracing import CapturedTrace
-from ..ops.wave_ops import get_custom, Read, SharedMemoryBarrier, Write, NestedRegionOp
+from ..ops.wave_ops import (
+    AtomicOp,
+    CustomOp,
+    GatherToLDS,
+    NestedRegionOp,
+    Read,
+    SharedMemoryBarrier,
+    Write,
+    get_custom,
+)
 from ..lang.global_symbols import SHARED_ADDRESS_SPACE
 import torch.fx as fx
 from typing import Optional
+from enum import Enum, auto
+
+
+class MemoryAccessType(Enum):
+    """Classification of how an operation touches memory."""
+
+    # Values are spelled out explicitly; they match what `auto()` would assign.
+    NONE = 1
+    READ = 2
+    WRITE = 3
+    READ_WRITE = 4
+
+
+def is_shared_memory_op(node: CustomOp) -> bool:
+    """
+    Return True if `node` is an operation that accesses shared memory.
+
+    Reads, writes and atomics qualify only when their memory type lives in
+    the shared address space; a `GatherToLDS` op always qualifies.
+    """
+    if isinstance(node, (Read, Write, AtomicOp)):
+        address_space = node.memory_type.address_space
+        return address_space == SHARED_ADDRESS_SPACE
+
+    return isinstance(node, GatherToLDS)
+
+
+def get_memory_access_type(node: CustomOp) -> MemoryAccessType:
+    """
+    Classify `node` by the kind of memory access it performs.
+
+    Ops that are not a read, write, atomic or gather-to-LDS are classified
+    as `MemoryAccessType.NONE`.
+    """
+    # Checked in order; the first matching op class decides the result.
+    access_by_op_class = (
+        (Read, MemoryAccessType.READ),
+        (Write, MemoryAccessType.WRITE),
+        (AtomicOp, MemoryAccessType.READ_WRITE),
+        (GatherToLDS, MemoryAccessType.WRITE),
+    )
+    for op_class, access_type in access_by_op_class:
+        if isinstance(node, op_class):
+            return access_type
+
+    return MemoryAccessType.NONE
+
+
+def need_barrier(node1: CustomOp, node2: CustomOp) -> bool:
+    """
+    Decide whether two memory operations must be separated by a barrier.
+
+    No barrier is needed when either node is not a memory access. Otherwise
+    one is needed when the two access types differ, or when both accesses
+    are of the READ_WRITE kind.
+    """
+    first = get_memory_access_type(node1)
+    if first == MemoryAccessType.NONE:
+        return False
+
+    second = get_memory_access_type(node2)
+    if second == MemoryAccessType.NONE:
+        return False
+
+    # Matching access types only conflict when each side reads and writes.
+    return first != second or first == MemoryAccessType.READ_WRITE
 
 
 def add_shared_memory_barriers(
@@ -32,19 +90,17 @@ def add_shared_memory_barriers(
 
     for node in graph.nodes:
         custom = get_custom(node)
-        if (
-            isinstance(custom, (Read, Write))
-            and custom.memory_type.address_space == SHARED_ADDRESS_SPACE
-        ):
+        if is_shared_memory_op(custom):
             if last_node is None:
                 last_node = custom
                 continue
-            if type(custom) != type(last_node) and not is_barrier_between(
+            if need_barrier(custom, last_node) and not is_barrier_between(
                 last_node.fx_node, custom.fx_node
             ):
+                is_async = isinstance(last_node, GatherToLDS)
                 # Synchronize after the write to shared memory before we read from it.
                 with graph.inserting_before(node):
-                    SharedMemoryBarrier().add_to_graph(graph)
+                    SharedMemoryBarrier(wait_async_ops=is_async).add_to_graph(graph)
             last_node = custom
         if isinstance(custom, NestedRegionOp):
             last_node = add_shared_memory_barriers(
 
@@ -205,6 +205,7 @@ def get_hash(
             options.use_buffer_store_ops,
             options.use_stride_cache_swizzle,
             options.use_fast_math,
+            options.use_global_to_shared,
             options.minimize_shared_allocs,
             options.reorder_allocs,
             options.override_schedule,
 
@@ -30,12 +30,12 @@
     amdgpu_d,
     arith_d,
     gpu_d,
+    llvm_d,
     math_d,
     memref_d,
     rocdl_d,
     scf_d,
     vector_d,
-    llvm_d,
 )
 from iree.turbine.aot.support.ir_utils import (
     _is_float_type,
@@ -1398,8 +1398,34 @@ def handle_set_wave_prio(emitter: WaveEmitter, node: fx.Node):
     rocdl_d.s_setprio(prio)
 
 
+def waitcnt(vmcnt: int):
+    """
+    Emit `s_waitcnt` for the given vmcnt, leaving every other counter at max.
+
+    The vmcnt value is clamped to 6 bits and split across the encoded
+    operand: its low four bits go to bits 3:0 and its upper bits to 15:14.
+    """
+
+    # Clamp vmcnt to 6 bits; a lower vmcnt will produce a conservative wait
+    clamped = min(63, vmcnt)
+
+    # Place the low nibble in bits 3:0 and the remaining bits at 15:14.
+    encoded = (clamped & 0xF) | ((clamped >> 4) << 14)
+    # Set all bits outside the vmcnt field (0xC00F covers bits 15:14 and 3:0).
+    encoded |= ~0xC00F
+
+    # Truncate to the 16-bit operand before emitting the instruction.
+    rocdl_d.s_waitcnt(encoded & 0xFFFF)
+
+
 @handle_op(shared_memory_barrier)
 def handle_shared_memory_barrier(emitter: WaveEmitter, node: fx.Node):
+    """
+    Emit an LDS barrier for a `shared_memory_barrier` node.
+
+    When the node's `wait_async_ops` argument is truthy, a `waitcnt(0)` is
+    emitted before the barrier.
+
+    Raises:
+        ValidationError: if `node.args` does not hold exactly one argument.
+    """
+    try:
+        (wait_async_ops,) = node.args
+    except ValueError as e:
+        raise ValidationError("Malformed arguments") from e
+
+    # Wait for outstanding operations (vmcnt == 0) before the barrier itself.
+    if wait_async_ops:
+        waitcnt(0)
+
     amdgpu_d.lds_barrier()
 
 
 
@@ -4,6 +4,7 @@
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+import copy
 import sympy
 import functools
 from typing import Any, Optional, Dict
@@ -34,12 +35,20 @@
 from ...compiler.vector_codegen import (
     cast_kernel_buffer,
     cast_py_literal,
+    cast_py_value,
     cast_vector,
 )
 
-from ...ops.wave_ops import get_custom, read, write, CustomOp
+from ...ops.wave_ops import (
+    CustomOp,
+    gather_to_lds,
+    get_custom,
+    read,
+    write,
+)
 
 from ..utils.general_utils import get_fastest_index, infer_dim
+from ..utils.mapping_utils import transform_index_on_mapping
 from ..utils.symbol_utils import safe_subs, subs_idxc
 
 from ..._support.indexing import IndexingContext, IndexExpr, IndexSequence, IndexSymbol
@@ -48,10 +57,11 @@
 
 from .emitter import (
     WaveEmitter,
-    handle_op,
     add_emitter_subs,
     gen_sympy_index,
     get_constant_attr,
+    get_type_or_element_type,
+    handle_op,
 )
 
 
@@ -883,3 +893,82 @@ def handle_write(emitter: WaveEmitter, node: fx.Node):
             mask,
             offsets_vec,
         )
+
+
+@handle_op(gather_to_lds)
+def handle_gather_to_lds(emitter: WaveEmitter, node: fx.Node):
+    """
+    Emit `amdgpu.gather_to_lds` for a `gather_to_lds` node.
+
+    The node must carry eight arguments: source and destination memory,
+    their indices, the element type, the number of elements per thread and
+    the optional source/destination index mappings.
+
+    Raises:
+        ValidationError: if the arguments are malformed, if src or dst is
+            not of memref type, or if their element types differ.
+    """
+    try:
+        (
+            src,
+            dst,
+            src_idx,
+            dst_idx,
+            element_type,
+            elements_per_thread,
+            src_mapping,
+            dst_mapping,
+        ) = node.args
+    except ValueError as e:
+        raise ValidationError("Malformed arguments") from e
+
+    # Convert the element type argument into an MLIR type.
+    element_type = IrType.parse(element_type.dtype.ir_type_asm())
+
+    # Symbolic shapes are taken from the fx nodes before `src`/`dst` are
+    # rebound to their emitted values below.
+    src_symbolic_shape = _get_symbolic_shape(src)
+    dst_symbolic_shape = _get_symbolic_shape(dst)
+
+    src = cast_py_value(emitter, src)
+    dst = cast_py_value(emitter, dst)
+    src_data_type = get_type_or_element_type(src.ir_value.type)
+    dst_data_type = get_type_or_element_type(dst.ir_value.type)
+
+    # Both operands have to be memrefs.
+    if not (
+        MemRefType.isinstance(src.ir_value.type)
+        and MemRefType.isinstance(dst.ir_value.type)
+    ):
+        op = get_custom(node)
+        raise ValidationError(
+            f"Expected src and dst to be of Memref type for\n"
+            f"{op}\nGot\n"
+            f"src: {src.ir_value.type}\n"
+            f"dst: {dst.ir_value.type}\n"
+        )
+
+    # The element types of source and destination must match.
+    if src_data_type != dst_data_type:
+        op = get_custom(node)
+        raise ValidationError(
+            f"Expected src and dst to have same data type for\n"
+            f"{op}\nGot\n"
+            f"src: {src_data_type} vs dst: {dst_data_type}\n"
+        )
+
+    src = src.ir_value
+    dst = dst.ir_value
+
+    # Apply the optional index mappings to the respective indices.
+    if src_mapping:
+        src_idx = transform_index_on_mapping(src_mapping, src_symbolic_shape, src_idx)
+    if dst_mapping:
+        dst_idx = transform_index_on_mapping(dst_mapping, dst_symbolic_shape, dst_idx)
+
+    # NOTE(review): `elements_per_thread` is used directly as the vector
+    # dimension here; presumably it is already a concrete int at this point,
+    # although the op declares it as Optional[IndexExpr | int] -- confirm.
+    store_type = VectorType.get((elements_per_thread,), element_type)
+
+    # The workgroup/thread parts of the source index are only consumed by the
+    # disabled buffer path below; for the destination only the first result
+    # is kept.
+    src_index, src_index_wg, src_index_th = _build_start_indices(emitter, src_idx)
+    dst_index, _, _ = _build_start_indices(emitter, dst_idx)
+
+    if False:  # TODO: Buffer stuff needs upstream fixes
+        strides = strides_from_symbolic_shape(
+            IndexingContext.current(), src_symbolic_shape, allow_mixed_shapes=True
+        )
+        strides = [gen_sympy_index(add_emitter_subs(emitter), s) for s in strides]
+
+        src, offset_th = _linearize_memref(src, src_index_wg, src_index_th, strides)
+        src = _cast_buffer_and_encode_stride(src, strides, element_type, emitter)
+
+        src_index = [offset_th]
+
+    amdgpu_d.gather_to_lds(
+        src=src,
+        src_indices=src_index,
+        dst=dst,
+        dst_indices=dst_index,
+        transfer_type=store_type,
+    )