intel
diff --git a/‎.github/workflows/integration-tests-nvidia.yml
Lines changed: 2 additions & 0 deletions b/‎.github/workflows/integration-tests-nvidia.yml
Lines changed: 2 additions & 0 deletions
diff --git a/‎include/triton/Dialect/TritonGPU/Transforms/Utility.h
Lines changed: 6 additions & 5 deletions b/‎include/triton/Dialect/TritonGPU/Transforms/Utility.h
Lines changed: 6 additions & 5 deletions
diff --git a/‎lib/Dialect/TritonGPU/Transforms/Utility.cpp
Lines changed: 8 additions & 5 deletions b/‎lib/Dialect/TritonGPU/Transforms/Utility.cpp
Lines changed: 8 additions & 5 deletions
diff --git a/‎lib/Tools/GenericSwizzling.cpp
Lines changed: 75 additions & 2 deletions b/‎lib/Tools/GenericSwizzling.cpp
Lines changed: 75 additions & 2 deletions
diff --git a/‎python/triton/experimental/gluon/language/_core.py
Lines changed: 4 additions & 0 deletions b/‎python/triton/experimental/gluon/language/_core.py
Lines changed: 4 additions & 0 deletions
diff --git a/‎python/triton/experimental/gluon/language/_layouts.py
Lines changed: 50 additions & 0 deletions b/‎python/triton/experimental/gluon/language/_layouts.py
Lines changed: 50 additions & 0 deletions
diff --git a/‎python/triton/tools/tensor_descriptor.py
Lines changed: 3 additions & 3 deletions b/‎python/triton/tools/tensor_descriptor.py
Lines changed: 3 additions & 3 deletions
diff --git a/‎python/tutorials/gluon/01-attention-forward.py
Lines changed: 10 additions & 41 deletions b/‎python/tutorials/gluon/01-attention-forward.py
Lines changed: 10 additions & 41 deletions
@@ -13,6 +13,8 @@ jobs:
   integration-tests-nvidia:
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 60
+    # Let A100 and H100 continue even if GB200 fails, as it's a bit flaky
+    continue-on-error: ${{ matrix.runner[0] == 'nvidia-gb200'}}
     strategy:
       matrix:
         runner: ${{ fromJson(inputs.matrix) }}
 
@@ -260,11 +260,12 @@ void replaceUsesAndPropagateType(OpBuilder &builder, Operation *oldUse,
 
 /// Replace all uses of `old` with a local load from `alloc` unless the use is a
 /// `ttg.local_alloc` with a matching shared encoding, in which case the shared
-/// memory is forwarded directly into the use.
-void replaceUsesWithLocalLoad(
-    OpBuilder &builder, OpResult old,
-    TypedValue<triton::gpu::MemDescType> alloc,
-    TypedValue<triton::gpu::AsyncTokenType> token = {});
+/// memory is forwarded directly into the use. Returns the `ttg.local_load` if
+/// it created one.
+triton::gpu::LocalLoadOp
+replaceUsesWithLocalLoad(OpBuilder &builder, OpResult old,
+                         TypedValue<triton::gpu::MemDescType> alloc,
+                         TypedValue<triton::gpu::AsyncTokenType> token = {});
 
 // Return true if the value comes from a load or a block argument.
 // This will skip convert layouts and memdesc views.
 
@@ -1532,9 +1532,10 @@ void replaceUsesAndPropagateType(OpBuilder &builder, Operation *oldUse,
     op->erase();
 }
 
-void replaceUsesWithLocalLoad(OpBuilder &builder, OpResult old,
-                              TypedValue<ttg::MemDescType> alloc,
-                              TypedValue<ttg::AsyncTokenType> token) {
+ttg::LocalLoadOp
+replaceUsesWithLocalLoad(OpBuilder &builder, OpResult old,
+                         TypedValue<ttg::MemDescType> alloc,
+                         TypedValue<ttg::AsyncTokenType> token) {
   //  Remove redundant local_load -> local_alloc
   auto allocTy = alloc.getType();
   SmallVector<ttg::LocalAllocOp> allocsToErase;
@@ -1549,16 +1550,18 @@ void replaceUsesWithLocalLoad(OpBuilder &builder, OpResult old,
 
   // If there are some uses that were not local_allocs, we need to create a
   // local_load for them.
+  ttg::LocalLoadOp maybeLocalLoad;
   if (std::distance(old.getUsers().begin(), old.getUsers().end()) >
       allocsToErase.size()) {
     auto loc = old.getOwner()->getLoc();
-    auto sharedLoad = builder.template create<ttg::LocalLoadOp>(
+    maybeLocalLoad = builder.template create<ttg::LocalLoadOp>(
         loc, old.getType(), alloc, token);
-    old.replaceAllUsesWith(sharedLoad.getResult());
+    old.replaceAllUsesWith(maybeLocalLoad);
   }
   for (auto alloc : allocsToErase) {
     alloc.erase();
   }
+  return maybeLocalLoad;
 }
 
 bool comesFromLoadOrBlockArg(Value v) {
 
@@ -47,6 +47,16 @@ SmallVector<int32_t> flatten(const LinearLayout &ll, StringAttr dim) {
   return vec;
 };
 
+// Returns a copy of `vec` without its zero entries; the relative order of the
+// remaining entries is preserved.
+// [0, 1, 0, 2] -> [1, 2]
+SmallVector<int32_t> removeZeros(ArrayRef<int32_t> vec) {
+  SmallVector<int32_t> nonZero;
+  for (int32_t value : vec) {
+    if (value == 0)
+      continue;
+    nonZero.push_back(value);
+  }
+  return nonZero;
+}
+
 // [1, 2, 4, 8] -> [[1], [2], [4], [8]]
 std::vector<std::vector<int32_t>> unflatten(ArrayRef<int32_t> basis) {
   std::vector<std::vector<int32_t>> unflattened;
@@ -279,6 +289,7 @@ LinearLayout optimalSwizzling(const LinearLayout &src, const LinearLayout &dst,
   auto *ctx = src.getInDimNames().begin()->getContext();
   auto kReg = StringAttr::get(ctx, "register");
   auto kLane = StringAttr::get(ctx, "lane");
+  auto kWarp = StringAttr::get(ctx, "warp");
 
   // We work on the flattened tensors as the tensor dimensions are not relevant
   const LinearLayout srcFlat = src.flattenOuts();
@@ -307,6 +318,65 @@ LinearLayout optimalSwizzling(const LinearLayout &src, const LinearLayout &dst,
   if (vbasis.size() > maxVecBases) {
     vbasis.resize(maxVecBases);
   }
+  // We fill up vbasis until it spans 32 bits, as best we can
+  auto vecFillsBank = (1 << vbasis.size()) * bitwidth >= 32;
+  if (!vecFillsBank) {
+    auto warpSrc = removeZeros(flatten(srcFlat, kWarp));
+    auto warpDst = removeZeros(flatten(dstFlat, kWarp));
+    auto removeVec = [&vbasis](ArrayRef<int32_t> vec) {
+      SmallVector<int32_t> result;
+      for (int32_t r : vec) {
+        if (!llvm::is_contained(vbasis, r)) {
+          result.push_back(r);
+        }
+      }
+      return result;
+    };
+    auto regSrcWarp = intersectionBasis(removeVec(regSrc), warpDst, dim);
+    auto regDstWarp = intersectionBasis(removeVec(regDst), warpSrc, dim);
+    // Maximise vectorisation in the load or the store without creating
+    // conflicts
+    SmallVector<int32_t> largest;
+    if (regSrcWarp.size() == regDstWarp.size() && regSrcWarp.size() > 0) {
+      // We choose the one with the lowest basis in the hope that it will
+      // avoid PRMTs. The comparison of the mins will be strict as the sets
+      // removeVec(regSrc) and removeVec(regDst) don't intersect
+      if (*llvm::min_element(regSrcWarp) < *llvm::min_element(regDstWarp)) {
+        largest = regSrcWarp;
+      } else {
+        largest = regDstWarp;
+      }
+    } else {
+      largest = regSrcWarp.size() > regDstWarp.size() ? regSrcWarp : regDstWarp;
+    }
+    vbasis.append(largest.begin(), largest.end());
+    if (vbasis.size() > maxVecBases) {
+      vbasis.resize(maxVecBases);
+    }
+    // We allow vbasis.size() > Log2_32(32 / bitwidth) at this point, as it is
+    // in general good. TODO: the caveat this comment meant to add is missing.
+    if (vbasis.size() < llvm::Log2_32(32 / bitwidth)) {
+      // Pad the vectorisation to 32 bits with warp bases
+      auto warpSrcWarp = intersectionBasis(warpSrc, warpDst, dim);
+      vbasis.append(warpSrcWarp.begin(), warpSrcWarp.end());
+    }
+
+    int i = 0;
+    while (vbasis.size() < llvm::Log2_32(32 / bitwidth) &&
+           (i < warpSrc.size() || i < warpDst.size())) {
+      // If we have not filled up a whole bank, we add more warp bases
+      // until we have 32 bits. They will at least avoid bank conflicts in one
+      // direction
+      if (i < warpSrc.size() && !llvm::is_contained(vbasis, warpSrc[i])) {
+        vbasis.push_back(warpSrc[i]);
+      }
+      if (vbasis.size() < llvm::Log2_32(32 / bitwidth) && i < warpDst.size() &&
+          !llvm::is_contained(vbasis, warpDst[i])) {
+        vbasis.push_back(warpDst[i]);
+      }
+      ++i;
+    }
+  }
 
   // Bits in a bank segment: 32 banks x 32 bits
   constexpr int32_t bankBits = 32 * 32;
@@ -321,8 +391,11 @@ LinearLayout optimalSwizzling(const LinearLayout &src, const LinearLayout &dst,
   auto bankDst = llvm::to_vector(llvm::concat<int32_t>(vbasis, laneDst));
 
   // Whether we'll use b32.v1 / b32.v2 / b32.v4
-  auto b32Vec =
-      llvm::Log2_32(std::max<int32_t>((1 << vbasis.size()) * bitwidth / 32, 1));
+  // FIXME: With !vecFillsBank we may use b32.v2 or b32.v4 for the load or
+  // store, but we pessimistically assume we don't.
+  auto b32Vec = !vecFillsBank ? 0
+                              : llvm::Log2_32(std::max<int32_t>(
+                                    (1 << vbasis.size()) * bitwidth / 32, 1));
   // Drop the last vec bases of the banks
   bankSrc.resize(bankSrc.size() - b32Vec);
   bankDst.resize(bankDst.size() - b32Vec);
 
@@ -12,6 +12,7 @@
 import triton.language.core as tl_core
 from triton.language.core import (
     constexpr,
+    constexpr_function,
     base_value,
     base_type,
     dtype,
@@ -38,6 +39,7 @@
     float64,
     _unwrap_if_constexpr,
     _unwrap_shape,
+    static_range,
     tensor,
     tuple,
     tuple_type,
@@ -68,6 +70,7 @@
 
 __all__ = [
     "constexpr",
+    "constexpr_function",
     "base_value",
     "base_type",
     "dtype",
@@ -105,6 +108,7 @@
     "allocate_shared_memory",
     "set_auto_layout",
     "shared_memory_descriptor",
+    "static_range",
     "warp_specialize",
     *_IMPORT_FROM_TRITON,
 ]
 
@@ -233,6 +233,15 @@ def type(self):
         return constexpr_type(self)
 
 
+def _get_shape_per_cta(shape, cta_split_num):
+    """Return the per-CTA shape: `shape` divided element-wise by `cta_split_num`.
+
+    A new list is always returned and `shape` is never modified, so any
+    sequence (list or tuple) is accepted. When `cta_split_num` is None the
+    dimensions are copied unchanged.
+    """
+    # Copy first: dividing in place would silently rewrite the caller's shape.
+    shape_per_cta = list(shape)
+    if cta_split_num is not None:
+        assert len(cta_split_num) == len(shape)
+        for dim in range(len(shape_per_cta)):
+            # Floor division keeps the dimensions integral (true division
+            # would turn them into floats).
+            shape_per_cta[dim] //= cta_split_num[dim]
+    return shape_per_cta
+
+
 @dataclass(frozen=True)
 class NVMMASharedLayout(SharedLayout):
     """
@@ -286,6 +295,47 @@ def _to_ir(self, builder):
             self.cta_order,
         )
 
+    @staticmethod
+    def get_default_for(block_shape, dtype, transposed=False, fp4_padded=False, ctas_per_cga=None, cta_split_num=None,
+                        cta_order=None):
+        """Returns an NVMMASharedLayout with default swizzling for a given shape.
+
+        This picks the largest swizzle pattern compatible with the shape, which
+        allows emitting the fewest TMA or MMA messages.
+
+        The swizzle width is the largest of 128, 64 or 32 bytes that evenly
+        divides the byte size of the contiguous per-CTA dimension (the last
+        dimension, after the optional transposition). Swizzling is disabled
+        (width 0) when no such width fits, when the shape has fewer than two
+        dimensions, or when the product of the remaining dimensions is below 8.
+
+        `block_shape` itself is never modified.
+        """
+        packing_factor = 2 if fp4_padded else 1
+        rank = len(block_shape)
+        # Work on a private copy so that `block_shape` is never modified by the
+        # per-CTA division; callers may keep using it after this call.
+        shape_per_cta = _get_shape_per_cta(list(block_shape), cta_split_num)
+        if transposed:
+            # Rotate the leading dimension to the end, making it the contiguous one.
+            shape_per_cta = shape_per_cta[1:] + shape_per_cta[:1]
+        contig_dim_size = shape_per_cta[-1] * packing_factor
+        contig_dim_bytes = contig_dim_size * dtype.primitive_bitwidth // 8
+
+        # Largest candidate first, so the first match is the widest swizzle.
+        swizzle_byte_width = 0
+        for candidate in (128, 64, 32):
+            if contig_dim_bytes >= candidate and contig_dim_bytes % candidate == 0:
+                swizzle_byte_width = candidate
+                break
+
+        flatten_outer_dim = 1
+        for size in shape_per_cta[:-1]:
+            flatten_outer_dim *= size
+        if rank < 2 or flatten_outer_dim < 8:
+            swizzle_byte_width = 0
+
+        return NVMMASharedLayout(
+            swizzle_byte_width=swizzle_byte_width,
+            element_bitwidth=dtype.primitive_bitwidth,
+            rank=rank,
+            transposed=transposed,
+            fp4_padded=fp4_padded,
+            ctas_per_cga=ctas_per_cga,
+            cta_split_num=cta_split_num,
+            cta_order=cta_order,
+        )
+
     def mangle(self) -> str:
         return f"NVMMA_{self.swizzle_byte_width}_{self.element_bitwidth}_{self.transposed}_{self.fp4_padded}_NVMMA"
 
 
@@ -1,8 +1,6 @@
 from dataclasses import dataclass
 from typing import List, Any
 from triton._utils import validate_block_shape
-from torch._subclasses.fake_tensor import FakeTensor
-from torch._subclasses.functional_tensor import FunctionalTensor
 
 
 @dataclass
@@ -18,7 +16,9 @@ def __post_init__(self):
         assert len(self.block_shape) == rank, f"rank mismatch: {self}"
         assert rank > 0, "rank must not be zero"
         assert rank <= 5, "rank cannot be more than 5"
-        if not isinstance(self.base, (FakeTensor, FunctionalTensor)):
+        ty = type(self.base)
+        type_name = f"{ty.__module__}.{ty.__name__}"
+        if type_name not in ("torch.FakeTensor", "torch.FunctionalTensor"):
             assert self.base.data_ptr() % 16 == 0, "base must be 16-byte aligned"
         validate_block_shape(self.block_shape)
         elem_bytes = self.base.dtype.itemsize
 
@@ -1,6 +1,5 @@
 import torch
 import triton
-import triton.language as tl
 import pytest
 import itertools
 
@@ -25,7 +24,7 @@
 # ===-----------------------------------------------------------------------===#
 
 
-@tl.constexpr_function
+@gl.constexpr_function
 def get_tmem_32x32b_reg_layout(instr_shape, shape, num_warps):
     assert len(shape) == 2, "expected a 2D tensor"
     assert num_warps in [4, 8], "expected 4 or 8 warps"
@@ -61,45 +60,15 @@ def get_tmem_32x32b_reg_layout(instr_shape, shape, num_warps):
     )
 
 
-@tl.constexpr_function
+@gl.constexpr_function
 def get_mma_instr_shape(shape, element_ty):
     m = 128 if shape[0] >= 128 else 64
     n = 256 if shape[1] >= 256 else shape[1]
     k = 256 // element_ty.primitive_bitwidth
     return (m, n, k)
 
 
-@tl.constexpr_function
-def get_nvmma_layout(shape, element_ty, order=[1, 0], fp4_padded=False):
-    packing_factor = 2 if fp4_padded else 1
-
-    contig_dim_size = shape[order[0]] * packing_factor * element_ty.primitive_bitwidth // 8
-    if contig_dim_size >= 128 and contig_dim_size % 128 == 0:
-        swizzle_byte_width = 128
-    elif contig_dim_size >= 64 and contig_dim_size % 64 == 0:
-        swizzle_byte_width = 64
-    elif contig_dim_size >= 32 and contig_dim_size % 32 == 0:
-        swizzle_byte_width = 32
-    else:
-        swizzle_byte_width = 0
-
-    flatten_outer_dim = 1
-    for i in range(1, len(shape)):
-        flatten_outer_dim *= shape[order[i]]
-    if len(shape) < 2 or flatten_outer_dim < 8:
-        swizzle_byte_width = 0
-    transposed = order[0] == 0
-
-    return gl.NVMMASharedLayout(
-        swizzle_byte_width=swizzle_byte_width,
-        element_bitwidth=element_ty.primitive_bitwidth,
-        rank=len(shape),
-        transposed=transposed,
-        fp4_padded=fp4_padded,
-    )
-
-
-@tl.constexpr_function
+@gl.constexpr_function
 def get_mma_reg_layout(shape, num_warps, dtype=gl.float32):
     instr_shape = get_mma_instr_shape(shape, dtype)
     return get_tmem_32x32b_reg_layout(instr_shape, shape, num_warps)
@@ -133,7 +102,7 @@ def alloc(shape: gl.constexpr, dtype: gl.constexpr, layout: gl.constexpr, num_bu
             mem = alloc_fn(dtype, [num_buffers] + shape, layout)
             ready_bars = gl.allocate_shared_memory(gl.int64, [num_buffers, 1], mbarrier.MBarrierLayout())
             empty_bars = gl.allocate_shared_memory(gl.int64, [num_buffers, 1], mbarrier.MBarrierLayout())
-            for i in tl.static_range(num_buffers):
+            for i in gl.static_range(num_buffers):
                 mbarrier.init(ready_bars.index(i), count=1)
                 mbarrier.init(empty_bars.index(i), count=num_consumers)
                 mbarrier.arrive(empty_bars.index(i), count=num_consumers)
@@ -179,7 +148,7 @@ def create_consumer(self):
         def release(self):
             if isinstance(self.mem, gl.shared_memory_descriptor):
                 self.mem._keep_alive()
-            for i in tl.static_range(self.num_buffers):
+            for i in gl.static_range(self.num_buffers):
                 mbarrier.invalidate(self.ready_bars.index(i))
                 mbarrier.invalidate(self.empty_bars.index(i))
 
@@ -847,7 +816,7 @@ def _attn_fwd_correction_rescale(config, s_tmem, corr_consumer, o_consumer):
     mbarrier.arrive(corr_bar, count=1)
     alpha = gl.convert_layout(alpha.reshape([config.SPLIT_M]), alpha_layout)
 
-    for i in tl.static_range(config.SPLIT_D_FACTOR):
+    for i in gl.static_range(config.SPLIT_D_FACTOR):
         o_ref = o_tmem.slice(i * config.SPLIT_D, config.SPLIT_D)
         o = o_ref.load(config.o_splitn_layout)
         o = _mul_f32x2(o, alpha[:, None])
@@ -882,7 +851,7 @@ def _attn_fwd_correction_epilogue(config, prog, s_tmem, M, corr_consumer, epi_pr
     SPLIT_N: gl.constexpr = o_smem.type.shape[1] // SPLIT_N_FACTOR
 
     scale = 1 / l_i
-    for i in tl.static_range(SPLIT_N_FACTOR):
+    for i in gl.static_range(SPLIT_N_FACTOR):
         o_ref = o_tmem.slice(i * SPLIT_N, SPLIT_N)
         o = o_ref.load(config.o_splitn_layout)
         o = _mul_f32x2(o, scale[:, None])
@@ -992,12 +961,12 @@ def attention_kernel(  #
 def torch_dtype_to_triton(dtype):
     if dtype == torch.float8_e5m2:
         return gl.float8e5
-    return getattr(tl, str(dtype).split('.')[1])
+    return getattr(gl, str(dtype).split('.')[1])
 
 
 def make_tensor_desc(x, shape, strides, block_shape):
-    layout = get_nvmma_layout(block_shape, torch_dtype_to_triton(x.dtype))
-    return TensorDescriptor(x, shape=shape, strides=strides, block_shape=block_shape, layout=layout.value)
+    layout = gl.NVMMASharedLayout.get_default_for(block_shape, torch_dtype_to_triton(x.dtype))
+    return TensorDescriptor(x, shape=shape, strides=strides, block_shape=block_shape, layout=layout)
 
 
 def attention_forward(q, k, v, causal, sm_scale):