Skip to content

Commit 6f41a1d

Browse files
authored
Add a new optimization to version loops containing loads using block ptrs with unknown strides (#5532)
This PR versions `scf.for` loops containing tt.load on block pointers, where the block ptr is declared by a make_tensor_ptr operation with no strides equal to 1 (strides unknown at compile time). The versioned loop will then contain tt.load operations that can be marked as "row-major/column-major" and eventually lowered to efficient 2D block IO loads. --------- Signed-off-by: Ettore Tiotto <[email protected]>
1 parent d3a28a6 commit 6f41a1d

File tree

8 files changed

+437
-0
lines changed

8 files changed

+437
-0
lines changed

bin/RegisterTritonDialects.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
9797
mlir::triton::intel::registerTritonIntelFuseReshape();
9898
mlir::triton::intel::registerTritonIntelRemoveBoundaryChecks();
9999
mlir::triton::intel::registerTritonIntelRemoveMasks();
100+
mlir::triton::intel::registerTritonIntelStrideVersioning();
100101
mlir::triton::intel::registerTritonIntelTensorDescToBlockPointer();
101102
mlir::triton::registerRelayoutTritonGPUPass();
102103
mlir::triton::gpu::registerAllocateSharedMemoryPass();
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
// RUN: triton-opt %s -split-input-file -triton-intel-stride-versioning | FileCheck %s
2+
3+
module {
4+
tt.func public @version_for_loop(%arg0: !tt.ptr<bf16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16> {tt.divisibility = 16 : i32}, %arg2: i64, %arg3: i64, %arg4: i64 {tt.divisibility = 16 : i32}, %arg5: i64) {
5+
%c64_i32 = arith.constant 64 : i32
6+
%cst = arith.constant dense<0.000000e+00> : tensor<256x256xf32>
7+
%c256_i32 = arith.constant 256 : i32
8+
%c32_i32 = arith.constant 32 : i32
9+
%c0_i32 = arith.constant 0 : i32
10+
%c4096_i32 = arith.constant 4096 : i32
11+
%c8192_i32 = arith.constant 8192 : i32
12+
%c4_i32 = arith.constant 4 : i32
13+
%0 = tt.get_program_id x : i32
14+
%1 = arith.divsi %0, %c64_i32 : i32
15+
%2 = arith.muli %1, %c4_i32 : i32
16+
%3 = arith.subi %c32_i32, %2 : i32
17+
%4 = arith.minsi %3, %c4_i32 : i32
18+
%5 = arith.remsi %0, %c64_i32 : i32
19+
%6 = arith.remsi %5, %4 : i32
20+
%7 = arith.addi %2, %6 : i32
21+
%8 = arith.divsi %5, %4 : i32
22+
%9 = arith.extsi %c8192_i32 : i32 to i64
23+
%10 = arith.extsi %c4096_i32 : i32 to i64
24+
%11 = tt.make_tensor_ptr %arg0, [%9, %10], [%arg2, %arg3], [%c0_i32, %c0_i32] {order = array<i32: 1, 0>} : <tensor<256x32xbf16>>
25+
%12 = tt.make_tensor_ptr %arg1, [%10, %10], [%arg4, %arg5], [%c0_i32, %c0_i32] {order = array<i32: 1, 0>} : <tensor<32x256xbf16>>
26+
%13 = arith.muli %7, %c256_i32 : i32
27+
%14 = arith.muli %8, %c256_i32 : i32
28+
%15:2 = scf.for %arg9 = %c0_i32 to %c4096_i32 step %c32_i32 iter_args(%arg10 = %cst, %arg11 = %c0_i32) -> (tensor<256x256xf32>, i32) : i32 {
29+
%20 = tt.advance %11, [%13, %arg11] : <tensor<256x32xbf16>>
30+
%21 = tt.load %20 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<256x32xbf16>>
31+
%22 = tt.advance %12, [%arg11, %14] : <tensor<32x256xbf16>>
32+
%23 = tt.load %22 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<32x256xbf16>>
33+
%24 = tt.dot %21, %23, %cst, inputPrecision = tf32 : tensor<256x32xbf16> * tensor<32x256xbf16> -> tensor<256x256xf32>
34+
%25 = arith.addf %arg10, %24 : tensor<256x256xf32>
35+
%26 = arith.addi %arg11, %c32_i32 : i32
36+
scf.yield %25, %26 : tensor<256x256xf32>, i32
37+
}
38+
tt.return
39+
}
40+
41+
// CHECK: tt.func public @version_for_loop
42+
// CHECK: [[CST_1_i64:%.+]] = arith.constant 1 : i64
43+
// CHECK-DAG: [[NEW_PTR1:%.+]] = tt.make_tensor_ptr %arg0, {{.*}}, [%arg2, %c1_i64], {{.*}} {order = array<i32: 1, 0>} : <tensor<256x32xbf16>>
44+
// CHECK-DAG: [[ORIG_PTR1:%.+]] = tt.make_tensor_ptr %arg0, {{.*}}, [%arg2, %arg3], {{.*}} {order = array<i32: 1, 0>} : <tensor<256x32xbf16>>
45+
// CHECK: [[NEW_PTR2:%.+]] = tt.make_tensor_ptr %arg1, {{.*}}, [%arg4, %c1_i64], {{.*}} {order = array<i32: 1, 0>} : <tensor<32x256xbf16>>
46+
// CHECK: [[ORIG_PTR2:%.+]] = tt.make_tensor_ptr %arg1, {{.*}}, [%arg4, %arg5], {{.*}} {order = array<i32: 1, 0>} : <tensor<32x256xbf16>>
47+
// CHECK-DAG: [[CMP1:%.+]] = arith.cmpi eq, %arg3, [[CST_1_i64]] : i64
48+
// CHECK-DAG: [[CMP2:%.+]] = arith.cmpi eq, %arg5, [[CST_1_i64]] : i64
49+
// CHECK: [[VER_COND:%.+]] = arith.andi [[CMP1]], [[CMP2]] : i1
50+
// CHECK: [[LOOP_VER:%.+]]:2 = scf.if [[VER_COND]]
51+
// CHECK: scf.for
52+
// CHECK: tt.advance [[NEW_PTR1]]
53+
// CHECK: tt.advance [[NEW_PTR2]]
54+
// CHECK: } else {
55+
// CHECK: scf.for
56+
// CHECK: tt.advance [[ORIG_PTR1]]
57+
// CHECK: tt.advance [[ORIG_PTR2]]
58+
// CHECK: }
59+
}
60+
61+
// -----
62+
63+
module {
64+
tt.func public @do_not_version(%arg0: !tt.ptr<bf16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16> {tt.divisibility = 16 : i32}, %arg2: i64, %arg3: i64, %arg4: i64 {tt.divisibility = 16 : i32}, %arg5: i64) {
65+
%c64_i32 = arith.constant 64 : i32
66+
%cst = arith.constant dense<0.000000e+00> : tensor<256x256xf32>
67+
%c256_i32 = arith.constant 256 : i32
68+
%c32_i32 = arith.constant 32 : i32
69+
%c0_i32 = arith.constant 0 : i32
70+
%c4096_i32 = arith.constant 4096 : i32
71+
%c8192_i32 = arith.constant 8192 : i32
72+
%c4_i32 = arith.constant 4 : i32
73+
%c2_i64 = arith.constant 2 : i64
74+
%c4_i64 = arith.constant 4 : i64
75+
%0 = tt.get_program_id x : i32
76+
%1 = arith.divsi %0, %c64_i32 : i32
77+
%2 = arith.muli %1, %c4_i32 : i32
78+
%3 = arith.subi %c32_i32, %2 : i32
79+
%4 = arith.minsi %3, %c4_i32 : i32
80+
%5 = arith.remsi %0, %c64_i32 : i32
81+
%6 = arith.remsi %5, %4 : i32
82+
%7 = arith.addi %2, %6 : i32
83+
%8 = arith.divsi %5, %4 : i32
84+
%9 = arith.extsi %c8192_i32 : i32 to i64
85+
%10 = arith.extsi %c4096_i32 : i32 to i64
86+
%11 = tt.make_tensor_ptr %arg0, [%9, %10], [%c4_i64, %c2_i64], [%c0_i32, %c0_i32] {order = array<i32: 1, 0>} : <tensor<256x32xbf16>>
87+
%12 = tt.make_tensor_ptr %arg1, [%10, %10], [%c2_i64, %c4_i64], [%c0_i32, %c0_i32] {order = array<i32: 1, 0>} : <tensor<32x256xbf16>>
88+
%13 = arith.muli %7, %c256_i32 : i32
89+
%14 = arith.muli %8, %c256_i32 : i32
90+
%15:2 = scf.for %arg9 = %c0_i32 to %c4096_i32 step %c32_i32 iter_args(%arg10 = %cst, %arg11 = %c0_i32) -> (tensor<256x256xf32>, i32) : i32 {
91+
%20 = tt.advance %11, [%13, %arg11] : <tensor<256x32xbf16>>
92+
%21 = tt.load %20 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<256x32xbf16>>
93+
%22 = tt.advance %12, [%arg11, %14] : <tensor<32x256xbf16>>
94+
%23 = tt.load %22 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<32x256xbf16>>
95+
%24 = tt.dot %21, %23, %cst, inputPrecision = tf32 : tensor<256x32xbf16> * tensor<32x256xbf16> -> tensor<256x256xf32>
96+
%25 = arith.addf %arg10, %24 : tensor<256x256xf32>
97+
%26 = arith.addi %arg11, %c32_i32 : i32
98+
scf.yield %25, %26 : tensor<256x256xf32>, i32
99+
}
100+
tt.return
101+
}
102+
103+
// CHECK: tt.func public @do_not_version
104+
// CHECK-DAG: [[PTR1:%.+]] = tt.make_tensor_ptr %arg0
105+
// CHECK-DAG: [[PTR2:%.+]] = tt.make_tensor_ptr %arg1
106+
// CHECK-NOT: scf.if
107+
// CHECK: scf.for
108+
// CHECK: tt.advance [[PTR1]]
109+
// CHECK: tt.advance [[PTR2]]
110+
}

third_party/intel/backend/compiler.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,7 @@ def make_ttir(cls, mod, metadata, opt):
213213
passes.common.add_licm(pm)
214214
intel.passes.ttir.add_remove_boundary_checks(pm)
215215
intel.passes.ttir.add_remove_masks(pm)
216+
intel.passes.ttir.add_stride_versioning(pm)
216217
intel.passes.ttir.add_fuse_reshape(pm)
217218
passes.common.add_canonicalizer(pm)
218219
passes.ttir.add_combine(pm)

third_party/intel/include/Dialect/Triton/Transforms/Passes.td

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,4 +107,40 @@ def TritonIntelRemoveBoundaryChecks
107107
];
108108
}
109109

110+
def TritonIntelStrideVersioning
111+
: Pass<"triton-intel-stride-versioning", "mlir::ModuleOp"> {
112+
let summary = "Version loops containing block pointer loads if none of the strides is 1";
113+
114+
let description = [{
115+
This pass versions loops that contain tt.load on a block pointer, if none of the block pointer strides is 1.
116+
For example, given:
117+
118+
%cst = arith.constant ...
119+
%ptr = tt.make_tensor_ptr %base, [%s0, %s1], [%cst, %b], [%x, %y] : <tensor<512x64xf16>>
120+
scf.for ... {
121+
%load = tt.load %ptr : !tt.ptr<tensor<512x64xf16>>
122+
...
123+
}
124+
125+
The transformation creates:
126+
127+
%ptr = tt.make_tensor_ptr %base, [%s0, %s1], [%a, %b], [%x, %y] : <tensor<512x64xf16>>
128+
%ptr' = tt.make_tensor_ptr %base, [%s0, %s1], [%a, 1], [%x, %y] : <tensor<512x64xf16>>
129+
if (%b == 1)
130+
scf.for ... {
131+
%load = tt.load %ptr' : !tt.ptr<tensor<512x64xf16>>
132+
...
133+
}
134+
else
135+
scf.for ... {
136+
%load = tt.load %ptr : !tt.ptr<tensor<512x64xf16>>
137+
...
138+
}
139+
}];
140+
141+
let dependentDialects = [
142+
"mlir::triton::TritonDialect"
143+
];
144+
}
145+
110146
#endif // TRITON_DIALECT_TRITON_INTEL_TRANSFORMS_PASSES

third_party/intel/lib/Dialect/Triton/Transforms/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ add_triton_library(TritonIntelTransforms
22
FuseReshape.cpp
33
RemoveBoundaryChecks.cpp
44
RemoveMasks.cpp
5+
StrideVersioning.cpp
56
TensorDescToBlockPointer.cpp
67

78
DEPENDS

0 commit comments

Comments
 (0)