
Commit 68a3466

anmyachev authored and whitneywhtsang committed
Fix build and test failures from '8a5862d'
Signed-off-by: Anatoly Myachev <[email protected]>

fix build

Signed-off-by: Anatoly Myachev <[email protected]>

fix test_core.py after merge

Signed-off-by: Anatoly Myachev <[email protected]>
1 parent 2582391 commit 68a3466

5 files changed: +23 -21 lines changed

python/test/unit/language/test_core.py

Lines changed: 3 additions & 2 deletions
@@ -6419,13 +6419,14 @@ def test_convert2d(M, N, src_layout, interm_layout, dst_layout, dtype, device, t
             # expect compute scratch buffer to not error on xpu
             raise
         pytest.skip("Can't compute scratch buffer size")
-    lds_size = get_hip_lds_size()
+    lds_size = triton.runtime.driver.active.utils.get_device_properties(
+        triton.runtime.driver.active.get_current_device())["max_shared_mem"] if is_xpu() else get_hip_lds_size()
     # consider int32 dtype in scratch buffer size,
     # because it is the largest dtype used in convert_layout in this test
     int32_size = 4
     # skip even if scratch buffer equal to lds_size, because real scratch buffer is typically larger due to padding
     if scratch_shape[0] * scratch_shape[1] * int32_size >= lds_size:
-        pytest.skip("Scratch buffer is too large")
+        pytest.xfail("Scratch buffer is too large")
     if is_cuda() and isinstance(interm_layout, PaddedSharedLayout):
         pytest.skip("PaddedSharedLayout is not supported on CUDA")
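The new lds_size expression queries the active driver for the device's shared-local-memory size on XPU and keeps get_hip_lds_size() as the HIP fallback. Below is a minimal standalone sketch of that query, assuming a Triton build where triton.runtime.driver.active is the XPU driver; the shared_mem_budget helper and the example shape are illustrative, not part of this change.

    # Sketch only: mirrors the device-property lookup used in test_convert2d above.
    # Assumes triton.runtime.driver.active exposes get_current_device() and
    # utils.get_device_properties(), as shown in the diff.
    import triton

    def shared_mem_budget() -> int:
        # Shared local memory (SLM) size in bytes reported for the current device.
        device = triton.runtime.driver.active.get_current_device()
        props = triton.runtime.driver.active.utils.get_device_properties(device)
        return props["max_shared_mem"]

    # Example decision mirroring the test: a 128x128 scratch buffer of int32 elements.
    int32_size = 4
    scratch_bytes = 128 * 128 * int32_size
    if scratch_bytes >= shared_mem_budget():
        print("scratch buffer would not fit in SLM")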

test/Conversion/intel/shared_to_dot_layout_convert.mlir

Lines changed: 9 additions & 11 deletions
@@ -24,7 +24,7 @@ module attributes {"ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32
     // COM: Start of ttg.local_load. Load the value from SLM to register.
     // CHECK: %[[WORK_ITEM_ID_:.*]] = llvm.call spir_funccc @_Z12get_local_idj(%[[CST_0]])
     // CHECK: %[[WORK_ITEM_ID:.*]] = llvm.trunc %[[WORK_ITEM_ID_]] : i64 to i32
-    // CHECK-COUNT-128: %[[LD_RES:.*]] = llvm.load {{.*}} {alignment = 2 : i64} : !llvm.ptr<3> -> vector<1xf16>
+    // CHECK-COUNT-128: %[[LD_RES:.*]] = llvm.load {{.*}} : !llvm.ptr<3> -> vector<1xf16>
     %AA_DOT = ttg.local_load %AA : !ttg.memdesc<128x64xf16, #shared, #ttg.shared_memory> -> tensor<128x64xf16, #dot_operand_a>
 
     %cst0 = arith.constant dense<0.000000e+00> : tensor<128x256xf32, #dpas>
@@ -62,7 +62,7 @@ module attributes {"ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32
     // COM: Start of ttg.local_load. Load the value from SLM to register.
     // CHECK: %[[WORK_ITEM_ID_:.*]] = llvm.call spir_funccc @_Z12get_local_idj(%[[CST_0]])
     // CHECK: %[[WORK_ITEM_ID:.*]] = llvm.trunc %[[WORK_ITEM_ID_]] : i64 to i32
-    // CHECK-COUNT-128: %[[LD_RES:.*]] = llvm.load {{.*}} {alignment = 2 : i64} : !llvm.ptr<3> -> vector<1xf16>
+    // CHECK-COUNT-128: %[[LD_RES:.*]] = llvm.load {{.*}} : !llvm.ptr<3> -> vector<1xf16>
     %AA_DOT = ttg.local_load %AA : !ttg.memdesc<128x64xf16, #shared, #ttg.shared_memory> -> tensor<128x64xf16, #dot_operand_a>
 
     %cst0 = arith.constant dense<0.000000e+00> : tensor<128x256xf32, #dpas>
@@ -87,23 +87,21 @@ module attributes {"ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32
   // CHECK-SAME: %[[PTR_1:.*]]: !llvm.ptr<1>)
   // CHECK-SAME: attributes {intel_reqd_sub_group_size = 16 : i32, {{.*}}} {
   tt.func @convert_dot(%B: tensor<64x256xf16, #blocked1>) {
-    // CHECK-DAG: %[[CST_128:.*]] = llvm.mlir.constant(128 : i32) : i32
-    // CHECK-DAG: %[[CST_256:.*]] = llvm.mlir.constant(256 : i32) : i32
-    // CHECK-DAG: %[[CST_32:.*]] = llvm.mlir.constant(32 : i32) : i32
-    // CHECK-DAG: %[[CST_4:.*]] = llvm.mlir.constant(4 : i32) : i32
-    // CHECK-DAG: %[[CST_2:.*]] = llvm.mlir.constant(2 : i32) : i32
+    // CHECK-DAG: %[[CST_14:.*]] = llvm.mlir.constant(14 : i32) : i32
+    // CHECK-DAG: %[[CST_13:.*]] = llvm.mlir.constant(13 : i32) : i32
+    // CHECK-DAG: %[[CST_12:.*]] = llvm.mlir.constant(12 : i32) : i32
+    // CHECK-DAG: %[[CST_11:.*]] = llvm.mlir.constant(11 : i32) : i32
+    // CHECK-DAG: %[[CST_10:.*]] = llvm.mlir.constant(10 : i32) : i32
+    // CHECK-DAG: %[[CST_9:.*]] = llvm.mlir.constant(9 : i32) : i32
     // CHECK-DAG: %[[CST_8:.*]] = llvm.mlir.constant(8 : i32) : i32
-    // CHECK-DAG: %[[CST_16:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK-DAG: %[[CST_64:.*]] = llvm.mlir.constant(64 : i32) : i32
-    // CHECK-DAG: %[[CST_1:.*]] = llvm.mlir.constant(1 : i32) : i32
     // CHECK-DAG: %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32
     %BB = ttg.local_alloc %B : (tensor<64x256xf16, #blocked1>) -> !ttg.memdesc<64x256xf16, #shared, #ttg.shared_memory>
 
     // CHECK: llvm.call spir_funccc @_Z7barrierj
     // COM: Start of ttg.local_load. Load the value from SLM to register.
     // CHECK: %[[WORK_ITEM_ID_:.*]] = llvm.call spir_funccc @_Z12get_local_idj(%[[CST_0]])
     // CHECK: %[[WORK_ITEM_ID:.*]] = llvm.trunc %[[WORK_ITEM_ID_]] : i64 to i32
-    // CHECK-COUNT-128: %[[LD_RES:.*]] = llvm.load {{.*}} {alignment = 2 : i64} : !llvm.ptr<3> -> vector<1xf16>
+    // CHECK-COUNT-128: %[[LD_RES:.*]] = llvm.load {{.*}} : !llvm.ptr<3> -> vector<1xf16>
     %BB_DOT = ttg.local_load %BB : !ttg.memdesc<64x256xf16, #shared, #ttg.shared_memory> -> tensor<64x256xf16, #dot_operand_b>
     %cst0 = arith.constant dense<0.000000e+00> : tensor<128x256xf32, #dpas>
     %cst1 = arith.constant dense<0.000000e+00> : tensor<128x64xf16, #dot_operand_a>

test/Conversion/intel/tritongpu_to_gen.mlir

Lines changed: 8 additions & 5 deletions
@@ -875,9 +875,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
   // CHECK-LABEL: convert_layout_blocked_shared
   tt.func @convert_layout_blocked_shared(%arg0: tensor<128x32xf32, #blocked0>) {
     // CHECK: llvm.store
-    // CHECK-SAME: vector<8xf32>, !llvm.ptr<3>
+    // CHECK-SAME: vector<4xf32>, !llvm.ptr<3>
     // CHECK: llvm.store
-    // CHECK-SAME: vector<8xf32>, !llvm.ptr<3>
+    // CHECK-SAME: vector<4xf32>, !llvm.ptr<3>
     %0 = ttg.local_alloc %arg0 : (tensor<128x32xf32, #blocked0>) -> !ttg.memdesc<128x32xf32, #shared0, #smem>
     tt.return
   }
@@ -1432,6 +1432,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
   // CHECK-LABEL: test_base_index_cache
   tt.func @test_base_index_cache(%arg0: tensor<128x32xf32, #blocked0>) {
+    // CHECK: llvm.mlir.constant(0 : i32) : i32
     // CHECK: llvm.mlir.constant(0 : i32) : i32
     // CHECK: llvm.mlir.constant(0 : i32) : i32
     // CHECK: [[ZERO:%.*]] = llvm.mlir.constant(0 : i32) : i32
@@ -1449,6 +1450,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
   // CHECK-LABEL: test_index_cache_different_block
   tt.func @test_index_cache_different_block(%arg0: tensor<128x32xf32, #blocked0>, %arg1: i1) {
+    // CHECK: llvm.mlir.constant(0 : i32) : i32
     // CHECK: llvm.mlir.constant(0 : i32) : i32
     // CHECK: llvm.mlir.constant(0 : i32) : i32
     // CHECK: [[ZERO:%.*]] = llvm.mlir.constant(0 : i32) : i32
@@ -1890,7 +1892,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.thr
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
   // CHECK-LABEL: @vectorize_shmem_load
   // CHECK: llvm.load
-  // CHECK-SAME: {alignment = 8 : i64} : !llvm.ptr<3> -> vector<8xi8>
+  // CHECK-SAME: !llvm.ptr<3> -> vector<8xi8>
   // CHECK-NOT: llvm.load
   tt.func public @vectorize_shmem_load(%shmem : !ttg.memdesc<16x16xi8, #shared, #smem>) {
     %0 = ttg.local_load %shmem : !ttg.memdesc<16x16xi8, #shared, #smem> -> tensor<16x16xi8, #blocked>
@@ -1906,8 +1908,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 32 : i32} {
   // CHECK-LABEL: @vectorize_shmem_store
   // CHECK: llvm.store
-  // CHECK-SAME: {alignment = 64 : i64} : vector<16xi32>, !llvm.ptr<3>
-  // CHECK-NOT: llvm.store
+  // CHECK-SAME: vector<4xi32>, !llvm.ptr<3>
+  // CHECK: llvm.store
+  // CHECK-SAME: vector<4xi32>, !llvm.ptr<3>
   tt.func public @vectorize_shmem_store(%block : tensor<64x64xi32, #blocked>) {
     %0 = ttg.local_alloc %block : (tensor<64x64xi32, #blocked>) -> !ttg.memdesc<64x64xi32, #shared, #smem>
     tt.return

third_party/intel/lib/TritonIntelGPUToLLVM/TargetInfo.cpp

Lines changed: 1 addition & 1 deletion
@@ -64,7 +64,7 @@ void TargetInfo::storeMatrixShared(RewriterBase &rewriter, Location loc,
 
 Value TargetInfo::loadDShared(RewriterBase &rewriter, Location loc, Value ptr,
                               std::optional<Value> ctaId, Type elemTy,
-                              Value pred) const {
+                              Value pred, Operation *localLoadOp) const {
   assert(cast<mlir::LLVM::LLVMPointerType>(ptr.getType()).getAddressSpace() ==
              3 &&
          "Invalid addr space for loadShared");

third_party/intel/lib/TritonIntelGPUToLLVM/TargetInfo.h

Lines changed: 2 additions & 2 deletions
@@ -29,8 +29,8 @@ class TargetInfo : public mlir::triton::TargetInfoBase {
                     std::optional<Value> ctaId, Value val,
                     Value pred) const override;
   Value loadDShared(RewriterBase &rewriter, Location loc, Value ptr,
-                    std::optional<Value> ctaId, Type elemTy,
-                    Value pred) const override;
+                    std::optional<Value> ctaId, Type elemTy, Value pred,
+                    Operation *localLoadOp = nullptr) const override;
   bool canUseStMatrix(RankedTensorType tensorTy, ArrayRef<unsigned> repShape,
                       ArrayRef<unsigned> paddedRepShape,
                       ArrayRef<unsigned> order,
