[CI][Test][Reduction][Xe2/3] Add 1D cross subgroup cross lane (#1157)

Garra1980 · web-flow · commit fba6cc531681 · 2026-03-17T13:57:20.000-05:00
diff --git a/test/Integration/Dialect/XeGPU/WG/reduction_1D_cross_sg_cross_lane.mlir b/test/Integration/Dialect/XeGPU/WG/reduction_1D_cross_sg_cross_lane.mlir
@@ -0,0 +1,120 @@
+// RUN: imex-opt %s --gpu-lower-to-xevm-pipeline="xegpu-op-level=workgroup zebin-chip=pvc" \
+// RUN: | mlir-runner \
+// RUN:   --shared-libs=%mlir_levelzero_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --shared-libs=%mlir_c_runner_utils \
+// RUN:   --shared-libs=%irunner_utils \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
+
+#data_layout = #xegpu.layout<sg_layout = [4, 16], sg_data = [2, 16], inst_data = [1, 16], lane_layout= [1, 16], lane_data=[1, 1]>
+module attributes {gpu.container_module} {
+  gpu.module @reduction {
+    gpu.func @cross_sg_cross_lane_1D(%dst: memref<128xf32, 1>, %src: memref<8x256xf32, 1>) kernel {
+      %src_generic = memref.memory_space_cast %src : memref<8x256xf32, 1> to memref<8x256xf32>
+      %src_base_idx = memref.extract_aligned_pointer_as_index %src_generic : memref<8x256xf32> -> index
+      %src_base = arith.index_cast %src_base_idx : index to i64
+      %dst_generic = memref.memory_space_cast %dst : memref<128xf32, 1> to memref<128xf32>
+      %dst_base_idx = memref.extract_aligned_pointer_as_index %dst_generic : memref<128xf32> -> index
+      %dst_base = arith.index_cast %dst_base_idx : index to i64
+      // Build the 8x256 offset tile: offset[row][col] = row * 256 + col.
+      %row_stride = arith.constant dense<256> : vector<8xindex>
+      %row_ids = vector.step : vector<8xindex>
+      %row_starts = arith.muli %row_ids, %row_stride : vector<8xindex>
+      %col_ids = vector.step : vector<256xindex>
+      %row_starts_256x8 = vector.broadcast %row_starts: vector<8xindex> to vector<256x8xindex>
+      %row_starts_8x256 = vector.transpose %row_starts_256x8, [1, 0]: vector<256x8xindex> to vector<8x256xindex>
+      %col_ids_8x256 = vector.broadcast %col_ids : vector<256xindex> to vector<8x256xindex>
+      %load_offsets = arith.addi %row_starts_8x256, %col_ids_8x256 : vector<8x256xindex>
+      // Load the whole tile (all-true mask) and add-reduce along dim 1: one sum per row.
+      %load_mask = arith.constant dense<1> : vector<8x256xi1>
+      %tile = xegpu.load %src_base[%load_offsets], %load_mask : i64, vector<8x256xindex>, vector<8x256xi1> -> vector<8x256xf32>
+      %zero_acc = arith.constant dense<0.0> : vector<8xf32>
+      %row_sums = vector.multi_reduction <add>, %tile, %zero_acc [1] : vector<8x256xf32> to vector<8xf32>
+      // Store the 8 row sums at offsets 0..7 of the output, using the layout sliced along dim 1.
+      %store_offsets = vector.step : vector<8xindex>
+      %store_mask = arith.constant dense<1> : vector<8xi1>
+      xegpu.store %row_sums, %dst_base[%store_offsets], %store_mask { layout = #xegpu.slice<#data_layout, dims = [1]> } : vector<8xf32>, i64, vector<8xindex>, vector<8xi1>
+      gpu.return
+    }
+  }
+
+  // Host driver: stages %dst and %src in device memory, launches the reduction
+  // kernel in a single workgroup, copies the result back into %dst, and frees
+  // both device buffers.
+  func.func @test(%dst : memref<128xf32>, %src : memref<8x256xf32>) attributes {llvm.emit_c_interface} {
+    %c1 = arith.constant 1 : index
+    // Threads per workgroup: sg_layout [4, 16] gives 64 subgroups and
+    // lane_layout [1, 16] gives 16 lanes per subgroup, so 4 * 16 * 16 = 1024.
+    %c1024 = arith.constant 1024 : index
+
+    %stream0_0 = gpu.wait async
+
+    // Allocate the device output buffer and initialize it from the host copy.
+    %gpu_memref_dst, %stream0_1 = gpu.alloc async [%stream0_0] () : memref<128xf32>
+    %stream0_2 = gpu.memcpy async [%stream0_1] %gpu_memref_dst, %dst  : memref<128xf32>, memref<128xf32>
+
+    // Allocate the device input buffer and upload the host data.
+    %gpu_memref_src, %stream0_3 = gpu.alloc async [%stream0_2] () : memref<8x256xf32>
+    %stream0_4 = gpu.memcpy async [%stream0_3] %gpu_memref_src, %src  : memref<8x256xf32>, memref<8x256xf32>
+
+    // The kernel signature takes memrefs in memory space 1, so cast before launch.
+    %gpu_memref_dst_casted = memref.memory_space_cast %gpu_memref_dst : memref<128xf32> to memref<128xf32, 1>
+    %gpu_memref_src_casted = memref.memory_space_cast %gpu_memref_src : memref<8x256xf32> to memref<8x256xf32, 1>
+
+    // Launch one workgroup of 1024 threads.
+    %stream0_5 = gpu.launch_func async[%stream0_4] @reduction::@cross_sg_cross_lane_1D blocks in (%c1, %c1, %c1) threads in (%c1024, %c1, %c1) args(%gpu_memref_dst_casted : memref<128xf32, 1>, %gpu_memref_src_casted : memref<8x256xf32, 1>)
+
+    // Copy the result back to the host; this is ordered after the kernel.
+    %stream0_6 = gpu.memcpy async [%stream0_5]  %dst, %gpu_memref_dst : memref<128xf32>, memref<128xf32>
+
+    // Release both device buffers once the result has been copied back, so
+    // neither the input nor the output allocation is leaked.
+    %stream0_7 = gpu.dealloc async [%stream0_6] %gpu_memref_src : memref<8x256xf32>
+    %stream0_8 = gpu.dealloc async [%stream0_7] %gpu_memref_dst : memref<128xf32>
+
+    // Block until every queued operation has completed.
+    gpu.wait [%stream0_8]
+    return
+  }
+
+  // Entry point: fills %src so that every element of row i equals i, zeroes
+  // %dst, runs @test, and prints %dst so the output can be verified.
+  func.func @main() attributes {llvm.emit_c_interface} {
+    %dst = memref.alloc() : memref<128xf32>
+    %src = memref.alloc() : memref<8x256xf32>
+
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c8 = arith.constant 8 : index
+    %c256 = arith.constant 256 : index
+    %c128 = arith.constant 128 : index
+    %c0_f32 = arith.constant 0. : f32
+
+    // Every element of row i holds the value i, so the expected row sum is
+    // exactly 256 * i (a small integer, representable in f32 without rounding).
+    scf.for %i = %c0 to %c8 step %c1 {
+      // The stored value depends only on the row, so convert it once per row.
+      %i_i32 = arith.index_cast %i : index to i32
+      %i_float = arith.sitofp %i_i32 : i32 to f32
+      scf.for %j = %c0 to %c256 step %c1 {
+        memref.store %i_float, %src[%i, %j] : memref<8x256xf32>
+      }
+    }
+
+    // Zero the output so elements the kernel does not write print as 0.
+    scf.for %i = %c0 to %c128 step %c1 {
+      memref.store %c0_f32, %dst[%i] : memref<128xf32>
+    }
+
+    call @test(%dst, %src) : (memref<128xf32>, memref<8x256xf32>) -> ()
+    %dst_cast = memref.cast %dst : memref<128xf32> to memref<*xf32>
+    // The kernel stores 8 row sums at offsets 0..7; element 8 keeps its initial 0.
+
+    // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}}
+    // CHECK-NEXT: [0,  256,  512,  768,  1024,  1280,  1536,  1792,  0,
+    call @printMemrefF32(%dst_cast) : (memref<*xf32>) -> ()
+    return
+  }
+  func.func private @printMemrefF32(%ptr : memref<*xf32>) attributes { llvm.emit_c_interface } // Body-less declaration called by @main to print %dst; presumably resolved from a --shared-libs runner library (confirm).
+}