// RUN: triton-opt %s --split-input-file --intel-allocate-shared-memory --convert-triton-intel-gpu-to-llvm | FileCheck %s --implicit-check-not=llvm.inline_asm

// COM: Tests reduction when threads_per_warp < num_warps.

@@ -35,3 +35,24 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 64 : i32, "ttg.th
|
35 | 35 | tt.return
|
36 | 36 | }
|
37 | 37 | }
|

// -----

#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [32, 1], order = [1, 0]}>
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, ttg.target = "xpu", "ttg.threads-per-warp" = 32 : i32, ttig.min_sg_size = 32 : i32} {
  // COM: Sum-reduction (arith.addf) along dim 0 of a 32x128 f32 tensor.
  // COM: With sizePerThread = [1, 4] each thread holds 4 elements, so the
  // COM: lowering is expected to emit one SPIR-V subgroup non-uniform FAdd
  // COM: per held element (4 calls below), then publish the warp-level
  // COM: partials: a store followed by a barrier, with no intervening load.
  tt.func public @test_reduce(%arg0: tensor<32x128xf32, #blocked>) -> tensor<128xf32, #ttg.slice<{dim = 0, parent = #blocked}>> {
    // CHECK: llvm.call spir_funccc @_Z27__spirv_GroupNonUniformFAddiif
    // CHECK: llvm.call spir_funccc @_Z27__spirv_GroupNonUniformFAddiif
    // CHECK: llvm.call spir_funccc @_Z27__spirv_GroupNonUniformFAddiif
    // CHECK: llvm.call spir_funccc @_Z27__spirv_GroupNonUniformFAddiif
    // CHECK: llvm.store
    // CHECK-NOT: llvm.load
    // CHECK: llvm.call spir_funccc @_Z7barrierj
    %1 = "tt.reduce"(%arg0) <{axis = 0 : i32}> ({
    ^bb0(%arg2: f32, %arg3: f32):
      %2 = arith.addf %arg2, %arg3 : f32
      tt.reduce.return %2 : f32
    }) {allocation.offset = 0 : i32} : (tensor<32x128xf32, #blocked>) -> tensor<128xf32, #ttg.slice<{dim = 0, parent = #blocked}>>
    tt.return %1 : tensor<128xf32, #ttg.slice<{dim = 0, parent = #blocked}>>
  }
}