@@ -1,4 +1,4 @@
-// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-gpu-to-llvm | FileCheck %s --dump-input-context 20
+// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-gpu-to-llvm 2>/dev/null | FileCheck %s --dump-input-context 20
 
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 // CHECK: llvm.func @test_empty_kernel(%arg0: i32, %arg1: !llvm.ptr<1>, %arg2: !llvm.ptr<1>)
@@ -1739,28 +1739,12 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32} {
 #blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
 #blocked1 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
 module attributes {"ttg.target" = "cuda:80", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
-  tt.func public @sum_reduction(%arg0: !tt.ptr<i32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i32> {tt.divisibility = 16 : i32}) attributes {noinline = false} {
-    %cst = arith.constant dense<1024> : tensor<1x1xi32, #blocked>
-    %0 = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32, #blocked1>
-    %1 = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32, #ttg.slice<{dim = 1, parent = #blocked}>>
-    %2 = tt.expand_dims %1 {axis = 1 : i32} : tensor<1xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi32, #blocked>
-    %3 = arith.muli %2, %cst : tensor<1x1xi32, #blocked>
-    %4 = tt.splat %arg0 : !tt.ptr<i32> -> tensor<1x1x!tt.ptr<i32>, #blocked>
-    %5 = tt.addptr %4, %3 : tensor<1x1x!tt.ptr<i32>, #blocked>, tensor<1x1xi32, #blocked>
-    %6 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #ttg.slice<{dim = 0, parent = #blocked}>>
-    %7 = tt.expand_dims %6 {axis = 0 : i32} : tensor<1024xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x1024xi32, #blocked>
-    %8 = tt.broadcast %5 : tensor<1x1x!tt.ptr<i32>, #blocked> -> tensor<1x1024x!tt.ptr<i32>, #blocked>
-    %9 = tt.addptr %8, %7 : tensor<1x1024x!tt.ptr<i32>, #blocked>, tensor<1x1024xi32, #blocked>
-    %10 = tt.load %9 : tensor<1x1024x!tt.ptr<i32>, #blocked>
-    %11 = "tt.reduce"(%10) <{axis = 1 : i32}> ({
+  tt.func public @sum_reduction(%arg0: tensor<1x1024xi32, #blocked>) {
+    %11 = "tt.reduce"(%arg0) <{axis = 1 : i32}> ({
     ^bb0(%arg2: i32, %arg3: i32):
       %15 = arith.addi %arg2, %arg3 : i32
       tt.reduce.return %15 : i32
     }) : (tensor<1x1024xi32, #blocked>) -> tensor<1xi32, #ttg.slice<{dim = 1, parent = #blocked}>>
-    %12 = ttg.convert_layout %11 : tensor<1xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1xi32, #blocked1>
-    %13 = tt.splat %arg1 : !tt.ptr<i32> -> tensor<1x!tt.ptr<i32>, #blocked1>
-    %14 = tt.addptr %13, %0 : tensor<1x!tt.ptr<i32>, #blocked1>, tensor<1xi32, #blocked1>
-    tt.store %14, %12 : tensor<1x!tt.ptr<i32>, #blocked1>
     tt.return
   }
 }