From 824b6506a7082087feb0c28c7ab6e179ccb3ae8a Mon Sep 17 00:00:00 2001 From: linuxlonelyeagle <2020382038@qq.com> Date: Mon, 10 Feb 2025 15:05:36 +0800 Subject: [PATCH 1/3] make affine-loop-unroll a FunctionOpInterface pass. --- mlir/include/mlir/Dialect/Affine/Passes.h | 3 +- mlir/include/mlir/Dialect/Affine/Passes.td | 2 +- .../Dialect/Affine/Transforms/LoopUnroll.cpp | 75 +++++++------ mlir/test/Dialect/Affine/unroll.mlir | 100 ++++++++++++++++++ 4 files changed, 143 insertions(+), 37 deletions(-) diff --git a/mlir/include/mlir/Dialect/Affine/Passes.h b/mlir/include/mlir/Dialect/Affine/Passes.h index bc29d04287ac4..37147b079e5d9 100644 --- a/mlir/include/mlir/Dialect/Affine/Passes.h +++ b/mlir/include/mlir/Dialect/Affine/Passes.h @@ -19,6 +19,7 @@ namespace mlir { +class ModuleOp; namespace func { class FuncOp; } // namespace func @@ -93,7 +94,7 @@ std::unique_ptr> createLoopTilingPass(); /// factors supplied through other means. If -1 is passed as the unrollFactor /// and no callback is provided, anything passed from the command-line (if at /// all) or the default unroll factor is used (LoopUnroll:kDefaultUnrollFactor). -std::unique_ptr> createLoopUnrollPass( +std::unique_ptr> createLoopUnrollPass( int unrollFactor = -1, bool unrollUpToFactor = false, bool unrollFull = false, const std::function &getUnrollFactor = nullptr); diff --git a/mlir/include/mlir/Dialect/Affine/Passes.td b/mlir/include/mlir/Dialect/Affine/Passes.td index d7c7897c65730..d96b50c3e8104 100644 --- a/mlir/include/mlir/Dialect/Affine/Passes.td +++ b/mlir/include/mlir/Dialect/Affine/Passes.td @@ -199,7 +199,7 @@ def AffineLoopTiling : Pass<"affine-loop-tile", "func::FuncOp"> { ]; } -def AffineLoopUnroll : Pass<"affine-loop-unroll", "func::FuncOp"> { +def AffineLoopUnroll : Pass<"affine-loop-unroll", "ModuleOp"> { let summary = "Unroll affine loops"; let constructor = "mlir::affine::createLoopUnrollPass()"; let options = [ diff --git a/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp index 57df7ada91654..4dc9809574115 100644 --- a/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp @@ -19,6 +19,7 @@ #include "mlir/IR/AffineExpr.h" #include "mlir/IR/AffineMap.h" #include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinOps.h" #include "llvm/ADT/DenseMap.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -82,7 +83,7 @@ static bool isInnermostAffineForOp(AffineForOp op) { } /// Gathers loops that have no affine.for's nested within. -static void gatherInnermostLoops(func::FuncOp f, +static void gatherInnermostLoops(FunctionOpInterface f, SmallVectorImpl &loops) { f.walk([&](AffineForOp forOp) { if (isInnermostAffineForOp(forOp)) @@ -91,40 +92,44 @@ static void gatherInnermostLoops(func::FuncOp f, } void LoopUnroll::runOnOperation() { - func::FuncOp func = getOperation(); - if (func.isExternal()) - return; - - if (unrollFull && unrollFullThreshold.hasValue()) { - // Store short loops as we walk. + mlir::ModuleOp module = getOperation(); + SmallVector funcOps; + module.walk([&](FunctionOpInterface func) { funcOps.push_back(func); }); + for (auto func : funcOps) { + if (func.isExternal()) + return; + + if (unrollFull && unrollFullThreshold.hasValue()) { + // Store short loops as we walk. + SmallVector loops; + + // Gathers all loops with trip count <= minTripCount. Do a post order walk + // so that loops are gathered from innermost to outermost (or else + // unrolling an outer one may delete gathered inner ones). + getOperation().walk([&](AffineForOp forOp) { + std::optional tripCount = getConstantTripCount(forOp); + if (tripCount && *tripCount <= unrollFullThreshold) + loops.push_back(forOp); + }); + for (auto forOp : loops) + (void)loopUnrollFull(forOp); + return; + } + + // If the call back is provided, we will recurse until no loops are found. SmallVector loops; - - // Gathers all loops with trip count <= minTripCount. Do a post order walk - // so that loops are gathered from innermost to outermost (or else unrolling - // an outer one may delete gathered inner ones). - getOperation().walk([&](AffineForOp forOp) { - std::optional tripCount = getConstantTripCount(forOp); - if (tripCount && *tripCount <= unrollFullThreshold) - loops.push_back(forOp); - }); - for (auto forOp : loops) - (void)loopUnrollFull(forOp); - return; - } - - // If the call back is provided, we will recurse until no loops are found. - SmallVector loops; - for (unsigned i = 0; i < numRepetitions || getUnrollFactor; i++) { - loops.clear(); - gatherInnermostLoops(func, loops); - if (loops.empty()) - break; - bool unrolled = false; - for (auto forOp : loops) - unrolled |= succeeded(runOnAffineForOp(forOp)); - if (!unrolled) - // Break out if nothing was unrolled. - break; + for (unsigned i = 0; i < numRepetitions || getUnrollFactor; i++) { + loops.clear(); + gatherInnermostLoops(func, loops); + if (loops.empty()) + break; + bool unrolled = false; + for (auto forOp : loops) + unrolled |= succeeded(runOnAffineForOp(forOp)); + if (!unrolled) + // Break out if nothing was unrolled. + break; + } } } @@ -145,7 +150,7 @@ LogicalResult LoopUnroll::runOnAffineForOp(AffineForOp forOp) { cleanUpUnroll); } -std::unique_ptr> mlir::affine::createLoopUnrollPass( +std::unique_ptr> mlir::affine::createLoopUnrollPass( int unrollFactor, bool unrollUpToFactor, bool unrollFull, const std::function &getUnrollFactor) { return std::make_unique( diff --git a/mlir/test/Dialect/Affine/unroll.mlir b/mlir/test/Dialect/Affine/unroll.mlir index e398c3fe2011d..43485ca56deeb 100644 --- a/mlir/test/Dialect/Affine/unroll.mlir +++ b/mlir/test/Dialect/Affine/unroll.mlir @@ -240,6 +240,23 @@ func.func @loop_nest_unroll_full() { return } // UNROLL-FULL } +gpu.module @unroll_full { + // UNROLL-FULL-LABEL: func @gpu_loop_nest_simplest() { + gpu.func @gpu_loop_nest_simplest() { + // UNROLL-FULL: affine.for %arg0 = 0 to 100 step 2 { + affine.for %i = 0 to 100 step 2 { + // UNROLL-FULL: %c1_i32 = arith.constant 1 : i32 + // UNROLL-FULL-NEXT: %c1_i32_0 = arith.constant 1 : i32 + // UNROLL-FULL-NEXT: %c1_i32_1 = arith.constant 1 : i32 + // UNROLL-FULL-NEXT: %c1_i32_2 = arith.constant 1 : i32 + affine.for %j = 0 to 4 { + %x = arith.constant 1 : i32 + } + } // UNROLL-FULL: } + gpu.return // UNROLL-FULL: return + } +} + // SHORT-LABEL: func @loop_nest_outer_unroll() { func.func @loop_nest_outer_unroll() { // SHORT: affine.for %arg0 = 0 to 4 { @@ -260,6 +277,28 @@ func.func @loop_nest_outer_unroll() { return // SHORT: return } // SHORT } +gpu.module @short { + // SHORT-LABEL: func @gpu_loop_nest_outer_unroll() { + gpu.func @gpu_loop_nest_outer_unroll() { + // SHORT: affine.for %arg0 = 0 to 4 { + // SHORT-NEXT: %0 = affine.apply [[$MAP0]](%arg0) + // SHORT-NEXT: %1 = "addi32"(%0, %0) : (index, index) -> index + // SHORT-NEXT: } + // SHORT-NEXT: affine.for %arg0 = 0 to 4 { + // SHORT-NEXT: %0 = affine.apply [[$MAP0]](%arg0) + // SHORT-NEXT: %1 = "addi32"(%0, %0) : (index, index) -> index + // SHORT-NEXT: } + affine.for %i = 0 to 2 { + affine.for %j = 0 to 4 { + %x = "affine.apply" (%j) { map = affine_map<(d0) -> (d0 + 1)> } : + (index) -> (index) + %y = "addi32"(%x, %x) : (index, index) -> index + } + } + gpu.return // SHORT: gpu.return + } // SHORT } +} + // We are doing a minimal FileCheck here. We just need this test case to // successfully run. Both %x and %y will get unrolled here as the min trip // count threshold set to 2. @@ -345,6 +384,37 @@ func.func @unroll_unit_stride_no_cleanup() { return } +gpu.module @unroll_by_4{ + // UNROLL-BY-4-LABEL: func @gpu_unroll_unit_stride_no_cleanup() { + gpu.func @gpu_unroll_unit_stride_no_cleanup() { + // UNROLL-BY-4: affine.for %arg0 = 0 to 100 { + affine.for %i = 0 to 100 { + // UNROLL-BY-4: for [[L1:%arg[0-9]+]] = 0 to 8 step 4 { + // UNROLL-BY-4-NEXT: %0 = "addi32"([[L1]], [[L1]]) : (index, index) -> i32 + // UNROLL-BY-4-NEXT: %1 = "addi32"(%0, %0) : (i32, i32) -> i32 + // UNROLL-BY-4-NEXT: %2 = affine.apply #map{{[0-9]*}}([[L1]]) + // UNROLL-BY-4-NEXT: %3 = "addi32"(%2, %2) : (index, index) -> i32 + // UNROLL-BY-4-NEXT: %4 = "addi32"(%3, %3) : (i32, i32) -> i32 + // UNROLL-BY-4-NEXT: %5 = affine.apply #map{{[0-9]*}}([[L1]]) + // UNROLL-BY-4-NEXT: %6 = "addi32"(%5, %5) : (index, index) -> i32 + // UNROLL-BY-4-NEXT: %7 = "addi32"(%6, %6) : (i32, i32) -> i32 + // UNROLL-BY-4-NEXT: %8 = affine.apply #map{{[0-9]*}}([[L1]]) + // UNROLL-BY-4-NEXT: %9 = "addi32"(%8, %8) : (index, index) -> i32 + // UNROLL-BY-4-NEXT: %10 = "addi32"(%9, %9) : (i32, i32) -> i32 + // UNROLL-BY-4-NEXT: } + affine.for %j = 0 to 8 { + %x = "addi32"(%j, %j) : (index, index) -> i32 + %y = "addi32"(%x, %x) : (i32, i32) -> i32 + } + // empty loop + // UNROLL-BY-4: affine.for %arg1 = 0 to 8 { + affine.for %k = 0 to 8 { + } + } + gpu.return + } +} + // UNROLL-BY-4-LABEL: func @unroll_unit_stride_cleanup() { func.func @unroll_unit_stride_cleanup() { // UNROLL-BY-4: affine.for %arg0 = 0 to 100 { @@ -632,6 +702,19 @@ func.func @unroll_by_one_should_promote_single_iteration_loop() { // UNROLL-BY-1-NEXT: return } +gpu.module @unroll_by_1 { + // UNROLL-BY-1-LABEL: func @gpu_unroll_by_one_should_promote_single_iteration_loop() + gpu.func @gpu_unroll_by_one_should_promote_single_iteration_loop() { + affine.for %i = 0 to 1 { + %x = "foo"(%i) : (index) -> i32 + } + gpu.return + // UNROLL-BY-1-NEXT: %c0 = arith.constant 0 : index + // UNROLL-BY-1-NEXT: %0 = "foo"(%c0) : (index) -> i32 + // UNROLL-BY-1-NEXT: gpu.return + } +} + // Test unrolling with affine.for iter_args. // UNROLL-BY-4-LABEL: loop_unroll_with_iter_args_and_cleanup @@ -706,6 +789,23 @@ func.func @unroll_cleanup_loop_with_larger_unroll_factor() { // UNROLL-CLEANUP-LOOP-NEXT: return } +gpu.module @unroll_cleanup_loop { + // UNROLL-CLEANUP-LOOP-LABEL: func @gpu_unroll_cleanup_loop_with_larger_unroll_factor() + gpu.func @gpu_unroll_cleanup_loop_with_larger_unroll_factor() { + affine.for %i = 0 to 3 { + %x = "foo"(%i) : (index) -> i32 + } + gpu.return + // UNROLL-CLEANUP-LOOP-NEXT: %[[C0:.*]] = arith.constant 0 : index + // UNROLL-CLEANUP-LOOP-NEXT: {{.*}} = "foo"(%[[C0]]) : (index) -> i32 + // UNROLL-CLEANUP-LOOP-NEXT: %[[V1:.*]] = affine.apply {{.*}} + // UNROLL-CLEANUP-LOOP-NEXT: {{.*}} = "foo"(%[[V1]]) : (index) -> i32 + // UNROLL-CLEANUP-LOOP-NEXT: %[[V2:.*]] = affine.apply {{.*}} + // UNROLL-CLEANUP-LOOP-NEXT: {{.*}} = "foo"(%[[V2]]) : (index) -> i32 + // UNROLL-CLEANUP-LOOP-NEXT: gpu.return + } +} + // UNROLL-CLEANUP-LOOP-LABEL: func @unroll_cleanup_loop_with_smaller_unroll_factor() func.func @unroll_cleanup_loop_with_smaller_unroll_factor() { affine.for %i = 0 to 7 { From 086146476f07dc285f8e5139faffb1c914f6c8db Mon Sep 17 00:00:00 2001 From: linuxlonelyeagle <2020382038@qq.com> Date: Tue, 11 Feb 2025 10:53:03 +0800 Subject: [PATCH 2/3] use InterfacePass implement it. --- mlir/include/mlir/Dialect/Affine/Passes.h | 3 +- mlir/include/mlir/Dialect/Affine/Passes.td | 2 +- .../Dialect/Affine/Transforms/LoopUnroll.cpp | 73 +++++++++---------- mlir/test/Dialect/Affine/unroll.mlir | 10 +-- mlir/test/Dialect/SCF/loop-unroll.mlir | 6 +- 5 files changed, 44 insertions(+), 50 deletions(-) diff --git a/mlir/include/mlir/Dialect/Affine/Passes.h b/mlir/include/mlir/Dialect/Affine/Passes.h index 37147b079e5d9..098cf386a3860 100644 --- a/mlir/include/mlir/Dialect/Affine/Passes.h +++ b/mlir/include/mlir/Dialect/Affine/Passes.h @@ -19,7 +19,6 @@ namespace mlir { -class ModuleOp; namespace func { class FuncOp; } // namespace func @@ -94,7 +93,7 @@ std::unique_ptr> createLoopTilingPass(); /// factors supplied through other means. If -1 is passed as the unrollFactor /// and no callback is provided, anything passed from the command-line (if at /// all) or the default unroll factor is used (LoopUnroll:kDefaultUnrollFactor). -std::unique_ptr> createLoopUnrollPass( +std::unique_ptr createLoopUnrollPass( int unrollFactor = -1, bool unrollUpToFactor = false, bool unrollFull = false, const std::function &getUnrollFactor = nullptr); diff --git a/mlir/include/mlir/Dialect/Affine/Passes.td b/mlir/include/mlir/Dialect/Affine/Passes.td index d96b50c3e8104..5325d3b0a1d69 100644 --- a/mlir/include/mlir/Dialect/Affine/Passes.td +++ b/mlir/include/mlir/Dialect/Affine/Passes.td @@ -199,7 +199,7 @@ def AffineLoopTiling : Pass<"affine-loop-tile", "func::FuncOp"> { ]; } -def AffineLoopUnroll : Pass<"affine-loop-unroll", "ModuleOp"> { +def AffineLoopUnroll : InterfacePass<"affine-loop-unroll", "FunctionOpInterface"> { let summary = "Unroll affine loops"; let constructor = "mlir::affine::createLoopUnrollPass()"; let options = [ diff --git a/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp index 4dc9809574115..da66af2d54295 100644 --- a/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp @@ -19,7 +19,6 @@ #include "mlir/IR/AffineExpr.h" #include "mlir/IR/AffineMap.h" #include "mlir/IR/Builders.h" -#include "mlir/IR/BuiltinOps.h" #include "llvm/ADT/DenseMap.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -92,44 +91,40 @@ static void gatherInnermostLoops(FunctionOpInterface f, } void LoopUnroll::runOnOperation() { - mlir::ModuleOp module = getOperation(); - SmallVector funcOps; - module.walk([&](FunctionOpInterface func) { funcOps.push_back(func); }); - for (auto func : funcOps) { - if (func.isExternal()) - return; - - if (unrollFull && unrollFullThreshold.hasValue()) { - // Store short loops as we walk. - SmallVector loops; - - // Gathers all loops with trip count <= minTripCount. Do a post order walk - // so that loops are gathered from innermost to outermost (or else - // unrolling an outer one may delete gathered inner ones). - getOperation().walk([&](AffineForOp forOp) { - std::optional tripCount = getConstantTripCount(forOp); - if (tripCount && *tripCount <= unrollFullThreshold) - loops.push_back(forOp); - }); - for (auto forOp : loops) - (void)loopUnrollFull(forOp); - return; - } - - // If the call back is provided, we will recurse until no loops are found. + FunctionOpInterface func = getOperation(); + if (func.isExternal()) + return; + + if (unrollFull && unrollFullThreshold.hasValue()) { + // Store short loops as we walk. SmallVector loops; - for (unsigned i = 0; i < numRepetitions || getUnrollFactor; i++) { - loops.clear(); - gatherInnermostLoops(func, loops); - if (loops.empty()) - break; - bool unrolled = false; - for (auto forOp : loops) - unrolled |= succeeded(runOnAffineForOp(forOp)); - if (!unrolled) - // Break out if nothing was unrolled. - break; - } + + // Gathers all loops with trip count <= minTripCount. Do a post order walk + // so that loops are gathered from innermost to outermost (or else + // unrolling an outer one may delete gathered inner ones). + getOperation().walk([&](AffineForOp forOp) { + std::optional tripCount = getConstantTripCount(forOp); + if (tripCount && *tripCount <= unrollFullThreshold) + loops.push_back(forOp); + }); + for (auto forOp : loops) + (void)loopUnrollFull(forOp); + return; + } + + // If the call back is provided, we will recurse until no loops are found. + SmallVector loops; + for (unsigned i = 0; i < numRepetitions || getUnrollFactor; i++) { + loops.clear(); + gatherInnermostLoops(func, loops); + if (loops.empty()) + break; + bool unrolled = false; + for (auto forOp : loops) + unrolled |= succeeded(runOnAffineForOp(forOp)); + if (!unrolled) + // Break out if nothing was unrolled. + break; } } @@ -150,7 +145,7 @@ LogicalResult LoopUnroll::runOnAffineForOp(AffineForOp forOp) { cleanUpUnroll); } -std::unique_ptr> mlir::affine::createLoopUnrollPass( +std::unique_ptr mlir::affine::createLoopUnrollPass( int unrollFactor, bool unrollUpToFactor, bool unrollFull, const std::function &getUnrollFactor) { return std::make_unique( diff --git a/mlir/test/Dialect/Affine/unroll.mlir b/mlir/test/Dialect/Affine/unroll.mlir index 43485ca56deeb..3f7920dc1eeb3 100644 --- a/mlir/test/Dialect/Affine/unroll.mlir +++ b/mlir/test/Dialect/Affine/unroll.mlir @@ -1,8 +1,8 @@ -// RUN: mlir-opt -allow-unregistered-dialect %s -affine-loop-unroll="unroll-full" | FileCheck %s --check-prefix UNROLL-FULL -// RUN: mlir-opt -allow-unregistered-dialect %s -affine-loop-unroll="unroll-full unroll-full-threshold=2" | FileCheck %s --check-prefix SHORT -// RUN: mlir-opt -allow-unregistered-dialect %s -affine-loop-unroll="unroll-factor=4" | FileCheck %s --check-prefix UNROLL-BY-4 -// RUN: mlir-opt -allow-unregistered-dialect %s -affine-loop-unroll="unroll-factor=1" | FileCheck %s --check-prefix UNROLL-BY-1 -// RUN: mlir-opt -allow-unregistered-dialect %s -affine-loop-unroll="unroll-factor=5 cleanup-unroll=true" | FileCheck %s --check-prefix UNROLL-CLEANUP-LOOP +// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline="builtin.module(func.func(affine-loop-unroll{unroll-full=true}),gpu.module(gpu.func(affine-loop-unroll{unroll-full=true})))" | FileCheck %s --check-prefix UNROLL-FULL +// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline="builtin.module(func.func(affine-loop-unroll{unroll-full=true unroll-full-threshold=2}),gpu.module(gpu.func(affine-loop-unroll{unroll-full=true unroll-full-threshold=2})))" | FileCheck %s --check-prefix SHORT +// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline="builtin.module(func.func(affine-loop-unroll{unroll-factor=4}),gpu.module(gpu.func(affine-loop-unroll{unroll-factor=4})))" | FileCheck %s --check-prefix UNROLL-BY-4 +// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline="builtin.module(func.func(affine-loop-unroll{unroll-factor=1}),gpu.module(gpu.func(affine-loop-unroll{unroll-factor=1})))" | FileCheck %s --check-prefix UNROLL-BY-1 +// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline="builtin.module(func.func(affine-loop-unroll{unroll-factor=5 cleanup-unroll=true}),gpu.module(gpu.func(affine-loop-unroll{unroll-factor=5 cleanup-unroll=true})))" | FileCheck %s --check-prefix UNROLL-CLEANUP-LOOP // UNROLL-FULL-DAG: [[$MAP0:#map[0-9]*]] = affine_map<(d0) -> (d0 + 1)> // UNROLL-FULL-DAG: [[$MAP1:#map[0-9]*]] = affine_map<(d0) -> (d0 + 2)> diff --git a/mlir/test/Dialect/SCF/loop-unroll.mlir b/mlir/test/Dialect/SCF/loop-unroll.mlir index 0368505a1b70d..4c72d9e99d049 100644 --- a/mlir/test/Dialect/SCF/loop-unroll.mlir +++ b/mlir/test/Dialect/SCF/loop-unroll.mlir @@ -3,9 +3,9 @@ // RUN: mlir-opt %s -test-loop-unrolling='unroll-factor=2 loop-depth=0' | FileCheck %s --check-prefix UNROLL-OUTER-BY-2 // RUN: mlir-opt %s -test-loop-unrolling='unroll-factor=2 loop-depth=1' | FileCheck %s --check-prefix UNROLL-INNER-BY-2 // RUN: mlir-opt %s -test-loop-unrolling='unroll-factor=2 annotate=true' | FileCheck %s --check-prefix UNROLL-BY-2-ANNOTATE -// RUN: mlir-opt %s --affine-loop-unroll='unroll-factor=6 unroll-up-to-factor=true' | FileCheck %s --check-prefix UNROLL-UP-TO -// RUN: mlir-opt %s --affine-loop-unroll='unroll-factor=5 cleanup-unroll=true' | FileCheck %s --check-prefix CLEANUP-UNROLL-BY-5 -// RUN: mlir-opt %s --affine-loop-unroll --split-input-file | FileCheck %s +// RUN: mlir-opt %s -pass-pipeline="builtin.module(func.func(affine-loop-unroll{unroll-factor=6 unroll-up-to-factor=true}))" | FileCheck %s --check-prefix UNROLL-UP-TO +// RUN: mlir-opt %s -pass-pipeline="builtin.module(func.func(affine-loop-unroll{unroll-factor=5 cleanup-unroll=true}))" | FileCheck %s --check-prefix CLEANUP-UNROLL-BY-5 +// RUN: mlir-opt %s -pass-pipeline="builtin.module(func.func(affine-loop-unroll))" --split-input-file | FileCheck %s func.func @dynamic_loop_unroll(%arg0 : index, %arg1 : index, %arg2 : index, %arg3: memref) { From 4383806504c84451f82eb9ee72ac0bd7aeaefd40 Mon Sep 17 00:00:00 2001 From: linuxlonelyeagle <2020382038@qq.com> Date: Thu, 13 Feb 2025 11:27:07 +0800 Subject: [PATCH 3/3] update c++ impl and update test. --- mlir/include/mlir/Dialect/Affine/Passes.h | 3 +- .../Dialect/Affine/Transforms/LoopUnroll.cpp | 3 +- mlir/test/Dialect/Affine/unroll.mlir | 110 +++--------------- 3 files changed, 18 insertions(+), 98 deletions(-) diff --git a/mlir/include/mlir/Dialect/Affine/Passes.h b/mlir/include/mlir/Dialect/Affine/Passes.h index 098cf386a3860..ea5034b60d8bd 100644 --- a/mlir/include/mlir/Dialect/Affine/Passes.h +++ b/mlir/include/mlir/Dialect/Affine/Passes.h @@ -14,6 +14,7 @@ #ifndef MLIR_DIALECT_AFFINE_PASSES_H #define MLIR_DIALECT_AFFINE_PASSES_H +#include "mlir/Interfaces/FunctionInterfaces.h" #include "mlir/Pass/Pass.h" #include @@ -93,7 +94,7 @@ std::unique_ptr> createLoopTilingPass(); /// factors supplied through other means. If -1 is passed as the unrollFactor /// and no callback is provided, anything passed from the command-line (if at /// all) or the default unroll factor is used (LoopUnroll:kDefaultUnrollFactor). -std::unique_ptr createLoopUnrollPass( +std::unique_ptr> createLoopUnrollPass( int unrollFactor = -1, bool unrollUpToFactor = false, bool unrollFull = false, const std::function &getUnrollFactor = nullptr); diff --git a/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp index da66af2d54295..7ff77968c61ad 100644 --- a/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp @@ -145,7 +145,8 @@ LogicalResult LoopUnroll::runOnAffineForOp(AffineForOp forOp) { cleanUpUnroll); } -std::unique_ptr mlir::affine::createLoopUnrollPass( +std::unique_ptr> +mlir::affine::createLoopUnrollPass( int unrollFactor, bool unrollUpToFactor, bool unrollFull, const std::function &getUnrollFactor) { return std::make_unique( diff --git a/mlir/test/Dialect/Affine/unroll.mlir b/mlir/test/Dialect/Affine/unroll.mlir index 3f7920dc1eeb3..574e9f41494af 100644 --- a/mlir/test/Dialect/Affine/unroll.mlir +++ b/mlir/test/Dialect/Affine/unroll.mlir @@ -1,8 +1,9 @@ -// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline="builtin.module(func.func(affine-loop-unroll{unroll-full=true}),gpu.module(gpu.func(affine-loop-unroll{unroll-full=true})))" | FileCheck %s --check-prefix UNROLL-FULL -// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline="builtin.module(func.func(affine-loop-unroll{unroll-full=true unroll-full-threshold=2}),gpu.module(gpu.func(affine-loop-unroll{unroll-full=true unroll-full-threshold=2})))" | FileCheck %s --check-prefix SHORT -// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline="builtin.module(func.func(affine-loop-unroll{unroll-factor=4}),gpu.module(gpu.func(affine-loop-unroll{unroll-factor=4})))" | FileCheck %s --check-prefix UNROLL-BY-4 -// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline="builtin.module(func.func(affine-loop-unroll{unroll-factor=1}),gpu.module(gpu.func(affine-loop-unroll{unroll-factor=1})))" | FileCheck %s --check-prefix UNROLL-BY-1 -// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline="builtin.module(func.func(affine-loop-unroll{unroll-factor=5 cleanup-unroll=true}),gpu.module(gpu.func(affine-loop-unroll{unroll-factor=5 cleanup-unroll=true})))" | FileCheck %s --check-prefix UNROLL-CLEANUP-LOOP +// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline="builtin.module(func.func(affine-loop-unroll{unroll-full=true}))" | FileCheck %s --check-prefix UNROLL-FULL +// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline="builtin.module(func.func(affine-loop-unroll{unroll-full=true unroll-full-threshold=2}))" | FileCheck %s --check-prefix SHORT +// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline="builtin.module(func.func(affine-loop-unroll{unroll-factor=4}))" | FileCheck %s --check-prefix UNROLL-BY-4 +// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline="builtin.module(func.func(affine-loop-unroll{unroll-factor=1}))" | FileCheck %s --check-prefix UNROLL-BY-1 +// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline="builtin.module(func.func(affine-loop-unroll{unroll-factor=5 cleanup-unroll=true}))" | FileCheck %s --check-prefix UNROLL-CLEANUP-LOOP +// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline="builtin.module(gpu.module(gpu.func(affine-loop-unroll{unroll-full=true})))" | FileCheck %s --check-prefix GPU-UNROLL-FULL // UNROLL-FULL-DAG: [[$MAP0:#map[0-9]*]] = affine_map<(d0) -> (d0 + 1)> // UNROLL-FULL-DAG: [[$MAP1:#map[0-9]*]] = affine_map<(d0) -> (d0 + 2)> @@ -241,19 +242,19 @@ func.func @loop_nest_unroll_full() { } // UNROLL-FULL } gpu.module @unroll_full { - // UNROLL-FULL-LABEL: func @gpu_loop_nest_simplest() { + // GPU-UNROLL-FULL-LABEL: func @gpu_loop_nest_simplest() { gpu.func @gpu_loop_nest_simplest() { - // UNROLL-FULL: affine.for %arg0 = 0 to 100 step 2 { + // GPU-UNROLL-FULL: affine.for %arg0 = 0 to 100 step 2 { affine.for %i = 0 to 100 step 2 { - // UNROLL-FULL: %c1_i32 = arith.constant 1 : i32 - // UNROLL-FULL-NEXT: %c1_i32_0 = arith.constant 1 : i32 - // UNROLL-FULL-NEXT: %c1_i32_1 = arith.constant 1 : i32 - // UNROLL-FULL-NEXT: %c1_i32_2 = arith.constant 1 : i32 + // GPU-UNROLL-FULL: %c1_i32 = arith.constant 1 : i32 + // GPU-UNROLL-FULL-NEXT: %c1_i32_0 = arith.constant 1 : i32 + // GPU-UNROLL-FULL-NEXT: %c1_i32_1 = arith.constant 1 : i32 + // GPU-UNROLL-FULL-NEXT: %c1_i32_2 = arith.constant 1 : i32 affine.for %j = 0 to 4 { %x = arith.constant 1 : i32 } - } // UNROLL-FULL: } - gpu.return // UNROLL-FULL: return + } // GPU-UNROLL-FULL: } + gpu.return // GPU-UNROLL-FULL: return } } @@ -277,28 +278,6 @@ func.func @loop_nest_outer_unroll() { return // SHORT: return } // SHORT } -gpu.module @short { - // SHORT-LABEL: func @gpu_loop_nest_outer_unroll() { - gpu.func @gpu_loop_nest_outer_unroll() { - // SHORT: affine.for %arg0 = 0 to 4 { - // SHORT-NEXT: %0 = affine.apply [[$MAP0]](%arg0) - // SHORT-NEXT: %1 = "addi32"(%0, %0) : (index, index) -> index - // SHORT-NEXT: } - // SHORT-NEXT: affine.for %arg0 = 0 to 4 { - // SHORT-NEXT: %0 = affine.apply [[$MAP0]](%arg0) - // SHORT-NEXT: %1 = "addi32"(%0, %0) : (index, index) -> index - // SHORT-NEXT: } - affine.for %i = 0 to 2 { - affine.for %j = 0 to 4 { - %x = "affine.apply" (%j) { map = affine_map<(d0) -> (d0 + 1)> } : - (index) -> (index) - %y = "addi32"(%x, %x) : (index, index) -> index - } - } - gpu.return // SHORT: gpu.return - } // SHORT } -} - // We are doing a minimal FileCheck here. We just need this test case to // successfully run. Both %x and %y will get unrolled here as the min trip // count threshold set to 2. @@ -384,37 +363,6 @@ func.func @unroll_unit_stride_no_cleanup() { return } -gpu.module @unroll_by_4{ - // UNROLL-BY-4-LABEL: func @gpu_unroll_unit_stride_no_cleanup() { - gpu.func @gpu_unroll_unit_stride_no_cleanup() { - // UNROLL-BY-4: affine.for %arg0 = 0 to 100 { - affine.for %i = 0 to 100 { - // UNROLL-BY-4: for [[L1:%arg[0-9]+]] = 0 to 8 step 4 { - // UNROLL-BY-4-NEXT: %0 = "addi32"([[L1]], [[L1]]) : (index, index) -> i32 - // UNROLL-BY-4-NEXT: %1 = "addi32"(%0, %0) : (i32, i32) -> i32 - // UNROLL-BY-4-NEXT: %2 = affine.apply #map{{[0-9]*}}([[L1]]) - // UNROLL-BY-4-NEXT: %3 = "addi32"(%2, %2) : (index, index) -> i32 - // UNROLL-BY-4-NEXT: %4 = "addi32"(%3, %3) : (i32, i32) -> i32 - // UNROLL-BY-4-NEXT: %5 = affine.apply #map{{[0-9]*}}([[L1]]) - // UNROLL-BY-4-NEXT: %6 = "addi32"(%5, %5) : (index, index) -> i32 - // UNROLL-BY-4-NEXT: %7 = "addi32"(%6, %6) : (i32, i32) -> i32 - // UNROLL-BY-4-NEXT: %8 = affine.apply #map{{[0-9]*}}([[L1]]) - // UNROLL-BY-4-NEXT: %9 = "addi32"(%8, %8) : (index, index) -> i32 - // UNROLL-BY-4-NEXT: %10 = "addi32"(%9, %9) : (i32, i32) -> i32 - // UNROLL-BY-4-NEXT: } - affine.for %j = 0 to 8 { - %x = "addi32"(%j, %j) : (index, index) -> i32 - %y = "addi32"(%x, %x) : (i32, i32) -> i32 - } - // empty loop - // UNROLL-BY-4: affine.for %arg1 = 0 to 8 { - affine.for %k = 0 to 8 { - } - } - gpu.return - } -} - // UNROLL-BY-4-LABEL: func @unroll_unit_stride_cleanup() { func.func @unroll_unit_stride_cleanup() { // UNROLL-BY-4: affine.for %arg0 = 0 to 100 { @@ -702,19 +650,6 @@ func.func @unroll_by_one_should_promote_single_iteration_loop() { // UNROLL-BY-1-NEXT: return } -gpu.module @unroll_by_1 { - // UNROLL-BY-1-LABEL: func @gpu_unroll_by_one_should_promote_single_iteration_loop() - gpu.func @gpu_unroll_by_one_should_promote_single_iteration_loop() { - affine.for %i = 0 to 1 { - %x = "foo"(%i) : (index) -> i32 - } - gpu.return - // UNROLL-BY-1-NEXT: %c0 = arith.constant 0 : index - // UNROLL-BY-1-NEXT: %0 = "foo"(%c0) : (index) -> i32 - // UNROLL-BY-1-NEXT: gpu.return - } -} - // Test unrolling with affine.for iter_args. // UNROLL-BY-4-LABEL: loop_unroll_with_iter_args_and_cleanup @@ -789,23 +724,6 @@ func.func @unroll_cleanup_loop_with_larger_unroll_factor() { // UNROLL-CLEANUP-LOOP-NEXT: return } -gpu.module @unroll_cleanup_loop { - // UNROLL-CLEANUP-LOOP-LABEL: func @gpu_unroll_cleanup_loop_with_larger_unroll_factor() - gpu.func @gpu_unroll_cleanup_loop_with_larger_unroll_factor() { - affine.for %i = 0 to 3 { - %x = "foo"(%i) : (index) -> i32 - } - gpu.return - // UNROLL-CLEANUP-LOOP-NEXT: %[[C0:.*]] = arith.constant 0 : index - // UNROLL-CLEANUP-LOOP-NEXT: {{.*}} = "foo"(%[[C0]]) : (index) -> i32 - // UNROLL-CLEANUP-LOOP-NEXT: %[[V1:.*]] = affine.apply {{.*}} - // UNROLL-CLEANUP-LOOP-NEXT: {{.*}} = "foo"(%[[V1]]) : (index) -> i32 - // UNROLL-CLEANUP-LOOP-NEXT: %[[V2:.*]] = affine.apply {{.*}} - // UNROLL-CLEANUP-LOOP-NEXT: {{.*}} = "foo"(%[[V2]]) : (index) -> i32 - // UNROLL-CLEANUP-LOOP-NEXT: gpu.return - } -} - // UNROLL-CLEANUP-LOOP-LABEL: func @unroll_cleanup_loop_with_smaller_unroll_factor() func.func @unroll_cleanup_loop_with_smaller_unroll_factor() { affine.for %i = 0 to 7 {