From 9f49720edab27dc1d08e00b1681e72a07af0fe92 Mon Sep 17 00:00:00 2001 From: pvanhout Date: Tue, 15 Apr 2025 10:13:35 +0200 Subject: [PATCH 1/3] [AMDGPU][SplitModule] Do not create empty modules Skip creating a module if no function is going to be imported. Also includes a change so that if the first partition is empty (which can happen), we import global with non-local linkage into the first non-empty partition, instead of P0 all the time. I thought we'd need to change users of the SplitModule callback so they can deal with less modules than the number requested, but no. We already return only 1 module in some cases and it seems to be handled just fine. --- llvm/include/llvm/Target/TargetMachine.h | 3 +++ llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp | 15 ++++++++++++- .../address-taken-externalize-with-call.ll | 12 ++++------ .../AMDGPU/large-kernels-merging-weak_odr.ll | 22 +++++++++---------- .../AMDGPU/large-kernels-merging.ll | 22 +++++++++---------- .../llvm-split/AMDGPU/preserve-globals.ll | 21 ++++++++++++++++++ 6 files changed, 62 insertions(+), 33 deletions(-) create mode 100644 llvm/test/tools/llvm-split/AMDGPU/preserve-globals.ll diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h index 5a16c9cafcd7a..566e7dba6792b 100644 --- a/llvm/include/llvm/Target/TargetMachine.h +++ b/llvm/include/llvm/Target/TargetMachine.h @@ -451,6 +451,9 @@ class TargetMachine { /// Entry point for module splitting. Targets can implement custom module /// splitting logic, mainly used by LTO for --lto-partitions. /// + /// On success, this guarantees that between 1 and \p NumParts modules were + /// created and passed to \p ModuleCallBack. + /// /// \returns `true` if the module was split, `false` otherwise. When `false` /// is returned, it is assumed that \p ModuleCallback has never been called /// and \p M has not been modified. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp index dd3bec774ec67..b7b9814b93427 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp @@ -1478,6 +1478,10 @@ static void splitAMDGPUModule( << "' - Partition summaries will not be printed\n"; } + // One module will import all GlobalValues that are not Functions + // and are not subject to conservative import. + bool ImportAllGVs = true; + for (unsigned PID = 0; PID < NumParts; ++PID) { SplitModuleTimer SMT2("modules_creation", "creating modules for each partition"); @@ -1487,6 +1491,13 @@ static void splitAMDGPUModule( for (unsigned NodeID : (*Proposal)[PID].set_bits()) FnsInPart.insert(&SG.getNode(NodeID).getFunction()); + // Don't create empty module, except for PID 0 because + if (FnsInPart.empty()) { + LLVM_DEBUG(dbgs() << "[split] P" << PID + << " is empty, not creating module\n"); + continue; + } + ValueToValueMapTy VMap; CostType PartCost = 0; std::unique_ptr MPart( @@ -1501,9 +1512,11 @@ static void splitAMDGPUModule( } // Everything else goes in the first partition. - return needsConservativeImport(GV) || PID == 0; + return needsConservativeImport(GV) || ImportAllGVs; })); + ImportAllGVs = false; + // FIXME: Aliases aren't seen often, and their handling isn't perfect so // bugs are possible. diff --git a/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll b/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll index 708b5a006be60..0f20a73b1928e 100644 --- a/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll +++ b/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll @@ -1,7 +1,6 @@ ; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-threshold=0 ; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s ; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s ; 3 kernels: ; - A does a direct call to HelperA @@ -11,14 +10,11 @@ ; The helper functions will get externalized, so C/A will end up ; in the same partition. -; P0 is empty. -; CHECK0: declare +; CHECK0: define amdgpu_kernel void @B(ptr %dst) -; CHECK1: define amdgpu_kernel void @B(ptr %dst) - -; CHECK2: define hidden void @HelperA() -; CHECK2: define amdgpu_kernel void @A() -; CHECK2: define amdgpu_kernel void @C() +; CHECK1: define hidden void @HelperA() +; CHECK1: define amdgpu_kernel void @A() +; CHECK1: define amdgpu_kernel void @C() define internal void @HelperA() { ret void diff --git a/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging-weak_odr.ll b/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging-weak_odr.ll index 839688e7feb8b..567275686fb9f 100644 --- a/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging-weak_odr.ll +++ b/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging-weak_odr.ll @@ -1,7 +1,6 @@ ; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-max-depth=0 -amdgpu-module-splitting-large-threshold=1.2 -amdgpu-module-splitting-merge-threshold=0.5 ; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s ; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s ; RUN: llvm-split -o %t.nolarge %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-threshold=0 -amdgpu-module-splitting-max-depth=0 ; RUN: llvm-dis -o - %t.nolarge0 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK0 --implicit-check-not=define %s @@ -15,19 +14,18 @@ ; Also check w/o large kernels processing to verify they are indeed handled ; differently. -; P0 is empty -; CHECK0: declare +; Only two partitions created for the first command. -; CHECK1: define internal void @HelperC() -; CHECK1: define weak_odr amdgpu_kernel void @C +; CHECK0: define internal void @HelperC() +; CHECK0: define weak_odr amdgpu_kernel void @C -; CHECK2: define internal void @large2() -; CHECK2: define internal void @large1() -; CHECK2: define internal void @large0() -; CHECK2: define internal void @HelperA() -; CHECK2: define internal void @HelperB() -; CHECK2: define amdgpu_kernel void @A -; CHECK2: define weak_odr amdgpu_kernel void @B +; CHECK1: define internal void @large2() +; CHECK1: define internal void @large1() +; CHECK1: define internal void @large0() +; CHECK1: define internal void @HelperA() +; CHECK1: define internal void @HelperB() +; CHECK1: define amdgpu_kernel void @A +; CHECK1: define weak_odr amdgpu_kernel void @B ; NOLARGEKERNELS-CHECK0: define internal void @HelperC() ; NOLARGEKERNELS-CHECK0: define weak_odr amdgpu_kernel void @C diff --git a/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll b/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll index 807fb2e5f33ce..35133d20c4e07 100644 --- a/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll +++ b/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll @@ -1,7 +1,6 @@ ; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-max-depth=0 -amdgpu-module-splitting-large-threshold=1.2 -amdgpu-module-splitting-merge-threshold=0.5 ; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s ; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s ; RUN: llvm-split -o %t.nolarge %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-threshold=0 -amdgpu-module-splitting-max-depth=0 ; RUN: llvm-dis -o - %t.nolarge0 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK0 --implicit-check-not=define %s @@ -15,19 +14,18 @@ ; Also check w/o large kernels processing to verify they are indeed handled ; differently. -; P0 is empty -; CHECK0: declare +; Only 2 partitions for the first command. -; CHECK1: define internal void @HelperC() -; CHECK1: define amdgpu_kernel void @C +; CHECK0: define internal void @HelperC() +; CHECK0: define amdgpu_kernel void @C -; CHECK2: define internal void @large2() -; CHECK2: define internal void @large1() -; CHECK2: define internal void @large0() -; CHECK2: define internal void @HelperA() -; CHECK2: define internal void @HelperB() -; CHECK2: define amdgpu_kernel void @A -; CHECK2: define amdgpu_kernel void @B +; CHECK1: define internal void @large2() +; CHECK1: define internal void @large1() +; CHECK1: define internal void @large0() +; CHECK1: define internal void @HelperA() +; CHECK1: define internal void @HelperB() +; CHECK1: define amdgpu_kernel void @A +; CHECK1: define amdgpu_kernel void @B ; NOLARGEKERNELS-CHECK0: define internal void @HelperC() ; NOLARGEKERNELS-CHECK0: define amdgpu_kernel void @C diff --git a/llvm/test/tools/llvm-split/AMDGPU/preserve-globals.ll b/llvm/test/tools/llvm-split/AMDGPU/preserve-globals.ll new file mode 100644 index 0000000000000..091010edd6be3 --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/preserve-globals.ll @@ -0,0 +1,21 @@ +; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-max-depth=0 -amdgpu-module-splitting-large-threshold=1.2 -amdgpu-module-splitting-merge-threshold=0.5 +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s + +; Only 2 out of 3 partitions are created, check the external global is preserved in the first partition. + +; CHECK0: @foobar = linkonce_odr global i64 52 +; CHECK0: define amdgpu_kernel void @B + +; CHECK1-NOT: @foobar = linkonce_odr global i64 52 +; CHECK1: define amdgpu_kernel void @A + +@foobar = linkonce_odr global i64 52 + +define amdgpu_kernel void @A() { + ret void +} + +define amdgpu_kernel void @B() { + ret void +} From c58d21d816a31a03bdc20c91ffe62f14a01121c0 Mon Sep 17 00:00:00 2001 From: pvanhout Date: Wed, 16 Apr 2025 11:49:10 +0200 Subject: [PATCH 2/3] comments --- llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp index b7b9814b93427..ca60aa78b8b36 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp @@ -1491,7 +1491,7 @@ static void splitAMDGPUModule( for (unsigned NodeID : (*Proposal)[PID].set_bits()) FnsInPart.insert(&SG.getNode(NodeID).getFunction()); - // Don't create empty module, except for PID 0 because + // Don't create empty modules. if (FnsInPart.empty()) { LLVM_DEBUG(dbgs() << "[split] P" << PID << " is empty, not creating module\n"); @@ -1511,7 +1511,7 @@ static void splitAMDGPUModule( return false; } - // Everything else goes in the first partition. + // Everything else goes in the first non-empty module we create. return needsConservativeImport(GV) || ImportAllGVs; })); From 0f680c80daa1c08b33b2e66af58e6bbdbc3d9a62 Mon Sep 17 00:00:00 2001 From: pvanhout Date: Tue, 22 Apr 2025 09:48:29 +0200 Subject: [PATCH 3/3] comments --- llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp index ca60aa78b8b36..fea1db04aee11 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp @@ -1512,7 +1512,7 @@ static void splitAMDGPUModule( } // Everything else goes in the first non-empty module we create. - return needsConservativeImport(GV) || ImportAllGVs; + return ImportAllGVs || needsConservativeImport(GV); })); ImportAllGVs = false;