Skip to content

Conversation

@teresajohnson
Copy link
Contributor

Prepare for usage in the bitcode reader/writer where we already have a
LinearFrameId:

  • templatize input frame id type in CallStackRadixTreeBuilder
  • templatize input frame id type in computeFrameHistogram
  • make the map from FrameId to LinearFrameId optional

We plan to use the same radix format in the ThinLTO summary records,
where we already have a LinearFrameId.

Prepare for usage in the bitcode reader/writer where we already have a
LinearFrameId:
- templatize input frame id type in CallStackRadixTreeBuilder
- templatize input frame id type in computeFrameHistogram
- make the map from FrameId to LinearFrameId optional

We plan to use the same radix format in the ThinLTO summary records,
where we already have a LinearFrameId.
@llvmbot llvmbot added the PGO Profile Guided Optimizations label Nov 20, 2024
@llvmbot
Copy link
Member

llvmbot commented Nov 20, 2024

@llvm/pr-subscribers-pgo

Author: Teresa Johnson (teresajohnson)

Changes

Prepare for usage in the bitcode reader/writer where we already have a
LinearFrameId:

  • templatize input frame id type in CallStackRadixTreeBuilder
  • templatize input frame id type in computeFrameHistogram
  • make the map from FrameId to LinearFrameId optional

We plan to use the same radix format in the ThinLTO summary records,
where we already have a LinearFrameId.


Full diff: https://github.com/llvm/llvm-project/pull/117014.diff

4 Files Affected:

  • (modified) llvm/include/llvm/ProfileData/MemProf.h (+14-11)
  • (modified) llvm/lib/ProfileData/InstrProfWriter.cpp (+1-1)
  • (modified) llvm/lib/ProfileData/MemProf.cpp (+29-15)
  • (modified) llvm/unittests/ProfileData/MemProfTest.cpp (+8-8)
diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h
index 9415e554bcc0ab..f97fbd4bd64419 100644
--- a/llvm/include/llvm/ProfileData/MemProf.h
+++ b/llvm/include/llvm/ProfileData/MemProf.h
@@ -1050,8 +1050,9 @@ struct FrameStat {
 };
 
 // Compute a histogram of Frames in call stacks.
-llvm::DenseMap<FrameId, FrameStat>
-computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
+template <typename FrameIdTy>
+llvm::DenseMap<FrameIdTy, FrameStat>
+computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>>
                           &MemProfCallStackData);
 
 // Construct a radix tree of call stacks.
@@ -1109,7 +1110,7 @@ computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
 // On-disk IndexedMemProfRecord will refer to call stacks by their indexes into
 // the radix tree array, so we do not explicitly encode mappings like:
 // "CallStackId 1 -> 11".
-class CallStackRadixTreeBuilder {
+template <typename FrameIdTy> class CallStackRadixTreeBuilder {
   // The radix tree array.
   std::vector<LinearFrameId> RadixArray;
 
@@ -1136,23 +1137,25 @@ class CallStackRadixTreeBuilder {
   // RadixArray[Indexes[5 - 1]] is the last frame of the common prefix.
   std::vector<LinearCallStackId> Indexes;
 
-  using CSIdPair = std::pair<CallStackId, llvm::SmallVector<FrameId>>;
+  using CSIdPair = std::pair<CallStackId, llvm::SmallVector<FrameIdTy>>;
 
   // Encode a call stack into RadixArray.  Return the starting index within
   // RadixArray.
-  LinearCallStackId encodeCallStack(
-      const llvm::SmallVector<FrameId> *CallStack,
-      const llvm::SmallVector<FrameId> *Prev,
-      const llvm::DenseMap<FrameId, LinearFrameId> &MemProfFrameIndexes);
+  LinearCallStackId
+  encodeCallStack(const llvm::SmallVector<FrameIdTy> *CallStack,
+                  const llvm::SmallVector<FrameIdTy> *Prev,
+                  std::optional<const llvm::DenseMap<FrameIdTy, LinearFrameId>>
+                      MemProfFrameIndexes);
 
 public:
   CallStackRadixTreeBuilder() = default;
 
   // Build a radix tree array.
-  void build(llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
+  void build(llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>>
                  &&MemProfCallStackData,
-             const llvm::DenseMap<FrameId, LinearFrameId> &MemProfFrameIndexes,
-             llvm::DenseMap<FrameId, FrameStat> &FrameHistogram);
+             std::optional<const llvm::DenseMap<FrameIdTy, LinearFrameId>>
+                 MemProfFrameIndexes,
+             llvm::DenseMap<FrameIdTy, FrameStat> &FrameHistogram);
 
   ArrayRef<LinearFrameId> getRadixArray() const { return RadixArray; }
 
diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp
index 5580fe04ffe7df..d90629ad57f5b9 100644
--- a/llvm/lib/ProfileData/InstrProfWriter.cpp
+++ b/llvm/lib/ProfileData/InstrProfWriter.cpp
@@ -635,7 +635,7 @@ writeMemProfCallStackArray(
   llvm::DenseMap<memprof::CallStackId, memprof::LinearCallStackId>
       MemProfCallStackIndexes;
 
-  memprof::CallStackRadixTreeBuilder Builder;
+  memprof::CallStackRadixTreeBuilder<memprof::FrameId> Builder;
   Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes,
                 FrameHistogram);
   for (auto I : Builder.getRadixArray())
diff --git a/llvm/lib/ProfileData/MemProf.cpp b/llvm/lib/ProfileData/MemProf.cpp
index 6c4bda2d9264ff..9d5ac748d7975d 100644
--- a/llvm/lib/ProfileData/MemProf.cpp
+++ b/llvm/lib/ProfileData/MemProf.cpp
@@ -436,10 +436,12 @@ CallStackId hashCallStack(ArrayRef<FrameId> CS) {
 // To quickly determine the location of the common prefix within RadixArray,
 // Indexes caches the indexes of the previous call stack's frames within
 // RadixArray.
-LinearCallStackId CallStackRadixTreeBuilder::encodeCallStack(
-    const llvm::SmallVector<FrameId> *CallStack,
-    const llvm::SmallVector<FrameId> *Prev,
-    const llvm::DenseMap<FrameId, LinearFrameId> &MemProfFrameIndexes) {
+template <typename FrameIdTy>
+LinearCallStackId CallStackRadixTreeBuilder<FrameIdTy>::encodeCallStack(
+    const llvm::SmallVector<FrameIdTy> *CallStack,
+    const llvm::SmallVector<FrameIdTy> *Prev,
+    std::optional<const llvm::DenseMap<FrameIdTy, LinearFrameId>>
+        MemProfFrameIndexes) {
   // Compute the length of the common root prefix between Prev and CallStack.
   uint32_t CommonLen = 0;
   if (Prev) {
@@ -464,10 +466,11 @@ LinearCallStackId CallStackRadixTreeBuilder::encodeCallStack(
 
   // Copy the part of the call stack beyond the common prefix to RadixArray.
   assert(CommonLen <= CallStack->size());
-  for (FrameId F : llvm::drop_begin(llvm::reverse(*CallStack), CommonLen)) {
+  for (FrameIdTy F : llvm::drop_begin(llvm::reverse(*CallStack), CommonLen)) {
     // Remember the index of F in RadixArray.
     Indexes.push_back(RadixArray.size());
-    RadixArray.push_back(MemProfFrameIndexes.find(F)->second);
+    RadixArray.push_back(
+        MemProfFrameIndexes ? MemProfFrameIndexes->find(F)->second : F);
   }
   assert(CallStack->size() == Indexes.size());
 
@@ -479,11 +482,13 @@ LinearCallStackId CallStackRadixTreeBuilder::encodeCallStack(
   return RadixArray.size() - 1;
 }
 
-void CallStackRadixTreeBuilder::build(
-    llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
+template <typename FrameIdTy>
+void CallStackRadixTreeBuilder<FrameIdTy>::build(
+    llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>>
         &&MemProfCallStackData,
-    const llvm::DenseMap<FrameId, LinearFrameId> &MemProfFrameIndexes,
-    llvm::DenseMap<FrameId, FrameStat> &FrameHistogram) {
+    std::optional<const llvm::DenseMap<FrameIdTy, LinearFrameId>>
+        MemProfFrameIndexes,
+    llvm::DenseMap<FrameIdTy, FrameStat> &FrameHistogram) {
   // Take the vector portion of MemProfCallStackData.  The vector is exactly
   // what we need to sort.  Also, we no longer need its lookup capability.
   llvm::SmallVector<CSIdPair, 0> CallStacks = MemProfCallStackData.takeVector();
@@ -535,7 +540,7 @@ void CallStackRadixTreeBuilder::build(
     // root.
     return std::lexicographical_compare(
         L.second.rbegin(), L.second.rend(), R.second.rbegin(), R.second.rend(),
-        [&](FrameId F1, FrameId F2) {
+        [&](FrameIdTy F1, FrameIdTy F2) {
           uint64_t H1 = FrameHistogram[F1].Count;
           uint64_t H2 = FrameHistogram[F2].Count;
           // Popular frames should come later because we encode call stacks from
@@ -585,7 +590,7 @@ void CallStackRadixTreeBuilder::build(
   // traverse CallStacks in the reverse order, then Call Stack 3 has the
   // complete call stack encoded without any pointers.  Call Stack 1 and 2 point
   // to appropriate prefixes of Call Stack 3.
-  const llvm::SmallVector<FrameId> *Prev = nullptr;
+  const llvm::SmallVector<FrameIdTy> *Prev = nullptr;
   for (const auto &[CSId, CallStack] : llvm::reverse(CallStacks)) {
     LinearCallStackId Pos =
         encodeCallStack(&CallStack, Prev, MemProfFrameIndexes);
@@ -608,10 +613,14 @@ void CallStackRadixTreeBuilder::build(
     V = RadixArray.size() - 1 - V;
 }
 
-llvm::DenseMap<FrameId, FrameStat>
-computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
+// Explicitly instantiate class with the utilized FrameIdTy.
+template class CallStackRadixTreeBuilder<FrameId>;
+
+template <typename FrameIdTy>
+llvm::DenseMap<FrameIdTy, FrameStat>
+computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>>
                           &MemProfCallStackData) {
-  llvm::DenseMap<FrameId, FrameStat> Histogram;
+  llvm::DenseMap<FrameIdTy, FrameStat> Histogram;
 
   for (const auto &KV : MemProfCallStackData) {
     const auto &CS = KV.second;
@@ -624,6 +633,11 @@ computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
   return Histogram;
 }
 
+// Explicitly instantiate function with the utilized FrameIdTy.
+template llvm::DenseMap<FrameId, FrameStat> computeFrameHistogram<FrameId>(
+    llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
+        &MemProfCallStackData);
+
 void verifyIndexedMemProfRecord(const IndexedMemProfRecord &Record) {
   for (const auto &AS : Record.AllocSites) {
     assert(AS.CSId == hashCallStack(AS.CallStack));
diff --git a/llvm/unittests/ProfileData/MemProfTest.cpp b/llvm/unittests/ProfileData/MemProfTest.cpp
index e5d41db06dcc07..e68cc094338736 100644
--- a/llvm/unittests/ProfileData/MemProfTest.cpp
+++ b/llvm/unittests/ProfileData/MemProfTest.cpp
@@ -659,8 +659,8 @@ TEST(MemProf, RadixTreeBuilderEmpty) {
   llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>> MemProfCallStackData;
   llvm::DenseMap<llvm::memprof::FrameId, llvm::memprof::FrameStat>
       FrameHistogram =
-          llvm::memprof::computeFrameHistogram(MemProfCallStackData);
-  llvm::memprof::CallStackRadixTreeBuilder Builder;
+          llvm::memprof::computeFrameHistogram<FrameId>(MemProfCallStackData);
+  llvm::memprof::CallStackRadixTreeBuilder<FrameId> Builder;
   Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes,
                 FrameHistogram);
   ASSERT_THAT(Builder.getRadixArray(), testing::IsEmpty());
@@ -677,8 +677,8 @@ TEST(MemProf, RadixTreeBuilderOne) {
   MemProfCallStackData.insert({llvm::memprof::hashCallStack(CS1), CS1});
   llvm::DenseMap<llvm::memprof::FrameId, llvm::memprof::FrameStat>
       FrameHistogram =
-          llvm::memprof::computeFrameHistogram(MemProfCallStackData);
-  llvm::memprof::CallStackRadixTreeBuilder Builder;
+          llvm::memprof::computeFrameHistogram<FrameId>(MemProfCallStackData);
+  llvm::memprof::CallStackRadixTreeBuilder<FrameId> Builder;
   Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes,
                 FrameHistogram);
   EXPECT_THAT(Builder.getRadixArray(), testing::ElementsAreArray({
@@ -704,8 +704,8 @@ TEST(MemProf, RadixTreeBuilderTwo) {
   MemProfCallStackData.insert({llvm::memprof::hashCallStack(CS2), CS2});
   llvm::DenseMap<llvm::memprof::FrameId, llvm::memprof::FrameStat>
       FrameHistogram =
-          llvm::memprof::computeFrameHistogram(MemProfCallStackData);
-  llvm::memprof::CallStackRadixTreeBuilder Builder;
+          llvm::memprof::computeFrameHistogram<FrameId>(MemProfCallStackData);
+  llvm::memprof::CallStackRadixTreeBuilder<FrameId> Builder;
   Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes,
                 FrameHistogram);
   EXPECT_THAT(Builder.getRadixArray(),
@@ -742,8 +742,8 @@ TEST(MemProf, RadixTreeBuilderSuccessiveJumps) {
   MemProfCallStackData.insert({llvm::memprof::hashCallStack(CS4), CS4});
   llvm::DenseMap<llvm::memprof::FrameId, llvm::memprof::FrameStat>
       FrameHistogram =
-          llvm::memprof::computeFrameHistogram(MemProfCallStackData);
-  llvm::memprof::CallStackRadixTreeBuilder Builder;
+          llvm::memprof::computeFrameHistogram<FrameId>(MemProfCallStackData);
+  llvm::memprof::CallStackRadixTreeBuilder<FrameId> Builder;
   Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes,
                 FrameHistogram);
   EXPECT_THAT(Builder.getRadixArray(),

Copy link
Contributor

@kazutakahirata kazutakahirata left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM. Thanks!

@teresajohnson teresajohnson merged commit e14827f into llvm:main Nov 20, 2024
7 of 9 checks passed
@llvm-ci
Copy link
Collaborator

llvm-ci commented Nov 20, 2024

LLVM Buildbot has detected a new failure on builder openmp-offload-amdgpu-runtime running on omp-vega20-0 while building llvm at step 7 "Add check check-offload".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/30/builds/10670

Here is the relevant piece of the build log for the reference
Step 7 (Add check check-offload) failure: test (failure)
******************** TEST 'libomptarget :: amdgcn-amd-amdhsa :: offloading/pgo1.c' FAILED ********************
Exit Code: 1

Command Output (stdout):
--
# RUN: at line 1
/home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./bin/clang -fopenmp    -I /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/offload/test -I /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -L /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -L /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./lib -L /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src  -nogpulib -Wl,-rpath,/home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -Wl,-rpath,/home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -Wl,-rpath,/home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./lib  -fopenmp-targets=amdgcn-amd-amdhsa /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/offload/test/offloading/pgo1.c -o /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/offloading/Output/pgo1.c.tmp /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./lib/libomptarget.devicertl.a -fprofile-instr-generate      -Xclang "-fprofile-instrument=clang"
# executed command: /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./bin/clang -fopenmp -I /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/offload/test -I /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -L /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -L /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./lib -L /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -nogpulib -Wl,-rpath,/home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -Wl,-rpath,/home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -Wl,-rpath,/home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./lib -fopenmp-targets=amdgcn-amd-amdhsa /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/offload/test/offloading/pgo1.c -o /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/offloading/Output/pgo1.c.tmp /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./lib/libomptarget.devicertl.a -fprofile-instr-generate -Xclang -fprofile-instrument=clang
# note: command had no output on stdout or stderr
# RUN: at line 3
/home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/offloading/Output/pgo1.c.tmp 2>&1 | /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./bin/FileCheck /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/offload/test/offloading/pgo1.c      --check-prefix="CLANG-PGO"
# executed command: /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/offloading/Output/pgo1.c.tmp
# note: command had no output on stdout or stderr
# executed command: /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./bin/FileCheck /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/offload/test/offloading/pgo1.c --check-prefix=CLANG-PGO
# .---command stderr------------
# | /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/offload/test/offloading/pgo1.c:32:20: error: CLANG-PGO-NEXT: expected string not found in input
# | // CLANG-PGO-NEXT: [ 0 11 20 ]
# |                    ^
# | <stdin>:3:28: note: scanning from here
# | ======== Counters =========
# |                            ^
# | <stdin>:4:1: note: possible intended match here
# | [ 0 12 20 ]
# | ^
# | 
# | Input file: <stdin>
# | Check file: /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/offload/test/offloading/pgo1.c
# | 
# | -dump-input=help explains the following input dump.
# | 
# | Input was:
# | <<<<<<
# |            1: ======= GPU Profile ======= 
# |            2: Target: amdgcn-amd-amdhsa 
# |            3: ======== Counters ========= 
# | next:32'0                                X error: no match found
# |            4: [ 0 12 20 ] 
# | next:32'0     ~~~~~~~~~~~~
# | next:32'1     ?            possible intended match
# |            5: [ 10 ] 
# | next:32'0     ~~~~~~~
# |            6: [ 20 ] 
# | next:32'0     ~~~~~~~
# |            7: ========== Data =========== 
# | next:32'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            8: { 10367987278331647071 4749112401 0xffffffffffffffd8 0x0 0x0 0x0 3 0 0 } 
# | next:32'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            9: { 3666282617048535130 24 0xffffffffffffffb0 0x0 0x0 0x0 1 0 0 } 
# | next:32'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            .
...

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

PGO Profile Guided Optimizations

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants