[Instrumentation][Proton] Add MLIR/LLVM level compiler instrumentation pass support in Proton (triton-lang#5067)

CRobeck · web-flow · commit d06ec83c1c37 · 2024-11-08T12:56:17.000-06:00
Basic functionality to print load/store address spaces chosen by the
compiler. Usage/example with matmul Proton tutorial:

```
$ proton --instrument=print-mem-spaces matmul.py
0     matmul_kernel     matmul.py:180:20     SHARED     STORE
1     matmul_kernel     matmul.py:181:20     SHARED     STORE
2     matmul_kernel     matmul.py:180:20     SHARED     LOAD
3     matmul_kernel     matmul.py:181:20     SHARED     LOAD

matmul-performance:
        M       N       K     cuBLAS     Triton
0   256.0   256.0   256.0   2.231013   1.691252
1   384.0   384.0   384.0   5.947805   4.626071
2   512.0   512.0   512.0  12.336188   8.924051
3   640.0   640.0   640.0  26.006348  14.628980
4   768.0   768.0   768.0  36.065672  20.972006
5   896.0   896.0   896.0  51.974214  29.480457
6  1024.0  1024.0  1024.0  63.913206  27.560463
7  1152.0  1152.0  1152.0  52.790876  34.125533
```
diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml
@@ -279,8 +279,9 @@ jobs:
           ctest -j32
       - name: Run Proton tests
         run: |
-          cd third_party/proton
-          python3 -m pytest -s test
+          cd third_party/proton/test
+          python3 -m pytest -s .
+          cd ..
       - # If we're on branch `main`, save the ccache Triton compilation artifacts
         # to the cache so they can be used by other (non-main) CI runs.
         #
@@ -425,8 +426,9 @@ jobs:
           python3 -m pytest -s -n 8 ./test_cast_matmul.py
       - name: Run Proton tests
         run: |
-          cd third_party/proton
-          python3 -m pytest -s test
+          cd third_party/proton/test
+          python3 -m pytest -s .
+          cd ..
       - name: Run C++ unittests
         run: |
           cd python
diff --git a/.github/workflows/integration-tests.yml.in b/.github/workflows/integration-tests.yml.in
@@ -319,8 +319,9 @@ jobs:
       - &run-proton-tests-step
         name: Run Proton tests
         run: |
-          cd third_party/proton
-          python3 -m pytest -s test
+          cd third_party/proton/test
+          python3 -m pytest -s .
+          cd ..
 
       # If we're on branch `main`, save the ccache Triton compilation artifacts
       # to the cache so they can be used by other (non-main) CI runs.
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
@@ -3,3 +3,4 @@ add_subdirectory(Conversion)
 add_subdirectory(Dialect)
 add_subdirectory(Target)
 add_subdirectory(Tools)
+add_subdirectory(Instrumentation)
diff --git a/lib/Instrumentation/CMakeLists.txt b/lib/Instrumentation/CMakeLists.txt
@@ -0,0 +1,40 @@
+# Instrumentation passes to build; each entry becomes one shared library.
+set(GPU_INSTRUMENTATION_PASSES
+    PrintLoadStoreMemSpaces
+)
+
+# Sources of each pass are looked up through a <pass>_SOURCES variable.
+set(PrintLoadStoreMemSpaces_SOURCES
+    PrintLoadStoreMemSpaces.cpp
+)
+
+foreach(plugin IN LISTS GPU_INSTRUMENTATION_PASSES)
+  add_library(${plugin} SHARED ${${plugin}_SOURCES})
+
+  target_link_libraries(${plugin}
+    PRIVATE
+      LLVMCore
+      LLVMSupport
+      LLVMTransformUtils
+      "$<$<PLATFORM_ID:Darwin>:-undefined dynamic_lookup>"
+  )
+
+  # CMAKE_LIBRARY_OUTPUT_DIRECTORY is only set during the Python build and
+  # is empty when building directly from the root CMakeLists.txt file.
+  # Outside the Python build, keep CMake's default shared-library location;
+  # redirecting the output unconditionally causes a hard build error.
+  if(DEFINED CMAKE_LIBRARY_OUTPUT_DIRECTORY)
+    set_target_properties(${plugin} PROPERTIES
+      LIBRARY_OUTPUT_DIRECTORY
+        "${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/../instrumentation")
+  endif()
+
+  # The top-level CMake file sets -fvisibility=hidden, which hides the
+  # llvmGetPassPluginInfo symbol and leads to an "entry point not found"
+  # error. Restore default visibility for this target only.
+  target_compile_options(${plugin} PRIVATE -fvisibility=default -fno-rtti)
+endforeach()
diff --git a/lib/Instrumentation/PrintLoadStoreMemSpaces.cpp b/lib/Instrumentation/PrintLoadStoreMemSpaces.cpp
@@ -0,0 +1,101 @@
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Passes/PassPlugin.h"
+#include <map>
+
+using namespace llvm;
+
+namespace {
+
+/// Module pass whose work is done by runOnModule(); run() adapts that
+/// member to the new pass manager interface.
+struct LoadStoreMemSpace : public PassInfoMixin<LoadStoreMemSpace> {
+  /// Processes `module`; run() treats a true result as "code was modified".
+  bool runOnModule(llvm::Module &module);
+
+  /// Pass-manager entry point: reports that no analyses are preserved when
+  /// runOnModule() returns true, and that all are preserved otherwise.
+  PreservedAnalyses run(llvm::Module &module, ModuleAnalysisManager &) {
+    if (runOnModule(module))
+      return llvm::PreservedAnalyses::none();
+    return llvm::PreservedAnalyses::all();
+  }
+
+  /// Marking the pass as required keeps it from being skipped for code
+  /// that carries the optnone LLVM attribute.
+  static bool isRequired() { return true; }
+};
+
+} // end anonymous namespace
+
+// Display names for the numeric address spaces this pass knows how to label.
+// NOTE(review): the numbering presumably follows the GPU backends' address
+// space convention -- confirm against the targets in use.
+std::map<int, std::string> AddrSpaceMap = {
+    {0, "FLAT"}, {1, "GLOBAL"}, {3, "SHARED"}, {4, "CONSTANT"}, {5, "SCRATCH"}};
+
+// Maps every report line already printed (function, source location, address
+// space and operation) to the counter value it was printed with, so that each
+// distinct line is emitted only once.
+std::map<std::string, uint32_t> LocationCounterSourceMap;
+
+/// Returns "LOAD" or "STORE" for the instruction that \p I points at.
+///
+/// \throws std::runtime_error when the instruction is neither a load nor a
+/// store.
+std::string LoadOrStoreMap(const BasicBlock::iterator &I) {
+  // Only the instruction's kind matters here, so use the isa<> type test
+  // instead of dyn_cast<>, which bound pointers that were never read.
+  const Instruction *Inst = &*I;
+  if (isa<LoadInst>(Inst))
+    return "LOAD";
+  if (isa<StoreInst>(Inst))
+    return "STORE";
+  throw std::runtime_error("Error: unknown operation type");
+}
+/// Prints one report line for the load/store at \p I the first time its
+/// combination of function, source location, address space and operation
+/// is seen.
+///
+/// \tparam LoadOrStoreInst instruction class to match (LoadInst or
+///         StoreInst); instructions of any other class are ignored.
+/// \param I               iterator pointing at the instruction to inspect.
+/// \param F               function containing the instruction; its name is
+///                        part of the report line.
+/// \param M               enclosing module (currently unused).
+/// \param LocationCounter running index printed in front of each new report
+///                        line; incremented whenever a line is emitted.
+template <typename LoadOrStoreInst>
+void InstrumentationFunction(const BasicBlock::iterator &I, const Function &F,
+                             const llvm::Module &M, uint32_t &LocationCounter) {
+  const auto *LSI = dyn_cast<LoadOrStoreInst>(&*I);
+  if (!LSI)
+    return;
+  const Value *Op = LSI->getPointerOperand()->stripPointerCasts();
+  uint32_t AddrSpace = cast<PointerType>(Op->getType())->getAddressSpace();
+
+  std::string SourceAndAddrSpaceInfo;
+  raw_string_ostream OS(SourceAndAddrSpaceInfo);
+  OS << F.getName() << "     ";
+
+  // An instruction may carry no debug location at all; dereferencing the
+  // null DILocation would crash the compiler, so print a placeholder.
+  if (const DILocation *DL = LSI->getDebugLoc().get())
+    OS << DL->getFilename() << ":" << DL->getLine() << ":" << DL->getColumn();
+  else
+    OS << "<unknown>";
+  OS << "     ";
+
+  // Look the name up with find(): operator[] would insert an empty entry
+  // for address spaces missing from the table and print a blank column.
+  auto SpaceIt = AddrSpaceMap.find(AddrSpace);
+  if (SpaceIt != AddrSpaceMap.end())
+    OS << SpaceIt->second;
+  else
+    OS << "UNKNOWN(" << AddrSpace << ")";
+
+  OS << "     " << LoadOrStoreMap(I);
+  OS.flush();
+
+  // emplace() only inserts keys that are not present yet, which gives the
+  // "report once" behavior with a single map lookup.
+  if (LocationCounterSourceMap.emplace(SourceAndAddrSpaceInfo, LocationCounter)
+          .second) {
+    errs() << LocationCounter << "     " << SourceAndAddrSpaceInfo << "\n";
+    LocationCounter++;
+  }
+}
+
+/// Hands every load and store found in the module's kernel functions to
+/// InstrumentationFunction().
+///
+/// Intrinsics are skipped. A function is treated as a kernel when it uses
+/// the AMDGPU_KERNEL or PTX_Kernel calling convention, or when its name
+/// contains "kernel". Always returns false.
+bool LoadStoreMemSpace::runOnModule(Module &M) {
+  uint32_t NextLocationId = 0;
+  for (Function &Fn : M) {
+    if (Fn.isIntrinsic())
+      continue;
+
+    const auto Convention = Fn.getCallingConv();
+    const bool IsKernel = Convention == CallingConv::AMDGPU_KERNEL ||
+                          Convention == CallingConv::PTX_Kernel ||
+                          Fn.getName().contains("kernel");
+    if (!IsKernel)
+      continue;
+
+    for (BasicBlock &Block : Fn) {
+      for (BasicBlock::iterator It = Block.begin(), End = Block.end();
+           It != End; ++It) {
+        if (isa<LoadInst>(&*It))
+          InstrumentationFunction<LoadInst>(It, Fn, M, NextLocationId);
+        else if (isa<StoreInst>(&*It))
+          InstrumentationFunction<StoreInst>(It, Fn, M, NextLocationId);
+      }
+    }
+  }
+  // The loop above only inspects instructions, so report "not modified".
+  return false;
+}
+
+/// Builds the plugin descriptor. Its callback registers LoadStoreMemSpace
+/// through PassBuilder::registerOptimizerLastEPCallback.
+PassPluginLibraryInfo getPassPluginInfo() {
+  const auto RegisterCallbacks = [](PassBuilder &Builder) {
+    // The remaining callback arguments are not needed here.
+    auto AddMemSpacePass = [](ModulePassManager &Manager, auto, auto) {
+      Manager.addPass(LoadStoreMemSpace());
+      return true;
+    };
+    Builder.registerOptimizerLastEPCallback(AddMemSpacePass);
+  };
+
+  return {LLVM_PLUGIN_API_VERSION, "print-mem-space", LLVM_VERSION_STRING,
+          RegisterCallbacks};
+}
+
+// Entry point with C linkage that hands out this plugin's descriptor; it
+// simply forwards to getPassPluginInfo().
+extern "C" LLVM_ATTRIBUTE_WEAK PassPluginLibraryInfo llvmGetPassPluginInfo() {
+  return getPassPluginInfo();
+}
diff --git a/third_party/proton/README.md b/third_party/proton/README.md
@@ -128,6 +128,7 @@ The following examples demonstrate how to use Proton command-line.
 proton [options] script.py [script_args] [script_options]
 proton [options] pytest [pytest_args] [script_options]
 python -m triton.profiler.proton [options] script.py [script_args] [script_options]
+proton --instrument=[instrumentation pass] script.py
 ```
 
 When profiling in the command line mode, the `proton.start` and `proton.finalize` functions are automatically called before and after the script execution. Any `proton.start` and `proton.finalize` functions in the script are ignored. Also, in the command line mode, only a single *session* is supported. Therefore, `proton.deactivate(session_id=1)` is invalid, while `proton.deactivate(session_id=0)` is valid.
@@ -156,6 +157,23 @@ More options can be found by running the following command.
 proton-viewer -h
 ```
 
+### Advanced features
+In addition to profiling, Proton also incorporates MLIR/LLVM based compiler instrumentation passes to get Triton level analysis
+and optimization information. This feature is under active development and the list of available passes is expected to grow.
+
+#### Available passes
+print-mem-spaces: this pass prints the load and store address spaces (e.g. global, flat, shared) chosen by the compiler and attributes them back to Triton source locations.
+
+Example usage with the Proton matmul tutorial:
+```bash
+$ proton --instrument=print-mem-spaces matmul.py
+0     matmul_kernel     matmul.py:180:20     SHARED     STORE
+1     matmul_kernel     matmul.py:181:20     SHARED     STORE
+2     matmul_kernel     matmul.py:180:20     SHARED     LOAD
+3     matmul_kernel     matmul.py:181:20     SHARED     LOAD
+```
+Notes: The instrument functionality is currently only available from the command line. Additionally, the instrument and profile command-line arguments cannot be used simultaneously.
+
 ### Instruction sampling (experimental)
 
 Proton supports instruction sampling on NVIDIA GPUs.
diff --git a/third_party/proton/proton/proton.py b/third_party/proton/proton/proton.py
@@ -1,8 +1,11 @@
 import argparse
 import sys
 import os
+from glob import glob
+import pathlib
 from .profile import start, finalize, _select_backend
 from .flags import set_command_line
+import triton
 
 
 def parse_arguments():
@@ -19,6 +22,8 @@ def parse_arguments():
                         choices=["shadow", "python"])
     parser.add_argument("-d", "--data", type=str, help="Profiling data", default="tree", choices=["tree"])
     parser.add_argument("-k", "--hook", type=str, help="Profiling hook", default=None, choices=[None, "triton"])
+    parser.add_argument("-i", "--instrument", type=str, help="Instrumentation analysis type", default=None,
+                        choices=[None, "print-mem-spaces"])
     parser.add_argument('target_args', nargs=argparse.REMAINDER, help='Subcommand and its arguments')
     args = parser.parse_args()
     return args, args.target_args
@@ -28,7 +33,7 @@ def is_pytest(script):
     return os.path.basename(script) == 'pytest'
 
 
-def execute_as_main(script, args):
+def execute_as_main(script, args, instrumentation_pass=None):
     script_path = os.path.abspath(script)
     # Prepare a clean global environment
     clean_globals = {
@@ -42,6 +47,14 @@ def execute_as_main(script, args):
     sys.argv = [script] + args
     # Append the script's directory in case the script uses relative imports
     sys.path.append(os.path.dirname(script_path))
+    top_level_triton_path = os.path.dirname(triton.__file__)
+
+    if instrumentation_pass == "print-mem-spaces":
+        instrumentation_pass_path = str(
+            next(pathlib.Path(top_level_triton_path).rglob("libPrintLoadStoreMemSpaces.so"), None))
+        os.environ['TRITON_ALWAYS_COMPILE'] = "1"
+        os.environ['TRITON_DISABLE_LINE_INFO'] = "0"
+        os.environ['LLVM_PASS_PLUGIN_PATH'] = instrumentation_pass_path
 
     # Execute in the isolated environment
     try:
@@ -54,11 +67,7 @@ def execute_as_main(script, args):
         sys.argv = original_argv
 
 
-def run_profiling(args, target_args):
-    backend = args.backend if args.backend else _select_backend()
-
-    start(args.name, context=args.context, data=args.data, backend=backend, hook=args.hook)
-
+def do_setup_and_execute(target_args, instrumentation_pass=None):
     # Set the command line mode to avoid any `start` calls in the script.
     set_command_line()
 
@@ -68,13 +77,29 @@ def run_profiling(args, target_args):
         import pytest
         pytest.main(script_args)
     else:
-        execute_as_main(script, script_args)
+        execute_as_main(script, script_args, instrumentation_pass)
+
+
+def run_profiling(args, target_args):
+    """Profile the target described by ``target_args``.
+
+    Uses ``args.backend`` when it is set and the result of ``_select_backend()``
+    otherwise, starts profiling with the options taken from ``args``, executes
+    the target and finally calls ``finalize()``.
+    """
+    backend = args.backend if args.backend else _select_backend()
+
+    start(args.name, context=args.context, data=args.data, backend=backend, hook=args.hook)
+
+    do_setup_and_execute(target_args)
 
     finalize()
 
 
+def run_instrumentation(args, target_args):
+    """Execute the target with the instrumentation pass named by ``args.instrument``.
+
+    Unlike ``run_profiling``, this does not call ``start`` or ``finalize``.
+    """
+    # NOTE(review): `backend` is computed but never used below -- confirm whether
+    # the `_select_backend()` call is needed for its side effects.
+    backend = args.backend if args.backend else _select_backend()
+    do_setup_and_execute(target_args, args.instrument)
+
+
 def main():
+    # Command-line entry point: run the instrumentation flow when an
+    # instrumentation pass was requested, otherwise run the profiling flow.
     args, target_args = parse_arguments()
+    # The two modes are exclusive: instrumentation returns before profiling starts.
+    if args.instrument:
+        run_instrumentation(args, target_args)
+        return
     run_profiling(args, target_args)
 
 
diff --git a/third_party/proton/test/instrument.py b/third_party/proton/test/instrument.py
@@ -0,0 +1,68 @@
+import torch
+
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def matmul_kernel(a_ptr, b_ptr, c_ptr, M, N, K, stride_am, stride_ak,  #
+                  stride_bk, stride_bn,  #
+                  stride_cm, stride_cn, BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr,
+                  BLOCK_SIZE_K: tl.constexpr,  #
+                  GROUP_SIZE_M: tl.constexpr,  #
+                  ):
+    # Tiled matrix multiply: each program accumulates one
+    # BLOCK_SIZE_M x BLOCK_SIZE_N tile of A @ B and stores it through c_ptr.
+    # The stride_* arguments are the element strides used to build the pointer
+    # offsets for A (m, k), B (k, n) and C (m, n).
+
+    # Map the 1D program id to a (pid_m, pid_n) tile, walking the tiles in groups
+    # of GROUP_SIZE_M rows.
+    pid = tl.program_id(axis=0)
+    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
+    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+    num_pid_in_group = GROUP_SIZE_M * num_pid_n
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_SIZE_M
+    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
+    pid_m = first_pid_m + (pid % group_size_m)
+    pid_n = (pid % num_pid_in_group) // group_size_m
+
+    # Pointers to the first A and B blocks; row/column offsets wrap modulo M and N.
+    offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
+    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
+    offs_k = tl.arange(0, BLOCK_SIZE_K)
+    a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)
+    b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
+
+    # Accumulate in float32 while stepping through K one BLOCK_SIZE_K slice at a
+    # time; elements past the end of K are masked out and read as 0.0.
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
+        a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0)
+        b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0)
+        accumulator += tl.dot(a, b)
+        a_ptrs += BLOCK_SIZE_K * stride_ak
+        b_ptrs += BLOCK_SIZE_K * stride_bk
+    # Convert the result to float16 and store only the elements inside the M x N bounds.
+    c = accumulator.to(tl.float16)
+    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
+    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
+    tl.store(c_ptrs, c, mask=c_mask)
+
+
+def matmul(a, b, activation=""):
+    # Check constraints.
+    M, K = a.shape
+    K, N = b.shape
+    # Allocates output.
+    c = torch.empty((M, N), device=a.device, dtype=a.dtype)
+
+    # 1D launch kernel where each block gets its own program.
+    def grid():
+        return (1, )
+
+    matmul_kernel[grid](
+        a, b, c,  #
+        M, N, K,  #
+        a.stride(0), a.stride(1),  #
+        b.stride(0), b.stride(1),  #
+        c.stride(0), c.stride(1),  #
+        128, 256, 64, 8)
+    return c
+
+
+# Script body: run one 32 x 32 float16 matmul on the CUDA device whenever this
+# file is executed.
+a = torch.randn((32, 32), device="cuda", dtype=torch.float16)
+b = torch.randn((32, 32), device="cuda", dtype=torch.float16)
+matmul(a, b)
diff --git a/third_party/proton/test/test_cmd.py b/third_party/proton/test/test_cmd.py