intel
diff --git a/‎.github/workflows/build-test-reusable.yml‎
Lines changed: 2 additions & 1 deletion b/‎.github/workflows/build-test-reusable.yml‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎.github/workflows/pip-test.yml‎
Lines changed: 2 additions & 1 deletion b/‎.github/workflows/pip-test.yml‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎CMakeLists.txt‎
Lines changed: 2 additions & 2 deletions b/‎CMakeLists.txt‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎Makefile‎
Lines changed: 2 additions & 1 deletion b/‎Makefile‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎bin/CMakeLists.txt‎
Lines changed: 4 additions & 0 deletions b/‎bin/CMakeLists.txt‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎bin/RegisterTritonDialects.h‎
Lines changed: 24 additions & 4 deletions b/‎bin/RegisterTritonDialects.h‎
Lines changed: 24 additions & 4 deletions
diff --git a/‎include/triton/Conversion/TritonGPUToLLVM/Utility.h‎
Lines changed: 9 additions & 0 deletions b/‎include/triton/Conversion/TritonGPUToLLVM/Utility.h‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎include/triton/Tools/Sys/GetEnv.hpp‎
Lines changed: 1 addition & 1 deletion b/‎include/triton/Tools/Sys/GetEnv.hpp‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎lib/Conversion/TritonGPUToLLVM/ControlFlowOpToLLVM.cpp‎
Lines changed: 2 additions & 0 deletions b/‎lib/Conversion/TritonGPUToLLVM/ControlFlowOpToLLVM.cpp‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎lib/Conversion/TritonGPUToLLVM/FuncOpToLLVM.cpp‎
Lines changed: 7 additions & 5 deletions b/‎lib/Conversion/TritonGPUToLLVM/FuncOpToLLVM.cpp‎
Lines changed: 7 additions & 5 deletions
@@ -285,7 +285,8 @@ jobs:
         if: matrix.suite == 'rest' && inputs.driver_version == 'rolling' && inputs.device == 'max1100'
         run: |
           cd third_party/proton/test
-          pytest test_api.py test_lib.py test_profile.py test_viewer.py test_record.py -s -v
+          # FIXME: re-enable 'test_record.py'
+          pytest test_api.py test_lib.py test_profile.py test_viewer.py -s -v
           cd ..
 
       - name: Run minicore tests
 
@@ -50,7 +50,7 @@ jobs:
           gh_token: ${{ secrets.GITHUB_TOKEN }}
           python_version: ${{ env.PYTHON_VERSION }}
           # transformers (required for the inductor e2e test) is installed via pip below
-          wheels_pattern: '{torch,transformers}-*.whl'
+          wheels_pattern: 'torch-*.whl'
 
       - name: Install Triton
         uses: ./.github/actions/setup-triton
@@ -61,6 +61,7 @@ jobs:
           sed -i '/^validate_nccl_dep_consistency.*/d' generate_binary_build_matrix.py
           python -c "from generate_binary_build_matrix import PYTORCH_EXTRA_INSTALL_REQUIREMENTS; print('\n'.join(PYTORCH_EXTRA_INSTALL_REQUIREMENTS['xpu'].split(' | ')))" | tee /tmp/requirements.txt
           pip install -r /tmp/requirements.txt
+          pip install transformers==4.54.0
 
       - name: Run core tests
         run: |
 
@@ -228,7 +228,7 @@ if(TRITON_BUILD_PYTHON_MODULE)
   endif()
   # We always build proton dialect
   list(APPEND TRITON_PLUGIN_NAMES "proton")
-  add_subdirectory(third_party/proton/dialect)
+  add_subdirectory(third_party/proton/Dialect)
 
   get_property(triton_libs GLOBAL PROPERTY TRITON_LIBS)
   get_property(triton_plugins GLOBAL PROPERTY TRITON_PLUGINS)
@@ -360,7 +360,7 @@ if(NOT TRITON_BUILD_PYTHON_MODULE)
   foreach(CODEGEN_BACKEND ${TRITON_CODEGEN_BACKENDS})
     add_subdirectory(third_party/${CODEGEN_BACKEND})
   endforeach()
-  add_subdirectory(third_party/proton/dialect)
+  add_subdirectory(third_party/proton/Dialect)
 endif()
 
 find_package(Threads REQUIRED)
 
@@ -69,7 +69,8 @@ test-interpret: all
 
 .PHONY: test-proton
 test-proton: all
-	$(PYTEST) -s -n 8 third_party/proton/test
+	$(PYTEST) -s -n 8 third_party/proton/test --ignore=third_party/proton/test/test_override.py
+	$(PYTEST) -s third_party/proton/test/test_override.py
 
 .PHONY: test-python
 test-python: test-unit test-regression test-interpret test-proton
 
@@ -15,6 +15,7 @@ target_link_libraries(triton-opt PRIVATE
   TritonTestAnalysis
   TritonTestDialect
   TritonAMDGPUTestAnalysis
+  TritonTestProton
   # MLIR core
   MLIROptLib
   MLIRPass
@@ -35,6 +36,7 @@ target_link_libraries(triton-reduce PRIVATE
   TritonTestAnalysis
   TritonTestDialect
   TritonAMDGPUTestAnalysis
+  TritonTestProton
   # MLIR core
   MLIRReduceLib
   MLIRPass
@@ -54,6 +56,7 @@ target_link_libraries(triton-lsp PRIVATE
   TritonTestAnalysis
   TritonTestDialect
   TritonAMDGPUTestAnalysis
+  TritonTestProton
   # MLIR core
   MLIRLspServerLib
   MLIRPass
@@ -92,5 +95,6 @@ target_link_libraries(triton-tensor-layout PRIVATE
   ${dialect_libs}
   TritonTestAnalysis
   TritonTestDialect
+  TritonTestProton
   TritonAMDGPUTestAnalysis
   )
@@ -12,9 +12,15 @@
 
 #include "amd/include/Dialect/TritonAMDGPU/IR/Dialect.h"
 #include "amd/include/TritonAMDGPUTransforms/Passes.h"
-#include "third_party/nvidia/include/Dialect/NVGPU/IR/Dialect.h"
-#include "third_party/nvidia/include/Dialect/NVWS/IR/Dialect.h"
-#include "third_party/proton/dialect/include/Dialect/Proton/IR/Dialect.h"
+#include "nvidia/include/Dialect/NVGPU/IR/Dialect.h"
+#include "nvidia/include/Dialect/NVWS/IR/Dialect.h"
+#include "proton/Dialect/include/Conversion/ProtonGPUToLLVM/Passes.h"
+#include "proton/Dialect/include/Conversion/ProtonGPUToLLVM/ProtonAMDGPUToLLVM/Passes.h"
+#include "proton/Dialect/include/Conversion/ProtonGPUToLLVM/ProtonNvidiaGPUToLLVM/Passes.h"
+#include "proton/Dialect/include/Conversion/ProtonToProtonGPU/Passes.h"
+#include "proton/Dialect/include/Dialect/Proton/IR/Dialect.h"
+#include "proton/Dialect/include/Dialect/ProtonGPU/IR/Dialect.h"
+#include "proton/Dialect/include/Dialect/ProtonGPU/Transforms/Passes.h"
 #include "triton/Dialect/Gluon/Transforms/Passes.h"
 #include "triton/Dialect/Triton/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
@@ -58,6 +64,9 @@ void registerTestMembarPass();
 void registerTestAMDGPUMembarPass();
 void registerTestTritonAMDGPURangeAnalysis();
 void registerTestLoopPeelingPass();
+namespace proton {
+void registerTestScopeIdAllocationPass();
+} // namespace proton
 } // namespace test
 } // namespace mlir
 
@@ -127,6 +136,16 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
   // NVGPU transform passes
   mlir::registerNVHopperTransformsPasses();
 
+  // Proton passes
+  mlir::test::proton::registerTestScopeIdAllocationPass();
+  mlir::triton::proton::registerConvertProtonToProtonGPU();
+  mlir::triton::proton::gpu::registerConvertProtonNvidiaGPUToLLVM();
+  mlir::triton::proton::gpu::registerConvertProtonAMDGPUToLLVM();
+  mlir::triton::proton::gpu::registerAllocateProtonSharedMemoryPass();
+  mlir::triton::proton::gpu::registerAllocateProtonGlobalScratchBufferPass();
+  mlir::triton::proton::gpu::registerScheduleBufferStorePass();
+  mlir::triton::proton::gpu::registerAddSchedBarriersPass();
+
   registry.insert<
       mlir::triton::TritonDialect, mlir::cf::ControlFlowDialect,
       mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect,
@@ -136,7 +155,8 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
       mlir::gpu::GPUDialect, mlir::LLVM::LLVMDialect, mlir::NVVM::NVVMDialect,
       mlir::triton::nvgpu::NVGPUDialect, mlir::triton::nvws::NVWSDialect,
       mlir::triton::amdgpu::TritonAMDGPUDialect,
-      mlir::triton::proton::ProtonDialect, mlir::ROCDL::ROCDLDialect,
+      mlir::triton::proton::ProtonDialect,
+      mlir::triton::proton::gpu::ProtonGPUDialect, mlir::ROCDL::ROCDLDialect,
       mlir::triton::gpu::intel::TritonIntelGPUDialect,
       mlir::triton::TritonGEN::TritonGENDialect,
       mlir::triton::gluon::GluonDialect>();
 
@@ -319,6 +319,12 @@ class TritonLLVMIRRewriter : public IRRewriter, public TritonLLVMOpBuilder {
 #define str_attr(str) ::mlir::StringAttr::get(ctx, (str))
 
 namespace mlir {
+
+// See FuncOpToLLVM.cpp for details about Triton's function calling conventions
+constexpr int kProfileScratchBufferOffset = -1;
+constexpr int kGlobalScratchBufferOffset = -2;
+constexpr int kSharedMemoryOffset = -3;
+
 namespace triton {
 
 namespace gpu {
@@ -439,6 +445,9 @@ Value getGlobalScratchPtr(Location loc, RewriterBase &rewriter,
                           const TargetInfoBase &targetInfo,
                           FunctionOpInterface funcOp, Value allocOffset);
 
+Value getProfileScratchPtr(Location loc, RewriterBase &rewriter,
+                           FunctionOpInterface funcOp);
+
 Value getSharedMemoryBase(Location loc, RewriterBase &rewriter,
                           const TargetInfoBase &target, Operation *op);
 
 
@@ -60,7 +60,7 @@ inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
 inline const std::set<std::string> CACHE_NEUTRAL_ENV_VARS = {
     // clang-format off
     "TRITON_REPRODUCER_PATH",
-    "TRITON_ENABLE_PYTHON_STACKTRACE"
+    "TRITON_ENABLE_PYTHON_STACKTRACE",
     // clang-format on
 };
 
 
@@ -103,6 +103,8 @@ struct CallOpConversion : public ConvertOpToLLVMPattern<triton::CallOp> {
 
     promotedOperands.push_back(LLVM::getGlobalScratchPtr(
         loc, rewriter, targetInfo, caller, opOffsetVal));
+    promotedOperands.push_back(
+        LLVM::getProfileScratchPtr(loc, rewriter, caller));
     return promotedOperands;
   }
 
 
@@ -10,21 +10,20 @@ using namespace mlir;
 using namespace mlir::triton;
 
 // NOTE: [Additional Function Arguments]
+// Triton patches additional arguments to the function signature to support
+// (1) shared memory, (2) global scratch memory, and (3) profile scratch memory.
 // To support use of shared memory and global scratch memory inside of a
 // function, the caller allocates a single large block of the relevant memory
 // and calls the function with these extra arguments at the end.
-// Specifically, the last argument is the global scratch memory allocation and
-// the second to last is the shared memory allocation.
+// Profile scratch memory is only used when the function is instrumented for
+// profiling.
 //
 // For the kernel function itself, the shared memory base is a global symbol
 // so no extra argument is needed for it, but the global scratch allocation
 // is still passed in (now second to last, before the profile scratch
 // pointer). That scratch memory is shared between all programs, so a linear
 // offset based on the program id is required to get the local scratch base.
 
-/// FuncOp legalization pattern that converts MemRef arguments to pointers to
-/// MemRef descriptors (LLVM struct data types) containing all the MemRef type
-/// information.
 struct FuncOpConversion : public ConvertOpToLLVMPattern<triton::FuncOp> {
   FuncOpConversion(LLVMTypeConverter &converter,
                    const TargetInfoBase &targetInfo, PatternBenefit benefit)
@@ -56,6 +55,7 @@ struct FuncOpConversion : public ConvertOpToLLVMPattern<triton::FuncOp> {
     auto sharedPtrTy =
         LLVM::LLVMPointerType::get(ctx, targetInfo.getSharedAddressSpace());
     auto globalPtrTy = LLVM::LLVMPointerType::get(ctx, 1);
+    auto profilePtrTy = LLVM::LLVMPointerType::get(ctx, 1);
 
     // 1. Modify the function type to add the new arguments.
     auto funcTy = funcOp.getFunctionType();
@@ -73,6 +73,7 @@ struct FuncOpConversion : public ConvertOpToLLVMPattern<triton::FuncOp> {
       amendedInputTy.push_back(sharedPtrTy);
     }
     amendedInputTy.push_back(globalPtrTy);
+    amendedInputTy.push_back(profilePtrTy);
     auto amendedFuncTy =
         FunctionType::get(ctx, amendedInputTy, funcTy.getResults());
     // 2. Modify the argument attributes to add the new argument.
@@ -97,6 +98,7 @@ struct FuncOpConversion : public ConvertOpToLLVMPattern<triton::FuncOp> {
       region.addArgument(sharedPtrTy, loc);
     }
     region.addArgument(globalPtrTy, loc);
+    region.addArgument(profilePtrTy, loc);
     rewriter.inlineRegionBefore(region, amendedFuncOp.getBody(),
                                 amendedFuncOp.end());
     return amendedFuncOp;
Original file line number	Diff line number	Diff line change
`@@ -103,6 +103,8 @@ struct CallOpConversion : public ConvertOpToLLVMPattern<triton::CallOp> {`
`103`	`103`
`104`	`104`	`promotedOperands.push_back(LLVM::getGlobalScratchPtr(`
`105`	`105`	`loc, rewriter, targetInfo, caller, opOffsetVal));`
	`106`	`+ promotedOperands.push_back(`
	`107`	`+ LLVM::getProfileScratchPtr(loc, rewriter, caller));`
`106`	`108`	`return promotedOperands;`
`107`	`109`	`}`
`108`	`110`