intel
diff --git a/‎CMakeLists.txt
Lines changed: 2 additions & 2 deletions b/‎CMakeLists.txt
Lines changed: 2 additions & 2 deletions
diff --git a/‎Makefile
Lines changed: 1 addition & 2 deletions b/‎Makefile
Lines changed: 1 addition & 2 deletions
diff --git a/‎bin/CMakeLists.txt
Lines changed: 0 additions & 4 deletions b/‎bin/CMakeLists.txt
Lines changed: 0 additions & 4 deletions
diff --git a/‎bin/RegisterTritonDialects.h
Lines changed: 4 additions & 24 deletions b/‎bin/RegisterTritonDialects.h
Lines changed: 4 additions & 24 deletions
diff --git a/‎include/triton/Conversion/TritonGPUToLLVM/Utility.h
Lines changed: 0 additions & 9 deletions b/‎include/triton/Conversion/TritonGPUToLLVM/Utility.h
Lines changed: 0 additions & 9 deletions
diff --git a/‎include/triton/Tools/Sys/GetEnv.hpp
Lines changed: 1 addition & 1 deletion b/‎include/triton/Tools/Sys/GetEnv.hpp
Lines changed: 1 addition & 1 deletion
diff --git a/‎lib/Conversion/TritonGPUToLLVM/ControlFlowOpToLLVM.cpp
Lines changed: 0 additions & 2 deletions b/‎lib/Conversion/TritonGPUToLLVM/ControlFlowOpToLLVM.cpp
Lines changed: 0 additions & 2 deletions
diff --git a/‎lib/Conversion/TritonGPUToLLVM/FuncOpToLLVM.cpp
Lines changed: 5 additions & 7 deletions b/‎lib/Conversion/TritonGPUToLLVM/FuncOpToLLVM.cpp
Lines changed: 5 additions & 7 deletions
diff --git a/‎lib/Conversion/TritonGPUToLLVM/Utility.cpp
Lines changed: 3 additions & 14 deletions b/‎lib/Conversion/TritonGPUToLLVM/Utility.cpp
Lines changed: 3 additions & 14 deletions
diff --git a/‎lib/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.cpp
Lines changed: 13 additions & 0 deletions b/‎lib/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.cpp
Lines changed: 13 additions & 0 deletions
@@ -228,7 +228,7 @@ if(TRITON_BUILD_PYTHON_MODULE)
   endif()
   # We always build proton dialect
   list(APPEND TRITON_PLUGIN_NAMES "proton")
-  add_subdirectory(third_party/proton/Dialect)
+  add_subdirectory(third_party/proton/dialect)
 
   get_property(triton_libs GLOBAL PROPERTY TRITON_LIBS)
   get_property(triton_plugins GLOBAL PROPERTY TRITON_PLUGINS)
@@ -360,7 +360,7 @@ if(NOT TRITON_BUILD_PYTHON_MODULE)
   foreach(CODEGEN_BACKEND ${TRITON_CODEGEN_BACKENDS})
     add_subdirectory(third_party/${CODEGEN_BACKEND})
   endforeach()
-  add_subdirectory(third_party/proton/Dialect)
+  add_subdirectory(third_party/proton/dialect)
 endif()
 
 find_package(Threads REQUIRED)
 
@@ -67,8 +67,7 @@ test-interpret: all
 
 .PHONY: test-proton
 test-proton: all
-	$(PYTEST) -s -n 8 third_party/proton/test --ignore=third_party/proton/test/test_override.py
-	$(PYTEST) -s third_party/proton/test/test_override.py
+	$(PYTEST) -s -n 8 third_party/proton/test
 
 .PHONY: test-python
 test-python: test-unit test-regression test-interpret test-proton
 
@@ -15,7 +15,6 @@ target_link_libraries(triton-opt PRIVATE
   TritonTestAnalysis
   TritonTestDialect
   TritonAMDGPUTestAnalysis
-  TritonTestProton
   # MLIR core
   MLIROptLib
   MLIRPass
@@ -36,7 +35,6 @@ target_link_libraries(triton-reduce PRIVATE
   TritonTestAnalysis
   TritonTestDialect
   TritonAMDGPUTestAnalysis
-  TritonTestProton
   # MLIR core
   MLIRReduceLib
   MLIRPass
@@ -56,7 +54,6 @@ target_link_libraries(triton-lsp PRIVATE
   TritonTestAnalysis
   TritonTestDialect
   TritonAMDGPUTestAnalysis
-  TritonTestProton
   # MLIR core
   MLIRLspServerLib
   MLIRPass
@@ -95,6 +92,5 @@ target_link_libraries(triton-tensor-layout PRIVATE
   ${dialect_libs}
   TritonTestAnalysis
   TritonTestDialect
-  TritonTestProton
   TritonAMDGPUTestAnalysis
   )
@@ -12,15 +12,9 @@
 
 #include "amd/include/Dialect/TritonAMDGPU/IR/Dialect.h"
 #include "amd/include/TritonAMDGPUTransforms/Passes.h"
-#include "nvidia/include/Dialect/NVGPU/IR/Dialect.h"
-#include "nvidia/include/Dialect/NVWS/IR/Dialect.h"
-#include "proton/Dialect/include/Conversion/ProtonGPUToLLVM/Passes.h"
-#include "proton/Dialect/include/Conversion/ProtonGPUToLLVM/ProtonAMDGPUToLLVM/Passes.h"
-#include "proton/Dialect/include/Conversion/ProtonGPUToLLVM/ProtonNvidiaGPUToLLVM/Passes.h"
-#include "proton/Dialect/include/Conversion/ProtonToProtonGPU/Passes.h"
-#include "proton/Dialect/include/Dialect/Proton/IR/Dialect.h"
-#include "proton/Dialect/include/Dialect/ProtonGPU/IR/Dialect.h"
-#include "proton/Dialect/include/Dialect/ProtonGPU/Transforms/Passes.h"
+#include "third_party/nvidia/include/Dialect/NVGPU/IR/Dialect.h"
+#include "third_party/nvidia/include/Dialect/NVWS/IR/Dialect.h"
+#include "third_party/proton/dialect/include/Dialect/Proton/IR/Dialect.h"
 #include "triton/Dialect/Gluon/Transforms/Passes.h"
 #include "triton/Dialect/Triton/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
@@ -64,9 +58,6 @@ void registerTestMembarPass();
 void registerTestAMDGPUMembarPass();
 void registerTestTritonAMDGPURangeAnalysis();
 void registerTestLoopPeelingPass();
-namespace proton {
-void registerTestScopeIdAllocationPass();
-} // namespace proton
 } // namespace test
 } // namespace mlir
 
@@ -136,16 +127,6 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
   // NVGPU transform passes
   mlir::registerNVHopperTransformsPasses();
 
-  // Proton passes
-  mlir::test::proton::registerTestScopeIdAllocationPass();
-  mlir::triton::proton::registerConvertProtonToProtonGPU();
-  mlir::triton::proton::gpu::registerConvertProtonNvidiaGPUToLLVM();
-  mlir::triton::proton::gpu::registerConvertProtonAMDGPUToLLVM();
-  mlir::triton::proton::gpu::registerAllocateProtonSharedMemoryPass();
-  mlir::triton::proton::gpu::registerAllocateProtonGlobalScratchBufferPass();
-  mlir::triton::proton::gpu::registerScheduleBufferStorePass();
-  mlir::triton::proton::gpu::registerAddSchedBarriersPass();
-
   registry.insert<
       mlir::triton::TritonDialect, mlir::cf::ControlFlowDialect,
       mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect,
@@ -155,8 +136,7 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
       mlir::gpu::GPUDialect, mlir::LLVM::LLVMDialect, mlir::NVVM::NVVMDialect,
       mlir::triton::nvgpu::NVGPUDialect, mlir::triton::nvws::NVWSDialect,
       mlir::triton::amdgpu::TritonAMDGPUDialect,
-      mlir::triton::proton::ProtonDialect,
-      mlir::triton::proton::gpu::ProtonGPUDialect, mlir::ROCDL::ROCDLDialect,
+      mlir::triton::proton::ProtonDialect, mlir::ROCDL::ROCDLDialect,
       mlir::triton::gpu::intel::TritonIntelGPUDialect,
       mlir::triton::TritonGEN::TritonGENDialect,
       mlir::triton::gluon::GluonDialect>();
 
@@ -319,12 +319,6 @@ class TritonLLVMIRRewriter : public IRRewriter, public TritonLLVMOpBuilder {
 #define str_attr(str) ::mlir::StringAttr::get(ctx, (str))
 
 namespace mlir {
-
-// See FuncOpToLLVM.cpp for details about Triton's function calling conventions
-constexpr int kProfileScratchBufferOffset = -1;
-constexpr int kGlobalScratchBufferOffset = -2;
-constexpr int kSharedMemoryOffset = -3;
-
 namespace triton {
 
 namespace gpu {
@@ -445,9 +439,6 @@ Value getGlobalScratchPtr(Location loc, RewriterBase &rewriter,
                           const TargetInfoBase &targetInfo,
                           FunctionOpInterface funcOp, Value allocOffset);
 
-Value getProfileScratchPtr(Location loc, RewriterBase &rewriter,
-                           FunctionOpInterface funcOp);
-
 Value getSharedMemoryBase(Location loc, RewriterBase &rewriter,
                           const TargetInfoBase &target, Operation *op);
 
 
@@ -59,7 +59,7 @@ inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
 inline const std::set<std::string> CACHE_NEUTRAL_ENV_VARS = {
     // clang-format off
     "TRITON_REPRODUCER_PATH",
-    "TRITON_ENABLE_PYTHON_STACKTRACE",
+    "TRITON_ENABLE_PYTHON_STACKTRACE"
     // clang-format on
 };
 
 
@@ -103,8 +103,6 @@ struct CallOpConversion : public ConvertOpToLLVMPattern<triton::CallOp> {
 
     promotedOperands.push_back(LLVM::getGlobalScratchPtr(
         loc, rewriter, targetInfo, caller, opOffsetVal));
-    promotedOperands.push_back(
-        LLVM::getProfileScratchPtr(loc, rewriter, caller));
     return promotedOperands;
   }
 
 
@@ -10,20 +10,21 @@ using namespace mlir;
 using namespace mlir::triton;
 
 // NOTE: [Additional Function Arguments]
-// Triton patches additional arguments to the function signature to support
-// (1) shared memory, (2) global scratch memory, and (3) profile scratch memory.
 // To support use of shared memory and global scratch memory inside of a
 // function, the caller allocates a single large block of the relevant memory
 // and calls the function with these extra arguments at the end.
-// Profile scratch memory is only used when the function is instrumented for
-// profiling.
+// Specifically, the last argument is the global scratch memory allocation and
+// the second to last is the shared memory allocation.
 //
 // For the kernel function itself, the shared memory base is a global symbol
 // so no additional function argument is required but global scratch memory
 // allocation is still passed in as the last argument. Though here the scratch
 // memory is shared between all programs, so a linear offset based on the
 // program id is required to get the local scratch base.
 
+/// FuncOp legalization pattern that converts MemRef arguments to pointers to
+/// MemRef descriptors (LLVM struct data types) containing all the MemRef type
+/// information.
 struct FuncOpConversion : public ConvertOpToLLVMPattern<triton::FuncOp> {
   FuncOpConversion(LLVMTypeConverter &converter,
                    const TargetInfoBase &targetInfo, PatternBenefit benefit)
@@ -55,7 +56,6 @@ struct FuncOpConversion : public ConvertOpToLLVMPattern<triton::FuncOp> {
     auto sharedPtrTy =
         LLVM::LLVMPointerType::get(ctx, targetInfo.getSharedAddressSpace());
     auto globalPtrTy = LLVM::LLVMPointerType::get(ctx, 1);
-    auto profilePtrTy = LLVM::LLVMPointerType::get(ctx, 1);
 
     // 1. Modify the function type to add the new arguments.
     auto funcTy = funcOp.getFunctionType();
@@ -73,7 +73,6 @@ struct FuncOpConversion : public ConvertOpToLLVMPattern<triton::FuncOp> {
       amendedInputTy.push_back(sharedPtrTy);
     }
     amendedInputTy.push_back(globalPtrTy);
-    amendedInputTy.push_back(profilePtrTy);
     auto amendedFuncTy =
         FunctionType::get(ctx, amendedInputTy, funcTy.getResults());
     // 2. Modify the argument attributes to add the new argument.
@@ -98,7 +97,6 @@ struct FuncOpConversion : public ConvertOpToLLVMPattern<triton::FuncOp> {
       region.addArgument(sharedPtrTy, loc);
     }
     region.addArgument(globalPtrTy, loc);
-    region.addArgument(profilePtrTy, loc);
     rewriter.inlineRegionBefore(region, amendedFuncOp.getBody(),
                                 amendedFuncOp.end());
     return amendedFuncOp;
 
@@ -1198,7 +1198,7 @@ SharedMemoryObject getSharedMemoryObjectFromStruct(Location loc,
 Value getStackPointer(RewriterBase &rewriter, FunctionOpInterface funcOp) {
   // See NOTE: [Additional Function Arguments]
   if (!isKernel(funcOp)) {
-    return funcOp.getArgument(funcOp.getNumArguments() + kSharedMemoryOffset);
+    return funcOp.getArgument(funcOp.getNumArguments() - 2);
   }
 
   auto mod = funcOp->getParentOfType<ModuleOp>();
@@ -1213,8 +1213,7 @@ Value getGlobalScratchPtr(Location loc, RewriterBase &rewriter,
   // See NOTE: [Additional Function Arguments]
   if (!isKernel(funcOp)) {
     // Base for this function
-    auto gmemBase = funcOp.getArgument(funcOp.getNumArguments() +
-                                       kGlobalScratchBufferOffset);
+    auto gmemBase = funcOp.getArgument(funcOp.getNumArguments() - 1);
     if (!allocOffset) {
       return gmemBase;
     }
@@ -1225,8 +1224,7 @@ Value getGlobalScratchPtr(Location loc, RewriterBase &rewriter,
   }
 
   // Base for entire kernel
-  auto gmemBase =
-      funcOp.getArgument(funcOp.getNumArguments() + kGlobalScratchBufferOffset);
+  auto gmemBase = funcOp.getArgument(funcOp.getNumArguments() - 1);
 
   ModuleOp mod = funcOp.getOperation()->getParentOfType<ModuleOp>();
   auto allocSizeAttr = mod.getOperation()->getAttrOfType<mlir::IntegerAttr>(
@@ -1268,15 +1266,6 @@ Value getGlobalScratchPtr(Location loc, RewriterBase &rewriter,
   return res;
 }
 
-Value getProfileScratchPtr(Location loc, RewriterBase &rewriter,
-                           FunctionOpInterface funcOp) {
-  // See NOTE: [Additional Function Arguments]
-  // FIXME(Keren): This is broken when we have device functions, we
-  // need to implement proper calling convention
-  return funcOp.getArgument(funcOp.getNumArguments() +
-                            kProfileScratchBufferOffset);
-}
-
 Value getSharedMemoryBase(Location loc, RewriterBase &rewriter,
                           const TargetInfoBase &target, Operation *op) {
   auto ptrTy = LLVM::LLVMPointerType::get(rewriter.getContext(),
 
@@ -3,6 +3,7 @@
 #include "mlir/Dialect/UB/IR/UBOps.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/DialectConversion.h"
+#include "third_party/proton/dialect/include/Dialect/Proton/IR/Dialect.h"
 #include "triton/Conversion/TritonToTritonGPU/Passes.h"
 #include "triton/Dialect/Triton/IR/Dialect.h"
 #include "triton/Dialect/Triton/IR/Utility.h"
@@ -596,6 +597,17 @@ void populateTritonPatterns(TritonGPUTypeConverter &typeConverter,
       // clang-format on
       >(typeConverter, context);
 }
+// Proton patterns
+// NOTE: Proton's inputs are scalars rather than tensors, so this conversion
+// is not strictly necessary. However, if tensors are ever passed in (e.g. for
+// Triton-object-specific tracing operations), the OpConversionPattern would
+// need to be filled in.
+void populateProtonPatterns(TritonGPUTypeConverter &typeConverter,
+                            RewritePatternSet &patterns) {
+  MLIRContext *context = patterns.getContext();
+  patterns.add<GenericOpPattern<triton::proton::RecordOp>>(typeConverter,
+                                                           context);
+}
 //
 // SCF patterns
 //
@@ -809,6 +821,7 @@ class ConvertTritonToTritonGPU
     populateArithPatternsAndLegality(typeConverter, patterns, target);
     populateMathPatternsAndLegality(typeConverter, patterns, target);
     populateTritonPatterns(typeConverter, patterns, numCTAs);
+    populateProtonPatterns(typeConverter, patterns);
     // TODO: can we use
     //    mlir::scf::populateSCFStructurealTypeConversionsAndLegality(...) here?
     populateSCFPatterns(typeConverter, patterns);
Original file line number	Diff line number	Diff line change
`@@ -103,8 +103,6 @@ struct CallOpConversion : public ConvertOpToLLVMPattern<triton::CallOp> {`
`103`	`103`
`104`	`104`	`promotedOperands.push_back(LLVM::getGlobalScratchPtr(`
`105`	`105`	`loc, rewriter, targetInfo, caller, opOffsetVal));`
`106`		`- promotedOperands.push_back(`
`107`		`- LLVM::getProfileScratchPtr(loc, rewriter, caller));`
`108`	`106`	`return promotedOperands;`
`109`	`107`	`}`
`110`	`108`