Integrate Triton up to [9732c047](https://github.com/openai/triton/commits/9732c04701bd856daca89bde38bafa4636ca56a8)

gflegar · Google-ML-Automation · commit c4d19ca83cdc · 2024-12-03T10:55:56.000-08:00
PiperOrigin-RevId: 702397897
diff --git a/jaxlib/gpu/triton_kernels.cc b/jaxlib/gpu/triton_kernels.cc
@@ -466,7 +466,8 @@ KernelCall::KernelCall(Kernel kernel, uint32_t grid_0, uint32_t grid_1,
 
 absl::Status KernelCall::Launch(gpuStream_t stream, void** buffers) {
   std::vector<void*> params;
-  params.reserve(parameters_.size());
+  // We need an additional parameter for the scratchpad buffer.
+  params.reserve(parameters_.size() + 1);
   for (size_t i = 0; i < parameters_.size(); ++i) {
     const Parameter& param = parameters_[i];
     if (std::holds_alternative<Parameter::Array>(param.value)) {
@@ -492,6 +493,14 @@ absl::Status KernelCall::Launch(gpuStream_t stream, void** buffers) {
           param.value)));
     }
   }
+  // Triton's kernel ABI expects an additional scratchpad global memory.
+  // For now it is only used for on-device creation of TMA descriptors, which
+  // we do not use yet, so we are just replacing this argument with a null
+  // pointer.
+  // TODO: b/381242007 - Allocate a proper buffer if we want to use
+  // device-side TMA APIs.
+  void* scratch_ptr = nullptr;  // Alive until kernel_.Launch returns.
+  params.push_back(&scratch_ptr);
 
   return kernel_.Launch(stream, grid_, params.data());
 }