
Commit 3c9de07

Proper allocation of workspace
1 parent 6c3507d · commit 3c9de07

4 files changed: +46 −15 lines changed


transformer_engine/common/include/transformer_engine/recipe.h

Lines changed: 2 additions & 0 deletions
@@ -84,6 +84,8 @@ void nvte_delayed_scaling_recipe_amax_and_scale_update_after_reduction(
  */
 void nvte_compute_amax(const NVTETensor input, NVTETensor output, cudaStream_t stream);
 
+void nvte_compute_amax_with_workspace(const NVTETensor input_, const NVTETensor output_, const NVTETensor workspace_, cudaStream_t stream);
+
 /*! \brief Update an FP8 tensor's scale based on its amax.
  *
  * This is only supported for FP8 tensors with per-tensor scaling.
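
For context, a minimal caller sketch of the new entry point, assuming the TensorWrapper and makeTransformerEngineTensor helpers that appear in the PyTorch extension changes below (this sketch is illustrative, not part of the commit):

// Hypothetical caller sketch; te_input and te_output are TensorWrapper
// objects wrapping CUDA tensors, as in cast.cpp below.
auto ws = at::empty({static_cast<long>(max_blocks)},
                    at::dtype(at::kFloat).device(at::kCUDA));
TensorWrapper te_workspace = makeTransformerEngineTensor(ws);

// With a workspace: per-block amaxes go to ws, the final amax to te_output.
nvte_compute_amax_with_workspace(te_input.data(), te_output.data(),
                                 te_workspace.data(),
                                 at::cuda::getCurrentCUDAStream());

// Without a workspace: behaves exactly like the original nvte_compute_amax.
nvte_compute_amax_with_workspace(te_input.data(), te_output.data(),
                                 /*workspace=*/nullptr,
                                 at::cuda::getCurrentCUDAStream());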

transformer_engine/common/recipe/current_scaling.cu

Lines changed: 17 additions & 10 deletions
@@ -114,11 +114,7 @@ void launch_amax_kernel(const InputType *input, float *amax, const size_t N, flo
   constexpr size_t max_blocks = 65535;
   num_blocks = std::min(num_blocks, max_blocks);
 
-  const bool UseBlockAmax = (block_amax != nullptr);
-
-  if (UseBlockAmax) {
-    NVTE_CHECK(block_capacity >= num_blocks);
-  }
+  const bool UseBlockAmax = (block_amax != nullptr) && (block_capacity >= num_blocks);
 
   // Launch kernel
   switch (align) {
@@ -167,6 +163,10 @@ void launch_amax_kernel(const InputType *input, float *amax, const size_t N, flo
 }  // namespace transformer_engine
 
 void nvte_compute_amax(const NVTETensor input_, const NVTETensor output_, cudaStream_t stream) {
+  nvte_compute_amax_with_workspace(input_, output_, /*workspace=*/nullptr, stream);
+}
+
+void nvte_compute_amax_with_workspace(const NVTETensor input_, const NVTETensor output_, const NVTETensor workspace_, cudaStream_t stream) {
   NVTE_API_CALL(nvte_compute_amax);
   using namespace transformer_engine;
 
@@ -202,12 +202,19 @@ void nvte_compute_amax(const NVTETensor input_, const NVTETensor output_, cudaSt
              to_string(output.amax.dtype), ")");
   CheckOutputTensor(output, "output_compute_amax", true);
 
-  // Interpret output.data as workspace if present
-  float *block_amax = nullptr;
+  // Optional workspace
+  float* block_amax = nullptr;
   size_t block_capacity = 0;
-  if (output.data.dptr != nullptr) {
-    block_amax = reinterpret_cast<float*>(output.data.dptr);
-    block_capacity = output.data.numel();  // #floats in workspace
+
+  if (workspace_ != nullptr) {
+    auto &workspace = *reinterpret_cast<Tensor *>(workspace_);
+    NVTE_CHECK(workspace.data.dptr != nullptr,
+               "Workspace tensor for amax computation has no data");
+    NVTE_CHECK(workspace.data.dtype == DType::kFloat32,
+               "Workspace tensor for amax computation must be FP32, got dtype=",
+               to_string(workspace.data.dtype));
+    block_amax = reinterpret_cast<float*>(workspace.data.dptr);
+    block_capacity = workspace.data.numel();
   }
 
   // Compute amax
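
To make the new fallback concrete: with the sizing used elsewhere in this commit (512 threads per block, a 65535-block hardware cap, nvec = 1 worst case), an undersized workspace no longer fails an NVTE_CHECK; UseBlockAmax simply evaluates to false and the kernel takes the non-workspace path. A self-contained worked example of the capacity arithmetic (values chosen for illustration):

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
  const size_t N = 1 << 20;            // e.g. a 1M-element input tensor
  const size_t threads = 512;          // amax kernel block size
  const size_t max_blocks_hw = 65535;  // grid-dimension cap

  // Worst case (nvec = 1): one block per `threads` elements, capped.
  const size_t num_blocks =
      std::min((N + threads - 1) / threads, max_blocks_hw);  // = 2048

  // A workspace of at least num_blocks floats keeps UseBlockAmax true.
  std::printf("need >= %zu floats (%zu bytes)\n",
              num_blocks, num_blocks * sizeof(float));  // 2048 floats, 8192 bytes
  return 0;
}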

transformer_engine/pytorch/csrc/extensions/cast.cpp

Lines changed: 18 additions & 1 deletion
@@ -52,8 +52,25 @@ py::object quantize(const at::Tensor& tensor, py::handle quantizer, const py::ob
   if (detail::IsFloat8CurrentScalingQuantizers(quantizer.ptr())) {
     // my_quantizer here has to be a Float8CurrentScalingQuantizer
     auto my_quantizer_cs = static_cast<Float8CurrentScalingQuantizer*>(my_quantizer.get());
+
+    // workspace for nvte_compute_amax_with_workspace
+    const auto N = static_cast<size_t>(input_tensor.numel());
+    constexpr size_t threads = 512;  // FIXME: should match amax_kernel_threads
+    constexpr size_t max_blocks_hw = 65535;
+
+    // Worst-case (nvec = 1) upper bound on the number of blocks.
+    size_t max_blocks = std::min(DIVUP(N, threads), max_blocks_hw);
+
+    // Allocate FP32 workspace for block-wise amax.
+    auto ws = at::empty({static_cast<long>(max_blocks)},
+                        tensor.options().dtype(at::kFloat));
+
+    TensorWrapper te_workspace = makeTransformerEngineTensor(ws);
+
     NVTE_SCOPED_GIL_RELEASE({
-      nvte_compute_amax(te_input.data(), te_output.data(), at::cuda::getCurrentCUDAStream());
+      nvte_compute_amax_with_workspace(te_input.data(), te_output.data(),
+                                       te_workspace.data(),
+                                       at::cuda::getCurrentCUDAStream());
     });
     // check if we need to do amax reduction (depending on model parallel configs)
     if (my_quantizer_cs->with_amax_reduction) {
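
The FIXME above flags that the hard-coded threads = 512 can silently drift from amax_kernel_threads in current_scaling.cu. One hypothetical way to close that gap (names are assumptions, not part of this commit) is a shared sizing helper exported alongside the kernel constants:

// Hypothetical helper; amax_workspace_numel and the exported constants
// are assumptions, not part of this commit.
#include <algorithm>
#include <cstddef>

namespace transformer_engine {

constexpr size_t amax_kernel_threads = 512;       // must match current_scaling.cu
constexpr size_t amax_kernel_max_blocks = 65535;  // hardware grid cap

// Number of FP32 elements a caller should allocate for block-wise amax.
inline size_t amax_workspace_numel(size_t input_numel) {
  const size_t blocks =  // nvec = 1 worst case
      (input_numel + amax_kernel_threads - 1) / amax_kernel_threads;
  return std::min(blocks, amax_kernel_max_blocks);
}

}  // namespace transformer_engine

With such a helper, both cast.cpp and recipe.cpp could size ws from amax_workspace_numel(N) instead of duplicating the launch math.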

transformer_engine/pytorch/csrc/extensions/recipe.cpp

Lines changed: 9 additions & 4 deletions
@@ -30,19 +30,24 @@ void compute_amax(const at::Tensor& tensor, at::Tensor& amax) {
   size_t max_blocks = std::min(DIVUP(static_cast<size_t>(N), threads),
                                max_blocks_hw);
 
-  // Allocate workspace for the fake output tensor.
-  // This will be the block_amax buffer.
+  // Allocate workspace for the block_amax buffer.
   auto ws = at::empty({static_cast<long>(max_blocks)},
                       tensor.options().dtype(at::kFloat));
 
   std::vector<size_t> ws_shape{static_cast<size_t>(max_blocks)};
 
   TensorWrapper fake_te_output(
-      ws.data_ptr(), ws_shape,
+      nullptr, te_input.shape(),
       DType::kFloat8E4M3,  // It doesn't matter because we only compute amax.
       amax.data_ptr<float>());
 
-  nvte_compute_amax(te_input.data(), fake_te_output.data(), at::cuda::getCurrentCUDAStream());
+  TensorWrapper te_workspace(
+      ws.data_ptr(), ws_shape,
+      DType::kFloat32,
+      nullptr
+  );
+
+  nvte_compute_amax_with_workspace(te_input.data(), fake_te_output.data(), te_workspace.data(), at::cuda::getCurrentCUDAStream());
 }
 
 void fused_amax_and_scale_update_after_reduction(const at::Tensor& amax_reduction_buffer,
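
A hypothetical smoke test for the new wiring (not part of this commit): since fake_te_output now carries a null data pointer and only a valid amax pointer, compute_amax should fill amax without any output data buffer being allocated:

// Hypothetical smoke-test sketch; check_compute_amax is an assumption.
#include <ATen/ATen.h>
#include <torch/torch.h>

void check_compute_amax() {
  at::Tensor t = at::randn({1024, 1024},
                           at::dtype(at::kHalf).device(at::kCUDA));
  at::Tensor amax = at::zeros({1}, at::dtype(at::kFloat).device(at::kCUDA));

  // Allocates the FP32 block_amax workspace internally, as shown above.
  compute_amax(t, amax);

  // Expected: amax matches the dense reduction up to fp16 rounding.
  TORCH_CHECK(at::allclose(amax, t.abs().max().to(at::kFloat).reshape({1})));
}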
