NVIDIA · timmoon10 · Jan 10, 2026 · Jan 7, 2026 · timmoon10 · Jan 7, 2026
diff --git a/.gitignore b/.gitignore
@@ -41,3 +41,4 @@ compile_commands.json
 .nfs
 tensor_dumps/
 artifacts/
+*.DS_Store
diff --git a/transformer_engine/common/include/transformer_engine/cast.h b/transformer_engine/common/include/transformer_engine/cast.h
@@ -130,7 +130,7 @@ void nvte_quantize_v2(const NVTETensor input, NVTETensor output,
  *  \param[in]     stream           CUDA stream used for the operation.
  */
 void nvte_quantize_dbias(const NVTETensor input, NVTETensor output, NVTETensor dbias,
-                         NVTETensor workplace, cudaStream_t stream);
+                         NVTETensor workspace, cudaStream_t stream);
 
 /*! \brief Computes backward of GeLU operation on the input, then casts to FP8/MXFP8.
  *         Additionally, reduces the result of the GeLU backward along columns.
@@ -263,7 +263,8 @@ void nvte_dequantize(const NVTETensor input, NVTETensor output, cudaStream_t str
  *
  *  \param[in]      inputs           List of input tensors to be cast.
  *  \param[in,out]  outputs          List of output quantized tensors.
- *  \param[in]      quant_config    (Optional) Quantization configurations.
+ *  \param[in]      quant_config     (Optional) Quantization configurations.
+ *  \param[in]      num_tensors      Number of input and output tensors.
  *  \param[in]      stream           CUDA stream used for the operation.
  */
 void nvte_multi_tensor_quantize(const NVTETensor *inputs, NVTETensor *outputs,

diff --git a/transformer_engine/common/include/transformer_engine/cast_transpose_noop.h b/transformer_engine/common/include/transformer_engine/cast_transpose_noop.h
@@ -4,8 +4,8 @@
  * See LICENSE for license information.
  ************************************************************************/
 
-/*! \file transpose_with_noop.h
- *  \brief Functions handling transposes with no-op.
+/*! \file cast_transpose_noop.h
+ *  \brief Transpose functions with no-op flag.
  */
 
 #ifndef TRANSFORMER_ENGINE_CAST_TRANSPOSE_WITH_NOOP_H_

diff --git a/transformer_engine/common/include/transformer_engine/fused_attn.h b/transformer_engine/common/include/transformer_engine/fused_attn.h
@@ -409,7 +409,6 @@ void nvte_fused_attn_bwd_qkvpacked(
  *  \param[in]     softmax_type              Attention softmax type.
  *  \param[in]     window_size_left          Sliding window size (the left half).
  *  \param[in]     window_size_right         Sliding window size (the right half).
- *  \param[in]     deterministic             Whether to execute with deterministic behaviours.
  *  \param[in]     workspace                 Workspace tensor.
  *  \param[in]     stream                    CUDA stream used for this operation.
  */
@@ -673,7 +672,7 @@ void nvte_populate_rng_state_async(NVTETensor rng_state_dst, const NVTETensor se
  *  \param[in]     len                      batch_size x sequence_length.
  *  \param[in]     stream                   CUDA stream used for this operation.
  */
-uint32_t nvte_get_runtime_num_segments(NVTETensor cu_seqlen, NVTETensor workspace, size_t len,
+uint32_t nvte_get_runtime_num_segments(NVTETensor cu_seqlens, NVTETensor workspace, size_t len,
                                        cudaStream_t stream);
 
 /*!  \brief Set the seed and offset for RNG state.
@@ -830,8 +829,7 @@ void nvte_convert_thd_to_bshd(NVTETensor tensor, NVTETensor cu_seqlens, NVTETens
  *  \param[in]     tensor           Input tensor.
  *  \param[in]     cu_seqlens       Cumulative sequence lengths, [batch_size + 1].
  *  \param[out]    new_tensor       Output tensor.
- *  \param[in]     b                Batch size.
- *  \param[in]     max_seq_len      Maximum sequence length.
+ *  \param[in]     t                Packed sequence length.
  *  \param[in]     stream           CUDA stream used for this operation.
  */
 void nvte_convert_bshd_to_thd(NVTETensor tensor, NVTETensor cu_seqlens, NVTETensor new_tensor,

diff --git a/transformer_engine/common/include/transformer_engine/gemm.h b/transformer_engine/common/include/transformer_engine/gemm.h
@@ -255,9 +255,11 @@ class MatmulConfigWrapper {
   MatmulConfigWrapper(const MatmulConfigWrapper &) = delete;
   MatmulConfigWrapper &operator=(const MatmulConfigWrapper &) = delete;
 
+  /*! \brief Move constructor. */
   MatmulConfigWrapper(MatmulConfigWrapper &&other) : config_{other.config_} {
     other.config_ = nullptr;
   }
+  /*! \brief Move-assignment operator. */
   MatmulConfigWrapper &operator=(MatmulConfigWrapper &&other) {
     if (config_ != nullptr) {
       nvte_destroy_matmul_config(config_);

diff --git a/transformer_engine/common/include/transformer_engine/multi_tensor.h b/transformer_engine/common/include/transformer_engine/multi_tensor.h
@@ -288,7 +288,7 @@ void nvte_multi_tensor_compute_scale_inv_e8m0_cuda(int chunk_size, NVTETensor **
  *  and populate the amax of the corresponding output tensor.
  *
  *  \param[in]      input            Input tensor.
- *  \param[in,out]  amaxes           Array of output tensors. Only the amax is updated.
+ *  \param[in,out]  outputs          Array of output tensors. Only the amax is updated.
  *  \param[in]      split_sections   Size of each tensor split along dimension 0.
  *  \param[in]      num_tensors      Number of tensor splits.
  *  \param[in]      stream           CUDA stream used for the operation.

diff --git a/transformer_engine/common/include/transformer_engine/normalization.h b/transformer_engine/common/include/transformer_engine/normalization.h
@@ -163,11 +163,16 @@ void nvte_rmsnorm_bwd_add(const NVTETensor dz, const NVTETensor x, const NVTETen
                           NVTETensor dgamma, NVTETensor workspace, const int multiprocessorCount,
                           const bool zero_centered_gamma, cudaStream_t stream);
 
-/*! \brief Helper to enable cuDNN backend for normalization
+/*! \brief Set whether to enable cuDNN backend for normalization forward.
  *
- *  \param[in]     bool              Enable if True
+ *  \param[in]  enable  Whether to enable cuDNN backend.
  */
 void nvte_enable_cudnn_norm_fwd(bool enable);
+
+/*! \brief Set whether to enable cuDNN backend for normalization backward.
+ *
+ *  \param[in]  enable  Whether to enable cuDNN backend.
+ */
 void nvte_enable_cudnn_norm_bwd(bool enable);
 
 /*! \brief Control whether norm computes `gamma += 1.0` for zero-centered gamma
@@ -176,11 +181,14 @@ void nvte_enable_cudnn_norm_bwd(bool enable);
  *  Currently this only applies to the CuDNN backend. If CuDNN is not used,
  *  this setting has no effect.
  *
- *  \param[in]     bool              Enable if True
+ *  \param[in]  enable  Whether to enable zero-centered gamma.
  */
 void nvte_enable_zero_centered_gamma_in_weight_dtype(bool enable);
 
+#ifdef __cplusplus
+/*! \brief Normalization function type */
 enum class NVTE_Norm_Type { LayerNorm, RMSNorm };
+#endif
 
 #ifdef __cplusplus
 }  // extern "C"

diff --git a/transformer_engine/common/include/transformer_engine/recipe.h b/transformer_engine/common/include/transformer_engine/recipe.h
@@ -23,7 +23,7 @@ extern "C" {
  * the last, the last entry shifts to the second to last) and the
  * first entry is set to zero. The scaling factor is estimated so the
  * FP8 tensor's maximum absolute value is
- * @f$ 2^{-\text{margin}} \text{max}_\text{fp8\_dtype} @f$.
+ * @f$ 2^{-margin} \max_{fp8\_dtype} @f$.
  *
  *  \param[in] amax_history             History of maximum absolute values.
  *                                      Shape: [history_length, num_scales]
@@ -54,7 +54,7 @@ void nvte_delayed_scaling_recipe_amax_and_scale_update(
  * the last, the last entry shifts to the second to last) and the
  * first entry is set to zero. The scaling factor is estimated so the
  * FP8 tensor's maximum absolute value is
- * @f$ 2^{-\text{margin}} \text{max}_\text{fp8\_dtype} @f$.
+ * @f$ 2^{-margin} \max_{fp8\_dtype} @f$.
  *
  *  \param[in] amax_reduction_buffer    The contiguous buffer used for amax reduction.
  *                                      Shape: [num_scales * num_tensors]

diff --git a/transformer_engine/common/include/transformer_engine/swizzle.h b/transformer_engine/common/include/transformer_engine/swizzle.h
@@ -34,6 +34,7 @@ void nvte_swizzle_scaling_factors(const NVTETensor input, NVTETensor output, cud
  *
  *  \param[in]     inputs       Input tensors with non-swizzled scale_inv.
  *  \param[in,out] outputs      Output tensors which hosts swizzled scale_inv.
+ *  \param[in]     num_tensors  Number of input and output tensors.
  *  \param[in]     stream       CUDA stream used for the operation.
  *
  *  Requirements:

diff --git a/transformer_engine/common/include/transformer_engine/transformer_engine.h b/transformer_engine/common/include/transformer_engine/transformer_engine.h
@@ -51,8 +51,11 @@ struct NVTEShape {
  *  It does not own the memory it points to.
  */
 struct NVTEBasicTensor {
+  /*! Pointer to data buffer. */
   void *data_ptr;
+  /*! Data type. */
   NVTEDType dtype;
+  /*! Tensor shape. */
   NVTEShape shape;
 };
 
@@ -144,7 +147,7 @@ void *nvte_tensor_columnwise_data(const NVTETensor tensor);
  *
  *  \param[data] Pointer to start of shape array. If NULL, the shape
  *               will be filled with zeros.
- *  \param[data] Number of dimensions (must be <= 14)
+ *  \param[ndim] Number of dimensions (must be <= 14)
  *
  *  \return A shape. The shape will own its own copy of the data.
  */
@@ -177,7 +180,7 @@ size_t nvte_tensor_ndims(const NVTETensor tensor);
 /*! \brief Get the size of a specific tensor dimension.
  *
  *  \param[in] tensor Tensor.
- *  \param[in] size_t Dimension index.
+ *  \param[in] dim    Dimension index.
  *
  *  \return Size of the tensor at the specified dimension.
  */
@@ -258,8 +261,7 @@ NVTEShape nvte_tensor_scale_inv_shape(const NVTETensor tensor);
 /*! \brief Reset tensor value to zero.
  *
  *  \param[in] tensor Tensor.
- *
- *  \return A scale_inv shape of the input tensor.
+ *  \param[in] stream CUDA stream to use for the operation.
  */
 void nvte_zero_tensor(const NVTETensor tensor, cudaStream_t stream);
 
@@ -539,7 +541,7 @@ enum class DType {
 /*! \brief Check if TE datatype is FP8
  *
  * Return true if TE datatype is FP8
- *  \param[in] DType      TE Datatype of interest
+ *  \param[in] t      TE Datatype of interest
  */
 inline bool is_fp8_dtype(const DType t) {
   return t == DType::kFloat8E4M3 || t == DType::kFloat8E5M2;
@@ -548,14 +550,14 @@ inline bool is_fp8_dtype(const DType t) {
 /*! \brief Check if TE datatype is FP4
  *
  * Return true if TE datatype is FP4
- *  \param[in] DType      TE Datatype of interest
+ *  \param[in] t      TE Datatype of interest
  */
 inline bool is_fp4_dtype(const DType t) { return t == DType::kFloat4E2M1; }
 
 /*! \brief Check if TE datatype is high precision (FP32, FP16, BF16)
  *
  * Return true if TE datatype is high precision
- *  \param[in] DType      TE Datatype of interest
+ *  \param[in] t      TE Datatype of interest
  */
 inline bool is_high_precision_dtype(const DType t) {
   return t == DType::kFloat32 || t == DType::kBFloat16 || t == DType::kFloat16;
@@ -579,6 +581,7 @@ class TensorWrapper {
    *  \param[in] scale_dptr      Pointer to the scale value.
    *  \param[in] scale_inv_shape Shape of scale_inv
    *  \param[in] scale_inv_dptr  Pointer to the inverse of scale value.
+   *  \param[in] scaling_mode    Tensor data format.
    */
   TensorWrapper(void *dptr, const NVTEShape &shape, const DType dtype, float *amax_dptr = nullptr,
                 float *scale_dptr = nullptr, float *scale_inv_dptr = nullptr,
@@ -615,6 +618,7 @@ class TensorWrapper {
    *  \param[in] scale_dptr      Pointer to the scale value.
    *  \param[in] scale_inv_shape Shape of scale_inv
    *  \param[in] scale_inv_dptr  Pointer to the inverse of scale value.
+   *  \param[in] scaling_mode    Tensor data format.
    */
   TensorWrapper(void *dptr, const std::vector<size_t> &shape, const DType dtype,
                 float *amax_dptr = nullptr, float *scale_dptr = nullptr,
@@ -766,7 +770,7 @@ class TensorWrapper {
 
   /*! \brief Get the size of this TensorWrapper in the given dimension.
    *
-   *  \param[in] size_t Dimension index.
+   *  \param[in] dim Dimension index.
    *
    *  \return Size of this TensorWrapper in given dimension.
    */
@@ -935,9 +939,11 @@ class QuantizationConfigWrapper {
   QuantizationConfigWrapper(const QuantizationConfigWrapper &) = delete;
   QuantizationConfigWrapper &operator=(const QuantizationConfigWrapper &) = delete;
 
+  /*! \brief Move constructor. */
   QuantizationConfigWrapper(QuantizationConfigWrapper &&other) : config_{other.config_} {
     other.config_ = nullptr;
   }
+  /*! \brief Move-assignment operator. */
   QuantizationConfigWrapper &operator=(QuantizationConfigWrapper &&other) {
     if (config_ != nullptr) {
       nvte_destroy_quantization_config(config_);

diff --git a/transformer_engine/common/include/transformer_engine/transpose.h b/transformer_engine/common/include/transformer_engine/transpose.h
@@ -231,7 +231,7 @@ void nvte_cast_transpose_dbias_dsrelu(const NVTETensor input, const NVTETensor a
  *  - columnwise data of `output` is equal to `transpose(cast(dact(input)))`
  *
  *  \param[in]     input               Input tensor of shape [N, H].
- *  \param[in]     gated_act_input     Tensor used as input to the forward of
+ *  \param[in]     act_input           Tensor used as input to the forward of
  *                                     gated activation operation.
  *                                     Shape [N, H * 2].
  *  \param[in,out] output              Result of the cast.
@@ -250,7 +250,7 @@ void nvte_dgeglu_cast_transpose(const NVTETensor input, const NVTETensor act_inp
  *  - columnwise data of `output` is equal to `transpose(cast(dact(input)))`
  *
  *  \param[in]     input               Input tensor of shape [N, H].
- *  \param[in]     gated_act_input     Tensor used as input to the forward of
+ *  \param[in]     act_input           Tensor used as input to the forward of
  *                                     gated activation operation.
  *                                     Shape [N, H * 2].
  *  \param[in,out] output              Result of the cast.
@@ -269,7 +269,7 @@ void nvte_dswiglu_cast_transpose(const NVTETensor input, const NVTETensor act_in
  *  - columnwise data of `output` is equal to `transpose(cast(dact(input)))`
  *
  *  \param[in]     input               Input tensor of shape [N, H].
- *  \param[in]     gated_act_input     Tensor used as input to the forward of
+ *  \param[in]     act_input           Tensor used as input to the forward of
  *                                     gated activation operation.
  *                                     Shape [N, H * 2].
  *  \param[in,out] output              Result of the cast.
@@ -288,7 +288,7 @@ void nvte_dreglu_cast_transpose(const NVTETensor input, const NVTETensor act_inp
  *  - columnwise data of `output` is equal to `transpose(cast(dact(input)))`
  *
  *  \param[in]     input               Input tensor of shape [N, H].
- *  \param[in]     gated_act_input     Tensor used as input to the forward of
+ *  \param[in]     act_input           Tensor used as input to the forward of
  *                                     gated activation operation.
  *                                     Shape [N, H * 2].
  *  \param[in,out] output              Result of the cast.
@@ -307,7 +307,7 @@ void nvte_dqgeglu_cast_transpose(const NVTETensor input, const NVTETensor act_in
  *  - columnwise data of `output` is equal to `transpose(cast(dact(input)))`
  *
  *  \param[in]     input               Input tensor of shape [N, H].
- *  \param[in]     gated_act_input     Tensor used as input to the forward of
+ *  \param[in]     act_input           Tensor used as input to the forward of
  *                                     gated activation operation.
  *                                     Shape [N, H * 2].
  *  \param[in,out] output              Result of the cast.