diff --git a/.gitignore b/.gitignore index 74acd6ad7f..7a86041a1e 100644 --- a/.gitignore +++ b/.gitignore @@ -41,3 +41,4 @@ compile_commands.json .nfs tensor_dumps/ artifacts/ +*.DS_Store diff --git a/transformer_engine/common/include/transformer_engine/cast.h b/transformer_engine/common/include/transformer_engine/cast.h index 5e67b7645e..576494a4de 100644 --- a/transformer_engine/common/include/transformer_engine/cast.h +++ b/transformer_engine/common/include/transformer_engine/cast.h @@ -130,7 +130,7 @@ void nvte_quantize_v2(const NVTETensor input, NVTETensor output, * \param[in] stream CUDA stream used for the operation. */ void nvte_quantize_dbias(const NVTETensor input, NVTETensor output, NVTETensor dbias, - NVTETensor workplace, cudaStream_t stream); + NVTETensor workspace, cudaStream_t stream); /*! \brief Computes backward of GeLU operation on the input, then casts to FP8/MXFP8. * Additionally, reduces the result of the GeLU backward along columns. @@ -263,7 +263,8 @@ void nvte_dequantize(const NVTETensor input, NVTETensor output, cudaStream_t str * * \param[in] inputs List of input tensors to be cast. * \param[in,out] outputs List of output quantized tensors. - * \param[in] quant_config (Optional) Quantization configurations. + * \param[in] quant_config (Optional) Quantization configurations. + * \param[in] num_tensors Number of input and output tensors. * \param[in] stream CUDA stream used for the operation. */ void nvte_multi_tensor_quantize(const NVTETensor *inputs, NVTETensor *outputs, diff --git a/transformer_engine/common/include/transformer_engine/cast_transpose_noop.h b/transformer_engine/common/include/transformer_engine/cast_transpose_noop.h index 5bef067910..d21a28e521 100644 --- a/transformer_engine/common/include/transformer_engine/cast_transpose_noop.h +++ b/transformer_engine/common/include/transformer_engine/cast_transpose_noop.h @@ -4,8 +4,8 @@ * See LICENSE for license information. ************************************************************************/ -/*! \file transpose_with_noop.h - * \brief Functions handling transposes with no-op. +/*! \file cast_transpose_noop.h + * \brief Transpose functions with no-op flag. */ #ifndef TRANSFORMER_ENGINE_CAST_TRANSPOSE_WITH_NOOP_H_ diff --git a/transformer_engine/common/include/transformer_engine/fused_attn.h b/transformer_engine/common/include/transformer_engine/fused_attn.h index 16b4b8ff4d..dd70ccf8df 100644 --- a/transformer_engine/common/include/transformer_engine/fused_attn.h +++ b/transformer_engine/common/include/transformer_engine/fused_attn.h @@ -409,7 +409,6 @@ void nvte_fused_attn_bwd_qkvpacked( * \param[in] softmax_type Attention softmax type. * \param[in] window_size_left Sliding window size (the left half). * \param[in] window_size_right Sliding window size (the right half). - * \param[in] deterministic Whether to execute with deterministic behaviours. * \param[in] workspace Workspace tensor. * \param[in] stream CUDA stream used for this operation. */ @@ -673,7 +672,7 @@ void nvte_populate_rng_state_async(NVTETensor rng_state_dst, const NVTETensor se * \param[in] len batch_size x sequence_length. * \param[in] stream CUDA stream used for this operation. */ -uint32_t nvte_get_runtime_num_segments(NVTETensor cu_seqlen, NVTETensor workspace, size_t len, +uint32_t nvte_get_runtime_num_segments(NVTETensor cu_seqlens, NVTETensor workspace, size_t len, cudaStream_t stream); /*! \brief Set the seed and offset for RNG state. @@ -830,8 +829,7 @@ void nvte_convert_thd_to_bshd(NVTETensor tensor, NVTETensor cu_seqlens, NVTETens * \param[in] tensor Input tensor. * \param[in] cu_seqlens Cumulative sequence lengths, [batch_size + 1]. * \param[out] new_tensor Output tensor. - * \param[in] b Batch size. - * \param[in] max_seq_len Maximum sequence length. + * \param[in] t Packed sequence length. * \param[in] stream CUDA stream used for this operation. */ void nvte_convert_bshd_to_thd(NVTETensor tensor, NVTETensor cu_seqlens, NVTETensor new_tensor, diff --git a/transformer_engine/common/include/transformer_engine/gemm.h b/transformer_engine/common/include/transformer_engine/gemm.h index 38214f8872..8a51b54fdb 100644 --- a/transformer_engine/common/include/transformer_engine/gemm.h +++ b/transformer_engine/common/include/transformer_engine/gemm.h @@ -255,9 +255,11 @@ class MatmulConfigWrapper { MatmulConfigWrapper(const MatmulConfigWrapper &) = delete; MatmulConfigWrapper &operator=(const MatmulConfigWrapper &) = delete; + /*! \brief Move constructor. */ MatmulConfigWrapper(MatmulConfigWrapper &&other) : config_{other.config_} { other.config_ = nullptr; } + /*! \brief Move-assignment operator. */ MatmulConfigWrapper &operator=(MatmulConfigWrapper &&other) { if (config_ != nullptr) { nvte_destroy_matmul_config(config_); diff --git a/transformer_engine/common/include/transformer_engine/multi_tensor.h b/transformer_engine/common/include/transformer_engine/multi_tensor.h index 1bea4cb21f..303801a88a 100644 --- a/transformer_engine/common/include/transformer_engine/multi_tensor.h +++ b/transformer_engine/common/include/transformer_engine/multi_tensor.h @@ -288,7 +288,7 @@ void nvte_multi_tensor_compute_scale_inv_e8m0_cuda(int chunk_size, NVTETensor ** * and populate the amax of the corresponding output tensor. * * \param[in] input Input tensor. - * \param[in,out] amaxes Array of output tensors. Only the amax is updated. + * \param[in,out] outputs Array of output tensors. Only the amax is updated. * \param[in] split_sections Size of each tensor split along dimension 0. * \param[in] num_tensors Number of tensor splits. * \param[in] stream CUDA stream used for the operation. diff --git a/transformer_engine/common/include/transformer_engine/normalization.h b/transformer_engine/common/include/transformer_engine/normalization.h index 7f5bd92fc1..29b98ca54f 100644 --- a/transformer_engine/common/include/transformer_engine/normalization.h +++ b/transformer_engine/common/include/transformer_engine/normalization.h @@ -163,11 +163,16 @@ void nvte_rmsnorm_bwd_add(const NVTETensor dz, const NVTETensor x, const NVTETen NVTETensor dgamma, NVTETensor workspace, const int multiprocessorCount, const bool zero_centered_gamma, cudaStream_t stream); -/*! \brief Helper to enable cuDNN backend for normalization +/*! \brief Set whether to enable cuDNN backend for normalization forward. * - * \param[in] bool Enable if True + * \param[in] enable Whether to enable cuDNN backend. */ void nvte_enable_cudnn_norm_fwd(bool enable); + +/*! \brief Set whether to enable cuDNN backend for normalization backward. + * + * \param[in] enable Whether to enable cuDNN backend. + */ void nvte_enable_cudnn_norm_bwd(bool enable); /*! \brief Control whether norm computes `gamma += 1.0` for zero-centered gamma @@ -176,11 +181,14 @@ void nvte_enable_cudnn_norm_bwd(bool enable); * Currently this only applies to the CuDNN backend. If CuDNN is not used, * this setting has no effect. * - * \param[in] bool Enable if True + * \param[in] enable Whether to enable zero-centered gamma. */ void nvte_enable_zero_centered_gamma_in_weight_dtype(bool enable); +#ifdef __cplusplus +/*! \brief Normalization function type */ enum class NVTE_Norm_Type { LayerNorm, RMSNorm }; +#endif #ifdef __cplusplus } // extern "C" diff --git a/transformer_engine/common/include/transformer_engine/recipe.h b/transformer_engine/common/include/transformer_engine/recipe.h index 83d436db30..c0cec8a3b9 100644 --- a/transformer_engine/common/include/transformer_engine/recipe.h +++ b/transformer_engine/common/include/transformer_engine/recipe.h @@ -23,7 +23,7 @@ extern "C" { * the last, the last entry shifts to the second to last) and the * first entry is set to zero. The scaling factor is estimated so the * FP8 tensor's maximum absolute value is - * @f$ 2^{-\text{margin}} \text{max}_\text{fp8\_dtype} @f$. + * @f$ 2^{-margin} \max_{fp8\_dtype} @f$. * * \param[in] amax_history History of maximum absolute values. * Shape: [history_length, num_scales] @@ -54,7 +54,7 @@ void nvte_delayed_scaling_recipe_amax_and_scale_update( * the last, the last entry shifts to the second to last) and the * first entry is set to zero. The scaling factor is estimated so the * FP8 tensor's maximum absolute value is - * @f$ 2^{-\text{margin}} \text{max}_\text{fp8\_dtype} @f$. + * @f$ 2^{-margin} \max_{fp8\_dtype} @f$. * * \param[in] amax_reduction_buffer The contiguous buffer used for amax reduction. * Shape: [num_scales * num_tensors] diff --git a/transformer_engine/common/include/transformer_engine/swizzle.h b/transformer_engine/common/include/transformer_engine/swizzle.h index b4489abdd8..4e3544d3c7 100644 --- a/transformer_engine/common/include/transformer_engine/swizzle.h +++ b/transformer_engine/common/include/transformer_engine/swizzle.h @@ -34,6 +34,7 @@ void nvte_swizzle_scaling_factors(const NVTETensor input, NVTETensor output, cud * * \param[in] inputs Input tensors with non-swizzled scale_inv. * \param[in,out] outputs Output tensors which hosts swizzled scale_inv. + * \param[in] num_tensors Number of input and output tensors. * \param[in] stream CUDA stream used for the operation. * * Requirements: diff --git a/transformer_engine/common/include/transformer_engine/transformer_engine.h b/transformer_engine/common/include/transformer_engine/transformer_engine.h index 7fc9d78980..fd0125c8d0 100644 --- a/transformer_engine/common/include/transformer_engine/transformer_engine.h +++ b/transformer_engine/common/include/transformer_engine/transformer_engine.h @@ -51,8 +51,11 @@ struct NVTEShape { * It does not own the memory it points to. */ struct NVTEBasicTensor { + /*! Pointer to data buffer. */ void *data_ptr; + /*! Data type. */ NVTEDType dtype; + /*! Tensor shape. */ NVTEShape shape; }; @@ -144,7 +147,7 @@ void *nvte_tensor_columnwise_data(const NVTETensor tensor); * * \param[data] Pointer to start of shape array. If NULL, the shape * will be filled with zeros. - * \param[data] Number of dimensions (must be <= 14) + * \param[ndim] Number of dimensions (must be <= 14) * * \return A shape. The shape will own its own copy of the data. */ @@ -177,7 +180,7 @@ size_t nvte_tensor_ndims(const NVTETensor tensor); /*! \brief Get the size of a specific tensor dimension. * * \param[in] tensor Tensor. - * \param[in] size_t Dimension index. + * \param[in] dim Dimension index. * * \return Size of the tensor at the specified dimension. */ @@ -258,8 +261,7 @@ NVTEShape nvte_tensor_scale_inv_shape(const NVTETensor tensor); /*! \brief Reset tensor value to zero. * * \param[in] tensor Tensor. - * - * \return A scale_inv shape of the input tensor. + * \param[in] stream CUDA stream to use for the operation. */ void nvte_zero_tensor(const NVTETensor tensor, cudaStream_t stream); @@ -539,7 +541,7 @@ enum class DType { /*! \brief Check if TE datatype is FP8 * * Return true if TE datatype is FP8 - * \param[in] DType TE Datatype of interest + * \param[in] t TE Datatype of interest */ inline bool is_fp8_dtype(const DType t) { return t == DType::kFloat8E4M3 || t == DType::kFloat8E5M2; @@ -548,14 +550,14 @@ inline bool is_fp8_dtype(const DType t) { /*! \brief Check if TE datatype is FP4 * * Return true if TE datatype is FP4 - * \param[in] DType TE Datatype of interest + * \param[in] t TE Datatype of interest */ inline bool is_fp4_dtype(const DType t) { return t == DType::kFloat4E2M1; } /*! \brief Check if TE datatype is high precision (FP32, FP16, BF16) * * Return true if TE datatype is high precision - * \param[in] DType TE Datatype of interest + * \param[in] t TE Datatype of interest */ inline bool is_high_precision_dtype(const DType t) { return t == DType::kFloat32 || t == DType::kBFloat16 || t == DType::kFloat16; @@ -579,6 +581,7 @@ class TensorWrapper { * \param[in] scale_dptr Pointer to the scale value. * \param[in] scale_inv_shape Shape of scale_inv * \param[in] scale_inv_dptr Pointer to the inverse of scale value. + * \param[in] scaling_mode Tensor data format. */ TensorWrapper(void *dptr, const NVTEShape &shape, const DType dtype, float *amax_dptr = nullptr, float *scale_dptr = nullptr, float *scale_inv_dptr = nullptr, @@ -615,6 +618,7 @@ class TensorWrapper { * \param[in] scale_dptr Pointer to the scale value. * \param[in] scale_inv_shape Shape of scale_inv * \param[in] scale_inv_dptr Pointer to the inverse of scale value. + * \param[in] scaling_mode Tensor data format. */ TensorWrapper(void *dptr, const std::vector &shape, const DType dtype, float *amax_dptr = nullptr, float *scale_dptr = nullptr, @@ -766,7 +770,7 @@ class TensorWrapper { /*! \brief Get the size of this TensorWrapper in the given dimension. * - * \param[in] size_t Dimension index. + * \param[in] dim Dimension index. * * \return Size of this TensorWrapper in given dimension. */ @@ -935,9 +939,11 @@ class QuantizationConfigWrapper { QuantizationConfigWrapper(const QuantizationConfigWrapper &) = delete; QuantizationConfigWrapper &operator=(const QuantizationConfigWrapper &) = delete; + /*! \brief Move constructor. */ QuantizationConfigWrapper(QuantizationConfigWrapper &&other) : config_{other.config_} { other.config_ = nullptr; } + /*! \brief Move-assignment operator. */ QuantizationConfigWrapper &operator=(QuantizationConfigWrapper &&other) { if (config_ != nullptr) { nvte_destroy_quantization_config(config_); diff --git a/transformer_engine/common/include/transformer_engine/transpose.h b/transformer_engine/common/include/transformer_engine/transpose.h index cd73935abb..5f9a8fe149 100644 --- a/transformer_engine/common/include/transformer_engine/transpose.h +++ b/transformer_engine/common/include/transformer_engine/transpose.h @@ -231,7 +231,7 @@ void nvte_cast_transpose_dbias_dsrelu(const NVTETensor input, const NVTETensor a * - columnwise data of `output` is equal to `transpose(cast(dact(input)))` * * \param[in] input Input tensor of shape [N, H]. - * \param[in] gated_act_input Tensor used as input to the forward of + * \param[in] act_input Tensor used as input to the forward of * gated activation operation. * Shape [N, H * 2]. * \param[in,out] output Result of the cast. @@ -250,7 +250,7 @@ void nvte_dgeglu_cast_transpose(const NVTETensor input, const NVTETensor act_inp * - columnwise data of `output` is equal to `transpose(cast(dact(input)))` * * \param[in] input Input tensor of shape [N, H]. - * \param[in] gated_act_input Tensor used as input to the forward of + * \param[in] act_input Tensor used as input to the forward of * gated activation operation. * Shape [N, H * 2]. * \param[in,out] output Result of the cast. @@ -269,7 +269,7 @@ void nvte_dswiglu_cast_transpose(const NVTETensor input, const NVTETensor act_in * - columnwise data of `output` is equal to `transpose(cast(dact(input)))` * * \param[in] input Input tensor of shape [N, H]. - * \param[in] gated_act_input Tensor used as input to the forward of + * \param[in] act_input Tensor used as input to the forward of * gated activation operation. * Shape [N, H * 2]. * \param[in,out] output Result of the cast. @@ -288,7 +288,7 @@ void nvte_dreglu_cast_transpose(const NVTETensor input, const NVTETensor act_inp * - columnwise data of `output` is equal to `transpose(cast(dact(input)))` * * \param[in] input Input tensor of shape [N, H]. - * \param[in] gated_act_input Tensor used as input to the forward of + * \param[in] act_input Tensor used as input to the forward of * gated activation operation. * Shape [N, H * 2]. * \param[in,out] output Result of the cast. @@ -307,7 +307,7 @@ void nvte_dqgeglu_cast_transpose(const NVTETensor input, const NVTETensor act_in * - columnwise data of `output` is equal to `transpose(cast(dact(input)))` * * \param[in] input Input tensor of shape [N, H]. - * \param[in] gated_act_input Tensor used as input to the forward of + * \param[in] act_input Tensor used as input to the forward of * gated activation operation. * Shape [N, H * 2]. * \param[in,out] output Result of the cast.