Skip to content

Commit 1240e02

Browse files
committed
Add: cuBLASLt kernels for e4m3, e5m2
Current variant uses the `__nv_fp8_e4m3` and `__nv_fp8_e5m2` types.
1 parent 2770d93 commit 1240e02

File tree

1 file changed

+140
-1
lines changed

1 file changed

+140
-1
lines changed

less_slow.cpp

Lines changed: 140 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3131,12 +3131,150 @@ static void cublas_tops(bm::State &state) {
31313131
cublas_check(cublasDestroy(handle));
31323132
}
31333133

3134-
// Register benchmarks
31353134
// Register the cuBLAS GEMM benchmarks for several precisions, sweeping square inputs
// from 8x8 to 16384x16384 and fitting a cubic complexity curve to the timings.
BENCHMARK(cublas_tops<float>)->RangeMultiplier(2)->Range(8, 16384)->Complexity(benchmark::oNCubed);
BENCHMARK(cublas_tops<double>)->RangeMultiplier(2)->Range(8, 16384)->Complexity(benchmark::oNCubed);
BENCHMARK(cublas_tops<__half>)->RangeMultiplier(2)->Range(8, 16384)->Complexity(benchmark::oNCubed);
BENCHMARK(cublas_tops<int8_t, int32_t>)->RangeMultiplier(2)->Range(8, 16384)->Complexity(benchmark::oNCubed);
31393138

3139+
/**
3140+
* To use newer numeric types, cuBLAS API isn't enough. "cuBLASLt" provides a more flexible interface.
3141+
* One of its' major changes is allowing input "scaling factors". Since SM89 one can provide a `CUDA_R_32F`
3142+
* multiplier scalar for all of `CUDA_R_8F_E4M3` and `CUDA_R_8F_E5M2` inputs and since SM100 black-scaling
3143+
* with different factors is also supported - a common technique used in extreme quantization both on
3144+
* CPUs and GPUs.
3145+
*
3146+
* @see "Using the cuBLASLt API" docs: https://docs.nvidia.com/cuda/cublas/#using-the-cublaslt-api
3147+
*/
3148+
#include <cublasLt.h>
3149+
#include <cuda_fp8.h> // `__nv_fp8*` types
3150+
3151+
template <typename scalar_type_>
3152+
cudaDataType_t to_cuda_data_type() {
3153+
if constexpr (std::is_same<scalar_type_, __nv_fp8_e4m3>::value) return CUDA_R_8F_E4M3;
3154+
if constexpr (std::is_same<scalar_type_, __nv_fp8_e5m2>::value) return CUDA_R_8F_E5M2;
3155+
if constexpr (std::is_same<scalar_type_, float>::value) return CUDA_R_32F;
3156+
if constexpr (std::is_same<scalar_type_, std::int8_t>::value) return CUDA_R_8I;
3157+
if constexpr (std::is_same<scalar_type_, std::uint8_t>::value) return CUDA_R_8U;
3158+
throw std::invalid_argument("Unknown CUDA type");
3159+
}
3160+
3161+
template <typename scalar_type_>
3162+
struct cuda_storage_type {
3163+
using scalar_type = scalar_type_;
3164+
};
3165+
3166+
template <>
3167+
struct cuda_storage_type<__nv_fp8_e4m3> {
3168+
using scalar_type = __nv_fp8_storage_t;
3169+
};
3170+
3171+
template <>
3172+
struct cuda_storage_type<__nv_fp8_e5m2> {
3173+
using scalar_type = __nv_fp8_storage_t;
3174+
};
3175+
3176+
template <typename input_scalar_type_, typename output_scalar_type_ = input_scalar_type_>
3177+
static void cublaslt_tops(bm::State &state) {
3178+
3179+
// Matrix size and leading dimensions
3180+
std::size_t n = static_cast<std::size_t>(state.range(0));
3181+
int lda = static_cast<int>(n), ldb = static_cast<int>(n), ldc = static_cast<int>(n);
3182+
constexpr bool same_type = std::is_same_v<input_scalar_type_, output_scalar_type_>;
3183+
cublasOperation_t a_transpose = CUBLAS_OP_N;
3184+
cublasOperation_t b_transpose = CUBLAS_OP_N;
3185+
3186+
// Unified memory for large matrices
3187+
using input_storage_type = typename cuda_storage_type<input_scalar_type_>::scalar_type;
3188+
unified_array<input_storage_type> a(n * n), b(n * n);
3189+
unified_array<output_scalar_type_> c(n * n), d(n * n);
3190+
3191+
// With unified memory, we don't even need Thrust to initialize the data
3192+
std::iota(a.begin(), a.end(), 0);
3193+
std::iota(b.begin(), b.end(), 0);
3194+
std::fill(c.begin(), c.end(), 0);
3195+
std::fill(d.begin(), d.end(), 0);
3196+
3197+
cublasLtHandle_t handle;
3198+
cublas_check(cublasLtCreate(&handle));
3199+
3200+
// Create the matmul descriptor.
3201+
cublasLtMatmulDesc_t descriptor = nullptr;
3202+
cublas_check(cublasLtMatmulDescCreate(&descriptor, CUBLAS_COMPUTE_32F, to_cuda_data_type<output_scalar_type_>()));
3203+
cublas_check(
3204+
cublasLtMatmulDescSetAttribute(descriptor, CUBLASLT_MATMUL_DESC_TRANSA, &a_transpose, sizeof(a_transpose)));
3205+
cublas_check(
3206+
cublasLtMatmulDescSetAttribute(descriptor, CUBLASLT_MATMUL_DESC_TRANSB, &b_transpose, sizeof(b_transpose)));
3207+
3208+
// Set per-tensor scaling attributes (using 1.0f as the default scaling factors).
3209+
float a_scale = 1.0f, b_scale = 1.0f, c_scale = 1.0f, d_scale = 1.0f;
3210+
cublas_check(
3211+
cublasLtMatmulDescSetAttribute(descriptor, CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, &a_scale, sizeof(a_scale)));
3212+
cublas_check(
3213+
cublasLtMatmulDescSetAttribute(descriptor, CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, &b_scale, sizeof(b_scale)));
3214+
cublas_check(
3215+
cublasLtMatmulDescSetAttribute(descriptor, CUBLASLT_MATMUL_DESC_C_SCALE_POINTER, &c_scale, sizeof(c_scale)));
3216+
cublas_check(
3217+
cublasLtMatmulDescSetAttribute(descriptor, CUBLASLT_MATMUL_DESC_D_SCALE_POINTER, &d_scale, sizeof(d_scale)));
3218+
3219+
// Create matrix layout descriptors for A, B, C, and D (output)
3220+
// https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuBLASLt/LtFp8Matmul/sample_cublasLt_LtFp8Matmul.cu
3221+
cublasLtMatrixLayout_t a_descriptor = nullptr, b_descriptor = nullptr, c_descriptor = nullptr,
3222+
d_descriptor = nullptr;
3223+
cublas_check(cublasLtMatrixLayoutCreate(&a_descriptor, to_cuda_data_type<input_scalar_type_>(), n, n, lda));
3224+
cublas_check(cublasLtMatrixLayoutCreate(&b_descriptor, to_cuda_data_type<input_scalar_type_>(), n, n, ldb));
3225+
cublas_check(cublasLtMatrixLayoutCreate(&c_descriptor, to_cuda_data_type<output_scalar_type_>(), n, n, ldc));
3226+
cublas_check(cublasLtMatrixLayoutCreate(&d_descriptor, to_cuda_data_type<output_scalar_type_>(), n, n, ldc));
3227+
3228+
// Create a preference handle and set workspace limit (0 in this example).
3229+
cublasLtMatmulPreference_t preference = nullptr;
3230+
cublas_check(cublasLtMatmulPreferenceCreate(&preference));
3231+
std::size_t workspace_size = 0;
3232+
cublas_check(cublasLtMatmulPreferenceSetAttribute(preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
3233+
&workspace_size, sizeof(workspace_size)));
3234+
3235+
// Query the heuristic for the best available algorithm.
3236+
int heuristics_count = 0;
3237+
cublasLtMatmulHeuristicResult_t heuristic_result = {};
3238+
cublas_check(cublasLtMatmulAlgoGetHeuristic(handle, descriptor, a_descriptor, b_descriptor, c_descriptor,
3239+
d_descriptor, preference, 1, &heuristic_result, &heuristics_count));
3240+
if (heuristics_count == 0) throw std::runtime_error("No suitable heuristic found for cuBLASLt matmul");
3241+
3242+
// Define scaling factors (using FP32 scalars)
3243+
float alpha = 1.0f;
3244+
float beta = 0.0f;
3245+
3246+
for (auto _ : state) {
3247+
3248+
cublasLtMatmul( //
3249+
handle, descriptor, //
3250+
&alpha, //
3251+
a.begin(), a_descriptor, b.begin(), b_descriptor, //
3252+
&beta, //
3253+
c.begin(), c_descriptor, d.begin(), d_descriptor, //
3254+
&heuristic_result.algo, nullptr, 0, // No workspace
3255+
nullptr); // Default stream
3256+
3257+
// Synchronize to ensure that the GEMM call completes before timing stops.
3258+
// Otherwise 10'000 calls will be scheduled and we will block forever until all complete!
3259+
cudaDeviceSynchronize();
3260+
}
3261+
3262+
std::size_t tops_per_cycle = n * n * (n /* multiplications */ + (n - 1) /* additions */);
3263+
state.counters["TOP"] = bm::Counter(state.iterations() * tops_per_cycle, bm::Counter::kIsRate);
3264+
state.SetComplexityN(n);
3265+
3266+
// Cleanup
3267+
cublas_check(cublasLtMatrixLayoutDestroy(a_descriptor));
3268+
cublas_check(cublasLtMatrixLayoutDestroy(b_descriptor));
3269+
cublas_check(cublasLtMatrixLayoutDestroy(c_descriptor));
3270+
cublas_check(cublasLtMatrixLayoutDestroy(d_descriptor));
3271+
cublas_check(cublasLtMatmulDescDestroy(descriptor));
3272+
cublas_check(cublasLtDestroy(handle));
3273+
}
3274+
3275+
// FP8 inputs with `f32` accumulation and output - expressible via cuBLASLt but not the plain cuBLAS API.
BENCHMARK(cublaslt_tops<__nv_fp8_e4m3, float>)->RangeMultiplier(2)->Range(8, 16384)->Complexity(benchmark::oNCubed);
BENCHMARK(cublaslt_tops<__nv_fp8_e5m2, float>)->RangeMultiplier(2)->Range(8, 16384)->Complexity(benchmark::oNCubed);
3277+
31403278
/**
31413279
* Here are the numbers one can expect on a Nvidia H200 GPUs:
31423280
*
@@ -3148,6 +3286,7 @@ BENCHMARK(cublas_tops<int8_t, int32_t>)->RangeMultiplier(2)->Range(8, 16384)->Co
31483286
* - `bf16` @b 1'000 T @b 1'047 T -
31493287
* - `f16` @b 1'000 T @b 1'056 T @b 764 T
31503288
* - `i8` & `u8` @b 2'000 T - @b 122 T
3289+
* - `e4m3` & `e5m2` @b 2'000 T - -
31513290
* - `b1` XOR-based - @b 79 T -
31523291
* - `b1` AND-based - @b 8'439 T -
31533292
*

0 commit comments

Comments
 (0)