@@ -3131,12 +3131,150 @@ static void cublas_tops(bm::State &state) {
     cublas_check(cublasDestroy(handle));
 }

-// Register benchmarks
 BENCHMARK(cublas_tops<float>)->RangeMultiplier(2)->Range(8, 16384)->Complexity(benchmark::oNCubed);
 BENCHMARK(cublas_tops<double>)->RangeMultiplier(2)->Range(8, 16384)->Complexity(benchmark::oNCubed);
 BENCHMARK(cublas_tops<__half>)->RangeMultiplier(2)->Range(8, 16384)->Complexity(benchmark::oNCubed);
 BENCHMARK(cublas_tops<int8_t, int32_t>)->RangeMultiplier(2)->Range(8, 16384)->Complexity(benchmark::oNCubed);

+/**
+ * To use the newer numeric types, the classic cuBLAS API isn't enough: "cuBLASLt" provides a more
+ * flexible interface. One of its major additions is support for input "scaling factors". Since SM89,
+ * one can provide a `CUDA_R_32F` multiplier scalar for `CUDA_R_8F_E4M3` and `CUDA_R_8F_E5M2` inputs,
+ * and since SM100, block-scaling with different per-block factors is also supported - a common
+ * technique used in extreme quantization on both CPUs and GPUs. The `cublaslt_tops` benchmark below
+ * walks through the full descriptor, layout, heuristic, and matmul sequence.
+ *
+ * @see "Using the cuBLASLt API" docs: https://docs.nvidia.com/cuda/cublas/#using-the-cublaslt-api
+ */
+#include <cublasLt.h>
+#include <cuda_fp8.h> // `__nv_fp8*` types
+
+template <typename scalar_type_>
+cudaDataType_t to_cuda_data_type() {
+    if constexpr (std::is_same<scalar_type_, __nv_fp8_e4m3>::value) return CUDA_R_8F_E4M3;
+    if constexpr (std::is_same<scalar_type_, __nv_fp8_e5m2>::value) return CUDA_R_8F_E5M2;
+    if constexpr (std::is_same<scalar_type_, float>::value) return CUDA_R_32F;
+    if constexpr (std::is_same<scalar_type_, std::int8_t>::value) return CUDA_R_8I;
+    if constexpr (std::is_same<scalar_type_, std::uint8_t>::value) return CUDA_R_8U;
+    throw std::invalid_argument("Unknown CUDA type");
+}
+
+template <typename scalar_type_>
+struct cuda_storage_type {
+    using scalar_type = scalar_type_;
+};
+
+template <>
+struct cuda_storage_type<__nv_fp8_e4m3> {
+    using scalar_type = __nv_fp8_storage_t;
+};
+
+template <>
+struct cuda_storage_type<__nv_fp8_e5m2> {
+    using scalar_type = __nv_fp8_storage_t;
+};
+
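+// Why the indirection? The FP8 "class" types can't be populated with `std::iota` directly on the
+// host, so for the inputs we work with their raw `__nv_fp8_storage_t` bytes (a plain `unsigned char`
+// typedef): for a throughput benchmark the exact values don't matter, only the bit-width does.
+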
+template <typename input_scalar_type_, typename output_scalar_type_ = input_scalar_type_>
+static void cublaslt_tops(bm::State &state) {
+
+    // Matrix size and leading dimensions
+    std::size_t n = static_cast<std::size_t>(state.range(0));
+    int lda = static_cast<int>(n), ldb = static_cast<int>(n), ldc = static_cast<int>(n);
+    constexpr bool same_type = std::is_same_v<input_scalar_type_, output_scalar_type_>;
+    cublasOperation_t a_transpose = CUBLAS_OP_N;
+    cublasOperation_t b_transpose = CUBLAS_OP_N;
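+    // Depending on the cuBLAS version, FP8 kernels may only be exposed for the "TN" layout
+    // (A transposed, B not); if the heuristic query below finds nothing, switching `a_transpose`
+    // to `CUBLAS_OP_T` is the first thing to try.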
+
+    // Unified memory for large matrices
+    using input_storage_type = typename cuda_storage_type<input_scalar_type_>::scalar_type;
+    unified_array<input_storage_type> a(n * n), b(n * n);
+    unified_array<output_scalar_type_> c(n * n), d(n * n);
+
+    // With unified memory, we don't even need Thrust to initialize the data
+    std::iota(a.begin(), a.end(), 0);
+    std::iota(b.begin(), b.end(), 0);
+    std::fill(c.begin(), c.end(), 0);
+    std::fill(d.begin(), d.end(), 0);
+
+    cublasLtHandle_t handle;
+    cublas_check(cublasLtCreate(&handle));
+
+    // Create the matmul descriptor.
+    cublasLtMatmulDesc_t descriptor = nullptr;
+    cublas_check(cublasLtMatmulDescCreate(&descriptor, CUBLAS_COMPUTE_32F, to_cuda_data_type<output_scalar_type_>()));
+    cublas_check(
+        cublasLtMatmulDescSetAttribute(descriptor, CUBLASLT_MATMUL_DESC_TRANSA, &a_transpose, sizeof(a_transpose)));
+    cublas_check(
+        cublasLtMatmulDescSetAttribute(descriptor, CUBLASLT_MATMUL_DESC_TRANSB, &b_transpose, sizeof(b_transpose)));
+
+    // Set per-tensor scaling attributes. Each attribute stores a *pointer* to a `float` scale
+    // (per the cuBLASLt docs, ideally in device-visible memory), so we pass the address of a
+    // pointer and `sizeof` that pointer. Here all scales default to 1.0f.
+    float a_scale = 1.0f, b_scale = 1.0f, c_scale = 1.0f, d_scale = 1.0f;
+    float const *a_scale_ptr = &a_scale, *b_scale_ptr = &b_scale, *c_scale_ptr = &c_scale, *d_scale_ptr = &d_scale;
+    cublas_check(
+        cublasLtMatmulDescSetAttribute(descriptor, CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, &a_scale_ptr, sizeof(a_scale_ptr)));
+    cublas_check(
+        cublasLtMatmulDescSetAttribute(descriptor, CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, &b_scale_ptr, sizeof(b_scale_ptr)));
+    cublas_check(
+        cublasLtMatmulDescSetAttribute(descriptor, CUBLASLT_MATMUL_DESC_C_SCALE_POINTER, &c_scale_ptr, sizeof(c_scale_ptr)));
+    cublas_check(
+        cublasLtMatmulDescSetAttribute(descriptor, CUBLASLT_MATMUL_DESC_D_SCALE_POINTER, &d_scale_ptr, sizeof(d_scale_ptr)));
+
+    // Create matrix layout descriptors for A, B, C, and D (output)
+    // https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuBLASLt/LtFp8Matmul/sample_cublasLt_LtFp8Matmul.cu
+    cublasLtMatrixLayout_t a_descriptor = nullptr, b_descriptor = nullptr, c_descriptor = nullptr,
+                           d_descriptor = nullptr;
+    cublas_check(cublasLtMatrixLayoutCreate(&a_descriptor, to_cuda_data_type<input_scalar_type_>(), n, n, lda));
+    cublas_check(cublasLtMatrixLayoutCreate(&b_descriptor, to_cuda_data_type<input_scalar_type_>(), n, n, ldb));
+    cublas_check(cublasLtMatrixLayoutCreate(&c_descriptor, to_cuda_data_type<output_scalar_type_>(), n, n, ldc));
+    cublas_check(cublasLtMatrixLayoutCreate(&d_descriptor, to_cuda_data_type<output_scalar_type_>(), n, n, ldc));
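+    // Without an explicit `CUBLASLT_MATRIX_LAYOUT_ORDER` attribute these layouts default to
+    // column-major order, matching classic BLAS conventions.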
+
+    // Create a preference handle and set the workspace limit (0 in this example).
+    cublasLtMatmulPreference_t preference = nullptr;
+    cublas_check(cublasLtMatmulPreferenceCreate(&preference));
+    std::size_t workspace_size = 0;
+    cublas_check(cublasLtMatmulPreferenceSetAttribute(preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
+                                                      &workspace_size, sizeof(workspace_size)));
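+    // A zero-byte workspace restricts which algorithms the heuristic can return; production code
+    // typically passes a few MiB of device memory here to unlock faster kernels.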
+
+    // Query the heuristic for the best available algorithm.
+    int heuristics_count = 0;
+    cublasLtMatmulHeuristicResult_t heuristic_result = {};
+    cublas_check(cublasLtMatmulAlgoGetHeuristic(handle, descriptor, a_descriptor, b_descriptor, c_descriptor,
+                                                d_descriptor, preference, 1, &heuristic_result, &heuristics_count));
+    if (heuristics_count == 0) throw std::runtime_error("No suitable heuristic found for cuBLASLt matmul");
+
+    // Define the GEMM coefficients `alpha` and `beta` (FP32 scalars, matching the descriptor's scale type)
+    float alpha = 1.0f;
+    float beta = 0.0f;
+
+    for (auto _ : state) {
+
+        cublasLtMatmul(                                       //
+            handle, descriptor,                               //
+            &alpha,                                           //
+            a.begin(), a_descriptor, b.begin(), b_descriptor, //
+            &beta,                                            //
+            c.begin(), c_descriptor, d.begin(), d_descriptor, //
+            &heuristic_result.algo, nullptr, 0,               // No workspace
+            nullptr);                                         // Default stream
+
+        // Synchronize to ensure that the GEMM call completes before timing stops.
+        // Otherwise 10'000 calls will be scheduled and we will block forever until all complete!
+        cudaDeviceSynchronize();
+    }
+
+    std::size_t tops_per_cycle = n * n * (n /* multiplications */ + (n - 1) /* additions */);
+    state.counters["TOP"] = bm::Counter(state.iterations() * tops_per_cycle, bm::Counter::kIsRate);
+    state.SetComplexityN(n);
+
+    // Cleanup
+    cublas_check(cublasLtMatmulPreferenceDestroy(preference));
+    cublas_check(cublasLtMatrixLayoutDestroy(a_descriptor));
+    cublas_check(cublasLtMatrixLayoutDestroy(b_descriptor));
+    cublas_check(cublasLtMatrixLayoutDestroy(c_descriptor));
+    cublas_check(cublasLtMatrixLayoutDestroy(d_descriptor));
+    cublas_check(cublasLtMatmulDescDestroy(descriptor));
+    cublas_check(cublasLtDestroy(handle));
+}
+
+BENCHMARK(cublaslt_tops<__nv_fp8_e4m3, float>)->RangeMultiplier(2)->Range(8, 16384)->Complexity(benchmark::oNCubed);
+BENCHMARK(cublaslt_tops<__nv_fp8_e5m2, float>)->RangeMultiplier(2)->Range(8, 16384)->Complexity(benchmark::oNCubed);
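+
+// FP8 `cublasLtMatmul` kernels require SM89 (Ada) or newer hardware; on older GPUs the heuristic
+// query above won't return a usable algorithm and the benchmark throws. A minimal guard (a sketch,
+// not part of the original code) could check the compute capability before relying on these:
+//
+//     int device = 0, major = 0, minor = 0;
+//     cudaGetDevice(&device);
+//     cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device);
+//     cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device);
+//     bool supports_fp8 = major > 8 || (major == 8 && minor >= 9);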
+
 /**
  * Here are the numbers one can expect on Nvidia H200 GPUs:
  *
@@ -3148,6 +3286,7 @@ BENCHMARK(cublas_tops<int8_t, int32_t>)->RangeMultiplier(2)->Range(8, 16384)->Co
  * - `bf16`           @b 1'000 T    @b 1'047 T    -
  * - `f16`            @b 1'000 T    @b 1'056 T    @b 764 T
  * - `i8` & `u8`      @b 2'000 T    -             @b 122 T
+ * - `e4m3` & `e5m2`  @b 2'000 T    -             -
  * - `b1` XOR-based   -             @b 79 T       -
  * - `b1` AND-based   -             @b 8'439 T    -
  *