/*******************************************************************************
 * Copyright (c) 2023-2024 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
#ifndef TRITONBENCHMARK_SPLIT_K_GEMM_H
#define TRITONBENCHMARK_SPLIT_K_GEMM_H

#include "xetla.hpp"
#include <sycl.hpp>

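// K-slicing strategies selectable at compile time (semantics follow the
// comments in split_k_gemm_run below):
//   none   - no k-slicing: a single workgroup accumulates the full K range
//            for each output tile
//   global - split K across workgroups; partial results are combined through
//            the global-memory Acc buffer (with Cnt as a synchronization
//            counter for that path)
//   local  - split K within a workgroup; partial results are reduced through
//            shared local memory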
enum class kslicing_impl_t : uint8_t { none = 0, global = 1, local = 2 };

template <int m, int k, int n,
          kslicing_impl_t kslicing_type = kslicing_impl_t::none>
sycl::event split_k_gemm_run(void *_A, void *_B, void *_C, void *_Acc,
                             void *_Cnt, sycl::queue &queue) {

  // GEMM_UNIVERSAL input size
  size_t matrix_m = m;
  size_t matrix_n = n;
  size_t matrix_k = k;

  // Flattened element counts for A, B, and C (informational only; the caller
  // owns these allocations)
  [[maybe_unused]] size_t size_a = matrix_m * matrix_k;
  [[maybe_unused]] size_t size_b = matrix_k * matrix_n;
  [[maybe_unused]] size_t size_c = matrix_m * matrix_n;

  using data_type_a = sycl::ext::oneapi::bfloat16;
  using data_type_b = sycl::ext::oneapi::bfloat16;
  using data_type_c = float;
  using data_type_acc = float;

  data_type_a *A = static_cast<data_type_a *>(_A);
  data_type_b *B = static_cast<data_type_b *>(_B);
  data_type_c *C = static_cast<data_type_c *>(_C);

  // Define the workgroup tile shape. These are tunable parameters; adjust
  // them to the input shape and target hardware for better performance.
  constexpr uint32_t wg_tile_m =
      (kslicing_type != kslicing_impl_t::local) ? 256 : 64;
  constexpr uint32_t wg_tile_n =
      (kslicing_type != kslicing_impl_t::local) ? 256 : 128;

  // Specify the k-slicing ratios: num_global_splitk splits K across
  // workgroups (partial results accumulated through global memory), while
  // num_local_splitk splits K within a workgroup (reduced through local
  // memory).
  constexpr uint32_t num_global_splitk =
      (kslicing_type == kslicing_impl_t::global) ? 2 : 1;
  constexpr uint32_t num_local_splitk =
      (kslicing_type == kslicing_impl_t::local) ? 2 : 1;
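  // For example, instantiating with kslicing_impl_t::global makes two
  // workgroups cooperate on each output tile, each computing a partial GEMM
  // over half of K; the partial results are then combined through Acc.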

  // Micro-kernel configuration
  using tune_option =
      dict_t<elem_v_t<tune_key::param_optimizer_type,
                      tune_key_value::param_optimizer_decision_tree>,
             elem_t_t<tune_key::data_type_acc, data_type_acc>,
             elem_v_t<tune_key::dispatch_policy,
                      tune_key_value::dispatch_policy_kslicing>,
             elem_v_t<tune_key::global_kslicing_ratio, num_global_splitk>,
             elem_v_t<tune_key::local_kslicing_ratio, num_local_splitk>,
             elem_t_t<tune_key::wg_tile_shape, shape<wg_tile_n, wg_tile_m>>>;
  using gemm_op_t = gpu::xetla::kernel::default_gemm_t<
      data_type_a, // input datatype for A
      mem_layout::row_major, // memory layout for A
      8, // leading dimension alignment for A, in units of elements
      data_type_b, // input datatype for B
      mem_layout::row_major, // memory layout for B
      8, // leading dimension alignment for B, in units of elements
      data_type_c, // output datatype for C
      mem_layout::row_major, // memory layout for C
      8, // leading dimension alignment for C, in units of elements
      data_type_acc, // accumulator data type for intermediate results
      gpu_arch::Xe, // GPU arch
      tune_option>;

  // Scratch-buffer sizes required by the global split-k path; the caller
  // must provide _Acc/_Cnt allocations at least this large.
  [[maybe_unused]] size_t size_acc =
      gemm_op_t::get_acc_buf_size(matrix_m, matrix_n);
  [[maybe_unused]] size_t size_cnt =
      gemm_op_t::get_cnt_buf_size(matrix_m, matrix_n);

  data_type_acc *Acc = static_cast<data_type_acc *>(_Acc);
  uint32_t *Cnt = static_cast<uint32_t *>(_Cnt);

  // set up gemm_universal arguments
  typename gemm_op_t::arguments_t gemm_arg(matrix_m, matrix_k, matrix_n, A,
                                           matrix_k, B, matrix_n, C, matrix_n,
                                           Acc, Cnt);

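  // get_nd_range derives the 3D dispatch shape from the problem size and tile
  // configuration; with global split-k the workgroup grid also accounts for
  // the num_global_splitk workgroups that cooperate on each output tile.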
  sycl::nd_range<3> nd_range = gemm_op_t::get_nd_range(gemm_arg);

  auto gpu_event = queue.submit([&](sycl::handler &cgh) {
    // GPU kernel
    cgh.parallel_for(nd_range, [=](sycl::nd_item<3> item) KERNEL_MAIN {
      // allocate shared local memory and named-barrier resources
      slm_barrier_init<gemm_op_t>();
      gemm_op_t gemm_op;
      gemm_op(item, gemm_arg);
    });
  });
  return gpu_event;
}
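
// ---------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of the benchmark API): launches a
// small GEMM with global k-slicing using device USM. The wrapper name
// `run_split_k_gemm_example` is hypothetical, and sizing Acc as m*n floats
// and Cnt as m*n counters is a simple upper-bound assumption; the exact
// requirements come from gemm_op_t::get_acc_buf_size/get_cnt_buf_size.
// ---------------------------------------------------------------------------
inline void run_split_k_gemm_example(sycl::queue &queue) {
  constexpr int m = 512, k = 512, n = 512;
  using bf16 = sycl::ext::oneapi::bfloat16;

  // Device-USM allocations for the inputs, output, and split-k scratch
  auto *A = sycl::malloc_device<bf16>(m * k, queue);
  auto *B = sycl::malloc_device<bf16>(k * n, queue);
  auto *C = sycl::malloc_device<float>(m * n, queue);
  auto *Acc = sycl::malloc_device<float>(m * n, queue);
  auto *Cnt = sycl::malloc_device<uint32_t>(m * n, queue);

  // ... initialize A and B with input data here ...

  sycl::event e = split_k_gemm_run<m, k, n, kslicing_impl_t::global>(
      A, B, C, Acc, Cnt, queue);
  e.wait();

  sycl::free(A, queue);
  sycl::free(B, queue);
  sycl::free(C, queue);
  sycl::free(Acc, queue);
  sycl::free(Cnt, queue);
}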

#endif // TRITONBENCHMARK_SPLIT_K_GEMM_H