#include "set-rows.cuh"

typedef void (*set_rows_kernel_t)(const char * src, char * dst);

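// Per-element conversion: the primary template is a deliberate no-op, so only
// the specializations below (f32 -> f16 / bf16 / f32) actually write data.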
template<typename src_t, typename dst_t>
__device__ void set_rows_1(const src_t * src_f, dst_t * dst_f) {}

template<>
__device__ __forceinline__ void set_rows_1<float, half>(const float * src_f, half * dst_h) {
    *dst_h = __float2half(*src_f);
}

template<>
__device__ __forceinline__ void set_rows_1<float, nv_bfloat16>(const float * src_f, nv_bfloat16 * dst_b) {
    *dst_b = __float2bfloat16(*src_f);
}

template<>
__device__ __forceinline__ void set_rows_1<float, float>(const float * src_f, float * dst_f) {
    *dst_f = *src_f;
}

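// One thread per element of src0. The flat index i is unraveled as
// i = ((i03*ne02 + i02)*ne01 + i01)*ne00 + i00; for example, with ne00=4,
// ne01=2 and ne02=ne03=1, thread i=6 maps to (i03, i02, i01, i00) = (0, 0, 1, 2).
// src1 holds the destination row index for each source row and is broadcast
// over the two outer dimensions. All s* strides are in elements, not bytes.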
template<typename src_t, typename dst_t>
static __global__ void k_set_rows(
    const src_t * __restrict__ src0, const int64_t * __restrict__ src1, dst_t * __restrict__ dst,
    const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
    const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
    const int64_t s01, const int64_t s02, const int64_t s03,
    const int64_t s10, const int64_t s11, const int64_t s12,
    const int64_t s1, const int64_t s2, const int64_t s3) {

    const int64_t i        = int64_t(blockDim.x) * blockIdx.x + threadIdx.x;
    const int64_t ne_total = ne00 * ne01 * ne02 * ne03;

    if (i >= ne_total) {
        return;
    }

    // unravel the flat thread index into the four src0 coordinates
    const int64_t i03 = i / (ne00 * ne01 * ne02);
    const int64_t i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01);
    const int64_t i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne00 * ne01) / ne00;
    const int64_t i00 = i - i03 * ne00 * ne01 * ne02 - i02 * ne00 * ne01 - i01 * ne00;

    // broadcast src1 over the outer dimensions of src0
    const int64_t i12 = i03 % ne12;
    const int64_t i11 = i02 % ne11;
    const int64_t i10 = i01;

    const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12);

    const src_t * src0_row    = src0 + i01*s01 + i02*s02 + i03*s03;
    dst_t       * dst_row_ptr = dst + dst_row*s1 + i02*s2 + i03*s3;

    const src_t * src_elem = src0_row + i00;
    dst_t       * dst_elem = dst_row_ptr + i00;
    set_rows_1(src_elem, dst_elem);

    // only ne11 and ne12 are needed for the broadcast above
    GGML_UNUSED(ne10);
    GGML_UNUSED(ne13);
}

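// Host-side launcher: converts the ggml byte strides (nb*) into element
// strides for the kernel and launches one thread per src0 element.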
template<typename src_t, typename dst_t>
static void set_rows_cuda(
    const src_t * src0_d, const int64_t * src1_d, dst_t * dst_d,
    const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
    const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
    const size_t nb01, const size_t nb02, const size_t nb03,
    const size_t nb10, const size_t nb11, const size_t nb12,
    const size_t nb1, const size_t nb2, const size_t nb3,
    cudaStream_t stream) {

    const int64_t ne_total = ne00 * ne01 * ne02 * ne03;
    const int num_blocks = (ne_total + CUDA_SET_ROWS_BLOCK_SIZE - 1) / CUDA_SET_ROWS_BLOCK_SIZE;
    const dim3 block_size(CUDA_SET_ROWS_BLOCK_SIZE);
    const dim3 grid_size(num_blocks);

    // convert byte strides to element strides for each tensor's element type
    const int64_t s01 = nb01/sizeof(src_t);
    const int64_t s02 = nb02/sizeof(src_t);
    const int64_t s03 = nb03/sizeof(src_t);
    const int64_t s10 = nb10/sizeof(int64_t);
    const int64_t s11 = nb11/sizeof(int64_t);
    const int64_t s12 = nb12/sizeof(int64_t);
    const int64_t s1  = nb1/sizeof(dst_t);
    const int64_t s2  = nb2/sizeof(dst_t);
    const int64_t s3  = nb3/sizeof(dst_t);

    if (ne_total > 0) {
        k_set_rows<<<grid_size, block_size, 0, stream>>>(
            src0_d, src1_d, dst_d,
            ne00, ne01, ne02, ne03,
            ne10, ne11, ne12, ne13,
            s01, s02, s03,
            s10, s11, s12,
            s1, s2, s3);
    }
}

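// Backend entry point for GGML_OP_SET_ROWS: src0 holds the rows to write
// (f32), src1 the destination row indices (i64); dispatch on dst->type.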
void ggml_cuda_op_set_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(src1->type == GGML_TYPE_I64);

    GGML_TENSOR_BINARY_OP_LOCALS

    const float   * src0_d = (const float *)src0->data;
    const int64_t * src1_d = (const int64_t *)src1->data;

    cudaStream_t stream = ctx.stream();

    if (dst->type == GGML_TYPE_F32) {
        set_rows_cuda(
            src0_d, src1_d, (float *)dst->data,
            ne00, ne01, ne02, ne03,
            ne10, ne11, ne12, ne13,
            nb01, nb02, nb03,
            nb10, nb11, nb12,
            nb1, nb2, nb3,
            stream
        );
    } else if (dst->type == GGML_TYPE_F16) {
        set_rows_cuda(
            src0_d, src1_d, (half *)dst->data,
            ne00, ne01, ne02, ne03,
            ne10, ne11, ne12, ne13,
            nb01, nb02, nb03,
            nb10, nb11, nb12,
            nb1, nb2, nb3,
            stream
        );
    } else if (dst->type == GGML_TYPE_BF16) {
        set_rows_cuda(
            src0_d, src1_d, (nv_bfloat16 *)dst->data,
            ne00, ne01, ne02, ne03,
            ne10, ne11, ne12, ne13,
            nb01, nb02, nb03,
            nb10, nb11, nb12,
            nb1, nb2, nb3,
            stream
        );
    } else {
        GGML_ABORT("unsupported type");
    }
}
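
/*
 * Minimal usage sketch (hypothetical values, not part of the backend API;
 * set_rows_cuda is static to this file, so this is illustrative only):
 * scatter 2 contiguous f32 rows of length 4 into rows 3 and 1 of a 4-row
 * f16 destination, with all buffers already resident on the device.
 *
 *   // float   src0_d[2*4];        source rows        (ne00=4, ne01=2)
 *   // int64_t src1_d[2] = {3, 1}; destination row indices
 *   // half    dst_d[4*4];         destination tensor (4 rows of 4)
 *   set_rows_cuda(src0_d, src1_d, dst_d,
 *       4, 2, 1, 1,                                            // ne00..ne03
 *       2, 1, 1, 1,                                            // ne10..ne13
 *       4*sizeof(float), 8*sizeof(float), 8*sizeof(float),     // nb01..nb03
 *       sizeof(int64_t), 2*sizeof(int64_t), 2*sizeof(int64_t), // nb10..nb12
 *       4*sizeof(half), 16*sizeof(half), 16*sizeof(half),      // nb1..nb3
 *       stream);
 */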