4 | 4 |
5 | 5 | #include "ggml-zdnn/common.hpp" |
6 | 6 | #include "ggml-zdnn/mmf.hpp" |
| 7 | +#include "ggml-zdnn/utils.hpp" |
7 | 8 | #include "ggml.h" |
8 | 9 |
9 | 10 | #include <vector> |
10 | 11 | #include <memory> |
11 | 12 | #include <csignal> // raise(SIGTRAP) |
12 | 13 | #include <unistd.h> |
13 | 14 |
14 | | -inline zdnn_data_types ggml_zdnn_type_mapping(ggml_type type) { |
15 | | - switch (type) { |
16 | | - case GGML_TYPE_F32: |
17 | | - return FP32; |
18 | | - case GGML_TYPE_F16: |
19 | | - return FP16; |
20 | | - case GGML_TYPE_BF16: |
21 | | - return BFLOAT; |
22 | | - case GGML_TYPE_I8: |
23 | | - return INT8; |
24 | | - case GGML_TYPE_I32: |
25 | | - return INT32; |
26 | | - case GGML_TYPE_Q8_0: |
27 | | - return INT8; |
28 | | - default: |
29 | | - GGML_ABORT("%s: fatal: unable to determine zTensor data type", |
30 | | - __func__); |
31 | | - break; |
32 | | - } |
33 | | -} |
34 | | - |
35 | | -inline void ggml_zdnn_create_tensor(zdnn_tensor_desc & pre_tfm_desc, |
36 | | - zdnn_tensor_desc & tfm_desc, |
37 | | - zdnn_ztensor & ztensor, |
38 | | - const ggml_tensor * src, |
39 | | - const int64_t * ne, |
40 | | - const zdnn_data_layouts layout) { |
41 | | - zdnn_init_pre_transformed_desc( |
42 | | - layout, |
43 | | - ggml_zdnn_type_mapping(src->type), |
44 | | - &pre_tfm_desc, |
45 | | - ne[3], ne[2], ne[1], ne[0] |
46 | | - ); |
47 | | - |
48 | | - ZDNN_CHECK(zdnn_generate_transformed_desc(&pre_tfm_desc, &tfm_desc)); |
49 | | - ZDNN_CHECK(zdnn_init_ztensor_with_malloc(&pre_tfm_desc, &tfm_desc, &ztensor)); |
50 | | -} |
51 | | - |
52 | | -inline void ggml_zdnn_load_tensor(zdnn_ztensor & ztensor, |
53 | | - void * buffer) { |
54 | | - ZDNN_CHECK(zdnn_transform_ztensor(&ztensor, buffer)); |
55 | | -} |
56 | | - |
57 | | -inline void ggml_zdnn_init_tensor(ggml_backend_zdnn_buffer * buffer, const ggml_tensor * tensor) { |
58 | | - switch (tensor->op) { |
59 | | - case GGML_OP_MUL_MAT: |
60 | | - { |
61 | | - zdnn_init_pre_transformed_desc( |
62 | | - ZDNN_2D, |
63 | | - ggml_zdnn_type_mapping(tensor->type), |
64 | | - &buffer->pre_tfm_desc, |
65 | | - tensor->ne[1], tensor->ne[0] |
66 | | - ); |
67 | | - } break; |
68 | | - |
69 | | - default: |
70 | | - { |
71 | | - // For 4D tensors, GGML uses NCHW layout. However, because zDNN |
72 | | - // automatically transforms everything to NHWC, we will use it |
73 | | - // directly to avoid the performance penalty changing the |
74 | | - // layout and reshaping the tensor. |
75 | | - zdnn_init_pre_transformed_desc( |
76 | | - ZDNN_NHWC, |
77 | | - ggml_zdnn_type_mapping(tensor->type), |
78 | | - &buffer->pre_tfm_desc, |
79 | | - tensor->ne[3], tensor->ne[2], tensor->ne[1], tensor->ne[0] |
80 | | - ); |
81 | | - |
82 | | - // TODO: Consider adding a ggml check. |
83 | | - // TODO: If tensor = 4D, use ZDNN_NCHW by default. |
84 | | - // TODO: If tensor = 2D, use ZDNN_NHWC by default. |
85 | | - } break; |
86 | | - } |
87 | | - |
88 | | - ZDNN_CHECK(zdnn_generate_transformed_desc(&buffer->pre_tfm_desc, &buffer->tfm_desc)); |
89 | | - ZDNN_CHECK(zdnn_init_ztensor_with_malloc(&buffer->pre_tfm_desc, &buffer->tfm_desc, &buffer->ztensor)); |
90 | | -} |
| 15 | +static void ggml_zdnn_compute_forward_mul_mat( |
| 16 | + const ggml_backend_zdnn_context * ctx, |
| 17 | + ggml_tensor * dst) { |
91 | 18 |
92 | | -static void ggml_zdnn_mul_mat_dispatch(ggml_backend_zdnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { |
93 | | - // debug helpers |
94 | | - // GGML_LOG_INFO("%s: use_mul_mat_vec = %d\n", __func__, use_mul_mat_vec); |
95 | | - // GGML_LOG_INFO("%s: use_mul_mat_vec_q = %d\n", __func__, use_mul_mat_vec_q); |
96 | | - // GGML_LOG_INFO("%s: use_mul_mat_q = %d\n", __func__, use_mul_mat_q); |
97 | | - // GGML_LOG_INFO("%s: src0: %8d %8d %8d %8d\n", __func__, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); |
98 | | - // GGML_LOG_INFO("%s: %8d %8d %8d %8d\n", __func__, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]); |
99 | | - // GGML_LOG_INFO("%s: src1: %8d %8d %8d %8d\n", __func__, src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]); |
100 | | - // GGML_LOG_INFO("%s: %8d %8d %8d %8d\n", __func__, src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]); |
101 | | - // GGML_LOG_INFO("%s: src0 is contiguous %d, transposed %d, type = %s, name = %s\n", __func__, ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name); |
102 | | - // GGML_LOG_INFO("%s: src1 is contiguous %d, transposed %d, type = %s, name = %s\n", __func__, ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name); |
| 19 | + const ggml_tensor * src0 = dst->src[0]; // weights |
| 20 | + const ggml_tensor * src1 = dst->src[1]; // inputs |
103 | 21 |
| 22 | + // TODO: implement support for quantized types |
104 | 23 | ggml_zdnn_mul_mat_f(ctx, src0, src1, dst); |
105 | 24 | } |
106 | 25 |
107 | 26 | static bool ggml_zdnn_compute_forward(ggml_backend_zdnn_context * ctx, ggml_tensor * dst) { |
108 | 27 | switch (dst->op) { |
109 | 28 | case GGML_OP_MUL_MAT: |
110 | | - ggml_zdnn_mul_mat_dispatch(ctx, dst->src[0], dst->src[1], dst); |
111 | | - break; |
| 29 | + { |
| 30 | + ggml_zdnn_compute_forward_mul_mat(ctx, dst); |
| 31 | + } break; |
112 | 32 |
113 | 33 | default: |
114 | 34 | return false; |
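
The helpers deleted above (ggml_zdnn_type_mapping, ggml_zdnn_create_tensor, ggml_zdnn_load_tensor, ggml_zdnn_init_tensor) are presumably what the new ggml-zdnn/utils.hpp include now provides. A minimal caller-side sketch, assuming those helpers keep the signatures shown in the removed hunk; src names a hypothetical F32 ggml_tensor:

    // Hypothetical usage sketch, not part of this diff: wrap a GGML tensor in a
    // zDNN ztensor via the helpers assumed to now live in ggml-zdnn/utils.hpp.
    zdnn_tensor_desc pre_tfm_desc;
    zdnn_tensor_desc tfm_desc;
    zdnn_ztensor     ztensor;

    // ggml_zdnn_create_tensor forwards ne[3], ne[2], ne[1], ne[0] to
    // zdnn_init_pre_transformed_desc, so a 4D layout such as ZDNN_NHWC matches
    // that ordering (GGML stores ne fastest-varying dimension first).
    ggml_zdnn_create_tensor(pre_tfm_desc, tfm_desc, ztensor, src, src->ne, ZDNN_NHWC);

    // Transform (stickify) the raw GGML data buffer into zDNN's internal layout.
    ggml_zdnn_load_tensor(ztensor, src->data);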