
Commit 85b71fd

Author: Anoop Kapoor (committed)
@FIR-999 - Create SOFT_MAX for tsavorite-backend for GGML
1 parent 272b85c commit 85b71fd

File tree

3 files changed (+139 / -46 lines)


ggml-tsi-kernel

ggml/include/ggml-tsavorite.h

Lines changed: 4 additions & 1 deletion
@@ -140,6 +140,8 @@ enum ggml_tsavorite_kernel_type {
   GGML_TSAVORITE_KERNEL_TYPE_GEGLU_ERF,
   GGML_TSAVORITE_KERNEL_TYPE_GEGLU_QUICK,
 
+  GGML_TSAVORITE_KERNEL_TYPE_SOFT_MAX,
+
   GGML_TSAVORITE_KERNEL_TYPE_COUNT
 };
 
@@ -156,7 +158,7 @@ typedef struct tensor_log_ {
   uint32_t leaf2_len;
   uint32_t node_len;
   enum ggml_tsavorite_tensor_data_type data_type;
-  enum ggml_tsavorite_kernel_type kernel_type;
+  enum ggml_op kernel_type;
   uint64_t num_of_op;
   FILE *log_file;
   const ggml_tensor *tensor;
@@ -185,6 +187,7 @@ extern void _mlir_ciface_txe_sin_host(void *a, void *res);
 extern void _mlir_ciface_txe_sigmoid_host(void *a, void *res);
 extern void _mlir_ciface_txe_silu_host(void *a, void *res);
 extern void _mlir_ciface_txe_swiglu_host(void *a, void *b, void *res);
+extern void _mlir_ciface_txe_soft_max_host(void *a, void *b, void *res, void *buf);
 extern void _mlir_ciface_txe_rms_norm_host(void *a, void *res, void *buf);
 
 /*
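For context, the new extern follows the same host-stub convention as the other TXE entry points, just with one more argument: two input descriptors, one output descriptor, and a small scratch buffer. A minimal sketch of the expected call site, mirroring the usage added in ggml-tsavorite.cpp below (srcP0/srcP1/nodeP/buf are MemRefDescriptor<Rank> pointers prepared by the backend; the names here are illustrative):

    // one row of logits (src0), the matching mask row (src1), the output row (dst),
    // and a scratch buffer whose first float carries the softmax scale
    srcP0->shape[0] = ne00;
    srcP1->shape[0] = ne00;
    nodeP->shape[0] = ne00;
    ((float *) buf->data)[0] = scale;   // scale read from dst->op_params
    _mlir_ciface_txe_soft_max_host(srcP0, srcP1, nodeP, buf);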

ggml/src/ggml-tsavorite/ggml-tsavorite.cpp

Lines changed: 134 additions & 44 deletions
@@ -73,6 +73,7 @@ struct _txe_device_t {
 };
 
 struct _txe_compute_pipeline_state_t {
+  void (*_mlir_fptr_3_input[DATA_TYPE_MAX_INDEX])(void *, void *, void *, void *);
   void (*_mlir_fptr_2_input[DATA_TYPE_MAX_INDEX])(void *, void *, void *);
   void (*_mlir_fptr_1_input[DATA_TYPE_MAX_INDEX])(void *, void *);
   std::string kernel_name;
@@ -256,8 +257,8 @@ void ggml_tsi_log_tensor_data(tensor_log log_data) {
   fprintf(log_data.log_file, "\n\n");
   fprintf(log_data.log_file, "#############################################################\n");
   fprintf(log_data.log_file,
-          "Tensor Number %ld and Type %d \n leaf1 len %d, leaf2 len %d, Node len %d\n",
-          log_data.num_of_op, log_data.kernel_type, log_data.leaf1_len, log_data.leaf2_len,
+          "Tensor Number %ld and Type %s \n leaf1 len %d, leaf2 len %d, Node len %d\n",
+          log_data.num_of_op, ggml_op_name(log_data.kernel_type), log_data.leaf1_len, log_data.leaf2_len,
           log_data.node_len);
   fprintf(log_data.log_file, "############################################################\n");
   fprintf(log_data.log_file, "\n\n");
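The format-string change relies on ggml's ggml_op_name(), which returns the printable name of a ggml_op, so the per-tensor log header now names the op instead of printing a raw enum value. A tiny standalone sketch of the same call, assuming only the public ggml.h API:

    #include "ggml.h"
    #include <cstdio>

    int main() {
        // prints something like: Tensor Number 1 and Type SOFT_MAX
        std::printf("Tensor Number %ld and Type %s\n", 1L, ggml_op_name(GGML_OP_SOFT_MAX));
        return 0;
    }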
@@ -485,6 +486,13 @@ static txe_compute_pipeline_state_s tsi_kernel_setup(enum ggml_tsavorite_kernel_
       flag = true;
       break;
     }
+  case GGML_TSAVORITE_KERNEL_TYPE_SOFT_MAX:
+    {
+      kernel_pipeline->_mlir_fptr_3_input[DATA_TYPE_F32_INDEX] = &_mlir_ciface_txe_soft_max_host;
+      kernel_pipeline->kernel_name = "TXE_SOFTMAX";
+      flag = true;
+      break;
+    }
   default:
     break;
   }
@@ -634,6 +642,7 @@ static struct ggml_backend_tsavorite_context *ggml_tsavorite_init(ggml_backend_d
     GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SILU, true);
     GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_RMS_NORM, true);
     GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SWIGLU, true);
+    GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SOFT_MAX, true);
   }
 
   GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
@@ -746,6 +755,7 @@ static bool ggml_tsavorite_supports_op(const struct ggml_backend_tsavorite_devic
     case GGML_OP_SQR:
     case GGML_OP_SIN:
     case GGML_OP_RMS_NORM:
+    case GGML_OP_SOFT_MAX:
       break;
     case GGML_OP_GLU:
       {
@@ -927,6 +937,15 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
   enum ggml_tsavorite_input_tensors_count num_of_input_tensors;
   tensor_log log_data;
 
+  MemRefDescriptor<Rank>* buf = create_mlir_buf<Rank>(96);
+
+  if (!buf) {
+    GGML_TSAVORITE_LOG_ERROR("tsi_alloc failied for creating memory for buf \n");
+    return GGML_STATUS_ABORTED;
+  }
+  buf->offset = 0;
+  buf->data = buf->base = (void *)(buf+1);
+
   for (int i = 0; i < cgraph->n_nodes; i++) {
     int32_t kernel_sub_type=-1;
 #if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
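Hoisting the scratch allocation to the top of graph compute lets one 96-float buffer be shared by every node in the graph: the SOFT_MAX path writes the scale into element 0 (see the loop further down), while the RMS_NORM path keeps using the last 32 elements as before. A commented sketch of the assumed layout (create_mlir_buf<Rank> and MemRefDescriptor<Rank> are backend helpers not shown in this diff; the notes on elements 64..95 come from the RMS_NORM comments later in this file):

    // Assumed layout of the shared scratch buffer: 96 floats placed right after the descriptor header.
    //   val[0]        softmax scale, written per row before calling TXE_SOFTMAX
    //   val[64..95]   src0->ne[0], replicated, consumed by the RMS kernel
    //   everything else: kernel working space
    MemRefDescriptor<Rank> *buf = create_mlir_buf<Rank>(96);
    buf->offset = 0;
    buf->data = buf->base = (void *)(buf + 1);   // payload follows the header
    float *val = (float *) buf->data;
    val[0] = scale;                              // SOFT_MAX path
    for (int i = 64; i <= 95; ++i)
        val[i] = (float) src0->ne[0];            // RMS_NORM path (per the comment below)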
@@ -982,6 +1001,10 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
       kernel_type = GGML_TSAVORITE_KERNEL_TYPE_RMS_NORM;
       num_of_input_tensors = TSAVORITE_UNARY_INPUT_TENSORS;
       break;
+    case GGML_OP_SOFT_MAX:
+      kernel_type = GGML_TSAVORITE_KERNEL_TYPE_SOFT_MAX;
+      num_of_input_tensors = TSAVORITE_TWO_INPUT_TENSORS;
+      break;
     case GGML_OP_GLU:
       kernel_type = tsi_glu_kernel_type(node);
       if (!src1)
@@ -1023,7 +1046,8 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
     }
 
     if (!ctx->kernels[kernel_type].pipeline ||
-        (!ctx->kernels[kernel_type].pipeline->_mlir_fptr_2_input[kernel_sub_type] &&
+        (!ctx->kernels[kernel_type].pipeline->_mlir_fptr_3_input[kernel_sub_type] &&
+         !ctx->kernels[kernel_type].pipeline->_mlir_fptr_2_input[kernel_sub_type] &&
          !ctx->kernels[kernel_type].pipeline->_mlir_fptr_1_input[kernel_sub_type])) {
       GGML_TSAVORITE_LOG_ERROR("Kernel Type %d, not supported \n", kernel_type);
       return GGML_STATUS_ABORTED;
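With a third function-pointer table in the pipeline, "supported" now means "any of the three arities is populated". A simplified dispatch sketch of what this check guards, using the names from the surrounding diff (illustrative only; the real call sites appear later in this function):

    const auto & pipeline = ctx->kernels[kernel_type].pipeline;
    if (pipeline->_mlir_fptr_3_input[kernel_sub_type]) {
        // e.g. TXE_SOFTMAX: two input rows, one output row, plus the scratch buffer
        pipeline->_mlir_fptr_3_input[kernel_sub_type](srcP0, srcP1, nodeP, buf);
    } else if (pipeline->_mlir_fptr_2_input[kernel_sub_type]) {
        // binary ops use (srcP0, srcP1, nodeP); RMS_NORM reuses this slot as (srcP0, nodeP, buf)
        pipeline->_mlir_fptr_2_input[kernel_sub_type](srcP0, srcP1, nodeP);
    } else {
        pipeline->_mlir_fptr_1_input[kernel_sub_type](srcP0, nodeP);
    }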
@@ -1091,7 +1115,7 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
       log_data.node_len = num_elem_node;
       log_data.log_file = tsi_op_log_file;
       log_data.num_of_op = num_of_op;
-      log_data.kernel_type = kernel_type;
+      log_data.kernel_type = node->op;
 
       log_data.data_type = GGML_TSAVORITE_TENSOR_HEADER;
       ggml_tsi_log_tensor_data(log_data);
@@ -1108,36 +1132,108 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
       ggml_tensor *dst = node;
       const int nr = ggml_nrows(src0);
 
-      GGML_TENSOR_BINARY_OP_LOCALS
-
-      for (int ir = 0; ir < nr; ++ir) {
-        const int64_t i03 = ir / (ne02 * ne01);
-        const int64_t i02 = (ir - i03 * ne02 * ne01) / ne01;
-        const int64_t i01 = (ir - i03 * ne02 * ne01 - i02 * ne01);
-
-        const int64_t i13 = i03 % ne13;
-        const int64_t i12 = i02 % ne12;
-        const int64_t i11 = i01 % ne11;
-        const int64_t nr0 = ne00 / ne10;
-
-        float *dst_ptr = (float *)((char *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1);
-        float *src0_ptr = (float *)((char *)src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01);
-        float *src1_ptr = (float *)((char *)src1->data + i13 * nb13 + i12 * nb12 + i11 * nb11);
-
-        // The following below code operates exclusively on Rank 0
-        // (i.e., the first dimension) for all blob-related processing.
-
-        for (int64_t r = 0; r < nr0; ++r) {
-          srcP0->shape[0] = ne10;
-          srcP1->shape[0] = ne10;
-          nodeP->shape[0] = ne10;
-          srcP1->data = srcP1->base = (void *)(src1_ptr);
-          srcP0->data = srcP0->base = (void *)(src0_ptr + r * ne10);
-          nodeP->data = nodeP->base = (void *)(dst_ptr + r * ne10);
-          // kernel call
-          ctx->kernels[kernel_type].pipeline->_mlir_fptr_2_input[kernel_sub_type](srcP0, srcP1, nodeP);
-          ++device->stats.op_run_count[kernel_type].num_of_kernel_call;
-        }
+      /* The current SoftMax implementation does not consider the src2 input,
+       * as none of the popular models we currently use require it.
+       * However, for future enhancements to SOFT_MAX, we plan to support src2
+       * for sinking-based maximization. In that case, src2 will be used to
+       * recalculate the maximum value.
+       */
+      if( kernel_type == GGML_TSAVORITE_KERNEL_TYPE_SOFT_MAX) {
+        const ggml_tensor * src2 = dst->src[2];
+        float scale = 1.0f;
+        float max_bias = 0.0f;
+
+        memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
+        memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
+
+        GGML_TENSOR_UNARY_OP_LOCALS
+
+        const int64_t nb11 = src1 ? src1->nb[1] : 1;
+        const int64_t nb12 = src1 ? src1->nb[2] : 1;
+        const int64_t nb13 = src1 ? src1->nb[3] : 1;
+
+        const int64_t ne12 = src1 ? src1->ne[2] : 1;
+        const int64_t ne13 = src1 ? src1->ne[3] : 1;
+
+        // TODO: is this supposed to be ceil instead of floor?
+        // https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370
+        const uint32_t n_head = ne02;
+        const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
+
+        const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
+        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
+        const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
+
+        // sinks
+        const float * sk = src2 ? (float *)((char *) src2->data) : nullptr;
+        //here src2 is NULL for particular model hence u can ignore this for now
+        if (src2) {
+          printf("\n ANOOP src2 is not null\n");
+        }
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+          for (int64_t i02 = 0; i02 < ne02; i02++) {
+            for (int64_t i01 = 0; i01 < ne01; i01 += 1) {
+              const int64_t i11 = i01;
+              const int64_t i12 = i02%ne12;
+              const int64_t i13 = i03%ne13;
+
+              // ALiBi
+              const uint32_t h = i02; // head
+              const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f;
+
+              float * sp = (float *)((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
+              float * dp = (float *)((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
+
+              // broadcast the mask across rows
+              ggml_fp16_t * mp_f16 = src1 ? (ggml_fp16_t *)((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13) : NULL;
+              float * mp_f32 = src1 ? (float *)((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13) : NULL;
+
+              srcP0->shape[0] = ne00;
+              srcP1->shape[0] = ne00;
+              nodeP->shape[0] = ne00;
+              srcP1->data = srcP1->base = (void *)(mp_f32);
+              srcP0->data = srcP0->base = (void *)(sp);
+              nodeP->data = nodeP->base = (void *)(dp);
+
+              float *val = (float *)buf->data;
+              val[0] = scale;
+              ctx->kernels[kernel_type].pipeline->_mlir_fptr_3_input[kernel_sub_type](srcP0, srcP1, nodeP, buf);
+            }
+          }
+        }
+      } else {
+        GGML_TENSOR_BINARY_OP_LOCALS
+
+        for (int ir = 0; ir < nr; ++ir) {
+          const int64_t i03 = ir / (ne02 * ne01);
+          const int64_t i02 = (ir - i03 * ne02 * ne01) / ne01;
+          const int64_t i01 = (ir - i03 * ne02 * ne01 - i02 * ne01);
+
+          const int64_t i13 = i03 % ne13;
+          const int64_t i12 = i02 % ne12;
+          const int64_t i11 = i01 % ne11;
+          const int64_t nr0 = ne00 / ne10;
+
+          float *dst_ptr = (float *)((char *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1);
+          float *src0_ptr = (float *)((char *)src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01);
+          float *src1_ptr = (float *)((char *)src1->data + i13 * nb13 + i12 * nb12 + i11 * nb11);
+
+          // The following below code operates exclusively on Rank 0
+          // (i.e., the first dimension) for all blob-related processing.
+
+          for (int64_t r = 0; r < nr0; ++r) {
+            srcP0->shape[0] = ne10;
+            srcP1->shape[0] = ne10;
+            nodeP->shape[0] = ne10;
+            srcP1->data = srcP1->base = (void *)(src1_ptr);
+            srcP0->data = srcP0->base = (void *)(src0_ptr + r * ne10);
+            nodeP->data = nodeP->base = (void *)(dst_ptr + r * ne10);
+            // kernel call
+            ctx->kernels[kernel_type].pipeline->_mlir_fptr_2_input[kernel_sub_type](srcP0, srcP1, nodeP);
+            ++device->stats.op_run_count[kernel_type].num_of_kernel_call;
+          }
+        }
       }
 
       if (ggml_tsavorite_log_type_val == GGML_TSAVORITE_LOG_DEBUG) {
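The actual row computation lives in the TXE_SOFTMAX kernel on the MLIR/TXE side, which is not part of this diff. Based on what the host passes in (srcP0 = one row of raw logits, srcP1 = the matching mask row, buf with val[0] = scale) and on ggml's CPU soft_max semantics, the kernel is presumably expected to compute dst = softmax(scale * src0 + mask) per row; the ALiBi slope computed above is not forwarded to the kernel in this version, so the sketch below assumes slope == 1. softmax_row_ref is a hypothetical reference helper, not code from this backend:

    #include <cmath>
    #include <cstdint>

    // Reference per-row soft-max: dst = softmax(scale * x + mask), mask optional.
    static void softmax_row_ref(const float *x, const float *mask, float *dst,
                                int64_t n, float scale) {
        float maxv = -INFINITY;
        for (int64_t i = 0; i < n; ++i) {
            const float v = scale * x[i] + (mask ? mask[i] : 0.0f);
            dst[i] = v;                          // stash the scaled, masked logits
            if (v > maxv) maxv = v;
        }
        float sum = 0.0f;
        for (int64_t i = 0; i < n; ++i) {
            dst[i] = std::exp(dst[i] - maxv);    // subtract the max for numerical stability
            sum += dst[i];
        }
        for (int64_t i = 0; i < n; ++i) {
            dst[i] /= sum;                       // normalize so the row sums to 1
        }
    }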
@@ -1184,7 +1280,7 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
       log_data.node_len = num_elem_src0;
       log_data.log_file = tsi_op_log_file;
       log_data.num_of_op = num_of_op;
-      log_data.kernel_type = kernel_type;
+      log_data.kernel_type = node->op;
 
       log_data.data_type = GGML_TSAVORITE_TENSOR_HEADER;
       ggml_tsi_log_tensor_data(log_data);
@@ -1214,15 +1310,6 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
       // Although only 32 elements are strictly necessary, reducing this would require changes to the RMS kernel.
       // The remaining 32 elements are used to store src0->ne[0], replicated across each of the last 32 entries.
 
-      MemRefDescriptor<Rank>* buf = create_mlir_buf<Rank>(96);
-
-      if (!buf) {
-        GGML_TSAVORITE_LOG_ERROR("tsi_alloc failied for creating memory for buf \n");
-        return GGML_STATUS_ABORTED;
-      }
-      buf->offset = 0;
-      buf->data = buf->base = (void *)(buf+1);
-
       float *val = (float *)buf->data;
       int i;
       for(i=64; i <= 95; ++i)
@@ -1250,6 +1337,7 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
        }
 
        ctx->kernels[kernel_type].pipeline->_mlir_fptr_2_input[kernel_sub_type](srcP0, nodeP, buf);
+
       }
       else {
         // kernel call
@@ -1460,6 +1548,7 @@ static void ggml_backend_tsavorite_log_allocated_size(txe_device_s device, size_
 static ggml_backend_buffer_t
 ggml_backend_tsavorite_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
   GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+  tsi_log_setup();
   struct ggml_backend_tsavorite_buffer_context *ctx =
       (struct ggml_backend_tsavorite_buffer_context *)calloc(
           1, sizeof(struct ggml_backend_tsavorite_buffer_context));
@@ -1984,6 +2073,7 @@ static bool ggml_backend_tsavorite_device_offload_op(ggml_backend_dev_t dev,
     case GGML_OP_SQR:
     case GGML_OP_SIN:
     case GGML_OP_RMS_NORM:
+    case GGML_OP_SOFT_MAX:
       break;
     case GGML_OP_GLU:
       {
