@@ -73,6 +73,7 @@ struct _txe_device_t {
 };
 
 struct _txe_compute_pipeline_state_t {
+    void (*_mlir_fptr_3_input[DATA_TYPE_MAX_INDEX])(void *, void *, void *, void *);
     void (*_mlir_fptr_2_input[DATA_TYPE_MAX_INDEX])(void *, void *, void *);
     void (*_mlir_fptr_1_input[DATA_TYPE_MAX_INDEX])(void *, void *);
     std::string kernel_name;
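Note on the added table: `_mlir_fptr_3_input` parallels the existing one- and two-input tables and is indexed by the same `DATA_TYPE_*_INDEX` values; the extra `void *` carries a fourth operand (the SOFT_MAX path later in this diff passes a small parameter buffer through it). A minimal dispatch sketch, assuming the `txe_compute_pipeline_state_s` handle used elsewhere in this file behaves like a pointer and using a hypothetical `run_3_input` helper:

    // Sketch only: look up the 3-input entry for F32 and invoke it if registered.
    static inline bool run_3_input(txe_compute_pipeline_state_s pipeline,
                                   void *srcP0, void *srcP1, void *nodeP, void *params) {
        void (*fn)(void *, void *, void *, void *) =
            pipeline->_mlir_fptr_3_input[DATA_TYPE_F32_INDEX];
        if (!fn) {
            return false;   // no kernel registered for this data type
        }
        fn(srcP0, srcP1, nodeP, params);
        return true;
    }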
@@ -256,8 +257,8 @@ void ggml_tsi_log_tensor_data(tensor_log log_data) {
     fprintf(log_data.log_file, "\n\n");
     fprintf(log_data.log_file, "#############################################################\n");
     fprintf(log_data.log_file,
-            "Tensor Number %ld and Type %d \n leaf1 len %d, leaf2 len %d, Node len %d\n",
-            log_data.num_of_op, log_data.kernel_type, log_data.leaf1_len, log_data.leaf2_len,
+            "Tensor Number %ld and Type %s \n leaf1 len %d, leaf2 len %d, Node len %d\n",
+            log_data.num_of_op, ggml_op_name(log_data.kernel_type), log_data.leaf1_len, log_data.leaf2_len,
             log_data.node_len);
     fprintf(log_data.log_file, "############################################################\n");
     fprintf(log_data.log_file, "\n\n");
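For illustration only: together with the later `log_data.kernel_type = node->op` assignments in this diff, the dump header now prints the op name instead of a raw enum value, so a header line would read roughly (hypothetical values):

    Tensor Number 12 and Type SOFT_MAX 
     leaf1 len 32, leaf2 len 32, Node len 32

assuming `ggml_op_name()` maps the stored `ggml_op` value to its string form.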
@@ -485,6 +486,13 @@ static txe_compute_pipeline_state_s tsi_kernel_setup(enum ggml_tsavorite_kernel_
             flag = true;
             break;
         }
+        case GGML_TSAVORITE_KERNEL_TYPE_SOFT_MAX:
+        {
+            kernel_pipeline->_mlir_fptr_3_input[DATA_TYPE_F32_INDEX] = &_mlir_ciface_txe_soft_max_host;
+            kernel_pipeline->kernel_name = "TXE_SOFTMAX";
+            flag = true;
+            break;
+        }
         default:
             break;
     }
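The handler registered here, `_mlir_ciface_txe_soft_max_host`, is assumed to match the four-pointer signature of the `_mlir_fptr_3_input` table; its real declaration lives in the Tsavorite headers and likely takes typed MemRef descriptors rather than `void *`. A sketch of the assumed shape:

    // Assumption: host-side stub exposed for the TXE softmax kernel.
    // Argument names are illustrative; see the call site later in this diff
    // (srcP0 = input logits row, srcP1 = mask row or NULL, nodeP = output row,
    //  params = small MemRef whose first float holds the softmax scale).
    extern "C" void _mlir_ciface_txe_soft_max_host(void *srcP0,
                                                   void *srcP1,
                                                   void *nodeP,
                                                   void *params);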
@@ -634,6 +642,7 @@ static struct ggml_backend_tsavorite_context *ggml_tsavorite_init(ggml_backend_d
         GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SILU, true);
         GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_RMS_NORM, true);
         GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SWIGLU, true);
+        GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SOFT_MAX, true);
     }
 
     GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
@@ -746,6 +755,7 @@ static bool ggml_tsavorite_supports_op(const struct ggml_backend_tsavorite_devic
         case GGML_OP_SQR:
         case GGML_OP_SIN:
         case GGML_OP_RMS_NORM:
+        case GGML_OP_SOFT_MAX:
             break;
         case GGML_OP_GLU:
         {
@@ -927,6 +937,15 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
     enum ggml_tsavorite_input_tensors_count num_of_input_tensors;
     tensor_log log_data;
 
+    MemRefDescriptor<Rank>* buf = create_mlir_buf<Rank>(96);
+
+    if (!buf) {
+        GGML_TSAVORITE_LOG_ERROR("tsi_alloc failed for creating memory for buf \n");
+        return GGML_STATUS_ABORTED;
+    }
+    buf->offset = 0;
+    buf->data = buf->base = (void *)(buf+1);
+
     for (int i = 0; i < cgraph->n_nodes; i++) {
         int32_t kernel_sub_type = -1;
 #if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
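The buffer creation hoisted here (out of the RMS_NORM-only path further down) is now shared by both the RMS_NORM and the new SOFT_MAX branches. A sketch of the assumed layout behind `create_mlir_buf<Rank>(96)`, inferred from how `base`, `data`, `offset`, and `shape` are used in this file; the real template is defined elsewhere in the backend and may differ:

    #include <cstdint>

    // Assumption: descriptor header followed in the same allocation by the payload,
    // so that `buf->data = buf->base = (void *)(buf + 1)` points at 96 usable floats.
    template <int N>
    struct MemRefDescriptorSketch {
        void    *base;       // start of the payload
        void    *data;       // pointer the kernel actually reads
        int64_t  offset;     // element offset into data
        int64_t  shape[N];   // per-dimension extents (shape[0] is set per row below)
        int64_t  stride[N];  // per-dimension strides (MLIR memref convention)
    };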
@@ -982,6 +1001,10 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
             kernel_type = GGML_TSAVORITE_KERNEL_TYPE_RMS_NORM;
             num_of_input_tensors = TSAVORITE_UNARY_INPUT_TENSORS;
             break;
+        case GGML_OP_SOFT_MAX:
+            kernel_type = GGML_TSAVORITE_KERNEL_TYPE_SOFT_MAX;
+            num_of_input_tensors = TSAVORITE_TWO_INPUT_TENSORS;
+            break;
         case GGML_OP_GLU:
             kernel_type = tsi_glu_kernel_type(node);
             if (!src1)
@@ -1023,7 +1046,8 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
         }
 
         if (!ctx->kernels[kernel_type].pipeline ||
-            (!ctx->kernels[kernel_type].pipeline->_mlir_fptr_2_input[kernel_sub_type] &&
+            (!ctx->kernels[kernel_type].pipeline->_mlir_fptr_3_input[kernel_sub_type] &&
+             !ctx->kernels[kernel_type].pipeline->_mlir_fptr_2_input[kernel_sub_type] &&
              !ctx->kernels[kernel_type].pipeline->_mlir_fptr_1_input[kernel_sub_type])) {
             GGML_TSAVORITE_LOG_ERROR("Kernel Type %d, not supported \n", kernel_type);
             return GGML_STATUS_ABORTED;
@@ -1091,7 +1115,7 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
             log_data.node_len = num_elem_node;
             log_data.log_file = tsi_op_log_file;
             log_data.num_of_op = num_of_op;
-            log_data.kernel_type = kernel_type;
+            log_data.kernel_type = node->op;
 
             log_data.data_type = GGML_TSAVORITE_TENSOR_HEADER;
             ggml_tsi_log_tensor_data(log_data);
@@ -1108,36 +1132,108 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
         ggml_tensor *dst = node;
         const int nr = ggml_nrows(src0);
 
-        GGML_TENSOR_BINARY_OP_LOCALS
-
-        for (int ir = 0; ir < nr; ++ir) {
-            const int64_t i03 = ir / (ne02 * ne01);
-            const int64_t i02 = (ir - i03 * ne02 * ne01) / ne01;
-            const int64_t i01 = (ir - i03 * ne02 * ne01 - i02 * ne01);
-
-            const int64_t i13 = i03 % ne13;
-            const int64_t i12 = i02 % ne12;
-            const int64_t i11 = i01 % ne11;
-            const int64_t nr0 = ne00 / ne10;
-
-            float *dst_ptr = (float *)((char *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1);
-            float *src0_ptr = (float *)((char *)src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01);
-            float *src1_ptr = (float *)((char *)src1->data + i13 * nb13 + i12 * nb12 + i11 * nb11);
-
-            // The code below operates exclusively on Rank 0
-            // (i.e., the first dimension) for all blob-related processing.
-
-            for (int64_t r = 0; r < nr0; ++r) {
-                srcP0->shape[0] = ne10;
-                srcP1->shape[0] = ne10;
-                nodeP->shape[0] = ne10;
-                srcP1->data = srcP1->base = (void *)(src1_ptr);
-                srcP0->data = srcP0->base = (void *)(src0_ptr + r * ne10);
-                nodeP->data = nodeP->base = (void *)(dst_ptr + r * ne10);
-                // kernel call
-                ctx->kernels[kernel_type].pipeline->_mlir_fptr_2_input[kernel_sub_type](srcP0, srcP1, nodeP);
-                ++device->stats.op_run_count[kernel_type].num_of_kernel_call;
-            }
+        /* The current SoftMax implementation does not consider the src2 input,
+         * as none of the popular models we currently use require it.
+         * However, for future enhancements to SOFT_MAX, we plan to support src2
+         * for sink-based maximization. In that case, src2 will be used to
+         * recalculate the maximum value.
+         */
+        if (kernel_type == GGML_TSAVORITE_KERNEL_TYPE_SOFT_MAX) {
+            const ggml_tensor *src2 = dst->src[2];
+            float scale    = 1.0f;
+            float max_bias = 0.0f;
+
+            memcpy(&scale,    (float *) dst->op_params + 0, sizeof(float));
+            memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
+
+            GGML_TENSOR_UNARY_OP_LOCALS
+
+            const int64_t nb11 = src1 ? src1->nb[1] : 1;
+            const int64_t nb12 = src1 ? src1->nb[2] : 1;
+            const int64_t nb13 = src1 ? src1->nb[3] : 1;
+
+            const int64_t ne12 = src1 ? src1->ne[2] : 1;
+            const int64_t ne13 = src1 ? src1->ne[3] : 1;
+
+            // TODO: is this supposed to be ceil instead of floor?
+            //   https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370
+            const uint32_t n_head      = ne02;
+            const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
+
+            const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
+            const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
+            const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
+
+            // sinks
+            const float *sk = src2 ? (float *)((char *) src2->data) : nullptr;
+            // src2 is NULL for the models currently exercised, so this branch can be ignored for now
+            if (src2) {
+                printf("\nANOOP src2 is not null\n");
+            }
+            for (int64_t i03 = 0; i03 < ne03; i03++) {
+                for (int64_t i02 = 0; i02 < ne02; i02++) {
+                    for (int64_t i01 = 0; i01 < ne01; i01 += 1) {
+                        const int64_t i11 = i01;
+                        const int64_t i12 = i02 % ne12;
+                        const int64_t i13 = i03 % ne13;
+
+                        // ALiBi
+                        const uint32_t h = i02; // head
+                        const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f;
+
+                        float *sp = (float *)((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
+                        float *dp = (float *)((char *) dst->data  + i01*nb1  + i02*nb2  + i03*nb3);
+
+                        // broadcast the mask across rows
+                        ggml_fp16_t *mp_f16 = src1 ? (ggml_fp16_t *)((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13) : NULL;
+                        float       *mp_f32 = src1 ? (float       *)((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13) : NULL;
+
+                        srcP0->shape[0] = ne00;
+                        srcP1->shape[0] = ne00;
+                        nodeP->shape[0] = ne00;
+                        srcP1->data = srcP1->base = (void *)(mp_f32);
+                        srcP0->data = srcP0->base = (void *)(sp);
+                        nodeP->data = nodeP->base = (void *)(dp);
+
+                        float *val = (float *)buf->data;
+                        val[0] = scale;
+                        ctx->kernels[kernel_type].pipeline->_mlir_fptr_3_input[kernel_sub_type](srcP0, srcP1, nodeP, buf);
+                    }
+                }
+            }
+        } else {
+            GGML_TENSOR_BINARY_OP_LOCALS
+
+            for (int ir = 0; ir < nr; ++ir) {
+                const int64_t i03 = ir / (ne02 * ne01);
+                const int64_t i02 = (ir - i03 * ne02 * ne01) / ne01;
+                const int64_t i01 = (ir - i03 * ne02 * ne01 - i02 * ne01);
+
+                const int64_t i13 = i03 % ne13;
+                const int64_t i12 = i02 % ne12;
+                const int64_t i11 = i01 % ne11;
+                const int64_t nr0 = ne00 / ne10;
+
+                float *dst_ptr = (float *)((char *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1);
+                float *src0_ptr = (float *)((char *)src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01);
+                float *src1_ptr = (float *)((char *)src1->data + i13 * nb13 + i12 * nb12 + i11 * nb11);
+
+                // The code below operates exclusively on Rank 0
+                // (i.e., the first dimension) for all blob-related processing.
+
+                for (int64_t r = 0; r < nr0; ++r) {
+                    srcP0->shape[0] = ne10;
+                    srcP1->shape[0] = ne10;
+                    nodeP->shape[0] = ne10;
+                    srcP1->data = srcP1->base = (void *)(src1_ptr);
+                    srcP0->data = srcP0->base = (void *)(src0_ptr + r * ne10);
+                    nodeP->data = nodeP->base = (void *)(dst_ptr + r * ne10);
+                    // kernel call
+                    ctx->kernels[kernel_type].pipeline->_mlir_fptr_2_input[kernel_sub_type](srcP0, srcP1, nodeP);
+                    ++device->stats.op_run_count[kernel_type].num_of_kernel_call;
+                }
+            }
         }
 
         if (ggml_tsavorite_log_type_val == GGML_TSAVORITE_LOG_DEBUG) {
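For reference, the per-row math the TXE_SOFTMAX kernel is expected to reproduce, mirroring ggml's CPU soft_max up to the features exercised in the host loop above (scaling plus an optional f32 mask; the ALiBi `slope` and the `src2` sinks are computed or noted there but not forwarded to the device; only `scale` travels in `buf`). This is a comparison sketch, not the TXE implementation:

    #include <cmath>
    #include <cstdint>

    // y[i] = softmax(x[i] * scale + mask[i]) over one row of length n.
    static void soft_max_row_ref(const float *x, const float *mask,
                                 float *y, int64_t n, float scale) {
        float maxv = -INFINITY;
        for (int64_t i = 0; i < n; ++i) {
            y[i] = x[i] * scale + (mask ? mask[i] : 0.0f);
            maxv = fmaxf(maxv, y[i]);
        }
        float sum = 0.0f;
        for (int64_t i = 0; i < n; ++i) {
            y[i] = expf(y[i] - maxv);   // subtract the row max for numerical stability
            sum += y[i];
        }
        const float inv = 1.0f / sum;
        for (int64_t i = 0; i < n; ++i) {
            y[i] *= inv;
        }
    }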
@@ -1184,7 +1280,7 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
             log_data.node_len = num_elem_src0;
             log_data.log_file = tsi_op_log_file;
             log_data.num_of_op = num_of_op;
-            log_data.kernel_type = kernel_type;
+            log_data.kernel_type = node->op;
 
             log_data.data_type = GGML_TSAVORITE_TENSOR_HEADER;
             ggml_tsi_log_tensor_data(log_data);
@@ -1214,15 +1310,6 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
             // Although only 32 elements are strictly necessary, reducing this would require changes to the RMS kernel.
             // The remaining 32 elements are used to store src0->ne[0], replicated across each of the last 32 entries.
 
-            MemRefDescriptor<Rank>* buf = create_mlir_buf<Rank>(96);
-
-            if (!buf) {
-                GGML_TSAVORITE_LOG_ERROR("tsi_alloc failied for creating memory for buf \n");
-                return GGML_STATUS_ABORTED;
-            }
-            buf->offset = 0;
-            buf->data = buf->base = (void *)(buf+1);
-
             float *val = (float *)buf->data;
             int i;
             for (i=64; i <= 95; ++i)
@@ -1250,6 +1337,7 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
             }
 
             ctx->kernels[kernel_type].pipeline->_mlir_fptr_2_input[kernel_sub_type](srcP0, nodeP, buf);
+
         }
         else {
             // kernel call
@@ -1460,6 +1548,7 @@ static void ggml_backend_tsavorite_log_allocated_size(txe_device_s device, size_
 static ggml_backend_buffer_t
 ggml_backend_tsavorite_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+    tsi_log_setup();
     struct ggml_backend_tsavorite_buffer_context *ctx =
         (struct ggml_backend_tsavorite_buffer_context *)calloc(
             1, sizeof(struct ggml_backend_tsavorite_buffer_context));
@@ -1984,6 +2073,7 @@ static bool ggml_backend_tsavorite_device_offload_op(ggml_backend_dev_t dev,
         case GGML_OP_SQR:
         case GGML_OP_SIN:
         case GGML_OP_RMS_NORM:
+        case GGML_OP_SOFT_MAX:
             break;
         case GGML_OP_GLU:
         {