@@ -52,6 +52,38 @@ typedef struct _txe_command_buffer_t *txe_command_buffer_s;
5252#endif /* USE_COMMAND_BUFFERS */
5353typedef struct ggml_backend_tsavorite_buffer ggml_backend_tsavorite_buffer_s;
5454
// Rank used for every MemRefDescriptor in this backend file.
55+ const int Rank = MEM_REF_DESCRIPTOR_RANK;
// Global scratch buffer: allocated once in ggml_tsavorite_init via
// create_mlir_buf<Rank>(96) and reused by the RMS_NORM / SOFT_MAX paths in
// ggml_tsavorite_graph_compute (offset/data/base are rewritten and the first
// 96 floats zeroed per node). NOTE(review): shared mutable global — assumes
// graph compute is single-threaded; confirm before enabling concurrent graphs.
56+ MemRefDescriptor<Rank>* glob_buf;
57+ 
58+ template <int Rank>
59+ // Assumes tsi_alloc is available and returns a pointer to allocated memory
60+ static MemRefDescriptor<Rank>* create_mlir_buf (int K) {
61+ // TVU load size (e.g., 32 for 1024-bit vector with 32-bit elements)
62+ const int32_t mem_align = TSI_TVU_MEM_ALIGN;
63+ // we are supporting only float or F32
64+ int data_type_len = 4 ;
65+ // MemRef Header also added
66+ int total_bytes = (sizeof (MemRefDescriptor<Rank>) + 4 *K);
67+
68+ // Round up K to the next multiple of tvu_size
69+ int32_t total_align_bytes = ((total_bytes % mem_align) != 0 ) ? ((total_bytes / mem_align) + 1 ) * mem_align : total_bytes;
70+
71+ // Allocate memory dynamically: space for header + data
72+ MemRefDescriptor<Rank>* header = (MemRefDescriptor<Rank>*) tsi_alloc (total_align_bytes);
73+
74+ if (!header) {
75+ return header;
76+ }
77+ // Advance pointer to skip header and get to data
78+ int32_t * data = (int32_t *)(header + 1 );
79+
80+ for (int32_t i = 0 ; i < K; ++i) {
81+ data[i] = 0 ;
82+ }
83+ return header;
84+ }
85+
86+
5587struct _txe_device_t {
5688 char name[100 ];
5789 uint32_t max_buf_len;
@@ -343,7 +375,6 @@ static void _mlir_ciface_txe_add_test (void *src0, void *src1, void *res)
343375 if (!src0 || !src1 || !res)
344376 return ;
345377
346- const int Rank = MEM_REF_DESCRIPTOR_RANK;
347378 MemRefDescriptor<Rank> *srcP0, *srcP1, *nodeP;
348379 srcP0 = (MemRefDescriptor<Rank> *)src0;
349380 srcP1 = (MemRefDescriptor<Rank> *)src1;
@@ -368,7 +399,6 @@ static void _mlir_ciface_txe_mult_test (void *src0, void *src1, void *res)
368399 if (!src0 || !src1 || !res)
369400 return ;
370401
371- const int Rank = MEM_REF_DESCRIPTOR_RANK;
372402 MemRefDescriptor<Rank> *srcP0, *srcP1, *nodeP;
373403 srcP0 = (MemRefDescriptor<Rank> *)src0;
374404 srcP1 = (MemRefDescriptor<Rank> *)src1;
@@ -489,6 +519,7 @@ static txe_compute_pipeline_state_s tsi_kernel_setup(enum ggml_tsavorite_kernel_
489519 case GGML_TSAVORITE_KERNEL_TYPE_SOFT_MAX:
490520 {
491521 kernel_pipeline->_mlir_fptr_3_input [DATA_TYPE_F32_INDEX] = &_mlir_ciface_txe_soft_max_host;
522+ // kernel_pipeline->_mlir_fptr_2_input[DATA_TYPE_F16_INDEX] = &_mlir_ciface_txe_soft_max_16_host;
492523 kernel_pipeline->kernel_name = " TXE_SOFTMAX" ;
493524 flag = true ;
494525 break ;
@@ -553,7 +584,11 @@ static void *ggml_tsavorite_host_malloc(size_t n) {
553584 void *data = NULL ;
554585 GGML_TSAVORITE_LOG_INFO (" Start %s\n " , __func__);
555586 GGML_TSAVORITE_LOG_INFO (" \n Allocating memory from tsi_alloc with size %ld \n " , n);
556- data = tsi_alloc (n);
587+
588+ const int32_t mem_align = TSI_TVU_MEM_ALIGN;
589+ int total_align_bytes = (n/mem_align +1 )*mem_align;
590+ data = tsi_alloc (total_align_bytes);
591+
557592 GGML_TSAVORITE_LOG_CONT (" \n Allocating memory from tsi_alloc with size %ld starting memory %p\n " ,
558593 n, data);
559594
@@ -644,6 +679,12 @@ static struct ggml_backend_tsavorite_context *ggml_tsavorite_init(ggml_backend_d
644679 GGML_TSAVORITE_KERNEL (GGML_TSAVORITE_KERNEL_TYPE_SWIGLU, true );
645680 GGML_TSAVORITE_KERNEL (GGML_TSAVORITE_KERNEL_TYPE_SOFT_MAX, true );
646681 }
682+ glob_buf = create_mlir_buf<Rank>(96 );
683+ if (!glob_buf) {
684+ GGML_TSAVORITE_LOG_ERROR (" tsi_alloc failied for creating memory for buf \n " );
685+ free (ctx);
686+ return NULL ;
687+ }
647688
648689 GGML_TSAVORITE_LOG_INFO (" End %s\n " , __func__);
649690 return ctx;
@@ -755,7 +796,9 @@ static bool ggml_tsavorite_supports_op(const struct ggml_backend_tsavorite_devic
755796 case GGML_OP_SQR:
756797 case GGML_OP_SIN:
757798 case GGML_OP_RMS_NORM:
758- case GGML_OP_SOFT_MAX:
799+ #ifdef GGML_TARGET_POSIX
800+ case GGML_OP_SOFT_MAX:
801+ #endif /* GGML_TARGET_POSIX */
759802 break ;
760803 case GGML_OP_GLU:
761804 {
@@ -811,31 +854,6 @@ static void ggml_tsavorite_decompose_unary_kernel(uint32_t num_elem, ggml_tensor
811854 return ;
812855}
813856
814- template <int Rank>
815- // Assumes tsi_alloc is available and returns a pointer to allocated memory
816- static MemRefDescriptor<Rank>* create_mlir_buf (int K) {
817- // TVU load size (e.g., 32 for 1024-bit vector with 32-bit elements)
818- const int32_t tvu_size = TSI_TVU_LOAD_SIZE;
819-
820- // Round up K to the next multiple of tvu_size
821- int32_t num_of_elem = ((K % tvu_size) != 0 ) ? ((K / tvu_size) + 1 ) * tvu_size : K;
822-
823- // Allocate memory dynamically: space for header + data
824- MemRefDescriptor<Rank>* header = (MemRefDescriptor<Rank>*) tsi_alloc (
825- sizeof (MemRefDescriptor<Rank>) + num_of_elem * sizeof (float )
826- );
827-
828- if (!header) {
829- return header;
830- }
831- // Advance pointer to skip header and get to data
832- int32_t * data = (int32_t *)(header + 1 );
833-
834- for (int32_t i = 0 ; i < num_of_elem; ++i) {
835- data[i] = 0 ;
836- }
837- return header;
838- }
839857
840858static enum ggml_tsavorite_kernel_type tsi_glu_kernel_type (struct ggml_tensor *node) {
841859 const ggml_glu_op op = ggml_get_glu_op (node);
@@ -926,7 +944,6 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
926944 return GGML_STATUS_FAILED;
927945 }
928946 // MemRefDescriptor
929- const int Rank = MEM_REF_DESCRIPTOR_RANK;
930947 MemRefDescriptor<Rank> *srcP0, *srcP1, *nodeP;
931948 struct ggml_tensor *src0, *src1, *node;
932949 uint32_t num_elem_src0, num_elem_src1, num_elem_node;
@@ -937,14 +954,6 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
937954 enum ggml_tsavorite_input_tensors_count num_of_input_tensors;
938955 tensor_log log_data;
939956
940- MemRefDescriptor<Rank>* buf = create_mlir_buf<Rank>(96 );
941-
942- if (!buf) {
943- GGML_TSAVORITE_LOG_ERROR (" tsi_alloc failied for creating memory for buf \n " );
944- return GGML_STATUS_ABORTED;
945- }
946- buf->offset = 0 ;
947- buf->data = buf->base = (void *)(buf+1 );
948957
949958 for (int i = 0 ; i < cgraph->n_nodes ; i++) {
950959 int32_t kernel_sub_type=-1 ;
@@ -968,6 +977,21 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
968977 printf (" \n kernel_sub_type not suppored\n " );
969978 return GGML_STATUS_ABORTED;
970979 }
980+
981+ if (node->op == GGML_OP_RMS_NORM || node->op == GGML_OP_SOFT_MAX) {
982+ if (!glob_buf) {
983+ GGML_TSAVORITE_LOG_ERROR (" tsi_alloc failied for creating memory for buf \n " );
984+ return GGML_STATUS_ABORTED;
985+ }
986+ glob_buf->offset = 0 ;
987+ glob_buf->data = glob_buf->base = (void *)(glob_buf+1 );
988+
989+ float *vall = (float *)glob_buf->data ;
990+ int ii;
991+ for (ii=0 ; ii <= 95 ; ++ii)
992+ vall[ii] = 0 ;
993+ }
994+
971995 switch (node->op ) {
972996 case GGML_OP_ADD:
973997 kernel_type = GGML_TSAVORITE_KERNEL_TYPE_ADD;
@@ -1115,6 +1139,7 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
11151139 log_data.node_len = num_elem_node;
11161140 log_data.log_file = tsi_op_log_file;
11171141 log_data.num_of_op = num_of_op;
1142+ // log_data.kernel_type = kernel_type;
11181143 log_data.kernel_type = node->op ;
11191144
11201145 log_data.data_type = GGML_TSAVORITE_TENSOR_HEADER;
@@ -1169,7 +1194,7 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
11691194 const float * sk = src2 ? (float *)((char *) src2->data ) : nullptr ;
11701195 // here src2 is NULL for particular model hence u can ignore this for now
11711196 if (src2) {
1172- printf (" \n ANOOP src2 is not null\n " );
1197+ printf (" \n src2 is not null for SOFT_MAX \n " );
11731198 }
11741199 for (int64_t i03 = 0 ; i03 < ne03; i03++) {
11751200 for (int64_t i02 = 0 ; i02 < ne02; i02++) {
@@ -1196,9 +1221,10 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
11961221 srcP0->data = srcP0->base = (void *)(sp);
11971222 nodeP->data = nodeP->base = (void *)(dp);
11981223
1199- float *val = (float *)buf ->data ;
1224+ float *val = (float *)glob_buf ->data ;
12001225 val[0 ] = scale;
1201- ctx->kernels [kernel_type].pipeline ->_mlir_fptr_3_input [kernel_sub_type](srcP0, srcP1, nodeP, buf);
1226+ ctx->kernels [kernel_type].pipeline ->_mlir_fptr_3_input [kernel_sub_type](srcP0, srcP1, nodeP, glob_buf);
1227+ ++device->stats .op_run_count [kernel_type].num_of_kernel_call ;
12021228 }
12031229 }
12041230 }
@@ -1280,6 +1306,7 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
12801306 log_data.node_len = num_elem_src0;
12811307 log_data.log_file = tsi_op_log_file;
12821308 log_data.num_of_op = num_of_op;
1309+ // log_data.kernel_type = kernel_type;
12831310 log_data.kernel_type = node->op ;
12841311
12851312 log_data.data_type = GGML_TSAVORITE_TENSOR_HEADER;
@@ -1310,7 +1337,8 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
13101337 // Although only 32 elements are strictly necessary, reducing this would require changes to the RMS kernel.
13111338 // The remaining 32 elements are used to store src0->ne[0], replicated across each of the last 32 entries.
13121339
1313- float *val = (float *)buf->data ;
1340+
1341+ float *val = (float *)glob_buf->data ;
13141342 int i;
13151343 for (i=64 ; i <= 95 ; ++i)
13161344 val[i] = node->ne [0 ];
@@ -1336,7 +1364,7 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
13361364 strides = strides * src0->ne [i];
13371365 }
13381366
1339- ctx->kernels [kernel_type].pipeline ->_mlir_fptr_2_input [kernel_sub_type](srcP0, nodeP, buf );
1367+ ctx->kernels [kernel_type].pipeline ->_mlir_fptr_2_input [kernel_sub_type](srcP0, nodeP, glob_buf );
13401368
13411369 }
13421370 else {
@@ -1442,7 +1470,6 @@ static void *ggml_backend_tsavorite_buffer_get_base(ggml_backend_buffer_t buffer
14421470static ggml_status ggml_backend_tsavorite_buffer_init_tensor (ggml_backend_buffer_t buffer,
14431471 struct ggml_tensor *tensor) {
14441472 GGML_TSAVORITE_LOG_INFO (" Start %s\n " , __func__);
1445- const int Rank = MEM_REF_DESCRIPTOR_RANK;
14461473 MemRefDescriptor<Rank> tensor_data_header;
14471474 tensor->data = (void *)(sizeof (tensor_data_header) + (char *)tensor->data );
14481475 GGML_TSAVORITE_LOG_INFO (" End %s\n " , __func__);
@@ -1633,7 +1660,6 @@ static size_t ggml_backend_tsavorite_buffer_type_get_alloc_size(ggml_backend_buf
16331660 GGML_TSAVORITE_LOG_ERROR (" \n tsavorite device is NULL \n " );
16341661 return 0 ;
16351662 }
1636- const int Rank = MEM_REF_DESCRIPTOR_RANK;
16371663 MemRefDescriptor<Rank> tensor_data_header;
16381664 ggml_backend_tsavorite_device_rel (
16391665 (struct ggml_backend_tsavorite_device_context *)buft->device ->context );
@@ -1645,7 +1671,10 @@ static size_t ggml_backend_tsavorite_buffer_type_get_alloc_size(ggml_backend_buf
16451671
16461672 // Add 128-byte buffer to avoid crossing memory boundaries during TVU 1024-bit operations.
16471673 // TVU processes data in 1024-bit chunks, so the last elements may exceed allocated space without this padding.
1648- return (sizeof (tensor_data_header) + ggml_nbytes (tensor) + 128 );
1674+ const int32_t mem_align = TSI_TVU_MEM_ALIGN;
1675+ // I also added extra Padding buffer
1676+ size_t n = (((sizeof (tensor_data_header) + ggml_nbytes (tensor))/mem_align +1 )*mem_align + mem_align);
1677+ return (n);
16491678
16501679 TSI_UNUSED (buft);
16511680}
@@ -2073,7 +2102,9 @@ static bool ggml_backend_tsavorite_device_offload_op(ggml_backend_dev_t dev,
20732102 case GGML_OP_SQR:
20742103 case GGML_OP_SIN:
20752104 case GGML_OP_RMS_NORM:
2076- case GGML_OP_SOFT_MAX:
2105+ #ifdef GGML_TARGET_POSIX
2106+ case GGML_OP_SOFT_MAX:
2107+ #endif /* GGML_TARGET_POSIX */
20772108 break ;
20782109 case GGML_OP_GLU:
20792110 {
0 commit comments