
Commit 41137ce

Author: Anoop Kapoor

Added 128-byte memory alignment
1 parent 85b71fd commit 41137ce

File tree

3 files changed: +82 -51 lines changed

ggml/include/ggml-tsavorite.h
ggml/src/ggml-tsavorite/ggml-tsavorite.cpp
tsi-pkg-build.sh

ggml/include/ggml-tsavorite.h

Lines changed: 1 addition & 1 deletion
@@ -213,7 +213,7 @@ extern void ggml_tsi_log_tensor_data(tensor_log log_data);
 
 // GGML supports tensors with a maximum rank of 4
 #define MEM_REF_DESCRIPTOR_RANK 4
-#define TSI_TVU_LOAD_SIZE 32
+#define TSI_TVU_MEM_ALIGN 128
 
 //
 // backend API
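For reference, 128 bytes is exactly one 1024-bit TVU vector, which is why this byte-oriented constant replaces the element-count-oriented TSI_TVU_LOAD_SIZE. A compile-time check along these lines could document that relationship; this is an illustrative sketch, not part of the commit:

// Illustrative only: tie the new alignment macro to the TVU vector width.
#define TSI_TVU_MEM_ALIGN 128
static_assert(TSI_TVU_MEM_ALIGN * 8 == 1024,
              "128-byte alignment equals one 1024-bit TVU vector");
static_assert((TSI_TVU_MEM_ALIGN & (TSI_TVU_MEM_ALIGN - 1)) == 0,
              "alignment must be a power of two");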

ggml/src/ggml-tsavorite/ggml-tsavorite.cpp

Lines changed: 78 additions & 47 deletions
@@ -52,6 +52,38 @@ typedef struct _txe_command_buffer_t *txe_command_buffer_s;
 #endif /* USE_COMMAND_BUFFERS */
 typedef struct ggml_backend_tsavorite_buffer ggml_backend_tsavorite_buffer_s;
 
+const int Rank = MEM_REF_DESCRIPTOR_RANK;
+MemRefDescriptor<Rank>* glob_buf;
+
+// Assumes tsi_alloc is available and returns a pointer to allocated memory
+template<int Rank>
+static MemRefDescriptor<Rank>* create_mlir_buf(int K) {
+    // Memory alignment in bytes (128 bytes = one 1024-bit TVU vector)
+    const int32_t mem_align = TSI_TVU_MEM_ALIGN;
+    // Only float (F32) elements are supported
+    int data_type_len = 4;
+    // Total size: MemRef header plus K 4-byte elements
+    int total_bytes = (sizeof(MemRefDescriptor<Rank>) + data_type_len * K);
+
+    // Round total_bytes up to the next multiple of mem_align
+    int32_t total_align_bytes = ((total_bytes % mem_align) != 0) ? ((total_bytes / mem_align) + 1) * mem_align : total_bytes;
+
+    // Allocate memory dynamically: space for header + data
+    MemRefDescriptor<Rank>* header = (MemRefDescriptor<Rank>*) tsi_alloc(total_align_bytes);
+
+    if (!header) {
+        return header;
+    }
+    // Advance pointer past the header to reach the data region
+    int32_t* data = (int32_t*)(header + 1);
+
+    for (int32_t i = 0; i < K; ++i) {
+        data[i] = 0;
+    }
+    return header;
+}
+
+
 struct _txe_device_t {
     char name[100];
     uint32_t max_buf_len;
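The rounding rule in create_mlir_buf can be read in isolation: sizes that are already a multiple of 128 are left unchanged, and anything else is bumped to the next 128-byte boundary. A minimal standalone sketch (round_up_to is a hypothetical helper, not a function from this commit):

#include <cstdint>
#include <cassert>

// Same arithmetic as create_mlir_buf: round bytes up to the next
// multiple of align, leaving exact multiples unchanged.
static int32_t round_up_to(int32_t bytes, int32_t align) {
    return (bytes % align != 0) ? ((bytes / align) + 1) * align : bytes;
}

int main() {
    assert(round_up_to(385, 128) == 512);  // bumped to the next boundary
    assert(round_up_to(512, 128) == 512);  // exact multiple is unchanged
    assert(round_up_to(1,   128) == 128);
    return 0;
}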
@@ -343,7 +375,6 @@ static void _mlir_ciface_txe_add_test (void *src0, void *src1, void *res)
     if (!src0 || !src1 || !res)
         return;
 
-    const int Rank = MEM_REF_DESCRIPTOR_RANK;
     MemRefDescriptor<Rank> *srcP0, *srcP1, *nodeP;
     srcP0 = (MemRefDescriptor<Rank> *)src0;
     srcP1 = (MemRefDescriptor<Rank> *)src1;
@@ -368,7 +399,6 @@ static void _mlir_ciface_txe_mult_test (void *src0, void *src1, void *res)
     if (!src0 || !src1 || !res)
         return;
 
-    const int Rank = MEM_REF_DESCRIPTOR_RANK;
     MemRefDescriptor<Rank> *srcP0, *srcP1, *nodeP;
     srcP0 = (MemRefDescriptor<Rank> *)src0;
     srcP1 = (MemRefDescriptor<Rank> *)src1;
@@ -489,6 +519,7 @@ static txe_compute_pipeline_state_s tsi_kernel_setup(enum ggml_tsavorite_kernel_
         case GGML_TSAVORITE_KERNEL_TYPE_SOFT_MAX:
         {
             kernel_pipeline->_mlir_fptr_3_input[DATA_TYPE_F32_INDEX] = &_mlir_ciface_txe_soft_max_host;
+            //kernel_pipeline->_mlir_fptr_2_input[DATA_TYPE_F16_INDEX] = &_mlir_ciface_txe_soft_max_16_host;
             kernel_pipeline->kernel_name = "TXE_SOFTMAX";
             flag = true;
             break;
@@ -553,7 +584,11 @@ static void *ggml_tsavorite_host_malloc(size_t n) {
     void *data = NULL;
     GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
     GGML_TSAVORITE_LOG_INFO("\n Allocating memory from tsi_alloc with size %ld \n", n);
-    data = tsi_alloc(n);
+
+    const int32_t mem_align = TSI_TVU_MEM_ALIGN;
+    int total_align_bytes = (n/mem_align + 1)*mem_align;
+    data = tsi_alloc(total_align_bytes);
+
     GGML_TSAVORITE_LOG_CONT("\n Allocating memory from tsi_alloc with size %ld starting memory %p\n",
                             n, data);
 
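Note that ggml_tsavorite_host_malloc uses the simpler (n/mem_align + 1)*mem_align form, which reserves a full extra 128-byte block even when n is already aligned (n = 256 yields 384), whereas create_mlir_buf keeps exact multiples unchanged. The over-allocation is at most mem_align bytes per call and is harmless for correctness. A small sketch contrasting the two variants (the function names are ours):

#include <cstddef>
#include <cassert>

// Variant used in ggml_tsavorite_host_malloc: always reserves at least
// one extra alignment block past n.
static size_t align_plus_one(size_t n, size_t align) {
    return (n / align + 1) * align;
}

// Variant used in create_mlir_buf: exact multiples stay as they are.
static size_t align_exact(size_t n, size_t align) {
    return (n % align != 0) ? (n / align + 1) * align : n;
}

int main() {
    assert(align_plus_one(256, 128) == 384);  // one extra block
    assert(align_exact(256, 128)    == 256);  // unchanged
    assert(align_plus_one(300, 128) == 384);
    assert(align_exact(300, 128)    == 384);
    return 0;
}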
@@ -644,6 +679,12 @@ static struct ggml_backend_tsavorite_context *ggml_tsavorite_init(ggml_backend_d
         GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SWIGLU, true);
         GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SOFT_MAX, true);
     }
+    glob_buf = create_mlir_buf<Rank>(96);
+    if (!glob_buf) {
+        GGML_TSAVORITE_LOG_ERROR("tsi_alloc failed to allocate memory for glob_buf\n");
+        free(ctx);
+        return NULL;
+    }
 
     GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
     return ctx;
@@ -755,7 +796,9 @@ static bool ggml_tsavorite_supports_op(const struct ggml_backend_tsavorite_devic
         case GGML_OP_SQR:
         case GGML_OP_SIN:
         case GGML_OP_RMS_NORM:
-        case GGML_OP_SOFT_MAX:
+#ifdef GGML_TARGET_POSIX
+        case GGML_OP_SOFT_MAX:
+#endif /* GGML_TARGET_POSIX */
             break;
         case GGML_OP_GLU:
         {
@@ -811,31 +854,6 @@ static void ggml_tsavorite_decompose_unary_kernel(uint32_t num_elem, ggml_tensor
     return;
 }
 
-template<int Rank>
-// Assumes tsi_alloc is available and returns a pointer to allocated memory
-static MemRefDescriptor<Rank>* create_mlir_buf(int K) {
-    // TVU load size (e.g., 32 for 1024-bit vector with 32-bit elements)
-    const int32_t tvu_size = TSI_TVU_LOAD_SIZE;
-
-    // Round up K to the next multiple of tvu_size
-    int32_t num_of_elem = ((K % tvu_size) != 0) ? ((K / tvu_size) + 1) * tvu_size : K;
-
-    // Allocate memory dynamically: space for header + data
-    MemRefDescriptor<Rank>* header = (MemRefDescriptor<Rank>*) tsi_alloc(
-        sizeof(MemRefDescriptor<Rank>) + num_of_elem * sizeof(float)
-    );
-
-    if (!header) {
-        return header;
-    }
-    // Advance pointer to skip header and get to data
-    int32_t* data = (int32_t*)(header + 1);
-
-    for (int32_t i = 0; i < num_of_elem; ++i) {
-        data[i] = 0;
-    }
-    return header;
-}
 
 static enum ggml_tsavorite_kernel_type tsi_glu_kernel_type(struct ggml_tensor *node) {
     const ggml_glu_op op = ggml_get_glu_op(node);
@@ -926,7 +944,6 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
         return GGML_STATUS_FAILED;
     }
     // MemRefDescriptor
-    const int Rank = MEM_REF_DESCRIPTOR_RANK;
     MemRefDescriptor<Rank> *srcP0, *srcP1, *nodeP;
     struct ggml_tensor *src0, *src1, *node;
     uint32_t num_elem_src0, num_elem_src1, num_elem_node;
@@ -937,14 +954,6 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
     enum ggml_tsavorite_input_tensors_count num_of_input_tensors;
     tensor_log log_data;
 
-    MemRefDescriptor<Rank>* buf = create_mlir_buf<Rank>(96);
-
-    if (!buf) {
-        GGML_TSAVORITE_LOG_ERROR("tsi_alloc failied for creating memory for buf \n");
-        return GGML_STATUS_ABORTED;
-    }
-    buf->offset = 0;
-    buf->data = buf->base = (void *)(buf+1);
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
         int32_t kernel_sub_type=-1;
@@ -968,6 +977,21 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
             printf("\n kernel_sub_type not supported\n");
             return GGML_STATUS_ABORTED;
         }
+
+        if (node->op == GGML_OP_RMS_NORM || node->op == GGML_OP_SOFT_MAX) {
+            if (!glob_buf) {
+                GGML_TSAVORITE_LOG_ERROR("tsi_alloc failed to allocate memory for glob_buf\n");
+                return GGML_STATUS_ABORTED;
+            }
+            glob_buf->offset = 0;
+            glob_buf->data = glob_buf->base = (void *)(glob_buf+1);
+
+            float *vall = (float *)glob_buf->data;
+            int ii;
+            for(ii=0; ii <= 95; ++ii)
+                vall[ii] = 0;
+        }
+
         switch (node->op) {
             case GGML_OP_ADD:
                 kernel_type = GGML_TSAVORITE_KERNEL_TYPE_ADD;
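Because glob_buf is now allocated once in ggml_tsavorite_init and shared across the whole graph (rather than created on every graph_compute call, as the removed code did), the 96-float scratch region has to be re-zeroed before each RMS_NORM or SOFT_MAX node so stale values from the previous node cannot leak into the next kernel call. A self-contained sketch of that reset, with a stand-in struct for the descriptor fields actually used here:

#include <cstdlib>

// Minimal stand-in for the MemRefDescriptor fields used by the reset;
// the real template lives in the tsavorite headers.
template <int R>
struct MemRefDescriptorSketch {
    void *base;
    void *data;
    long  offset;
    // (the real descriptor also carries size/stride info per rank)
};

constexpr int SCRATCH_ELEMS = 96;  // matches create_mlir_buf<Rank>(96)

template <int R>
static void reset_scratch(MemRefDescriptorSketch<R> *buf) {
    buf->offset = 0;
    buf->data = buf->base = (void *)(buf + 1);  // data follows the header
    float *val = (float *)buf->data;
    for (int i = 0; i < SCRATCH_ELEMS; ++i)
        val[i] = 0.0f;
}

int main() {
    // Allocate header + data once, as ggml_tsavorite_init now does.
    void *mem = std::malloc(sizeof(MemRefDescriptorSketch<4>) + SCRATCH_ELEMS * sizeof(float));
    if (!mem) return 1;
    reset_scratch(static_cast<MemRefDescriptorSketch<4> *>(mem));
    std::free(mem);
    return 0;
}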
@@ -1115,6 +1139,7 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
         log_data.node_len = num_elem_node;
         log_data.log_file = tsi_op_log_file;
         log_data.num_of_op = num_of_op;
+        //log_data.kernel_type = kernel_type;
         log_data.kernel_type = node->op;
 
         log_data.data_type = GGML_TSAVORITE_TENSOR_HEADER;
@@ -1169,7 +1194,7 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
                 const float * sk = src2 ? (float *)((char *) src2->data) : nullptr;
                 // here src2 is NULL for this particular model, so it can be ignored for now
                 if (src2) {
-                    printf("\n ANOOP src2 is not null\n");
+                    printf("\n src2 is not null for SOFT_MAX\n");
                 }
                 for (int64_t i03 = 0; i03 < ne03; i03++) {
                     for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -1196,9 +1221,10 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
                         srcP0->data = srcP0->base = (void *)(sp);
                         nodeP->data = nodeP->base = (void *)(dp);
 
-                        float *val = (float *)buf->data;
+                        float *val = (float *)glob_buf->data;
                         val[0] = scale;
-                        ctx->kernels[kernel_type].pipeline->_mlir_fptr_3_input[kernel_sub_type](srcP0, srcP1, nodeP, buf);
+                        ctx->kernels[kernel_type].pipeline->_mlir_fptr_3_input[kernel_sub_type](srcP0, srcP1, nodeP, glob_buf);
+                        ++device->stats.op_run_count[kernel_type].num_of_kernel_call;
                     }
                 }
             }
@@ -1280,6 +1306,7 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
         log_data.node_len = num_elem_src0;
         log_data.log_file = tsi_op_log_file;
         log_data.num_of_op = num_of_op;
+        //log_data.kernel_type = kernel_type;
         log_data.kernel_type = node->op;
 
         log_data.data_type = GGML_TSAVORITE_TENSOR_HEADER;
@@ -1310,7 +1337,8 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
             // Although only 32 elements are strictly necessary, reducing this would require changes to the RMS kernel.
             // The remaining 32 elements are used to store src0->ne[0], replicated across each of the last 32 entries.
 
-            float *val = (float *)buf->data;
+
+            float *val = (float *)glob_buf->data;
             int i;
             for(i=64; i <= 95; ++i)
                 val[i] = node->ne[0];
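Taken together, the two consumers give the 96-float scratch an implicit layout: SOFT_MAX passes its scale factor in element 0, and RMS_NORM replicates src0->ne[0] across elements 64..95 (per the comments above, only 32 of those are strictly needed by the kernel). A purely descriptive sketch of that layout; this struct does not exist in the source:

// Hypothetical view of the 96-float scratch region, for documentation only.
struct ScratchLayout {
    float softmax_scale;  // val[0]: scale passed to the SOFT_MAX kernel
    float reserved[63];   // val[1..63]: not written by the code in this diff
    float rms_ne0[32];    // val[64..95]: src0->ne[0], replicated for RMS_NORM
};
static_assert(sizeof(ScratchLayout) == 96 * sizeof(float),
              "layout spans the whole 96-element scratch buffer");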
@@ -1336,7 +1364,7 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
                 strides = strides * src0->ne[i];
             }
 
-            ctx->kernels[kernel_type].pipeline->_mlir_fptr_2_input[kernel_sub_type](srcP0, nodeP, buf);
+            ctx->kernels[kernel_type].pipeline->_mlir_fptr_2_input[kernel_sub_type](srcP0, nodeP, glob_buf);
 
         }
         else {
@@ -1442,7 +1470,6 @@ static void *ggml_backend_tsavorite_buffer_get_base(ggml_backend_buffer_t buffer
 static ggml_status ggml_backend_tsavorite_buffer_init_tensor(ggml_backend_buffer_t buffer,
                                                              struct ggml_tensor *tensor) {
     GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
-    const int Rank = MEM_REF_DESCRIPTOR_RANK;
     MemRefDescriptor<Rank> tensor_data_header;
     tensor->data = (void *)(sizeof(tensor_data_header) + (char *)tensor->data);
     GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
@@ -1633,7 +1660,6 @@ static size_t ggml_backend_tsavorite_buffer_type_get_alloc_size(ggml_backend_buf
         GGML_TSAVORITE_LOG_ERROR("\n tsavorite device is NULL \n");
         return 0;
     }
-    const int Rank = MEM_REF_DESCRIPTOR_RANK;
     MemRefDescriptor<Rank> tensor_data_header;
     ggml_backend_tsavorite_device_rel(
         (struct ggml_backend_tsavorite_device_context *)buft->device->context);
@@ -1645,7 +1671,10 @@ static size_t ggml_backend_tsavorite_buffer_type_get_alloc_size(ggml_backend_buf
 
     // Add 128-byte buffer to avoid crossing memory boundaries during TVU 1024-bit operations.
     // TVU processes data in 1024-bit chunks, so the last elements may exceed allocated space without this padding.
-    return (sizeof(tensor_data_header) + ggml_nbytes(tensor) + 128);
+    const int32_t mem_align = TSI_TVU_MEM_ALIGN;
+    // An extra mem_align bytes of padding is also added
+    size_t n = (((sizeof(tensor_data_header) + ggml_nbytes(tensor))/mem_align + 1)*mem_align + mem_align);
+    return (n);
 
     TSI_UNUSED(buft);
 }
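A worked example of the new size computation: with a 2048-byte tensor and, say, an 80-byte tensor_data_header (an illustrative value, not the real sizeof(MemRefDescriptor<Rank>)), (80 + 2048)/128 = 16, so the rounded size is 17 * 128 = 2176, and the extra padding block brings it to 2304 bytes. In sketch form:

#include <cstddef>
#include <cassert>

// Mirrors ggml_backend_tsavorite_buffer_type_get_alloc_size: round the
// header + tensor size past the next 128-byte boundary, then add one
// more block of padding for TVU overruns.
static size_t tsavorite_alloc_size(size_t header_bytes, size_t tensor_bytes) {
    const size_t mem_align = 128;  // TSI_TVU_MEM_ALIGN
    return ((header_bytes + tensor_bytes) / mem_align + 1) * mem_align + mem_align;
}

int main() {
    assert(tsavorite_alloc_size(80, 2048) == 2304);  // 17*128 + 128
    return 0;
}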
@@ -2073,7 +2102,9 @@ static bool ggml_backend_tsavorite_device_offload_op(ggml_backend_dev_t dev,
         case GGML_OP_SQR:
         case GGML_OP_SIN:
         case GGML_OP_RMS_NORM:
-        case GGML_OP_SOFT_MAX:
+#ifdef GGML_TARGET_POSIX
+        case GGML_OP_SOFT_MAX:
+#endif /* GGML_TARGET_POSIX */
             break;
         case GGML_OP_GLU:
         {

tsi-pkg-build.sh

Lines changed: 3 additions & 3 deletions
@@ -38,11 +38,11 @@ cd ../../
 echo 'building llama.cpp, ggml for tsavorite and other binaries for posix'
 if [ "$(echo "$1" | tr '[:upper:]' '[:lower:]')" = "release" ];
 then
-    cmake -B build-posix -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=posix -DCMAKE_C_FLAGS="-DGGML_PERF_RELEASE" -DCMAKE_CXX_FLAGS="-DGGML_PERF_RELEASE"
+    cmake -B build-posix -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=posix -DCMAKE_C_FLAGS="-DGGML_PERF_RELEASE -DGGML_TARGET_POSIX" -DCMAKE_CXX_FLAGS="-DGGML_PERF_RELEASE -DGGML_TARGET_POSIX"
 elif [ "$(echo "$1" | tr '[:upper:]' '[:lower:]')" = "debug" ]; then
-    cmake -B build-posix -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=posix -DCMAKE_C_FLAGS="-DGGML_PERF_DETAIL" -DCMAKE_CXX_FLAGS="-DGGML_PERF_DETAIL"
+    cmake -B build-posix -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=posix -DCMAKE_C_FLAGS="-DGGML_PERF_DETAIL -DGGML_TARGET_POSIX" -DCMAKE_CXX_FLAGS="-DGGML_PERF_DETAIL -DGGML_TARGET_POSIX"
 else
-    cmake -B build-posix -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=posix -DCMAKE_C_FLAGS="-DGGML_PERF" -DCMAKE_CXX_FLAGS="-DGGML_PERF"
+    cmake -B build-posix -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=posix -DCMAKE_C_FLAGS="-DGGML_PERF -DGGML_TARGET_POSIX" -DCMAKE_CXX_FLAGS="-DGGML_PERF -DGGML_TARGET_POSIX"
 fi
 
 cmake --build build-posix --config Release
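The -DGGML_TARGET_POSIX added to both the C and C++ flags is what activates the SOFT_MAX cases guarded in ggml-tsavorite.cpp above; built without it, ggml_tsavorite_supports_op and ggml_backend_tsavorite_device_offload_op fall through and SOFT_MAX stays off the tsavorite backend. A condensed sketch of the interaction (the names are simplified stand-ins, not the real ggml symbols):

// Build with -DGGML_TARGET_POSIX, as tsi-pkg-build.sh now does for posix.
enum op_sketch { OP_RMS_NORM, OP_SOFT_MAX, OP_OTHER };

static bool supports_op_sketch(op_sketch op) {
    switch (op) {
        case OP_RMS_NORM:
#ifdef GGML_TARGET_POSIX
        case OP_SOFT_MAX:  // offloaded only when the flag is defined
#endif
            return true;
        default:
            return false;
    }
}

int main() { return supports_op_sketch(OP_SOFT_MAX) ? 0 : 1; }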
