
Commit 13002a0

ggml-hexagon: respect input size when getting/setting tensor data (#16836)
* respect input size when getting/setting tensor data; allows partial repacking/copying when the requested get/set size is smaller than the full tensor
* remove duplicate repack_mxfp4_mxfp4x4x2 function
1 parent 6eb208d commit 13002a0
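
Every repack/unpack path touched by this commit applies the same pattern: clamp the incoming byte count to the tensor's total size, then split the copy into full rows plus a partial tail row that is routed through a zero-padded scratch buffer. A minimal standalone sketch of that arithmetic, assuming made-up dimensions (nrows, row_size and size below are illustrative, not values from the code; the variable names mirror the diff):

#include <stdint.h>
#include <stdio.h>

int main(void) {
    // Hypothetical tensor: 10 rows of 18 bytes each.
    const int64_t nrows    = 10;
    const size_t  row_size = 18;
    const size_t  size     = 45; // caller asks to set/get only 45 bytes

    // Clamp to what the tensor actually holds, as the patched functions do.
    const size_t total_tensor_size = (size_t) nrows * row_size;
    const size_t n_bytes_to_copy   = size < total_tensor_size ? size : total_tensor_size;

    // Split into full rows and a partial tail row.
    const int64_t n_full_rows = n_bytes_to_copy / row_size;
    const size_t  n_rem_bytes = n_bytes_to_copy % row_size;

    // -> 2 full rows, 9 remaining bytes
    printf("%lld full rows, %zu remaining bytes\n", (long long) n_full_rows, n_rem_bytes);
    return 0;
}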

File tree

1 file changed: +168 -12 lines changed


ggml/src/ggml-hexagon/ggml-hexagon.cpp

Lines changed: 168 additions & 12 deletions
@@ -676,6 +676,15 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size)
     size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2)); // extra elements for the pad
     size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)

+    // Ensure we don't try to read more data than is available in the source buffer 'data'
+    // or write more than the tensor can hold.
+    const size_t total_tensor_size = (size_t)nrows * row_size;
+    const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
+
+    // Calculate how many full rows and how many remaining bytes we need to process.
+    const int64_t n_full_rows = n_bytes_to_copy / row_size;
+    const size_t n_rem_bytes = n_bytes_to_copy % row_size;
+
     void * buf_pd = ggml_aligned_malloc(row_size_pd);
     GGML_ASSERT(buf_pd != NULL);

@@ -687,7 +696,8 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size)

     init_row_q4x4x2((block_q4_0 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros

-    for (int64_t i = 0; i < nrows; i++) {
+    // 1. Process all the full rows
+    for (int64_t i = 0; i < n_full_rows; i++) {
         const uint8_t * src = (const uint8_t *) data + (i * row_size);
         uint8_t * dst = (uint8_t *) t->data + (i * row_size);

@@ -696,6 +706,25 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size)
         memcpy(dst, buf_rp, row_size);
     }

+    // 2. Process the final, potentially partial, row
+    if (n_rem_bytes > 0) {
+        const int64_t i = n_full_rows;
+        const uint8_t * src = (const uint8_t *) data + (i * row_size);
+        uint8_t * dst = (uint8_t *) t->data + (i * row_size);
+
+        // re-init the row because we are potentially copying a partial row
+        init_row_q4x4x2((block_q4_0 *) buf_pd, t->ne[0]);
+
+        // Copy only the remaining bytes from the source.
+        memcpy(buf_pd, src, n_rem_bytes);
+
+        // Repack the entire buffer
+        repack_row_q4x4x2((uint8_t *) buf_rp, (const block_q4_0 *) buf_pd, t->ne[0]);
+
+        // Write only the corresponding remaining bytes to the destination tensor.
+        memcpy(dst, buf_rp, n_rem_bytes);
+    }
+
     ggml_aligned_free(buf_pd, row_size_pd);
     ggml_aligned_free(buf_rp, row_size_rp);
 }
@@ -708,6 +737,14 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)
     size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2)); // extra elements for the pad
     size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)

+    // Ensure we don't try to copy more data than the tensor actually contains.
+    const size_t total_tensor_size = (size_t)nrows * row_size;
+    const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
+
+    // Calculate how many full rows and how many remaining bytes we need to process.
+    const int64_t n_full_rows = n_bytes_to_copy / row_size;
+    const size_t n_rem_bytes = n_bytes_to_copy % row_size;
+
     void * buf_pd = ggml_aligned_malloc(row_size_pd);
     GGML_ASSERT(buf_pd != NULL);

@@ -719,7 +756,8 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)

     memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros

-    for (int64_t i = 0; i < nrows; i++) {
+    // 1. Process all the full rows
+    for (int64_t i = 0; i < n_full_rows; i++) {
         const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
         uint8_t * dst = (uint8_t *) data + (i * row_size);

@@ -728,6 +766,20 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)
         memcpy(dst, buf_rp, row_size);
     }

+    // 2. Process the final, potentially partial, row
+    if (n_rem_bytes > 0) {
+        const int64_t i = n_full_rows;
+        const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
+        uint8_t * dst = (uint8_t *) data + (i * row_size);
+
+        // We still need to read and unpack the entire source row because quantization is block-based.
+        memcpy(buf_pd, src, row_size);
+        unpack_row_q4x4x2((block_q4_0 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]);
+
+        // But we only copy the remaining number of bytes to the destination.
+        memcpy(dst, buf_rp, n_rem_bytes);
+    }
+
     ggml_aligned_free(buf_pd, row_size_pd);
     ggml_aligned_free(buf_rp, row_size_rp);
 }
@@ -950,6 +1002,15 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size)
     size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2)); // extra elements for the pad
     size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)

+    // Ensure we don't try to read more data than is available in the source buffer 'data'
+    // or write more than the tensor can hold.
+    const size_t total_tensor_size = (size_t)nrows * row_size;
+    const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
+
+    // Calculate how many full rows and how many remaining bytes we need to process.
+    const int64_t n_full_rows = n_bytes_to_copy / row_size;
+    const size_t n_rem_bytes = n_bytes_to_copy % row_size;
+
     void * buf_pd = ggml_aligned_malloc(row_size_pd);
     GGML_ASSERT(buf_pd != NULL);

@@ -961,7 +1022,8 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size)

     init_row_q8x4x2((block_q8_0 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros

-    for (int64_t i = 0; i < nrows; i++) {
+    // 1. Process all the full rows
+    for (int64_t i = 0; i < n_full_rows; i++) {
         const uint8_t * src = (const uint8_t *) data + (i * row_size);
         uint8_t * dst = (uint8_t *) t->data + (i * row_size);

@@ -970,6 +1032,25 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size)
         memcpy(dst, buf_rp, row_size);
     }

+    // 2. Process the final, potentially partial, row
+    if (n_rem_bytes > 0) {
+        const int64_t i = n_full_rows;
+        const uint8_t * src = (const uint8_t *) data + (i * row_size);
+        uint8_t * dst = (uint8_t *) t->data + (i * row_size);
+
+        // re-init the row because we are potentially copying a partial row
+        init_row_q8x4x2((block_q8_0 *) buf_pd, t->ne[0]);
+
+        // Copy only the remaining bytes from the source.
+        memcpy(buf_pd, src, n_rem_bytes);
+
+        // Repack the entire buffer
+        repack_row_q8x4x2((uint8_t *) buf_rp, (const block_q8_0 *) buf_pd, t->ne[0]);
+
+        // Write only the corresponding remaining bytes to the destination tensor.
+        memcpy(dst, buf_rp, n_rem_bytes);
+    }
+
     ggml_aligned_free(buf_pd, row_size_pd);
     ggml_aligned_free(buf_rp, row_size_rp);
 }
@@ -982,6 +1063,14 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size)
     size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2)); // extra elements for the pad
     size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)

+    // Ensure we don't try to copy more data than the tensor actually contains.
+    const size_t total_tensor_size = (size_t)nrows * row_size;
+    const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
+
+    // Calculate how many full rows and how many remaining bytes we need to process.
+    const int64_t n_full_rows = n_bytes_to_copy / row_size;
+    const size_t n_rem_bytes = n_bytes_to_copy % row_size;
+
     void * buf_pd = ggml_aligned_malloc(row_size_pd);
     GGML_ASSERT(buf_pd != NULL);

@@ -993,7 +1082,8 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size)

     memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros

-    for (int64_t i = 0; i < nrows; i++) {
+    // 1. Process all the full rows
+    for (int64_t i = 0; i < n_full_rows; i++) {
         const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
         uint8_t * dst = (uint8_t *) data + (i * row_size);

@@ -1002,6 +1092,20 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size)
         memcpy(dst, buf_rp, row_size);
     }

+    // 2. Process the final, potentially partial, row
+    if (n_rem_bytes > 0) {
+        const int64_t i = n_full_rows;
+        const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
+        uint8_t * dst = (uint8_t *) data + (i * row_size);
+
+        // We still need to read and unpack the entire source row because quantization is block-based.
+        memcpy(buf_pd, src, row_size);
+        unpack_row_q8x4x2((block_q8_0 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]);
+
+        // But we only copy the remaining number of bytes to the destination.
+        memcpy(dst, buf_rp, n_rem_bytes);
+    }
+
     ggml_aligned_free(buf_pd, row_size_pd);
     ggml_aligned_free(buf_rp, row_size_rp);
 }
@@ -1249,6 +1353,15 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si
     size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2)); // extra elements for the pad
     size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)

+    // Ensure we don't try to read more data than is available in the source buffer 'data'
+    // or write more than the tensor can hold.
+    const size_t total_tensor_size = (size_t)nrows * row_size;
+    const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
+
+    // Calculate how many full rows and how many remaining bytes we need to process.
+    const int64_t n_full_rows = n_bytes_to_copy / row_size;
+    const size_t n_rem_bytes = n_bytes_to_copy % row_size;
+
     void * buf_pd = ggml_aligned_malloc(row_size_pd);
     GGML_ASSERT(buf_pd != NULL);

@@ -1260,7 +1373,8 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si

     init_row_mxfp4x4x2((block_mxfp4 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros

-    for (int64_t i = 0; i < nrows; i++) {
+    // 1. Process all the full rows
+    for (int64_t i = 0; i < n_full_rows; i++) {
         const uint8_t * src = (const uint8_t *) data + (i * row_size);
         uint8_t * dst = (uint8_t *) t->data + (i * row_size);

@@ -1269,6 +1383,25 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si
         memcpy(dst, buf_rp, row_size);
     }

+    // 2. Process the final, potentially partial, row
+    if (n_rem_bytes > 0) {
+        const int64_t i = n_full_rows;
+        const uint8_t * src = (const uint8_t *) data + (i * row_size);
+        uint8_t * dst = (uint8_t *) t->data + (i * row_size);
+
+        // re-init the row because we are potentially copying a partial row
+        init_row_mxfp4x4x2((block_mxfp4 *) buf_pd, t->ne[0]);
+
+        // Copy only the remaining bytes from the source.
+        memcpy(buf_pd, src, n_rem_bytes);
+
+        // Repack the entire buffer (partial data + zero padding).
+        repack_row_mxfp4x4x2((uint8_t *) buf_rp, (const block_mxfp4 *) buf_pd, t->ne[0]);
+
+        // Write only the corresponding remaining bytes to the destination tensor.
+        memcpy(dst, buf_rp, n_rem_bytes);
+    }
+
     ggml_aligned_free(buf_pd, row_size_pd);
     ggml_aligned_free(buf_rp, row_size_rp);
 }
@@ -1281,6 +1414,14 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si
     size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2)); // extra elements for the pad
     size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)

+    // Ensure we don't try to copy more data than the tensor actually contains.
+    const size_t total_tensor_size = (size_t)nrows * row_size;
+    const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
+
+    // Calculate how many full rows and how many remaining bytes we need to process.
+    const int64_t n_full_rows = n_bytes_to_copy / row_size;
+    const size_t n_rem_bytes = n_bytes_to_copy % row_size;
+
     void * buf_pd = ggml_aligned_malloc(row_size_pd);
     GGML_ASSERT(buf_pd != NULL);

@@ -1292,7 +1433,8 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si

     memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros

-    for (int64_t i = 0; i < nrows; i++) {
+    // 1. Process all the full rows
+    for (int64_t i = 0; i < n_full_rows; i++) {
         const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
         uint8_t * dst = (uint8_t *) data + (i * row_size);

@@ -1301,6 +1443,20 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si
         memcpy(dst, buf_rp, row_size);
     }

+    // 2. Process the final, potentially partial, row
+    if (n_rem_bytes > 0) {
+        const int64_t i = n_full_rows;
+        const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
+        uint8_t * dst = (uint8_t *) data + (i * row_size);
+
+        // We still need to read and unpack the entire source row because the format is block-based.
+        memcpy(buf_pd, src, row_size);
+        unpack_row_mxfp4x4x2((block_mxfp4 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]);
+
+        // But we only copy the remaining number of bytes to the destination to respect the size limit.
+        memcpy(dst, buf_rp, n_rem_bytes);
+    }
+
     ggml_aligned_free(buf_pd, row_size_pd);
     ggml_aligned_free(buf_rp, row_size_rp);
 }
@@ -1319,19 +1475,19 @@ static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer,
     switch (tensor->type) {
         case GGML_TYPE_Q4_0:
             GGML_ASSERT(offset == 0);
-            GGML_ASSERT(size == ggml_nbytes(tensor));
+            GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
             repack_q4_0_q4x4x2(tensor, data, size);
             break;

         case GGML_TYPE_Q8_0:
             GGML_ASSERT(offset == 0);
-            GGML_ASSERT(size == ggml_nbytes(tensor));
+            GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
             repack_q8_0_q8x4x2(tensor, data, size);
             break;

         case GGML_TYPE_MXFP4:
             GGML_ASSERT(offset == 0);
-            GGML_ASSERT(size == ggml_nbytes(tensor));
+            GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
             repack_mxfp4_mxfp4x4x2(tensor, data, size);
             break;

@@ -1355,19 +1511,19 @@ static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer,
     switch (tensor->type) {
         case GGML_TYPE_Q4_0:
             GGML_ASSERT(offset == 0);
-            GGML_ASSERT(size == ggml_nbytes(tensor));
+            GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
             repack_q4x4x2_q4_0(data, tensor, size);
             break;

         case GGML_TYPE_Q8_0:
             GGML_ASSERT(offset == 0);
-            GGML_ASSERT(size == ggml_nbytes(tensor));
+            GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
             repack_q8x4x2_q8_0(data, tensor, size);
             break;

         case GGML_TYPE_MXFP4:
             GGML_ASSERT(offset == 0);
-            GGML_ASSERT(size == ggml_nbytes(tensor));
+            GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
             repack_mxfp4x4x2_mxfp4(data, tensor, size);
             break;
