
Commit fe62901

hexagon: enhance tensor repacking functions to handle partial rows and ensure buffer size safety
1 parent 904fd34 commit fe62901

2 files changed: +216 -10 lines


CMakeUserPresets.json

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
+{
+    "version": 4,
+    "configurePresets": [
+        {
+            "name": "arm64-android-snapdragon",
+            "hidden": true,
+            "architecture": { "value": "arm64", "strategy": "external" },
+            "toolset": { "value": "host=x86_64", "strategy": "external" },
+            "cacheVariables": {
+                "ANDROID_ABI": "arm64-v8a",
+                "ANDROID_PLATFORM": "android-31",
+                "CMAKE_TOOLCHAIN_FILE": "$env{ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake",
+                "CMAKE_C_FLAGS": "-march=armv8.6-a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE",
+                "CMAKE_CXX_FLAGS": "-march=armv8.6-a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE",
+                "CMAKE_C_FLAGS_RELEASE": "-O3 -DNDEBUG",
+                "CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG",
+                "CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
+                "CMAKE_CXX_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
+                "HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}",
+                "PREBUILT_LIB_DIR": "android_aarch64",
+                "GGML_OPENMP": "OFF",
+                "GGML_LLAMAFILE": "OFF",
+                "GGML_OPENCL": "OFF",
+                "GGML_HEXAGON": "ON",
+                "LLAMA_CURL": "OFF",
+                "GGML_BACKEND_DL": "ON"
+            }
+        },
+
+        {
+            "name": "arm64-windows-snapdragon",
+            "inherits": [ "base", "arm64-windows-llvm" ],
+            "cacheVariables": {
+                "HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}",
+                "PREBUILT_LIB_DIR": "windows_aarch64",
+                "GGML_OPENMP": "OFF",
+                "GGML_LLAMAFILE": "OFF",
+                "GGML_OPENCL": "ON",
+                "GGML_HEXAGON": "ON",
+                "LLAMA_CURL": "OFF"
+            }
+        },
+
+        { "name": "arm64-android-snapdragon-debug",   "inherits": [ "base", "arm64-android-snapdragon", "debug" ] },
+        { "name": "arm64-android-snapdragon-release", "inherits": [ "base", "arm64-android-snapdragon", "release" ] },
+
+        { "name": "arm64-windows-snapdragon-debug",   "inherits": [ "base", "arm64-windows-snapdragon", "debug" ] },
+        { "name": "arm64-windows-snapdragon-release", "inherits": [ "base", "arm64-windows-snapdragon", "release" ] }
+    ]
+}
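These presets layer the Snapdragon/Hexagon configuration on top of presets that already exist in the repository's CMakePresets.json (base, arm64-windows-llvm, debug, release), which CMake pulls in automatically for a CMakeUserPresets.json in the same directory; ANDROID_NDK_ROOT and HEXAGON_SDK_ROOT are read from the environment. As a hedged usage note (assuming the inherited base preset supplies the build directory, as it does in the repository's CMakePresets.json), configuring the Android build should reduce to exporting those two variables and running cmake --preset arm64-android-snapdragon-release, followed by a normal cmake --build of the generated build directory.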

ggml/src/ggml-hexagon/ggml-hexagon.cpp

Lines changed: 166 additions & 10 deletions
@@ -676,6 +676,15 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size)
     size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2)); // extra elements for the pad
     size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)

+    // Ensure we don't try to read more data than is available in the source buffer 'data'
+    // or write more than the tensor can hold.
+    const size_t total_tensor_size = (size_t)nrows * row_size;
+    const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
+
+    // Calculate how many full rows and how many remaining bytes we need to process.
+    const int64_t n_full_rows = n_bytes_to_copy / row_size;
+    const size_t n_rem_bytes = n_bytes_to_copy % row_size;
+
     void * buf_pd = ggml_aligned_malloc(row_size_pd);
     GGML_ASSERT(buf_pd != NULL);

@@ -687,7 +696,8 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size)

     init_row_q4x4x2((block_q4_0 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros

-    for (int64_t i = 0; i < nrows; i++) {
+    // 1. Process all the full rows
+    for (int64_t i = 0; i < n_full_rows; i++) {
         const uint8_t * src = (const uint8_t *) data + (i * row_size);
         uint8_t * dst = (uint8_t *) t->data + (i * row_size);

@@ -696,6 +706,25 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size)
         memcpy(dst, buf_rp, row_size);
     }

+    // 2. Process the final, potentially partial, row
+    if (n_rem_bytes > 0) {
+        const int64_t i = n_full_rows;
+        const uint8_t * src = (const uint8_t *) data + (i * row_size);
+        uint8_t * dst = (uint8_t *) t->data + (i * row_size);
+
+        // re-init the row because we are potentially copying a partial row
+        init_row_q4x4x2((block_q4_0 *) buf_pd, t->ne[0]);
+
+        // Copy only the remaining bytes from the source.
+        memcpy(buf_pd, src, n_rem_bytes);
+
+        // Repack the entire buffer
+        repack_row_q4x4x2((uint8_t *) buf_rp, (const block_q4_0 *) buf_pd, t->ne[0]);
+
+        // Write only the corresponding remaining bytes to the destination tensor.
+        memcpy(dst, buf_rp, n_rem_bytes);
+    }
+
     ggml_aligned_free(buf_pd, row_size_pd);
     ggml_aligned_free(buf_rp, row_size_rp);
 }
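The same clamp-and-split computation is repeated in all six repack/unpack helpers in this file. As a standalone sketch (the helper name and the concrete numbers are illustrative, not part of the commit): a Q4_0 block covers 32 elements in 18 bytes, so a row with ne[0] = 8192 occupies 4608 bytes; with nrows = 10 and an incoming size of 43000 bytes the code above repacks 9 full rows and routes the remaining 1528 bytes through the zero-padded scratch row.

#include <stddef.h>
#include <stdint.h>

// Hypothetical helper, not part of the commit: the extent computation used by
// repack_q4_0_q4x4x2() and friends, factored out for illustration.
static void split_copy_extent(size_t size, size_t row_size, int64_t nrows,
                              int64_t * n_full_rows, size_t * n_rem_bytes) {
    // never read or write past either the caller's buffer or the tensor itself
    const size_t total_tensor_size = (size_t) nrows * row_size;
    const size_t n_bytes_to_copy   = size < total_tensor_size ? size : total_tensor_size;

    *n_full_rows = (int64_t) (n_bytes_to_copy / row_size); // rows handled by the main loop
    *n_rem_bytes = n_bytes_to_copy % row_size;             // tail handled via the padded scratch row
}

// e.g. row_size = 4608 (Q4_0, ne[0] = 8192), nrows = 10, size = 43000
//   -> n_full_rows = 9, n_rem_bytes = 1528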
@@ -708,6 +737,14 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)
     size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2)); // extra elements for the pad
     size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)

+    // Ensure we don't try to copy more data than the tensor actually contains.
+    const size_t total_tensor_size = (size_t)nrows * row_size;
+    const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
+
+    // Calculate how many full rows and how many remaining bytes we need to process.
+    const int64_t n_full_rows = n_bytes_to_copy / row_size;
+    const size_t n_rem_bytes = n_bytes_to_copy % row_size;
+
     void * buf_pd = ggml_aligned_malloc(row_size_pd);
     GGML_ASSERT(buf_pd != NULL);

@@ -719,7 +756,8 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)

     memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros

-    for (int64_t i = 0; i < nrows; i++) {
+    // 1. Process all the full rows
+    for (int64_t i = 0; i < n_full_rows; i++) {
         const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
         uint8_t * dst = (uint8_t *) data + (i * row_size);

@@ -728,6 +766,20 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)
         memcpy(dst, buf_rp, row_size);
     }

+    // 2. Process the final, potentially partial, row
+    if (n_rem_bytes > 0) {
+        const int64_t i = n_full_rows;
+        const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
+        uint8_t * dst = (uint8_t *) data + (i * row_size);
+
+        // We still need to read and unpack the entire source row because quantization is block-based.
+        memcpy(buf_pd, src, row_size);
+        unpack_row_q4x4x2((block_q4_0 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]);
+
+        // But we only copy the remaining number of bytes to the destination.
+        memcpy(dst, buf_rp, n_rem_bytes);
+    }
+
     ggml_aligned_free(buf_pd, row_size_pd);
     ggml_aligned_free(buf_rp, row_size_rp);
 }
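Note the asymmetry between the two directions in the partial-row case: the repack (set) path re-zeroes the padded scratch row and copies in only n_rem_bytes before repacking, so the missing tail of the row stays zero, while the unpack (get) path must still read and unpack the entire packed source row (the layout is block-based, as the comment notes) and then writes out only the first n_rem_bytes of the result. The Q8_0 and MXFP4 variants below follow the same two patterns.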
@@ -950,6 +1002,15 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size)
     size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2)); // extra elements for the pad
     size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)

+    // Ensure we don't try to read more data than is available in the source buffer 'data'
+    // or write more than the tensor can hold.
+    const size_t total_tensor_size = (size_t)nrows * row_size;
+    const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
+
+    // Calculate how many full rows and how many remaining bytes we need to process.
+    const int64_t n_full_rows = n_bytes_to_copy / row_size;
+    const size_t n_rem_bytes = n_bytes_to_copy % row_size;
+
     void * buf_pd = ggml_aligned_malloc(row_size_pd);
     GGML_ASSERT(buf_pd != NULL);

@@ -961,7 +1022,8 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size)

     init_row_q8x4x2((block_q8_0 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros

-    for (int64_t i = 0; i < nrows; i++) {
+    // 1. Process all the full rows
+    for (int64_t i = 0; i < n_full_rows; i++) {
         const uint8_t * src = (const uint8_t *) data + (i * row_size);
         uint8_t * dst = (uint8_t *) t->data + (i * row_size);

@@ -970,6 +1032,25 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size)
         memcpy(dst, buf_rp, row_size);
     }

+    // 2. Process the final, potentially partial, row
+    if (n_rem_bytes > 0) {
+        const int64_t i = n_full_rows;
+        const uint8_t * src = (const uint8_t *) data + (i * row_size);
+        uint8_t * dst = (uint8_t *) t->data + (i * row_size);
+
+        // re-init the row because we are potentially copying a partial row
+        init_row_q8x4x2((block_q8_0 *) buf_pd, t->ne[0]);
+
+        // Copy only the remaining bytes from the source.
+        memcpy(buf_pd, src, n_rem_bytes);
+
+        // Repack the entire buffer
+        repack_row_q8x4x2((uint8_t *) buf_rp, (const block_q8_0 *) buf_pd, t->ne[0]);
+
+        // Write only the corresponding remaining bytes to the destination tensor.
+        memcpy(dst, buf_rp, n_rem_bytes);
+    }
+
     ggml_aligned_free(buf_pd, row_size_pd);
     ggml_aligned_free(buf_rp, row_size_rp);
 }
@@ -982,6 +1063,14 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size)
     size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2)); // extra elements for the pad
     size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)

+    // Ensure we don't try to copy more data than the tensor actually contains.
+    const size_t total_tensor_size = (size_t)nrows * row_size;
+    const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
+
+    // Calculate how many full rows and how many remaining bytes we need to process.
+    const int64_t n_full_rows = n_bytes_to_copy / row_size;
+    const size_t n_rem_bytes = n_bytes_to_copy % row_size;
+
     void * buf_pd = ggml_aligned_malloc(row_size_pd);
     GGML_ASSERT(buf_pd != NULL);

@@ -993,7 +1082,8 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size)

     memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros

-    for (int64_t i = 0; i < nrows; i++) {
+    // 1. Process all the full rows
+    for (int64_t i = 0; i < n_full_rows; i++) {
         const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
         uint8_t * dst = (uint8_t *) data + (i * row_size);

@@ -1002,6 +1092,20 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size)
         memcpy(dst, buf_rp, row_size);
     }

+    // 2. Process the final, potentially partial, row
+    if (n_rem_bytes > 0) {
+        const int64_t i = n_full_rows;
+        const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
+        uint8_t * dst = (uint8_t *) data + (i * row_size);
+
+        // We still need to read and unpack the entire source row because quantization is block-based.
+        memcpy(buf_pd, src, row_size);
+        unpack_row_q8x4x2((block_q8_0 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]);
+
+        // But we only copy the remaining number of bytes to the destination.
+        memcpy(dst, buf_rp, n_rem_bytes);
+    }
+
     ggml_aligned_free(buf_pd, row_size_pd);
     ggml_aligned_free(buf_rp, row_size_rp);
 }
@@ -1249,6 +1353,15 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si
     size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2)); // extra elements for the pad
     size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)

+    // Ensure we don't try to read more data than is available in the source buffer 'data'
+    // or write more than the tensor can hold.
+    const size_t total_tensor_size = (size_t)nrows * row_size;
+    const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
+
+    // Calculate how many full rows and how many remaining bytes we need to process.
+    const int64_t n_full_rows = n_bytes_to_copy / row_size;
+    const size_t n_rem_bytes = n_bytes_to_copy % row_size;
+
     void * buf_pd = ggml_aligned_malloc(row_size_pd);
     GGML_ASSERT(buf_pd != NULL);

@@ -1260,7 +1373,8 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si

     init_row_mxfp4x4x2((block_mxfp4 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros

-    for (int64_t i = 0; i < nrows; i++) {
+    // 1. Process all the full rows
+    for (int64_t i = 0; i < n_full_rows; i++) {
         const uint8_t * src = (const uint8_t *) data + (i * row_size);
         uint8_t * dst = (uint8_t *) t->data + (i * row_size);

@@ -1269,6 +1383,25 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si
         memcpy(dst, buf_rp, row_size);
     }

+    // 2. Process the final, potentially partial, row
+    if (n_rem_bytes > 0) {
+        const int64_t i = n_full_rows;
+        const uint8_t * src = (const uint8_t *) data + (i * row_size);
+        uint8_t * dst = (uint8_t *) t->data + (i * row_size);
+
+        // re-init the row because we are potentially copying a partial row
+        init_row_mxfp4x4x2((block_mxfp4 *) buf_pd, t->ne[0]);
+
+        // Copy only the remaining bytes from the source.
+        memcpy(buf_pd, src, n_rem_bytes);
+
+        // Repack the entire buffer (partial data + zero padding).
+        repack_row_mxfp4x4x2((uint8_t *) buf_rp, (const block_mxfp4 *) buf_pd, t->ne[0]);
+
+        // Write only the corresponding remaining bytes to the destination tensor.
+        memcpy(dst, buf_rp, n_rem_bytes);
+    }
+
     ggml_aligned_free(buf_pd, row_size_pd);
     ggml_aligned_free(buf_rp, row_size_rp);
 }
@@ -1281,6 +1414,14 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si
     size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2)); // extra elements for the pad
     size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)

+    // Ensure we don't try to copy more data than the tensor actually contains.
+    const size_t total_tensor_size = (size_t)nrows * row_size;
+    const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
+
+    // Calculate how many full rows and how many remaining bytes we need to process.
+    const int64_t n_full_rows = n_bytes_to_copy / row_size;
+    const size_t n_rem_bytes = n_bytes_to_copy % row_size;
+
     void * buf_pd = ggml_aligned_malloc(row_size_pd);
     GGML_ASSERT(buf_pd != NULL);

@@ -1292,7 +1433,8 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si

     memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros

-    for (int64_t i = 0; i < nrows; i++) {
+    // 1. Process all the full rows
+    for (int64_t i = 0; i < n_full_rows; i++) {
         const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
         uint8_t * dst = (uint8_t *) data + (i * row_size);

@@ -1301,6 +1443,20 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si
         memcpy(dst, buf_rp, row_size);
     }

+    // 2. Process the final, potentially partial, row
+    if (n_rem_bytes > 0) {
+        const int64_t i = n_full_rows;
+        const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
+        uint8_t * dst = (uint8_t *) data + (i * row_size);
+
+        // We still need to read and unpack the entire source row because the format is block-based.
+        memcpy(buf_pd, src, row_size);
+        unpack_row_mxfp4x4x2((block_mxfp4 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]);
+
+        // But we only copy the remaining number of bytes to the destination to respect the size limit.
+        memcpy(dst, buf_rp, n_rem_bytes);
+    }
+
     ggml_aligned_free(buf_pd, row_size_pd);
     ggml_aligned_free(buf_rp, row_size_rp);
 }
@@ -1319,19 +1475,19 @@ static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer,
     switch (tensor->type) {
         case GGML_TYPE_Q4_0:
             GGML_ASSERT(offset == 0);
-            GGML_ASSERT(size == ggml_nbytes(tensor));
+            GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
             repack_q4_0_q4x4x2(tensor, data, size);
             break;

         case GGML_TYPE_Q8_0:
             GGML_ASSERT(offset == 0);
-            GGML_ASSERT(size == ggml_nbytes(tensor));
+            GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
             repack_q8_0_q8x4x2(tensor, data, size);
             break;

         case GGML_TYPE_MXFP4:
             GGML_ASSERT(offset == 0);
-            GGML_ASSERT(size == ggml_nbytes(tensor));
+            GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
             repack_mxfp4_mxfp4x4x2(tensor, data, size);
             break;

@@ -1361,7 +1517,7 @@ static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer,

         case GGML_TYPE_Q8_0:
             GGML_ASSERT(offset == 0);
-            GGML_ASSERT(size == ggml_nbytes(tensor));
+            GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
             repack_q8x4x2_q8_0(data, tensor, size);
             break;

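The relaxed asserts in set_tensor/get_tensor are what expose the partial-row handling above through the public backend API: a caller may now transfer fewer bytes than ggml_nbytes(tensor), while offset must still be 0 for these quantized types. A hedged caller-side sketch, with hypothetical names ('w', 'host_data', 'upload_partial' are illustrative); ggml_backend_tensor_set() and ggml_backend_tensor_get() are the standard ggml-backend entry points that end up in these buffer callbacks:

#include "ggml.h"
#include "ggml-backend.h"

// Hypothetical helper: transfer only the first 'nbytes_ready' bytes of a Q8_0
// tensor 'w' that lives in a Hexagon backend buffer. Before this commit the
// buffer asserted size == ggml_nbytes(tensor); now offset + size <= ggml_nbytes(tensor)
// is sufficient, and the repack functions clamp and split the extent themselves.
static void upload_partial(struct ggml_tensor * w, const void * host_data, size_t nbytes_ready) {
    GGML_ASSERT(nbytes_ready <= ggml_nbytes(w));
    ggml_backend_tensor_set(w, host_data, 0, nbytes_ready); // repacks full rows plus the partial tail
}

The read direction works the same way: ggml_backend_tensor_get() with a size below ggml_nbytes(tensor) reaches the Q8_0 branch relaxed in this diff, which unpacks whole rows and copies back only the requested byte count.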