@@ -676,6 +676,15 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size)
     size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2)); // extra elements for the pad
     size_t row_size_rp = row_size * 2;                                                // extra space for tmp pad (if any)
 
+    // Ensure we don't try to read more data than is available in the source buffer 'data'
+    // or write more than the tensor can hold.
+    const size_t total_tensor_size = (size_t) nrows * row_size;
+    const size_t n_bytes_to_copy   = size < total_tensor_size ? size : total_tensor_size;
+
+    // Calculate how many full rows and how many remaining bytes we need to process.
+    const int64_t n_full_rows = n_bytes_to_copy / row_size;
+    const size_t  n_rem_bytes = n_bytes_to_copy % row_size;
+
     void * buf_pd = ggml_aligned_malloc(row_size_pd);
     GGML_ASSERT(buf_pd != NULL);
 
@@ -687,7 +696,8 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size)
 
     init_row_q4x4x2((block_q4_0 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros
 
-    for (int64_t i = 0; i < nrows; i++) {
+    // 1. Process all the full rows
+    for (int64_t i = 0; i < n_full_rows; i++) {
         const uint8_t * src = (const uint8_t *) data + (i * row_size);
         uint8_t *       dst = (uint8_t *) t->data + (i * row_size);
 
@@ -696,6 +706,25 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size)
         memcpy(dst, buf_rp, row_size);
     }
 
+    // 2. Process the final, potentially partial, row
+    if (n_rem_bytes > 0) {
+        const int64_t i = n_full_rows;
+        const uint8_t * src = (const uint8_t *) data + (i * row_size);
+        uint8_t *       dst = (uint8_t *) t->data + (i * row_size);
+
+        // re-init the row because we are potentially copying a partial row
+        init_row_q4x4x2((block_q4_0 *) buf_pd, t->ne[0]);
+
+        // Copy only the remaining bytes from the source.
+        memcpy(buf_pd, src, n_rem_bytes);
+
+        // Repack the entire buffer
+        repack_row_q4x4x2((uint8_t *) buf_rp, (const block_q4_0 *) buf_pd, t->ne[0]);
+
+        // Write only the corresponding remaining bytes to the destination tensor.
+        memcpy(dst, buf_rp, n_rem_bytes);
+    }
+
     ggml_aligned_free(buf_pd, row_size_pd);
     ggml_aligned_free(buf_rp, row_size_rp);
 }
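
The clamp-and-split arithmetic added above is identical in every repack variant: cap the requested byte count at the tensor size, then break it into whole rows plus a partial tail row. Below is a minimal standalone sketch of just that arithmetic, assuming the standard Q4_0 layout of 18 bytes per 32-element block and example sizes; it is an illustration, not the backend code itself.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    // Assumed for illustration: Q4_0 stores 32 elements per block as a 2-byte
    // fp16 scale plus 16 bytes of packed 4-bit quants -> 18 bytes per block.
    const size_t  block_bytes = 18;
    const int64_t ne0         = 4096;                     // elements per row (example)
    const int64_t nrows       = 8;                        // rows in the tensor (example)
    const size_t  row_size    = (ne0 / 32) * block_bytes; // 2304 bytes per row
    const size_t  size        = 5000;                     // bytes the caller wants to set (example)

    // Clamp to the tensor size, then split into full rows and a partial tail.
    const size_t  total_tensor_size = (size_t) nrows * row_size;
    const size_t  n_bytes_to_copy   = size < total_tensor_size ? size : total_tensor_size;
    const int64_t n_full_rows       = n_bytes_to_copy / row_size;
    const size_t  n_rem_bytes       = n_bytes_to_copy % row_size;

    // Prints: full rows: 2, remaining bytes: 392
    printf("full rows: %lld, remaining bytes: %zu\n", (long long) n_full_rows, n_rem_bytes);
    return 0;
}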
@@ -708,6 +737,14 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)
     size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2)); // extra elements for the pad
     size_t row_size_rp = row_size * 2;                                                // extra space for tmp pad (if any)
 
+    // Ensure we don't try to copy more data than the tensor actually contains.
+    const size_t total_tensor_size = (size_t) nrows * row_size;
+    const size_t n_bytes_to_copy   = size < total_tensor_size ? size : total_tensor_size;
+
+    // Calculate how many full rows and how many remaining bytes we need to process.
+    const int64_t n_full_rows = n_bytes_to_copy / row_size;
+    const size_t  n_rem_bytes = n_bytes_to_copy % row_size;
+
     void * buf_pd = ggml_aligned_malloc(row_size_pd);
     GGML_ASSERT(buf_pd != NULL);
 
@@ -719,7 +756,8 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)
 
     memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros
 
-    for (int64_t i = 0; i < nrows; i++) {
+    // 1. Process all the full rows
+    for (int64_t i = 0; i < n_full_rows; i++) {
         const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
         uint8_t *       dst = (uint8_t *) data + (i * row_size);
 
@@ -728,6 +766,20 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)
         memcpy(dst, buf_rp, row_size);
     }
 
+    // 2. Process the final, potentially partial, row
+    if (n_rem_bytes > 0) {
+        const int64_t i = n_full_rows;
+        const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
+        uint8_t *       dst = (uint8_t *) data + (i * row_size);
+
+        // We still need to read and unpack the entire source row because quantization is block-based.
+        memcpy(buf_pd, src, row_size);
+        unpack_row_q4x4x2((block_q4_0 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]);
+
+        // But we only copy the remaining number of bytes to the destination.
+        memcpy(dst, buf_rp, n_rem_bytes);
+    }
+
     ggml_aligned_free(buf_pd, row_size_pd);
     ggml_aligned_free(buf_rp, row_size_rp);
 }
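
The get path is the mirror image, with one asymmetry noted in the comment above: because the packed format is block-based, a byte prefix of the packed row does not line up with a byte prefix of the plain Q4_0 row, so the whole source row is unpacked into scratch and only the requested prefix is copied out. Here is a toy sketch of that pattern; the "unpack" function is a made-up stand-in (a simple reversal) for unpack_row_q4x4x2, not the real packing.

#include <stdint.h>
#include <string.h>

// Toy stand-in for a row unpacker: any transform where a prefix of the packed
// bytes does not map to a prefix of the unpacked bytes (here, a reversal).
static void toy_unpack_row(uint8_t * dst, const uint8_t * src, size_t row_size) {
    for (size_t i = 0; i < row_size; i++) {
        dst[i] = src[row_size - 1 - i];
    }
}

// Return the first n_rem_bytes of a row stored in a packed buffer:
// unpack the full row into scratch, then copy out only the prefix.
static void get_partial_row(uint8_t * out, const uint8_t * packed_row,
                            uint8_t * scratch, size_t row_size, size_t n_rem_bytes) {
    toy_unpack_row(scratch, packed_row, row_size);
    memcpy(out, scratch, n_rem_bytes);
}

int main(void) {
    uint8_t packed[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
    uint8_t scratch[8];
    uint8_t out[3];
    get_partial_row(out, packed, scratch, sizeof(packed), sizeof(out));
    return out[0] == 8 ? 0 : 1; // a prefix of the *unpacked* row, not of the packed bytes
}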
@@ -950,6 +1002,15 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size)
     size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2)); // extra elements for the pad
     size_t row_size_rp = row_size * 2;                                                // extra space for tmp pad (if any)
 
+    // Ensure we don't try to read more data than is available in the source buffer 'data'
+    // or write more than the tensor can hold.
+    const size_t total_tensor_size = (size_t) nrows * row_size;
+    const size_t n_bytes_to_copy   = size < total_tensor_size ? size : total_tensor_size;
+
+    // Calculate how many full rows and how many remaining bytes we need to process.
+    const int64_t n_full_rows = n_bytes_to_copy / row_size;
+    const size_t  n_rem_bytes = n_bytes_to_copy % row_size;
+
     void * buf_pd = ggml_aligned_malloc(row_size_pd);
     GGML_ASSERT(buf_pd != NULL);
 
@@ -961,7 +1022,8 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size)
 
     init_row_q8x4x2((block_q8_0 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros
 
-    for (int64_t i = 0; i < nrows; i++) {
+    // 1. Process all the full rows
+    for (int64_t i = 0; i < n_full_rows; i++) {
         const uint8_t * src = (const uint8_t *) data + (i * row_size);
         uint8_t *       dst = (uint8_t *) t->data + (i * row_size);
 
@@ -970,6 +1032,25 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size)
         memcpy(dst, buf_rp, row_size);
     }
 
+    // 2. Process the final, potentially partial, row
+    if (n_rem_bytes > 0) {
+        const int64_t i = n_full_rows;
+        const uint8_t * src = (const uint8_t *) data + (i * row_size);
+        uint8_t *       dst = (uint8_t *) t->data + (i * row_size);
+
+        // re-init the row because we are potentially copying a partial row
+        init_row_q8x4x2((block_q8_0 *) buf_pd, t->ne[0]);
+
+        // Copy only the remaining bytes from the source.
+        memcpy(buf_pd, src, n_rem_bytes);
+
+        // Repack the entire buffer
+        repack_row_q8x4x2((uint8_t *) buf_rp, (const block_q8_0 *) buf_pd, t->ne[0]);
+
+        // Write only the corresponding remaining bytes to the destination tensor.
+        memcpy(dst, buf_rp, n_rem_bytes);
+    }
+
     ggml_aligned_free(buf_pd, row_size_pd);
     ggml_aligned_free(buf_rp, row_size_rp);
 }
@@ -982,6 +1063,14 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size)
     size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2)); // extra elements for the pad
     size_t row_size_rp = row_size * 2;                                                // extra space for tmp pad (if any)
 
+    // Ensure we don't try to copy more data than the tensor actually contains.
+    const size_t total_tensor_size = (size_t) nrows * row_size;
+    const size_t n_bytes_to_copy   = size < total_tensor_size ? size : total_tensor_size;
+
+    // Calculate how many full rows and how many remaining bytes we need to process.
+    const int64_t n_full_rows = n_bytes_to_copy / row_size;
+    const size_t  n_rem_bytes = n_bytes_to_copy % row_size;
+
     void * buf_pd = ggml_aligned_malloc(row_size_pd);
     GGML_ASSERT(buf_pd != NULL);
 
@@ -993,7 +1082,8 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size)
 
     memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros
 
-    for (int64_t i = 0; i < nrows; i++) {
+    // 1. Process all the full rows
+    for (int64_t i = 0; i < n_full_rows; i++) {
         const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
         uint8_t *       dst = (uint8_t *) data + (i * row_size);
 
@@ -1002,6 +1092,20 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size)
         memcpy(dst, buf_rp, row_size);
     }
 
+    // 2. Process the final, potentially partial, row
+    if (n_rem_bytes > 0) {
+        const int64_t i = n_full_rows;
+        const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
+        uint8_t *       dst = (uint8_t *) data + (i * row_size);
+
+        // We still need to read and unpack the entire source row because quantization is block-based.
+        memcpy(buf_pd, src, row_size);
+        unpack_row_q8x4x2((block_q8_0 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]);
+
+        // But we only copy the remaining number of bytes to the destination.
+        memcpy(dst, buf_rp, n_rem_bytes);
+    }
+
     ggml_aligned_free(buf_pd, row_size_pd);
     ggml_aligned_free(buf_rp, row_size_rp);
 }
@@ -1249,6 +1353,15 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t size)
     size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2)); // extra elements for the pad
     size_t row_size_rp = row_size * 2;                                                 // extra space for tmp pad (if any)
 
+    // Ensure we don't try to read more data than is available in the source buffer 'data'
+    // or write more than the tensor can hold.
+    const size_t total_tensor_size = (size_t) nrows * row_size;
+    const size_t n_bytes_to_copy   = size < total_tensor_size ? size : total_tensor_size;
+
+    // Calculate how many full rows and how many remaining bytes we need to process.
+    const int64_t n_full_rows = n_bytes_to_copy / row_size;
+    const size_t  n_rem_bytes = n_bytes_to_copy % row_size;
+
     void * buf_pd = ggml_aligned_malloc(row_size_pd);
     GGML_ASSERT(buf_pd != NULL);
 
@@ -1260,7 +1373,8 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t size)
 
     init_row_mxfp4x4x2((block_mxfp4 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros
 
-    for (int64_t i = 0; i < nrows; i++) {
+    // 1. Process all the full rows
+    for (int64_t i = 0; i < n_full_rows; i++) {
         const uint8_t * src = (const uint8_t *) data + (i * row_size);
         uint8_t *       dst = (uint8_t *) t->data + (i * row_size);
 
@@ -1269,6 +1383,25 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t size)
         memcpy(dst, buf_rp, row_size);
     }
 
+    // 2. Process the final, potentially partial, row
+    if (n_rem_bytes > 0) {
+        const int64_t i = n_full_rows;
+        const uint8_t * src = (const uint8_t *) data + (i * row_size);
+        uint8_t *       dst = (uint8_t *) t->data + (i * row_size);
+
+        // re-init the row because we are potentially copying a partial row
+        init_row_mxfp4x4x2((block_mxfp4 *) buf_pd, t->ne[0]);
+
+        // Copy only the remaining bytes from the source.
+        memcpy(buf_pd, src, n_rem_bytes);
+
+        // Repack the entire buffer (partial data + zero padding).
+        repack_row_mxfp4x4x2((uint8_t *) buf_rp, (const block_mxfp4 *) buf_pd, t->ne[0]);
+
+        // Write only the corresponding remaining bytes to the destination tensor.
+        memcpy(dst, buf_rp, n_rem_bytes);
+    }
+
     ggml_aligned_free(buf_pd, row_size_pd);
     ggml_aligned_free(buf_rp, row_size_rp);
 }
@@ -1281,6 +1414,14 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t size)
     size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2)); // extra elements for the pad
     size_t row_size_rp = row_size * 2;                                                 // extra space for tmp pad (if any)
 
+    // Ensure we don't try to copy more data than the tensor actually contains.
+    const size_t total_tensor_size = (size_t) nrows * row_size;
+    const size_t n_bytes_to_copy   = size < total_tensor_size ? size : total_tensor_size;
+
+    // Calculate how many full rows and how many remaining bytes we need to process.
+    const int64_t n_full_rows = n_bytes_to_copy / row_size;
+    const size_t  n_rem_bytes = n_bytes_to_copy % row_size;
+
     void * buf_pd = ggml_aligned_malloc(row_size_pd);
     GGML_ASSERT(buf_pd != NULL);
 
@@ -1292,7 +1433,8 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t size)
 
     memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros
 
-    for (int64_t i = 0; i < nrows; i++) {
+    // 1. Process all the full rows
+    for (int64_t i = 0; i < n_full_rows; i++) {
         const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
         uint8_t *       dst = (uint8_t *) data + (i * row_size);
 
@@ -1301,6 +1443,20 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t size)
         memcpy(dst, buf_rp, row_size);
     }
 
+    // 2. Process the final, potentially partial, row
+    if (n_rem_bytes > 0) {
+        const int64_t i = n_full_rows;
+        const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
+        uint8_t *       dst = (uint8_t *) data + (i * row_size);
+
+        // We still need to read and unpack the entire source row because the format is block-based.
+        memcpy(buf_pd, src, row_size);
+        unpack_row_mxfp4x4x2((block_mxfp4 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]);
+
+        // But we only copy the remaining number of bytes to the destination to respect the size limit.
+        memcpy(dst, buf_rp, n_rem_bytes);
+    }
+
     ggml_aligned_free(buf_pd, row_size_pd);
     ggml_aligned_free(buf_rp, row_size_rp);
 }
@@ -1319,19 +1475,19 @@ static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer,
     switch (tensor->type) {
         case GGML_TYPE_Q4_0:
             GGML_ASSERT(offset == 0);
-            GGML_ASSERT(size == ggml_nbytes(tensor));
+            GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
            repack_q4_0_q4x4x2(tensor, data, size);
            break;
 
         case GGML_TYPE_Q8_0:
             GGML_ASSERT(offset == 0);
-            GGML_ASSERT(size == ggml_nbytes(tensor));
+            GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
            repack_q8_0_q8x4x2(tensor, data, size);
            break;
 
         case GGML_TYPE_MXFP4:
             GGML_ASSERT(offset == 0);
-            GGML_ASSERT(size == ggml_nbytes(tensor));
+            GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
            repack_mxfp4_mxfp4x4x2(tensor, data, size);
            break;
 
@@ -1361,7 +1517,7 @@ static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer,
 
         case GGML_TYPE_Q8_0:
             GGML_ASSERT(offset == 0);
-            GGML_ASSERT(size == ggml_nbytes(tensor));
+            GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
            repack_q8x4x2_q8_0(data, tensor, size);
            break;
 
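
With the asserts relaxed from size == ggml_nbytes(tensor) to offset + size <= ggml_nbytes(tensor), a caller may hand the backend a buffer that covers only a prefix of the repacked tensor; offset must still be 0 for these quantized types. A hedged caller-side sketch using the public ggml-backend API follows (the helper name and the choice of prefix length are illustrative, not part of this change):

#include "ggml.h"
#include "ggml-backend.h"

// Hypothetical helper: write only the first `part` bytes of host data into a
// quantized tensor living in a Hexagon buffer, then read the same prefix back.
static void set_and_get_prefix(ggml_tensor * tensor, const void * host_data,
                               void * host_out, size_t part) {
    GGML_ASSERT(part <= ggml_nbytes(tensor));             // mirrors the relaxed backend assert
    ggml_backend_tensor_set(tensor, host_data, 0, part);  // partial set -> full rows + tail row
    ggml_backend_tensor_get(tensor, host_out,  0, part);  // partial get mirrors the set path
}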