@@ -676,6 +676,15 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size)
676676    size_t  row_size_pd = ggml_row_size (t->type , hex_round_up (t->ne [0 ], QK_Q4_0x4x2));  //  extra elements for the pad
677677    size_t  row_size_rp = row_size * 2 ;  //  extra space for tmp pad (if any)
678678
679+     //  Ensure we don't try to read more data than is available in the source buffer 'data'
680+     //  or write more than the tensor can hold.
681+     const  size_t  total_tensor_size = (size_t )nrows * row_size;
682+     const  size_t  n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
683+ 
684+     //  Calculate how many full rows and how many remaining bytes we need to process.
685+     const  int64_t  n_full_rows = n_bytes_to_copy / row_size;
686+     const  size_t   n_rem_bytes = n_bytes_to_copy % row_size;
687+ 
679688    void  * buf_pd = ggml_aligned_malloc (row_size_pd);
680689    GGML_ASSERT (buf_pd != NULL );
681690
@@ -687,7 +696,8 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size)
687696
688697    init_row_q4x4x2 ((block_q4_0 *) buf_pd, t->ne [0 ]);  //  init padded buffer to make sure the tail is all zeros
689698
690-     for  (int64_t  i = 0 ; i < nrows; i++) {
699+     //  1. Process all the full rows
700+     for  (int64_t  i = 0 ; i < n_full_rows; i++) {
691701        const  uint8_t  * src = (const  uint8_t  *) data + (i * row_size);
692702        uint8_t  *       dst = (uint8_t  *) t->data  + (i * row_size);
693703
@@ -696,6 +706,25 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size)
696706        memcpy (dst, buf_rp, row_size);
697707    }
698708
709+     //  2. Process the final, potentially partial, row
710+     if  (n_rem_bytes > 0 ) {
711+         const  int64_t  i = n_full_rows;
712+         const  uint8_t  * src = (const  uint8_t  *) data + (i * row_size);
713+         uint8_t  *       dst = (uint8_t  *) t->data  + (i * row_size);
714+ 
715+         //  re-init the row because we are potentially copying a partial row
716+         init_row_q4x4x2 ((block_q4_0 *) buf_pd, t->ne [0 ]);
717+ 
718+         //  Copy only the remaining bytes from the source.
719+         memcpy (buf_pd, src, n_rem_bytes);
720+ 
721+         //  Repack the entire buffer
722+         repack_row_q4x4x2 ((uint8_t  *) buf_rp, (const  block_q4_0 *) buf_pd, t->ne [0 ]);
723+ 
724+         //  Write only the corresponding remaining bytes to the destination tensor.
725+         memcpy (dst, buf_rp, n_rem_bytes);
726+     }
727+ 
699728    ggml_aligned_free (buf_pd, row_size_pd);
700729    ggml_aligned_free (buf_rp, row_size_rp);
701730}
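The size-clamping arithmetic introduced above is shared by all of the repack paths in this commit: the caller-supplied byte count is first clamped to the tensor's total size, then split into whole rows plus a partial tail. A minimal standalone sketch of just that calculation, using illustrative values that are not taken from the patch:

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    int main() {
        // Illustrative values only: 4 rows of 18 bytes each, with a caller
        // supplying 50 bytes of source data.
        const size_t  row_size = 18;
        const int64_t nrows    = 4;
        const size_t  size     = 50;

        // Clamp the requested size to what the tensor can actually hold.
        const size_t total_tensor_size = (size_t) nrows * row_size;
        const size_t n_bytes_to_copy   = size < total_tensor_size ? size : total_tensor_size;

        // Split into whole rows plus a partial tail.
        const int64_t n_full_rows = n_bytes_to_copy / row_size;
        const size_t  n_rem_bytes = n_bytes_to_copy % row_size;

        // Prints: full rows: 2, tail bytes: 14
        printf("full rows: %lld, tail bytes: %zu\n", (long long) n_full_rows, n_rem_bytes);
        return 0;
    }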
@@ -708,6 +737,14 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)
708737    size_t  row_size_pd = ggml_row_size (t->type , hex_round_up (t->ne [0 ], QK_Q4_0x4x2));  //  extra elements for the pad
709738    size_t  row_size_rp = row_size * 2 ;  //  extra space for tmp pad (if any)
710739
740+     //  Ensure we don't try to copy more data than the tensor actually contains.
741+     const  size_t  total_tensor_size = (size_t )nrows * row_size;
742+     const  size_t  n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
743+ 
744+     //  Calculate how many full rows and how many remaining bytes we need to process.
745+     const  int64_t  n_full_rows = n_bytes_to_copy / row_size;
746+     const  size_t   n_rem_bytes = n_bytes_to_copy % row_size;
747+ 
711748    void  * buf_pd = ggml_aligned_malloc (row_size_pd);
712749    GGML_ASSERT (buf_pd != NULL );
713750
@@ -719,7 +756,8 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)
719756
720757    memset (buf_pd, 0 , row_size_pd);  //  clear-out padded buffer to make sure the tail is all zeros
721758
722-     for  (int64_t  i = 0 ; i < nrows; i++) {
759+     //  1. Process all the full rows
760+     for  (int64_t  i = 0 ; i < n_full_rows; i++) {
723761        const  uint8_t  * src = (const  uint8_t  *) t->data  + (i * row_size);
724762        uint8_t  *       dst = (uint8_t  *) data + (i * row_size);
725763
@@ -728,6 +766,20 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)
728766        memcpy (dst, buf_rp, row_size);
729767    }
730768
769+     //  2. Process the final, potentially partial, row
770+     if  (n_rem_bytes > 0 ) {
771+         const  int64_t  i = n_full_rows;
772+         const  uint8_t  * src = (const  uint8_t  *) t->data  + (i * row_size);
773+         uint8_t  *       dst = (uint8_t  *) data + (i * row_size);
774+ 
775+         //  We still need to read and unpack the entire source row because quantization is block-based.
776+         memcpy (buf_pd, src, row_size);
777+         unpack_row_q4x4x2 ((block_q4_0 *) buf_rp, (const  uint8_t  *) buf_pd, t->ne [0 ]);
778+ 
779+         //  But we only copy the remaining number of bytes to the destination.
780+         memcpy (dst, buf_rp, n_rem_bytes);
781+     }
782+ 
731783    ggml_aligned_free (buf_pd, row_size_pd);
732784    ggml_aligned_free (buf_rp, row_size_rp);
733785}
@@ -950,6 +1002,15 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size)
9501002    size_t  row_size_pd = ggml_row_size (t->type , hex_round_up (t->ne [0 ], QK_Q8_0x4x2));  //  extra elements for the pad
9511003    size_t  row_size_rp = row_size * 2 ;  //  extra space for tmp pad (if any)
9521004
1005+     //  Ensure we don't try to read more data than is available in the source buffer 'data'
1006+     //  or write more than the tensor can hold.
1007+     const  size_t  total_tensor_size = (size_t )nrows * row_size;
1008+     const  size_t  n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
1009+ 
1010+     //  Calculate how many full rows and how many remaining bytes we need to process.
1011+     const  int64_t  n_full_rows = n_bytes_to_copy / row_size;
1012+     const  size_t   n_rem_bytes = n_bytes_to_copy % row_size;
1013+ 
9531014    void  * buf_pd = ggml_aligned_malloc (row_size_pd);
9541015    GGML_ASSERT (buf_pd != NULL );
9551016
@@ -961,7 +1022,8 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size)
9611022
9621023    init_row_q8x4x2 ((block_q8_0 *) buf_pd, t->ne [0 ]);  //  init padded buffer to make sure the tail is all zeros
9631024
964-     for  (int64_t  i = 0 ; i < nrows; i++) {
1025+     //  1. Process all the full rows
1026+     for  (int64_t  i = 0 ; i < n_full_rows; i++) {
9651027        const  uint8_t  * src = (const  uint8_t  *) data + (i * row_size);
9661028        uint8_t  *       dst = (uint8_t  *) t->data  + (i * row_size);
9671029
@@ -970,6 +1032,25 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size)
9701032        memcpy (dst, buf_rp, row_size);
9711033    }
9721034
1035+     //  2. Process the final, potentially partial, row
1036+     if  (n_rem_bytes > 0 ) {
1037+         const  int64_t  i = n_full_rows;
1038+         const  uint8_t  * src = (const  uint8_t  *) data + (i * row_size);
1039+         uint8_t  *       dst = (uint8_t  *) t->data  + (i * row_size);
1040+ 
1041+         //  re-init the row because we are potentially copying a partial row
1042+         init_row_q8x4x2 ((block_q8_0 *) buf_pd, t->ne [0 ]);
1043+ 
1044+         //  Copy only the remaining bytes from the source.
1045+         memcpy (buf_pd, src, n_rem_bytes);
1046+ 
1047+         //  Repack the entire buffer
1048+         repack_row_q8x4x2 ((uint8_t  *) buf_rp, (const  block_q8_0 *) buf_pd, t->ne [0 ]);
1049+ 
1050+         //  Write only the corresponding remaining bytes to the destination tensor.
1051+         memcpy (dst, buf_rp, n_rem_bytes);
1052+     }
1053+ 
9731054    ggml_aligned_free (buf_pd, row_size_pd);
9741055    ggml_aligned_free (buf_rp, row_size_rp);
9751056}
@@ -982,6 +1063,14 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size)
9821063    size_t  row_size_pd = ggml_row_size (t->type , hex_round_up (t->ne [0 ], QK_Q8_0x4x2));  //  extra elements for the pad
9831064    size_t  row_size_rp = row_size * 2 ;  //  extra space for tmp pad (if any)
9841065
1066+     //  Ensure we don't try to copy more data than the tensor actually contains.
1067+     const  size_t  total_tensor_size = (size_t )nrows * row_size;
1068+     const  size_t  n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
1069+ 
1070+     //  Calculate how many full rows and how many remaining bytes we need to process.
1071+     const  int64_t  n_full_rows = n_bytes_to_copy / row_size;
1072+     const  size_t   n_rem_bytes = n_bytes_to_copy % row_size;
1073+ 
9851074    void  * buf_pd = ggml_aligned_malloc (row_size_pd);
9861075    GGML_ASSERT (buf_pd != NULL );
9871076
@@ -993,7 +1082,8 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size)
9931082
9941083    memset (buf_pd, 0 , row_size_pd);  //  clear-out padded buffer to make sure the tail is all zeros
9951084
996-     for  (int64_t  i = 0 ; i < nrows; i++) {
1085+     //  1. Process all the full rows
1086+     for  (int64_t  i = 0 ; i < n_full_rows; i++) {
9971087        const  uint8_t  * src = (const  uint8_t  *) t->data  + (i * row_size);
9981088        uint8_t  *       dst = (uint8_t  *) data + (i * row_size);
9991089
@@ -1002,6 +1092,20 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size)
10021092        memcpy (dst, buf_rp, row_size);
10031093    }
10041094
1095+     //  2. Process the final, potentially partial, row
1096+     if  (n_rem_bytes > 0 ) {
1097+         const  int64_t  i = n_full_rows;
1098+         const  uint8_t  * src = (const  uint8_t  *) t->data  + (i * row_size);
1099+         uint8_t  *       dst = (uint8_t  *) data + (i * row_size);
1100+ 
1101+         //  We still need to read and unpack the entire source row because quantization is block-based.
1102+         memcpy (buf_pd, src, row_size);
1103+         unpack_row_q8x4x2 ((block_q8_0 *) buf_rp, (const  uint8_t  *) buf_pd, t->ne [0 ]);
1104+ 
1105+         //  But we only copy the remaining number of bytes to the destination.
1106+         memcpy (dst, buf_rp, n_rem_bytes);
1107+     }
1108+ 
10051109    ggml_aligned_free (buf_pd, row_size_pd);
10061110    ggml_aligned_free (buf_rp, row_size_rp);
10071111}
@@ -1249,6 +1353,15 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si
12491353    size_t  row_size_pd = ggml_row_size (t->type , hex_round_up (t->ne [0 ], QK_MXFP4x4x2));  //  extra elements for the pad
12501354    size_t  row_size_rp = row_size * 2 ;  //  extra space for tmp pad (if any)
12511355
1356+     //  Ensure we don't try to read more data than is available in the source buffer 'data'
1357+     //  or write more than the tensor can hold.
1358+     const  size_t  total_tensor_size = (size_t )nrows * row_size;
1359+     const  size_t  n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
1360+ 
1361+     //  Calculate how many full rows and how many remaining bytes we need to process.
1362+     const  int64_t  n_full_rows = n_bytes_to_copy / row_size;
1363+     const  size_t   n_rem_bytes = n_bytes_to_copy % row_size;
1364+ 
12521365    void  * buf_pd = ggml_aligned_malloc (row_size_pd);
12531366    GGML_ASSERT (buf_pd != NULL );
12541367
@@ -1260,7 +1373,8 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si
12601373
12611374    init_row_mxfp4x4x2 ((block_mxfp4 *) buf_pd, t->ne [0 ]);  //  init padded buffer to make sure the tail is all zeros
12621375
1263-     for  (int64_t  i = 0 ; i < nrows; i++) {
1376+     //  1. Process all the full rows
1377+     for  (int64_t  i = 0 ; i < n_full_rows; i++) {
12641378        const  uint8_t  * src = (const  uint8_t  *) data + (i * row_size);
12651379        uint8_t  *       dst = (uint8_t  *) t->data  + (i * row_size);
12661380
@@ -1269,6 +1383,25 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si
12691383        memcpy (dst, buf_rp, row_size);
12701384    }
12711385
1386+     //  2. Process the final, potentially partial, row
1387+     if  (n_rem_bytes > 0 ) {
1388+         const  int64_t  i = n_full_rows;
1389+         const  uint8_t  * src = (const  uint8_t  *) data + (i * row_size);
1390+         uint8_t  *       dst = (uint8_t  *) t->data  + (i * row_size);
1391+ 
1392+         //  re-init the row because we are potentially copying a partial row
1393+         init_row_mxfp4x4x2 ((block_mxfp4 *) buf_pd, t->ne [0 ]);
1394+ 
1395+         //  Copy only the remaining bytes from the source.
1396+         memcpy (buf_pd, src, n_rem_bytes);
1397+ 
1398+         //  Repack the entire buffer (partial data + zero padding).
1399+         repack_row_mxfp4x4x2 ((uint8_t  *) buf_rp, (const  block_mxfp4 *) buf_pd, t->ne [0 ]);
1400+ 
1401+         //  Write only the corresponding remaining bytes to the destination tensor.
1402+         memcpy (dst, buf_rp, n_rem_bytes);
1403+     }
1404+ 
12721405    ggml_aligned_free (buf_pd, row_size_pd);
12731406    ggml_aligned_free (buf_rp, row_size_rp);
12741407}
@@ -1281,6 +1414,14 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si
12811414    size_t  row_size_pd = ggml_row_size (t->type , hex_round_up (t->ne [0 ], QK_MXFP4x4x2));  //  extra elements for the pad
12821415    size_t  row_size_rp = row_size * 2 ;  //  extra space for tmp pad (if any)
12831416
1417+     //  Ensure we don't try to copy more data than the tensor actually contains.
1418+     const  size_t  total_tensor_size = (size_t )nrows * row_size;
1419+     const  size_t  n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
1420+ 
1421+     //  Calculate how many full rows and how many remaining bytes we need to process.
1422+     const  int64_t  n_full_rows = n_bytes_to_copy / row_size;
1423+     const  size_t   n_rem_bytes = n_bytes_to_copy % row_size;
1424+ 
12841425    void  * buf_pd = ggml_aligned_malloc (row_size_pd);
12851426    GGML_ASSERT (buf_pd != NULL );
12861427
@@ -1292,7 +1433,8 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si
12921433
12931434    memset (buf_pd, 0 , row_size_pd);  //  clear-out padded buffer to make sure the tail is all zeros
12941435
1295-     for  (int64_t  i = 0 ; i < nrows; i++) {
1436+     //  1. Process all the full rows
1437+     for  (int64_t  i = 0 ; i < n_full_rows; i++) {
12961438        const  uint8_t  * src = (const  uint8_t  *) t->data  + (i * row_size);
12971439        uint8_t  *       dst = (uint8_t  *) data + (i * row_size);
12981440
@@ -1301,6 +1443,20 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si
13011443        memcpy (dst, buf_rp, row_size);
13021444    }
13031445
1446+     //  2. Process the final, potentially partial, row
1447+     if  (n_rem_bytes > 0 ) {
1448+         const  int64_t  i = n_full_rows;
1449+         const  uint8_t  * src = (const  uint8_t  *) t->data  + (i * row_size);
1450+         uint8_t  *       dst = (uint8_t  *) data + (i * row_size);
1451+ 
1452+         //  We still need to read and unpack the entire source row because the format is block-based.
1453+         memcpy (buf_pd, src, row_size);
1454+         unpack_row_mxfp4x4x2 ((block_mxfp4 *) buf_rp, (const  uint8_t  *) buf_pd, t->ne [0 ]);
1455+ 
1456+         //  But we only copy the remaining number of bytes to the destination to respect the size limit.
1457+         memcpy (dst, buf_rp, n_rem_bytes);
1458+     }
1459+ 
13041460    ggml_aligned_free (buf_pd, row_size_pd);
13051461    ggml_aligned_free (buf_rp, row_size_rp);
13061462}
@@ -1319,19 +1475,19 @@ static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer,
13191475    switch  (tensor->type ) {
13201476        case  GGML_TYPE_Q4_0:
13211477            GGML_ASSERT (offset == 0 );
1322-             GGML_ASSERT (size == ggml_nbytes (tensor));
1478+             GGML_ASSERT (offset + size <= ggml_nbytes (tensor));
13231479            repack_q4_0_q4x4x2 (tensor, data, size);
13241480            break ;
13251481
13261482        case  GGML_TYPE_Q8_0:
13271483            GGML_ASSERT (offset == 0 );
1328-             GGML_ASSERT (size == ggml_nbytes (tensor));
1484+             GGML_ASSERT (offset + size <= ggml_nbytes (tensor));
13291485            repack_q8_0_q8x4x2 (tensor, data, size);
13301486            break ;
13311487
13321488        case  GGML_TYPE_MXFP4:
13331489            GGML_ASSERT (offset == 0 );
1334-             GGML_ASSERT (size == ggml_nbytes (tensor));
1490+             GGML_ASSERT (offset + size <= ggml_nbytes (tensor));
13351491            repack_mxfp4_mxfp4x4x2 (tensor, data, size);
13361492            break ;
13371493
@@ -1355,19 +1511,19 @@ static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer,
13551511    switch  (tensor->type ) {
13561512        case  GGML_TYPE_Q4_0:
13571513            GGML_ASSERT (offset == 0 );
1358-             GGML_ASSERT (size == ggml_nbytes (tensor));
1514+             GGML_ASSERT (offset + size <= ggml_nbytes (tensor));
13591515            repack_q4x4x2_q4_0 (data, tensor, size);
13601516            break ;
13611517
13621518        case  GGML_TYPE_Q8_0:
13631519            GGML_ASSERT (offset == 0 );
1364-             GGML_ASSERT (size == ggml_nbytes (tensor));
1520+             GGML_ASSERT (offset + size <= ggml_nbytes (tensor));
13651521            repack_q8x4x2_q8_0 (data, tensor, size);
13661522            break ;
13671523
13681524        case  GGML_TYPE_MXFP4:
13691525            GGML_ASSERT (offset == 0 );
1370-             GGML_ASSERT (size == ggml_nbytes (tensor));
1526+             GGML_ASSERT (offset + size <= ggml_nbytes (tensor));
13711527            repack_mxfp4x4x2_mxfp4 (data, tensor, size);
13721528            break ;
13731529
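The assertion changes in the two hunks above relax the previous full-tensor requirement for quantized set/get: offset must still be zero for these repacked types, but size may now cover only a prefix of the tensor, as long as the window fits inside ggml_nbytes(tensor). A hedged standalone illustration of that bounds check (the helper name is hypothetical and not part of ggml):

    #include <cassert>
    #include <cstddef>

    // Hypothetical stand-in for the relaxed bounds check used in the set/get paths.
    static void check_window(size_t offset, size_t size, size_t tensor_nbytes) {
        // Old check: only a full-tensor transfer was accepted.
        //   assert(offset == 0 && size == tensor_nbytes);
        // New check: any window that stays inside the tensor is accepted
        // (the repack paths above additionally keep requiring offset == 0).
        assert(offset + size <= tensor_nbytes);
    }

    int main() {
        check_window(0, 72, 72);  // full tensor: accepted before and after the change
        check_window(0, 50, 72);  // partial prefix: now accepted as well
        return 0;
    }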