180 changes: 168 additions & 12 deletions ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -656,6 +656,15 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size)
size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2)); // extra elements for the pad
size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)

// Ensure we don't try to read more data than is available in the source buffer 'data'
// or write more than the tensor can hold.
const size_t total_tensor_size = (size_t)nrows * row_size;
const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;

// Calculate how many full rows and how many remaining bytes we need to process.
const int64_t n_full_rows = n_bytes_to_copy / row_size;
const size_t n_rem_bytes = n_bytes_to_copy % row_size;

void * buf_pd = ggml_aligned_malloc(row_size_pd);
GGML_ASSERT(buf_pd != NULL);

@@ -667,7 +676,8 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size)

init_row_q4x4x2((block_q4_0 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros

for (int64_t i = 0; i < nrows; i++) {
// 1. Process all the full rows
for (int64_t i = 0; i < n_full_rows; i++) {
const uint8_t * src = (const uint8_t *) data + (i * row_size);
uint8_t * dst = (uint8_t *) t->data + (i * row_size);

@@ -676,6 +686,25 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size)
memcpy(dst, buf_rp, row_size);
}

// 2. Process the final, potentially partial, row
if (n_rem_bytes > 0) {
const int64_t i = n_full_rows;
const uint8_t * src = (const uint8_t *) data + (i * row_size);
uint8_t * dst = (uint8_t *) t->data + (i * row_size);

// re-init the row because we are potentially copying a partial row
init_row_q4x4x2((block_q4_0 *) buf_pd, t->ne[0]);

// Copy only the remaining bytes from the source.
memcpy(buf_pd, src, n_rem_bytes);

// Repack the entire buffer
repack_row_q4x4x2((uint8_t *) buf_rp, (const block_q4_0 *) buf_pd, t->ne[0]);

// Write only the corresponding remaining bytes to the destination tensor.
memcpy(dst, buf_rp, n_rem_bytes);
}

ggml_aligned_free(buf_pd, row_size_pd);
ggml_aligned_free(buf_rp, row_size_rp);
}
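A quick worked example of the full-row/remainder split introduced above (the sizes are illustrative only, not taken from the patch): for a Q4_0 tensor with ne[0] = 64 a row holds two 18-byte blocks, i.e. row_size = 36, so a partial upload of size = 90 bytes is handled as two fully repacked rows plus an 18-byte tail that goes through the zero-padded partial-row branch.

#include <cstdint>
#include <cstdio>

int main() {
    // Illustrative Q4_0 numbers only: 64 elements per row -> 2 blocks of 18 bytes.
    const size_t  row_size    = 36;                          // ggml_row_size(GGML_TYPE_Q4_0, 64)
    const size_t  size        = 90;                          // bytes handed to set_tensor (partial write)
    const int64_t n_full_rows = (int64_t) (size / row_size); // 2 rows take the regular full-row path
    const size_t  n_rem_bytes = size % row_size;             // 18 bytes take the padded partial-row path
    std::printf("full rows: %lld, remainder: %zu bytes\n", (long long) n_full_rows, n_rem_bytes);
    return 0;
}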
@@ -688,6 +717,14 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)
size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2)); // extra elements for the pad
size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)

// Ensure we don't try to copy more data than the tensor actually contains.
const size_t total_tensor_size = (size_t)nrows * row_size;
const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;

// Calculate how many full rows and how many remaining bytes we need to process.
const int64_t n_full_rows = n_bytes_to_copy / row_size;
const size_t n_rem_bytes = n_bytes_to_copy % row_size;

void * buf_pd = ggml_aligned_malloc(row_size_pd);
GGML_ASSERT(buf_pd != NULL);

@@ -699,7 +736,8 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)

memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros

for (int64_t i = 0; i < nrows; i++) {
// 1. Process all the full rows
for (int64_t i = 0; i < n_full_rows; i++) {
const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
uint8_t * dst = (uint8_t *) data + (i * row_size);

@@ -708,6 +746,20 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)
memcpy(dst, buf_rp, row_size);
}

// 2. Process the final, potentially partial, row
if (n_rem_bytes > 0) {
const int64_t i = n_full_rows;
const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
uint8_t * dst = (uint8_t *) data + (i * row_size);

// We still need to read and unpack the entire source row because quantization is block-based.
memcpy(buf_pd, src, row_size);
unpack_row_q4x4x2((block_q4_0 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]);

// But we only copy the remaining number of bytes to the destination.
memcpy(dst, buf_rp, n_rem_bytes);
}

ggml_aligned_free(buf_pd, row_size_pd);
ggml_aligned_free(buf_rp, row_size_rp);
}
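For context on the block-based comment above, a layout sketch of the Q4_0 block this relies on (illustrative; the real definition is ggml's block_q4_0): 32 elements are stored as one 18-byte block, and the q4x4x2 layout rearranges those blocks within a row, so a byte range of the plain Q4_0 destination has no contiguous counterpart in the packed source. The whole row is therefore unpacked into buf_rp and only n_rem_bytes of it are copied out.

#include <cstdint>

// Sketch matching ggml's block_q4_0 (for illustration only).
struct q4_0_block_sketch {
    uint16_t d;       // scale, stored as ggml_half
    uint8_t  qs[16];  // 32 quantized values, 4 bits each
};
static_assert(sizeof(q4_0_block_sketch) == 18, "one Q4_0 block covers 32 elements in 18 bytes");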
@@ -930,6 +982,15 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size)
size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2)); // extra elements for the pad
size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)

// Ensure we don't try to read more data than is available in the source buffer 'data'
// or write more than the tensor can hold.
const size_t total_tensor_size = (size_t)nrows * row_size;
const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;

// Calculate how many full rows and how many remaining bytes we need to process.
const int64_t n_full_rows = n_bytes_to_copy / row_size;
const size_t n_rem_bytes = n_bytes_to_copy % row_size;

void * buf_pd = ggml_aligned_malloc(row_size_pd);
GGML_ASSERT(buf_pd != NULL);

@@ -941,7 +1002,8 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size)

init_row_q8x4x2((block_q8_0 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros

for (int64_t i = 0; i < nrows; i++) {
// 1. Process all the full rows
for (int64_t i = 0; i < n_full_rows; i++) {
const uint8_t * src = (const uint8_t *) data + (i * row_size);
uint8_t * dst = (uint8_t *) t->data + (i * row_size);

@@ -950,6 +1012,25 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size)
memcpy(dst, buf_rp, row_size);
}

// 2. Process the final, potentially partial, row
if (n_rem_bytes > 0) {
const int64_t i = n_full_rows;
const uint8_t * src = (const uint8_t *) data + (i * row_size);
uint8_t * dst = (uint8_t *) t->data + (i * row_size);

// re-init the row because we are potentially copying a partial row
init_row_q8x4x2((block_q8_0 *) buf_pd, t->ne[0]);

// Copy only the remaining bytes from the source.
memcpy(buf_pd, src, n_rem_bytes);

// Repack the entire buffer
repack_row_q8x4x2((uint8_t *) buf_rp, (const block_q8_0 *) buf_pd, t->ne[0]);

// Write only the corresponding remaining bytes to the destination tensor.
memcpy(dst, buf_rp, n_rem_bytes);
}

ggml_aligned_free(buf_pd, row_size_pd);
ggml_aligned_free(buf_rp, row_size_rp);
}
@@ -962,6 +1043,14 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size)
size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2)); // extra elements for the pad
size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)

// Ensure we don't try to copy more data than the tensor actually contains.
const size_t total_tensor_size = (size_t)nrows * row_size;
const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;

// Calculate how many full rows and how many remaining bytes we need to process.
const int64_t n_full_rows = n_bytes_to_copy / row_size;
const size_t n_rem_bytes = n_bytes_to_copy % row_size;

void * buf_pd = ggml_aligned_malloc(row_size_pd);
GGML_ASSERT(buf_pd != NULL);

@@ -973,7 +1062,8 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size)

memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros

for (int64_t i = 0; i < nrows; i++) {
// 1. Process all the full rows
for (int64_t i = 0; i < n_full_rows; i++) {
const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
uint8_t * dst = (uint8_t *) data + (i * row_size);

@@ -982,6 +1072,20 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size)
memcpy(dst, buf_rp, row_size);
}

// 2. Process the final, potentially partial, row
if (n_rem_bytes > 0) {
const int64_t i = n_full_rows;
const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
uint8_t * dst = (uint8_t *) data + (i * row_size);

// We still need to read and unpack the entire source row because quantization is block-based.
memcpy(buf_pd, src, row_size);
unpack_row_q8x4x2((block_q8_0 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]);

// But we only copy the remaining number of bytes to the destination.
memcpy(dst, buf_rp, n_rem_bytes);
}

ggml_aligned_free(buf_pd, row_size_pd);
ggml_aligned_free(buf_rp, row_size_rp);
}
@@ -1229,6 +1333,15 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t size)
size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2)); // extra elements for the pad
size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)

// Ensure we don't try to read more data than is available in the source buffer 'data'
// or write more than the tensor can hold.
const size_t total_tensor_size = (size_t)nrows * row_size;
const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;

// Calculate how many full rows and how many remaining bytes we need to process.
const int64_t n_full_rows = n_bytes_to_copy / row_size;
const size_t n_rem_bytes = n_bytes_to_copy % row_size;

void * buf_pd = ggml_aligned_malloc(row_size_pd);
GGML_ASSERT(buf_pd != NULL);

@@ -1240,7 +1353,8 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t size)

init_row_mxfp4x4x2((block_mxfp4 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros

for (int64_t i = 0; i < nrows; i++) {
// 1. Process all the full rows
for (int64_t i = 0; i < n_full_rows; i++) {
const uint8_t * src = (const uint8_t *) data + (i * row_size);
uint8_t * dst = (uint8_t *) t->data + (i * row_size);

@@ -1249,6 +1363,25 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t size)
memcpy(dst, buf_rp, row_size);
}

// 2. Process the final, potentially partial, row
if (n_rem_bytes > 0) {
const int64_t i = n_full_rows;
const uint8_t * src = (const uint8_t *) data + (i * row_size);
uint8_t * dst = (uint8_t *) t->data + (i * row_size);

// re-init the row because we are potentially copying a partial row
init_row_mxfp4x4x2((block_mxfp4 *) buf_pd, t->ne[0]);

// Copy only the remaining bytes from the source.
memcpy(buf_pd, src, n_rem_bytes);

// Repack the entire buffer (partial data + zero padding).
repack_row_mxfp4x4x2((uint8_t *) buf_rp, (const block_mxfp4 *) buf_pd, t->ne[0]);

// Write only the corresponding remaining bytes to the destination tensor.
memcpy(dst, buf_rp, n_rem_bytes);
}

ggml_aligned_free(buf_pd, row_size_pd);
ggml_aligned_free(buf_rp, row_size_rp);
}
@@ -1261,6 +1394,14 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t size)
size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2)); // extra elements for the pad
size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)

// Ensure we don't try to copy more data than the tensor actually contains.
const size_t total_tensor_size = (size_t)nrows * row_size;
const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;

// Calculate how many full rows and how many remaining bytes we need to process.
const int64_t n_full_rows = n_bytes_to_copy / row_size;
const size_t n_rem_bytes = n_bytes_to_copy % row_size;

void * buf_pd = ggml_aligned_malloc(row_size_pd);
GGML_ASSERT(buf_pd != NULL);

@@ -1272,7 +1413,8 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t size)

memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros

for (int64_t i = 0; i < nrows; i++) {
// 1. Process all the full rows
for (int64_t i = 0; i < n_full_rows; i++) {
const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
uint8_t * dst = (uint8_t *) data + (i * row_size);

@@ -1281,6 +1423,20 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t size)
memcpy(dst, buf_rp, row_size);
}

// 2. Process the final, potentially partial, row
if (n_rem_bytes > 0) {
const int64_t i = n_full_rows;
const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
uint8_t * dst = (uint8_t *) data + (i * row_size);

// We still need to read and unpack the entire source row because the format is block-based.
memcpy(buf_pd, src, row_size);
unpack_row_mxfp4x4x2((block_mxfp4 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]);

// But we only copy the remaining number of bytes to the destination to respect the size limit.
memcpy(dst, buf_rp, n_rem_bytes);
}

ggml_aligned_free(buf_pd, row_size_pd);
ggml_aligned_free(buf_rp, row_size_rp);
}
@@ -1299,19 +1455,19 @@ static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer,
switch (tensor->type) {
case GGML_TYPE_Q4_0:
GGML_ASSERT(offset == 0);
GGML_ASSERT(size == ggml_nbytes(tensor));
GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
repack_q4_0_q4x4x2(tensor, data, size);
break;

case GGML_TYPE_Q8_0:
GGML_ASSERT(offset == 0);
GGML_ASSERT(size == ggml_nbytes(tensor));
GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
repack_q8_0_q8x4x2(tensor, data, size);
break;

case GGML_TYPE_MXFP4:
GGML_ASSERT(offset == 0);
GGML_ASSERT(size == ggml_nbytes(tensor));
GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
repack_mxfp4_mxfp4x4x2(tensor, data, size);
break;
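A hedged caller-side sketch of what the relaxed assertion allows (hypothetical helper, not part of the patch; assumes a quantized tensor that lives in a Hexagon backend buffer and a host buffer with at least that many bytes): the upload still has to start at offset 0, but it may now cover only a leading slice of the tensor, and the repack paths above zero-pad the one partially covered row.

#include "ggml.h"
#include "ggml-backend.h"

// Upload only the first half of the tensor (illustrative size; any size <= ggml_nbytes(tensor) is accepted).
static void upload_partial(ggml_tensor * tensor, const void * host_data) {
    const size_t size = ggml_nbytes(tensor) / 2;
    ggml_backend_tensor_set(tensor, host_data, /*offset=*/0, size);
}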

@@ -1335,19 +1491,19 @@ static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer,
switch (tensor->type) {
case GGML_TYPE_Q4_0:
GGML_ASSERT(offset == 0);
GGML_ASSERT(size == ggml_nbytes(tensor));
GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
repack_q4x4x2_q4_0(data, tensor, size);
break;

case GGML_TYPE_Q8_0:
GGML_ASSERT(offset == 0);
GGML_ASSERT(size == ggml_nbytes(tensor));
GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
repack_q8x4x2_q8_0(data, tensor, size);
break;

case GGML_TYPE_MXFP4:
GGML_ASSERT(offset == 0);
GGML_ASSERT(size == ggml_nbytes(tensor));
GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
repack_mxfp4x4x2_mxfp4(data, tensor, size);
break;
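The get path mirrors this: a read that starts at offset 0 and covers only part of the tensor is now accepted as well (hypothetical helper, same assumptions as the sketch above).

// Download only a leading slice of the tensor into a host buffer (illustrative).
static void download_partial(const ggml_tensor * tensor, void * host_data) {
    ggml_backend_tensor_get(tensor, host_data, /*offset=*/0, ggml_nbytes(tensor) / 2);
}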
