24 changes: 22 additions & 2 deletions ggml/src/ggml-hexagon/htp/binary-ops.c
@@ -113,10 +113,30 @@ static void binary_job_f32_per_thread(const struct htp_tensor * src0,
uint8_t * restrict dst_ptr = (uint8_t *) dst->data + (src0_start_row * dst_row_size);

const uint8_t * restrict data_src1 = (const uint8_t *) src1->data;
const uint8_t * restrict src1_ptr = NULL;

const uint32_t ne0201 = ne02 * ne01;
uint32_t div03[2];
uint32_t div02[2];
init_fastdiv_values(ne0201, &div03[0], &div03[1]);
init_fastdiv_values(ne01, &div02[0], &div02[1]);

uint32_t mod13[2];
uint32_t mod12[2];
uint32_t mod11[2];
init_fastdiv_values(ne13, &mod13[0], &mod13[1]);
init_fastdiv_values(ne12, &mod12[0], &mod12[1]);
init_fastdiv_values(ne11, &mod11[0], &mod11[1]);
Contributor Author:
Like @lhez said, maybe we could move this to the CPU once the tensor is initialized, and save it in htp_tensor?

Collaborator:
Yep. The host would be even better.

Collaborator:
We're going to need that for the FP16 MatMuls that have to deal with broadcasting. So yeah, the host would be best to precompute this. We should be able to take advantage of the graph caching to avoid precomputing it several times.
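A rough sketch of that idea (illustrative only, not part of this PR): the host fills the constants once per tensor shape and ships them to the DSP, so the kernels only read them. The struct and function names below are hypothetical; init_fastdiv_values() is the helper added in ops-utils.h.

#include <stdint.h>

// Hypothetical container for the precomputed constants; something like this
// could live inside (or next to) htp_tensor.
struct htp_fastdiv_vals {
    uint32_t mp;  // magic multiplier (m' in the paper)
    uint32_t l;   // shift amount L
};

struct htp_bcast_consts {
    struct htp_fastdiv_vals div_ne0201;  // divide flat row index by ne02*ne01
    struct htp_fastdiv_vals div_ne01;    // divide the remainder by ne01
    struct htp_fastdiv_vals mod_ne13;    // wrap i03 into src1's ne13
    struct htp_fastdiv_vals mod_ne12;    // wrap i02 into src1's ne12
    struct htp_fastdiv_vals mod_ne11;    // wrap i01 into src1's ne11
};

// Host-side init, run once when the tensors/graph are set up and cached with
// the graph so it is not recomputed on every dispatch.
static void htp_bcast_consts_init(struct htp_bcast_consts * c,
                                  uint32_t ne01, uint32_t ne02,
                                  uint32_t ne11, uint32_t ne12, uint32_t ne13) {
    init_fastdiv_values(ne02 * ne01, &c->div_ne0201.mp, &c->div_ne0201.l);
    init_fastdiv_values(ne01,        &c->div_ne01.mp,   &c->div_ne01.l);
    init_fastdiv_values(ne13,        &c->mod_ne13.mp,   &c->mod_ne13.l);
    init_fastdiv_values(ne12,        &c->mod_ne12.mp,   &c->mod_ne12.l);
    init_fastdiv_values(ne11,        &c->mod_ne11.mp,   &c->mod_ne11.l);
}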


for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) {
src1_ptr = data_src1 + (ir % src1_nrows) * src1_row_size;
const uint32_t i03 = fastdiv(ir, div03[0], div03[1]);
const uint32_t i02 = fastdiv(ir - i03 * ne02 * ne01, div02[0], div02[1]);
const uint32_t i01 = (ir - i03 * ne02 * ne01 - i02 * ne01);

const uint32_t i13 = fastmodulo(i03, ne13, mod13[0], mod13[1]);
const uint32_t i12 = fastmodulo(i02, ne12, mod12[0], mod12[1]);
const uint32_t i11 = fastmodulo(i01, ne11, mod11[0], mod11[1]);

const uint8_t * restrict src1_ptr = data_src1 + i13 * nb13 + i12 * nb12 + i11 * src1_row_size;

if (ir + 1 < src0_end_row) {
htp_l2fetch(src0_ptr + ne00, 1, src0_row_size, src0_row_size);
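For reference, the loop's index arithmetic is the plain-division form of ggml-style row broadcasting. A minimal sketch (not code from the patch; parameter types are illustrative) of what the fastdiv/fastmodulo calls compute:

#include <stdint.h>

// Scalar equivalent of the broadcast index math in binary_job_f32_per_thread:
// split the flat src0 row index into (i03, i02, i01), wrap each coordinate
// into src1's extent, then turn the result into a byte offset.
static const uint8_t * src1_row_broadcast(const uint8_t * data_src1, uint32_t ir,
                                          uint32_t ne01, uint32_t ne02,
                                          uint32_t ne11, uint32_t ne12, uint32_t ne13,
                                          uint32_t nb12, uint32_t nb13,
                                          uint32_t src1_row_size) {
    const uint32_t i03 = ir / (ne02 * ne01);
    const uint32_t i02 = (ir - i03 * ne02 * ne01) / ne01;
    const uint32_t i01 = ir - i03 * ne02 * ne01 - i02 * ne01;

    const uint32_t i13 = i03 % ne13;  // broadcast: src1 may have fewer batches
    const uint32_t i12 = i02 % ne12;  // ... fewer channels
    const uint32_t i11 = i01 % ne11;  // ... fewer rows

    return data_src1 + i13 * nb13 + i12 * nb12 + i11 * src1_row_size;
}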
28 changes: 28 additions & 0 deletions ggml/src/ggml-hexagon/htp/ops-utils.h
@@ -31,6 +31,34 @@ static inline uint32_t htp_round_up(uint32_t n, uint32_t m) {
return m * ((n + m - 1) / m);
}

// See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1.
// Precompute mp (m' in the paper) and L such that division
// can be computed using a multiply (high 32b of 64b result)
// and a shift:
//
// n/d = (mulhi(n, mp) + n) >> L;
static inline void init_fastdiv_values(uint32_t d, uint32_t * p_mp, uint32_t * p_l) {
// compute L = ceil(log2(d));
uint32_t L = 0;
while (L < 32 && ((uint32_t) 1 << L) < d) {
L++;
}

*p_mp = (uint32_t) (((uint64_t) 1 << 32) * (((uint64_t) 1 << L) - d) / d + 1);
*p_l = L;
}
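
// Worked example (illustrative, not part of the patch): for d = 3 we get
// L = ceil(log2(3)) = 2 and mp = floor(2^32 * (2^2 - 3) / 3) + 1 = 1431655766.
// Then for n = 10: mulhi(10, 1431655766) = 3 and (3 + 10) >> 2 = 3 = 10 / 3.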

static inline uint32_t fastdiv(uint32_t n, const uint32_t mp, const uint32_t l) {
// Compute high 32 bits of n * mp
const uint32_t hi = (uint32_t) (((uint64_t) n * mp) >> 32); // mulhi(n, mp)
Contributor Author (@chraac, Nov 6, 2025):
Thought we could avoid the hi-32-bit calc here if we can ensure n * mp < std::numeric_limits<uint32_t>::max().

// add n, apply bit shift
return (hi + n) >> l;
}

static inline uint32_t fastmodulo(uint32_t n, uint32_t d, const uint32_t mp, const uint32_t l) {
return n - fastdiv(n, mp, l) * d;
}
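
// Illustrative sanity check, not part of this header: confirm that fastdiv()
// and fastmodulo() agree with plain '/' and '%' for a few divisors (requires
// <assert.h>; assumes the three helpers above are in scope).
static inline void htp_check_fastdiv(void) {
    const uint32_t divisors[] = { 1, 2, 3, 7, 10, 255, 4096, 1000003 };
    const uint32_t num_div    = sizeof(divisors) / sizeof(divisors[0]);
    for (uint32_t i = 0; i < num_div; i++) {
        uint32_t mp, l;
        init_fastdiv_values(divisors[i], &mp, &l);
        for (uint32_t n = 0; n < 1000000; n += 97) {
            assert(fastdiv(n, mp, l) == n / divisors[i]);
            assert(fastmodulo(n, divisors[i], mp, l) == n % divisors[i]);
        }
    }
}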

static inline void htp_l2fetch(const void * p, uint32_t height, uint32_t width, uint32_t stride) {
const uint64_t control = Q6_P_combine_RR(stride, Q6_R_combine_RlRl(width, height));
asm volatile(" l2fetch(%0,%1) " : : "r"(p), "r"(control));