@@ -2226,7 +2226,7 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags)
22262226    bufs[0 ].ptr     = src0->data ;
22272227    bufs[0 ].offset  = (uint8_t  *) src0->data  - src0_buf->base ;
22282228    bufs[0 ].size    = ggml_nbytes (src0);
2229-     bufs[0 ].flags   = DSPQUEUE_BUFFER_FLAG_REF ;
2229+     bufs[0 ].flags   = 0 ;
22302230
22312231    //  Second buffer Input Activations. This is a buffer that the CPU
22322232    //  writes and the DSP reads, so we'll need to flush CPU caches and
@@ -2236,8 +2236,7 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags)
22362236    bufs[1 ].ptr     = src1->data ;
22372237    bufs[1 ].offset  = (uint8_t  *) src1->data  - src1_buf->base ;
22382238    bufs[1 ].size    = ggml_nbytes (src1);
2239-     bufs[1 ].flags   = (DSPQUEUE_BUFFER_FLAG_REF |                   //  Take a reference
2240-                      DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          //  Flush CPU
2239+     bufs[1 ].flags   = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         //  Flush CPU
22412240                     DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  //  Invalidate DSP
22422241
22432242    //  Third buffer Output Activations. We'll handle DSP
@@ -2248,7 +2247,7 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags)
22482247    bufs[2 ].ptr     = dst->data ;
22492248    bufs[2 ].offset  = (uint8_t  *) dst->data  - dst_buf->base ;
22502249    bufs[2 ].size    = ggml_nbytes (dst);
2251-     bufs[2 ].flags   = (DSPQUEUE_BUFFER_FLAG_REF |  DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
2250+     bufs[2 ].flags   = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
22522251
22532252    //  Primary DSP session from the src0 (normally weight) tensor
22542253    auto  sess = src0_buf->sess ;
@@ -2332,7 +2331,7 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag
23322331    bufs[0 ].ptr     = src0->data ;
23332332    bufs[0 ].offset  = (uint8_t  *) src0->data  - src0_buf->base ;
23342333    bufs[0 ].size    = ggml_nbytes (src0);
2335-     bufs[0 ].flags   = DSPQUEUE_BUFFER_FLAG_REF ;
2334+     bufs[0 ].flags   = 0 ;
23362335
23372336    //  Second buffer Input Activations. This is a buffer that the CPU
23382337    //  writes and the DSP reads, so we'll need to flush CPU caches and
@@ -2342,8 +2341,7 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag
23422341    bufs[1 ].ptr     = src1->data ;
23432342    bufs[1 ].offset  = (uint8_t  *) src1->data  - src1_buf->base ;
23442343    bufs[1 ].size    = ggml_nbytes (src1);
2345-     bufs[1 ].flags   = (DSPQUEUE_BUFFER_FLAG_REF |                   //  Take a reference
2346-                      DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          //  Flush CPU
2344+     bufs[1 ].flags   = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         //  Flush CPU
23472345                     DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  //  Invalidate DSP
23482346
23492347    //  Third buffer expert IDs. This is a buffer that the CPU
@@ -2354,8 +2352,7 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag
23542352    bufs[2 ].ptr     = src2->data ;
23552353    bufs[2 ].offset  = (uint8_t  *) src2->data  - src2_buf->base ;
23562354    bufs[2 ].size    = ggml_nbytes (src2);
2357-     bufs[2 ].flags   = (DSPQUEUE_BUFFER_FLAG_REF |                   //  Take a reference
2358-                      DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          //  Flush CPU
2355+     bufs[2 ].flags   = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         //  Flush CPU
23592356                     DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  //  Invalidate DSP
23602357
23612358    //  Fourth buffer Output Activations. We'll handle DSP
@@ -2366,7 +2363,7 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag
23662363    bufs[3 ].ptr     = dst->data ;
23672364    bufs[3 ].offset  = (uint8_t  *) dst->data  - dst_buf->base ;
23682365    bufs[3 ].size    = ggml_nbytes (dst);
2369-     bufs[3 ].flags   = (DSPQUEUE_BUFFER_FLAG_REF |  DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
2366+     bufs[3 ].flags   = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
23702367
23712368    //  Primary DSP session from the src0 (normally weight) tensor
23722369    auto  sess = src0_buf->sess ;
@@ -2468,8 +2465,7 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
24682465    bufs[0 ].ptr     = src0->data ;
24692466    bufs[0 ].offset  = (uint8_t  *) src0->data  - src0_buf->base ;
24702467    bufs[0 ].size    = ggml_nbytes (src0);
2471-     bufs[0 ].flags   = (DSPQUEUE_BUFFER_FLAG_REF |                   //  Take a reference
2472-                      DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          //  Flush CPU
2468+     bufs[0 ].flags   = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         //  Flush CPU
24732469                     DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  //  Invalidate DSP;
24742470
24752471    //  Second buffer = Second Operand of Binary op
@@ -2481,8 +2477,7 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
24812477    bufs[1 ].ptr     = src1->data ;
24822478    bufs[1 ].offset  = (uint8_t  *) src1->data  - src1_buf->base ;
24832479    bufs[1 ].size    = ggml_nbytes (src1);
2484-     bufs[1 ].flags   = (DSPQUEUE_BUFFER_FLAG_REF |                   //  Take a reference
2485-                      DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          //  Flush CPU
2480+     bufs[1 ].flags   = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         //  Flush CPU
24862481                     DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  //  Invalidate DSP
24872482
24882483    //  Third buffer = Output Activations. We'll handle DSP
@@ -2493,7 +2488,7 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
24932488    bufs[2 ].ptr     = dst->data ;
24942489    bufs[2 ].offset  = (uint8_t  *) dst->data  - dst_buf->base ;
24952490    bufs[2 ].size    = ggml_nbytes (dst);
2496-     bufs[2 ].flags   = (DSPQUEUE_BUFFER_FLAG_REF |  DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
2491+     bufs[2 ].flags   = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
24972492
24982493    //  Primary DSP session from the src0 tensor
24992494    ggml_hexagon_session * sess = src0_buf->sess ;
@@ -2586,34 +2581,31 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
25862581    bufs[0 ].ptr     = src0->data ;
25872582    bufs[0 ].offset  = (uint8_t  *) src0->data  - src0_buf->base ;
25882583    bufs[0 ].size    = ggml_nbytes (src0);
2589-     bufs[0 ].flags   = (DSPQUEUE_BUFFER_FLAG_REF |                   //  Take a reference
2590-                      DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          //  Flush CPU
2584+     bufs[0 ].flags   = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         //  Flush CPU
25912585                     DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  //  Invalidate DSP;
25922586
25932587    //  Second buffer = experts bias
25942588    bufs[1 ].fd      = src1_buf->fd ;
25952589    bufs[1 ].ptr     = src1->data ;
25962590    bufs[1 ].offset  = (uint8_t  *) src1->data  - src1_buf->base ;
25972591    bufs[1 ].size    = ggml_nbytes (src1);
2598-     bufs[1 ].flags   = (DSPQUEUE_BUFFER_FLAG_REF |                   //  Take a reference
2599-                      DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          //  Flush CPU
2592+     bufs[1 ].flags   = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         //  Flush CPU
26002593                     DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  //  Invalidate DSP
26012594
26022595    //  Third buffer = activated experts
26032596    bufs[2 ].fd      = src2_buf->fd ;
26042597    bufs[2 ].ptr     = src2->data ;
26052598    bufs[2 ].offset  = (uint8_t  *) src2->data  - src2_buf->base ;
26062599    bufs[2 ].size    = ggml_nbytes (src2);
2607-     bufs[2 ].flags   = (DSPQUEUE_BUFFER_FLAG_REF |                   //  Take a reference
2608-                      DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          //  Flush CPU
2600+     bufs[2 ].flags   = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         //  Flush CPU
26092601                     DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  //  Invalidate DSP
26102602
26112603    //  Fourth buffer = output activations
26122604    bufs[3 ].fd      = dst_buf->fd ;
26132605    bufs[3 ].ptr     = dst->data ;
26142606    bufs[3 ].offset  = (uint8_t  *) dst->data  - dst_buf->base ;
26152607    bufs[3 ].size    = ggml_nbytes (dst);
2616-     bufs[3 ].flags   = (DSPQUEUE_BUFFER_FLAG_REF |  DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
2608+     bufs[3 ].flags   = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
26172609
26182610    //  Primary DSP session from the src0 tensor
26192611    ggml_hexagon_session * sess = src0_buf->sess ;
@@ -2741,8 +2733,7 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
27412733    bufs[n_bufs].ptr     = src0->data ;
27422734    bufs[n_bufs].offset  = (uint8_t  *) src0->data  - src0_buf->base ;
27432735    bufs[n_bufs].size    = ggml_nbytes (src0);
2744-     bufs[n_bufs].flags   = (DSPQUEUE_BUFFER_FLAG_REF |                   //  Take a reference
2745-                           DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          //  Flush CPU
2736+     bufs[n_bufs].flags   = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         //  Flush CPU
27462737                          DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  //  Invalidate DSP;
27472738    ++n_bufs;
27482739
@@ -2757,8 +2748,7 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
27572748        bufs[n_bufs].ptr     = src1->data ;
27582749        bufs[n_bufs].offset  = (uint8_t  *) src1->data  - src1_buf->base ;
27592750        bufs[n_bufs].size    = ggml_nbytes (src1);
2760-         bufs[n_bufs].flags   = (DSPQUEUE_BUFFER_FLAG_REF |                   //  Take a reference
2761-                               DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          //  Flush CPU
2751+         bufs[n_bufs].flags   = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         //  Flush CPU
27622752                              DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  //  Invalidate DSP
27632753        ++n_bufs;
27642754    }
@@ -2773,7 +2763,7 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
27732763    bufs[n_bufs].ptr     = dst->data ;
27742764    bufs[n_bufs].offset  = (uint8_t  *) dst->data  - dst_buf->base ;
27752765    bufs[n_bufs].size    = ggml_nbytes (dst);
2776-     bufs[n_bufs].flags   = (DSPQUEUE_BUFFER_FLAG_REF |  DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
2766+     bufs[n_bufs].flags   = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
27772767    ++n_bufs;
27782768
27792769    //  Primary DSP session from the src0 tensor
@@ -2880,8 +2870,7 @@ static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) {
28802870    bufs[n_bufs].ptr     = src0->data ;
28812871    bufs[n_bufs].offset  = (uint8_t  *) src0->data  - src0_buf->base ;
28822872    bufs[n_bufs].size    = ggml_nbytes (src0);
2883-     bufs[n_bufs].flags   = (DSPQUEUE_BUFFER_FLAG_REF |                   //  Take a reference
2884-                           DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          //  Flush CPU
2873+     bufs[n_bufs].flags   = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         //  Flush CPU
28852874                          DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  //  Invalidate DSP;
28862875    ++n_bufs;
28872876
@@ -2895,8 +2884,7 @@ static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) {
28952884    bufs[n_bufs].ptr     = src1->data ;
28962885    bufs[n_bufs].offset  = (uint8_t  *) src1->data  - src1_buf->base ;
28972886    bufs[n_bufs].size    = ggml_nbytes (src1);
2898-     bufs[n_bufs].flags   = (DSPQUEUE_BUFFER_FLAG_REF |                   //  Take a reference
2899-                           DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          //  Flush CPU
2887+     bufs[n_bufs].flags   = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         //  Flush CPU
29002888                          DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  //  Invalidate DSP
29012889    ++n_bufs;
29022890
@@ -2911,8 +2899,7 @@ static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) {
29112899        bufs[n_bufs].ptr     = src2->data ;
29122900        bufs[n_bufs].offset  = (uint8_t  *) src2->data  - src2_buf->base ;
29132901        bufs[n_bufs].size    = ggml_nbytes (src2);
2914-         bufs[n_bufs].flags   = (DSPQUEUE_BUFFER_FLAG_REF |                   //  Take a reference
2915-                               DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          //  Flush CPU
2902+         bufs[n_bufs].flags   = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         //  Flush CPU
29162903                              DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  //  Invalidate DSP
29172904        ++n_bufs;
29182905    }
@@ -2927,7 +2914,7 @@ static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) {
29272914    bufs[n_bufs].ptr     = dst->data ;
29282915    bufs[n_bufs].offset  = (uint8_t  *) dst->data  - dst_buf->base ;
29292916    bufs[n_bufs].size    = ggml_nbytes (dst);
2930-     bufs[n_bufs].flags   = (DSPQUEUE_BUFFER_FLAG_REF |  DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
2917+     bufs[n_bufs].flags   = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
29312918    ++n_bufs;
29322919
29332920    //  Primary DSP session from the src0 tensor
0 commit comments