Skip to content

Commit 8c50a9b

Browse files
committed
Move the use count to the graph to avoid data races and double increments when a tensor is used in multiple threads.
1 parent da2fce0 commit 8c50a9b

File tree

4 files changed

+37
-13
lines changed

4 files changed

+37
-13
lines changed

ggml/include/ggml.h

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -604,11 +604,7 @@ extern "C" {
604604

605605
void * extra; // extra things e.g. for ggml-cuda.cu
606606

607-
// number of operations that use this tensor as a src
608-
int32_t use_count;
609-
610-
// add padding if needed to make a multiple of GGML_MEM_ALIGN
611-
char padding[4];
607+
char padding[8];
612608
};
613609

614610
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);

ggml/src/ggml-backend.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -818,7 +818,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
818818
if (sched->debug > 1) {
819819
ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
820820
GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s] use=%d:", i, ggml_op_name(node->op), node->name,
821-
fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node), node->use_count);
821+
fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node), graph->use_counts[i]);
822822
for (int j = 0; j < GGML_MAX_SRC; j++) {
823823
struct ggml_tensor * src = node->src[j];
824824
if (src == NULL) {

ggml/src/ggml-impl.h

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,7 @@ struct ggml_cgraph {
301301
struct ggml_tensor ** grads; // the outputs of these tensors are the gradients of the nodes
302302
struct ggml_tensor ** grad_accs; // accumulators for node gradients
303303
struct ggml_tensor ** leafs; // tensors with constant data
304+
int32_t * use_counts;// number of uses of each tensor
304305

305306
struct ggml_hash_set visited_hash_set;
306307

@@ -469,9 +470,10 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
469470

470471
// return true if the node's results are only used by N other nodes
471472
// and can be fused into their calculations.
472-
static inline bool ggml_node_has_N_uses(const struct ggml_tensor * node, int32_t N) {
473+
static inline bool ggml_node_has_N_uses(const struct ggml_cgraph * cgraph, int node_idx, int32_t N) {
474+
const struct ggml_tensor * node = cgraph->nodes[node_idx];
473475
// check the use count against how many we're replacing
474-
if (node->use_count != N) {
476+
if (cgraph->use_counts[node_idx] != N) {
475477
return false;
476478
}
477479

@@ -505,7 +507,7 @@ static inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx
505507
if (node->op != ops[i]) {
506508
return false;
507509
}
508-
if (i < num_ops && !ggml_node_has_N_uses(node, 1)) {
510+
if (i < num_ops && !ggml_node_has_N_uses(cgraph, node_idx + i, 1)) {
509511
return false;
510512
}
511513
if (i > 0) {

ggml/src/ggml.c

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1608,7 +1608,6 @@ static struct ggml_tensor * ggml_new_tensor_impl(
16081608
/*.data =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data,
16091609
/*.name =*/ { 0 },
16101610
/*.extra =*/ NULL,
1611-
/*.use_count =*/ 0,
16121611
/*.padding =*/ { 0 },
16131612
};
16141613

@@ -5816,9 +5815,31 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
58165815
(cgraph->order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? i :
58175816
(cgraph->order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? (GGML_MAX_SRC-1-i) :
58185817
/* unknown order, just fall back to using i*/ i;
5819-
if (node->src[k]) {
5820-
ggml_visit_parents(cgraph, node->src[k]);
5821-
node->src[k]->use_count++;
5818+
5819+
struct ggml_tensor * s = node->src[k];
5820+
if (s) {
5821+
ggml_visit_parents(cgraph, s);
5822+
5823+
// Update the use count for this operand.
5824+
// Skip if it's a leaf node
5825+
if (!(s->op == GGML_OP_NONE && !(s->flags & GGML_TENSOR_FLAG_PARAM))) {
5826+
// the src can be the node itself (happens in ggml_cast)
5827+
if (s == node) {
5828+
cgraph->use_counts[cgraph->n_nodes]++;
5829+
} else {
5830+
// Search backward to find the src. This usually takes very few
5831+
// (most often one) iteration(s). Probably comparable to hashing
5832+
on average.
5833+
int j = cgraph->n_nodes - 1;
5834+
for (; j >= 0; --j) {
5835+
if (s == cgraph->nodes[j]) {
5836+
break;
5837+
}
5838+
}
5839+
GGML_ASSERT(j >= 0);
5840+
cgraph->use_counts[j]++;
5841+
}
5842+
}
58225843
}
58235844
}
58245845

@@ -5979,6 +6000,7 @@ static size_t ggml_graph_nbytes(size_t size, bool grads) {
59796000
incr_ptr_aligned(&p, sizeof(struct ggml_cgraph), 1);
59806001
incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // nodes
59816002
incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // leafs
6003+
incr_ptr_aligned(&p, size * sizeof(int32_t), sizeof(int32_t)); // use_counts
59826004
incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // hash keys
59836005
if (grads) {
59846006
incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // grads
@@ -6010,6 +6032,7 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
60106032

60116033
struct ggml_tensor ** nodes_ptr = incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
60126034
struct ggml_tensor ** leafs_ptr = incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
6035+
int32_t * use_counts_ptr= incr_ptr_aligned(&p, size * sizeof(int32_t), sizeof(int32_t));
60136036
struct ggml_tensor ** hash_keys_ptr = incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
60146037
struct ggml_tensor ** grads_ptr = grads ? incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)) : NULL;
60156038
struct ggml_tensor ** grad_accs_ptr = grads ? incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)) : NULL;
@@ -6027,6 +6050,7 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
60276050
/*.grads =*/ grads_ptr,
60286051
/*.grad_accs =*/ grad_accs_ptr,
60296052
/*.leafs =*/ leafs_ptr,
6053+
/*.use_counts =*/ use_counts_ptr,
60306054
/*.hash_table =*/ { hash_size, hash_used, hash_keys_ptr },
60316055
/*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
60326056
};
@@ -6036,6 +6060,7 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
60366060
memset(cgraph->grads, 0, hash_size*sizeof(struct ggml_tensor *));
60376061
memset(cgraph->grad_accs, 0, hash_size*sizeof(struct ggml_tensor *));
60386062
}
6063+
memset(cgraph->use_counts, 0, size*sizeof(int32_t));
60396064

60406065
return cgraph;
60416066
}
@@ -6053,6 +6078,7 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1)
60536078
/*.grads =*/ NULL, // gradients would need visited_hash_set
60546079
/*.grad_accs =*/ NULL,
60556080
/*.leafs =*/ NULL,
6081+
/*.use_counts =*/ cgraph0->use_counts + i0,
60566082
/*.visited_hash_set =*/ { 0, NULL, NULL },
60576083
/*.order =*/ cgraph0->order,
60586084
};

0 commit comments

Comments (0)