profiler: initial support for profiling graph ops

max-krasnyansky · max-krasnyansky · commit ca407742c523 · 2025-06-05T14:38:13.000-07:00
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
@@ -84,6 +84,7 @@ option(GGML_CCACHE "ggml: use ccache if available"                   ON)
 option(GGML_ALL_WARNINGS           "ggml: enable all compiler warnings"                   ON)
 option(GGML_ALL_WARNINGS_3RD_PARTY "ggml: enable all compiler warnings in 3rd party libs" OFF)
 option(GGML_GPROF                  "ggml: enable gprof"                                   OFF)
+option(GGML_GRAPH_PROFILER         "ggml: enable internal Graph and Op profiler"          OFF)
 
 # build
 option(GGML_FATAL_WARNINGS    "ggml: enable -Werror flag"    OFF)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
@@ -8,6 +8,10 @@ if (CMAKE_SYSTEM_NAME MATCHES "Linux")
     add_compile_definitions($<$<CONFIG:Debug>:_GLIBCXX_ASSERTIONS>)
 endif()
 
+if (GGML_GRAPH_PROFILER)
+    add_compile_definitions(GGML_GRAPH_PROFILER)
+endif()
+
 if (NOT MSVC)
     if (GGML_SANITIZE_THREAD)
         add_compile_options(-fsanitize=thread)
@@ -204,6 +208,8 @@ add_library(ggml-base
             ggml-threading.h
             ggml-quants.c
             ggml-quants.h
+            ggml-profile.h
+            ggml-profile.cpp
             gguf.cpp)
 
 target_include_directories(ggml-base PRIVATE .)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -13,6 +13,7 @@
 #include "binary-ops.h"
 #include "vec.h"
 #include "ops.h"
+#include "ggml-profile.h"
 #include "ggml.h"
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
@@ -2871,6 +2872,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
         struct ggml_tensor * node = cgraph->nodes[node_n];
 
+        ggml_graph_profile_event(cgraph, GGML_PROF_OP_START, node_n, state->ith);
+
         ggml_compute_forward(&params, node);
 
         if (state->ith == 0 && cplan->abort_callback &&
@@ -2879,9 +2882,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             tp->ec    = GGML_STATUS_ABORTED;
         }
 
+        ggml_graph_profile_event(cgraph, GGML_PROF_OP_SYNC, node_n, state->ith);
+
         if (node_n + 1 < cgraph->n_nodes) {
             ggml_barrier(state->threadpool);
         }
+
+        ggml_graph_profile_event(cgraph, GGML_PROF_OP_END,  node_n, state->ith);
     }
 
     ggml_barrier(state->threadpool);
@@ -3117,6 +3124,8 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
     int n_threads                               = cplan->n_threads;
     struct ggml_threadpool * threadpool = cplan->threadpool;
 
+    ggml_graph_profile_start(cgraph, n_threads);
+
     bool disposable_threadpool = false;
 
     if (threadpool == NULL) {
@@ -3168,6 +3177,8 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
     // don't leave affinity set on the main thread
     clear_numa_thread_affinity();
 
+    ggml_graph_profile_finish(cgraph, n_threads);
+
     enum ggml_status ret = threadpool->ec;
 
     if (disposable_threadpool) {
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
@@ -292,6 +292,8 @@ enum ggml_cgraph_eval_order {
     GGML_CGRAPH_EVAL_ORDER_COUNT
 };
 
+struct ggml_profile_data;
+
 struct ggml_cgraph {
     int size;    // maximum number of nodes/leafs/grads/grad_accs
     int n_nodes; // number of nodes currently in use
@@ -302,6 +304,8 @@ struct ggml_cgraph {
     struct ggml_tensor ** grad_accs; // accumulators for node gradients
     struct ggml_tensor ** leafs;     // tensors with constant data
 
+    struct ggml_profile_data * prof;
+
     struct ggml_hash_set visited_hash_set;
 
     enum ggml_cgraph_eval_order order;
diff --git a/ggml/src/ggml-profile.cpp b/ggml/src/ggml-profile.cpp
@@ -0,0 +1,177 @@
+#include "ggml-profile.h"
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <string>
+#include <chrono>
+
+#ifdef GGML_GRAPH_PROFILER
+
+struct ggml_profile_output {
+    const char * prefix;
+    FILE *       stream;
+};
+
+extern "C" void ggml_graph_profile_init(struct ggml_cgraph *cg, int n_threads)
+{
+    // TODO: make this a param
+    const char *env = getenv("GGML_GRAPH_PROFILE");
+    if (!env) { return; }
+
+    // The number of threads may change between passes (pp vs tg).
+    // Allocate for max_n_threads for simplicity for now.
+    // TODO: use aligned allocator
+
+    size_t node_size = sizeof(struct ggml_profile_timing) * GGML_MAX_N_THREADS;
+    size_t pvec_size = sizeof(std::intptr_t) * cg->n_nodes;
+    size_t time_size = node_size * cg->n_nodes;
+    size_t t_size    = pvec_size + time_size + sizeof(ggml_profile_output) + sizeof(ggml_profile_data);
+
+    uint8_t * ptr = (uint8_t *) malloc(t_size);
+    if (!ptr) {
+        fprintf(stderr, "ggml-profile: failed to allocate profiling data : n_threads %d n_nodes %d\n", n_threads, cg->n_nodes);
+        return;
+    }
+    memset(ptr, 0, t_size);
+
+    // init all pointers
+    cg->prof         = (ggml_profile_data *)    ptr; ptr += sizeof(ggml_profile_data);
+    cg->prof->output = (ggml_profile_output *)  ptr; ptr += sizeof(ggml_profile_output);
+    cg->prof->timing = (ggml_profile_timing **) ptr; ptr += pvec_size;
+    for (int i=0; i < cg->n_nodes; i++) {
+        cg->prof->timing[i] = (struct ggml_profile_timing *) ptr; ptr += node_size;
+    }
+
+    // init the output
+    ggml_profile_output *out = cg->prof->output;
+    if (!strcmp("stderr", env) || !strcmp("1", env)) {
+        out->prefix = "ggml-profile:";
+        out->stream = stderr;
+    } else {
+        out->prefix = "";
+        out->stream = fopen(env, "w");
+    }
+
+}
+
+extern "C" void ggml_graph_profile_start(struct ggml_cgraph *cg, int n_threads)
+{
+    if (!cg->prof) { ggml_graph_profile_init(cg, n_threads); }
+    if (!cg->prof) { return; }
+}
+
+static inline int ggml_profile_format_tensor_dims(char *str, struct ggml_tensor *t)
+{
+    return sprintf(str, "%d:%d:%d:%d",
+        (int) t->ne[0], (int) t->ne[1], (int) t->ne[3], (int) t->ne[3]);
+}
+
+static inline void ggml_profile_format_op_dims(char *str, struct ggml_tensor *t)
+{
+    char *p = str;
+
+    // append src0 and src1 (if any)
+    if (t->src[0]) {
+       p += ggml_profile_format_tensor_dims(p, t->src[0]);
+
+       for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
+           p += sprintf(p, " x ");
+           p += ggml_profile_format_tensor_dims(p, t->src[i]);
+       }
+
+       p += sprintf(p, " -> ");
+    }
+
+    // format self dims separately for better visual alignment
+    char self[64];
+    ggml_profile_format_tensor_dims(self, t);
+
+    p += sprintf(p, "%12s", self);
+}
+
+static inline void ggml_profile_format_op_types(char *str, struct ggml_tensor *t)
+{
+    char *p = str;
+
+    // append src0 and src1 (if any)
+    if (t->src[0]) {
+       p += sprintf(p, "%s", ggml_type_name(t->src[0]->type));
+
+       for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
+           p += sprintf(p, " x ");
+           p += sprintf(p, "%s", ggml_type_name(t->src[i]->type));
+       }
+
+       p += sprintf(p, " -> ");
+    }
+
+    p += sprintf(p, "%3s", ggml_type_name(t->type));
+}
+
+extern "C" void ggml_graph_profile_finish(struct ggml_cgraph *cg, int n_threads)
+{
+    if (!cg->prof) { return; }
+
+    ggml_profile_output *out = cg->prof->output;
+
+    fprintf(out->stream, "%s| node idx | op name | proc (nsec) | sync (nsec) | total (nsec) | op dims | op types | tensor name |\n", out->prefix);
+    fprintf(out->stream, "%s| -------: | :------ | ----------: | ----------: | -----------: | ------: | -------: | ----------: |\n", out->prefix);
+
+    char dims[64 * GGML_MAX_SRC];
+    char types[16 * GGML_MAX_SRC];
+
+    for (int i = 0; i < cg->n_nodes; i++) {
+        uint64_t p_nsec = 0;
+        uint64_t s_nsec = 0;
+        uint64_t t_nsec = 0;
+
+        // add up per thread counters and reset them
+        for (int t=0; t < n_threads; t++) {
+            ggml_profile_timing &timing = cg->prof->timing[i][t];
+
+            p_nsec += timing.nsec[GGML_PROF_OP_SYNC] - timing.nsec[GGML_PROF_OP_START];
+            s_nsec += timing.nsec[GGML_PROF_OP_END]  - timing.nsec[GGML_PROF_OP_SYNC];
+            t_nsec += timing.nsec[GGML_PROF_OP_END]  - timing.nsec[GGML_PROF_OP_START];
+
+            timing.nsec[GGML_PROF_OP_START] = 0;
+            timing.nsec[GGML_PROF_OP_SYNC]  = 0;
+            timing.nsec[GGML_PROF_OP_END]   = 0;
+        }
+
+        ggml_profile_format_op_dims(dims, cg->nodes[i]);
+        ggml_profile_format_op_types(types, cg->nodes[i]);
+
+        fprintf(out->stream, "%s| %04d | %10s | %10lu | %10lu | %10lu | %46s | %22s | %20s |\n", out->prefix,
+            i, ggml_op_name(cg->nodes[i]->op),
+            (unsigned long) p_nsec, (unsigned long) s_nsec, (unsigned long) t_nsec,
+            dims, types, cg->nodes[i]->name);
+    }
+    fprintf(out->stream, "%s   \n", out->prefix); // empty line to split tables
+}
+
+extern "C" void ggml_graph_profile_free(struct ggml_cgraph *cg)
+{
+    if (!cg->prof) { return; }
+
+    ggml_profile_output *out = cg->prof->output;
+    if (out->stream != stderr) {
+        fclose(out->stream);
+    }
+
+    free(cg->prof); cg->prof = nullptr;
+}
+
+extern "C" void ggml_graph_profile_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith)
+{
+    if (!cg->prof) { return; }
+
+    using clock = std::chrono::high_resolution_clock;
+
+    ggml_profile_timing &timing = cg->prof->timing[node_n][ith];
+    timing.nsec[e] = std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
+}
+
+#endif // GGML_GRAPH_PROFILER
diff --git a/ggml/src/ggml-profile.h b/ggml/src/ggml-profile.h
@@ -0,0 +1,90 @@
+#pragma once
+
+#include "ggml-impl.h"
+
+// GGML internal header
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// op profile events & timing (per op / per thread)
+enum ggml_profile_event {
+    GGML_PROF_OP_START,
+    GGML_PROF_OP_SYNC,
+    GGML_PROF_OP_END
+};
+
+struct ggml_profile_timing {
+    uint64_t nsec[GGML_PROF_OP_END + 1]; // event times in nsec
+};
+
+struct ggml_profile_output;
+
+struct ggml_profile_data {
+    struct ggml_profile_output *output;
+    struct ggml_profile_timing ** timing; // per op / per thread timing
+};
+
+// check if profiling is enabled for this graph
+static inline bool ggml_graph_profile_enabled(const struct ggml_cgraph *cg)
+{
+    return cg->prof != NULL;
+}
+
+// get pointer to the timing data for specific node / thread
+// can be used by the backends to populate data collected internally
+static inline struct ggml_profile_timing * ggml_graph_profile_timing(const struct ggml_cgraph *cg, int node_n, int ith)
+{
+    if (!cg->prof) { return NULL; }
+    return &cg->prof->timing[node_n][ith];
+}
+
+#ifndef GGML_GRAPH_PROFILER
+
+// Stub out all profiler functions
+
+static inline void ggml_graph_profile_init(struct ggml_cgraph *cg, int n_threads)
+{
+    GGML_UNUSED(cg);
+    GGML_UNUSED(n_threads);
+}
+
+static inline void ggml_graph_profile_start(struct ggml_cgraph *cg, int n_threads)
+{
+    GGML_UNUSED(cg);
+    GGML_UNUSED(n_threads);
+}
+
+static inline void ggml_graph_profile_finish(struct ggml_cgraph *cg, int n_threads)
+{
+    GGML_UNUSED(cg);
+    GGML_UNUSED(n_threads);
+}
+
+static inline void ggml_graph_profile_free(struct ggml_cgraph *cg)
+{
+    GGML_UNUSED(cg);
+}
+
+static inline void ggml_graph_profile_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith)
+{
+    GGML_UNUSED(cg);
+    GGML_UNUSED(e);
+    GGML_UNUSED(node_n);
+    GGML_UNUSED(ith);
+}
+
+#else
+
+void ggml_graph_profile_init(struct ggml_cgraph *cg, int n_threads);
+void ggml_graph_profile_start(struct ggml_cgraph *cg, int n_threads);
+void ggml_graph_profile_finish(struct ggml_cgraph *cg, int n_threads);
+void ggml_graph_profile_free(struct ggml_cgraph *cg);
+void ggml_graph_profile_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith);
+
+#endif // GGML_GRAPH_PROFILER
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
@@ -9,6 +9,7 @@
 
 // FIXME: required here for quantization functions
 #include "ggml-quants.h"
+#include "ggml-profile.h"
 
 #ifdef GGML_USE_CPU_HBM
 #include <hbwmalloc.h>
@@ -6012,6 +6013,7 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
         /*.grads        =*/ grads_ptr,
         /*.grad_accs    =*/ grad_accs_ptr,
         /*.leafs        =*/ leafs_ptr,
+        /*.prof         =*/ NULL,
         /*.hash_table   =*/ { hash_size, hash_used, hash_keys_ptr },
         /*.order        =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
     };
@@ -6038,6 +6040,7 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1)
         /*.grads            =*/ NULL, // gradients would need visited_hash_set
         /*.grad_accs        =*/ NULL,
         /*.leafs            =*/ NULL,
+        /*.prof             =*/ NULL,
         /*.visited_hash_set =*/ { 0, NULL, NULL },
         /*.order            =*/ cgraph0->order,
     };