Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ggml/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ option(GGML_CCACHE "ggml: use ccache if available" ON)
option(GGML_ALL_WARNINGS "ggml: enable all compiler warnings" ON)
option(GGML_ALL_WARNINGS_3RD_PARTY "ggml: enable all compiler warnings in 3rd party libs" OFF)
option(GGML_GPROF "ggml: enable gprof" OFF)
option(GGML_GRAPH_PROFILER "ggml: enable internal Graph and Op profiler" OFF)

# build
option(GGML_FATAL_WARNINGS "ggml: enable -Werror flag" OFF)
Expand Down
5 changes: 5 additions & 0 deletions ggml/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ if (CMAKE_SYSTEM_NAME MATCHES "Linux")
add_compile_definitions($<$<CONFIG:Debug>:_GLIBCXX_ASSERTIONS>)
endif()

if (GGML_GRAPH_PROFILER)
add_compile_definitions(GGML_GRAPH_PROFILER)
endif()

if (NOT MSVC)
if (GGML_SANITIZE_THREAD)
add_compile_options(-fsanitize=thread)
Expand Down Expand Up @@ -1313,6 +1317,7 @@ add_library(ggml
ggml-backend.c
ggml-quants.c
ggml-quants.h
ggml-profile.cpp
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
${GGML_SOURCES_RPC} ${GGML_HEADERS_RPC}
Expand Down
4 changes: 4 additions & 0 deletions ggml/src/ggml-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,8 @@ enum ggml_cgraph_eval_order {
GGML_CGRAPH_EVAL_ORDER_COUNT
};

struct ggml_profile_data;

struct ggml_cgraph {
int size;
int n_nodes;
Expand All @@ -174,6 +176,8 @@ struct ggml_cgraph {
struct ggml_tensor ** grads;
struct ggml_tensor ** leafs;

struct ggml_profile_data * prof;

struct ggml_hash_set visited_hash_set;

enum ggml_cgraph_eval_order order;
Expand Down
176 changes: 176 additions & 0 deletions ggml/src/ggml-profile.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
#include "ggml-profile.h"

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // strcmp, memset — used below but previously missing

#include <chrono>
#include <string>

#ifdef GGML_GRAPH_PROFILER

// Destination for profiler output: a per-line prefix plus the stream the
// timing table is written to (stderr, or a file named by GGML_GRAPH_PROFILE).
struct ggml_profile_output {
    const char * prefix;  // prepended to every emitted line (tags stderr output)
    FILE * stream;        // output destination; stderr is never fclose()d
};

// Allocate and initialize profiling state for graph cg.
// Profiling is opt-in via the GGML_GRAPH_PROFILE env var:
//   "stderr" or "1" -> write to stderr; any other value -> treated as a filename.
// On any failure cg->prof stays/returns to a state where profiling is disabled
// or falls back to stderr — it never leaves a NULL output stream behind.
extern "C" void ggml_graph_profile_init(struct ggml_cgraph *cg, int n_threads)
{
    // TODO: make this a param
    const char *env = getenv("GGML_GRAPH_PROFILE");
    if (!env) { return; }

    // The number of threads may change between passes (pp vs tg).
    // Allocate for max_n_threads for simplicity for now.
    // TODO: use aligned allocator

    size_t node_size = sizeof(struct ggml_profile_timing) * GGML_MAX_N_THREADS;
    size_t pvec_size = sizeof(std::intptr_t) * cg->n_nodes;
    size_t time_size = node_size * cg->n_nodes;
    size_t t_size    = pvec_size + time_size + sizeof(ggml_profile_output) + sizeof(ggml_profile_data);

    // single allocation: prof header + output struct + per-node pointer vector + timing slabs
    uint8_t * ptr = (uint8_t *) malloc(t_size);
    if (!ptr) {
        fprintf(stderr, "ggml-profile: failed to allocate profiling data : n_threads %d n_nodes %d\n", n_threads, cg->n_nodes);
        return;
    }
    memset(ptr, 0, t_size);

    // carve the allocation up; all sub-pointers stay inside the one malloc block
    cg->prof = (ggml_profile_data *) ptr; ptr += sizeof(ggml_profile_data);
    cg->prof->output = (ggml_profile_output *) ptr; ptr += sizeof(ggml_profile_output);
    cg->prof->timing = (ggml_profile_timing **) ptr; ptr += pvec_size;
    for (int i = 0; i < cg->n_nodes; i++) {
        cg->prof->timing[i] = (struct ggml_profile_timing *) ptr; ptr += node_size;
    }

    // init the output
    ggml_profile_output *out = cg->prof->output;
    if (!strcmp("stderr", env) || !strcmp("1", env)) {
        out->prefix = "ggml-profile:";
        out->stream = stderr;
    } else {
        out->prefix = "";
        out->stream = fopen(env, "w");
        if (!out->stream) {
            // FIX: fopen failure was previously ignored, leaving a NULL stream
            // that ggml_graph_profile_finish()/free() would hand to fprintf/fclose (UB).
            fprintf(stderr, "ggml-profile: failed to open '%s' for writing, falling back to stderr\n", env);
            out->prefix = "ggml-profile:";
            out->stream = stderr;
        }
    }
}

extern "C" void ggml_graph_profile_start(struct ggml_cgraph *cg, int n_threads)
{
if (!cg->prof) { ggml_graph_profile_init(cg, n_threads); }
if (!cg->prof) { return; }
}

// Format all four tensor dimensions as "ne0:ne1:ne2:ne3" into str.
// Returns the number of characters written (excluding the NUL), like sprintf.
// Dims are cast down from int64_t to int for brevity — fine for profiling output.
static inline int ggml_profile_format_tensor_dims(char *str, struct ggml_tensor *t)
{
    // FIX: was "t->ne[3], t->ne[3]" — dimension 2 was never printed and
    // dimension 3 appeared twice.
    return sprintf(str, "%d:%d:%d:%d",
        (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]);
}

// Format an op's dimensions as "src0 x src1 ... -> dst" into str.
// Sources are optional; a node with no src[0] prints only its own dims.
static inline void ggml_profile_format_op_dims(char *str, struct ggml_tensor *t)
{
    char *dst = str;

    // append the source dims first (if any), separated by " x "
    if (t->src[0] != NULL) {
        dst += ggml_profile_format_tensor_dims(dst, t->src[0]);

        for (int s = 1; s < GGML_MAX_SRC; s++) {
            if (t->src[s] == NULL) { break; }
            dst += sprintf(dst, " x ");
            dst += ggml_profile_format_tensor_dims(dst, t->src[s]);
        }

        dst += sprintf(dst, " -> ");
    }

    // the node's own dims go through a scratch buffer so they can be
    // right-aligned for better visual alignment in the table
    char own[64];
    ggml_profile_format_tensor_dims(own, t);

    dst += sprintf(dst, "%12s", own);
}

// Format an op's operand types as "src0 x src1 ... -> dst" into str.
// Sources are optional; a node with no src[0] prints only its own type.
static inline void ggml_profile_format_op_types(char *str, struct ggml_tensor *t)
{
    char *dst = str;

    // append the source types first (if any), separated by " x "
    if (t->src[0] != NULL) {
        dst += sprintf(dst, "%s", ggml_type_name(t->src[0]->type));

        for (int s = 1; s < GGML_MAX_SRC; s++) {
            if (t->src[s] == NULL) { break; }
            dst += sprintf(dst, " x %s", ggml_type_name(t->src[s]->type));
        }

        dst += sprintf(dst, " -> ");
    }

    dst += sprintf(dst, "%3s", ggml_type_name(t->type));
}

// Emit the per-node timing table (markdown layout) to the configured output
// stream and reset the per-thread counters so the same graph can be profiled
// again on the next compute pass.
extern "C" void ggml_graph_profile_finish(struct ggml_cgraph *cg, int n_threads)
{
    if (!cg->prof) { return; }  // profiling disabled for this graph

    ggml_profile_output *out = cg->prof->output;

    // markdown table header; out->prefix tags each line when writing to stderr
    fprintf(out->stream, "%s| node idx | op name | proc (nsec) | sync (nsec) | total (nsec) | op dims | op types | tensor name |\n", out->prefix);
    fprintf(out->stream, "%s| -------: | :------ | ----------: | ----------: | -----------: | ------: | -------: | ----------: |\n", out->prefix);

    // scratch buffers reused across nodes; sized for up to GGML_MAX_SRC operands
    char dims[64 * GGML_MAX_SRC];
    char types[16 * GGML_MAX_SRC];

    for (int i = 0; i < cg->n_nodes; i++) {
        uint64_t p_nsec = 0; // processing: START..SYNC (time in ggml_compute_forward)
        uint64_t s_nsec = 0; // sync: SYNC..END (time waiting at the thread barrier)
        uint64_t t_nsec = 0; // total: START..END

        // add up per thread counters and reset them
        for (int t=0; t < n_threads; t++) {
            ggml_profile_timing &timing = cg->prof->timing[i][t];

            p_nsec += timing.nsec[GGML_PROF_OP_SYNC] - timing.nsec[GGML_PROF_OP_START];
            s_nsec += timing.nsec[GGML_PROF_OP_END] - timing.nsec[GGML_PROF_OP_SYNC];
            t_nsec += timing.nsec[GGML_PROF_OP_END] - timing.nsec[GGML_PROF_OP_START];

            timing.nsec[GGML_PROF_OP_START] = 0;
            timing.nsec[GGML_PROF_OP_SYNC] = 0;
            timing.nsec[GGML_PROF_OP_END] = 0;
        }

        ggml_profile_format_op_dims(dims, cg->nodes[i]);
        ggml_profile_format_op_types(types, cg->nodes[i]);

        // counters are cast to unsigned long to match the portable %lu specifier
        fprintf(out->stream, "%s| %04d | %10s | %10lu | %10lu | %10lu | %46s | %22s | %20s |\n", out->prefix,
            i, ggml_op_name(cg->nodes[i]->op),
            (unsigned long) p_nsec, (unsigned long) s_nsec, (unsigned long) t_nsec,
            dims, types, cg->nodes[i]->name);
    }
    fprintf(out->stream, "%s \n", out->prefix); // empty line to split tables
}

// Release profiling state for graph cg: close the output file (if we opened
// one) and free the single allocation backing all profiling data.
extern "C" void ggml_graph_profile_free(struct ggml_cgraph *cg)
{
    if (!cg->prof) { return; }

    ggml_profile_output *out = cg->prof->output;
    // FIX: guard against a NULL stream (fopen failure in init) — fclose(NULL)
    // is undefined behavior. stderr is never closed.
    if (out->stream && out->stream != stderr) {
        fclose(out->stream);
    }

    // output struct and timing arrays live inside the cg->prof allocation,
    // so one free() releases everything
    free(cg->prof); cg->prof = nullptr;
}

// Record a nanosecond timestamp for event e on node node_n from thread ith.
// No-op when profiling is disabled for this graph.
extern "C" void ggml_graph_profile_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith)
{
    if (cg->prof == NULL) { return; }

    const auto now = std::chrono::high_resolution_clock::now();
    const uint64_t ns = std::chrono::duration_cast<std::chrono::nanoseconds>(now.time_since_epoch()).count();

    cg->prof->timing[node_n][ith].nsec[e] = ns;
}

#endif // GGML_GRAPH_PROFILER
90 changes: 90 additions & 0 deletions ggml/src/ggml-profile.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#pragma once

#include "ggml-impl.h"

// GGML internal header

#ifdef __cplusplus
extern "C" {
#endif

// op profile events & timing (per op / per thread)
// Three timestamps are recorded per op per thread:
//   START -> just before ggml_compute_forward
//   SYNC  -> after compute, before the thread barrier
//   END   -> after the barrier
enum ggml_profile_event {
    GGML_PROF_OP_START,
    GGML_PROF_OP_SYNC,
    GGML_PROF_OP_END
};

struct ggml_profile_timing {
    uint64_t nsec[GGML_PROF_OP_END + 1]; // event times in nsec
};

// opaque here; defined in ggml-profile.cpp (output prefix + FILE stream)
struct ggml_profile_output;

// all profiling state for one graph; allocated as a single block by
// ggml_graph_profile_init and freed by ggml_graph_profile_free
struct ggml_profile_data {
    struct ggml_profile_output *output;
    struct ggml_profile_timing ** timing; // per op / per thread timing
};

// check if profiling is enabled for this graph
// (true once ggml_graph_profile_init has allocated cg->prof)
static inline bool ggml_graph_profile_enabled(const struct ggml_cgraph *cg)
{
    return (cg->prof == NULL) ? false : true;
}

// get pointer to the timing data for specific node / thread
// can be used by the backends to populate data collected internally
// returns NULL when profiling is disabled for this graph
static inline struct ggml_profile_timing * ggml_graph_profile_timing(const struct ggml_cgraph *cg, int node_n, int ith)
{
    return cg->prof ? &cg->prof->timing[node_n][ith] : NULL;
}

#ifndef GGML_GRAPH_PROFILER

// Stub out all profiler functions
// When GGML_GRAPH_PROFILER is not defined these no-op inlines let call sites
// in ggml.c compile unchanged and optimize away to zero overhead.

static inline void ggml_graph_profile_init(struct ggml_cgraph *cg, int n_threads)
{
    GGML_UNUSED(cg);
    GGML_UNUSED(n_threads);
}

static inline void ggml_graph_profile_start(struct ggml_cgraph *cg, int n_threads)
{
    GGML_UNUSED(cg);
    GGML_UNUSED(n_threads);
}

static inline void ggml_graph_profile_finish(struct ggml_cgraph *cg, int n_threads)
{
    GGML_UNUSED(cg);
    GGML_UNUSED(n_threads);
}

static inline void ggml_graph_profile_free(struct ggml_cgraph *cg)
{
    GGML_UNUSED(cg);
}

static inline void ggml_graph_profile_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith)
{
    GGML_UNUSED(cg);
    GGML_UNUSED(e);
    GGML_UNUSED(node_n);
    GGML_UNUSED(ith);
}

#else

// Real implementations (ggml-profile.cpp), active when GGML_GRAPH_PROFILER is defined.

// allocate per-graph profiling state; no-op unless the GGML_GRAPH_PROFILE env var is set
void ggml_graph_profile_init(struct ggml_cgraph *cg, int n_threads);
// lazy-init hook called at the start of each graph compute pass
void ggml_graph_profile_start(struct ggml_cgraph *cg, int n_threads);
// write the timing table to the output stream and reset per-thread counters
void ggml_graph_profile_finish(struct ggml_cgraph *cg, int n_threads);
// close the output stream (if not stderr) and free profiling state
void ggml_graph_profile_free(struct ggml_cgraph *cg);
// record a timestamp for event e on node node_n from thread ith
void ggml_graph_profile_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith);

#endif // GGML_GRAPH_PROFILER

#ifdef __cplusplus
}
#endif
18 changes: 18 additions & 0 deletions ggml/src/ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "ggml-quants.h"
#include "ggml.h"
#include "ggml-aarch64.h"
#include "ggml-profile.h"

#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
Expand Down Expand Up @@ -19355,6 +19356,7 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
/*.nodes =*/ nodes_ptr,
/*.grads =*/ grads_ptr,
/*.leafs =*/ leafs_ptr,
/*.prof =*/ NULL,
/*.hash_table =*/ { hash_size, hash_used, hash_keys_ptr },
/*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
};
Expand All @@ -19376,6 +19378,7 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1)
/*.nodes =*/ cgraph0->nodes + i0,
/*.grads =*/ cgraph0->grads ? cgraph0->grads + i0 : NULL,
/*.leafs =*/ NULL,
/*.prof =*/ NULL,
/*.hash_table =*/ { 0, NULL, NULL },
/*.order =*/ cgraph0->order,
};
Expand Down Expand Up @@ -20229,6 +20232,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
for (int node_n = 0; node_n < cgraph->n_nodes && !tp->abort; node_n++) {
struct ggml_tensor * node = cgraph->nodes[node_n];

ggml_graph_profile_event(cgraph, GGML_PROF_OP_START, node_n, state->ith);

ggml_compute_forward(&params, node);

if (state->ith == 0 && cplan->abort_callback &&
Expand All @@ -20237,6 +20242,15 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
tp->ec = GGML_STATUS_ABORTED;
}

ggml_graph_profile_event(cgraph, GGML_PROF_OP_SYNC, node_n, state->ith);

ggml_barrier(state->threadpool);

ggml_graph_profile_event(cgraph, GGML_PROF_OP_END, node_n, state->ith);
}

if (ggml_graph_profile_enabled(cgraph)) {
// need another barrier to flush the last timing update
ggml_barrier(state->threadpool);
}

Expand Down Expand Up @@ -20510,6 +20524,8 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
threadpool->ec = GGML_STATUS_SUCCESS;
}

ggml_graph_profile_start(cgraph, n_threads);

#ifdef GGML_USE_OPENMP
if (n_threads > 1) {
#pragma omp parallel num_threads(n_threads)
Expand Down Expand Up @@ -20540,6 +20556,8 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
ggml_graph_compute_thread(&threadpool->workers[0]);
#endif

ggml_graph_profile_finish(cgraph, n_threads);

// don't leave affinity set on the main thread
clear_numa_thread_affinity();

Expand Down
Loading