
Commit 40bfeea

Merge pull request #61 from tsisw/llama-cpp-new-release
@FIR-1006 - GGML: PERF changes with the following options
2 parents 8cbd5a4 + 09a3864 commit 40bfeea

8 files changed: +54 additions, −37 deletions


ggml/include/ggml.h

Lines changed: 6 additions & 6 deletions

@@ -343,13 +343,13 @@ extern "C" {
     GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4)
     GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);

-#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
+#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
     enum ggml_compute_backend_type {
         GGML_COMPUTE_BACKEND_CPU=0,
         GGML_COMPUTE_BACKEND_TSAVORITE,
         GGML_COMPUTE_BACKEND_COUNT
     };
-#endif /* GGML_PERF || GGML_PERF_DETAIL */
+#endif /* GGML_PERF-related flags */

     enum ggml_status {
         GGML_STATUS_ALLOC_FAILED = -2,

@@ -660,14 +660,14 @@ extern "C" {

         void * extra; // extra things e.g. for ggml-cuda.cu

-#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
+#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
         int64_t perf_runs;
         int64_t perf_time_us;
         enum ggml_compute_backend_type ggml_compute_backend;
         char padding[4];
 #else
         char padding[8];
-#endif /* GGML_PERF || GGML_PERF_DETAIL */
+#endif /* GGML_PERF-related flags */
     };

     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);

@@ -2557,7 +2557,7 @@ extern "C" {
     GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
     GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);

-#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
+#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
     struct ggml_perf_backend_subtotals {
         int64_t total_us;
         int64_t runs;

@@ -2587,7 +2587,7 @@ void ggml_perf_write_detailed_csv(struct ggml_cgraph * cgraph, FILE *fp);
 void ggml_perf_accumulate(struct ggml_perf_totals totals[GGML_OP_COUNT], struct ggml_cgraph * cgraph);
 const char * ggml_backend_type(enum ggml_compute_backend_type backend);

-#endif /* GGML_PERF || GGML_PERF_DETAIL */
+#endif /* GGML_PERF-related flags */

 #ifdef __cplusplus
 }
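With these guards, the per-tensor counters and the ggml_backend_type() helper are only compiled in when one of the GGML_PERF family of flags is defined. As a minimal caller-side sketch (hypothetical — print_node_perf is illustrative and not part of this commit), reading the new fields might look like:

    #include <stdio.h>
    #include "ggml.h"

    #if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
    // Print one graph node's average runtime, using the fields and the
    // ggml_backend_type() helper that this commit gates behind the perf flags.
    static void print_node_perf(const struct ggml_tensor * node) {
        if (node->perf_runs > 0) {
            printf("%-16s %s: %lld runs, avg %.1f us\n",
                   ggml_op_name(node->op),
                   ggml_backend_type(node->ggml_compute_backend),
                   (long long) node->perf_runs,
                   (double) node->perf_time_us / (double) node->perf_runs);
        }
    }
    #endif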

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 4 additions & 4 deletions

@@ -2879,12 +2879,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
         struct ggml_tensor * node = cgraph->nodes[node_n];

-#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
+#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
         int64_t t_start = ggml_time_us();
-#endif /* GGML_PERF || GGML_PERF_DETAIL */
+#endif /* GGML_PERF-related flags */
         ggml_compute_forward(&params, node);

-#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
+#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
         int64_t t_end = ggml_time_us();
         node->perf_runs++;
         if (t_end >= t_start) {

@@ -2893,7 +2893,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             // Handle wraparound by assuming timer rolls over at max int64_t value
             node->perf_time_us += (INT64_MAX - t_start + t_end + 1);
         }
-#endif /* GGML_PERF || GGML_PERF_DETAIL */
+#endif /* GGML_PERF-related flags */
         if (state->ith == 0 && cplan->abort_callback &&
             cplan->abort_callback(cplan->abort_callback_data)) {
             atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed);
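The timing update above handles int64 timer rollover explicitly. Factored out, the logic amounts to the following (a sketch; elapsed_us is not a function introduced by this commit):

    #include <stdint.h>

    // Wraparound-safe elapsed time, assuming the microsecond timer
    // rolls over at INT64_MAX -- the same assumption as the code above.
    static inline int64_t elapsed_us(int64_t t_start, int64_t t_end) {
        if (t_end >= t_start) {
            return t_end - t_start;                 // common case: no rollover
        }
        return (INT64_MAX - t_start) + t_end + 1;   // timer wrapped once
    }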

ggml/src/ggml-tsavorite/ggml-tsavorite.cpp

Lines changed: 4 additions & 4 deletions

@@ -929,9 +929,9 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,

     for (int i = 0; i < cgraph->n_nodes; i++) {
         int32_t kernel_sub_type=-1;
-#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
+#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
         int64_t t_start = ggml_time_us();
-#endif /* GGML_PERF || GGML_PERF_DETAIL */
+#endif /* GGML_PERF-related flags */
         node = cgraph->nodes[i];
         src0 = node->src[0];
         src1 = node->src[1];

@@ -1279,7 +1279,7 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
                 device->stats.op_run_count[kernel_type].max_num_of_elem < max_num_of_elem)
                 device->stats.op_run_count[kernel_type].max_num_of_elem = max_num_of_elem;
         }
-#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
+#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
         int64_t t_end = ggml_time_us();
         node->perf_runs++;
         node->ggml_compute_backend = GGML_COMPUTE_BACKEND_TSAVORITE;

@@ -1289,7 +1289,7 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
             // Handle wraparound by assuming timer rolls over at max int64_t value
             node->perf_time_us += (INT64_MAX - t_start + t_end + 1);
         }
-#endif /* GGML_PERF || GGML_PERF_DETAIL */
+#endif /* GGML_PERF-related flags */
     }

     // This needs to be implemented correctly when we have a mixture of CPU and accelerator operations

ggml/src/ggml.c

Lines changed: 8 additions & 8 deletions

@@ -1020,12 +1020,12 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "GLU",
 };

-#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
+#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
 static const char * GGML_BACKEND_TYPE[GGML_COMPUTE_BACKEND_COUNT] = {
     "CPU",
     "OPU"
 };
-#endif /* GGML_PERF || GGML_PERF_DETAIL */
+#endif /* GGML_PERF-related flags */

 static_assert(GGML_OP_COUNT == 90, "GGML_OP_COUNT != 90");

@@ -1262,11 +1262,11 @@ const char * ggml_op_name(enum ggml_op op) {
     return GGML_OP_NAME[op];
 }

-#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
+#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
 const char * ggml_backend_type(enum ggml_compute_backend_type backend) {
     return GGML_BACKEND_TYPE[backend];
 }
-#endif /* GGML_PERF || GGML_PERF_DETAIL */
+#endif /* GGML_PERF-related flags */

 const char * ggml_op_symbol(enum ggml_op op) {
     return GGML_OP_SYMBOL[op];

@@ -1692,11 +1692,11 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         /*.data         =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data,
         /*.name         =*/ { 0 },
         /*.extra        =*/ NULL,
-#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
+#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
         /*.perf_runs    =*/ 0,
         /*.perf_time_us =*/ 0,
         /*.ggml_compute_backend =*/ GGML_COMPUTE_BACKEND_CPU,
-#endif /* GGML_PERF || GGML_PERF_DETAIL */
+#endif /* GGML_PERF-related flags */
         /*.padding      =*/ { 0 },
     };

@@ -7231,7 +7231,7 @@ bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, cons
     return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
 }

-#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
+#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
 void ggml_perf_accumulate(struct ggml_perf_totals totals[GGML_OP_COUNT], struct ggml_cgraph * cgraph) {
     for (int i = 0; i < cgraph->n_nodes; ++i) {
         struct ggml_tensor * node = cgraph->nodes[i];

@@ -7258,7 +7258,7 @@ void ggml_perf_accumulate(struct ggml_perf_totals totals[GGML_OP_COUNT], struct
         }
     }
 }
-#endif /* GGML_PERF || GGML_PERF_DETAIL */
+#endif /* GGML_PERF-related flags */

 #if defined(GGML_PERF_DETAIL)
 FILE * ggml_perf_log_open(const char *filename) {
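For context, ggml_perf_accumulate() folds each node's counters into per-op running totals after a graph run. A simplified sketch of that shape (the full ggml_perf_totals layout is not shown in this diff, so this uses the ggml_perf_backend_subtotals fields declared in ggml.h above and is illustrative, not the actual implementation):

    // Illustrative only: fold one computed graph into per-op subtotals.
    void accumulate_sketch(struct ggml_perf_backend_subtotals totals[GGML_OP_COUNT],
                           struct ggml_cgraph * cgraph) {
        for (int i = 0; i < cgraph->n_nodes; ++i) {
            struct ggml_tensor * node = cgraph->nodes[i];
            totals[node->op].total_us += node->perf_time_us;
            totals[node->op].runs     += node->perf_runs;
        }
    }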

src/llama-context.cpp

Lines changed: 7 additions & 7 deletions

@@ -1090,12 +1090,12 @@ int llama_context::decode(const llama_batch & batch_inp) {
         ggml_status status;
         const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status);

-#if defined(GGML_PERF)
+#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE)
         ggml_perf_accumulate(perf_totals, res->get_gf());
 #elif defined(GGML_PERF_DETAIL)
         ggml_perf_accumulate(perf_totals, res->get_gf());
         ggml_perf_write_detailed_csv(res->get_gf(), perf_all_shape_fp);
-#endif /* GGML_PERF || GGML_PERF_DETAIL */
+#endif /* GGML_PERF-related flags */

         if (!res) {

@@ -2759,7 +2759,7 @@ llama_perf_context_data llama_perf_context(const llama_context * ctx) {
 }

-#if defined(GGML_PERF)
+#if defined(GGML_PERF_RELEASE)
 void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
     LLAMA_LOG_TSAVORITE("\n=== GGML Perf Summary ===\n");
     LLAMA_LOG_TSAVORITE(" %-16s %7s %14s %16s\n", "Op", "Runs", "Total us", "Avg us");

@@ -2788,7 +2788,7 @@ void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
     }
 }

-#elif defined(GGML_PERF_DETAIL)
+#elif defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
 void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
     LLAMA_LOG_TSAVORITE("\n=== GGML Perf Summary ===\n");
     LLAMA_LOG_TSAVORITE(" %-16s %-8s %7s %14s %16s\n", "Op", "Target", "Runs", "Total us", "Avg us");

@@ -2838,7 +2838,7 @@ void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
         }
     }
 }
-#endif /* GGML_PERF || GGML_PERF_DETAI */
+#endif /* GGML_PERF-related flags */

 void llama_perf_context_print(const llama_context * ctx) {

@@ -2852,7 +2852,7 @@ void llama_perf_context_print(const llama_context * ctx) {
         __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
     LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));

-#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
+#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
     LLAMA_LOG_TSAVORITE("\n%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
     LLAMA_LOG_TSAVORITE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
         __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);

@@ -2861,7 +2861,7 @@ void llama_perf_context_print(const llama_context * ctx) {
     LLAMA_LOG_TSAVORITE("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));

     ggml_perf_print_totals(const_cast<ggml_perf_totals *>(ctx->perf_totals));
-#endif /* GGML_PERF || GGML_PERF_DETAIL */
+#endif /* GGML_PERF-related flags */
 }

 void llama_perf_context_reset(llama_context * ctx) {

src/llama-context.h

Lines changed: 2 additions & 2 deletions

@@ -198,9 +198,9 @@ struct llama_context {

     // reserve a graph with a dummy ubatch of the specified size
     ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false);
-#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
+#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
     struct ggml_perf_totals perf_totals[GGML_OP_COUNT] = {}; // add this to llama_context
-#endif /* GGML_PERF || GGML_PERF_DETAIL */
+#endif /* GGML_PERF-related flags */

 private:
     llm_graph_params graph_params(

tools/main/main.cpp

Lines changed: 2 additions & 2 deletions

@@ -126,9 +126,9 @@ int main(int argc, char ** argv) {
         LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
     }

-#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
+#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
     llama_log_set(my_logger, nullptr);
-#endif /* GGML_PERF || GGML_PERF_DETAIL */
+#endif /* GGML_PERF-related flags */
     LOG_INF("%s: llama backend init\n", __func__);

     llama_backend_init();
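my_logger is defined elsewhere in main.cpp and not shown in this diff. A hypothetical callback matching the shape llama_log_set() expects (ggml_log_callback) could look like:

    #include <stdio.h>

    // Hypothetical logger; the real my_logger is defined elsewhere in main.cpp.
    static void my_logger(enum ggml_log_level level, const char * text, void * user_data) {
        (void) user_data;                    // unused, as in llama_log_set(my_logger, nullptr)
        if (level >= GGML_LOG_LEVEL_WARN) {  // e.g. keep only warnings and errors
            fputs(text, stderr);
        }
    }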

tsi-pkg-build.sh

Lines changed: 21 additions & 4 deletions

@@ -32,10 +32,18 @@ cd ../posix-kernel/

 cd ../../

-#Compile for posix with build-posix as a target folder
+#Compile for posix & fpga with build-posix as a target folder

 echo 'building llama.cpp, ggml for tsavorite and other binary for posix'
-cmake -B build-posix -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=posix -DCMAKE_C_FLAGS="-DGGML_PERF" -DCMAKE_CXX_FLAGS="-DGGML_PERF"
+if [ "$(echo "$1" | tr '[:upper:]' '[:lower:]')" = "release" ];
+then
+    cmake -B build-posix -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=posix -DCMAKE_C_FLAGS="-DGGML_PERF_RELEASE" -DCMAKE_CXX_FLAGS="-DGGML_PERF_RELEASE"
+elif [ "$(echo "$1" | tr '[:upper:]' '[:lower:]')" = "debug" ]; then
+    cmake -B build-posix -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=posix -DCMAKE_C_FLAGS="-DGGML_PERF_DETAIL" -DCMAKE_CXX_FLAGS="-DGGML_PERF_DETAIL"
+else
+    cmake -B build-posix -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=posix -DCMAKE_C_FLAGS="-DGGML_PERF" -DCMAKE_CXX_FLAGS="-DGGML_PERF"
+fi
+
 cmake --build build-posix --config Release

 # Fix GLIBC compatibility for TSI binaries

@@ -64,12 +72,21 @@ chmod +x build-posix/bin/llama-cli
 echo 'building llama.cpp, ggml for tsavorite and other binary for fpga'
 export CC="/proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/aarch64-none-linux-gnu-gcc"
 export CXX="/proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/aarch64-none-linux-gnu-g++"
-cmake -B build-fpga -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=fpga -DCMAKE_C_FLAGS="-DGGML_PERF" -DCMAKE_CXX_FLAGS="-DGGML_PERF"
+
+if [ "$(echo "$1" | tr '[:upper:]' '[:lower:]')" = "release" ];
+then
+    cmake -B build-fpga -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=fpga -DCMAKE_C_FLAGS="-DGGML_PERF_RELEASE" -DCMAKE_CXX_FLAGS="-DGGML_PERF_RELEASE"
+elif [ "$(echo "$1" | tr '[:upper:]' '[:lower:]')" = "debug" ]; then
+    cmake -B build-fpga -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=fpga -DCMAKE_C_FLAGS="-DGGML_PERF_DETAIL" -DCMAKE_CXX_FLAGS="-DGGML_PERF_DETAIL"
+else
+    cmake -B build-fpga -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=fpga -DCMAKE_C_FLAGS="-DGGML_PERF" -DCMAKE_CXX_FLAGS="-DGGML_PERF"
+fi
+
 cmake --build build-fpga --config Release


 echo 'creating tar bundle for fpga'
-TSI_GGML_VERSION=0.0.8
+TSI_GGML_VERSION=0.0.9
 TSI_GGML_BUNDLE_INSTALL_DIR=tsi-ggml
 GGML_TSI_INSTALL_DIR=ggml-tsi-kernel
 TSI_GGML_RELEASE_DIR=/proj/rel/sw/ggml
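Based on the argument handling above, the script would presumably be invoked as ./tsi-pkg-build.sh release for a GGML_PERF_RELEASE build, ./tsi-pkg-build.sh debug for a GGML_PERF_DETAIL build, and with no argument for the default GGML_PERF build (the argument is lowercased before comparison, so RELEASE and Debug also work).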
