
Commit 3f27066

check invalid values, add debug_hook function and gdb shell
1 parent 35cae5b commit 3f27066

File tree

2 files changed: +131 -0 lines changed


debug_check.gdb

Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
# 1) Set program arguments
set args -m ~/dev/llm/DeepSeek-R1-Distill-Qwen-1.5B-Q4_0-GGUF/deepseek-r1-distill-qwen-1.5b-q4_0.gguf -b 16 -ngl 0 -c 1024 -t 4 -p "Hello"

# 2) Redirect GDB output to a log file
set logging file gdb_output.log
set logging on

# 3) Place a breakpoint at the debug_hook() function in ggml-cpu.c
break ggml-cpu.c:debug_hook

# 4) Commands to execute once the breakpoint is hit
commands
  # Prevent GDB from printing its usual breakpoint messages
  silent

  # (a) Exit from debug_hook() and return to its caller.
  #     This should land in check_invalid_values(), right before 'return true;'.
  finish

  # (b) Now inside check_invalid_values(), print the variables of interest
  p *src0
  p (*src0).data
  x/128f (*src0).data

  # (c) To trigger only once, disable the breakpoint afterwards
  disable $bpnum

  # To keep hitting this breakpoint repeatedly, comment out the 'disable'
  # command above and uncomment the following 'continue' command:
  # continue
end

# 5) Automatically run the program (remove or comment out to run manually)
run
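
To use the script, pass it to GDB with -x and give the llama.cpp binary on the command line (the script sets arguments and a breakpoint, but does not load an executable itself). The binary name below is an assumption; any llama.cpp executable built with debug symbols will do:

    gdb -x debug_check.gdb ./llama-cli

Because of the 'set logging' lines, everything printed by the breakpoint commands is also captured in gdb_output.log.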

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 97 additions & 0 deletions
@@ -10217,13 +10217,53 @@ static void ggml_compute_forward_diag_mask_zero(
    }
}

__attribute__((noinline)) static void debug_hook(void) {
}

// ggml_compute_forward_soft_max
static bool check_invalid_values(const struct ggml_tensor * src0) {
    if (!src0) {
        printf("Error: src0 is NULL!\n");
        return false;
    }

    const int nc = src0->ne[0];      // number of columns
    const int nr = ggml_nrows(src0); // number of rows

    int nan_count = 0, inf_count = 0;

    // printf("Checking tensor for NaN/Inf values...\n");

    for (int i1 = 0; i1 < nr; i1++) {
        float * sp = (float *)((char *) src0->data + i1 * src0->nb[1]);

        for (int i = 0; i < nc; ++i) {
            if (isnan(sp[i])) {
                nan_count++;
                // printf("NaN detected at row %d, col %d (index %d)\n", i1, i, i1 * nc + i);
            } else if (isinf(sp[i])) {
                inf_count++;
                // printf("Inf detected at row %d, col %d (index %d)\n", i1, i, i1 * nc + i);
            }
        }
    }

    if (nan_count > 0 || inf_count > 0) {
        debug_hook(); // GDB breakpoint anchor: 'finish' from here lands right before 'return true;'
        return true;
    }

    return false; // no NaN/Inf found
}

static void ggml_compute_forward_soft_max_f32(
        const struct ggml_compute_params * params,
              struct ggml_tensor * dst) {

    const struct ggml_tensor * src0 = dst->src[0];

    // check_invalid_values(src0);
    const struct ggml_tensor * src1 = dst->src[1];

    assert(ggml_is_contiguous(dst));
@@ -10266,6 +10306,12 @@ static void ggml_compute_forward_soft_max_f32(

    const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);

    // clamp scale to avoid overflow
    if (!isfinite(scale) || scale > 1e6) {
        // printf("Warning: scale is invalid (%f), resetting to 1.0\n", scale);
        scale = 1.0f;
    }

    for (int i1 = ir0; i1 < ir1; i1++) {
        // ALiBi
        const uint32_t h = (i1/ne01)%ne02; // head
@@ -10278,6 +10324,27 @@ static void ggml_compute_forward_soft_max_f32(
        ggml_fp16_t * mp_f16 = src1 ? (ggml_fp16_t *)((char *) src1->data) + (i1%ne01)*ne00 : NULL;
        float * mp_f32 = src1 ? (float *)((char *) src1->data) + (i1%ne01)*ne00 : NULL;

        int nan_count = 0, inf_count = 0;
        for (int i = 0; i < nc; ++i) {
            if (isnan(sp[i])) nan_count++;
            else if (isinf(sp[i])) {
                // printf("Error: sp contains inf value!\n");
                inf_count++;
                sp[i] = FLT_MAX; // replace Inf so the row stays finite
            }
        }

        if (inf_count) {
            // printf("sp count: col: %d, row: %d, inf: [%d]\n", nc, nr, inf_count);
        }

        if (nan_count) {
            // printf("Error: sp contains %d NaN values, aborting!\n", nan_count);
            exit(1);
        }

        ggml_vec_cpy_f32 (nc, wp, sp);
        ggml_vec_scale_f32(nc, wp, scale);
        if (mp_f32) {
@@ -10302,6 +10369,10 @@ static void ggml_compute_forward_soft_max_f32(
        float max = -INFINITY;
        ggml_vec_max_f32(nc, &max, wp);

        if (!isfinite(max)) {
            max = FLT_MAX;
        }

        ggml_float sum = ggml_vec_soft_max_f32(nc, dp, wp, max);
        assert(sum > 0.0);

@@ -15431,6 +15502,9 @@ struct ggml_cplan ggml_graph_plan(
    return cplan;
}

// ggml_graph_compute_with_ctx
// ggml_graph_compute
// check_invalid_values
static thread_ret_t ggml_graph_compute_thread(void * data) {
    struct ggml_compute_state * state = (struct ggml_compute_state *) data;
    struct ggml_threadpool * tp = state->threadpool;
@@ -15450,6 +15524,27 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

    for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
        struct ggml_tensor * node = cgraph->nodes[node_n];
        struct ggml_tensor * tensor = node;

        {
            if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) {
                // empty or no-op node; nothing to check
            } else if (ggml_cpu_extra_compute_forward(&params, tensor)) {
                // handled elsewhere; skip the check
            } else if (tensor->op == GGML_OP_SOFT_MAX) {
                // call chain: ggml_compute_forward
                //   GGML_OP_SOFT_MAX
                //     ggml_compute_forward_soft_max
                //       ggml_compute_forward_soft_max_f32
                //         check_invalid_values
                check_invalid_values(tensor);
            }
        }

        ggml_compute_forward(&params, node);

@@ -15726,6 +15821,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
                atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
            }

            // printf("GGML_USE_OPENMP->ggml_graph_compute_thread: %d\n", omp_get_thread_num());
            ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]);
        }
    } else {
@@ -15757,6 +15853,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
    return ret;
}

// TODO cgraph
enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
    struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads, NULL);

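The debug_hook() added above is deliberately empty and marked noinline: its only job is to give GDB a stable symbol to break on, and it is called only when check_invalid_values() actually finds a NaN/Inf, so 'break debug_hook' followed by 'finish' (as in debug_check.gdb) drops the debugger into the offending frame. A minimal standalone sketch of the same pattern, with purely illustrative names and toy data that are not part of this commit:

#include <math.h>
#include <stdio.h>

/* Empty, noinline function: exists only so GDB can 'break debug_hook'. */
__attribute__((noinline)) static void debug_hook(void) {
}

/* Count NaN/Inf entries; call the hook when any are found. */
static int count_invalid(const float * data, int n) {
    int bad = 0;
    for (int i = 0; i < n; ++i) {
        if (isnan(data[i]) || isinf(data[i])) {
            bad++;
        }
    }
    if (bad > 0) {
        debug_hook(); /* breakpoint fires here; 'finish' returns to this frame */
    }
    return bad;
}

int main(void) {
    float v[4] = { 1.0f, NAN, 3.0f, INFINITY }; /* deliberately invalid values */
    printf("invalid values: %d\n", count_invalid(v, 4));
    return 0;
}

Built with 'gcc -g -O0 demo.c' (the file name is illustrative), a 'break debug_hook' in GDB stops exactly when the array contains invalid values, mirroring how the commit wires check_invalid_values() into the graph compute loop.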