Skip to content

Commit 4c61d4e

Browse files
committed
Bug#37518267 Improve data node thread watchdog shutdown handling
Backport to 7.6. Two changes: 1. Have node error handling set the thread watchdog state prior to attempting to serialise or log error details to files. This helps users understand whether Watchdog logs indicate a detected overload, or whether they indicate a delay in shutting down a data node. 2. Have the Watchdog thread treat 'slow logging' as a special case. If a registered thread exceeds its time allowance while in a shutdown logging state, then the watchdog directly calls NdbShutdown(), which is more likely to lead to an immediate process exit. This improves the system's ability to force a timely process failure (and subsequent restart), potentially at the expense of some logging. Test coverage by testNodeRestart -n WatchdogSlowShutdown is enhanced to cover another case. Error injection coverage of data node shutdown is refactored to enable future extensions. Change-Id: I57eabbdb04423409d0aae1b6e548013a7088f4d0
1 parent 6b53720 commit 4c61d4e

File tree

9 files changed

+166
-70
lines changed

9 files changed

+166
-70
lines changed

storage/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (c) 2003, 2024, Oracle and/or its affiliates.
2+
Copyright (c) 2003, 2025, Oracle and/or its affiliates.
33
44
This program is free software; you can redistribute it and/or modify
55
it under the terms of the GNU General Public License, version 2.0,
@@ -74,11 +74,6 @@
7474

7575
// Used here only to print event reports on stdout/console.
7676
extern EventLogger * g_eventLogger;
77-
extern int simulate_error_during_shutdown;
78-
79-
#ifdef ERROR_INSERT
80-
extern int simulate_error_during_error_reporting;
81-
#endif
8277

8378
// Index pages used by ACC instances
8479
Uint32 g_acc_pages_used[1 + MAX_NDBMT_LQH_WORKERS];
@@ -219,17 +214,20 @@ void Cmvmi::execNDB_TAMPER(Signal* signal)
219214
ndbrequire(false);
220215
}
221216

217+
#ifdef ERROR_INSERT
222218
#ifndef NDB_WIN32
223219
if(ERROR_INSERTED(9996)){
224-
simulate_error_during_shutdown= SIGSEGV;
220+
globalEmulatorData.theConfiguration->setShutdownHandlingFault(
221+
Configuration::SHF_UNIX_SIGNAL, SIGSEGV);
225222
ndbrequire(false);
226223
}
227224

228225
if(ERROR_INSERTED(9995)){
229-
simulate_error_during_shutdown= SIGSEGV;
226+
globalEmulatorData.theConfiguration->setShutdownHandlingFault(
227+
Configuration::SHF_UNIX_SIGNAL, SIGSEGV);
230228
kill(getpid(), SIGABRT);
231229
}
232-
230+
#endif
233231
#endif
234232

235233
} // execNDB_TAMPER()
@@ -1982,13 +1980,17 @@ Cmvmi::execDUMP_STATE_ORD(Signal* signal)
19821980
if (arg == DumpStateOrd::CmvmiSetErrorHandlingError)
19831981
{
19841982
Uint32 val = 0;
1983+
Uint32 extra = 0;
19851984
if (signal->length() >= 2)
19861985
{
19871986
val = signal->theData[1];
1987+
if (signal->length() >= 3) {
1988+
extra = signal->theData[2];
1989+
}
19881990
}
1989-
g_eventLogger->info("Cmvmi : Setting ErrorHandlingError to %u",
1990-
val);
1991-
simulate_error_during_error_reporting = val;
1991+
g_eventLogger->info("Cmvmi : Setting ShutdownErrorHandling to %u %u", val,
1992+
extra);
1993+
globalEmulatorData.theConfiguration->setShutdownHandlingFault(val, extra);
19921994
}
19931995
#endif
19941996

storage/ndb/src/kernel/error/ErrorReporter.cpp

Lines changed: 25 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (c) 2003, 2022, Oracle and/or its affiliates.
2+
Copyright (c) 2003, 2025, Oracle and/or its affiliates.
33
44
This program is free software; you can redistribute it and/or modify
55
it under the terms of the GNU General Public License, version 2.0,
@@ -65,10 +65,6 @@ static void dumpJam(FILE* jamStream,
6565

6666
const char * ndb_basename(const char *path);
6767

68-
#ifdef ERROR_INSERT
69-
int simulate_error_during_error_reporting = 0;
70-
#endif
71-
7268
static
7369
const char*
7470
formatTimeStampString(char* theDateTimeString, size_t len){
@@ -435,13 +431,33 @@ WriteMessage(int thrdMessageID,
435431
fflush(stream);
436432
fclose(stream);
437433

434+
#ifdef ERROR_INSERT
435+
if (globalEmulatorData.theConfiguration->getShutdownHandlingFault() ==
436+
Configuration::SHF_DELAY_WHILE_WRITING_ERRORLOG) {
437+
Uint32 seconds =
438+
globalEmulatorData.theConfiguration->getShutdownHandlingFaultExtra();
439+
if (seconds == 0) seconds = 300;
440+
441+
fprintf(stderr,
442+
"Stall for %us during error reporting before releasing lock\n",
443+
seconds);
444+
NdbSleep_SecSleep(seconds);
445+
fprintf(stderr, "Stall finished\n");
446+
}
447+
#endif
448+
438449
ErrorReporter::prepare_to_crash(false, (nst == NST_ErrorInsert));
439450

440451
#ifdef ERROR_INSERT
441-
if (simulate_error_during_error_reporting == 1)
442-
{
443-
fprintf(stderr, "Stall during error reporting after releasing lock\n");
444-
NdbSleep_MilliSleep(30000);
452+
if (globalEmulatorData.theConfiguration->getShutdownHandlingFault() ==
453+
Configuration::SHF_DELAY_AFTER_WRITING_ERRORLOG) {
454+
Uint32 seconds =
455+
globalEmulatorData.theConfiguration->getShutdownHandlingFaultExtra();
456+
if (seconds == 0) seconds = 300;
457+
fprintf(stderr,
458+
"Stall for %us during error reporting after releasing lock\n",
459+
seconds);
460+
NdbSleep_SecSleep(seconds);
445461
}
446462
#endif
447463

storage/ndb/src/kernel/ndbd.cpp

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/* Copyright (c) 2009, 2024, Oracle and/or its affiliates.
1+
/* Copyright (c) 2009, 2025, Oracle and/or its affiliates.
22
33
This program is free software; you can redistribute it and/or modify
44
it under the terms of the GNU General Public License, version 2.0,
@@ -1152,8 +1152,6 @@ extern "C" my_bool opt_core;
11521152
// instantiated and updated in NdbcntrMain.cpp
11531153
extern Uint32 g_currentStartPhase;
11541154

1155-
int simulate_error_during_shutdown= 0;
1156-
11571155
void
11581156
NdbShutdown(int error_code,
11591157
NdbShutdownType type,
@@ -1225,6 +1223,19 @@ NdbShutdown(int error_code,
12251223
* Very serious, don't attempt to free, just die!!
12261224
*/
12271225
g_eventLogger->info("Watchdog shutdown completed - %s", exitAbort);
1226+
#ifdef ERROR_INSERT
1227+
const Uint32 shf =
1228+
globalEmulatorData.theConfiguration->getShutdownHandlingFault();
1229+
if (shf != 0) {
1230+
if (shf == Configuration::SHF_DELAY_AFTER_WRITING_ERRORLOG ||
1231+
shf == Configuration::SHF_DELAY_WHILE_WRITING_ERRORLOG) {
1232+
g_eventLogger->info(
1233+
"ERROR_INSERT : Watchdog choosing restart rather than hard exit "
1234+
"for test pass");
1235+
childExit(error_code, NRT_NoStart_Restart, g_currentStartPhase);
1236+
}
1237+
}
1238+
#endif
12281239
if (opt_core)
12291240
{
12301241
childAbort(error_code, -1,g_currentStartPhase);
@@ -1235,13 +1246,19 @@ NdbShutdown(int error_code,
12351246
}
12361247
}
12371248

1249+
#ifdef ERROR_INSERT
12381250
#ifndef NDB_WIN32
1239-
if (simulate_error_during_shutdown)
1240-
{
1241-
kill(getpid(), simulate_error_during_shutdown);
1251+
if (globalEmulatorData.theConfiguration->getShutdownHandlingFault() ==
1252+
Configuration::SHF_UNIX_SIGNAL) {
1253+
const Uint32 sigId =
1254+
globalEmulatorData.theConfiguration->getShutdownHandlingFaultExtra();
1255+
g_eventLogger->info("ERROR_INSERT : Raising unix signal %u to self",
1256+
sigId);
1257+
kill(getpid(), sigId);
12421258
while(true)
12431259
NdbSleep_MilliSleep(10);
12441260
}
1261+
#endif
12451262
#endif
12461263

12471264
globalEmulatorData.theWatchDog->doStop();

storage/ndb/src/kernel/vm/Configuration.cpp

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (c) 2003, 2021, Oracle and/or its affiliates.
2+
Copyright (c) 2003, 2025, Oracle and/or its affiliates.
33
44
This program is free software; you can redistribute it and/or modify
55
it under the terms of the GNU General Public License, version 2.0,
@@ -384,6 +384,9 @@ Configuration::setupConfiguration(){
384384
ndbout_c("Mixology level set to 0x%x", _mixologyLevel);
385385
globalTransporterRegistry.setMixologyLevel(_mixologyLevel);
386386
}
387+
388+
_shutdownHandlingFault = 0;
389+
_shutdownHandlingFaultExtra = 0;
387390
#endif
388391

389392
/**
@@ -665,6 +668,18 @@ void
665668
Configuration::setMixologyLevel(Uint32 l){
666669
_mixologyLevel = l;
667670
}
671+
672+
Uint32 Configuration::getShutdownHandlingFault() const {
673+
return _shutdownHandlingFault;
674+
};
675+
Uint32 Configuration::getShutdownHandlingFaultExtra() const {
676+
return _shutdownHandlingFaultExtra;
677+
};
678+
679+
void Configuration ::setShutdownHandlingFault(Uint32 v, Uint32 extra) {
680+
_shutdownHandlingFault = v;
681+
_shutdownHandlingFaultExtra = extra;
682+
};
668683
#endif
669684

670685
const ndb_mgm_configuration_iterator *

storage/ndb/src/kernel/vm/Configuration.hpp

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (c) 2003, 2021, Oracle and/or its affiliates.
2+
Copyright (c) 2003, 2025, Oracle and/or its affiliates.
33
44
This program is free software; you can redistribute it and/or modify
55
it under the terms of the GNU General Public License, version 2.0,
@@ -136,6 +136,23 @@ class Configuration {
136136
#ifdef ERROR_INSERT
137137
Uint32 getMixologyLevel() const;
138138
void setMixologyLevel(Uint32);
139+
140+
enum {
141+
SHF_NONE = 0,
142+
/* Delays during crash handling */
143+
/* Extra specifies delay in seconds */
144+
SHF_DELAY_AFTER_WRITING_ERRORLOG = 1,
145+
SHF_DELAY_WHILE_WRITING_ERRORLOG = 2,
146+
147+
/* Unix signal during crash handling */
148+
/* Extra specifies signal number */
149+
SHF_UNIX_SIGNAL = 10
150+
} ShutdownHandlingFaults;
151+
152+
Uint32 getShutdownHandlingFault() const;
153+
Uint32 getShutdownHandlingFaultExtra() const;
154+
155+
void setShutdownHandlingFault(Uint32 v, Uint32 extra = 0);
139156
#endif
140157

141158
// Cluster configuration
@@ -172,6 +189,8 @@ class Configuration {
172189
Uint32 _timeBetweenWatchDogCheckInitial;
173190
#ifdef ERROR_INSERT
174191
Uint32 _mixologyLevel;
192+
Uint32 _shutdownHandlingFault;
193+
Uint32 _shutdownHandlingFaultExtra;
175194
#endif
176195

177196
Vector<struct ThreadInfo> threadInfo;

storage/ndb/src/kernel/vm/SimulatedBlock.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (c) 2003, 2023, Oracle and/or its affiliates.
2+
Copyright (c) 2003, 2025, Oracle and/or its affiliates.
33
44
This program is free software; you can redistribute it and/or modify
55
it under the terms of the GNU General Public License, version 2.0,
@@ -4838,6 +4838,8 @@ ErrorReporter::prepare_to_crash(bool first_phase, bool error_insert_crash)
48384838
{
48394839
(void)first_phase;
48404840
(void)error_insert_crash;
4841+
4842+
globalData.incrementWatchDogCounter(22); // Handling node stop
48414843
}
48424844
#endif
48434845

storage/ndb/src/kernel/vm/WatchDog.cpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (c) 2003, 2021, Oracle and/or its affiliates.
2+
Copyright (c) 2003, 2025, Oracle and/or its affiliates.
33
44
This program is free software; you can redistribute it and/or modify
55
it under the terms of the GNU General Public License, version 2.0,
@@ -217,6 +217,9 @@ const char *get_action(char *buf, Uint32 IPValue)
217217
case 21:
218218
action = "Initial value in mt_job_thread_main";
219219
break;
220+
case 22:
221+
action = "Handling node stop";
222+
break;
220223
default:
221224
action = NULL;
222225
break;
@@ -440,6 +443,13 @@ WatchDog::run()
440443
}
441444
if ((elapsed[i] > 3 * theInterval) || killer)
442445
{
446+
if (oldCounterValue[i] == 4 || // Print Job Buffers at crash
447+
oldCounterValue[i] == 22) { // Handling node stop
448+
/* Immediate exit without attempting to trace
449+
* to avoid I/O stalls leaving process hanging
450+
*/
451+
NdbShutdown(NDBD_EXIT_WATCHDOG_TERMINATE, NST_Watchdog);
452+
}
443453
shutdownSystem(last_stuck_action);
444454
}
445455
}

storage/ndb/src/kernel/vm/mt.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8049,6 +8049,14 @@ static bool crash_started = false;
80498049
void
80508050
ErrorReporter::prepare_to_crash(bool first_phase, bool error_insert_crash)
80518051
{
8052+
{
8053+
void *value= NdbThread_GetTlsKey(NDB_THREAD_TLS_THREAD);
8054+
thr_data *selfptr = reinterpret_cast<thr_data *>(value);
8055+
if (selfptr != NULL) {
8056+
selfptr->m_watchdog_counter = 22;
8057+
}
8058+
}
8059+
80528060
if (first_phase)
80538061
{
80548062
NdbMutex_Lock(&g_thr_repository->stop_for_crash_mutex);

0 commit comments

Comments
 (0)