Skip to content

Commit e929a7a

Browse files
committed
Bug#37162636 Ndb : GCP_SAVE_REF handling + recoverability [noclose]
Backport to 7.6. As described in the bug, there are some situations involving graceful shutdown combined with node failures which could lead to a cluster becoming unrecoverable without manual intervention. The causes of these situations can be addressed individually. In this patch the generic GCI info propagation mechanism (CopyGCI) is modified to reject propagating any set of GCI info which does not describe the ability to automatically recover a cluster via System Restart (SR). Specifically, it is essential that for a given restorable GCI, there is at least one node in every nodegroup which is also restorable to that GCI. Checking this in the Master before propagating the GCI info prevents invalid info from becoming disk-durable anywhere, so that in any case where the requirement is about to be violated, the problem does not propagate. This could lead to a (faster) cluster shutdown, but the cluster would then be automatically recoverable via SR. Change-Id: I840267ef15873c47856ad31378052675efe008e8
1 parent f0b6652 commit e929a7a

File tree

2 files changed

+47
-2
lines changed

2 files changed

+47
-2
lines changed

storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1523,6 +1523,7 @@ class Dbdih: public SimulatedBlock {
15231523
void copyTabReq_complete(Signal* signal, TabRecordPtr tabPtr);
15241524

15251525
void gcpcommitreqLab(Signal *);
1526+
// Returns false when the node-group check over 'nodes' yields Lose (per the
// definition's comment: some nodegroup has no member in 'nodes'), true
// otherwise. The check request is built in signal->theData, overwriting it.
bool checkAllNgsRepresented(Signal *, const NdbNodeBitmask *nodes);
15261527
void validateCopyGci(Signal *);
15271528
void upgradeAlignCopyGci();
15281529
void copyGciLab(Signal *, CopyGCIReq::CopyReason reason);

storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17797,25 +17797,57 @@ void Dbdih::execDIHNDBTAMPER(Signal* signal)
1779717797
/*****************************************************************************/
1779817798
/* ********** FILE HANDLING MODULE *************/
1779917799
/*****************************************************************************/
17800+
/**
 * Report whether the given set of nodes covers every nodegroup.
 *
 * The bitmask is passed to the CHECKNODEGROUPSREQ handler as a direct
 * (in-place) call using the arbitration-check request type.  The handler
 * is expected to answer:
 *   - Lose                 when at least one nodegroup is entirely absent
 *   - Win or Partitioning  when every nodegroup is present
 * Only the Lose / not-Lose distinction matters here.
 *
 * The request is built at the start of signal->theData, so any previous
 * content of the signal data area is overwritten.
 *
 * @param signal  Signal object whose data area carries the request/answer
 * @param nodes   Bitmask of the nodes to test
 * @return true unless the handler reports CheckNodeGroups::Lose
 */
bool Dbdih::checkAllNgsRepresented(Signal *signal,
                                   const NdbNodeBitmask *nodes) {
  jam();

  CheckNodeGroups *const request =
      reinterpret_cast<CheckNodeGroups *>(&signal->theData[0]);
  request->requestType = CheckNodeGroups::Direct | CheckNodeGroups::ArbitCheck;
  request->mask.assign(*nodes);

  /* Direct call: the answer is written back into the same signal area */
  execCHECKNODEGROUPSREQ(signal);

  const bool someNodegroupMissing = (request->output == CheckNodeGroups::Lose);
  return !someNodegroupMissing;
}
17818+
1780017819
void Dbdih::validateCopyGci(Signal *signal) {
1780117820
jam();
1780217821
/**
1780317822
* Before we (Master) copy our GCI info to all other
1780417823
* nodes, let's check it for sanity
1780517824
*/
1780617825
bool newestRestorableGCIIsMax = true;
17826+
bool recoverableNodesAreSufficientForSR = true;
17827+
1780717828
const Uint32 newestRestorableGCI = SYSFILE->newestRestorableGCI;
1780817829

17830+
/* Build bitmap of recoverable nodes */
17831+
NdbNodeBitmask recoverableNodes;
1780917832
for (Uint32 i = 0; i < MAX_NDB_NODES; i++) {
1781017833
const Uint32 nodeLastCompletedGci = SYSFILE->lastCompletedGCI[i];
1781117834

17835+
if (nodeLastCompletedGci == newestRestorableGCI) {
17836+
recoverableNodes.set(i);
17837+
}
17838+
1781217839
if (nodeLastCompletedGci > newestRestorableGCI) {
1781317840
jam();
1781417841
newestRestorableGCIIsMax = false;
1781517842
}
1781617843
}
1781717844

17818-
if (unlikely(!newestRestorableGCIIsMax)) {
17845+
/* Check overall system is recoverable */
17846+
recoverableNodesAreSufficientForSR =
17847+
checkAllNgsRepresented(signal, &recoverableNodes);
17848+
17849+
if (unlikely(
17850+
!(newestRestorableGCIIsMax && recoverableNodesAreSufficientForSR))) {
1781917851
jam();
1782017852
g_eventLogger->error("DIH : newestRestorableGCI %u", newestRestorableGCI);
1782117853
for (Uint32 i = 0; i < MAX_NDB_NODES; i++) {
@@ -17835,8 +17867,20 @@ void Dbdih::validateCopyGci(Signal *signal) {
1783517867
g_eventLogger->error(
1783617868
"DIH : Invalid CopyGCIREQ attempted, newestRestorableGCI is not max");
1783717869
}
17838-
17870+
if (!recoverableNodesAreSufficientForSR) {
17871+
jam();
17872+
/**
17873+
* Require that every nodegroup has at least one representative
17874+
* which is restorable to the newestRestorableGci number.
17875+
* Otherwise System Restart with all nodes present will not have
17876+
* sufficient 'log' to be recoverable
17877+
*/
17878+
g_eventLogger->error(
17879+
"DIH : Invalid CopyGCIREQ attempted, recoverable nodes are not "
17880+
"sufficient for SR");
17881+
}
1783917882
ndbrequire(newestRestorableGCIIsMax);
17883+
ndbrequire(recoverableNodesAreSufficientForSR);
1784017884
}
1784117885
}
1784217886

0 commit comments

Comments
 (0)