Skip to content

Commit bfdb2de

Browse files
committed
Bug#37524092 Improve Api Failure handling logs + limit duration
7.6 backport.

Improve observability of API failure handling stalls:
- QMGR signals the blocks that have yet to complete API failure handling, asking them to dump their internal API failure handling state.
- TC is enhanced to:
  - Track and dump the API failure handling sub-state
  - Dump info about the remaining transactions to be handled
  - Include the TC instance number in generated logs
  - Also dump to the node log in cases where truncation may occur in the cluster log

Change-Id: I20c96ba9081610abd4c4f9696bada496b8f4c1ba
1 parent 10c4ba9 commit bfdb2de

File tree

4 files changed

+96
-31
lines changed

4 files changed

+96
-31
lines changed

storage/ndb/src/kernel/blocks/ERROR_codes.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2003, 2024, Oracle and/or its affiliates.
1+
# Copyright (c) 2003, 2025, Oracle and/or its affiliates.
22
#
33
# This program is free software; you can redistribute it and/or modify
44
# it under the terms of the GNU General Public License, version 2.0,
@@ -29,7 +29,7 @@ Next DBTUP 4040
2929
Next DBLQH 5113
3030
Next DBDICT 6227
3131
Next DBDIH 7251
32-
Next DBTC 8125
32+
Next DBTC 8127
3333
Next TRPMAN 9007
3434
Next CMVMI 9993 Note: CMVMI grows downwards
3535
Next BACKUP 10057

storage/ndb/src/kernel/blocks/dbtc/Dbtc.hpp

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (c) 2003, 2024, Oracle and/or its affiliates.
2+
Copyright (c) 2003, 2025, Oracle and/or its affiliates.
33
44
This program is free software; you can redistribute it and/or modify
55
it under the terms of the GNU General Public License, version 2.0,
@@ -1210,6 +1210,16 @@ class Dbtc
12101210

12111211
Uint32 m_location_domain_id;
12121212

1213+
/* Discrete states of API failure handling for logs etc */
1214+
enum ApiFailStates {
1215+
AF_IDLE,
1216+
AF_CHECK_TRANS,
1217+
AF_CHECK_MARKERS,
1218+
AF_CHECK_MARKERS_WAIT_TC_TAKEOVER,
1219+
AF_CHECK_MARKERS_WAIT_TRANS
1220+
};
1221+
Uint32 m_af_state;
1222+
/* Independent steps of Data node failure handling */
12131223
enum NodeFailBits
12141224
{
12151225
NF_TAKEOVER = 0x1,
@@ -1218,7 +1228,7 @@ class Dbtc
12181228
NF_BLOCK_HANDLE = 0x8,
12191229
NF_NODE_FAIL_BITS = 0xF // All bits...
12201230
};
1221-
Uint32 m_nf_bits;
1231+
Uint32 m_nf_bits; /* Node fail handling state */
12221232
NdbNodeBitmask m_lqh_trans_conf;
12231233
/**
12241234
* Indicator if any history to track yet

storage/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp

Lines changed: 69 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/* Copyright (c) 2003, 2024, Oracle and/or its affiliates.
1+
/* Copyright (c) 2003, 2025, Oracle and/or its affiliates.
22

33
This program is free software; you can redistribute it and/or modify
44
it under the terms of the GNU General Public License, version 2.0,
@@ -1219,6 +1219,7 @@ void Dbtc::execAPI_FAILREQ(Signal* signal)
12191219
**************************************************************************/
12201220
jamEntry();
12211221

1222+
const Uint32 apiNodeId = signal->theData[0];
12221223
if (ERROR_INSERTED(8056))
12231224
{
12241225
CLEAR_ERROR_INSERT_VALUE;
@@ -1227,15 +1228,16 @@ void Dbtc::execAPI_FAILREQ(Signal* signal)
12271228
#ifdef ERROR_INSERT
12281229
if (ERROR_INSERTED(8078))
12291230
{
1230-
c_lastFailedApi = signal->theData[0];
1231+
c_lastFailedApi = apiNodeId;
12311232
SET_ERROR_INSERT_VALUE(8079);
12321233
}
12331234
#endif
12341235

12351236
capiFailRef = signal->theData[1];
1236-
arrGuard(signal->theData[0], MAX_NODES);
1237-
capiConnectClosing[signal->theData[0]] = 1;
1238-
handleFailedApiNode(signal, signal->theData[0], (UintR)0);
1237+
1238+
arrGuard(apiNodeId, MAX_NODES);
1239+
capiConnectClosing[apiNodeId] = 1;
1240+
handleFailedApiNode(signal, apiNodeId, (UintR)0);
12391241
}
12401242

12411243
/**
@@ -1425,8 +1427,13 @@ Dbtc::handleFailedApiNode(Signal* signal,
14251427
{
14261428
UintR TloopCount = 0;
14271429
arrGuard(TapiFailedNode, MAX_NODES);
1430+
hostptr.i = TapiFailedNode;
1431+
ptrCheckGuard(hostptr, chostFilesize, hostRecord);
1432+
/* Mark progress */
1433+
hostptr.p->m_af_state = HostRecord::AF_CHECK_TRANS;
14281434
apiConnectptr.i = TapiConnectPtr;
1429-
do {
1435+
while (TloopCount++ <= 256 && !ERROR_INSERTED(8125))
1436+
{
14301437
ptrCheckGuard(apiConnectptr, capiConnectFilesize, apiConnectRecord);
14311438
const UintR TapiNode = refToNode(apiConnectptr.p->ndbapiBlockref);
14321439
if (TapiNode == TapiFailedNode)
@@ -1456,7 +1463,7 @@ Dbtc::handleFailedApiNode(Signal* signal,
14561463
removeMarkerForFailedAPI(signal, TapiFailedNode, 0);
14571464
return;
14581465
}//if
1459-
} while (TloopCount++ < 256);
1466+
}
14601467
signal->theData[0] = TcContinueB::ZHANDLE_FAILED_API_NODE;
14611468
signal->theData[1] = TapiFailedNode;
14621469
signal->theData[2] = apiConnectptr.i;
@@ -1471,8 +1478,16 @@ Dbtc::removeMarkerForFailedAPI(Signal* signal,
14711478
TcFailRecordPtr node_fail_ptr;
14721479
node_fail_ptr.i = 0;
14731480
ptrAss(node_fail_ptr, tcFailRecord);
1474-
if(node_fail_ptr.p->failStatus != FS_IDLE) {
1481+
HostRecordPtr myHostPtr;
1482+
myHostPtr.i = nodeId;
1483+
ptrCheckGuard(myHostPtr, chostFilesize, hostRecord);
1484+
/* Mark progress */
1485+
myHostPtr.p->m_af_state = HostRecord::AF_CHECK_MARKERS;
1486+
1487+
if(node_fail_ptr.p->failStatus != FS_IDLE || ERROR_INSERTED(8126)) {
14751488
jam();
1489+
/* Mark progress */
1490+
myHostPtr.p->m_af_state = HostRecord::AF_CHECK_MARKERS_WAIT_TC_TAKEOVER;
14761491
DEBUG("Restarting removeMarkerForFailedAPI");
14771492
/**
14781493
* TC take-over in progress
@@ -1501,6 +1516,8 @@ Dbtc::removeMarkerForFailedAPI(Signal* signal,
15011516
capiConnectClosing[nodeId]--;
15021517
if (capiConnectClosing[nodeId] == 0) {
15031518
jam();
1519+
/* Mark progress */
1520+
myHostPtr.p->m_af_state = HostRecord::AF_IDLE;
15041521

15051522
/********************************************************************/
15061523
// No outstanding ABORT or COMMIT's of this failed API node.
@@ -1538,6 +1555,9 @@ Dbtc::removeMarkerForFailedAPI(Signal* signal,
15381555
*
15391556
* Don't remove it, but continueb retry with a short delay
15401557
*/
1558+
/* Mark progress */
1559+
myHostPtr.p->m_af_state = HostRecord::AF_CHECK_MARKERS_WAIT_TRANS;
1560+
15411561
signal->theData[0] = TcContinueB::ZHANDLE_FAILED_API_NODE_REMOVE_MARKERS;
15421562
signal->theData[1] = nodeId;
15431563
signal->theData[2] = iter.bucket;
@@ -1588,6 +1608,11 @@ void Dbtc::handleApiFailState(Signal* signal, UintR TapiConnectptr)
15881608
{
15891609
jam();
15901610

1611+
/* Mark progress */
1612+
hostptr.i = TfailedApiNode;
1613+
ptrCheckGuard(hostptr, chostFilesize, hostRecord);
1614+
hostptr.p->m_af_state = HostRecord::AF_IDLE;
1615+
15911616
/**
15921617
* Perform block-level cleanups (e.g assembleFragments...)
15931618
*/
@@ -15343,6 +15368,7 @@ void Dbtc::inithost(Signal* signal)
1534315368
container->noOfPackedWords = 0;
1534415369
container->hostBlockRef = numberToRef(DBLQH, i, hostptr.i);
1534515370
}
15371+
hostptr.p->m_af_state = HostRecord::AF_IDLE;
1534615372
hostptr.p->m_nf_bits = 0;
1534715373
}//for
1534815374
c_alive_nodes.clear();
@@ -16617,7 +16643,7 @@ Dbtc::execDUMP_STATE_ORD(Signal* signal)
1661716643
if (len + 2 > 25)
1661816644
{
1661916645
jam();
16620-
infoEvent("Too long filter");
16646+
infoEvent("DBTC %u: Too long filter", instance());
1662116647
return;
1662216648
}
1662316649
if (validate_filter(signal))
@@ -16628,7 +16654,7 @@ Dbtc::execDUMP_STATE_ORD(Signal* signal)
1662816654
signal->theData[1] = 0; // record
1662916655
sendSignal(reference(), GSN_DUMP_STATE_ORD, signal, len + 2, JBB);
1663016656

16631-
infoEvent("Starting dump of transactions");
16657+
infoEvent("DBTC %u: Starting dump of transactions", instance());
1663216658
}
1663316659
return;
1663416660
}
@@ -16663,7 +16689,7 @@ Dbtc::execDUMP_STATE_ORD(Signal* signal)
1666316689
if (ap.i == capiConnectFilesize)
1666416690
{
1666516691
jam();
16666-
infoEvent("End of transaction dump");
16692+
infoEvent("DBTC %u: End of transaction dump", instance());
1666716693
return;
1666816694
}
1666916695

@@ -16695,12 +16721,30 @@ Dbtc::execDUMP_STATE_ORD(Signal* signal)
1669516721
NodeId nodeId = signal->theData[1];
1669616722
if (nodeId < MAX_NODES && nodeId < NDB_ARRAY_SIZE(capiConnectClosing))
1669716723
{
16698-
warningEvent(" DBTC: capiConnectClosing[%u]: %u",
16699-
nodeId, capiConnectClosing[nodeId]);
16724+
if (getNodeInfo(nodeId).getType() == NODE_TYPE_API) {
16725+
jam();
16726+
hostptr.i = nodeId;
16727+
ptrCheckGuard(hostptr, chostFilesize, hostRecord);
16728+
warningEvent(" DBTC %u: capiConnectClosing[%u]: %u", instance(), nodeId,
16729+
capiConnectClosing[nodeId]);
16730+
warningEvent(" DBTC %u: apiFailState[%u]: %u", instance(), nodeId,
16731+
hostptr.p->m_af_state);
16732+
16733+
if (capiConnectClosing[nodeId] > 0) {
16734+
jam();
16735+
/* Dump all transactions with given nodeid as client */
16736+
signal->theData[0] = 2550;
16737+
signal->theData[1] = 1;
16738+
signal->theData[2] = nodeId;
16739+
sendSignal(reference(), GSN_DUMP_STATE_ORD, signal, 3, JBB);
16740+
}
16741+
}
16742+
// Could add more info for Data node failure handling delay
1670016743
}
1670116744
else
1670216745
{
16703-
warningEvent(" DBTC: dump-%u to unknown node: %u", arg, nodeId);
16746+
warningEvent(" DBTC %u: dump-%u to unknown node: %u", instance(), arg,
16747+
nodeId);
1670416748
}
1670516749
}
1670616750

@@ -17268,19 +17312,18 @@ Dbtc::match_and_print(Signal* signal, ApiConnectRecordPtr apiPtr)
1726817312
break;
1726917313
}
1727017314

17271-
char buf[100];
17272-
BaseString::snprintf(buf, sizeof(buf),
17273-
"TRX[%u]: API: %d(0x%x)"
17274-
"transid: 0x%x 0x%x inactive: %u(%d) state: %s",
17275-
apiPtr.i,
17276-
refToNode(apiPtr.p->ndbapiBlockref),
17277-
refToBlock(apiPtr.p->ndbapiBlockref),
17278-
apiPtr.p->transid[0],
17279-
apiPtr.p->transid[1],
17280-
apiTimer ? (ctcTimer - apiTimer) / 100 : 0,
17281-
c_apiConTimer_line[apiPtr.i],
17282-
stateptr);
17315+
char buf[150];
17316+
BaseString::snprintf(
17317+
buf, sizeof(buf),
17318+
"DBTC %u TRX[%u] API %d(0x%x)"
17319+
"trid 0x%x 0x%x inact %u(%d) state %s nodes %s",
17320+
instance(), apiPtr.i, refToNode(apiPtr.p->ndbapiBlockref),
17321+
refToBlock(apiPtr.p->ndbapiBlockref), apiPtr.p->transid[0],
17322+
apiPtr.p->transid[1], apiTimer ? (ctcTimer - apiTimer) / 100 : 0,
17323+
c_apiConTimer_line[apiPtr.i], stateptr,
17324+
BaseString::getPrettyText(apiPtr.p->m_transaction_nodes).c_str());
1728317325
infoEvent("%s", buf);
17326+
g_eventLogger->info("%s", buf);
1728417327

1728517328
memcpy(signal->theData, temp, 4*len);
1728617329
return true;

storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (c) 2003, 2023, Oracle and/or its affiliates.
2+
Copyright (c) 2003, 2025, Oracle and/or its affiliates.
33
44
This program is free software; you can redistribute it and/or modify
55
it under the terms of the GNU General Public License, version 2.0,
@@ -3188,6 +3188,18 @@ void Qmgr::checkStartInterface(Signal* signal, NDB_TICKS now)
31883188
nodePtr.p->m_failconf_blocks[3],
31893189
nodePtr.p->m_failconf_blocks[4]);
31903190
warningEvent("%s", buf);
3191+
3192+
/* Ask delayed block(s) to explain themselves */
3193+
for (Uint32 i = 0;
3194+
i < NDB_ARRAY_SIZE(nodePtr.p->m_failconf_blocks); i++) {
3195+
if (nodePtr.p->m_failconf_blocks[i] != 0) {
3196+
signal->theData[0] = DumpStateOrd::DihTcSumaNodeFailCompleted;
3197+
signal->theData[1] = nodePtr.i;
3198+
const Uint32 dstRef =
3199+
numberToRef(nodePtr.p->m_failconf_blocks[i], 0);
3200+
sendSignal(dstRef, GSN_DUMP_STATE_ORD, signal, 2, JBB);
3201+
}
3202+
}
31913203
}
31923204
}
31933205
}

0 commit comments

Comments
 (0)