Skip to content

Commit ac1d889

Browse files
committed
Bug#37140331 MySQL NDB Cluster is crushing with Signal 8 error (Floating Point Exception) 1/2
Backport to 7.6 Problem: When a query on ndbinfo table FRAG_MEM_USE or FRAG_OPERATIONS is performed, a DBINFO_SCANREQ signal is sent to LQH. During the handling of that signal, LQH asks TUP about fragStats of all tables defined in the cluster. If a create (or drop) table operation is ongoing in 'parallel' with the ndbinfo scan, there can be a discrepancy between the view LQH has of the new table fragments and the view TUP has of those fragments, which can lead to a crash in either TUP/ACC or LQH. In particular, if DBINFO_SCANREQ finds the new table with status ADD_TABLE_ONGOING, it can be impossible for TUP/ACC to get the status of the new table fragments, since at that point the fragment information in TUP/ACC is not yet updated. In a similar way, during drop table, if the status of the target table in LQH is DROP_TABLE_* or PREP_DROP_*, there can be differences between the views that LQH and TUP/ACC have of the fragments of that table. This patch adds 2 new tests to the testNdbinfo suite in which tables FRAG_MEM_USE and FRAG_OPERATIONS are scanned (intensively) in parallel with create/drop table operations running in different threads. Without a fix for the discrepancy in fragments between LQH and ACC/TUP, these tests should crash data nodes in different ways in either ACC or TUP (it depends on the 'phase' of create/drop table at the moment that the SCANREQ of the frags table arrives at LQH). Note: due to some asserts in the code, if ndb(mt)d is compiled in debug mode, a data node can crash earlier in those asserts instead of in the problematic places in LQH and ACC/TUP. Change-Id: I40b2aed992d5086aeea711c162da88b5e1be19cb
1 parent e929a7a commit ac1d889

File tree

5 files changed

+174
-9
lines changed

5 files changed

+174
-9
lines changed

storage/ndb/src/kernel/blocks/ERROR_codes.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,8 @@
2424
Next QMGR 950
2525
Next NDBCNTR 1030
2626
Next NDBFS 2003
27-
Next DBACC 3006
28-
Next DBTUP 4039
27+
Next DBACC 3007
28+
Next DBTUP 4040
2929
Next DBLQH 5113
3030
Next DBDICT 6227
3131
Next DBDIH 7251

storage/ndb/src/kernel/blocks/dbacc/DbaccMain.cpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/* Copyright (c) 2003, 2023, Oracle and/or its affiliates.
1+
/* Copyright (c) 2003, 2024, Oracle and/or its affiliates.
22
33
This program is free software; you can redistribute it and/or modify
44
it under the terms of the GNU General Public License, version 2.0,
@@ -560,6 +560,16 @@ void Dbacc::execACCFRAGREQ(Signal* signal)
560560
{
561561
const AccFragReq * const req = (AccFragReq*)&signal->theData[0];
562562
jamEntry();
563+
if (ERROR_INSERTED(3006)) {
564+
jam();
565+
// Delay each GSN_ACCFRAGREQ only once
566+
if (signal->senderBlockRef() != reference()) {
567+
jam();
568+
sendSignalWithDelay(reference(), GSN_ACCFRAGREQ, signal, 100,
569+
signal->getLength());
570+
return;
571+
}
572+
}
563573
if (ERROR_INSERTED(3001)) {
564574
jam();
565575
addFragRefuse(signal, 1);

storage/ndb/src/kernel/blocks/dbtup/DbtupMeta.cpp

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (c) 2003, 2023, Oracle and/or its affiliates.
2+
Copyright (c) 2003, 2024, Oracle and/or its affiliates.
33
44
This program is free software; you can redistribute it and/or modify
55
it under the terms of the GNU General Public License, version 2.0,
@@ -2584,9 +2584,15 @@ Dbtup::drop_fragment_fsremove_done(Signal* signal,
25842584
if (tabPtr.p->tableStatus == DROPPING)
25852585
{
25862586
jam();
2587-
signal->theData[0]= ZREL_FRAG;
2588-
signal->theData[1]= tabPtr.i;
2589-
signal->theData[2]= logfile_group_id;
2587+
signal->theData[0] = ZREL_FRAG;
2588+
signal->theData[1] = tabPtr.i;
2589+
signal->theData[2] = logfile_group_id;
2590+
if (ERROR_INSERTED(4039)) {
2591+
jam();
2592+
// Delay fragment release
2593+
sendSignalWithDelay(cownref, GSN_CONTINUEB, signal, 1000, 3);
2594+
return;
2595+
}
25902596
sendSignal(cownref, GSN_CONTINUEB, signal, 3, JBB);
25912597
}
25922598
else

storage/ndb/test/ndbapi/testNdbinfo.cpp

Lines changed: 143 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (c) 2009, 2021, Oracle and/or its affiliates.
2+
Copyright (c) 2009, 2024, Oracle and/or its affiliates.
33
44
This program is free software; you can redistribute it and/or modify
55
it under the terms of the GNU General Public License, version 2.0,
@@ -532,7 +532,131 @@ int runRestarter(NDBT_Context* ctx, NDBT_Step* step){
532532
return result;
533533
}
534534

535+
int runCreateDropTableUntilStopped(NDBT_Context *ctx, NDBT_Step *step) {
536+
Ndb *pNdb = GETNDB(step);
537+
NdbRestarter res;
538+
const char *tableName = ctx->getProperty("tableName", (char *)NULL);
535539

540+
NdbDictionary::Dictionary *pDict = pNdb->getDictionary();
541+
Uint32 stepNum = step->getStepNo();
542+
BaseString tabName(tableName);
543+
tabName.appfmt("_%i", stepNum);
544+
545+
NdbDictionary::Table tab(tabName.c_str());
546+
{
547+
NdbDictionary::Column col("a");
548+
col.setType(NdbDictionary::Column::Unsigned);
549+
col.setPrimaryKey(true);
550+
tab.addColumn(col);
551+
}
552+
{
553+
NdbDictionary::Column col("b");
554+
col.setType(NdbDictionary::Column::Unsigned);
555+
col.setNullable(false);
556+
tab.addColumn(col);
557+
}
558+
559+
while (!ctx->isTestStopped()) {
560+
/*
561+
* 3006: Error Insert to delay handling of ACCFRAGREQ (and indirectly
562+
* TUPFRAGREQ).
563+
* 4039: Error Insert to delay fragment release in TUP.
564+
*/
565+
int error = rand() % 2 ? 3006 : 4039;
566+
if (res.insertErrorInAllNodes(error) != 0) {
567+
g_err << "Failed to insertError " << error << endl;
568+
return NDBT_FAILED;
569+
}
570+
571+
if (pDict->createTable(tab) != 0) {
572+
NdbError err = pDict->getNdbError();
573+
g_err << "Failed to create table (" << tabName.c_str() << ") " << err
574+
<< endl;
575+
/**
576+
* if error is:
577+
* 701: System busy with other schema operation, or
578+
* 721: Schema object with given name already exists
579+
* test can continue
580+
*/
581+
if (err.code != 701 && err.code != 721) {
582+
return NDBT_FAILED;
583+
}
584+
}
585+
586+
/**
587+
* if error is:
588+
* 701: System busy with other schema operation, or
589+
* 723: No such table existed
590+
* test can continue
591+
*/
592+
if (pDict->dropTable(tabName.c_str()) != 0) {
593+
NdbError err = pDict->getNdbError();
594+
g_err << "Failed to drop table (" << tabName.c_str() << ") " << err
595+
<< endl;
596+
if (err.code != 701 && err.code != 723) {
597+
return NDBT_FAILED;
598+
}
599+
}
600+
NdbSleep_MilliSleep(20);
601+
if (res.insertErrorInAllNodes(0) != 0) {
602+
g_err << "Failed to clear Error " << error << endl;
603+
return NDBT_FAILED;
604+
}
605+
}
606+
return NDBT_OK;
607+
}
608+
609+
/**
 * Scan one ndbinfo table over and over, discarding every row read.
 *
 * The table to scan is taken from the "infoTableName" test property. The
 * scan is repeated until the configured number of loops is used up, the
 * test is stopped by another step, or one of the scan setup calls fails.
 * The test is stopped whenever this step finishes.
 *
 * @param ctx   test context; supplies the loop count, the table name
 *              property and the cluster connection
 * @param step  test step (not used)
 * @return NDBT_OK if every scan could be set up and executed,
 *         NDBT_FAILED otherwise
 */
int runScanNdbInfoTable(NDBT_Context *ctx, NDBT_Step *step) {
  int status = NDBT_OK;
  int remaining = ctx->getNumLoops();
  const char *infoTable = ctx->getProperty("infoTableName", (char *)NULL);

  NdbInfo ndbinfo(&ctx->m_cluster_connection, "ndbinfo/");
  if (!ndbinfo.init()) {
    g_err << "ndbinfo.init failed" << endl;
    ctx->stopTest();
    return NDBT_FAILED;
  }

  const NdbInfo::Table *table;
  if (ndbinfo.openTable(infoTable, &table) != 0) {
    g_err << "Failed to openTable " << infoTable << endl;
    ctx->stopTest();
    return NDBT_FAILED;
  }

  for (;;) {
    // Same short-circuit order as "loops-- && !stopped".
    if (remaining-- == 0 || ctx->isTestStopped()) {
      break;
    }

    NdbInfoScanOperation *scanOp = NULL;
    // Set to the message of the first setup call that fails, if any.
    const char *failure = NULL;

    if (ndbinfo.createScanOperation(table, &scanOp)) {
      failure = "createScanOperation failed";
    } else if (scanOp->readTuples() != 0) {
      failure = "scanOp->readTuples failed";
    } else if (scanOp->execute() != 0) {
      failure = "scanOp->execute failed";
    } else {
      // Drain the scan; the rows themselves are of no interest.
      while (scanOp->nextResult() == 1) {
      }
    }

    if (failure != NULL) {
      g_err << failure << endl;
    }
    // Single release point for both the success and the failure path.
    ndbinfo.releaseScanOperation(scanOp);
    if (failure != NULL) {
      status = NDBT_FAILED;
      break;
    }
  }

  ndbinfo.closeTable(table);
  ctx->stopTest();
  return status;
}
536660

537661
NDBT_TESTSUITE(testNdbinfo);
538662
TESTCASE("NodeRestart", "Scan NdbInfo tables while restarting nodes"){
@@ -574,8 +698,25 @@ TESTCASE("TestTable",
574698
"of rows which will depend on how many TUP blocks are configured"){
575699
STEP(runTestTable);
576700
}
577-
NDBT_TESTSUITE_END(testNdbinfo);
578701

702+
// One step creates and drops a table in a loop while 16 steps scan
// ndbinfo/frag_operations in parallel.
TESTCASE("ScanFragOperationsDuringCreateDropTable",
         "Check that scanning of ndbinfo/frag_operations table is robust "
         "to CREATE/DROP table operation running in parallel with the scan") {
  TC_PROPERTY("tableName", "tmp_table");
  TC_PROPERTY("infoTableName", "ndbinfo/frag_operations");
  STEPS(runCreateDropTableUntilStopped, 1);
  STEPS(runScanNdbInfoTable, 16);
}
710+
// One step creates and drops a table in a loop while 16 steps scan
// ndbinfo/frag_mem_use in parallel.
TESTCASE("ScanFragMemUseDuringCreateDropTable",
         "Check that scanning of ndbinfo/frag_mem_use table is robust "
         "to CREATE/DROP table operation running in parallel with the scan") {
  TC_PROPERTY("tableName", "tmp_table");
  TC_PROPERTY("infoTableName", "ndbinfo/frag_mem_use");
  STEPS(runCreateDropTableUntilStopped, 1);
  STEPS(runScanNdbInfoTable, 16);
}
718+
719+
NDBT_TESTSUITE_END(testNdbinfo)
579720

580721
int main(int argc, const char** argv){
581722
ndb_init();

storage/ndb/test/run-test/daily-devel--07-tests.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -312,3 +312,11 @@ max-time: 480
312312
cmd: testSystemRestart
313313
args: -n GCPSaveLagLcpSR T1
314314
max-time: 240
315+
316+
cmd: testNdbinfo
317+
args: -n ScanFragOperationsDuringCreateDropTable -l 10000 T1
318+
max-time: 180
319+
320+
cmd: testNdbinfo
321+
args: -n ScanFragMemUseDuringCreateDropTable -l 10000 T1
322+
max-time: 180

0 commit comments

Comments
 (0)