Commit bcdc7a8

Bug#37140331 MySQL NDB Cluster is crashing with Signal 8 error (Floating Point Exception) 1/2
Problem: When a query on the ndbinfo table FRAG_MEM_USE or FRAG_OPERATIONS is performed, a DBINFO_SCANREQ signal is sent to LQH. While handling that signal, LQH asks TUP for the fragStats of all tables defined in the cluster. If a create (or drop) table operation is running in parallel with the ndbinfo scan, LQH's view of the new table's fragments can differ from TUP's view of those fragments, which can lead to a crash in TUP/ACC or in LQH. In particular, if DBINFO_SCANREQ finds the new table with status ADD_TABLE_ONGOING, it can be impossible for TUP/ACC to report the status of the new table's fragments, since the fragment information in TUP/ACC has not yet been updated at that point. Similarly, during drop table, if the status of the target table in LQH is DROP_TABLE_* or PREP_DROP_*, LQH and TUP/ACC can have different views of that table's fragments.

This patch adds 2 new tests to the testNdbinfo suite, in which the tables FRAG_MEM_USE and FRAG_OPERATIONS are scanned (intensively) in parallel with create/drop table operations running in different threads. Without a fix for the fragment discrepancy between LQH and ACC/TUP, these tests should crash data nodes in different ways, in either ACC or TUP, depending on the phase of the create/drop table operation at the moment the SCANREQ for the frag table arrives at LQH.

Note: due to some asserts in the code, if ndb(mt)d is compiled in debug mode the data node can crash earlier at those asserts instead of at the problematic places in LQH and ACC/TUP.

Change-Id: I60d1954b9ef6a3b1324fe3b668860d1b1d680a98
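In outline, the reproduction added by this patch boils down to one test step looping create/drop of a small table while other steps scan the ndbinfo fragment table. The following is only a condensed sketch, using calls taken from the test code further down in this commit, with setup, error handling and the step wiring left out:

// Condensed sketch of the reproduction -- see runCreateDropTableUntilStopped()
// and runScanNdbInfoTable() in testNdbinfo.cpp below for the real code.
// Step 1: create/drop a table in a loop; the error inserts added in DBACC and
// DBTUP widen the window in which LQH and TUP/ACC disagree about fragments.
while (!ctx->isTestStopped()) {
  res.insertErrorInAllNodes(rand() % 2 ? 3006 : 4039);
  pDict->createTable(tab);
  pDict->dropTable(tab.getName());
  res.insertErrorInAllNodes(0);  // clear the error insert again
}
// Steps 2..17: scan ndbinfo/frag_mem_use or ndbinfo/frag_operations in a loop;
// each scan sends DBINFO_SCANREQ to LQH and races with the schema operations.
ndbinfo.createScanOperation(table, &scanOp);
scanOp->readTuples();
scanOp->execute();
while (scanOp->nextResult() == 1) {
}
ndbinfo.releaseScanOperation(scanOp);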
1 parent a8c96ce commit bcdc7a8

5 files changed: +169 -2 lines changed

storage/ndb/src/kernel/blocks/ERROR_codes.txt

Lines changed: 2 additions & 2 deletions
@@ -24,8 +24,8 @@
 Next QMGR 950
 Next NDBCNTR 1030
 Next NDBFS 2002
-Next DBACC 3006
-Next DBTUP 4039
+Next DBACC 3007
+Next DBTUP 4040
 Next DBLQH 5113
 Next DBDICT 6227
 Next DBDIH 7251

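The two counters above are simply the registry of the next free error-insert code per block: this patch consumes code 3006 in DBACC and 4039 in DBTUP, so the "Next" values move to 3007 and 4040. In an NDBT test the codes are armed and cleared through NdbRestarter, exactly as the new tests in this commit do; a minimal sketch (the include path is the usual NDBT one and is an assumption here):

#include <NdbRestarter.hpp>

// Arm one of the new error inserts on all data nodes, run the racing
// workload, then clear all error inserts again (code 0 means clear).
NdbRestarter restarter;
restarter.insertErrorInAllNodes(3006);  // DBACC: delay ACCFRAGREQ handling
// ... create/drop tables while scanning the ndbinfo frag tables ...
restarter.insertErrorInAllNodes(0);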
storage/ndb/src/kernel/blocks/dbacc/DbaccMain.cpp

Lines changed: 10 additions & 0 deletions
@@ -543,6 +543,16 @@ void Dbacc::set_tup_fragptr(Uint32 fragptr, Uint32 tup_fragptr) {
 void Dbacc::execACCFRAGREQ(Signal *signal) {
   const AccFragReq *const req = (AccFragReq *)&signal->theData[0];
   jamEntry();
+  if (ERROR_INSERTED(3006)) {
+    jam();
+    // Delay each GSN_ACCFRAGREQ only once
+    if (signal->senderBlockRef() != reference()) {
+      jam();
+      sendSignalWithDelay(reference(), GSN_ACCFRAGREQ, signal, 100,
+                          signal->getLength());
+      return;
+    }
+  }
   if (ERROR_INSERTED(3001)) {
     jam();
     addFragRefuse(signal, 1);

storage/ndb/src/kernel/blocks/dbtup/DbtupMeta.cpp

Lines changed: 6 additions & 0 deletions
@@ -2363,6 +2363,12 @@ void Dbtup::drop_fragment_fsremove_done(Signal *signal, TablerecPtr tabPtr,
     signal->theData[0] = ZREL_FRAG;
     signal->theData[1] = tabPtr.i;
     signal->theData[2] = logfile_group_id;
+    if (ERROR_INSERTED(4039)) {
+      jam();
+      // Delay fragment release
+      sendSignalWithDelay(cownref, GSN_CONTINUEB, signal, 1000, 3);
+      return;
+    }
     sendSignal(cownref, GSN_CONTINUEB, signal, 3, JBB);
   } else {
     jam();

storage/ndb/test/ndbapi/testNdbinfo.cpp

Lines changed: 143 additions & 0 deletions
@@ -470,6 +470,132 @@ int runRestarter(NDBT_Context *ctx, NDBT_Step *step) {
   return result;
 }
 
+int runCreateDropTableUntilStopped(NDBT_Context *ctx, NDBT_Step *step) {
+  Ndb *pNdb = GETNDB(step);
+  NdbRestarter res;
+  const char *tableName = ctx->getProperty("tableName", (char *)NULL);
+
+  NdbDictionary::Dictionary *pDict = pNdb->getDictionary();
+  Uint32 stepNum = step->getStepNo();
+  BaseString tabName(tableName);
+  tabName.appfmt("_%i", stepNum);
+
+  NdbDictionary::Table tab(tabName.c_str());
+  {
+    NdbDictionary::Column col("a");
+    col.setType(NdbDictionary::Column::Unsigned);
+    col.setPrimaryKey(true);
+    tab.addColumn(col);
+  }
+  {
+    NdbDictionary::Column col("b");
+    col.setType(NdbDictionary::Column::Unsigned);
+    col.setNullable(false);
+    tab.addColumn(col);
+  }
+
+  while (!ctx->isTestStopped()) {
+    /*
+     * 3006: Error Insert to delay handling of ACCFRAGREQ (and indirectly
+     *       TUPFRAGREQ).
+     * 4039: Error Insert to delay fragment release in TUP.
+     */
+    int error = rand() % 2 ? 3006 : 4039;
+    if (res.insertErrorInAllNodes(error) != 0) {
+      g_err << "Failed to insertError " << error << endl;
+      return NDBT_FAILED;
+    }
+
+    if (pDict->createTable(tab) != 0) {
+      NdbError err = pDict->getNdbError();
+      g_err << "Failed to create table (" << tabName.c_str() << ") " << err
+            << endl;
+      /**
+       * if error is:
+       *   701: System busy with other schema operation, or
+       *   721: Schema object with given name already exists
+       * test can continue
+       */
+      if (err.code != 701 && err.code != 721) {
+        return NDBT_FAILED;
+      }
+    }
+
+    /**
+     * if error is:
+     *   701: System busy with other schema operation, or
+     *   723: No such table existed
+     * test can continue
+     */
+    if (pDict->dropTable(tabName.c_str()) != 0) {
+      NdbError err = pDict->getNdbError();
+      g_err << "Failed to drop table (" << tabName.c_str() << ") " << err
+            << endl;
+      if (err.code != 701 && err.code != 723) {
+        return NDBT_FAILED;
+      }
+    }
+    NdbSleep_MilliSleep(20);
+    if (res.insertErrorInAllNodes(0) != 0) {
+      g_err << "Failed to clear Error " << error << endl;
+      return NDBT_FAILED;
+    }
+  }
+  return NDBT_OK;
+}
+
+int runScanNdbInfoTable(NDBT_Context *ctx, NDBT_Step *step) {
+  int result = NDBT_OK;
+  int loops = ctx->getNumLoops();
+  const char *tableName = ctx->getProperty("infoTableName", (char *)NULL);
+
+  NdbInfo ndbinfo(&ctx->m_cluster_connection, "ndbinfo/");
+  if (!ndbinfo.init()) {
+    g_err << "ndbinfo.init failed" << endl;
+    ctx->stopTest();
+    return NDBT_FAILED;
+  }
+
+  const NdbInfo::Table *table;
+  if (ndbinfo.openTable(tableName, &table) != 0) {
+    g_err << "Failed to openTable " << tableName << endl;
+    ctx->stopTest();
+    return NDBT_FAILED;
+  }
+
+  while (loops-- && !ctx->isTestStopped()) {
+    NdbInfoScanOperation *scanOp = nullptr;
+    if (ndbinfo.createScanOperation(table, &scanOp)) {
+      g_err << "createScanOperation failed" << endl;
+      ndbinfo.releaseScanOperation(scanOp);
+      result = NDBT_FAILED;
+      break;
+    }
+
+    if (scanOp->readTuples() != 0) {
+      g_err << "scanOp->readTuples failed" << endl;
+      ndbinfo.releaseScanOperation(scanOp);
+      result = NDBT_FAILED;
+      break;
+    }
+
+    if (scanOp->execute() != 0) {
+      g_err << "scanOp->execute failed" << endl;
+      ndbinfo.releaseScanOperation(scanOp);
+      result = NDBT_FAILED;
+      break;
+    }
+    while (scanOp->nextResult() == 1) {
+    }
+
+    ndbinfo.releaseScanOperation(scanOp);
+  }
+
+  ndbinfo.closeTable(table);
+  ctx->stopTest();
+  return result;
+}
+
 NDBT_TESTSUITE(testNdbinfo);
 TESTCASE("NodeRestart", "Scan NdbInfo tables while restarting nodes") {
   STEP(runRestarter);
@@ -504,6 +630,23 @@ TESTCASE("TestTable",
          "of rows which will depend on how many TUP blocks are configured") {
   STEP(runTestTable);
 }
+TESTCASE("ScanFragOperationsDuringCreateDropTable",
+         "Check that scanning of ndbinfo/frag_operations table is robust "
+         "to CREATE/DROP table operation running in parallel with the scan") {
+  TC_PROPERTY("tableName", "tmp_table");
+  TC_PROPERTY("infoTableName", "ndbinfo/frag_operations");
+  STEPS(runCreateDropTableUntilStopped, 1);
+  STEPS(runScanNdbInfoTable, 16);
+}
+TESTCASE("ScanFragMemUseDuringCreateDropTable",
+         "Check that scanning of ndbinfo/frag_mem_use table is robust "
+         "to CREATE/DROP table operation running in parallel with the scan") {
+  TC_PROPERTY("tableName", "tmp_table");
+  TC_PROPERTY("infoTableName", "ndbinfo/frag_mem_use");
+  STEPS(runCreateDropTableUntilStopped, 1);
+  STEPS(runScanNdbInfoTable, 16);
+}
+
 NDBT_TESTSUITE_END(testNdbinfo)
 
 int main(int argc, const char **argv) {

storage/ndb/test/run-test/daily-devel--07-tests.txt

Lines changed: 8 additions & 0 deletions
@@ -364,3 +364,11 @@ max-time: 480
 cmd: testSystemRestart
 args: -n GCPSaveLagLcpSR T1
 max-time: 240
+
+cmd: testNdbinfo
+args: -n ScanFragOperationsDuringCreateDropTable -l 10000 T1
+max-time: 180
+
+cmd: testNdbinfo
+args: -n ScanFragMemUseDuringCreateDropTable -l 10000 T1
+max-time: 180