BUG#37331118 Applier prepare error retry [test]

blaudden · blaudden · commit ac6d59c7632b · 2024-12-12T10:31:11.000+01:00
The replication applier normally retries temporary errors occurring
while applying transactions. Such retry logic is not performed for
transactions containing row events where the STMT_END_F flag is missing.
For such events, the statement will instead be committed as an
additional step while applying the subsequent COMMIT query event when it
is detected that there are still locked tables. When committing the
statement, temporary errors are not handled properly.

This patch reproduces the problem by writing an epoch trans with
simulated writes from multiple server ids on the source. The replica
then uses IGNORE_SERVER_IDS(<last_server_id_in_binlog>) to cause the
STMT_END_F to be filtered away, thus activating the above-described code
path in the applier. By holding a lock on one of the rows to be updated
by the applier, error handling is triggered. For reference the test then
also triggers error and retry handling when STMT_END_F has not been
filtered away.

Change-Id: Ifa3dc6d00691fbba286fcce5e1919f42a7e50d16
diff --git a/mysql-test/suite/ndb_rpl/r/mta_prepare_error_retry.result b/mysql-test/suite/ndb_rpl/r/mta_prepare_error_retry.result
@@ -0,0 +1,136 @@
+include/master-slave.inc
+Warnings:
+Note	####	Sending passwords in plain text without SSL/TLS is extremely insecure.
+Note	####	Storing MySQL user name or password information in the connection metadata repository is not secure and is therefore not recommended. Please consider using the USER and PASSWORD connection options for START REPLICA; see the 'START REPLICA Syntax' in the MySQL Manual for more information.
+[connection master]
+######################################################################
+# BUG#37331118 Applier prepare error retry
+#
+# The replication applier normally retries temporary errors occurring
+# while applying transactions. Such retry logic is not performed for
+# transactions containing row events where the STMT_END_F flag is
+# missing.
+# For such events, the statement will instead be committed as an
+# additional step while applying the subsequent COMMIT query event when
+# it is detected that there are still locked tables. When committing the
+# statement, temporary errors are not handled properly.
+#
+# This patch reproduces the problem by writing an epoch trans with
+# simulated writes from multiple server ids on the source. The replica
+# then uses IGNORE_SERVER_IDS(<last_server_id_in_binlog>) to cause the
+# STMT_END_F to be filtered away, thus activating the above-described
+# code path in the applier. By holding a lock on one of the rows to be
+# updated by the applier, error handling is triggered.
+# For reference the test then also triggers error and retry handling
+# when STMT_END_F has not been filtered away.
+######################################################################
+# Populate the source and replica
+# - create table and populate with two rows, sync to replica
+[connection master]
+CREATE TABLE test_multi_server_id (
+id INT PRIMARY KEY,
+what VARCHAR(128),
+epoch INT UNSIGNED
+) ENGINE = NDB;
+INSERT INTO test_multi_server_id
+VALUES (31, "not updated", 0), (32, "not updated", 0);
+# Source need log_replica_updates turned on
+show variables like 'log_replica_updates';
+Variable_name	Value
+log_replica_updates	ON
+include/sync_slave_sql_with_master.inc
+# Replica need both workers and retries to be greater than 1
+show variables like 'replica_parallel_workers';
+Variable_name	Value
+replica_parallel_workers	2
+show variables like 'replica_transaction_retries';
+Variable_name	Value
+replica_transaction_retries	3
+include/stop_slave.inc
+SELECT * FROM test_multi_server_id ORDER BY id;
+id	what	epoch
+31	not updated	0
+32	not updated	0
+# Supress MTA errors generated by test
+call mtr.add_suppression(".*worker thread retried transaction [0-9] time.*");
+call mtr.add_suppression(".*Worker [0-9] failed executing transaction.*");
+call mtr.add_suppression(".*replica coordinator and worker threads are stopped.*");
+[connection master]
+FLUSH LOGS;
+SET @save_debug= @@GLOBAL.debug;
+SET @@GLOBAL.debug="+d,ndb_binlog_log_multi_server_id";
+SET @@GLOBAL.debug= @save_debug;
+SELECT id, what FROM test_multi_server_id ORDER BY id;
+id	what
+31	change from 31
+32	change from 32
+# Wait for ndb_binlog thread...
+[connection slave1]
+begin;
+SELECT * FROM test_multi_server_id WHERE id=<other_server_id> FOR UPDATE;
+id	what	epoch
+<other_server_id>	not updated	0
+#
+# A) Test error and retry with STMT_END_F filtered away
+#     - i.e statement commit from Query_log_event("COMMIT")
+#
+[connection slave]
+CHANGE REPLICATION SOURCE TO
+IGNORE_SERVER_IDS = (<last_server_id>);
+include/start_slave.inc
+# Wait for replication error
+include/wait_for_slave_sql_error.inc [errno=1180]
+SELECT LAST_ERROR_NUMBER, LAST_ERROR_MESSAGE,
+APPLYING_TRANSACTION_RETRIES_COUNT
+FROM performance_schema.replication_applier_status_by_worker
+WHERE LAST_ERROR_NUMBER != 0;
+LAST_ERROR_NUMBER	1180
+LAST_ERROR_MESSAGE	Worker NNN failed executing transaction 'ANONYMOUS' at source log master-bin.000002, end_log_pos NNN; Error 'Got error 125 - 'Transaction has been rolled back' during COMMIT' on query. Default database: ''. Query: 'COMMIT'
+APPLYING_TRANSACTION_RETRIES_COUNT	0
+SELECT LAST_ERROR_NUMBER, LAST_ERROR_MESSAGE
+FROM performance_schema.replication_applier_status_by_coordinator;
+LAST_ERROR_NUMBER	1180
+LAST_ERROR_MESSAGE	Coordinator stopped because there were error(s) in the worker(s). The most recent failure being: Worker NNN failed executing transaction 'ANONYMOUS' at source log master-bin.000002, end_log_pos NNN. See error log and/or performance_schema.replication_applier_status_by_worker table for more details about this failure or others, if any.
+SELECT id, what FROM test_multi_server_id ORDER BY id;
+id	what
+31	not updated
+32	not updated
+#
+# B) Test error and retry when STMT_END_F has NOT been filtered
+#     - i.e statement commit from Rows_log_event
+#
+[connection slave]
+include/stop_slave.inc
+CHANGE REPLICATION SOURCE TO IGNORE_SERVER_IDS = ();
+include/start_slave.inc
+# Wait for replication error
+include/wait_for_slave_sql_error.inc [errno=1205]
+SELECT LAST_ERROR_NUMBER, LAST_ERROR_MESSAGE,
+APPLYING_TRANSACTION_RETRIES_COUNT
+FROM performance_schema.replication_applier_status_by_worker
+WHERE LAST_ERROR_NUMBER != 0;
+LAST_ERROR_NUMBER	1205
+LAST_ERROR_MESSAGE	Worker NNN failed executing transaction 'ANONYMOUS' at source log master-bin.000002, end_log_pos NNN; Lock wait timeout exceeded; try restarting transaction
+APPLYING_TRANSACTION_RETRIES_COUNT	3
+SELECT LAST_ERROR_NUMBER, LAST_ERROR_MESSAGE
+FROM performance_schema.replication_applier_status_by_coordinator;
+LAST_ERROR_NUMBER	1205
+LAST_ERROR_MESSAGE	Coordinator stopped because there were error(s) in the worker(s). The most recent failure being: Worker NNN failed executing transaction 'ANONYMOUS' at source log master-bin.000002, end_log_pos NNN. See error log and/or performance_schema.replication_applier_status_by_worker table for more details about this failure or others, if any.
+SELECT id, what FROM test_multi_server_id ORDER BY id;
+id	what
+31	not updated
+32	not updated
+[connection slave1]
+# Release lock
+commit;
+# Start replication again
+[connection slave]
+include/start_slave.inc
+SELECT id, what FROM test_multi_server_id ORDER BY id;
+id	what
+31	change from 31
+32	change from 32
+# Cleanup
+[connection master]
+DROP TABLE test_multi_server_id;
+include/rpl_end.inc
diff --git a/mysql-test/suite/ndb_rpl/t/mta_prepare_error_retry.cnf b/mysql-test/suite/ndb_rpl/t/mta_prepare_error_retry.cnf
@@ -0,0 +1,8 @@
+!include suite/ndb_rpl/my.cnf
+
+# Source server: the test prints log_replica_updates and expects it to be ON.
+[mysqld.1.1]
+log-replica-updates=ON
+
+# Replica server: the test needs a multi-threaded applier which retries
+# transactions; it prints both values and expects them to be greater than 1.
+[mysqld.1.slave]
+replica-parallel-workers=2
+replica-transaction-retries=3
diff --git a/mysql-test/suite/ndb_rpl/t/mta_prepare_error_retry.test b/mysql-test/suite/ndb_rpl/t/mta_prepare_error_retry.test
@@ -0,0 +1,190 @@
+--source include/have_debug.inc
+--source include/have_ndb.inc
+--source include/master-slave.inc
+
+--echo ######################################################################
+--echo # BUG#37331118 Applier prepare error retry
+--echo #
+--echo # The replication applier normally retries temporary errors occurring
+--echo # while applying transactions. Such retry logic is not performed for
+--echo # transactions containing row events where the STMT_END_F flag is
+--echo # missing.
+--echo # For such events, the statement will instead be committed as an
+--echo # additional step while applying the subsequent COMMIT query event when
+--echo # it is detected that there are still locked tables. When committing the
+--echo # statement, temporary errors are not handled properly.
+--echo #
+--echo # This patch reproduces the problem by writing an epoch trans with
+--echo # simulated writes from multiple server ids on the source. The replica
+--echo # then uses IGNORE_SERVER_IDS(<last_server_id_in_binlog>) to cause the
+--echo # STMT_END_F to be filtered away, thus activating the above-described
+--echo # code path in the applier. By holding a lock on one of the rows to be
+--echo # updated by the applier, error handling is triggered.
+--echo # For reference the test then also triggers error and retry handling
+--echo # when STMT_END_F has not been filtered away.
+--echo ######################################################################
+
+--echo # Populate the source and replica
+--echo # - create table and populate with two rows, sync to replica
+
+--source include/rpl_connection_master.inc
+# Both rows start with epoch = 0; the wait loops further down poll for
+# epoch != 0 to detect that the rows have been rewritten.
+CREATE TABLE test_multi_server_id (
+  id INT PRIMARY KEY,
+  what VARCHAR(128),
+  epoch INT UNSIGNED
+) ENGINE = NDB;
+INSERT INTO test_multi_server_id
+  VALUES (31, "not updated", 0), (32, "not updated", 0);
+
+--echo # Source need log_replica_updates turned on
+show variables like 'log_replica_updates';
+
+--source include/sync_slave_sql_with_master.inc
+
+--echo # Replica need both workers and retries to be greater than 1
+show variables like 'replica_parallel_workers';
+show variables like 'replica_transaction_retries';
+
+# Stop replication so that the epoch transaction generated below is not
+# applied until a row lock has been taken on the replica.
+--source include/stop_slave.inc
+SELECT * FROM test_multi_server_id ORDER BY id;
+
+# NOTE: the echoed text below (including the "Supress" spelling) is recorded
+# in the .result file, any change must be made in both files together.
+--echo # Supress MTA errors generated by test
+call mtr.add_suppression(".*worker thread retried transaction [0-9] time.*");
+call mtr.add_suppression(".*Worker [0-9] failed executing transaction.*");
+call mtr.add_suppression(".*replica coordinator and worker threads are stopped.*");
+
+--source include/rpl_connection_master.inc
+# Start a new binlog file, the generated epoch transaction(s) are looked up
+# in master-bin.000002 further down.
+FLUSH LOGS;
+
+# Trigger generation of at least one "multi server id" epoch transaction by
+# enabling the ndb_binlog_log_multi_server_id debug keyword.
+SET @save_debug= @@GLOBAL.debug;
+SET @@GLOBAL.debug="+d,ndb_binlog_log_multi_server_id";
+
+# Wait for generation of at least one epoch transaction with multiple server
+# ids, i.e. until both rows have epoch != 0. $ROWS is reused by the final
+# wait on the replica.
+--let $ROWS = 2
+while (`SELECT COUNT(*) < $ROWS FROM test_multi_server_id WHERE epoch != 0`) {
+  sleep 0.1;
+}
+# Restore the debug setting saved above, turning the generator off again.
+SET @@GLOBAL.debug= @save_debug;
+
+# Show rows written by generator
+SELECT id, what FROM test_multi_server_id ORDER BY id;
+
+# Presumably makes sure that the changes above have reached the binlog
+# before it is examined - confirm against the include file.
+--source suite/ndb/include/ndb_binlog_wait_own_changes.inc
+
+# Show the generated binlog transaction(s)
+# SHOW BINLOG EVENTS IN 'master-bin.000002';
+
+# Find the server_id of the last row event in the first epoch transaction of
+# the binlog, i.e. the event which carries the STMT_END_F flag. This is the
+# server_id which the applier will filter away. The search is necessary
+# since the order of the rows is not deterministic.
+#
+# The event directly followed by the 'COMMIT' query event is the last row
+# event of the transaction. Output: $last_server_id.
+let $row = 4; # Skip the first rows, the last row event can't be there
+let $last_server_id = 1;
+let $found = 0;
+let $query= SHOW BINLOG EVENTS IN 'master-bin.000002';
+while (!$found) {
+  let $sid= query_get_value($query, Server_id, $row);
+  let $row_next = $row;
+  inc $row_next;
+  let $info= query_get_value($query, Info, $row_next);
+  # query_get_value() yields 'No such row' past the last binlog event. Fail
+  # right away in that case instead of looping until the test case times out.
+  if ($info == 'No such row') {
+    --die Found no COMMIT event in master-bin.000002
+  }
+  if ($info == 'COMMIT') {
+    let $last_server_id = $sid;
+    let $found = 1;
+  }
+  inc $row;
+}
+#echo last_server_id: $last_server_id;
+
+# Determine the id of the other row, i.e. the one of 31/32 which differs
+# from $last_server_id. This is the row that will be locked.
+# NOTE(review): the value is compared with a server id and then used as the
+# primary key of the row to lock, so the generator presumably writes row
+# <id> using server_id <id> - confirm against dbug_log_multi_server_id.
+let $other_server_id = 32;
+if ($other_server_id == $last_server_id) {
+  let $other_server_id = 31;
+}
+assert($other_server_id != $last_server_id);
+
+# Use the second replica connection to take a lock which will cause an
+# applier error when starting replication again. The transaction is left
+# open until the end of the test. The id is masked in the output since the
+# row picked depends on the non-deterministic row order.
+--source include/rpl_connection_slave1.inc
+begin;
+--replace_result $other_server_id <other_server_id>
+eval SELECT * FROM test_multi_server_id WHERE id=$other_server_id FOR UPDATE;
+
+--echo #
+--echo # A) Test error and retry with STMT_END_F filtered away
+--echo #     - i.e statement commit from Query_log_event("COMMIT")
+--echo #
+--source include/rpl_connection_slave.inc
+# Ignore events from the server id found above so that the row event
+# carrying STMT_END_F is filtered away. The id is masked in the output.
+--replace_result $last_server_id <last_server_id>
+eval CHANGE REPLICATION SOURCE TO
+       IGNORE_SERVER_IDS = ($last_server_id);
+
+--source include/start_slave.inc
+
+--echo # Wait for replication error
+# The recorded result for this part shows error 1180 ("Got error 125 -
+# 'Transaction has been rolled back' during COMMIT") together with
+# APPLYING_TRANSACTION_RETRIES_COUNT 0, i.e. the transaction is not retried.
+let $slave_sql_errno= 1180;
+--source include/wait_for_slave_sql_error.inc
+
+# Show the error as reported by the worker and by the coordinator, with the
+# binlog position and the worker number masked out.
+--replace_regex /end_log_pos [0-9]*/end_log_pos NNN/ /Worker [0-9]* failed/Worker NNN failed/
+query_vertical
+  SELECT LAST_ERROR_NUMBER, LAST_ERROR_MESSAGE,
+         APPLYING_TRANSACTION_RETRIES_COUNT
+    FROM performance_schema.replication_applier_status_by_worker
+      WHERE LAST_ERROR_NUMBER != 0;
+--replace_regex /end_log_pos [0-9]*/end_log_pos NNN/ /Worker [0-9]* failed/Worker NNN failed/
+query_vertical
+  SELECT LAST_ERROR_NUMBER, LAST_ERROR_MESSAGE
+    FROM performance_schema.replication_applier_status_by_coordinator;
+
+# Show that the rows on the replica have not been updated
+SELECT id, what FROM test_multi_server_id ORDER BY id;
+
+
+--echo #
+--echo # B) Test error and retry when STMT_END_F has NOT been filtered
+--echo #     - i.e statement commit from Rows_log_event
+--echo #
+--source include/rpl_connection_slave.inc
+# The applier stopped with an error in part A, make sure replication is
+# fully stopped and then clear the server id filter.
+--source include/stop_slave.inc
+eval CHANGE REPLICATION SOURCE TO IGNORE_SERVER_IDS = ();
+
+--source include/start_slave.inc
+
+--echo # Wait for replication error
+# The row lock is still held, the recorded result for this part shows error
+# 1205 (lock wait timeout) together with APPLYING_TRANSACTION_RETRIES_COUNT
+# 3, which equals replica-transaction-retries in the test's .cnf file.
+let $slave_sql_errno= 1205;
+--source include/wait_for_slave_sql_error.inc
+
+# Show the error as reported by the worker and by the coordinator, with the
+# binlog position and the worker number masked out.
+--replace_regex /end_log_pos [0-9]*/end_log_pos NNN/ /Worker [0-9]* failed/Worker NNN failed/
+query_vertical
+  SELECT LAST_ERROR_NUMBER, LAST_ERROR_MESSAGE,
+         APPLYING_TRANSACTION_RETRIES_COUNT
+    FROM performance_schema.replication_applier_status_by_worker
+      WHERE LAST_ERROR_NUMBER != 0;
+--replace_regex /end_log_pos [0-9]*/end_log_pos NNN/ /Worker [0-9]* failed/Worker NNN failed/
+query_vertical
+  SELECT LAST_ERROR_NUMBER, LAST_ERROR_MESSAGE
+    FROM performance_schema.replication_applier_status_by_coordinator;
+
+# Show that the rows on the replica have not been updated
+SELECT id, what FROM test_multi_server_id ORDER BY id;
+
+# Commit the transaction holding the row lock taken before part A.
+--source include/rpl_connection_slave1.inc
+--echo # Release lock
+commit;
+
+--echo # Start replication again
+--source include/rpl_connection_slave.inc
+--source include/start_slave.inc
+# Wait until the rows on the replica have been updated, i.e. until $ROWS
+# rows have epoch != 0. The server id filter was cleared in part B.
+while (`SELECT COUNT(*) < $ROWS FROM test_multi_server_id WHERE epoch != 0`) {
+  sleep 0.1;
+}
+# Show that the rows on the replica have been updated
+SELECT id, what FROM test_multi_server_id ORDER BY id;
+
+--echo # Cleanup
+--source include/rpl_connection_master.inc
+DROP TABLE test_multi_server_id;
+
+--source include/rpl_end.inc
diff --git a/storage/ndb/plugin/ha_ndbcluster_binlog.cc b/storage/ndb/plugin/ha_ndbcluster_binlog.cc
@@ -7859,6 +7859,9 @@ void Ndb_binlog_thread::do_run() {
     DBUG_EXECUTE_IF("ndb_binlog_log_table_maps",
                     { dbug_log_table_maps(i_ndb, current_epoch); });
 
+    DBUG_EXECUTE_IF("ndb_binlog_log_multi_server_id",
+                    { dbug_log_multi_server_id(i_ndb, current_epoch); });
+
     DBUG_EXECUTE_IF("ndb_binlog_inject_incident", {
       // Test rpl_injector function for writing incident to binlog
       const std::string message{"Epoch: " + std::to_string(current_epoch)};
diff --git a/storage/ndb/plugin/ndb_binlog_thread.cc b/storage/ndb/plugin/ndb_binlog_thread.cc
diff --git a/storage/ndb/plugin/ndb_binlog_thread.h b/storage/ndb/plugin/ndb_binlog_thread.h