Skip to content

Commit ac6d59c

Browse files
committed
BUG#37331118 Applier prepare error retry [test]
The replication applier normally retries temporary errors occurring while applying transactions. Such retry logic is not performed for transactions containing row events where the STMT_END_F flag is missing. For such events, the statement will instead be committed as an additional step while applying the subsequent COMMIT query event when it is detected that there are still locked tables. When committing the statement, temporary errors are not handled properly. This patch reproduces the problem by writing an epoch trans with simulated writes from multiple server ids on the source. The replica then uses IGNORE_SERVER_IDS(<last_server_id_in_binlog>) to cause the STMT_END_F to be filtered away, thus activating the above-described code path in the applier. By holding a lock on one of the rows to be updated by the applier, error handling is triggered. For reference the test then also triggers error and retry handling when STMT_END_F has not been filtered away. Change-Id: Ifa3dc6d00691fbba286fcce5e1919f42a7e50d16
1 parent 9cb5ef6 commit ac6d59c

File tree

6 files changed

+393
-0
lines changed

6 files changed

+393
-0
lines changed
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
include/master-slave.inc
2+
Warnings:
3+
Note #### Sending passwords in plain text without SSL/TLS is extremely insecure.
4+
Note #### Storing MySQL user name or password information in the connection metadata repository is not secure and is therefore not recommended. Please consider using the USER and PASSWORD connection options for START REPLICA; see the 'START REPLICA Syntax' in the MySQL Manual for more information.
5+
[connection master]
6+
######################################################################
7+
# BUG#37331118 Applier prepare error retry
8+
#
9+
# The replication applier normally retries temporary errors occurring
10+
# while applying transactions. Such retry logic is not performed for
11+
# transactions containing row events where the STMT_END_F flag is
12+
# missing.
13+
# For such events, the statement will instead be committed as an
14+
# additional step while applying the subsequent COMMIT query event when
15+
# it is detected that there are still locked tables. When committing the
16+
# statement, temporary errors are not handled properly.
17+
#
18+
# This patch reproduces the problem by writing an epoch trans with
19+
# simulated writes from multiple server ids on the source. The replica
20+
# then uses IGNORE_SERVER_IDS(<last_server_id_in_binlog>) to cause the
21+
# STMT_END_F to be filtered away, thus activating the above-described
22+
# code path in the applier. By holding a lock on one of the rows to be
23+
# updated by the applier, error handling is triggered.
24+
# For reference the test then also triggers error and retry handling
25+
# when STMT_END_F has not been filtered away.
26+
######################################################################
27+
# Populate the source and replica
28+
# - create table and populate with two rows, sync to replica
29+
[connection master]
30+
CREATE TABLE test_multi_server_id (
31+
id INT PRIMARY KEY,
32+
what VARCHAR(128),
33+
epoch INT UNSIGNED
34+
) ENGINE = NDB;
35+
INSERT INTO test_multi_server_id
36+
VALUES (31, "not updated", 0), (32, "not updated", 0);
37+
# Source need log_replica_updates turned on
38+
show variables like 'log_replica_updates';
39+
Variable_name Value
40+
log_replica_updates ON
41+
include/sync_slave_sql_with_master.inc
42+
# Replica need both workers and retries to be greater than 1
43+
show variables like 'replica_parallel_workers';
44+
Variable_name Value
45+
replica_parallel_workers 2
46+
show variables like 'replica_transaction_retries';
47+
Variable_name Value
48+
replica_transaction_retries 3
49+
include/stop_slave.inc
50+
SELECT * FROM test_multi_server_id ORDER BY id;
51+
id what epoch
52+
31 not updated 0
53+
32 not updated 0
54+
# Supress MTA errors generated by test
55+
call mtr.add_suppression(".*worker thread retried transaction [0-9] time.*");
56+
call mtr.add_suppression(".*Worker [0-9] failed executing transaction.*");
57+
call mtr.add_suppression(".*replica coordinator and worker threads are stopped.*");
58+
[connection master]
59+
FLUSH LOGS;
60+
SET @save_debug= @@GLOBAL.debug;
61+
SET @@GLOBAL.debug="+d,ndb_binlog_log_multi_server_id";
62+
SET @@GLOBAL.debug= @save_debug;
63+
SELECT id, what FROM test_multi_server_id ORDER BY id;
64+
id what
65+
31 change from 31
66+
32 change from 32
67+
# Wait for ndb_binlog thread...
68+
[connection slave1]
69+
begin;
70+
SELECT * FROM test_multi_server_id WHERE id=<other_server_id> FOR UPDATE;
71+
id what epoch
72+
<other_server_id> not updated 0
73+
#
74+
# A) Test error and retry with STMT_END_F filtered away
75+
# - i.e statement commit from Query_log_event("COMMIT")
76+
#
77+
[connection slave]
78+
CHANGE REPLICATION SOURCE TO
79+
IGNORE_SERVER_IDS = (<last_server_id>);
80+
include/start_slave.inc
81+
# Wait for replication error
82+
include/wait_for_slave_sql_error.inc [errno=1180]
83+
SELECT LAST_ERROR_NUMBER, LAST_ERROR_MESSAGE,
84+
APPLYING_TRANSACTION_RETRIES_COUNT
85+
FROM performance_schema.replication_applier_status_by_worker
86+
WHERE LAST_ERROR_NUMBER != 0;
87+
LAST_ERROR_NUMBER 1180
88+
LAST_ERROR_MESSAGE Worker NNN failed executing transaction 'ANONYMOUS' at source log master-bin.000002, end_log_pos NNN; Error 'Got error 125 - 'Transaction has been rolled back' during COMMIT' on query. Default database: ''. Query: 'COMMIT'
89+
APPLYING_TRANSACTION_RETRIES_COUNT 0
90+
SELECT LAST_ERROR_NUMBER, LAST_ERROR_MESSAGE
91+
FROM performance_schema.replication_applier_status_by_coordinator;
92+
LAST_ERROR_NUMBER 1180
93+
LAST_ERROR_MESSAGE Coordinator stopped because there were error(s) in the worker(s). The most recent failure being: Worker NNN failed executing transaction 'ANONYMOUS' at source log master-bin.000002, end_log_pos NNN. See error log and/or performance_schema.replication_applier_status_by_worker table for more details about this failure or others, if any.
94+
SELECT id, what FROM test_multi_server_id ORDER BY id;
95+
id what
96+
31 not updated
97+
32 not updated
98+
#
99+
# B) Test error and retry when STMT_END_F has NOT been filtered
100+
# - i.e statement commit from Rows_log_event
101+
#
102+
[connection slave]
103+
include/stop_slave.inc
104+
CHANGE REPLICATION SOURCE TO IGNORE_SERVER_IDS = ();
105+
include/start_slave.inc
106+
# Wait for replication error
107+
include/wait_for_slave_sql_error.inc [errno=1205]
108+
SELECT LAST_ERROR_NUMBER, LAST_ERROR_MESSAGE,
109+
APPLYING_TRANSACTION_RETRIES_COUNT
110+
FROM performance_schema.replication_applier_status_by_worker
111+
WHERE LAST_ERROR_NUMBER != 0;
112+
LAST_ERROR_NUMBER 1205
113+
LAST_ERROR_MESSAGE Worker NNN failed executing transaction 'ANONYMOUS' at source log master-bin.000002, end_log_pos NNN; Lock wait timeout exceeded; try restarting transaction
114+
APPLYING_TRANSACTION_RETRIES_COUNT 3
115+
SELECT LAST_ERROR_NUMBER, LAST_ERROR_MESSAGE
116+
FROM performance_schema.replication_applier_status_by_coordinator;
117+
LAST_ERROR_NUMBER 1205
118+
LAST_ERROR_MESSAGE Coordinator stopped because there were error(s) in the worker(s). The most recent failure being: Worker NNN failed executing transaction 'ANONYMOUS' at source log master-bin.000002, end_log_pos NNN. See error log and/or performance_schema.replication_applier_status_by_worker table for more details about this failure or others, if any.
119+
SELECT id, what FROM test_multi_server_id ORDER BY id;
120+
id what
121+
31 not updated
122+
32 not updated
123+
[connection slave1]
124+
# Release lock
125+
commit;
126+
# Start replication again
127+
[connection slave]
128+
include/start_slave.inc
129+
SELECT id, what FROM test_multi_server_id ORDER BY id;
130+
id what
131+
31 change from 31
132+
32 change from 32
133+
# Cleanup
134+
[connection master]
135+
DROP TABLE test_multi_server_id;
136+
include/rpl_end.inc
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
!include suite/ndb_rpl/my.cnf
2+
3+
[mysqld.1.1]
4+
log-replica-updates=ON
5+
6+
[mysqld.1.slave]
7+
replica-parallel-workers=2
8+
replica-transaction-retries=3
Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
--source include/have_debug.inc
2+
--source include/have_ndb.inc
3+
--source include/master-slave.inc
4+
5+
--echo ######################################################################
6+
--echo # BUG#37331118 Applier prepare error retry
7+
--echo #
8+
--echo # The replication applier normally retries temporary errors occurring
9+
--echo # while applying transactions. Such retry logic is not performed for
10+
--echo # transactions containing row events where the STMT_END_F flag is
11+
--echo # missing.
12+
--echo # For such events, the statement will instead be committed as an
13+
--echo # additional step while applying the subsequent COMMIT query event when
14+
--echo # it is detected that there are still locked tables. When committing the
15+
--echo # statement, temporary errors are not handled properly.
16+
--echo #
17+
--echo # This patch reproduces the problem by writing an epoch trans with
18+
--echo # simulated writes from multiple server ids on the source. The replica
19+
--echo # then uses IGNORE_SERVER_IDS(<last_server_id_in_binlog>) to cause the
20+
--echo # STMT_END_F to be filtered away, thus activating the above-described
21+
--echo # code path in the applier. By holding a lock on one of the rows to be
22+
--echo # updated by the applier, error handling is triggered.
23+
--echo # For reference the test then also triggers error and retry handling
24+
--echo # when STMT_END_F has not been filtered away.
25+
--echo ######################################################################
26+
27+
--echo # Populate the source and replica
28+
--echo # - create table and populate with two rows, sync to replica
29+
30+
--source include/rpl_connection_master.inc
31+
CREATE TABLE test_multi_server_id (
32+
id INT PRIMARY KEY,
33+
what VARCHAR(128),
34+
epoch INT UNSIGNED
35+
) ENGINE = NDB;
36+
INSERT INTO test_multi_server_id
37+
VALUES (31, "not updated", 0), (32, "not updated", 0);
38+
39+
--echo # Source need log_replica_updates turned on
40+
show variables like 'log_replica_updates';
41+
42+
--source include/sync_slave_sql_with_master.inc
43+
44+
--echo # Replica need both workers and retries to be greater than 1
45+
show variables like 'replica_parallel_workers';
46+
show variables like 'replica_transaction_retries';
47+
48+
# Stop replication
49+
--source include/stop_slave.inc
50+
SELECT * FROM test_multi_server_id ORDER BY id;
51+
52+
--echo # Supress MTA errors generated by test
53+
call mtr.add_suppression(".*worker thread retried transaction [0-9] time.*");
54+
call mtr.add_suppression(".*Worker [0-9] failed executing transaction.*");
55+
call mtr.add_suppression(".*replica coordinator and worker threads are stopped.*");
56+
57+
--source include/rpl_connection_master.inc
58+
FLUSH LOGS;
59+
60+
# Trigger generation of at least one "multi server id" epoch trans
61+
SET @save_debug= @@GLOBAL.debug;
62+
SET @@GLOBAL.debug="+d,ndb_binlog_log_multi_server_id";
63+
64+
# Wait for generation of at least one epoch trans with multiple server ids
65+
--let $ROWS = 2
66+
while (`SELECT COUNT(*) < $ROWS FROM test_multi_server_id WHERE epoch != 0`) {
67+
sleep 0.1;
68+
}
69+
SET @@GLOBAL.debug= @save_debug;
70+
71+
# Show rows written by generator
72+
SELECT id, what FROM test_multi_server_id ORDER BY id;
73+
74+
--source suite/ndb/include/ndb_binlog_wait_own_changes.inc
75+
76+
# Show the generated binlog transaction(s)
77+
# SHOW BINLOG EVENTS IN 'master-bin.000002';
78+
79+
# Find last rows server_id in the first epoch trans of the binlog, i.e the
80+
# row which contains the STMT_END_F flag. This is the server_id which the
81+
# applier will filter away. This search is necessary since the order of rows
82+
# are not deterministic.
83+
let $row = 4; # Skip first rows, it can't be there
84+
let $last_server_id = 1;
85+
let $found = 0;
86+
let $query= SHOW BINLOG EVENTS IN 'master-bin.000002';
87+
while (!$found) {
88+
let $sid= query_get_value($query, Server_id, $row);
89+
let $row_next = $row;
90+
inc $row_next;
91+
let $info= query_get_value($query, Info, $row_next);
92+
if ($info == 'COMMIT') {
93+
let $last_server_id = $sid;
94+
let $found = 1;
95+
}
96+
inc $row;
97+
}
98+
#echo last_server_id: $last_server_id;
99+
100+
# Determine id of second last row (the one who will be locked)
101+
let $other_server_id = 32;
102+
if ($other_server_id == $last_server_id) {
103+
let $other_server_id = 31;
104+
}
105+
assert($other_server_id != $last_server_id);
106+
107+
# Use second slave connection to take a lock which will cause applier error
108+
# when starting replication again.
109+
--source include/rpl_connection_slave1.inc
110+
begin;
111+
--replace_result $other_server_id <other_server_id>
112+
eval SELECT * FROM test_multi_server_id WHERE id=$other_server_id FOR UPDATE;
113+
114+
--echo #
115+
--echo # A) Test error and retry with STMT_END_F filtered away
116+
--echo # - i.e statement commit from Query_log_event("COMMIT")
117+
--echo #
118+
--source include/rpl_connection_slave.inc
119+
--replace_result $last_server_id <last_server_id>
120+
eval CHANGE REPLICATION SOURCE TO
121+
IGNORE_SERVER_IDS = ($last_server_id);
122+
123+
--source include/start_slave.inc
124+
125+
--echo # Wait for replication error
126+
let $slave_sql_errno= 1180;
127+
--source include/wait_for_slave_sql_error.inc
128+
129+
--replace_regex /end_log_pos [0-9]*/end_log_pos NNN/ /Worker [0-9]* failed/Worker NNN failed/
130+
query_vertical
131+
SELECT LAST_ERROR_NUMBER, LAST_ERROR_MESSAGE,
132+
APPLYING_TRANSACTION_RETRIES_COUNT
133+
FROM performance_schema.replication_applier_status_by_worker
134+
WHERE LAST_ERROR_NUMBER != 0;
135+
--replace_regex /end_log_pos [0-9]*/end_log_pos NNN/ /Worker [0-9]* failed/Worker NNN failed/
136+
query_vertical
137+
SELECT LAST_ERROR_NUMBER, LAST_ERROR_MESSAGE
138+
FROM performance_schema.replication_applier_status_by_coordinator;
139+
140+
# Show that rows on replica has not been updated
141+
SELECT id, what FROM test_multi_server_id ORDER BY id;
142+
143+
144+
--echo #
145+
--echo # B) Test error and retry when STMT_END_F has NOT been filtered
146+
--echo # - i.e statement commit from Rows_log_event
147+
--echo #
148+
--source include/rpl_connection_slave.inc
149+
--source include/stop_slave.inc
150+
eval CHANGE REPLICATION SOURCE TO IGNORE_SERVER_IDS = ();
151+
152+
--source include/start_slave.inc
153+
154+
--echo # Wait for replication error
155+
let $slave_sql_errno= 1205;
156+
--source include/wait_for_slave_sql_error.inc
157+
158+
--replace_regex /end_log_pos [0-9]*/end_log_pos NNN/ /Worker [0-9]* failed/Worker NNN failed/
159+
query_vertical
160+
SELECT LAST_ERROR_NUMBER, LAST_ERROR_MESSAGE,
161+
APPLYING_TRANSACTION_RETRIES_COUNT
162+
FROM performance_schema.replication_applier_status_by_worker
163+
WHERE LAST_ERROR_NUMBER != 0;
164+
--replace_regex /end_log_pos [0-9]*/end_log_pos NNN/ /Worker [0-9]* failed/Worker NNN failed/
165+
query_vertical
166+
SELECT LAST_ERROR_NUMBER, LAST_ERROR_MESSAGE
167+
FROM performance_schema.replication_applier_status_by_coordinator;
168+
169+
# Show that rows on replica has not been updated
170+
SELECT id, what FROM test_multi_server_id ORDER BY id;
171+
172+
--source include/rpl_connection_slave1.inc
173+
--echo # Release lock
174+
commit;
175+
176+
--echo # Start replication again
177+
--source include/rpl_connection_slave.inc
178+
--source include/start_slave.inc
179+
# Wait until rows on replica has been updated
180+
while (`SELECT COUNT(*) < $ROWS FROM test_multi_server_id WHERE epoch != 0`) {
181+
sleep 0.1;
182+
}
183+
# Show that rows on replica has been updated
184+
SELECT id, what FROM test_multi_server_id ORDER BY id;
185+
186+
--echo # Cleanup
187+
--source include/rpl_connection_master.inc
188+
DROP TABLE test_multi_server_id;
189+
190+
--source include/rpl_end.inc

storage/ndb/plugin/ha_ndbcluster_binlog.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7859,6 +7859,9 @@ void Ndb_binlog_thread::do_run() {
78597859
DBUG_EXECUTE_IF("ndb_binlog_log_table_maps",
78607860
{ dbug_log_table_maps(i_ndb, current_epoch); });
78617861

7862+
DBUG_EXECUTE_IF("ndb_binlog_log_multi_server_id",
7863+
{ dbug_log_multi_server_id(i_ndb, current_epoch); });
7864+
78627865
DBUG_EXECUTE_IF("ndb_binlog_inject_incident", {
78637866
// Test rpl_injector function for writing incident to binlog
78647867
const std::string message{"Epoch: " + std::to_string(current_epoch)};

0 commit comments

Comments
 (0)