Skip to content

Commit 21bde5e

Browse files
committed
Improve the 011_zodan_sync_third TAP test.
There are some corner cases that, at least for now, require involving both the publisher and the subscriber when we try to determine whether all the data has been replicated (i.e. synchronisation has finished). So, rewrite the sync code to use sync_event/wait calls. It seems that if we improve the wait functions a little, they may serve multiple purposes. For example, we may add sync_event to the wait function, replace integer timeouts with the Interval type, and introduce a 'verbose' mode. And yes, rewrite the wait routine as a function.
1 parent c799a70 commit 21bde5e

File tree

2 files changed

+27
-36
lines changed

2 files changed

+27
-36
lines changed

.github/workflows/zodan_sync.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# Keep its logic in sync with spockbench.yml!
22

33
name: Z0DAN Sync long-lasting test
4+
run-name: Add new node into highly loaded multi-master configuration
45
on:
56
workflow_dispatch:
67
schedule:

tests/tap/t/011_zodan_sync_third.pl

Lines changed: 26 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
create_cluster(3, 'Create initial 2-node Spock test cluster');
2525

2626

27-
my ($ret1, $ret2, $ret3);
27+
my ($ret1, $ret2, $ret3, $lsn1, $lsn2, $lsn3);
2828

2929
# Get cluster configuration
3030
my $config = get_test_config();
@@ -264,7 +264,7 @@
264264
$alive = kill 0, $pid;
265265
ok($alive eq 1, "pgbench load to N2 still exists");
266266

267-
print STDERR "Kill pgbench process to reduce test time";
267+
print STDERR "Kill pgbench process to reduce test time\n";
268268
$pgbench_handle1->pump();
269269
$pgbench_handle2->pump();
270270
$pgbench_handle1->kill_kill;
@@ -363,40 +363,30 @@
363363
print STDERR $pgbench_stdout2;
364364
print STDERR "##### end of output #####\n";
365365

366-
# We need such a trick: the wait_slot_confirm_lsn routine gets Last Committed
367-
# LSN position and waits for the confirmations on the remote side. But if there
368-
# a conflict has happened, feedback will not be sent and we will wait forever.
369-
psql_or_bail(1, "SELECT spock.sync_event()");
370-
psql_or_bail(2, "SELECT spock.sync_event()");
371-
psql_or_bail(3, "SELECT spock.sync_event()");
372-
373-
psql_or_bail(1, 'SELECT spock.wait_slot_confirm_lsn(NULL, NULL)');
374-
psql_or_bail(2, 'SELECT spock.wait_slot_confirm_lsn(NULL, NULL)');
375-
376-
print STDERR "Wait for the end of N3->N1, N3->N2 decoding process that means the actual start of LR\n";
377-
psql_or_bail(3, 'SELECT spock.wait_slot_confirm_lsn(NULL, NULL)');
378-
379-
print STDERR "Wait until the end of replication ..\n";
380-
$lag = scalar_query(1, "SELECT * FROM wait_subscription(remote_node_name := 'n2',
381-
report_it := true,
382-
timeout := '10 minutes',
383-
delay := 1.)");
384-
ok($lag <= 0, "Replication N2 => N1 has been finished successfully");
385-
$lag = scalar_query(2, "SELECT * FROM wait_subscription(remote_node_name := 'n1',
386-
report_it := true,
387-
timeout := '10 minutes',
388-
delay := 1.)");
389-
ok($lag <= 0, "Replication N1 => N2 has been finished successfully");
390-
$lag = scalar_query(3, "SELECT * FROM wait_subscription(remote_node_name := 'n1',
391-
report_it := true,
392-
timeout := '10 minutes',
393-
delay := 1.)");
394-
ok($lag <= 0, "Replication N1 => N3 has been finished successfully");
395-
$lag = scalar_query(3, "SELECT * FROM wait_subscription(remote_node_name := 'n2',
396-
report_it := true,
397-
timeout := '10 minutes',
398-
delay := 1.)");
399-
ok($lag <= 0, "Replication N2 => N3 has been finished successfully");
366+
#
367+
# Wait for the end of apply process
368+
#
369+
print STDERR "Wait for the end of LR caused by the pgbench load\n";
370+
$lsn1 = scalar_query(1, "SELECT spock.sync_event()");
371+
$lsn2 = scalar_query(2, "SELECT spock.sync_event()");
372+
$lsn3 = scalar_query(3, "SELECT spock.sync_event()");
373+
print STDERR "DEBUGGING. LSNs: N1: $lsn1, N2: $lsn2, N3: $lsn3\n";
374+
375+
print STDERR "Wait for the N2 -> N1 sync message ...\n";
376+
psql_or_bail(1, "CALL spock.wait_for_sync_event(true, 'n2', '$lsn2'::pg_lsn, 600)");
377+
print STDERR "Wait for the N1 -> N2 sync message ...\n";
378+
psql_or_bail(2, "CALL spock.wait_for_sync_event(true, 'n1', '$lsn1'::pg_lsn, 600)");
379+
print STDERR "Wait for the N1 -> N3 sync message ...\n";
380+
psql_or_bail(3, "CALL spock.wait_for_sync_event(true, 'n1', '$lsn1'::pg_lsn, 600)");
381+
print STDERR "Wait for the N2 -> N3 sync message ...\n";
382+
psql_or_bail(3, "CALL spock.wait_for_sync_event(true, 'n2', '$lsn2'::pg_lsn, 600)");
383+
print STDERR "LR messages from active nodes has arrived to the new one\n";
384+
385+
print STDERR "Wait for the N3 -> N1 sync message ...\n";
386+
psql_or_bail(1, "CALL spock.wait_for_sync_event(true, 'n3', '$lsn3'::pg_lsn, 600)");
387+
print STDERR "Wait for the N3 -> N2 sync message ...\n";
388+
psql_or_bail(2, "CALL spock.wait_for_sync_event(true, 'n3', '$lsn3'::pg_lsn, 600)");
389+
print STDERR "First LR transaction has arrived from new node to the active ones\n";
400390

401391
print STDERR "Check the data consistency.\n";
402392
$ret1 = scalar_query(1, "SELECT sum(abalance), sum(aid), count(*) FROM pgbench_accounts");

0 commit comments

Comments
 (0)