Skip to content

Commit 2dcef2e

Browse files
committed
Check subscription state in the wait_for_sync_event.
If none of the subscriptions for an origin => target pair is enabled, wait_for_sync_event may get stuck in an infinite loop. Add a check of the subscription state to the waiting loop to account for the fact that an apply worker may quietly die, deactivating its subscription. Also, handle this behaviour correctly in Z0DAN and include a TAP test.
1 parent 89bfb42 commit 2dcef2e

File tree

3 files changed

+71
-8
lines changed

3 files changed

+71
-8
lines changed

samples/Z0DAN/zodan.sql

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2351,8 +2351,7 @@ BEGIN
23512351
RAISE NOTICE ' OK: %', rpad('Waiting for sync event from ' || src_node_name || ' on new node ' || new_node_name || '...', 120, ' ');
23522352
EXCEPTION
23532353
WHEN OTHERS THEN
2354-
RAISE NOTICE ' ✗ %', rpad('Unable to wait for sync event from ' || src_node_name || ' on new node ' || new_node_name || ' (error: ' || SQLERRM || ')', 120, ' ');
2355-
RAISE;
2354+
RAISE EXCEPTION ' ✗ %', rpad('Unable to wait for sync event from ' || src_node_name || ' on new node ' || new_node_name || ' (error: ' || SQLERRM || ')', 120, ' ');
23562355
END;
23572356
END;
23582357
$$;

sql/spock--6.0.0-devel.sql

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -414,6 +414,17 @@ BEGIN
414414
target_id := node_id FROM spock.node_info();
415415

416416
WHILE true LOOP
417+
-- If an unresolvable issue occurs with the apply worker, the LR
418+
-- progress gets stuck, and we need to check the subscription's state
419+
-- carefully.
420+
IF NOT EXISTS (SELECT * FROM spock.subscription
421+
WHERE sub_origin = origin_id AND
422+
sub_target = target_id AND
423+
sub_enabled = true) THEN
424+
RAISE EXCEPTION 'Replication % => % does not have any enabled subscription yet',
425+
origin_id, target_id;
426+
END IF;
427+
417428
SELECT INTO progress_lsn remote_commit_lsn
418429
FROM spock.progress
419430
WHERE node_id = target_id AND remote_node_id = origin_id;
@@ -452,6 +463,17 @@ BEGIN
452463
target_id := node_id FROM spock.node_info();
453464

454465
WHILE true LOOP
466+
-- If an unresolvable issue occurs with the apply worker, the LR
467+
-- progress gets stuck, and we need to check the subscription's state
468+
-- carefully.
469+
IF NOT EXISTS (SELECT * FROM spock.subscription
470+
WHERE sub_origin = origin_id AND
471+
sub_target = target_id AND
472+
sub_enabled = true) THEN
473+
RAISE EXCEPTION 'Replication % => % does not have any enabled subscription yet',
474+
origin_id, target_id;
475+
END IF;
476+
455477
SELECT INTO progress_lsn remote_commit_lsn
456478
FROM spock.progress
457479
WHERE node_id = target_id AND remote_node_id = origin_id;

tests/tap/t/012_zodan_basics.pl

Lines changed: 48 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,6 @@
2121

2222
psql_or_bail(2, "SELECT spock.node_drop('n2')");
2323
psql_or_bail(3, "SELECT spock.node_drop('n3')");
24-
psql_or_bail(1, "CREATE EXTENSION snowflake");
25-
psql_or_bail(1, "CREATE EXTENSION lolor");
2624
psql_or_bail(1, "CREATE EXTENSION amcheck");
2725
psql_or_bail(2, "CREATE EXTENSION dblink");
2826
psql_or_bail(3, "CREATE EXTENSION dblink");
@@ -31,9 +29,15 @@
3129
psql_or_bail(1, "CREATE TABLE test(x serial PRIMARY KEY)");
3230
psql_or_bail(1, "INSERT INTO test DEFAULT VALUES");
3331

34-
print STDERR "All supporting stuff has been installed\n";
32+
print STDERR "All supporting stuff has been installed successfully\n";
3533

36-
print STDERR "Call Z0DAN: n2 => n1";
34+
# ##############################################################################
35+
#
36+
# Basic check that Z0DAN correctly adds a node to a single-node cluster
37+
#
38+
# ##############################################################################
39+
40+
print STDERR "Call Z0DAN: n2 => n1\n";
3741
psql_or_bail(2, "
3842
CALL spock.add_node(
3943
src_node_name := 'n1',
@@ -49,8 +53,13 @@
4953

5054
psql_or_bail(1, "SELECT spock.sub_disable('sub_n1_n2')");
5155

52-
print STDERR "Call Z0DAN: n3 => n2\n";
56+
# ##############################################################################
57+
#
58+
# Z0DAN rejects node addition if some subscriptions are disabled
59+
#
60+
# ##############################################################################
5361

62+
print STDERR "Call Z0DAN: n3 => n2\n";
5463
scalar_query(3, "
5564
CALL spock.add_node(
5665
src_node_name := 'n2',
@@ -63,7 +72,7 @@
6372
print STDERR "Z0DAN should fail because of a disabled subscription\n";
6473

6574
psql_or_bail(1, "SELECT spock.sub_enable('sub_n1_n2')");
66-
scalar_query(3, "
75+
psql_or_bail(3, "
6776
CALL spock.add_node(
6877
src_node_name := 'n2',
6978
src_dsn := 'host=$host dbname=$dbname port=$node_ports->[1] user=$db_user password=$db_password',
@@ -77,6 +86,39 @@
7786
ok($result eq '1', "Check state of the test table on N3 after the attachment");
7887
print STDERR "Z0DAN should add N3 to the cluster\n";
7988

89+
# ##############################################################################
90+
#
91+
# Test that Z0DAN does not add a node to the cluster if something goes wrong
92+
# during the SYNC process.
93+
#
94+
# ##############################################################################
95+
96+
# Remove node from the cluster and data leftovers.
97+
psql_or_bail(3, "\\i ../../samples/Z0DAN/zodremove.sql");
98+
psql_or_bail(3, "CALL spock.remove_node(target_node_name := 'n3',
99+
target_node_dsn := 'host=$host dbname=$dbname port=$node_ports->[2] user=$db_user password=$db_password',
100+
verbose_mode := true)");
101+
psql_or_bail(3, "DROP TABLE test");
102+
103+
psql_or_bail(1, "CREATE FUNCTION fake_fn() RETURNS integer LANGUAGE sql AS \$\$ SELECT 1\$\$");
104+
psql_or_bail(3, "CREATE FUNCTION fake_fn() RETURNS integer LANGUAGE sql AS \$\$ SELECT 1\$\$");
105+
scalar_query(3, "
106+
CALL spock.add_node(
107+
src_node_name := 'n2',
108+
src_dsn := 'host=$host dbname=$dbname port=$node_ports->[1] user=$db_user password=$db_password',
109+
new_node_name := 'n3', new_node_dsn := 'host=$host dbname=$dbname port=$node_ports->[2] user=$db_user password=$db_password',
110+
verb := true)");
111+
112+
# TODO:
113+
# It seems that add_node leaves remnants behind after a failed execution.
114+
# This happens because some intermediate results have already been committed.
115+
# It would be better to keep the remote transaction open until the end of the
116+
# operation, or to simply remove these remnants at the end, emulating a
117+
# distributed transaction.
118+
#
119+
# $result = scalar_query(3, "SELECT count(*) FROM spock.local_node");
120+
# ok($result eq '0', "N3 is not in the cluster");
121+
80122
# Clean up
81123
destroy_cluster('Destroy test cluster');
82124
done_testing();

0 commit comments

Comments
 (0)