Skip to content

Commit e6dd6f2

Browse files
committed
Check subscription state in the wait_for_sync_event.
If no group's subscription is enabled, wait_for_sync_event may get stuck in an infinite loop. Add a check of the subscription state in the waiting loop to reflect the fact that an apply worker may quietly die, deactivating its subscription. Also, add the correct processing of this behaviour to the Z0DAN and include a TAP test. XXX: This commit highlights the fact that in case of an unsuccessful add_node call, there are some database objects left.
1 parent 25e8f33 commit e6dd6f2

File tree

3 files changed

+84
-10
lines changed

3 files changed

+84
-10
lines changed

samples/Z0DAN/zodan.sql

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2351,8 +2351,7 @@ BEGIN
23512351
RAISE NOTICE ' OK: %', rpad('Waiting for sync event from ' || src_node_name || ' on new node ' || new_node_name || '...', 120, ' ');
23522352
EXCEPTION
23532353
WHEN OTHERS THEN
2354-
RAISE NOTICE ' ✗ %', rpad('Unable to wait for sync event from ' || src_node_name || ' on new node ' || new_node_name || ' (error: ' || SQLERRM || ')', 120, ' ');
2355-
RAISE;
2354+
RAISE EXCEPTION ' ✗ %', rpad('Unable to wait for sync event from ' || src_node_name || ' on new node ' || new_node_name || ' (error: ' || SQLERRM || ')', 120, ' ');
23562355
END;
23572356
END;
23582357
$$;

sql/spock--6.0.0-devel.sql

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -383,8 +383,12 @@ RETURNS void RETURNS NULL ON NULL INPUT VOLATILE LANGUAGE c AS 'MODULE_PATHNAME'
383383
CREATE FUNCTION spock.sync_event()
384384
RETURNS pg_lsn RETURNS NULL ON NULL INPUT VOLATILE LANGUAGE c AS 'MODULE_PATHNAME', 'spock_create_sync_event';
385385

386-
CREATE PROCEDURE spock.wait_for_sync_event(OUT result bool, origin_id oid, lsn pg_lsn, timeout int DEFAULT 0)
387-
AS $$
386+
CREATE PROCEDURE spock.wait_for_sync_event(
387+
OUT result bool,
388+
origin_id oid,
389+
lsn pg_lsn,
390+
timeout int DEFAULT 0
391+
) AS $$
388392
DECLARE
389393
target_id oid;
390394
elapsed_time numeric := 0;
@@ -396,6 +400,17 @@ BEGIN
396400
target_id := node_id FROM spock.node_info();
397401

398402
WHILE true LOOP
403+
-- If an unresolvable issue occurs with the apply worker, the LR
404+
-- progress gets stuck, and we need to check the subscription's state
405+
-- carefully.
406+
IF NOT EXISTS (SELECT * FROM spock.subscription
407+
WHERE sub_origin = origin_id AND
408+
sub_target = target_id AND
409+
sub_enabled = true) THEN
410+
RAISE EXCEPTION 'Replication % => % does not have any enabled subscription yet',
411+
origin_id, target_id;
412+
END IF;
413+
399414
SELECT INTO progress_lsn remote_commit_lsn
400415
FROM spock.progress
401416
WHERE node_id = target_id AND remote_node_id = origin_id;
@@ -415,8 +430,12 @@ BEGIN
415430
END;
416431
$$ LANGUAGE plpgsql;
417432

418-
CREATE PROCEDURE spock.wait_for_sync_event(OUT result bool, origin name, lsn pg_lsn, timeout int DEFAULT 0)
419-
AS $$
433+
CREATE PROCEDURE spock.wait_for_sync_event(
434+
OUT result bool,
435+
origin name,
436+
lsn pg_lsn,
437+
timeout int DEFAULT 0
438+
) AS $$
420439
DECLARE
421440
origin_id oid;
422441
target_id oid;
@@ -430,6 +449,17 @@ BEGIN
430449
target_id := node_id FROM spock.node_info();
431450

432451
WHILE true LOOP
452+
-- If an unresolvable issue occurs with the apply worker, the LR
453+
-- progress gets stuck, and we need to check the subscription's state
454+
-- carefully.
455+
IF NOT EXISTS (SELECT * FROM spock.subscription
456+
WHERE sub_origin = origin_id AND
457+
sub_target = target_id AND
458+
sub_enabled = true) THEN
459+
RAISE EXCEPTION 'Replication % => % does not have any enabled subscription yet',
460+
origin_id, target_id;
461+
END IF;
462+
433463
SELECT INTO progress_lsn remote_commit_lsn
434464
FROM spock.progress
435465
WHERE node_id = target_id AND remote_node_id = origin_id;

tests/tap/t/012_zodan_basics.pl

Lines changed: 49 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,15 @@
3131
psql_or_bail(1, "CREATE TABLE test(x serial PRIMARY KEY)");
3232
psql_or_bail(1, "INSERT INTO test DEFAULT VALUES");
3333

34-
print STDERR "All supporting stuff has been installed\n";
34+
print STDERR "All supporting stuff has been installed successfully\n";
3535

36-
print STDERR "Call Z0DAN: n2 => n1";
36+
# ##############################################################################
37+
#
38+
# Basic check that Z0DAN correctly add node to the single-node cluster
39+
#
40+
# ##############################################################################
41+
42+
print STDERR "Call Z0DAN: n2 => n1\n";
3743
psql_or_bail(2, "
3844
CALL spock.add_node(
3945
src_node_name := 'n1',
@@ -49,8 +55,13 @@
4955

5056
psql_or_bail(1, "SELECT spock.sub_disable('sub_n1_n2')");
5157

52-
print STDERR "Call Z0DAN: n3 => n2\n";
58+
# ##############################################################################
59+
#
60+
# Z0DAN reject node addition if some subscriptions are disabled
61+
#
62+
# ##############################################################################
5363

64+
print STDERR "Call Z0DAN: n3 => n2\n";
5465
scalar_query(3, "
5566
CALL spock.add_node(
5667
src_node_name := 'n2',
@@ -63,7 +74,7 @@
6374
print STDERR "Z0DAN should fail because of a disabled subscription\n";
6475

6576
psql_or_bail(1, "SELECT spock.sub_enable('sub_n1_n2')");
66-
scalar_query(3, "
77+
psql_or_bail(3, "
6778
CALL spock.add_node(
6879
src_node_name := 'n2',
6980
src_dsn := 'host=$host dbname=$dbname port=$node_ports->[1] user=$db_user password=$db_password',
@@ -77,6 +88,40 @@
7788
ok($result eq '1', "Check state of the test table on N3 after the attachment");
7889
print STDERR "Z0DAN should add N3 to the cluster\n";
7990

91+
# ##############################################################################
92+
#
93+
# Test that Z0DAN correctly doesn't add node to the cluster if something happens
94+
# during the SYNC process.
95+
#
96+
# ##############################################################################
97+
98+
# Remove node from the cluster and data leftovers.
99+
psql_or_bail(3, "\\i ../../samples/Z0DAN/zodremove.sql");
100+
psql_or_bail(3, "CALL spock.remove_node(target_node_name := 'n3',
101+
target_node_dsn := 'host=$host dbname=$dbname port=$node_ports->[2] user=$db_user password=$db_password',
102+
verbose_mode := true)");
103+
psql_or_bail(3, "DROP TABLE test");
104+
psql_or_bail(3, "DROP EXTENSION lolor");
105+
106+
psql_or_bail(1, "CREATE FUNCTION fake_fn() RETURNS integer LANGUAGE sql AS \$\$ SELECT 1\$\$");
107+
psql_or_bail(3, "CREATE FUNCTION fake_fn() RETURNS integer LANGUAGE sql AS \$\$ SELECT 1\$\$");
108+
scalar_query(3, "
109+
CALL spock.add_node(
110+
src_node_name := 'n2',
111+
src_dsn := 'host=$host dbname=$dbname port=$node_ports->[1] user=$db_user password=$db_password',
112+
new_node_name := 'n3', new_node_dsn := 'host=$host dbname=$dbname port=$node_ports->[2] user=$db_user password=$db_password',
113+
verb := true)");
114+
115+
# TODO:
116+
# It seems that add_node keeps remnants after unsuccessful execution. It is
117+
# happened because we have commited some intermediate results before.
118+
# It would be better to keep remote transaction opened until the end of the
119+
# operation or just remove these remnants at the end pretending to be a
120+
# distributed transaction.
121+
#
122+
# $result = scalar_query(3, "SELECT count(*) FROM spock.local_node");
123+
# ok($result eq '0', "N3 is not in the cluster");
124+
80125
# Clean up
81126
destroy_cluster('Destroy test cluster');
82127
done_testing();

0 commit comments

Comments
 (0)