Skip to content

Commit 871f1eb

Browse files
committed
Check subscription state in the wait_for_sync_event.
If no group's subscription is enabled, wait_for_sync_event may get stuck in an infinite loop. Add a check of the subscription state in the waiting loop to reflect the fact that an apply worker may quietly die, deactivating its subscription. Also, add the correct processing of this behaviour to the Z0DAN and include a TAP test. XXX: This commit highlights the fact that in case of an unsuccessful add_node call, there are some database objects left.
1 parent 33388a3 commit 871f1eb

File tree

3 files changed

+84
-10
lines changed

3 files changed

+84
-10
lines changed

samples/Z0DAN/zodan.sql

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2333,8 +2333,7 @@ BEGIN
23332333
RAISE NOTICE ' OK: %', rpad('Waiting for sync event from ' || src_node_name || ' on new node ' || new_node_name || '...', 120, ' ');
23342334
EXCEPTION
23352335
WHEN OTHERS THEN
2336-
RAISE NOTICE ' ✗ %', rpad('Unable to wait for sync event from ' || src_node_name || ' on new node ' || new_node_name || ' (error: ' || SQLERRM || ')', 120, ' ');
2337-
RAISE;
2336+
RAISE EXCEPTION ' ✗ %', rpad('Unable to wait for sync event from ' || src_node_name || ' on new node ' || new_node_name || ' (error: ' || SQLERRM || ')', 120, ' ');
23382337
END;
23392338
END;
23402339
$$;

sql/spock--6.0.0-devel.sql

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -385,8 +385,12 @@ RETURNS void RETURNS NULL ON NULL INPUT VOLATILE LANGUAGE c AS 'MODULE_PATHNAME'
385385
CREATE FUNCTION spock.sync_event()
386386
RETURNS pg_lsn RETURNS NULL ON NULL INPUT VOLATILE LANGUAGE c AS 'MODULE_PATHNAME', 'spock_create_sync_event';
387387

388-
CREATE PROCEDURE spock.wait_for_sync_event(OUT result bool, origin_id oid, lsn pg_lsn, timeout int DEFAULT 0)
389-
AS $$
388+
CREATE PROCEDURE spock.wait_for_sync_event(
389+
OUT result bool,
390+
origin_id oid,
391+
lsn pg_lsn,
392+
timeout int DEFAULT 0
393+
) AS $$
390394
DECLARE
391395
target_id oid;
392396
elapsed_time numeric := 0;
@@ -398,6 +402,17 @@ BEGIN
398402
target_id := node_id FROM spock.node_info();
399403

400404
WHILE true LOOP
405+
-- If an unresolvable issue occurs with the apply worker, the LR
406+
-- progress gets stuck, and we need to check the subscription's state
407+
-- carefully.
408+
IF NOT EXISTS (SELECT * FROM spock.subscription
409+
WHERE sub_origin = origin_id AND
410+
sub_target = target_id AND
411+
sub_enabled = true) THEN
412+
RAISE EXCEPTION 'Replication % => % does not have any enabled subscription yet',
413+
origin_id, target_id;
414+
END IF;
415+
401416
SELECT INTO progress_lsn remote_lsn
402417
FROM spock.progress
403418
WHERE node_id = target_id AND remote_node_id = origin_id;
@@ -417,8 +432,12 @@ BEGIN
417432
END;
418433
$$ LANGUAGE plpgsql;
419434

420-
CREATE PROCEDURE spock.wait_for_sync_event(OUT result bool, origin name, lsn pg_lsn, timeout int DEFAULT 0)
421-
AS $$
435+
CREATE PROCEDURE spock.wait_for_sync_event(
436+
OUT result bool,
437+
origin name,
438+
lsn pg_lsn,
439+
timeout int DEFAULT 0
440+
) AS $$
422441
DECLARE
423442
origin_id oid;
424443
target_id oid;
@@ -432,6 +451,17 @@ BEGIN
432451
target_id := node_id FROM spock.node_info();
433452

434453
WHILE true LOOP
454+
-- If an unresolvable issue occurs with the apply worker, the LR
455+
-- progress gets stuck, and we need to check the subscription's state
456+
-- carefully.
457+
IF NOT EXISTS (SELECT * FROM spock.subscription
458+
WHERE sub_origin = origin_id AND
459+
sub_target = target_id AND
460+
sub_enabled = true) THEN
461+
RAISE EXCEPTION 'Replication % => % does not have any enabled subscription yet',
462+
origin_id, target_id;
463+
END IF;
464+
435465
SELECT INTO progress_lsn remote_lsn
436466
FROM spock.progress
437467
WHERE node_id = target_id AND remote_node_id = origin_id;

tests/tap/t/012_zodan_basics.pl

Lines changed: 49 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,15 @@
3131
psql_or_bail(1, "CREATE TABLE test(x serial PRIMARY KEY)");
3232
psql_or_bail(1, "INSERT INTO test DEFAULT VALUES");
3333

34-
print STDERR "All supporting stuff has been installed\n";
34+
print STDERR "All supporting stuff has been installed successfully\n";
3535

36-
print STDERR "Call Z0DAN: n2 => n1";
36+
# ##############################################################################
37+
#
38+
# Basic check that Z0DAN correctly add node to the single-node cluster
39+
#
40+
# ##############################################################################
41+
42+
print STDERR "Call Z0DAN: n2 => n1\n";
3743
psql_or_bail(2, "
3844
CALL spock.add_node(
3945
src_node_name := 'n1',
@@ -49,8 +55,13 @@
4955

5056
psql_or_bail(1, "SELECT spock.sub_disable('sub_n1_n2')");
5157

52-
print STDERR "Call Z0DAN: n3 => n2\n";
58+
# ##############################################################################
59+
#
60+
# Z0DAN reject node addition if some subscriptions are disabled
61+
#
62+
# ##############################################################################
5363

64+
print STDERR "Call Z0DAN: n3 => n2\n";
5465
scalar_query(3, "
5566
CALL spock.add_node(
5667
src_node_name := 'n2',
@@ -63,7 +74,7 @@
6374
print STDERR "Z0DAN should fail because of a disabled subscription\n";
6475

6576
psql_or_bail(1, "SELECT spock.sub_enable('sub_n1_n2')");
66-
scalar_query(3, "
77+
psql_or_bail(3, "
6778
CALL spock.add_node(
6879
src_node_name := 'n2',
6980
src_dsn := 'host=$host dbname=$dbname port=$node_ports->[1] user=$db_user password=$db_password',
@@ -77,6 +88,40 @@
7788
ok($result eq '1', "Check state of the test table on N3 after the attachment");
7889
print STDERR "Z0DAN should add N3 to the cluster\n";
7990

91+
# ##############################################################################
92+
#
93+
# Test that Z0DAN correctly doesn't add node to the cluster if something happens
94+
# during the SYNC process.
95+
#
96+
# ##############################################################################
97+
98+
# Remove node from the cluster and data leftovers.
99+
psql_or_bail(3, "\\i ../../samples/Z0DAN/zodremove.sql");
100+
psql_or_bail(3, "CALL spock.remove_node(target_node_name := 'n3',
101+
target_node_dsn := 'host=$host dbname=$dbname port=$node_ports->[2] user=$db_user password=$db_password',
102+
verbose_mode := true)");
103+
psql_or_bail(3, "DROP TABLE test");
104+
psql_or_bail(3, "DROP EXTENSION lolor");
105+
106+
psql_or_bail(1, "CREATE FUNCTION fake_fn() RETURNS integer LANGUAGE sql AS \$\$ SELECT 1\$\$");
107+
psql_or_bail(3, "CREATE FUNCTION fake_fn() RETURNS integer LANGUAGE sql AS \$\$ SELECT 1\$\$");
108+
scalar_query(3, "
109+
CALL spock.add_node(
110+
src_node_name := 'n2',
111+
src_dsn := 'host=$host dbname=$dbname port=$node_ports->[1] user=$db_user password=$db_password',
112+
new_node_name := 'n3', new_node_dsn := 'host=$host dbname=$dbname port=$node_ports->[2] user=$db_user password=$db_password',
113+
verb := true)");
114+
115+
# TODO:
116+
# It seems that add_node keeps remnants after unsuccessful execution. It is
117+
# happened because we have commited some intermediate results before.
118+
# It would be better to keep remote transaction opened until the end of the
119+
# operation or just remove these remnants at the end pretending to be a
120+
# distributed transaction.
121+
#
122+
# $result = scalar_query(3, "SELECT count(*) FROM spock.local_node");
123+
# ok($result eq '0', "N3 is not in the cluster");
124+
80125
# Clean up
81126
destroy_cluster('Destroy test cluster');
82127
done_testing();

0 commit comments

Comments
 (0)