diff --git a/deps/rabbit/src/rabbit.erl b/deps/rabbit/src/rabbit.erl
index c213e13abd00..e0822dafe0bc 100644
--- a/deps/rabbit/src/rabbit.erl
+++ b/deps/rabbit/src/rabbit.erl
@@ -31,7 +31,10 @@
          base_product_version/0,
          motd_file/0,
          motd/0,
-         pg_local_scope/1]).
+         pg_local_scope/1,
+         pg_scope_amqp091_channel/0,
+         pg_scope_amqp091_connection/0,
+         pg_scope_non_amqp_connection/0]).
 %% For CLI, testing and mgmt-agent.
 -export([set_log_level/1, log_locations/0, config_files/0]).
 -export([is_booted/1, is_booted/0, is_booting/1, is_booting/0]).
@@ -40,7 +43,11 @@
 %% Boot steps.
 -export([update_cluster_tags/0, maybe_insert_default_data/0, boot_delegate/0, recover/0,
          pg_local_amqp_session/0,
-         pg_local_amqp_connection/0, prevent_startup_if_node_was_reset/0]).
+         pg_local_amqp_connection/0,
+         pg_local_amqp091_channel/0,
+         pg_local_amqp091_connection/0,
+         pg_local_non_amqp_connection/0,
+         prevent_startup_if_node_was_reset/0]).
 
 -rabbit_boot_step({pre_boot, [{description, "rabbit boot start"}]}).
 
@@ -292,11 +299,29 @@
                     {enables, core_initialized}]}).
 
 -rabbit_boot_step({pg_local_amqp_connection,
-                   [{description, "local-only pg scope for AMQP connections"},
+                   [{description, "local-only pg scope for AMQP 1.0 connections"},
                     {mfa, {rabbit, pg_local_amqp_connection, []}},
                     {requires, kernel_ready},
                     {enables, core_initialized}]}).
 
+-rabbit_boot_step({pg_local_amqp091_channel,
+                   [{description, "local-only pg scope for AMQP 0-9-1 channels"},
+                    {mfa, {rabbit, pg_local_amqp091_channel, []}},
+                    {requires, kernel_ready},
+                    {enables, core_initialized}]}).
+
+-rabbit_boot_step({pg_local_amqp091_connection,
+                   [{description, "local-only pg scope for AMQP 0-9-1 connections"},
+                    {mfa, {rabbit, pg_local_amqp091_connection, []}},
+                    {requires, kernel_ready},
+                    {enables, core_initialized}]}).
+
+-rabbit_boot_step({pg_local_non_amqp_connection,
+                   [{description, "local-only pg scope for non-AMQP connections"},
+                    {mfa, {rabbit, pg_local_non_amqp_connection, []}},
+                    {requires, kernel_ready},
+                    {enables, core_initialized}]}).
+
 %%---------------------------------------------------------------------------
 
 -include_lib("rabbit_common/include/rabbit.hrl").
@@ -1154,9 +1179,41 @@ pg_local_amqp_connection() ->
     PgScope = pg_local_scope(amqp_connection),
     rabbit_sup:start_child(pg_amqp_connection, pg, [PgScope]).
 
+%% The three boot steps below each start a pg scope whose name embeds this
+%% node's name (see pg_local_scope/1) and store that name in persistent_term.
+pg_local_amqp091_channel() ->
+    PgScope = pg_local_scope(amqp091_channel),
+    persistent_term:put(pg_scope_amqp091_channel, PgScope),
+    rabbit_sup:start_child(pg_amqp091_channel, pg, [PgScope]).
+
+pg_local_amqp091_connection() ->
+    PgScope = pg_local_scope(amqp091_connection),
+    persistent_term:put(pg_scope_amqp091_connection, PgScope),
+    rabbit_sup:start_child(pg_amqp091_connection, pg, [PgScope]).
+
+pg_local_non_amqp_connection() ->
+    PgScope = pg_local_scope(non_amqp_connection),
+    persistent_term:put(pg_scope_non_amqp_connection, PgScope),
+    rabbit_sup:start_child(pg_non_amqp_connection, pg, [PgScope]).
+
 pg_local_scope(Prefix) ->
     list_to_atom(io_lib:format("~s_~s", [Prefix, node()])).
 
+%% Accessors for the scope names stored by the pg_local_* boot steps.
+%% persistent_term:get/1 raises badarg for a missing key, so these crash if
+%% they are called before the matching boot step has run.
+-spec pg_scope_amqp091_channel() -> atom().
+pg_scope_amqp091_channel() ->
+    persistent_term:get(pg_scope_amqp091_channel).
+
+-spec pg_scope_amqp091_connection() -> atom().
+pg_scope_amqp091_connection() ->
+    persistent_term:get(pg_scope_amqp091_connection).
+
+-spec pg_scope_non_amqp_connection() -> atom().
+pg_scope_non_amqp_connection() ->
+    persistent_term:get(pg_scope_non_amqp_connection).
+
 -spec update_cluster_tags() -> 'ok'.
 
 update_cluster_tags() ->
diff --git a/deps/rabbit/src/rabbit_channel.erl b/deps/rabbit/src/rabbit_channel.erl
index 3c7c865fcb00..27b132e6ee7a 100644
--- a/deps/rabbit/src/rabbit_channel.erl
+++ b/deps/rabbit/src/rabbit_channel.erl
@@ -304,9 +304,9 @@ send_command(Pid, Msg) ->
 %% Delete this function when feature flag rabbitmq_4.2.0 becomes required.
 -spec deliver_reply_local(pid(), binary(), mc:state()) -> ok.
 deliver_reply_local(Pid, Key, Message) ->
-    case pg_local:in_group(rabbit_channels, Pid) of
-        true -> gen_server2:cast(Pid, {deliver_reply, Key, Message});
-        false -> ok
+    case pg:get_local_members(pg_scope(), Pid) of
+        [] -> ok;
+        _ -> gen_server2:cast(Pid, {deliver_reply, Key, Message})
     end.
 
 -spec list() -> [pid()].
@@ -318,7 +318,9 @@ list() ->
 
 -spec list_local() -> [pid()].
 list_local() ->
-    pg_local:get_members(rabbit_channels).
+    try pg:which_groups(pg_scope())
+    catch error:badarg -> []
+    end.
 
 -spec info_keys() -> rabbit_types:info_keys().
 
@@ -436,6 +438,10 @@ update_user_state(Pid, UserState) when is_pid(Pid) ->
 
 %%---------------------------------------------------------------------------
 
+-spec pg_scope() -> atom().
+pg_scope() ->
+    rabbit:pg_scope_amqp091_channel().
+
 init([Channel, ReaderPid, WriterPid, ConnPid, ConnName, Protocol, User, VHost,
       Capabilities, CollectorPid, LimiterPid, AmqpParams]) ->
     process_flag(trap_exit, true),
@@ -444,7 +450,7 @@ init([Channel, ReaderPid, WriterPid, ConnPid, ConnName, Protocol, User, VHost,
 
     ?LG_PROCESS_TYPE(channel),
     ?store_proc_name({ConnName, Channel}),
-    ok = pg_local:join(rabbit_channels, self()),
+    ok = pg:join(pg_scope(), self(), self()),
     Flow = case rabbit_misc:get_env(rabbit, classic_queue_flow_control, true) of
                true -> flow;
                false -> noflow
@@ -783,7 +789,7 @@ terminate(_Reason,
                  queue_states = QueueCtxs}) ->
     rabbit_queue_type:close(QueueCtxs),
     {_Res, _State1} = notify_queues(State),
-    pg_local:leave(rabbit_channels, self()),
+    pg:leave(pg_scope(), self(), self()),
     rabbit_event:if_enabled(State, #ch.stats_timer,
                             fun() -> emit_stats(State) end),
     [delete_stats(Tag) || {Tag, _} <- get()],
diff --git a/deps/rabbit/src/rabbit_networking.erl b/deps/rabbit/src/rabbit_networking.erl
index 23445a3ddc1a..b971e6f0527f 100644
--- a/deps/rabbit/src/rabbit_networking.erl
+++ b/deps/rabbit/src/rabbit_networking.erl
@@ -461,13 +461,18 @@ node_client_listeners(Node) ->
                          end, Xs)
     end.
 
+pg_scope_amqp091_connection() ->
+    rabbit:pg_scope_amqp091_connection().
+
 -spec register_connection(pid()) -> ok.
 
-register_connection(Pid) -> pg_local:join(rabbit_connections, Pid).
+register_connection(Pid) ->
+    pg:join(pg_scope_amqp091_connection(), Pid, Pid).
 
 -spec unregister_connection(pid()) -> ok.
 
-unregister_connection(Pid) -> pg_local:leave(rabbit_connections, Pid).
+unregister_connection(Pid) ->
+    pg:leave(pg_scope_amqp091_connection(), Pid, Pid).
 
 -spec connections() -> [rabbit_types:connection()].
 connections() ->
@@ -476,17 +481,17 @@ connections() ->
 
 -spec local_connections() -> [rabbit_types:connection()].
 local_connections() ->
-    Amqp091Pids = pg_local:get_members(rabbit_connections),
+    Amqp091Pids = pg:which_groups(pg_scope_amqp091_connection()),
     Amqp10Pids = rabbit_amqp1_0:list_local(),
     Amqp10Pids ++ Amqp091Pids.
 
 -spec register_non_amqp_connection(pid()) -> ok.
 
-register_non_amqp_connection(Pid) -> pg_local:join(rabbit_non_amqp_connections, Pid).
+register_non_amqp_connection(Pid) -> pg:join(rabbit:pg_scope_non_amqp_connection(), Pid, Pid).
 
 -spec unregister_non_amqp_connection(pid()) -> ok.
 
-unregister_non_amqp_connection(Pid) -> pg_local:leave(rabbit_non_amqp_connections, Pid).
+unregister_non_amqp_connection(Pid) -> pg:leave(rabbit:pg_scope_non_amqp_connection(), Pid, Pid).
 
 -spec non_amqp_connections() -> [rabbit_types:connection()].
 
@@ -496,7 +501,9 @@ non_amqp_connections() ->
 
 -spec local_non_amqp_connections() -> [rabbit_types:connection()].
 local_non_amqp_connections() ->
-    pg_local:get_members(rabbit_non_amqp_connections).
+    %% Connections register themselves as a pg group named after their own
+    %% pid, so the group names in this scope are the connection pids.
+    pg:which_groups(rabbit:pg_scope_non_amqp_connection()).
 
 -spec connection_info(rabbit_types:connection(), rabbit_types:info_keys()) ->
           rabbit_types:infos().
diff --git a/deps/rabbit/src/rabbit_volatile_queue.erl b/deps/rabbit/src/rabbit_volatile_queue.erl
index 506838a9c340..083018241672 100644
--- a/deps/rabbit/src/rabbit_volatile_queue.erl
+++ b/deps/rabbit/src/rabbit_volatile_queue.erl
@@ -235,7 +235,7 @@ local_call(Pid, Request) ->
 
 is_local(Pid) ->
     rabbit_amqp_session:is_local(Pid) orelse
-        pg_local:in_group(rabbit_channels, Pid).
+        [] =/= pg:get_local_members(rabbit:pg_scope_amqp091_channel(), Pid).
 
 handle_event(QName, {deliver, Msg}, #?STATE{name = QName,
                                             ctag = Ctag,
diff --git a/deps/rabbit/test/proxy_protocol_SUITE.erl b/deps/rabbit/test/proxy_protocol_SUITE.erl
index a3abc23602e3..24abc5f91c31 100644
--- a/deps/rabbit/test/proxy_protocol_SUITE.erl
+++ b/deps/rabbit/test/proxy_protocol_SUITE.erl
@@ -110,8 +110,14 @@ proxy_protocol_v2_local(Config) ->
     ok.
 
 connection_name() ->
-    ?awaitMatch([_], pg_local:get_members(rabbit_connections), 30000),
-    [Pid] = pg_local:get_members(rabbit_connections),
+    PgScope = rabbit:pg_scope_amqp091_connection(),
+    LocalConnPids = fun() ->
+                            try pg:which_groups(PgScope) of
+                                Groups -> Groups
+                            catch error:badarg -> [] end
+                    end,
+    ?awaitMatch([_], LocalConnPids(), 30000),
+    [Pid] = LocalConnPids(),
     {dictionary, Dict} = process_info(Pid, dictionary),
     {process_name, {rabbit_reader, ConnectionName}} = lists:keyfind(process_name, 1, Dict),
     ConnectionName.
@@ -119,6 +125,10 @@ connection_name() ->
 wait_for_connection_close(Config) ->
     ?awaitMatch(
        [],
-       rabbit_ct_broker_helpers:rpc(
-         Config, 0, pg_local, get_members, [rabbit_connnections]),
+       begin
+           PgScope = rabbit_ct_broker_helpers:rpc(
+                       Config, 0, rabbit, pg_scope_amqp091_connection, []),
+           try rabbit_ct_broker_helpers:rpc(Config, 0, pg, which_groups, [PgScope])
+           catch error:badarg -> [] end
+       end,
        30000).
diff --git a/deps/rabbit/test/quorum_queue_SUITE.erl b/deps/rabbit/test/quorum_queue_SUITE.erl
index 16ca4f762932..0a059037ea88 100644
--- a/deps/rabbit/test/quorum_queue_SUITE.erl
+++ b/deps/rabbit/test/quorum_queue_SUITE.erl
@@ -2214,8 +2214,17 @@ cleanup_queue_state_on_channel_after_publish(Config) ->
     [NCh1, NCh2] = rpc:call(Server, rabbit_channel, list, []),
     %% Check the channel state contains the state for the quorum queue on
     %% channel 1 and 2
-    wait_for_cleanup(Server, NCh1, 0),
-    wait_for_cleanup(Server, NCh2, 1),
+    %% Note: rabbit_channel:list/0 doesn't guarantee order, so we need to identify
+    %% which channel has queue state
+    {ChWithoutState, ChWithState} = case length(rpc:call(Server,
+                                                         rabbit_channel,
+                                                         list_queue_states,
+                                                         [NCh1])) of
+                                        0 -> {NCh1, NCh2};
+                                        1 -> {NCh2, NCh1}
+                                    end,
+    wait_for_cleanup(Server, ChWithoutState, 0),
+    wait_for_cleanup(Server, ChWithState, 1),
     %% then delete the queue and wait for the process to terminate
     ?assertMatch(#'queue.delete_ok'{},
                  amqp_channel:call(Ch1, #'queue.delete'{queue = QQ})),
@@ -2225,8 +2234,8 @@ cleanup_queue_state_on_channel_after_publish(Config) ->
                                              [?SUPNAME]))
       end, 30000),
     %% Check that all queue states have been cleaned
-    wait_for_cleanup(Server, NCh2, 0),
-    wait_for_cleanup(Server, NCh1, 0).
+    wait_for_cleanup(Server, ChWithState, 0),
+    wait_for_cleanup(Server, ChWithoutState, 0).
 
 cleanup_queue_state_on_channel_after_subscribe(Config) ->
     %% Declare/delete the queue and publish in one channel, while consuming on a
diff --git a/deps/rabbit_common/src/rabbit_misc.erl b/deps/rabbit_common/src/rabbit_misc.erl
index f64d71d6c5c1..71fb44050530 100644
--- a/deps/rabbit_common/src/rabbit_misc.erl
+++ b/deps/rabbit_common/src/rabbit_misc.erl
@@ -1057,7 +1057,14 @@ otp_release() ->
     end.
 
 platform_and_version() ->
-    string:join(["Erlang/OTP", otp_release()], " ").
+    case persistent_term:get(platform_and_version, undefined) of
+        undefined ->
+            PV = string:join(["Erlang/OTP", otp_release()], " "),
+            persistent_term:put(platform_and_version, PV),
+            PV;
+        PV ->
+            PV
+    end.
 
 otp_system_version() ->
     string:strip(erlang:system_info(system_version), both, $\n).
diff --git a/deps/rabbit_common/src/rabbit_net.erl b/deps/rabbit_common/src/rabbit_net.erl
index d2a43431b471..33fd9b2122ca 100644
--- a/deps/rabbit_common/src/rabbit_net.erl
+++ b/deps/rabbit_common/src/rabbit_net.erl
@@ -237,12 +237,22 @@ tcp_host(IPAddress) ->
     end.
 
 hostname() ->
-    {ok, Hostname} = inet:gethostname(),
-    case inet:gethostbyname(Hostname) of
-        {ok, #hostent{h_name = Name}} -> Name;
-        {error, _Reason} -> Hostname
+    %% The resolved name is cached in persistent_term under a key scoped to
+    %% this module so it cannot clash with other persistent_term users.
+    %% NOTE(review): the gethostbyname/1 failure fallback is cached as well.
+    case persistent_term:get({?MODULE, hostname}, undefined) of
+        undefined ->
+            {ok, Hostname} = inet:gethostname(),
+            H = case inet:gethostbyname(Hostname) of
+                    {ok, #hostent{h_name = Name}} -> Name;
+                    {error, _Reason} -> Hostname
+                end,
+            persistent_term:put({?MODULE, hostname}, H),
+            H;
+        Cached ->
+            Cached
     end.
 
 format_nic_attribute({Key, undefined}) ->
     {Key, undefined};
 format_nic_attribute({Key = flags, List}) when is_list(List) ->
diff --git a/deps/rabbitmq_stream/src/rabbit_stream.erl b/deps/rabbitmq_stream/src/rabbit_stream.erl
index e1baceb657de..a0983130557f 100644
--- a/deps/rabbitmq_stream/src/rabbit_stream.erl
+++ b/deps/rabbitmq_stream/src/rabbit_stream.erl
@@ -22,8 +22,7 @@
          host/0,
          tls_host/0,
          port/0,
-         tls_port/0,
-         kill_connection/1]).
+         tls_port/0]).
 -export([stop/1]).
 -export([emit_connection_info_local/3,
          emit_connection_info_all/4,
@@ -132,20 +131,6 @@ tls_port_from_listener() ->
 stop(_State) ->
     ok.
 
-kill_connection(ConnectionName) ->
-    ConnectionNameBin = rabbit_data_coercion:to_binary(ConnectionName),
-    lists:foreach(fun(ConnectionPid) ->
-                     ConnectionPid !
{infos, self()}, - receive - {ConnectionPid, - #{<<"connection_name">> := ConnectionNameBin}} -> - exit(ConnectionPid, kill); - {ConnectionPid, _ClientProperties} -> ok - after 1000 -> ok - end - end, - pg_local:get_members(rabbit_stream_connections)). - emit_connection_info_all(Nodes, Items, Ref, AggregatorPid) -> Pids = [spawn_link(Node, diff --git a/deps/rabbitmq_stream/src/rabbit_stream_reader.erl b/deps/rabbitmq_stream/src/rabbit_stream_reader.erl index aac8bd4abd87..71b035f6c63a 100644 --- a/deps/rabbitmq_stream/src/rabbit_stream_reader.erl +++ b/deps/rabbitmq_stream/src/rabbit_stream_reader.erl @@ -470,10 +470,6 @@ transition_to_opened(Transport, Configuration, NewConnection, NewConnectionState) -> - % TODO remove registration to rabbit_stream_connections - % just meant to be able to close the connection remotely - % should be possible once the connections are available in ctl list_connections - pg_local:join(rabbit_stream_connections, self()), Connection1 = rabbit_event:init_stats_timer(NewConnection, #stream_connection.stats_timer), diff --git a/deps/rabbitmq_stream/test/rabbit_stream_SUITE_data/src/test/java/com/rabbitmq/stream/FailureTest.java b/deps/rabbitmq_stream/test/rabbit_stream_SUITE_data/src/test/java/com/rabbitmq/stream/FailureTest.java index 016da1f59789..ed270cad6770 100644 --- a/deps/rabbitmq_stream/test/rabbit_stream_SUITE_data/src/test/java/com/rabbitmq/stream/FailureTest.java +++ b/deps/rabbitmq_stream/test/rabbit_stream_SUITE_data/src/test/java/com/rabbitmq/stream/FailureTest.java @@ -429,6 +429,7 @@ void noLostConfirmedMessagesWhenLeaderGoesAway() throws Exception { @Test void consumerReattachesToOtherReplicaWhenReplicaGoesAway() throws Exception { + LOGGER.info("Stream name is {}", stream); executorService = Executors.newCachedThreadPool(); Client metadataClient = cf.get(new Client.ClientParameters().port(streamPortNode1())); Map metadata = metadataClient.metadata(stream); @@ -514,42 +515,64 @@ void 
consumerReattachesToOtherReplicaWhenReplicaGoesAway() throws Exception { CountDownLatch reconnectionLatch = new CountDownLatch(1); AtomicReference shutdownListenerReference = new AtomicReference<>(); + Runnable resubscribe = + () -> { + AtomicInteger newReplicaPort = new AtomicInteger(-1); + waitAtMost( + Duration.ofSeconds(5), + () -> { + try { + Client.StreamMetadata m = metadataClient.metadata(stream).get(stream); + newReplicaPort.set(m.getReplicas().get(0).getPort()); + LOGGER.info("Metadata: {}", m); + return true; + } catch (Exception e) { + return false; + } + }); + LOGGER.info("Replica port is {}", newReplicaPort); + + Client newConsumer = + cf.get( + new Client.ClientParameters() + .port(newReplicaPort.get()) + .shutdownListener(shutdownListenerReference.get()) + .chunkListener(credit()) + .messageListener(messageListener)); + + LOGGER.info("Subscribing..."); + newConsumer.subscribe( + (byte) 1, stream, OffsetSpecification.offset(lastProcessedOffset.get() + 1), 10); + LOGGER.info("Subscribed"); + + generation.incrementAndGet(); + reconnectionLatch.countDown(); + LOGGER.info("Shutdown listener done"); + }; Client.ShutdownListener shutdownListener = shutdownContext -> { + LOGGER.info("Shutdown reason: {}", shutdownContext.getShutdownReason()); if (shutdownContext.getShutdownReason() == Client.ShutdownContext.ShutdownReason.UNKNOWN) { // avoid long-running task in the IO thread executorService.submit( () -> { - AtomicInteger newReplicaPort = new AtomicInteger(-1); - waitAtMost( - Duration.ofSeconds(5), - () -> { - try { - Client.StreamMetadata m = metadataClient.metadata(stream).get(stream); - newReplicaPort.set(m.getReplicas().get(0).getPort()); - return true; - } catch (Exception e) { - return false; - } - }); - - Client newConsumer = - cf.get( - new Client.ClientParameters() - .port(newReplicaPort.get()) - .shutdownListener(shutdownListenerReference.get()) - .chunkListener(credit()) - .messageListener(messageListener)); - - newConsumer.subscribe( - (byte) 
1, - stream, - OffsetSpecification.offset(lastProcessedOffset.get() + 1), - 10); - - generation.incrementAndGet(); - reconnectionLatch.countDown(); + int attempts = 0; + while (attempts < 3) { + try { + resubscribe.run(); + break; + } catch (RuntimeException e) { + LOGGER.warn("Error while re-subscribing: {}", e.getMessage()); + try { + Thread.sleep(1000); + } catch (InterruptedException ex) { + Thread.currentThread().interrupt(); + break; + } + attempts++; + } + } }); } };