Skip to content

Commit e017412

Browse files
author
Erlang/OTP
committed
Merge branch 'rickard/net-kernel-connect-fix/ERIERL-1241/OTP-19702' into maint-27
* rickard/net-kernel-connect-fix/ERIERL-1241/OTP-19702: [kernel] Prevent net_kernel from being blocked on connection setup
2 parents 586c35d + 838d263 commit e017412

File tree

4 files changed

+285
-48
lines changed

4 files changed

+285
-48
lines changed

lib/kernel/src/net_kernel.erl

Lines changed: 116 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ in the Erlang Reference Manual.
176176
-define(LISTEN_ID, #listen.listen).
177177
-define(ACCEPT_ID, #listen.accept).
178178

179-
-type connection_state() :: pending | up | up_pending.
179+
-type connection_state() :: check_pending | pending | up | up_pending.
180180
-type connection_type() :: normal | hidden.
181181

182182
-include("net_address.hrl").
@@ -1079,7 +1079,7 @@ do_auto_connect_2(Node, ConnId, From, State, ConnLookup) ->
10791079
ets:lookup(sys_dist, Node)}),
10801080
{reply, false, State};
10811081
_ ->
1082-
case setup(Node, ConnId, normal, From, State) of
1082+
case setup_check(Node, ConnId, normal, From, State) of
10831083
{ok, SetupPid} ->
10841084
Owners = State#state.conn_owners,
10851085
{noreply,State#state{conn_owners=Owners#{SetupPid => Node}}};
@@ -1094,7 +1094,8 @@ do_auto_connect_2(Node, ConnId, From, State, ConnLookup) ->
10941094
do_explicit_connect([#connection{conn_id = ConnId, state = up}], _, _, ConnId, _From, State) ->
10951095
{reply, true, State};
10961096
do_explicit_connect([#connection{conn_id = ConnId}=Conn], _, _, ConnId, From, State)
1097-
when Conn#connection.state =:= pending;
1097+
when Conn#connection.state =:= check_pending;
1098+
Conn#connection.state =:= pending;
10981099
Conn#connection.state =:= up_pending ->
10991100
Waiting = Conn#connection.waiting,
11001101
ets:insert(sys_dist, Conn#connection{waiting = [From|Waiting]}),
@@ -1103,7 +1104,7 @@ do_explicit_connect([#barred_connection{}], Type, Node, ConnId, From , State) ->
11031104
%% Barred connection only affects auto_connect, ignore it.
11041105
do_explicit_connect([], Type, Node, ConnId, From , State);
11051106
do_explicit_connect(_ConnLookup, Type, Node, ConnId, From , State) ->
1106-
case setup(Node,ConnId,Type,From,State) of
1107+
case setup_check(Node,ConnId,Type,From,State) of
11071108
{ok, SetupPid} ->
11081109
Owners = State#state.conn_owners,
11091110
{noreply,State#state{conn_owners=Owners#{SetupPid => Node}}};
@@ -1445,7 +1446,7 @@ handle_info({dist_ctrlr, Ctrlr, Node, SetupPid} = Msg,
14451446
%%
14461447
%% A node has successfully been connected.
14471448
%%
1448-
handle_info({SetupPid, {nodeup,Node,Address,Type,NamedMe} = Nodeup},
1449+
handle_info({SetupPid, {nodeup,Node,Address,Type,NamedMe} = Nodeup} = Msg,
14491450
#state{tick = Tick} = State) ->
14501451
case ets:lookup(sys_dist, Node) of
14511452
[Conn] when (Conn#connection.state =:= pending)
@@ -1469,7 +1470,8 @@ handle_info({SetupPid, {nodeup,Node,Address,Type,NamedMe} = Nodeup},
14691470
verbose(Nodeup, 1, State1),
14701471
verbose({nodeup,Node,SetupPid,Conn#connection.ctrlr}, 2, State1),
14711472
{noreply, State1};
1472-
_ ->
1473+
_Conn ->
1474+
verbose({bad_request, Msg, _Conn}, 2, State),
14731475
SetupPid ! {self(), bad_request},
14741476
{noreply, State}
14751477
end;
@@ -1481,9 +1483,14 @@ handle_info({AcceptPid, {accept_pending,MyNode,NodeOrHost,Type}}, State0) ->
14811483
{NameType, Node, Creation,
14821484
ConnLookup, State} = ensure_node_name(NodeOrHost, State0),
14831485
case ConnLookup of
1484-
[#connection{state=pending}=Conn] ->
1486+
[#connection{state=Pending}=Conn] when Pending == pending;
1487+
Pending == check_pending ->
14851488
if
1486-
MyNode > Node ->
1489+
%% If we are in check_pending, we always select the other node.
1490+
%% We currently have not started the handshake at all, and the
1491+
%% other node's connection attempt is ongoing, so we select its
1492+
%% connection.
1493+
MyNode > Node andalso Pending == pending ->
14871494
AcceptPid ! {self(),{accept_pending,nok_pending}},
14881495
verbose({accept_pending_nok, Node, AcceptPid}, 2, State),
14891496
{noreply,State};
@@ -1495,7 +1502,7 @@ handle_info({AcceptPid, {accept_pending,MyNode,NodeOrHost,Type}}, State0) ->
14951502
OldOwner = Conn#connection.owner,
14961503
case maps:is_key(OldOwner, State#state.conn_owners) of
14971504
true ->
1498-
verbose({remark,OldOwner,AcceptPid}, 2, State),
1505+
verbose({remark,Node,OldOwner,AcceptPid}, 2, State),
14991506
?debug({net_kernel, remark, old, OldOwner, new, AcceptPid}),
15001507
exit(OldOwner, remarked),
15011508
receive
@@ -1505,11 +1512,13 @@ handle_info({AcceptPid, {accept_pending,MyNode,NodeOrHost,Type}}, State0) ->
15051512
end;
15061513
false ->
15071514
verbose(
1508-
{accept_pending, OldOwner, inconsistency},
1515+
{accept_pending, Node, OldOwner, inconsistency},
15091516
2, State),
15101517
ok % Owner already exited
15111518
end,
1512-
ets:insert(sys_dist, Conn#connection{owner = AcceptPid}),
1519+
ets:insert(sys_dist, Conn#connection{owner = AcceptPid,
1520+
state = pending,
1521+
type = Type}),
15131522
AcceptPid ! {self(),{accept_pending,ok_pending}},
15141523
Owners = maps:remove(OldOwner, State#state.conn_owners),
15151524
{noreply, State#state{conn_owners=Owners#{AcceptPid => Node}}}
@@ -1642,6 +1651,14 @@ handle_info(transition_period_end,
16421651
time = T,
16431652
intensity = I}}};
16441653

1654+
handle_info({setup_check, Node, Pid, Timer, Res} = Msg, State) ->
1655+
verbose(Msg, 2, State),
1656+
{noreply, setup(Node, Pid, Timer, Res, State)};
1657+
1658+
handle_info({setup_check_timeout, Node, Pid} = Msg, State) ->
1659+
verbose(Msg, 2, State),
1660+
{noreply, setup_check_timeout(Node, Pid, State)};
1661+
16451662
handle_info(X, State) ->
16461663
error_msg("Net kernel got ~tw~n",[X]),
16471664
{noreply,State}.
@@ -1829,8 +1846,10 @@ delete_ctrlr(Ctrlr, #state{dist_ctrlrs = DCs} = State) ->
18291846
State#state{dist_ctrlrs = maps:remove(Ctrlr, DCs)}.
18301847

18311848
nodedown(Conn, Exited, Node, Reason, Type, State) ->
1849+
verbose({nodedown, Node, Conn#connection.state, Reason}, 2, State),
18321850
case Conn#connection.state of
1833-
pending ->
1851+
Pending when Pending == pending;
1852+
Pending == check_pending ->
18341853
pending_nodedown(Conn, Exited, Node, Type, State);
18351854
up ->
18361855
up_nodedown(Conn, Exited, Node, Reason, Type, State);
@@ -2226,52 +2245,102 @@ spawn_func(_,{From,Tag},M,F,A,Gleader) ->
22262245
%% Set up connection to a new node.
22272246
%% -----------------------------------------------------------
22282247

2229-
setup(Node, ConnId, Type, From, State) ->
2230-
case setup_check(Node, State) of
2231-
{ok, L} ->
2232-
Mod = L#listen.module,
2233-
LAddr = L#listen.address,
2234-
MyNode = State#state.node,
2235-
Pid = Mod:setup(Node,
2236-
Type,
2237-
MyNode,
2238-
State#state.type,
2239-
State#state.connecttime),
2248+
setup(Node, CheckPid, CheckTimer, CheckRes, State) ->
2249+
ok = erlang:cancel_timer(CheckTimer, [{async, true}, {info, false}]),
2250+
unlink(CheckPid),
2251+
exit(CheckPid, kill),
2252+
case ets:lookup(sys_dist, Node) of
2253+
[#connection{owner = CheckPid, state = check_pending} = Conn] ->
2254+
case CheckRes of
2255+
{ok, #listen{} = L} ->
2256+
Mod = L#listen.module,
2257+
LAddr = L#listen.address,
2258+
MyNode = State#state.node,
2259+
%% We intentionally allow for a full connecttime in
2260+
%% Mod:setup() since the check was successful. The
2261+
%% setup timer is reset in other places as well...
2262+
SetupPid = Mod:setup(Node,
2263+
Conn#connection.type,
2264+
MyNode,
2265+
State#state.type,
2266+
State#state.connecttime),
22402267
verbose(
2241-
{setup,Node,Type,MyNode,State#state.type,Pid},
2268+
{setup,Node,Conn#connection.type,
2269+
MyNode,State#state.type,SetupPid},
22422270
2, State),
2243-
Addr = LAddr#net_address {
2244-
address = undefined,
2245-
host = undefined },
2246-
Waiting = case From of
2247-
noreply -> [];
2248-
_ -> [From]
2249-
end,
2250-
ets:insert(sys_dist, #connection{node = Node,
2251-
conn_id = ConnId,
2252-
state = pending,
2253-
owner = Pid,
2254-
waiting = Waiting,
2255-
address = Addr,
2256-
type = normal,
2257-
remote_name_type = static}),
2258-
{ok, Pid};
2259-
Error ->
2260-
Error
2271+
Addr = LAddr#net_address{address = undefined,
2272+
host = undefined},
2273+
ets:insert(sys_dist, Conn#connection{state = pending,
2274+
owner = SetupPid,
2275+
address = Addr,
2276+
type = normal}),
2277+
State2 = delete_owner(CheckPid, State),
2278+
Owners = State#state.conn_owners,
2279+
State2#state{conn_owners = Owners#{SetupPid => Node}};
2280+
CheckError ->
2281+
Failure = {setup_check_failed, Node, CheckError},
2282+
verbose(Failure, 2, State),
2283+
?connect_failure(Node, Failure),
2284+
pending_nodedown(Conn, CheckPid, Node,
2285+
Conn#connection.type, State)
2286+
end;
2287+
_ ->
2288+
State
2289+
end.
2290+
2291+
setup_check_timeout(Node, CheckPid, State) ->
2292+
case ets:lookup(sys_dist, Node) of
2293+
[#connection{owner = CheckPid, state = check_pending} = Conn] ->
2294+
unlink(CheckPid),
2295+
exit(CheckPid, kill),
2296+
Failure = {setup_check_failed, Node, setup_check_timeout},
2297+
verbose(Failure, 2, State),
2298+
?connect_failure(Node, Failure),
2299+
pending_nodedown(Conn, CheckPid, Node,
2300+
Conn#connection.type, State);
2301+
_ ->
2302+
State
22612303
end.
22622304

2263-
setup_check(Node, State) ->
2305+
%% Shut up dialyzer warning about no return from SelMod fun. It
2306+
%% should be that way...
2307+
-dialyzer([{nowarn_function, setup_check/5}, no_return]).
2308+
2309+
setup_check(Node, ConnId, Type, From, State) ->
22642310
Allowed = State#state.allowed,
22652311
case lists:member(Node, Allowed) of
22662312
false when Allowed =/= [] ->
22672313
error_msg("** Connection attempt with "
22682314
"disallowed node ~w ** ~n", [Node]),
22692315
{error, bad_node};
22702316
_ ->
2271-
case select_mod(Node, State#state.listen) of
2272-
{ok, _L}=OK -> OK;
2273-
Error -> Error
2274-
end
2317+
NetKernel = self(),
2318+
TimeoutTime = State#state.connecttime,
2319+
Listen = State#state.listen,
2320+
SelMod = fun () ->
2321+
TimeoutMsg = {setup_check_timeout, Node, self()},
2322+
Tmr = erlang:send_after(TimeoutTime,
2323+
NetKernel,
2324+
TimeoutMsg),
2325+
Res = select_mod(Node, Listen),
2326+
NetKernel ! {setup_check, Node, self(), Tmr, Res},
2327+
%% Wait for net_kernel to kill us...
2328+
receive after infinity -> ok end
2329+
end,
2330+
Pid = spawn_link(SelMod),
2331+
verbose({init_setup_check, Node, Pid, TimeoutTime}, 2, State),
2332+
Waiting = case From of
2333+
noreply -> [];
2334+
_ -> [From]
2335+
end,
2336+
ets:insert(sys_dist, #connection{node = Node,
2337+
conn_id = ConnId,
2338+
state = check_pending,
2339+
owner = Pid,
2340+
waiting = Waiting,
2341+
type = Type,
2342+
remote_name_type = static}),
2343+
{ok, Pid}
22752344
end.
22762345

22772346
%%

lib/kernel/test/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ MODULES= \
7171
appinc2top \
7272
appinc2A \
7373
appinc2B \
74+
block_select_dist \
7475
code_SUITE \
7576
code_b_test \
7677
code_coverage_SUITE \
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
%%
2+
%% %CopyrightBegin%
3+
%%
4+
%% SPDX-License-Identifier: Apache-2.0
5+
%%
6+
%% Copyright Ericsson AB 2025. All Rights Reserved.
7+
%%
8+
%% Licensed under the Apache License, Version 2.0 (the "License");
9+
%% you may not use this file except in compliance with the License.
10+
%% You may obtain a copy of the License at
11+
%%
12+
%% http://www.apache.org/licenses/LICENSE-2.0
13+
%%
14+
%% Unless required by applicable law or agreed to in writing, software
15+
%% distributed under the License is distributed on an "AS IS" BASIS,
16+
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17+
%% See the License for the specific language governing permissions and
18+
%% limitations under the License.
19+
%%
20+
%% %CopyrightEnd%
21+
%%
22+
-module(block_select_dist).
23+
24+
%% A wrapper around gen_tcp_dist with the option to block the select()
25+
%% call for testing of net_kernel...
26+
27+
-export([block_select/0, unblock_select/0]).
28+
29+
-export([listen/1, accept/1, accept_connection/5,
30+
setup/5, close/1, select/1, is_node_name/1,
31+
address/0, setopts/2, getopts/2]).
32+
33+
block_select() ->
34+
persistent_term:put(?MODULE, true),
35+
ok.
36+
37+
unblock_select() ->
38+
persistent_term:put(?MODULE, false),
39+
ok.
40+
41+
select(A1) ->
42+
WaitOnBlock = fun WaitOnBlock () ->
43+
case persistent_term:get(?MODULE, false) of
44+
true ->
45+
receive after 1000 -> ok end,
46+
WaitOnBlock();
47+
_ ->
48+
ok
49+
end
50+
end,
51+
WaitOnBlock(),
52+
gen_tcp_dist:select(A1).
53+
54+
listen(A1) ->
55+
gen_tcp_dist:listen(A1).
56+
accept(A1) ->
57+
gen_tcp_dist:accept(A1).
58+
accept_connection(A1, A2, A3, A4, A5) ->
59+
gen_tcp_dist:accept_connection(A1, A2, A3, A4, A5).
60+
setup(A1, A2, A3, A4, A5) ->
61+
gen_tcp_dist:setup(A1, A2, A3, A4, A5).
62+
close(A1) ->
63+
gen_tcp_dist:close(A1).
64+
is_node_name(A1) ->
65+
gen_tcp_dist:is_node_name(A1).
66+
address() ->
67+
gen_tcp_dist:address().
68+
setopts(A1, A2) ->
69+
gen_tcp_dist:setopts(A1, A2).
70+
getopts(A1, A2) ->
71+
gen_tcp_dist:getopts(A1, A2).

0 commit comments

Comments
 (0)