Skip to content

Commit d17819b

Browse files
committed
Retry to early add_table_copy
add_table copy/3 could abort with system_limit as reason if the "added" node was starting, i.e. alive but not merged schema yet. Now abort with node_not_running, which should restart the transaction automaticly after a short sleep. Also improve debug (verbose) printouts and saved coredump info.
1 parent 5e5963b commit d17819b

File tree

4 files changed

+32
-6
lines changed

4 files changed

+32
-6
lines changed

lib/mnesia/src/mnesia_lib.erl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1065,7 +1065,7 @@ save2(DbgInfo) ->
10651065
Key = {'$$$_report', current_pos},
10661066
P =
10671067
case ?ets_lookup_element(mnesia_gvar, Key, 2) of
1068-
30 -> -1;
1068+
100 -> -1;
10691069
I -> I
10701070
end,
10711071
set({'$$$_report', current_pos}, P+1),

lib/mnesia/src/mnesia_loader.erl

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,7 @@ do_get_network_copy(Tab, Reason, Ns, Storage, Cs) ->
226226
dbg_out("Table ~tp copied from ~p to ~p~n", [Tab, Node, node()]),
227227
{loaded, ok};
228228
Err = {error, _} when element(1, Reason) == dumper ->
229+
verbose("Copy failed: ~tp ~p~n", [Tab, Err]),
229230
{not_loaded,Err};
230231
restart ->
231232
try_net_load_table(Tab, Reason, Tail ++ [Node], Cs);
@@ -339,6 +340,7 @@ start_receiver(Tab,Storage,Cs,SenderPid,TabSize,DetsData,{dumper,{add_table_copy
339340
Init = table_init_fun(SenderPid, Storage),
340341
case do_init_table(Tab,Storage,Cs,SenderPid,TabSize,DetsData,self(), Init) of
341342
Err = {error, _} ->
343+
verbose("Init table failed: ~tp ~p~n", [Tab, Err]),
342344
SenderPid ! {copier_done, node()},
343345
Err;
344346
Else ->
@@ -363,6 +365,7 @@ wait_on_load_complete(Pid) ->
363365
{Pid, Res} ->
364366
Res;
365367
{'EXIT', Pid, Reason} ->
368+
verbose("Loader crashed : ~tp ~p~n", [Pid, Reason]),
366369
error(Reason);
367370
Else ->
368371
Pid ! Else,

lib/mnesia/src/mnesia_schema.erl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2466,13 +2466,14 @@ prepare_op(Tid, {op, add_table_copy, Storage, Node, TabDef}, _WaitFor) ->
24662466
_ ->
24672467
ok
24682468
end,
2469-
mnesia_lib:verbose("~w:~w Adding table~n",[?MODULE,?LINE]),
24702469

24712470
case mnesia_controller:get_network_copy(Tid, Tab, Cs) of
24722471
{loaded, ok} ->
24732472
%% Tables are created by mnesia_loader get_network code
24742473
insert_cstruct(Tid, Cs, true),
24752474
{true, optional};
2475+
{not_loaded, {not_active, schema, Node}} ->
2476+
mnesia:abort({node_not_running, Node});
24762477
{not_loaded, ErrReason} ->
24772478
Reason = {system_limit, Tab, {Node, ErrReason}},
24782479
mnesia:abort(Reason)

lib/mnesia/src/mnesia_tm.erl

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -892,27 +892,26 @@ restart(Mod, Tid, Ts, Fun, Args, Factor0, Retries0, Type, Why) ->
892892
return_abort(Fun, Args, Why),
893893
Factor = 1,
894894
SleepTime = mnesia_lib:random_time(Factor, Tid#tid.counter),
895-
dbg_out("Restarting transaction ~w: in ~wms ~w~n", [Tid, SleepTime, Why]),
895+
log_restart("Restarting transaction ~w: in ~wms ~w~n", [Tid, SleepTime, Why]),
896896
timer:sleep(SleepTime),
897897
execute_outer(Mod, Fun, Args, Factor, Retries, Type);
898898
{node_not_running, _N} -> %% Avoids hanging in receive_release_tid_ack
899899
return_abort(Fun, Args, Why),
900900
Factor = 1,
901901
SleepTime = mnesia_lib:random_time(Factor, Tid#tid.counter),
902-
dbg_out("Restarting transaction ~w: in ~wms ~w~n", [Tid, SleepTime, Why]),
902+
log_restart("Restarting transaction ~w: in ~wms ~w~n", [Tid, SleepTime, Why]),
903903
timer:sleep(SleepTime),
904904
execute_outer(Mod, Fun, Args, Factor, Retries, Type);
905905
_ ->
906906
SleepTime = mnesia_lib:random_time(Factor0, Tid#tid.counter),
907907
dbg_out("Restarting transaction ~w: in ~wms ~w~n", [Tid, SleepTime, Why]),
908-
908+
909909
if
910910
Factor0 /= 10 ->
911911
ignore;
912912
true ->
913913
%% Our serial may be much larger than other nodes ditto
914914
AllNodes = val({current, db_nodes}),
915-
verbose("Sync serial ~p~n", [Tid]),
916915
rpc:abcast(AllNodes, ?MODULE, {sync_trans_serial, Tid})
917916
end,
918917
intercept_friends(Tid, Ts),
@@ -931,6 +930,24 @@ restart(Mod, Tid, Ts, Fun, Args, Factor0, Retries0, Type, Why) ->
931930
end
932931
end.
933932

933+
log_restart(F,A) ->
934+
case get(transaction_client) of
935+
undefined ->
936+
dbg_out(F,A);
937+
_ ->
938+
case get(transaction_count) of
939+
undefined ->
940+
put(transaction_count, 1),
941+
verbose(F,A);
942+
N when (N rem 10) == 0 ->
943+
put(transaction_count, N+1),
944+
verbose(F,A);
945+
N ->
946+
put(transaction_count, N+1),
947+
dbg_out(F,A)
948+
end
949+
end.
950+
934951
get_restarted(Tid) ->
935952
case Res = rec() of
936953
{restarted, Tid} ->
@@ -2086,6 +2103,7 @@ new_cr_format(#commit{ext=Snmp}=Cr) ->
20862103
Cr#commit{ext=[{snmp,Snmp}]}.
20872104

20882105
rec_all([Node | Tail], Tid, Res, Pids) ->
2106+
put({?MODULE, ?FUNCTION_NAME}, {Node, Tail}),
20892107
receive
20902108
{?MODULE, Node, {vote_yes, Tid}} ->
20912109
rec_all(Tail, Tid, Res, Pids);
@@ -2104,8 +2122,12 @@ rec_all([Node | Tail], Tid, Res, Pids) ->
21042122
Abort = {do_abort, {bad_commit, Node}},
21052123
?SAFE({?MODULE, Node} ! {Tid, Abort}),
21062124
rec_all(Tail, Tid, Abort, Pids)
2125+
after 15000 ->
2126+
mnesia_lib:verbose("~p: trans ~p waiting ~p~n", [self(), Tid, Node]),
2127+
rec_all([Node | Tail], Tid, Res, Pids)
21072128
end;
21082129
rec_all([], _Tid, Res, Pids) ->
2130+
erase({?MODULE, ?FUNCTION_NAME}),
21092131
{Res, Pids}.
21102132

21112133
get_transactions() ->

0 commit comments

Comments
 (0)