Skip to content

Commit 5b96eed

Browse files
committed
connectd: fix accidental handling of old reconnections.
We had multiple reports of channels being unilaterally closed because it seemed like the peer was sending old revocation numbers.

Turns out, it was actually old reestablish messages! When we have a reconnection, we would put the new connection aside, and tell lightningd to close the current connection: when it did, we would restart processing of the initial reconnection.

However, we could end up with *multiple* "reconnecting" connections, while waiting for an existing connection to close. Though the connections were long gone, there could still be messages queued (particularly the channel_reestablish message, which comes early on).

Eventually, a normal reconnection would cause us to process one of these reconnecting connections, and channeld would see the (perhaps very old!) messages, and get confused.

(I have a test which triggers this, but it also hangs the connect command, due to other issues we will fix in the next release...)

Fixes: #5240
Signed-off-by: Rusty Russell <[email protected]>
1 parent 0ec947f commit 5b96eed

File tree

2 files changed

+58
-24
lines changed

2 files changed

+58
-24
lines changed

connectd/connectd.c

Lines changed: 24 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -211,36 +211,29 @@ static void peer_connected_in(struct daemon *daemon,
211211
tal_free(connect);
212212
}
213213

214-
/*~ This is an ad-hoc marshalling structure where we store arguments so we
215-
* can call peer_connected again. */
216-
struct peer_reconnected {
217-
struct daemon *daemon;
218-
struct node_id id;
219-
struct wireaddr_internal addr;
220-
const struct wireaddr *remote_addr;
221-
struct crypto_state cs;
222-
const u8 *their_features;
223-
bool incoming;
224-
};
225-
226214
/*~ For simplicity, lightningd only ever deals with a single connection per
227215
* peer. So if we already know about a peer, we tell lightning to disconnect
228216
* the old one and retry once it does. */
229217
static struct io_plan *retry_peer_connected(struct io_conn *conn,
230218
struct peer_reconnected *pr)
231219
{
232-
struct io_plan *plan;
233-
234220
/*~ As you can see, we've had issues with this code before :( */
235221
status_peer_debug(&pr->id, "processing now old peer gone");
236222

237-
/*~ Usually the pattern is to return this directly, but we have to free
238-
* our temporary structure. */
239-
plan = peer_connected(conn, pr->daemon, &pr->id, &pr->addr,
223+
/* If this fails (still waiting), pr will be freed, so reparent onto
224+
* tmpctx so it gets freed either way. */
225+
tal_steal(tmpctx, pr);
226+
227+
/*~ Usually the pattern is to return this directly. */
228+
return peer_connected(conn, pr->daemon, &pr->id, &pr->addr,
240229
pr->remote_addr,
241230
&pr->cs, take(pr->their_features), pr->incoming);
242-
tal_free(pr);
243-
return plan;
231+
}
232+
233+
/*~ A common use for destructors is to remove themselves from a data structure */
234+
static void destroy_peer_reconnected(struct peer_reconnected *pr)
235+
{
236+
peer_reconnected_htable_del(&pr->daemon->reconnected, pr);
244237
}
245238

246239
/*~ If we already know about this peer, we tell lightningd and it disconnects
@@ -259,6 +252,13 @@ static struct io_plan *peer_reconnected(struct io_conn *conn,
259252

260253
status_peer_debug(id, "reconnect");
261254

255+
/* If we have a previous reconnection, we replace it. */
256+
pr = peer_reconnected_htable_get(&daemon->reconnected, id);
257+
if (pr) {
258+
peer_reconnected_htable_del(&daemon->reconnected, pr);
259+
tal_free(pr);
260+
}
261+
262262
/* Tell master to kill it: will send peer_disconnect */
263263
msg = towire_connectd_reconnected(NULL, id);
264264
daemon_conn_send(daemon->master, take(msg));
@@ -271,6 +271,8 @@ static struct io_plan *peer_reconnected(struct io_conn *conn,
271271
pr->addr = *addr;
272272
pr->remote_addr = tal_dup_or_null(pr, struct wireaddr, remote_addr);
273273
pr->incoming = incoming;
274+
peer_reconnected_htable_add(&daemon->reconnected, pr);
275+
tal_add_destructor(pr, destroy_peer_reconnected);
274276

275277
/*~ Note that tal_dup_talarr() will do handle the take() of features
276278
* (turning it into a simply tal_steal() in those cases). */
@@ -280,11 +282,7 @@ static struct io_plan *peer_reconnected(struct io_conn *conn,
280282
* the peer set. When someone calls `io_wake()` on that address, it
281283
* will call retry_peer_connected above. */
282284
return io_wait(conn, peer_htable_get(&daemon->peers, id),
283-
/*~ The notleak() wrapper is a DEVELOPER-mode hack so
284-
* that our memory leak detection doesn't consider 'pr'
285-
* (which is not referenced from our code) to be a
286-
* memory leak. */
287-
retry_peer_connected, notleak(pr));
285+
retry_peer_connected, pr);
288286
}
289287

290288
/*~ When we free a peer, we remove it from the daemon's hashtable */
@@ -1981,6 +1979,7 @@ static void dev_connect_memleak(struct daemon *daemon, const u8 *msg)
19811979
/* Now delete daemon and those which it has pointers to. */
19821980
memleak_remove_region(memtable, daemon, sizeof(daemon));
19831981
memleak_remove_htable(memtable, &daemon->peers.raw);
1982+
memleak_remove_htable(memtable, &daemon->reconnected.raw);
19841983

19851984
found_leak = dump_memleak(memtable, memleak_status_broken);
19861985
daemon_conn_send(daemon->master,
@@ -2127,6 +2126,7 @@ int main(int argc, char *argv[])
21272126
/* Allocate and set up our simple top-level structure. */
21282127
daemon = tal(NULL, struct daemon);
21292128
peer_htable_init(&daemon->peers);
2129+
peer_reconnected_htable_init(&daemon->reconnected);
21302130
memleak_add_helper(daemon, memleak_daemon_cb);
21312131
list_head_init(&daemon->connecting);
21322132
timers_init(&daemon->timers, time_mono());

connectd/connectd.h

Lines changed: 34 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -126,6 +126,37 @@ HTABLE_DEFINE_TYPE(struct peer,
126126
peer_eq_node_id,
127127
peer_htable);
128128

129+
/*~ This is an ad-hoc marshalling structure where we store arguments so we
130+
* can call peer_connected again. */
131+
struct peer_reconnected {
132+
struct daemon *daemon;
133+
struct node_id id;
134+
struct wireaddr_internal addr;
135+
const struct wireaddr *remote_addr;
136+
struct crypto_state cs;
137+
const u8 *their_features;
138+
bool incoming;
139+
};
140+
141+
static const struct node_id *
142+
peer_reconnected_keyof(const struct peer_reconnected *pr)
143+
{
144+
return &pr->id;
145+
}
146+
147+
static bool peer_reconnected_eq_node_id(const struct peer_reconnected *pr,
148+
const struct node_id *id)
149+
{
150+
return node_id_eq(&pr->id, id);
151+
}
152+
153+
/*~ This defines 'struct peer_reconnected_htable'. */
154+
HTABLE_DEFINE_TYPE(struct peer_reconnected,
155+
peer_reconnected_keyof,
156+
node_id_hash,
157+
peer_reconnected_eq_node_id,
158+
peer_reconnected_htable);
159+
129160
/*~ This is the global state, like `struct lightningd *ld` in lightningd. */
130161
struct daemon {
131162
/* Who am I? */
@@ -142,6 +173,9 @@ struct daemon {
142173
* have disconnected. */
143174
struct peer_htable peers;
144175

176+
/* Peers which have reconnected, waiting for us to kill existing conns */
177+
struct peer_reconnected_htable reconnected;
178+
145179
/* Peers we are trying to reach */
146180
struct list_head connecting;
147181

0 commit comments

Comments
 (0)