From e5150e7b4a78b27560ba73be82a07421180e9306 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 25 Jan 2025 11:07:11 +1030 Subject: [PATCH 01/16] pytest: fix flake in test_no_reconnect_awating_unilateral ``` def test_no_reconnect_awating_unilateral(node_factory, bitcoind): l1, l2 = node_factory.line_graph(2, opts={'may_reconnect': True}) l2.stop() # Close immediately. l1.rpc.close(l2.info['id'], 1) wait_for(lambda: only_one(l1.rpc.listpeerchannels(l2.info['id'])['channels'])['state'] == 'AWAITING_UNILATERAL') # After switching to AWAITING_UNILATERAL it will *not* try to reconnect. l1.daemon.wait_for_log("State changed from CHANNELD_SHUTTING_DOWN to AWAITING_UNILATERAL") time.sleep(10) > assert not l1.daemon.is_in_log('Will try reconnect', start=l1.daemon.logsearch_start) E AssertionError: assert not 'lightningd-1 2025-01-20T05:46:04.548Z DEBUG 022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59-connectd: Will try reconnect in 2 seconds' E + where 'lightningd-1 2025-01-20T05:46:04.548Z DEBUG 022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59-connectd: Will try reconnect in 2 seconds' = >('Will try reconnect', start=453) E + where > = .is_in_log E + where = .daemon E + and 453 = .logsearch_start E + where = .daemon ``` In fact: ``` 2025-01-20T06:15:27.2854309Z lightningd-2 2025-01-20T05:46:03.527Z DEBUG lightningd: io_break: destroy_plugin 2025-01-20T06:15:27.2855089Z lightningd-2 2025-01-20T05:46:03.527Z DEBUG lightningd: Command returned result after jcon close 2025-01-20T06:15:27.2855805Z lightningd-2 2025-01-20T05:46:03.528Z DEBUG connectd: Shutting down 2025-01-20T06:15:27.2856576Z lightningd-2 2025-01-20T05:46:03.528Z DEBUG gossipd: Shutting down 2025-01-20T06:15:27.2857159Z lightningd-1 2025-01-20T05:46:03.529Z DEBUG connectd: drain_peer 2025-01-20T06:15:27.2857790Z lightningd-1 2025-01-20T05:46:03.529Z DEBUG connectd: drain_peer draining subd! 2025-01-20T06:15:27.2858825Z lightningd-1 2025-01-20T05:46:03.529Z DEBUG 022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59-lightningd: peer_disconnect_done 2025-01-20T06:15:27.2860481Z lightningd-1 2025-01-20T05:46:03.529Z DEBUG 022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59-connectd: Will try reconnect in 1 seconds 2025-01-20T06:15:27.2861626Z lightningd-1 2025-01-20T05:46:03.529Z DEBUG connectd: maybe_free_peer freeing peer! 
2025-01-20T06:15:27.2862723Z lightningd-1 2025-01-20T05:46:03.530Z INFO 022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59-channeld-chan#1: Peer connection lost 2025-01-20T06:15:27.2864510Z lightningd-1 2025-01-20T05:46:03.530Z INFO 022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59-chan#1: Peer transient failure in CHANNELD_NORMAL: channeld: Owning subdaemon channeld died (62208) 2025-01-20T06:15:27.2866301Z lightningd-1 2025-01-20T05:46:03.530Z DEBUG plugin-funder: Cleaning up inflights for peer id 022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59 2025-01-20T06:15:27.2867313Z lightningd-2 2025-01-20T05:46:03.530Z DEBUG hsmd: Shutting down 2025-01-20T06:15:27.2868029Z lightningd-1 2025-01-20T05:46:03.535Z TRACE lightningd: Calling rpc_command hook of plugin cln-xpay 2025-01-20T06:15:27.2868935Z lightningd-1 2025-01-20T05:46:03.535Z TRACE lightningd: Plugin cln-xpay returned from rpc_command hook call 2025-01-20T06:15:27.2869830Z lightningd-1 2025-01-20T05:46:03.536Z TRACE lightningd: Calling rpc_command hook of plugin cln-xpay 2025-01-20T06:15:27.2870740Z lightningd-1 2025-01-20T05:46:03.536Z TRACE lightningd: Plugin cln-xpay returned from rpc_command hook call 2025-01-20T06:15:27.2872276Z lightningd-1 2025-01-20T05:46:03.536Z INFO 022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59-chan#1: State changed from CHANNELD_NORMAL to CHANNELD_SHUTTING_DOWN 2025-01-20T06:15:27.2873873Z lightningd-1 2025-01-20T05:46:03.538Z DEBUG lightningd: NOTIFY "-c:close#30" info peer is offline, will negotiate once they reconnect (1 seconds before unilateral close). 2025-01-20T06:15:27.2874947Z lightningd-1 2025-01-20T05:46:03.538Z DEBUG lightningd: close_command: timeout = 1 2025-01-20T06:15:27.2878248Z lightningd-1 2025-01-20T05:46:03.541Z DEBUG plugin-cln-grpc: Received a message: CustomNotification(Object {\"jsonrpc\": String(\"2.0\"), \"method\": String(\"channel_state_changed\"), \"params\": Object {\"channel_state_changed\": Object {\"cause\": String(\"user\"), \"channel_id\": String(\"9aeb0e78b23e67f13670c602d5a40a7c8482a87220d93522cc0f7c6c9da9daab\"), \"message\": String(\"User or plugin invoked close command\"), \"new_state\": String(\"CHANNELD_SHUTTING_DOWN\"), \"old_state\": String(\"CHANNELD_NORMAL\"), \"peer_id\": String(\"022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59\"), \"short_channel_id\": String(\"103x1x0\"), \"timestamp\": String(\"2025-01-20T05:46:03.537Z\")}}}) 2025-01-20T06:15:27.2884122Z lightningd-1 2025-01-20T05:46:03.542Z DEBUG plugin-cln-grpc: Dispatching custom notification Object {\"jsonrpc\": String(\"2.0\"), \"method\": String(\"channel_state_changed\"), \"params\": Object {\"channel_state_changed\": Object {\"cause\": String(\"user\"), \"channel_id\": String(\"9aeb0e78b23e67f13670c602d5a40a7c8482a87220d93522cc0f7c6c9da9daab\"), \"message\": String(\"User or plugin invoked close command\"), \"new_state\": String(\"CHANNELD_SHUTTING_DOWN\"), \"old_state\": String(\"CHANNELD_NORMAL\"), \"peer_id\": String(\"022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59\"), \"short_channel_id\": String(\"103x1x0\"), \"timestamp\": String(\"2025-01-20T05:46:03.537Z\")}}} 2025-01-20T06:15:27.2888242Z lightningd-1 2025-01-20T05:46:03.542Z DEBUG plugin-cln-grpc: Failed to parse notification from lightningd Error(\"unknown variant `channel_state_changed`, expected one of `block_added`, `channel_open_failed`, `channel_opened`, `connect`, `custommsg`\", line: 0, column: 0) 
2025-01-20T06:15:27.2889970Z lightningd-1 2025-01-20T05:46:04.350Z DEBUG gossipd: seeker: no peers, waiting 2025-01-20T06:15:27.2890762Z lightningd-1 2025-01-20T05:46:04.538Z DEBUG lightningd: NOTIFY "-c:close#30" info Timed out, forcing close. 2025-01-20T06:15:27.2892345Z lightningd-1 2025-01-20T05:46:04.539Z UNUSUAL 022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59-chan#1: Peer permanent failure in CHANNELD_SHUTTING_DOWN: Forcibly closed by `close` command timeout (reason=user) 2025-01-20T06:15:27.2894333Z lightningd-1 2025-01-20T05:46:04.539Z INFO 022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59-chan#1: State changed from CHANNELD_SHUTTING_DOWN to AWAITING_UNILATERAL 2025-01-20T06:15:27.2895943Z lightningd-1 2025-01-20T05:46:04.543Z DEBUG 022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59-chan#1: We have 1 anchor points to use 2025-01-20T06:15:27.2897412Z lightningd-1 2025-01-20T05:46:04.544Z DEBUG lightningd: Broadcasting txid c96f6e4409140474fdd5d8b4862c3d74bd6fa24dadd178182e78fdc4cbf68149 for "-c:close#30" 2025-01-20T06:15:27.2906272Z lightningd-1 2025-01-20T05:46:04.544Z DEBUG lightningd: sendrawtransaction: 020000000001019aeb0e78b23e67f13670c602d5a40a7c8482a87220d93522cc0f7c6c9da9daab00000000009db0e280024a01000000000000220020be7935a77ca9ab70a4b8b1906825637767fed3c00824aa90c988983587d68488352f0f00000000002200209f4684ddb28acdc73959bc194d1a25df906f61ed030f52d163e6f1e247d32cbb0400473044022061656b7587a96723e3942f7445313d1c4315e8b2a17fade62d9a48d8a69e2806022024ac01825c79d3bcee6f5dc19586af6b3f8415156cbda934d0e2cbad9268a6a20147304402201fb54f5cb72da9dfd1399b8df6bf4f1950313b057fc0f5b30b2b09aec2783c9e022052dbd5e30bdc7015c0eb939ff3ad58b83c2d7369bb546571603ccfa62e65c4e00147522102324266de8403b3ab157a09f1f784d587af61831c998c151bcc21bb74c2b2314b2102e3bd38009866c9da8ec4aa99cc4ea9c6c0dd46df15c61ef0ce1f271291714e5752ae9a3ed620 2025-01-20T06:15:27.2914366Z lightningd-1 2025-01-20T05:46:04.548Z DEBUG hsmd: Client: Received message 5 from client 2025-01-20T06:15:27.2915958Z lightningd-1 2025-01-20T05:46:04.548Z DEBUG 022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59-connectd: Failed connected out: All addresses failed: 127.0.0.1:42595: Connection establishment: Connection refused. 
2025-01-20T06:15:27.2917891Z lightningd-1 2025-01-20T05:46:04.548Z DEBUG 022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59-connectd: Will try reconnect in 2 seconds 2025-01-20T06:15:27.2921924Z lightningd-1 2025-01-20T05:46:04.548Z DEBUG plugin-cln-grpc: Received a message: CustomNotification(Object {\"jsonrpc\": String(\"2.0\"), \"method\": String(\"channel_state_changed\"), \"params\": Object {\"channel_state_changed\": Object {\"cause\": String(\"user\"), \"channel_id\": String(\"9aeb0e78b23e67f13670c602d5a40a7c8482a87220d93522cc0f7c6c9da9daab\"), \"message\": String(\"Forcibly closed by `close` command timeout\"), \"new_state\": String(\"AWAITING_UNILATERAL\"), \"old_state\": String(\"CHANNELD_SHUTTING_DOWN\"), \"peer_id\": String(\"022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59\"), \"short_channel_id\": String(\"103x1x0\"), \"timestamp\": String(\"2025-01-20T05:46:04.540Z\")}}}) 2025-01-20T06:15:27.2928311Z lightningd-1 2025-01-20T05:46:04.548Z DEBUG plugin-cln-grpc: Dispatching custom notification Object {\"jsonrpc\": String(\"2.0\"), \"method\": String(\"channel_state_changed\"), \"params\": Object {\"channel_state_changed\": Object {\"cause\": String(\"user\"), \"channel_id\": String(\"9aeb0e78b23e67f13670c602d5a40a7c8482a87220d93522cc0f7c6c9da9daab\"), \"message\": String(\"Forcibly closed by `close` command timeout\"), \"new_state\": String(\"AWAITING_UNILATERAL\"), \"old_state\": String(\"CHANNELD_SHUTTING_DOWN\"), \"peer_id\": String(\"022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59\"), \"short_channel_id\": String(\"103x1x0\"), \"timestamp\": String(\"2025-01-20T05:46:04.540Z\")}}} 2025-01-20T06:15:27.2932905Z lightningd-1 2025-01-20T05:46:04.549Z DEBUG plugin-cln-grpc: Failed to parse notification from lightningd Error(\"unknown variant `channel_state_changed`, expected one of `block_added`, `channel_open_failed`, `channel_opened`, `connect`, `custommsg`\", line: 0, column: 0) 2025-01-20T06:15:27.2934604Z lightningd-1 2025-01-20T05:46:04.549Z TRACE lightningd: Calling rpc_command hook of plugin cln-xpay 2025-01-20T06:15:27.2935599Z lightningd-1 2025-01-20T05:46:04.551Z TRACE lightningd: Plugin cln-xpay returned from rpc_command hook call 2025-01-20T06:15:27.2936589Z lightningd-1 2025-01-20T05:46:04.553Z TRACE lightningd: Calling rpc_command hook of plugin cln-xpay 2025-01-20T06:15:27.2937546Z lightningd-1 2025-01-20T05:46:04.554Z TRACE lightningd: Plugin cln-xpay returned from rpc_command hook call 2025-01-20T06:15:27.2946767Z lightningd-1 2025-01-20T05:46:04.581Z DEBUG plugin-bcli: sendrawtx exit 0 (bitcoin-cli -regtest -datadir=/tmp/ltests-qdo6i210/test_no_reconnect_awating_unilateral_1/lightning-1/ -rpcclienttimeout=60 -rpcport=57415 -rpcuser=... 
-stdinrpcpass sendrawtransaction 020000000001019aeb0e78b23e67f13670c602d5a40a7c8482a87220d93522cc0f7c6c9da9daab00000000009db0e280024a01000000000000220020be7935a77ca9ab70a4b8b1906825637767fed3c00824aa90c988983587d68488352f0f00000000002200209f4684ddb28acdc73959bc194d1a25df906f61ed030f52d163e6f1e247d32cbb0400473044022061656b7587a96723e3942f7445313d1c4315e8b2a17fade62d9a48d8a69e2806022024ac01825c79d3bcee6f5dc19586af6b3f8415156cbda934d0e2cbad9268a6a20147304402201fb54f5cb72da9dfd1399b8df6bf4f1950313b057fc0f5b30b2b09aec2783c9e022052dbd5e30bdc7015c0eb939ff3ad58b83c2d7369bb546571603ccfa62e65c4e00147522102324266de8403b3ab157a09f1f784d587af61831c998c151bcc21bb74c2b2314b2102e3bd38009866c9da8ec4aa99cc4ea9c6c0dd46df15c61ef0ce1f271291714e5752ae9a3ed620) 2025-01-20T06:15:27.2956874Z lightningd-1 2025-01-20T05:46:04.581Z DEBUG 022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59-chan#1: Low-priority anchorspend aiming for block 2119 (feerate 253) 2025-01-20T06:15:27.2958216Z lightningd-1 2025-01-20T05:46:04.583Z DEBUG hsmd: Client: Received message 28 from client 2025-01-20T06:15:27.2958970Z lightningd-1 2025-01-20T05:46:09.354Z DEBUG gossipd: seeker: no peers, waiting 2025-01-20T06:15:27.2959764Z lightningd-1 2025-01-20T05:46:09.568Z DEBUG lightningd: channel_gossip: no longer in startup mode 2025-01-20T06:15:27.2960572Z lightningd-1 2025-01-20T05:46:14.358Z DEBUG gossipd: seeker: no peers, waiting 2025-01-20T06:15:27.2962896Z {'github_repository': 'ElementsProject/lightning', 'github_sha': '8c945c6075752c2488c211f912e9b97d1d019fc5', 'github_ref': 'refs/pull/7886/merge', 'github_ref_name': 'HEAD', 'github_run_id': 12859491513, 'github_head_ref': 'guilt/test-htable', 'github_run_number': 11999, 'github_base_ref': 'master', 'github_run_attempt': '3', 'testname': 'test_no_reconnect_awating_unilateral', 'start_time': 1737351957, 'end_time': 1737351974, 'outcome': 'fail'} 2025-01-20T06:15:27.2965150Z --------------------------- Captured stdout teardown --------------------------- ``` We need to check that connectd got the "peer_downgrade" message: before that it *might* try to connect. Signed-off-by: Rusty Russell --- connectd/connectd.c | 1 + tests/test_connection.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/connectd/connectd.c b/connectd/connectd.c index db06054a3cbc..881db38fee10 100644 --- a/connectd/connectd.c +++ b/connectd/connectd.c @@ -2017,6 +2017,7 @@ static void peer_downgrade(struct daemon *daemon, const u8 *msg) if (!fromwire_connectd_downgrade_peer(msg, &id)) master_badmsg(WIRE_CONNECTD_DOWNGRADE_PEER, msg); + status_peer_debug(&id, "peer_downgrade"); tal_free(important_id_htable_get(daemon->important_ids, &id)); } diff --git a/tests/test_connection.py b/tests/test_connection.py index 9f4d12c4476f..09385d9b18b6 100644 --- a/tests/test_connection.py +++ b/tests/test_connection.py @@ -4432,7 +4432,7 @@ def test_no_reconnect_awating_unilateral(node_factory, bitcoind): wait_for(lambda: only_one(l1.rpc.listpeerchannels(l2.info['id'])['channels'])['state'] == 'AWAITING_UNILATERAL') # After switching to AWAITING_UNILATERAL it will *not* try to reconnect. 
- l1.daemon.wait_for_log("State changed from CHANNELD_SHUTTING_DOWN to AWAITING_UNILATERAL") + l1.daemon.wait_for_log("{}-connectd: peer_downgrade".format(l2.info['id'])) time.sleep(10) assert not l1.daemon.is_in_log('Will try reconnect', start=l1.daemon.logsearch_start) From 67f5baa10757a208dfe26bdca3c5dbb657168c64 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 25 Jan 2025 11:08:11 +1030 Subject: [PATCH 02/16] dev_disconnect: rename to dev_disconnect_out, in preparation for incoming filters. Signed-off-by: Rusty Russell --- common/dev_disconnect.c | 8 ++++---- common/dev_disconnect.h | 16 ++++++++-------- connectd/multiplex.c | 14 +++++++------- connectd/peer_exchange_initmsg.c | 14 +++++++------- 4 files changed, 26 insertions(+), 26 deletions(-) diff --git a/common/dev_disconnect.c b/common/dev_disconnect.c index 289bd9fde565..76df764b9910 100644 --- a/common/dev_disconnect.c +++ b/common/dev_disconnect.c @@ -50,20 +50,20 @@ void dev_disconnect_init(int fd) dev_disconnect_fd = fd; } -enum dev_disconnect dev_disconnect(const struct node_id *id, int pkt_type) +enum dev_disconnect_out dev_disconnect_out(const struct node_id *id, int pkt_type) { if (dev_disconnect_fd == -1) - return DEV_DISCONNECT_NORMAL; + return DEV_DISCONNECT_OUT_NORMAL; if (!dev_disconnect_count) next_dev_disconnect(); if (!dev_disconnect_line[0] || !streq(peer_wire_name(pkt_type), dev_disconnect_line+1)) - return DEV_DISCONNECT_NORMAL; + return DEV_DISCONNECT_OUT_NORMAL; if (--dev_disconnect_count != 0) { - return DEV_DISCONNECT_NORMAL; + return DEV_DISCONNECT_OUT_NORMAL; } if (lseek(dev_disconnect_fd, dev_disconnect_len+1, SEEK_CUR) < 0) { diff --git a/common/dev_disconnect.h b/common/dev_disconnect.h index 1b28fd30d891..333ebf26fc43 100644 --- a/common/dev_disconnect.h +++ b/common/dev_disconnect.h @@ -5,23 +5,23 @@ struct node_id; -enum dev_disconnect { +enum dev_disconnect_out { /* Do nothing. */ - DEV_DISCONNECT_NORMAL = '=', + DEV_DISCONNECT_OUT_NORMAL = '=', /* Close connection before sending packet. */ - DEV_DISCONNECT_BEFORE = '-', + DEV_DISCONNECT_OUT_BEFORE = '-', /* Close connection after sending packet. */ - DEV_DISCONNECT_AFTER = '+', + DEV_DISCONNECT_OUT_AFTER = '+', /* Drop message (don't send to peer) */ - DEV_DISCONNECT_DROP = '$', + DEV_DISCONNECT_OUT_DROP = '$', /* Swallow all writes from now on, and do no more reads. */ - DEV_DISCONNECT_BLACKHOLE = '0', + DEV_DISCONNECT_OUT_BLACKHOLE = '0', /* Don't use connection after sending packet, but don't close. */ - DEV_DISCONNECT_DISABLE_AFTER = 'x', + DEV_DISCONNECT_OUT_DISABLE_AFTER = 'x', }; /* Force a close fd before or after a certain packet type */ -enum dev_disconnect dev_disconnect(const struct node_id *id, int pkt_type); +enum dev_disconnect_out dev_disconnect_out(const struct node_id *id, int pkt_type); /* Make next write on fd fail as if they'd disconnected. 
*/ void dev_sabotage_fd(int fd, bool close_fd); diff --git a/connectd/multiplex.c b/connectd/multiplex.c index 817994b670ae..e49bb768da01 100644 --- a/connectd/multiplex.c +++ b/connectd/multiplex.c @@ -415,25 +415,25 @@ static struct io_plan *encrypt_and_send(struct peer *peer, { int type = fromwire_peektype(msg); - switch (dev_disconnect(&peer->id, type)) { - case DEV_DISCONNECT_BEFORE: + switch (dev_disconnect_out(&peer->id, type)) { + case DEV_DISCONNECT_OUT_BEFORE: if (taken(msg)) tal_free(msg); return io_close(peer->to_peer); - case DEV_DISCONNECT_AFTER: + case DEV_DISCONNECT_OUT_AFTER: /* Disallow reads from now on */ peer->dev_read_enabled = false; /* Using io_close here can lose the data we're about to send! */ next = io_sock_shutdown_cb; break; - case DEV_DISCONNECT_BLACKHOLE: + case DEV_DISCONNECT_OUT_BLACKHOLE: /* Disable both reads and writes from now on */ peer->dev_read_enabled = false; peer->dev_writes_enabled = talz(peer, u32); break; - case DEV_DISCONNECT_NORMAL: + case DEV_DISCONNECT_OUT_NORMAL: break; - case DEV_DISCONNECT_DROP: + case DEV_DISCONNECT_OUT_DROP: /* Drop this message and continue */ if (taken(msg)) tal_free(msg); @@ -441,7 +441,7 @@ static struct io_plan *encrypt_and_send(struct peer *peer, io_wake(&peer->subds); return msg_queue_wait(peer->to_peer, peer->peer_outq, next, peer); - case DEV_DISCONNECT_DISABLE_AFTER: + case DEV_DISCONNECT_OUT_DISABLE_AFTER: peer->dev_read_enabled = false; peer->dev_writes_enabled = tal(peer, u32); *peer->dev_writes_enabled = 1; diff --git a/connectd/peer_exchange_initmsg.c b/connectd/peer_exchange_initmsg.c index 63b3ddb8da6f..a7df0d6eb466 100644 --- a/connectd/peer_exchange_initmsg.c +++ b/connectd/peer_exchange_initmsg.c @@ -280,21 +280,21 @@ struct io_plan *peer_exchange_initmsg(struct io_conn *conn, peer->msg = cryptomsg_encrypt_msg(peer, &peer->cs, take(peer->msg)); next = read_init; - switch (dev_disconnect(&peer->id, WIRE_INIT)) { - case DEV_DISCONNECT_BEFORE: + switch (dev_disconnect_out(&peer->id, WIRE_INIT)) { + case DEV_DISCONNECT_OUT_BEFORE: dev_sabotage_fd(io_conn_fd(conn), true); break; - case DEV_DISCONNECT_AFTER: + case DEV_DISCONNECT_OUT_AFTER: next = dev_peer_write_postclose; break; - case DEV_DISCONNECT_BLACKHOLE: + case DEV_DISCONNECT_OUT_BLACKHOLE: status_failed(STATUS_FAIL_INTERNAL_ERROR, "Blackhole not supported during handshake"); break; - case DEV_DISCONNECT_NORMAL: - case DEV_DISCONNECT_DROP: + case DEV_DISCONNECT_OUT_NORMAL: + case DEV_DISCONNECT_OUT_DROP: break; - case DEV_DISCONNECT_DISABLE_AFTER: + case DEV_DISCONNECT_OUT_DISABLE_AFTER: next = dev_peer_write_post_sabotage; break; } From dd8663e02b6c6bcdad6a113dcb652908b8c19d29 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 25 Jan 2025 11:09:11 +1030 Subject: [PATCH 03/16] dev_disconnect: add '<' to close after an incoming msg. I'd been resisting this, but some tests really do want "hang up after *receiving* this", and it's more reliable than getting the peer to "hang up after *sending* this", which doesn't always seem to work.
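As an illustration only (not part of this patch): a dev_disconnect line is a one-character code followed by a wire message name, so a test can hand the new '<' code to the usual `disconnect=` option of `node_factory.get_node()`, just like the existing '+' lines used elsewhere in this series. The test name and the choice of WIRE_FUNDING_SIGNED below are hypothetical.
```
# Hypothetical sketch: with '<', the node closes the connection right after
# it *receives* the named message, instead of relying on the peer to hang up
# after *sending* it.
def test_hangup_after_receiving(node_factory):
    # l1 drops the connection as soon as it receives WIRE_FUNDING_SIGNED.
    l1 = node_factory.get_node(disconnect=['<WIRE_FUNDING_SIGNED'],
                               may_reconnect=True)
    l2 = node_factory.get_node(may_reconnect=True)
    l1.rpc.connect(l2.info['id'], 'localhost', l2.port)
    # ...fund a channel here; l1 disconnects once funding_signed arrives.
```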
Signed-off-by: Rusty Russell --- common/dev_disconnect.c | 27 +++++++++++++++++++++++++++ common/dev_disconnect.h | 10 ++++++++++ 2 files changed, 37 insertions(+) diff --git a/common/dev_disconnect.c b/common/dev_disconnect.c index 76df764b9910..2feaf927862e 100644 --- a/common/dev_disconnect.c +++ b/common/dev_disconnect.c @@ -59,6 +59,7 @@ enum dev_disconnect_out dev_disconnect_out(const struct node_id *id, int pkt_typ next_dev_disconnect(); if (!dev_disconnect_line[0] + || dev_disconnect_line[0] == DEV_DISCONNECT_IN_AFTER_RECV || !streq(peer_wire_name(pkt_type), dev_disconnect_line+1)) return DEV_DISCONNECT_OUT_NORMAL; @@ -76,6 +77,32 @@ enum dev_disconnect_out dev_disconnect_out(const struct node_id *id, int pkt_typ return dev_disconnect_line[0]; } +enum dev_disconnect_in dev_disconnect_in(const struct node_id *id, int pkt_type) +{ + if (dev_disconnect_fd == -1) + return DEV_DISCONNECT_IN_NORMAL; + + if (!dev_disconnect_count) + next_dev_disconnect(); + + if (dev_disconnect_line[0] != DEV_DISCONNECT_IN_AFTER_RECV + || !streq(peer_wire_name(pkt_type), dev_disconnect_line+1)) + return DEV_DISCONNECT_IN_NORMAL; + + if (--dev_disconnect_count != 0) { + return DEV_DISCONNECT_IN_NORMAL; + } + + if (lseek(dev_disconnect_fd, dev_disconnect_len+1, SEEK_CUR) < 0) { + err(1, "lseek failure"); + } + + status_peer_debug(id, "dev_disconnect: %s (%s)", + dev_disconnect_line, + peer_wire_name(pkt_type)); + return dev_disconnect_line[0]; +} + void dev_sabotage_fd(int fd, bool close_fd) { int fds[2]; diff --git a/common/dev_disconnect.h b/common/dev_disconnect.h index 333ebf26fc43..0237cd25fc3e 100644 --- a/common/dev_disconnect.h +++ b/common/dev_disconnect.h @@ -23,6 +23,16 @@ enum dev_disconnect_out { /* Force a close fd before or after a certain packet type */ enum dev_disconnect_out dev_disconnect_out(const struct node_id *id, int pkt_type); +enum dev_disconnect_in { + /* Do nothing. */ + DEV_DISCONNECT_IN_NORMAL = '=', + /* Close connection after receiving packet. */ + DEV_DISCONNECT_IN_AFTER_RECV = '<', +}; + +/* Force a close fd after receiving a certain packet type */ +enum dev_disconnect_in dev_disconnect_in(const struct node_id *id, int pkt_type); + /* Make next write on fd fail as if they'd disconnected. */ void dev_sabotage_fd(int fd, bool close_fd); From eb8e97a69e5e225c2270aa36e8c06d776a2a54f8 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 25 Jan 2025 11:10:11 +1030 Subject: [PATCH 04/16] connectd: attach input filtering for incoming dev_disconnect. 
Signed-off-by: Rusty Russell --- connectd/multiplex.c | 28 ++++++++++++++++++++++------ connectd/peer_exchange_initmsg.c | 7 +++++++ 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/connectd/multiplex.c b/connectd/multiplex.c index e49bb768da01..227dbe6a9579 100644 --- a/connectd/multiplex.c +++ b/connectd/multiplex.c @@ -1136,6 +1136,13 @@ static struct subd *new_subd(struct peer *peer, return subd; } +static struct io_plan *close_peer_dev_disconnect(struct io_conn *peer_conn, + struct peer *peer) +{ + assert(peer->to_peer == peer_conn); + return io_close_cb(peer_conn, peer); +} + static struct io_plan *read_hdr_from_peer(struct io_conn *peer_conn, struct peer *peer); static struct io_plan *read_body_from_peer_done(struct io_conn *peer_conn, @@ -1145,7 +1152,8 @@ static struct io_plan *read_body_from_peer_done(struct io_conn *peer_conn, struct channel_id channel_id; struct subd *subd; enum peer_wire type; - + struct io_plan *(*next_read)(struct io_conn *peer_conn, + struct peer *peer) = read_hdr_from_peer; decrypted = cryptomsg_decrypt_body(tmpctx, &peer->cs, peer->peer_in); @@ -1162,16 +1170,24 @@ static struct io_plan *read_body_from_peer_done(struct io_conn *peer_conn, if (!peer->dev_read_enabled) return read_hdr_from_peer(peer_conn, peer); + switch (dev_disconnect_in(&peer->id, type)) { + case DEV_DISCONNECT_IN_NORMAL: + break; + case DEV_DISCONNECT_IN_AFTER_RECV: + next_read = close_peer_dev_disconnect; + break; + } + /* We got something! */ peer->last_recv_time = time_now(); /* Don't process packets while we're closing */ if (peer->draining) - return read_hdr_from_peer(peer_conn, peer); + return next_read(peer_conn, peer); /* If we swallow this, just try again. */ if (handle_message_locally(peer, decrypted)) - return read_hdr_from_peer(peer_conn, peer); + return next_read(peer_conn, peer); /* After this we should be able to match to subd by channel_id */ if (!extract_channel_id(decrypted, &channel_id)) { @@ -1186,7 +1202,7 @@ static struct io_plan *read_body_from_peer_done(struct io_conn *peer_conn, "Received %s: %s", peer_wire_name(type), desc); if (type == WIRE_WARNING) - return read_hdr_from_peer(peer_conn, peer); + return next_read(peer_conn, peer); return io_close(peer_conn); } @@ -1194,7 +1210,7 @@ static struct io_plan *read_body_from_peer_done(struct io_conn *peer_conn, send_warning(peer, "Unexpected message %s: %s", peer_wire_name(type), tal_hex(tmpctx, decrypted)); - return read_hdr_from_peer(peer_conn, peer); + return next_read(peer_conn, peer); } /* If we don't find a subdaemon for this, create a new one. 
*/ @@ -1237,7 +1253,7 @@ static struct io_plan *read_body_from_peer_done(struct io_conn *peer_conn, } /* Wait for them to wake us */ - return io_wait(peer_conn, &peer->peer_in, read_hdr_from_peer, peer); + return io_wait(peer_conn, &peer->peer_in, next_read, peer); } static struct io_plan *read_body_from_peer(struct io_conn *peer_conn, diff --git a/connectd/peer_exchange_initmsg.c b/connectd/peer_exchange_initmsg.c index a7df0d6eb466..b0a29549caba 100644 --- a/connectd/peer_exchange_initmsg.c +++ b/connectd/peer_exchange_initmsg.c @@ -61,6 +61,13 @@ static struct io_plan *peer_init_received(struct io_conn *conn, status_peer_io(LOG_IO_IN, &peer->id, msg); + switch (dev_disconnect_in(&peer->id, fromwire_peektype(msg))) { + case DEV_DISCONNECT_IN_NORMAL: + break; + case DEV_DISCONNECT_IN_AFTER_RECV: + return io_close(conn); + } + /* BOLT #1: * * A receiving node: From 6e1c82a923ebec4b7cdda39a53c1a475773cf28e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 25 Jan 2025 11:11:11 +1030 Subject: [PATCH 05/16] pytest: fix flake in tests/test_connection.py::test_reconnect_signed Sometimes, l1 wouldn't receive the msg before l2 hung up, causing chaos. What we *actually* want here is to make sure that l1 receives the msg before closing: ``` l1.rpc.connect(l2.info['id'], 'localhost', l2.port) > l1.rpc.fundchannel(l2.info['id'], CHANNEL_SIZE) tests/test_connection.py:667: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ contrib/pyln-client/pyln/client/lightning.py:767: in fundchannel return self.call("fundchannel", payload) contrib/pyln-testing/pyln/testing/utils.py:740: in call res = LightningRpc.call(self, method, payload, cmdprefix, filter) _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ self = method = 'fundchannel' payload = {'amount': 50000, 'announce': True, 'id': '022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59'} cmdprefix = None, filter = None def call(self, method, payload=None, cmdprefix=None, filter=None): """Generic call API: you can set cmdprefix here, or set self.cmdprefix before the call is made. """ self.logger.debug("Calling %s with payload %r", method, payload) if payload is None: payload = {} # Filter out arguments that are None if isinstance(payload, dict): payload = {k: v for k, v in payload.items() if v is not None} this_id = self.get_json_id(method, cmdprefix) self.next_id += 1 # FIXME: we open a new socket for every readobj call... sock = UnixSocket(self.socket_path) buf = b'' if self._notify is not None: # Opt into the notifications support self._writeobj(sock, { "jsonrpc": "2.0", "method": "notifications", "id": this_id + "+notify-enable", "params": { "enable": True }, }) # FIXME: Notification schema support? 
_, buf = self._readobj(sock, buf) request = { "jsonrpc": "2.0", "method": method, "params": payload, "id": this_id, } if filter is None: filter = self._filter if filter is not None: request["filter"] = filter self._writeobj(sock, request) while True: resp, buf = self._readobj(sock, buf) id = resp.get("id", None) meth = resp.get("method", None) if meth == 'message' and self._notify is not None: n = resp['params'] self._notify( message=n.get('message', None), progress=n.get('progress', None), request=request ) continue if meth is None or id is None: break self.logger.debug("Received response for %s call: %r", method, resp) if 'id' in resp and resp['id'] != this_id: raise ValueError("Malformed response, id is not {}: {}.".format(this_id, resp)) sock.close() if not isinstance(resp, dict): raise TypeError("Malformed response, response is not a dictionary %s." % resp) elif "error" in resp: > raise RpcError(method, payload, resp['error']) E pyln.client.lightning.RpcError: RPC call failed: method: fundchannel, payload: {'id': '022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59', 'amount': 50000, 'announce': True}, error: {'code': -1, 'message': 'Disconnected', 'data': {'id': '022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59', 'method': 'openchannel_update'}} ``` Signed-off-by: Rusty Russell --- tests/test_connection.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/test_connection.py b/tests/test_connection.py index 09385d9b18b6..727553e7a08e 100644 --- a/tests/test_connection.py +++ b/tests/test_connection.py @@ -653,13 +653,12 @@ def test_disconnect_half_signed_v2(node_factory): @pytest.mark.openchannel('v2') def test_reconnect_signed(node_factory): # This will fail *after* both sides consider channel opening. - disconnects = ['+WIRE_FUNDING_SIGNED'] + disconnects = [' Date: Sat, 25 Jan 2025 11:12:11 +1030 Subject: [PATCH 06/16] pytest: mark xpay real gossip test as slow ``` ____________________ ERROR at teardown of test_xpay_maxfee _____________________ ... # Format a nice list of everything that went wrong and raise an exception request.node.has_errors = True > raise ValueError(str(errors)) E ValueError: E Node errors: E - lightningd-1: Node exited with return code 1 E Global errors: ``` And: ``` @unittest.skipIf(TEST_NETWORK != 'regtest', 'too dusty on elements') def test_xpay_maxfee(node_factory, bitcoind, chainparams): """Test which shows that we don't excees maxfee""" outfile = tempfile.NamedTemporaryFile(prefix='gossip-store-') subprocess.check_output(['devtools/gossmap-compress', 'decompress', '--node-map=3301=022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59', 'tests/data/gossip-store-2024-09-22.compressed', outfile.name]).decode('utf-8').splitlines() AMOUNT = 100_000_000 # l2 will warn l1 about its invalid gossip: ignore. # We throttle l1's gossip to avoid massive log spam. > l1, l2 = node_factory.line_graph(2, # This is in sats, so 1000x amount we send. 
fundamount=AMOUNT, opts=[{'gossip_store_file': outfile.name, 'subdaemon': 'channeld:../tests/plugins/channeld_fakenet', 'allow_warning': True, 'dev-throttle-gossip': None}, {'allow_bad_gossip': True}]) tests/test_xpay.py:509: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ contrib/pyln-testing/pyln/testing/utils.py:1720: in line_graph nodes = self.get_nodes(num_nodes, opts=opts) contrib/pyln-testing/pyln/testing/utils.py:1602: in get_nodes return [j.result() for j in jobs] contrib/pyln-testing/pyln/testing/utils.py:1602: in return [j.result() for j in jobs] /opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/concurrent/futures/_base.py:458: in result return self.__get_result() /opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/concurrent/futures/_base.py:403: in __get_result raise self._exception /opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/concurrent/futures/thread.py:58: in run result = self.fn(*self.args, **self.kwargs) contrib/pyln-testing/pyln/testing/utils.py:1653: in get_node node.start(wait_for_bitcoind_sync) contrib/pyln-testing/pyln/testing/utils.py:1015: in start self.daemon.start(stderr_redir=stderr_redir) contrib/pyln-testing/pyln/testing/utils.py:671: in start self.wait_for_log("Server started with public key") contrib/pyln-testing/pyln/testing/utils.py:355: in wait_for_log return self.wait_for_logs([regex], timeout) _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ self = regexs = ['Server started with public key'], timeout = 180 def wait_for_logs(self, regexs, timeout=TIMEOUT): """Look for `regexs` in the logs. The logs contain tailed stdout of the process. We look for each regex in `regexs`, starting from `logsearch_start` which normally is the position of the last found entry of a previous wait-for logs call. The ordering inside `regexs` doesn't matter. We fail if the timeout is exceeded or if the underlying process exits before all the `regexs` were found. If timeout is None, no time-out is applied. """ logging.debug("Waiting for {} in the logs".format(regexs)) exs = [re.compile(r) for r in regexs] start_time = time.time() while True: if self.logsearch_start >= len(self.logs): if not self.logs_catchup(): time.sleep(0.25) if timeout is not None and time.time() > start_time + timeout: print("Time-out: can't find {} in logs".format(exs)) for r in exs: if self.is_in_log(r): print("({} was previously in logs!)".format(r)) > raise TimeoutError('Unable to find "{}" in logs.'.format(exs)) E TimeoutError: Unable to find "[re.compile('Server started with public key')]" in logs. ``` gossipd (and other plugins) simply take too long to digest the gossmap under valgrind. 
Signed-off-by: Rusty Russell --- tests/test_xpay.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_xpay.py b/tests/test_xpay.py index f50aedd8a898..ac8ab95847d8 100644 --- a/tests/test_xpay.py +++ b/tests/test_xpay.py @@ -494,6 +494,7 @@ def test_xpay_preapprove(node_factory): @unittest.skipIf(TEST_NETWORK != 'regtest', 'too dusty on elements') +@pytest.mark.slow_test def test_xpay_maxfee(node_factory, bitcoind, chainparams): """Test which shows that we don't excees maxfee""" outfile = tempfile.NamedTemporaryFile(prefix='gossip-store-') From f44a26bae69e7cf961038d076576f07188703c28 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 25 Jan 2025 11:13:11 +1030 Subject: [PATCH 07/16] pytest: fix test_splice_disconnect_commit This test seems confused: l2 won't be able to reconnect to l1 (l1 connected to l2 in the first place, it's the only one which can reconnect). Also, there's a commitment_signed early on when we have dual funding, so this doesn't actually test splicing disconnect in that case? The race seems to happen when l1 reconnectd, and l2 hasn't registered the disconnect yet. ``` ________________________ test_splice_disconnect_commit _________________________ [gw9] linux -- Python 3.10.16 /home/runner/.cache/pypoetry/virtualenvs/cln-meta-project-AqJ9wMix-py3.10/bin/python node_factory = bitcoind = executor = @pytest.mark.openchannel('v1') @pytest.mark.openchannel('v2') @unittest.skipIf(TEST_NETWORK != 'regtest', 'elementsd doesnt yet support PSBT features we need') def test_splice_disconnect_commit(node_factory, bitcoind, executor): l1 = node_factory.get_node(options={'experimental-splicing': None}, may_reconnect=True) l2 = node_factory.get_node(disconnect=['+WIRE_COMMITMENT_SIGNED'], options={'experimental-splicing': None, 'dev-no-reconnect': None}, may_reconnect=True) > l1.openchannel(l2, 1000000) tests/test_splicing_disconnect.py:77: ... elif "error" in resp: > raise RpcError(method, payload, resp['error']) E pyln.client.lightning.RpcError: RPC call failed: method: fundchannel, payload: {'id': '022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59', 'amount': 1000000, 'announce': True}, error: {'code': -1, 'message': 'Disconnected', 'data': {'id': '022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59', 'method': 'openchannel_update'}} ``` From the logs: ``` 2025-01-23T23:50:25.4098040Z lightningd-2 2025-01-23T23:47:12.443Z DEBUG 0266e4598d1d3c415f572a8488830b60f7e744ed9235eb0b1ba93283b315c03518-connectd: dev_disconnect: +WIRE_COMMITMENT_SIGNED (WIRE_COMMITMENT_SIGNED) ... 2025-01-23T23:50:25.4107026Z lightningd-2 2025-01-23T23:47:12.444Z DEBUG 0266e4598d1d3c415f572a8488830b60f7e744ed9235eb0b1ba93283b315c03518-dualopend-chan#1: peer_out WIRE_COMMITMENT_SIGNED 2025-01-23T23:50:25.4108070Z lightningd-2 2025-01-23T23:47:12.444Z INFO 0266e4598d1d3c415f572a8488830b60f7e744ed9235eb0b1ba93283b315c03518-dualopend-chan#1: Peer connection lost 2025-01-23T23:50:25.4109375Z lightningd-2 2025-01-23T23:47:12.445Z INFO 0266e4598d1d3c415f572a8488830b60f7e744ed9235eb0b1ba93283b315c03518-chan#1: Peer transient failure in DUALOPEND_OPEN_COMMIT_READY: dualopend: Owning subdaemon dualopend died (62208) ... 
2025-01-23T23:50:25.4111195Z lightningd-2 2025-01-23T23:47:12.445Z DEBUG plugin-cln-grpc: Received a message: CustomNotification(Object {\"jsonrpc\": String(\"2.0\"), \"method\": String(\"channel_open_failed\"), \"params\": Object {\"channel_open_failed\": Object {\"channel_id\": String(\"252d1b0a1e57895e84137f28cf19ab2c35847e284c112fefdecc7afeaa5c1de7\")}}}) 2025-01-23T23:50:25.4113039Z lightningd-2 2025-01-23T23:47:12.445Z DEBUG plugin-cln-grpc: Dispatching custom notification Object {\"jsonrpc\": String(\"2.0\"), \"method\": String(\"channel_open_failed\"), \"params\": Object {\"channel_open_failed\": Object {\"channel_id\": String(\"252d1b0a1e57895e84137f28cf19ab2c35847e284c112fefdecc7afeaa5c1de7\")}}} 2025-01-23T23:50:25.4114525Z lightningd-2 2025-01-23T23:47:12.446Z DEBUG plugin-funder: Cleaning up inflights for peer id 0266e4598d1d3c415f572a8488830b60f7e744ed9235eb0b1ba93283b315c03518 2025-01-23T23:50:25.4115550Z lightningd-2 2025-01-23T23:47:12.446Z DEBUG plugin-funder: Cleaning up inflight for channel_id 252d1b0a1e57895e84137f28cf19ab2c35847e284c112fefdecc7afeaa5c1de7 2025-01-23T23:50:25.4116406Z lightningd-2 2025-01-23T23:47:12.446Z TRACE lightningd: Calling rpc_command hook of plugin cln-xpay 2025-01-23T23:50:25.4117134Z lightningd-2 2025-01-23T23:47:12.447Z TRACE lightningd: Plugin cln-xpay returned from rpc_command hook call 2025-01-23T23:50:25.4117728Z lightningd-1 2025-01-23T23:47:12.448Z DEBUG connectd: drain_peer 2025-01-23T23:50:25.4118229Z lightningd-1 2025-01-23T23:47:12.448Z DEBUG connectd: drain_peer draining subd! 2025-01-23T23:50:25.4119066Z lightningd-1 2025-01-23T23:47:12.448Z DEBUG 022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59-lightningd: peer_disconnect_done 2025-01-23T23:50:25.4119792Z lightningd-1 2025-01-23T23:47:12.449Z DEBUG connectd: maybe_free_peer freeing peer! ... 
2025-01-23T23:50:25.4135647Z lightningd-2 2025-01-23T23:47:12.455Z DEBUG 0266e4598d1d3c415f572a8488830b60f7e744ed9235eb0b1ba93283b315c03518-connectd: Connect IN 2025-01-23T23:50:25.4136554Z lightningd-1 2025-01-23T23:47:12.455Z DEBUG 022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59-connectd: Connected out, starting crypto 2025-01-23T23:50:25.4137502Z lightningd-1 2025-01-23T23:47:12.456Z DEBUG 022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59-connectd: Connect OUT 2025-01-23T23:50:25.4138483Z lightningd-1 2025-01-23T23:47:12.456Z DEBUG 022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59-connectd: peer_out WIRE_INIT 2025-01-23T23:50:25.4139407Z lightningd-1 2025-01-23T23:47:12.456Z DEBUG 022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59-connectd: peer_in WIRE_INIT 2025-01-23T23:50:25.4140714Z lightningd-1 2025-01-23T23:47:12.456Z INFO 022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59-chan#1: Peer transient failure in DUALOPEND_OPEN_COMMIT_READY: Disconnected ``` Signed-off-by: Rusty Russell --- tests/test_splicing_disconnect.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/tests/test_splicing_disconnect.py b/tests/test_splicing_disconnect.py index e4553d425ffa..fd3c09fb35dd 100644 --- a/tests/test_splicing_disconnect.py +++ b/tests/test_splicing_disconnect.py @@ -70,8 +70,14 @@ def test_splice_disconnect_sig(node_factory, bitcoind): @pytest.mark.openchannel('v2') @unittest.skipIf(TEST_NETWORK != 'regtest', 'elementsd doesnt yet support PSBT features we need') def test_splice_disconnect_commit(node_factory, bitcoind, executor): - l1 = node_factory.get_node(options={'experimental-splicing': None}, may_reconnect=True) - l2 = node_factory.get_node(disconnect=['+WIRE_COMMITMENT_SIGNED'], + l1 = node_factory.get_node(options={'experimental-splicing': None, 'dev-no-reconnect': None}, + may_reconnect=True) + # Note: for dual-fund, there's a COMMITMENT_SIGNED for the initial tx, before splicing! + if EXPERIMENTAL_DUAL_FUND: + disconnects = ['+WIRE_COMMITMENT_SIGNED*2'] + else: + disconnects = ['+WIRE_COMMITMENT_SIGNED'] + l2 = node_factory.get_node(disconnect=disconnects, options={'experimental-splicing': None, 'dev-no-reconnect': None}, may_reconnect=True) l1.openchannel(l2, 1000000) @@ -91,15 +97,13 @@ def test_splice_disconnect_commit(node_factory, bitcoind, executor): l2.daemon.wait_for_log(r'dev_disconnect: \+WIRE_COMMITMENT_SIGNED') - print("Killing l2 without sending WIRE_COMMITMENT_SIGNED") - l2.daemon.kill() + l1.daemon.kill() - # Restart l1, without disconnect stuff. - del l2.daemon.opts['dev-no-reconnect'] - del l2.daemon.opts['dev-disconnect'] + # Restart l1, should reconnect + del l1.daemon.opts['dev-no-reconnect'] # Should reconnect, and reestablish the splice. - l2.start() + l1.start() # Splice should be abandoned via tx_abort From e62104f8512749abe8c47861bf53423922348f58 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 25 Jan 2025 11:14:11 +1030 Subject: [PATCH 08/16] pytest: fix flake in test_connect_ratelimit. 
Sometimes they connect too fast, so we don't get a chance to ratelimit all of them: ``` def test_connect_ratelimit(node_factory, bitcoind): """l1 has 5 peers, restarts, make sure we limit""" nodes = node_factory.get_nodes(6, opts=[{'dev-limit-connections-inflight': None, 'may_reconnect': True}] + [{'may_reconnect': True}] * 5) l1 = nodes[0] nodes = nodes[1:] addr = l1.rpc.newaddr()['bech32'] for n in nodes: bitcoind.rpc.sendtoaddress(addr, (FUNDAMOUNT + 1000000) / 10**8) bitcoind.generate_block(1, wait_for_mempool=len(nodes)) sync_blockheight(bitcoind, [l1]) for n in nodes: l1.rpc.connect(n.info['id'], 'localhost', n.port) l1.rpc.fundchannel(n.info['id'], FUNDAMOUNT) # Make sure all channels are established and announced. bitcoind.generate_block(6, wait_for_mempool=len(nodes)) wait_for(lambda: len(l1.rpc.listchannels()['channels']) == len(nodes) * 2) assert not l1.daemon.is_in_log('Unblocking for') l1.restart() # The first will be ok, but others should block and be unblocked. > l1.daemon.wait_for_logs((['Unblocking for '] + ['Too many connections, waiting']) * (len(nodes) - 1)) tests/test_connection.py:4721: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ self = regexs = ['Unblocking for ', 'Too many connections, waiting', 'Unblocking for ', 'Too many connections, waiting', 'Unblocking for ', 'Too many connections, waiting', ...] timeout = 180 def wait_for_logs(self, regexs, timeout=TIMEOUT): """Look for `regexs` in the logs. The logs contain tailed stdout of the process. We look for each regex in `regexs`, starting from `logsearch_start` which normally is the position of the last found entry of a previous wait-for logs call. The ordering inside `regexs` doesn't matter. We fail if the timeout is exceeded or if the underlying process exits before all the `regexs` were found. If timeout is None, no time-out is applied. """ logging.debug("Waiting for {} in the logs".format(regexs)) exs = [re.compile(r) for r in regexs] start_time = time.time() while True: if self.logsearch_start >= len(self.logs): if not self.logs_catchup(): time.sleep(0.25) if timeout is not None and time.time() > start_time + timeout: print("Time-out: can't find {} in logs".format(exs)) for r in exs: if self.is_in_log(r): print("({} was previously in logs!)".format(r)) > raise TimeoutError('Unable to find "{}" in logs.'.format(exs)) E TimeoutError: Unable to find "[re.compile('Unblocking for '), re.compile('Too many connections, waiting')]" in logs. ``` Signed-off-by: Rusty Russell --- tests/test_connection.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/test_connection.py b/tests/test_connection.py index 727553e7a08e..6b956413e27d 100644 --- a/tests/test_connection.py +++ b/tests/test_connection.py @@ -21,6 +21,7 @@ import time import unittest import websocket +import signal import ssl @@ -4715,13 +4716,21 @@ def test_connect_ratelimit(node_factory, bitcoind): assert not l1.daemon.is_in_log('Unblocking for') - l1.restart() + l1.stop() + # Suspend the others, to make sure they cannot respond too fast. + for n in nodes: + os.kill(n.daemon.proc.pid, signal.SIGSTOP) + l1.start() # The first will be ok, but others should block and be unblocked. 
l1.daemon.wait_for_logs((['Unblocking for '] + ['Too many connections, waiting']) * (len(nodes) - 1)) + # Resume them + for n in nodes: + os.kill(n.daemon.proc.pid, signal.SIGCONT) + # And now they're all connected wait_for(lambda: [p['connected'] for p in l1.rpc.listpeers()['peers']] == [True] * len(nodes)) From 62ca2b25071716c824b6f967bda4e270e7b6bb6e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 25 Jan 2025 11:15:11 +1030 Subject: [PATCH 09/16] pytest: fix race in test_reconnect_remote_sends_no_sigs We can be a bit early in our assertion: ``` @unittest.skipIf(os.getenv('TEST_DB_PROVIDER', 'sqlite3') != 'sqlite3', "sqlite3-specific DB manip") def test_reconnect_remote_sends_no_sigs(node_factory): """We re-announce, even when remote node doesn't send its announcement_signatures on reconnect. """ l1, l2 = node_factory.line_graph(2, wait_for_announce=True, opts={'may_reconnect': True, 'dev-no-reconnect': None}) # Wipe l2's gossip_store l2.stop() gs_path = os.path.join(l2.daemon.lightning_dir, TEST_NETWORK, 'gossip_store') os.unlink(gs_path) l2.start() # l2 will now uses (REMOTE's) announcement_signatures it has stored wait_for(lambda: l2.rpc.listchannels()['channels'] != []) # Remove remote signatures from l1 so it asks for them (and delete gossip store) l1.db_manip("UPDATE channels SET remote_ann_node_sig=NULL, remote_ann_bitcoin_sig=NULL") gs_path = os.path.join(l1.daemon.lightning_dir, TEST_NETWORK, 'gossip_store') os.unlink(gs_path) l1.restart() l1.connect(l2) l1needle = l1.daemon.logsearch_start l2needle = l2.daemon.logsearch_start # l1 asks once, l2 replies once. # Make sure we get all the msgs! time.sleep(5) l1.daemon.wait_for_log('peer_out WIRE_ANNOUNCEMENT_SIGNATURES') l2.daemon.wait_for_log('peer_out WIRE_ANNOUNCEMENT_SIGNATURES') l1msgs = [l.split()[4] for l in l1.daemon.logs[l1needle:] if 'WIRE_ANNOUNCEMENT_SIGNATURES' in l] > assert l1msgs == ['peer_out', 'peer_in'] E AssertionError: assert ['peer_out'] == ['peer_out', 'peer_in'] E Right contains one more item: 'peer_in' E Full diff: E - ['peer_out', 'peer_in'] E + ['peer_out'] ``` ``` lightningd-2 2025-01-24T05:53:22.862Z DEBUG 0266e4598d1d3c415f572a8488830b60f7e744ed9235eb0b1ba93283b315c03518-connectd: peer_out WIRE_ANNOUNCEMENT_SIGNATURES lightningd-1 2025-01-24T05:53:22.864Z DEBUG 022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59-channeld-chan#1: peer_in WIRE_ANNOUNCEMENT_SIGNATURES lightningd-1 2025-01-24T05:53:22.885Z DEBUG 022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59-chan#1: channel_gossip: received announcement sigs for 103x1x0 (we have 103x1x0) {'github_repository': 'ElementsProject/lightning', 'github_sha': 'e9d36f2b8ecd45882753cbe062c355e40bc7109c', 'github_ref': 'refs/pull/8027/merge', 'github_ref_name': 'HEAD', 'github_run_id': 12943530601, 'github_head_ref': ``` Signed-off-by: Rusty Russell --- tests/test_connection.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_connection.py b/tests/test_connection.py index 6b956413e27d..d5c85c2498cb 100644 --- a/tests/test_connection.py +++ b/tests/test_connection.py @@ -990,7 +990,8 @@ def test_reconnect_remote_sends_no_sigs(node_factory): # Make sure we get all the msgs! 
time.sleep(5) - l1.daemon.wait_for_log('peer_out WIRE_ANNOUNCEMENT_SIGNATURES') + l1.daemon.wait_for_logs(['peer_out WIRE_ANNOUNCEMENT_SIGNATURES', + 'peer_in WIRE_ANNOUNCEMENT_SIGNATURES']) l2.daemon.wait_for_log('peer_out WIRE_ANNOUNCEMENT_SIGNATURES') l1msgs = [l.split()[4] for l in l1.daemon.logs[l1needle:] if 'WIRE_ANNOUNCEMENT_SIGNATURES' in l] From 62ca9c9508884deb1bf9caa35fb6058e34c04a32 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 25 Jan 2025 11:16:11 +1030 Subject: [PATCH 10/16] gossipd: don't prune if we haven't seen one side's update at all. This caused a "flake" in testing, because it's wrong: ``` _____________________________ test_gossip_pruning ______________________________ [gw2] linux -- Python 3.10.16 /home/runner/.cache/pypoetry/virtualenvs/cln-meta-project-AqJ9wMix-py3.10/bin/python node_factory = bitcoind = def test_gossip_pruning(node_factory, bitcoind): """ Create channel and see it being updated in time before pruning """ l1, l2, l3 = node_factory.get_nodes(3, opts={'dev-fast-gossip-prune': None, 'allow_bad_gossip': True, 'autoconnect-seeker-peers': 0}) l1.rpc.connect(l2.info['id'], 'localhost', l2.port) l2.rpc.connect(l3.info['id'], 'localhost', l3.port) scid1, _ = l1.fundchannel(l2, 10**6) scid2, _ = l2.fundchannel(l3, 10**6) mine_funding_to_announce(bitcoind, [l1, l2, l3]) wait_for(lambda: l1.rpc.listchannels(source=l1.info['id'])['channels'] != []) l1_initial_cupdate_timestamp = only_one(l1.rpc.listchannels(source=l1.info['id'])['channels'])['last_update'] # Get timestamps of initial updates, so we can ensure they change. # Channels should be activated locally > wait_for(lambda: [c['active'] for c in l1.rpc.listchannels()['channels']] == [True] * 4) ``` Here you can see it has pruned: ``` lightningd-1 2025-01-24T07:39:40.873Z DEBUG gossipd: Pruning channel 105x1x0 from network view (ages 1737704380 and 0) ... lightningd-1 2025-01-24T07:39:50.941Z UNUSUAL lightningd: Bad gossip order: could not find channel 105x1x0 for peer's channel update ``` Changelog-Fixed: Protocol: we were overzealous in pruning channels if we hadn't seen one side's gossip update yet. Signed-off-by: Rusty Russell --- gossipd/gossmap_manage.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gossipd/gossmap_manage.c b/gossipd/gossmap_manage.c index 596d200df3db..cebcf6bcd3ba 100644 --- a/gossipd/gossmap_manage.c +++ b/gossipd/gossmap_manage.c @@ -320,15 +320,15 @@ static void remove_channel(struct gossmap_manage *gm, } } +/* If we don't know, we assume it's good */ static u32 get_timestamp(struct gossmap *gossmap, const struct gossmap_chan *chan, int dir) { u32 timestamp; - /* 0 is sufficient for our needs */ if (!gossmap_chan_set(chan, dir)) - return 0; + return UINT32_MAX; gossmap_chan_get_update_details(gossmap, chan, dir, &timestamp, From 906eb958897d959ecc123baf89f1bf0fdbdede9e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 25 Jan 2025 11:17:11 +1030 Subject: [PATCH 11/16] devtools/gossipwith: filter for max-messages. So not every message type counts: this is useful when we want to get a specific number of a specific type.
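For illustration (not part of the patch): with `--filter`, `--max-messages` only counts the listed message types, so e.g. the peer's gossip_timestamp_filter no longer eats into the limit. The sketch below mirrors the subprocess invocation the next patch adds to test_gossip_throttle, filtering to channel_announcement (256), node_announcement (257) and channel_update (258); `l1`, `TEST_NETWORK` and `TIMEOUT` are assumed to come from the pytest environment.
```
import subprocess

# 4 + 4 + 3 + 6 is the message count test_gossip_throttle expects once only
# gossip messages are counted (see the test change later in this series).
expected = 4 + 4 + 3 + 6
out = subprocess.run(['devtools/gossipwith',
                      '--all-gossip',
                      '--hex',
                      '--network={}'.format(TEST_NETWORK),
                      '--filter=256,257,258',
                      '--max-messages={}'.format(expected),
                      '{}@localhost:{}'.format(l1.info['id'], l1.port)],
                     check=True, timeout=TIMEOUT,
                     stdout=subprocess.PIPE).stdout.split()
```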
Signed-off-by: Rusty Russell --- devtools/gossipwith.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/devtools/gossipwith.c b/devtools/gossipwith.c index 41132cf995ca..9acb85fcad54 100644 --- a/devtools/gossipwith.c +++ b/devtools/gossipwith.c @@ -56,6 +56,7 @@ struct io_conn { static struct secret notsosecret; static bool no_gossip = false, all_gossip = false; static unsigned long max_messages = -1UL; +static u16 *accept_messages = NULL; /* Empty stubs to make us compile */ void status_peer_io(enum log_level iodir, @@ -101,6 +102,18 @@ bool is_unknown_msg_discardable(const u8 *cursor) return false; } +static bool accept_message(const u8 *msg) +{ + u16 type = fromwire_peektype(msg); + if (!accept_messages) + return true; + for (size_t i = 0; i < tal_count(accept_messages); i++) { + if (type == accept_messages[i]) + return true; + } + return false; +} + static struct io_plan *simple_write(struct io_conn *conn, const void *data, size_t len, struct io_plan *(*next)(struct io_conn *, void *), @@ -240,6 +253,10 @@ static struct io_plan *handshake_success(struct io_conn *conn, msg = sync_crypto_read(NULL, peer_fd, cs); if (!msg) err(1, "Reading msg"); + if (!accept_message(msg)) { + tal_free(msg); + continue; + } if (hex) { printf("%s\n", tal_hex(msg, msg)); } else { @@ -248,8 +265,8 @@ static struct io_plan *handshake_success(struct io_conn *conn, || !write_all(STDOUT_FILENO, msg, tal_bytelen(msg))) err(1, "Writing out msg"); } - tal_free(msg); --max_messages; + tal_free(msg); } } @@ -284,6 +301,15 @@ static char *opt_set_features(const char *arg, u8 **features) return NULL; } +static char *opt_set_filter(const char *arg, u16 **accept) +{ + char **elems = tal_strsplit(tmpctx, arg, ",", STR_EMPTY_OK); + *accept = tal_arr(NULL, u16, tal_count(elems)-1); + for (size_t i = 0; elems[i]; i++) + (*accept)[i] = atoi(elems[i]); + return NULL; +} + int main(int argc, char *argv[]) { struct io_conn *conn = tal(NULL, struct io_conn); @@ -306,6 +332,8 @@ int main(int argc, char *argv[]) "Stream complete gossip history at start"); opt_register_noarg("--no-gossip", opt_set_bool, &no_gossip, "Suppress all gossip at start"); + opt_register_arg("--filter", opt_set_filter, NULL, &accept_messages, + "Only process these message types"); opt_register_arg("--max-messages", opt_set_ulongval, opt_show_ulongval, &max_messages, "Terminate after reading this many messages"); From a364229e436f596b55e4a7e084dad7b564139141 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 25 Jan 2025 11:18:11 +1030 Subject: [PATCH 12/16] pytest: make test_gossip_throttle more reliable. By having gossipwith filter out messages we don't want, we can get the counts of expected messages correct, and not hit errors like this: ``` def test_gossip_throttle(node_factory, bitcoind, chainparams): """Make some gossip, test it gets throttled""" l1, l2, l3, l4 = node_factory.line_graph(4, wait_for_announce=True, opts=[{}, {}, {}, {'dev-throttle-gossip': None}]) # We expect: self-advertizement (3 messages for l1 and l4) plus # 4 node announcements, 3 channel announcements and 6 channel updates. # We also expect it to send a timestamp filter message. # (We won't take long enough to get a ping!) 
expected = 4 + 4 + 3 + 6 + 1 # l1 is unlimited start_fast = time.time() out1 = subprocess.run(['devtools/gossipwith', '--all-gossip', '--hex', '--network={}'.format(TEST_NETWORK), '--max-messages={}'.format(expected), '{}@localhost:{}'.format(l1.info['id'], l1.port)], check=True, timeout=TIMEOUT, stdout=subprocess.PIPE).stdout.split() time_fast = time.time() - start_fast assert time_fast < 2 # Remove timestamp filter, since timestamp will change! out1 = [m for m in out1 if not m.startswith(b'0109')] # l4 is throttled start_slow = time.time() out2 = subprocess.run(['devtools/gossipwith', '--all-gossip', '--hex', '--network={}'.format(TEST_NETWORK), '--max-messages={}'.format(expected), '{}@localhost:{}'.format(l4.info['id'], l4.port)], check=True, timeout=TIMEOUT, stdout=subprocess.PIPE).stdout.split() time_slow = time.time() - start_slow assert time_slow > 3 # Remove timestamp filter, since timestamp will change! out2 = [m for m in out2 if not m.startswith(b'0109')] # Contents should be identical (once uniquified, since each # doubles-up on its own gossip) > assert set(out1) == set(out2) E AssertionError: assert {b'010054b1907bdf639c9060e0fa4bca02419c46f75a99f0908b87a2e09711d5d031ba76b8fd07acc8be1b2fac9e31efb808e5d362c32ef4665... E Extra items in the left set: E b'01010ad5be8b9ba029245c2ae2d667af7ead7c0129c479c7fd7145a9b65931e90222082e6e4ab37ef60ebd10f1493d73e8bf7a40c4ae5f7d87cc...8488830b60f7e744ed9235eb0b1ba93283b315c035180266e44a554e494f524245414d2d333930353033622d6d6f64646564000000000000000000' E Extra items in the right set: E b'01079f87eb580b9e5f11dc211e9fb66abb3699999044f8fe146801162393364286c6000000010000006c010101' ``` Signed-off-by: Rusty Russell --- tests/test_gossip.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/tests/test_gossip.py b/tests/test_gossip.py index 700db4296608..6ba70e4f71f9 100644 --- a/tests/test_gossip.py +++ b/tests/test_gossip.py @@ -2058,9 +2058,7 @@ def test_gossip_throttle(node_factory, bitcoind, chainparams): # We expect: self-advertizement (3 messages for l1 and l4) plus # 4 node announcements, 3 channel announcements and 6 channel updates. - # We also expect it to send a timestamp filter message. - # (We won't take long enough to get a ping!) - expected = 4 + 4 + 3 + 6 + 1 + expected = 4 + 4 + 3 + 6 # l1 is unlimited start_fast = time.time() @@ -2068,14 +2066,13 @@ def test_gossip_throttle(node_factory, bitcoind, chainparams): '--all-gossip', '--hex', '--network={}'.format(TEST_NETWORK), + '--filter=256,257,258', '--max-messages={}'.format(expected), '{}@localhost:{}'.format(l1.info['id'], l1.port)], check=True, timeout=TIMEOUT, stdout=subprocess.PIPE).stdout.split() time_fast = time.time() - start_fast assert time_fast < 2 - # Remove timestamp filter, since timestamp will change! - out1 = [m for m in out1 if not m.startswith(b'0109')] # l4 is throttled start_slow = time.time() @@ -2083,6 +2080,7 @@ def test_gossip_throttle(node_factory, bitcoind, chainparams): '--all-gossip', '--hex', '--network={}'.format(TEST_NETWORK), + '--filter=256,257,258', '--max-messages={}'.format(expected), '{}@localhost:{}'.format(l4.info['id'], l4.port)], check=True, @@ -2090,9 +2088,6 @@ def test_gossip_throttle(node_factory, bitcoind, chainparams): time_slow = time.time() - start_slow assert time_slow > 3 - # Remove timestamp filter, since timestamp will change! 
- out2 = [m for m in out2 if not m.startswith(b'0109')] - # Contents should be identical (once uniquified, since each # doubles-up on its own gossip) assert set(out1) == set(out2) @@ -2121,15 +2116,14 @@ def test_gossip_throttle(node_factory, bitcoind, chainparams): '--no-gossip', '--hex', '--network={}'.format(TEST_NETWORK), - '--max-messages={}'.format(expected + 1), + '--filter=256,257,258', + '--max-messages={}'.format(expected), '{}@localhost:{}'.format(l1.info['id'], l1.port), query], check=True, timeout=TIMEOUT, stdout=subprocess.PIPE).stdout.split() time_fast = time.time() - start_fast assert time_fast < 2 - # Ignore gossip_timestamp_filter and reply_short_channel_ids_end - out3 = [m for m in out3 if not m.startswith(b'0109') and not m.startswith(b'0106')] assert set(out1) == set(out3) start_slow = time.time() @@ -2137,6 +2131,7 @@ def test_gossip_throttle(node_factory, bitcoind, chainparams): '--no-gossip', '--hex', '--network={}'.format(TEST_NETWORK), + '--filter=256,257,258', '--max-messages={}'.format(expected), '{}@localhost:{}'.format(l4.info['id'], l4.port), query], @@ -2144,7 +2139,6 @@ def test_gossip_throttle(node_factory, bitcoind, chainparams): timeout=TIMEOUT, stdout=subprocess.PIPE).stdout.split() time_slow = time.time() - start_slow assert time_slow > 3 - out4 = [m for m in out4 if not m.startswith(b'0109')] assert set(out2) == set(out4) From b13397d104008bceded1b59396a31a7a8beb7d70 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 25 Jan 2025 11:19:11 +1030 Subject: [PATCH 13/16] pytest: fix flake in test_onchain_reestablish_reply We can ask for the state too fast, before WIRE_ERROR is processed: ``` 2025-01-24T14:17:57.4799255Z ________________________ test_onchain_reestablish_reply ________________________ 2025-01-24T14:17:57.4800383Z [gw2] linux -- Python 3.10.16 /home/runner/.cache/pypoetry/virtualenvs/cln-meta-project-AqJ9wMix-py3.10/bin/python 2025-01-24T14:17:57.4801246Z 2025-01-24T14:17:57.4802068Z node_factory = 2025-01-24T14:17:57.4802645Z bitcoind = 2025-01-24T14:17:57.4803483Z executor = 2025-01-24T14:17:57.4804029Z 2025-01-24T14:17:57.4804340Z def test_onchain_reestablish_reply(node_factory, bitcoind, executor): 2025-01-24T14:17:57.4805013Z l1, l2, l3 = node_factory.line_graph(3, opts={'may_reconnect': True, 2025-01-24T14:17:57.4805572Z 'dev-no-reconnect': None}) 2025-01-24T14:17:57.4805990Z 2025-01-24T14:17:57.4806262Z # Make l1 close unilaterally. 2025-01-24T14:17:57.4806697Z l1.rpc.disconnect(l2.info['id'], force=True) 2025-01-24T14:17:57.4807200Z l1.rpc.close(l2.info['id'], unilateraltimeout=1) 2025-01-24T14:17:57.4807622Z 2025-01-24T14:17:57.4808025Z # l2 doesn't know, reconnection should get REESTABLISH *then* error. 2025-01-24T14:17:57.4808637Z l2.rpc.connect(l1.info['id'], 'localhost', l1.port) 2025-01-24T14:17:57.4809066Z 2025-01-24T14:17:57.4809334Z # We should exchange messages 2025-01-24T14:17:57.4809835Z l2.daemon.wait_for_logs(["peer_out WIRE_CHANNEL_REESTABLISH", 2025-01-24T14:17:57.4810399Z "peer_in WIRE_CHANNEL_REESTABLISH"]) 2025-01-24T14:17:57.4811197Z # It should be OK 2025-01-24T14:17:57.4811642Z l2.daemon.wait_for_log("Reconnected, and reestablished.") 2025-01-24T14:17:57.4812116Z 2025-01-24T14:17:57.4812378Z # Then we get the error, close. 
2025-01-24T14:17:57.4812824Z l2.daemon.wait_for_log("peer_in WIRE_ERROR") 2025-01-24T14:17:57.4813576Z > assert only_one(l2.rpc.listpeerchannels(l1.info['id'])['channels'])['state'] == 'AWAITING_UNILATERAL' 2025-01-24T14:17:57.4814446Z E AssertionError: assert 'CHANNELD_NORMAL' == 'AWAITING_UNILATERAL' 2025-01-24T14:17:57.4815014Z E - AWAITING_UNILATERAL 2025-01-24T14:17:57.4815362Z E + CHANNELD_NORMAL ``` ``` 2025-01-24T14:17:57.5760435Z lightningd-2 2025-01-24T14:04:28.398Z DEBUG 0266e4598d1d3c415f572a8488830b60f7e744ed9235eb0b1ba93283b315c03518-channeld-chan#1: peer_in WIRE_ERROR 2025-01-24T14:17:57.5760674Z lightningd-2 2025-01-24T14:04:28.401Z TRACE lightningd: Calling rpc_command hook of plugin cln-xpay 2025-01-24T14:17:57.5760935Z lightningd-2 2025-01-24T14:04:28.403Z TRACE lightningd: Plugin cln-xpay returned from rpc_command hook call 2025-01-24T14:17:57.5761176Z lightningd-2 2025-01-24T14:04:28.408Z TRACE lightningd: Calling rpc_command hook of plugin cln-xpay 2025-01-24T14:17:57.5761430Z lightningd-2 2025-01-24T14:04:28.414Z TRACE lightningd: Plugin cln-xpay returned from rpc_command hook call 2025-01-24T14:17:57.5762598Z {'github_repository': 'ElementsProject/lightning', 'github_sha': '390503b31e1949cb84e5a5257bf29a1b80d5cd20', 'github_ref': 'refs/pull/8027/merge', 'github_ref_name': 'HEAD', 'github_run_id': 12950414032, 'github_head_ref': 'guilt/fix-flakes12', 'github_run_number': 12051, 'github_base_ref': 'master', 'github_run_attempt': '1', 'testname': 'test_onchain_reestablish_reply', 'start_time': 1737727408, 'end_time': 1737727468, 'outcome': 'fail'} 2025-01-24T14:17:57.5763430Z lightningd-2 2025-01-24T14:04:28.426Z DEBUG 0266e4598d1d3c415f572a8488830b60f7e744ed9235eb0b1ba93283b315c03518-channeld-chan#1: billboard perm: Received ERROR channel 69d744488de7ec7f4c018da98a8a9d726ee3b47a97809608e4ce6f17747b5f09: Forcibly closed by `close` command timeout 2025-01-24T14:17:57.5764512Z lightningd-2 2025-01-24T14:04:28.433Z UNUSUAL 0266e4598d1d3c415f572a8488830b60f7e744ed9235eb0b1ba93283b315c03518-chan#1: Peer permanent failure in CHANNELD_NORMAL: channeld: received ERROR channel 69d744488de7ec7f4c018da98a8a9d726ee3b47a97809608e4ce6f17747b5f09: Forcibly closed by `close` command timeout (reason=protocol) 2025-01-24T14:17:57.5764997Z lightningd-2 2025-01-24T14:04:28.439Z INFO 0266e4598d1d3c415f572a8488830b60f7e744ed9235eb0b1ba93283b315c03518-chan#1: State changed from CHANNELD_NORMAL to AWAITING_UNILATERAL ``` Signed-off-by: Rusty Russell --- tests/test_closing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_closing.py b/tests/test_closing.py index 26f1148e38a0..c24b4cae8c71 100644 --- a/tests/test_closing.py +++ b/tests/test_closing.py @@ -4168,7 +4168,7 @@ def test_onchain_reestablish_reply(node_factory, bitcoind, executor): # Then we get the error, close. l2.daemon.wait_for_log("peer_in WIRE_ERROR") - assert only_one(l2.rpc.listpeerchannels(l1.info['id'])['channels'])['state'] == 'AWAITING_UNILATERAL' + wait_for(lambda: only_one(l2.rpc.listpeerchannels(l1.info['id'])['channels'])['state'] == 'AWAITING_UNILATERAL') # Mine it now so we don't confuse the code below. bitcoind.generate_block(1, wait_for_mempool=1) From a42547d5843d48749cd9a813583a7b02ed2f1126 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 25 Jan 2025 11:20:11 +1030 Subject: [PATCH 14/16] pytest: mark test_rbf_reconnect_tx_construct flaky: Doesn't always die messily, it seems? 
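The quoted failure below is the log wait timing out because dualopend's death can be reported in two different ways; the one-line fix in the diff at the end of this patch simply widens the expected pattern to accept either message. A minimal sketch of that alternation, assuming Python's `re` module (the helper name is illustrative, not part of pyln-testing):

```python
import re

# Either message means dualopend died; the test should accept both forms.
DUALOPEND_DIED = re.compile(
    r"dualopend daemon died before signed PSBT returned"
    r"|dualopend: Owning subdaemon dualopend died"
)


def saw_dualopend_death(log_lines):
    """Return True once any captured log line matches either form."""
    return any(DUALOPEND_DIED.search(line) for line in log_lines)
```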
``` @unittest.skipIf(TEST_NETWORK != 'regtest', 'elementsd doesnt yet support PSBT features we need') @pytest.mark.openchannel('v2') def test_rbf_reconnect_tx_construct(node_factory, bitcoind, chainparams): disconnects = ['=WIRE_TX_ADD_INPUT', # Initial funding succeeds '-WIRE_TX_ADD_INPUT', '+WIRE_TX_ADD_INPUT', '-WIRE_TX_ADD_OUTPUT', '+WIRE_TX_ADD_OUTPUT', '-WIRE_TX_COMPLETE', '+WIRE_TX_COMPLETE', '-WIRE_COMMITMENT_SIGNED', '+WIRE_COMMITMENT_SIGNED'] l1, l2 = node_factory.get_nodes(2, opts=[{'disconnect': disconnects, 'may_reconnect': True, 'dev-no-reconnect': None}, {'may_reconnect': True, 'dev-no-reconnect': None, 'broken_log': 'dualopend daemon died before signed PSBT returned'}]) l1.rpc.connect(l2.info['id'], 'localhost', l2.port) amount = 2**24 chan_amount = 100000 bitcoind.rpc.sendtoaddress(l1.rpc.newaddr()['bech32'], amount / 10**8 + 0.01) bitcoind.generate_block(1) # Wait for it to arrive. wait_for(lambda: len(l1.rpc.listfunds()['outputs']) > 0) res = l1.rpc.fundchannel(l2.info['id'], chan_amount) chan_id = res['channel_id'] vins = bitcoind.rpc.decoderawtransaction(res['tx'])['vin'] assert(only_one(vins)) prev_utxos = ["{}:{}".format(vins[0]['txid'], vins[0]['vout'])] # Check that we're waiting for lockin l1.daemon.wait_for_log(' to DUALOPEND_AWAITING_LOCKIN') # rbf the lease with a higher amount rate = int(find_next_feerate(l1, l2)[:-5]) # We 4x the feerate to beat the min-relay fee next_feerate = '{}perkw'.format(rate * 4) # Initiate an RBF startweight = 42 + 172 # base weight, funding output initpsbt = l1.rpc.utxopsbt(chan_amount, next_feerate, startweight, prev_utxos, reservedok=True, excess_as_change=True) # Run through TX_ADD wires for d in disconnects[1:-4]: l1.rpc.connect(l2.info['id'], 'localhost', l2.port) with pytest.raises(RpcError): l1.rpc.openchannel_bump(chan_id, chan_amount, initpsbt['psbt']) wait_for(lambda: l1.rpc.getpeer(l2.info['id'])['connected'] is False) # The first TX_COMPLETE breaks l1.rpc.connect(l2.info['id'], 'localhost', l2.port) bump = l1.rpc.openchannel_bump(chan_id, chan_amount, initpsbt['psbt']) with pytest.raises(RpcError): update = l1.rpc.openchannel_update(chan_id, bump['psbt']) wait_for(lambda: l1.rpc.getpeer(l2.info['id'])['connected'] is False) l1.rpc.connect(l2.info['id'], 'localhost', l2.port) # l1 should remember, l2 has forgotten # l2 should send tx-abort, to reset l2.daemon.wait_for_log(r'tx-abort: Sent next_funding_txid .* doesn\'t match ours .*') l1.daemon.wait_for_log(r'Cleaned up incomplete inflight') # abort doesn't cause a disconnect assert l1.rpc.getpeer(l2.info['id'])['connected'] # The next TX_COMPLETE break (both remember) + they break on the # COMMITMENT_SIGNED during the reconnect bump = l1.rpc.openchannel_bump(chan_id, chan_amount, initpsbt['psbt']) with pytest.raises(RpcError): update = l1.rpc.openchannel_update(chan_id, bump['psbt']) wait_for(lambda: l1.rpc.getpeer(l2.info['id'])['connected'] is False) l1.rpc.connect(l2.info['id'], 'localhost', l2.port) l2.daemon.wait_for_logs([r'Got dualopend reestablish', r'No commitment, not sending our sigs']) l1.daemon.wait_for_logs([r'Got dualopend reestablish', r'No commitment, not sending our sigs', r'dev_disconnect: -WIRE_COMMITMENT_SIGNED', 'peer_disconnect_done']) assert not l1.rpc.getpeer(l2.info['id'])['connected'] l1.rpc.connect(l2.info['id'], 'localhost', l2.port) # COMMITMENT_SIGNED disconnects *during* the reconnect # We can't bump because the last negotiation is in the wrong state with pytest.raises(RpcError, match=r'Funding sigs for this channel not secured'): 
l1.rpc.openchannel_bump(chan_id, chan_amount, initpsbt['psbt']) # l2 reconnects, but doesn't have l1's commitment > l2.daemon.wait_for_logs([r'Got dualopend reestablish', r'No commitment, not sending our sigs', # This is a BROKEN log, it's expected! r'dualopend daemon died before signed PSBT returned']) tests/test_opening.py:944: ... > raise TimeoutError('Unable to find "{}" in logs.'.format(exs)) E TimeoutError: Unable to find "[re.compile('dualopend daemon died before signed PSBT returned')]" in logs. ``` Signed-off-by: Rusty Russell --- tests/test_opening.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_opening.py b/tests/test_opening.py index af9920404e5e..7346c7283779 100644 --- a/tests/test_opening.py +++ b/tests/test_opening.py @@ -944,7 +944,7 @@ def test_rbf_reconnect_tx_construct(node_factory, bitcoind, chainparams): l2.daemon.wait_for_logs([r'Got dualopend reestablish', r'No commitment, not sending our sigs', # This is a BROKEN log, it's expected! - r'dualopend daemon died before signed PSBT returned']) + r'dualopend daemon died before signed PSBT returned|dualopend: Owning subdaemon dualopend died']) # We don't have the commtiments yet, there's no scratch_txid inflights = only_one(l1.rpc.listpeerchannels()['channels'])['inflight'] From 479e7ae1af6df4a939621fd7c0242006cb9f98d6 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sun, 26 Jan 2025 11:36:17 +1030 Subject: [PATCH 15/16] pytest: fix flake in test_onchain_slow_anchor. If we get "lucky" then commit tx will have short sig, one less weight (1 in 256 chance): ``` @unittest.skipIf(TEST_NETWORK != 'regtest', 'elementsd anchors not supportd') def test_onchain_slow_anchor(node_factory, bitcoind): """We still use anchors for non-critical closes""" l1, l2 = node_factory.line_graph(2) # Don't let l1 succeed in sending commit tx def censoring_sendrawtx(r): return {'id': r['id'], 'result': {}} l1.daemon.rpcproxy.mock_rpc('sendrawtransaction', censoring_sendrawtx) close_start_depth = bitcoind.rpc.getblockchaininfo()['blocks'] # Make l1 close unilaterally. l1.rpc.disconnect(l2.info['id'], force=True) l1.rpc.close(l2.info['id'], unilateraltimeout=1) # We will have a super-low-prio anchor spend. l1.daemon.wait_for_log(r"Low-priority anchorspend aiming for block {} \(feerate 253\)".format(close_start_depth + 2016)) # Restart with reduced block time. l1.stop() l1.daemon.opts['dev-low-prio-anchor-blocks'] = 20 l1.start() l1.daemon.wait_for_log("Low-priority anchorspend aiming for block {}".format(close_start_depth + 20)) l1.daemon.wait_for_log("Anchorspend for local commit tx") # Won't go under 12 blocks though. # Make sure it sees all these blocks at once, to avoid test flakes! 
l1.stop() bitcoind.generate_block(7) l1.start() height = bitcoind.rpc.getblockchaininfo()['blocks'] l1.daemon.wait_for_log(r"Low-priority anchorspend aiming for block {} \(feerate 7458\)".format(height + 13)) > l1.daemon.wait_for_log(r"Anchorspend for local commit tx fee 12335sat \(w=714\), commit_tx fee 4545sat \(w=768\): package feerate 11390 perkw") ``` Here's the log we *did* get: ``` 2025-01-25T08:46:40.9399213Z lightningd-1 2025-01-25T08:40:06.312Z DEBUG 022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59-chan#1: Anchorspend for local commit tx fee 12328sat (w=714), commit_tx fee 4545sat (w=767): package feerate 11392 perkw ``` Signed-off-by: Rusty Russell --- tests/test_closing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_closing.py b/tests/test_closing.py index c24b4cae8c71..6fed91de54b4 100644 --- a/tests/test_closing.py +++ b/tests/test_closing.py @@ -4239,7 +4239,8 @@ def censoring_sendrawtx(r): height = bitcoind.rpc.getblockchaininfo()['blocks'] l1.daemon.wait_for_log(r"Low-priority anchorspend aiming for block {} \(feerate 7458\)".format(height + 13)) - l1.daemon.wait_for_log(r"Anchorspend for local commit tx fee 12335sat \(w=714\), commit_tx fee 4545sat \(w=768\): package feerate 11390 perkw") + # Can be out-by-one (short sig)! + l1.daemon.wait_for_log(r"Anchorspend for local commit tx fee (12335|12328)sat \(w=714\), commit_tx fee 4545sat \(w=76[78]\): package feerate 1139[02] perkw") assert not l1.daemon.is_in_log("Low-priority anchorspend aiming for block {}".format(height + 12)) bitcoind.generate_block(1) From 00b22d193784b48613ac273aa65a8d68f5e54fcd Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sun, 26 Jan 2025 16:33:22 +1030 Subject: [PATCH 16/16] lightningd: add option to crash itself after some time. We have CI runs which timeout (after 2 hours). It's not clear why, but we can at least eliminate CLN lockups as the answer. Since pytest disabled the --timeout option on test shutdown, we could be seeing an issue on stopping taking a long time? 
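The mechanism is a plain SIGALRM deadman switch: arm alarm() once at startup and abort() when it fires, so a wedged process dies with a core dump instead of letting CI hit its two-hour timeout. The real implementation is the options.c hunk below; purely for illustration, here is the same idea as a minimal Python sketch (not CLN code; the 3600-second value just mirrors what the pyln-testing fixture passes):

```python
import os
import signal


def _crash(signum, frame):
    # Still running when the alarm fires: die loudly so we get a core/backtrace.
    os.abort()


def arm_crash_after(seconds):
    """Arm a one-shot deadman switch that crashes the process after `seconds`."""
    signal.signal(signal.SIGALRM, _crash)
    signal.alarm(seconds)


arm_crash_after(3600)
```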
Signed-off-by: Rusty Russell
---
 contrib/msggen/msggen/schema.json      |  4 ++++
 doc/schemas/lightning-listconfigs.json |  4 ++++
 lightningd/options.c                   | 30 ++++++++++++++++++++++++++
 tests/fixtures.py                      |  3 +++
 4 files changed, 41 insertions(+)

diff --git a/contrib/msggen/msggen/schema.json b/contrib/msggen/msggen/schema.json
index 766af8fc22a5..048f9f928c57 100644
--- a/contrib/msggen/msggen/schema.json
+++ b/contrib/msggen/msggen/schema.json
@@ -20552,6 +20552,10 @@
       "value_int": 3,
       "source": "cmdline"
     },
+    "dev-crash-after": {
+      "value_str": "3600",
+      "source": "cmdline"
+    },
     "dev-fail-on-subdaemon-fail": {
       "set": true,
       "source": "cmdline"
diff --git a/doc/schemas/lightning-listconfigs.json b/doc/schemas/lightning-listconfigs.json
index 260dea4c9e87..f96463d950df 100644
--- a/doc/schemas/lightning-listconfigs.json
+++ b/doc/schemas/lightning-listconfigs.json
@@ -2678,6 +2678,10 @@
       "value_int": 3,
       "source": "cmdline"
     },
+    "dev-crash-after": {
+      "value_str": "3600",
+      "source": "cmdline"
+    },
     "dev-fail-on-subdaemon-fail": {
       "set": true,
       "source": "cmdline"
diff --git a/lightningd/options.c b/lightningd/options.c
index 643e0063e6f0..147592a24e53 100644
--- a/lightningd/options.c
+++ b/lightningd/options.c
@@ -811,6 +811,31 @@ static char *opt_ignore(void *unused)
         return NULL;
 }
 
+static void handle_alarm(int sig)
+{
+        abort();
+}
+
+static char *opt_set_crash_timeout(const char *arg, struct lightningd *ld)
+{
+        struct sigaction act;
+        u32 time;
+        char *errstr = opt_set_u32(arg, &time);
+        if (errstr)
+                return errstr;
+
+        /* In case we're *REALLY* stuck, use alarm() */
+        memset(&act, 0, sizeof(act));
+        act.sa_handler = handle_alarm;
+        act.sa_flags = 0;
+
+        if (sigaction(SIGALRM, &act, NULL) != 0)
+                err(1, "Setting up SIGALRM handler");
+
+        alarm(time);
+        return NULL;
+}
+
 static void dev_register_opts(struct lightningd *ld)
 {
         /* We might want to debug plugins, which are started before normal
@@ -978,6 +1003,10 @@ static void dev_register_opts(struct lightningd *ld)
                        opt_set_u32, opt_show_u32,
                        &ld->dev_low_prio_anchor_blocks,
                        "How many blocks to aim for low-priority anchor closes (default: 2016)");
+        clnopt_witharg("--dev-crash-after", OPT_DEV,
+                       opt_set_crash_timeout, NULL,
+                       ld,
+                       "Crash if we are still going after this long.");
         /* This is handled directly in daemon_developer_mode(), so we ignore it here */
         clnopt_noarg("--dev-debug-self", OPT_DEV,
                      opt_ignore,
@@ -2219,6 +2248,7 @@ bool is_known_opt_cb_arg(char *(*cb_arg)(const char *, void *))
                 || cb_arg == (void *)opt_add_accept_htlc_tlv
                 || cb_arg == (void *)opt_set_codex32_or_hex
                 || cb_arg == (void *)opt_subd_dev_disconnect
+                || cb_arg == (void *)opt_set_crash_timeout
                 || cb_arg == (void *)opt_add_api_beg
                 || cb_arg == (void *)opt_force_featureset
                 || cb_arg == (void *)opt_force_privkey
diff --git a/tests/fixtures.py b/tests/fixtures.py
index ea23f10ad0a9..1e0334afe27a 100644
--- a/tests/fixtures.py
+++ b/tests/fixtures.py
@@ -18,6 +18,9 @@ class LightningNode(utils.LightningNode):
     def __init__(self, *args, **kwargs):
         utils.LightningNode.__init__(self, *args, **kwargs)
 
+        # This is a recent innovation, and we don't want to nail pyln-testing to this version.
+        self.daemon.opts['dev-crash-after'] = 3600
+
         # We have some valgrind suppressions in the `tests/`
         # directory, so we can add these to the valgrind configuration
         # (not generally true when running pyln-testing, hence why