From d84850cf63ec5ac9336750c7828df2f5028455c0 Mon Sep 17 00:00:00 2001
From: Rusty Russell
Date: Fri, 17 Oct 2025 23:09:48 +1030
Subject: [PATCH 1/3] libplugin: remove redundant destructor which causes
 exponential slowdown on large numbers of requests.

Note that we create a destructor on the command to reset request->cmd
pointer if the cmd is freed (so we know not to call the callback).
But attaching hundreds of thousands of them is slow: it's a
single-linked list, which is iterated in several places.

But that's redundant: the request is now allocated off the cmd, so
freeing the command will free the request anyway.

Hacking in something to print progress to a file, here's the number of
requests processed every 10 seconds before and after:

Before:
$ while sleep 10; do wc -l /tmp/bkpr-progress; done
181529 /tmp/bkpr-progress
195994 /tmp/bkpr-progress
207083 /tmp/bkpr-progress
226336 /tmp/bkpr-progress
234319 /tmp/bkpr-progress
241514 /tmp/bkpr-progress
247421 /tmp/bkpr-progress
255292 /tmp/bkpr-progress
261367 /tmp/bkpr-progress
269085 /tmp/bkpr-progress
276953 /tmp/bkpr-progress
282233 /tmp/bkpr-progress
286193 /tmp/bkpr-progress
290930 /tmp/bkpr-progress
295276 /tmp/bkpr-progress
301086 /tmp/bkpr-progress

After:
169505 /tmp/bkpr-progress
196010 /tmp/bkpr-progress
219370 /tmp/bkpr-progress
235671 /tmp/bkpr-progress
244242 /tmp/bkpr-progress
255362 /tmp/bkpr-progress
265636 /tmp/bkpr-progress
276966 /tmp/bkpr-progress
284451 /tmp/bkpr-progress
288836 /tmp/bkpr-progress
296578 /tmp/bkpr-progress
304571 /tmp/bkpr-progress

Signed-off-by: Rusty Russell
---
 plugins/libplugin.c | 25 -------------------------
 1 file changed, 25 deletions(-)

diff --git a/plugins/libplugin.c b/plugins/libplugin.c
index 36a74d9f8134..167d13621099 100644
--- a/plugins/libplugin.c
+++ b/plugins/libplugin.c
@@ -290,17 +290,6 @@ static void ld_rpc_send(struct plugin *plugin, struct json_stream *stream)
 	io_wake(plugin->io_rpc_conn);
 }
 
-
-/* When cmd for request is gone, we use this as noop callback */
-static struct command_result *ignore_cb(struct command *command,
-					const char *method,
-					const char *buf,
-					const jsmntok_t *result,
-					void *arg)
-{
-	return &complete;
-}
-
 /* Ignore the result, and terminate the timer/aux/hook */
 struct command_result *ignore_and_complete(struct command *cmd,
 					   const char *method,
@@ -357,14 +346,6 @@ struct command_result *plugin_broken_cb(struct command *cmd,
 					json_tok_full(buf, result));
 }
 
-static void disable_request_cb(struct command *cmd, struct out_req *out)
-{
-	out->errcb = NULL;
-	out->cb = ignore_cb;
-	/* Called because cmd got free'd */
-	out->cmd = NULL;
-}
-
 /* Prefix is usually a cmd->id */
 static const char *json_id(const tal_t *ctx, struct plugin *plugin,
 			   const char *method, const char *prefix)
@@ -424,9 +405,6 @@ jsonrpc_request_start_(struct command *cmd,
 	strmap_add(&cmd->plugin->out_reqs, out->id, out);
 	tal_add_destructor2(out, destroy_out_req, cmd->plugin);
 
-	/* If command goes away, don't call callbacks! */
-	tal_add_destructor2(out->cmd, disable_request_cb, out);
-
 	out->js = new_json_stream(NULL, cmd, NULL);
 	json_object_start(out->js, NULL);
 	json_add_string(out->js, "jsonrpc", "2.0");
@@ -1100,9 +1078,6 @@ static void handle_rpc_reply(struct plugin *plugin, const char *buf, const jsmnt
 		return;
 	}
 
-	/* Remove destructor if one existed */
-	tal_del_destructor2(out->cmd, disable_request_cb, out);
-
 	/* We want to free this if callback doesn't.
 	 */
 	tal_steal(tmpctx, out);

From f582452886cc2a482fcde0331477114e265a99c6 Mon Sep 17 00:00:00 2001
From: Rusty Russell
Date: Sat, 18 Oct 2025 08:51:47 +1030
Subject: [PATCH 2/3] plugins/sql: add payment_hash index to channelmoves
 table.

This significantly speeds up the query which bookkeeper often does:

	"SELECT created_index"
	" FROM channelmoves"
	" WHERE payment_hash = X'%s'"
	" AND credit_msat = %"PRIu64
	" AND created_index <= %"PRIu64,

On large databases this scan is expensive, and a payment_hash index
cuts it down a great deal.  It does take longer to load the
channelmoves in the first place though (about 3x).

Before:
$ while sleep 10; do wc -l /tmp/bkpr-progress; done
169505 /tmp/bkpr-progress
196010 /tmp/bkpr-progress
219370 /tmp/bkpr-progress
235671 /tmp/bkpr-progress
244242 /tmp/bkpr-progress
255362 /tmp/bkpr-progress
265636 /tmp/bkpr-progress
276966 /tmp/bkpr-progress
284451 /tmp/bkpr-progress
288836 /tmp/bkpr-progress
296578 /tmp/bkpr-progress
304571 /tmp/bkpr-progress

After:
$ while sleep 10; do wc -l /tmp/bkpr-progress; done
161421 /tmp/bkpr-progress
238273 /tmp/bkpr-progress
281185 /tmp/bkpr-progress
305787 /tmp/bkpr-progress

Signed-off-by: Rusty Russell
Changelog-Changed: plugins: the sql plugin now keeps an index on `channelmoves` by `payment_hash`.
---
 plugins/sql.c        | 22 ++++++++++++++--------
 tests/test_plugin.py |  2 +-
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/plugins/sql.c b/plugins/sql.c
index 7b7e21bf4793..a1c361da2e18 100644
--- a/plugins/sql.c
+++ b/plugins/sql.c
@@ -197,6 +197,10 @@ static const struct index indices[] = {
 		"channelmoves",
 		{ "account_id", NULL },
 	},
+	{
+		"channelmoves",
+		{ "payment_hash", NULL },
+	},
 };
 
 static enum fieldtype find_fieldtype(const jsmntok_t *name)
@@ -1757,20 +1761,22 @@ static const char *fmt_indexes(const tal_t *ctx, const char *table)
 	for (size_t i = 0; i < ARRAY_SIZE(indices); i++) {
 		if (!streq(indices[i].tablename, table))
 			continue;
-		/* FIXME: Handle multiple indices! */
-		assert(!ret);
+		if (!ret)
+			ret = tal_fmt(ctx, " indexed by ");
+		else
+			tal_append_fmt(&ret, ", also indexed by ");
 		BUILD_ASSERT(ARRAY_SIZE(indices[i].fields) == 2);
 		if (indices[i].fields[1])
-			ret = tal_fmt(tmpctx, "`%s` and `%s`",
-				      indices[i].fields[0],
-				      indices[i].fields[1]);
+			tal_append_fmt(&ret, "`%s` and `%s`",
+				       indices[i].fields[0],
+				       indices[i].fields[1]);
 		else
-			ret = tal_fmt(tmpctx, "`%s`",
-				      indices[i].fields[0]);
+			tal_append_fmt(&ret, "`%s`",
+				       indices[i].fields[0]);
 	}
 	if (!ret)
 		return "";
-	return tal_fmt(ctx, " indexed by %s", ret);
+	return ret;
 }
 
 static const char *json_prefix(const tal_t *ctx,
diff --git a/tests/test_plugin.py b/tests/test_plugin.py
index 82f832d42c2c..48f4dcfef010 100644
--- a/tests/test_plugin.py
+++ b/tests/test_plugin.py
@@ -3795,7 +3795,7 @@ def test_sql(node_factory, bitcoind):
                                  {'name': 'extra_tags', 'type': 'string'}]},
     'channelmoves': {
-        'indices': [['account_id']],
+        'indices': [['account_id'], ['payment_hash']],
         'columns': [{'name': 'created_index', 'type': 'u64'},
                     {'name': 'account_id',

From bd8c907de5a9298bba8a85f40f383b4b9f8bb3cd Mon Sep 17 00:00:00 2001
From: Rusty Russell
Date: Sat, 18 Oct 2025 08:51:49 +1030
Subject: [PATCH 3/3] bookkeeper: only read listchannelmoves 1000 entries at a
 time.

If we read all of them, we might get 1.6M at once (after initial
migration).  Then we submit a few hundred thousand simultaneous
requests to lightningd, and it gets upset, queueing them all on the
xpay command hook and running out of memory.

Signed-off-by: Rusty Russell
Changelog-Fixed: plugins: bookkeeper first invocation after migration from prior to 25.09 with very large databases will not crash.
---
 plugins/bkpr/bookkeeper.c | 35 +++++++++++++++++++++++++++--------
 1 file changed, 27 insertions(+), 8 deletions(-)

diff --git a/plugins/bkpr/bookkeeper.c b/plugins/bkpr/bookkeeper.c
index 1312924de600..75fb76ad5aab 100644
--- a/plugins/bkpr/bookkeeper.c
+++ b/plugins/bkpr/bookkeeper.c
@@ -53,6 +53,10 @@ static struct refresh_info *use_rinfo(struct refresh_info *rinfo)
 	return rinfo;
 }
 
+/* Recursion */
+static struct command_result *limited_listchannelmoves(struct command *cmd,
+						       struct refresh_info *rinfo);
+
 static struct command_result *rinfo_one_done(struct command *cmd,
 					     struct refresh_info *rinfo)
 {
@@ -123,16 +127,37 @@ static struct command_result *listchannelmoves_done(struct command *cmd,
 			 &be_index, sizeof(be_index),
 			 "create-or-replace",
 			 datastore_done, NULL, use_rinfo(rinfo));
+
+	/* If there might be more, try asking for more */
+	if (moves->size != 0)
+		limited_listchannelmoves(cmd, rinfo);
+
 	return rinfo_one_done(cmd, rinfo);
 }
 
+/* We do 1000 at a time to avoid overwhelming lightningd */
+static struct command_result *limited_listchannelmoves(struct command *cmd,
+						       struct refresh_info *rinfo)
+{
+	struct bkpr *bkpr = bkpr_of(cmd->plugin);
+	struct out_req *req;
+
+	req = jsonrpc_request_start(cmd, "listchannelmoves",
+				    listchannelmoves_done,
+				    plugin_broken_cb,
+				    use_rinfo(rinfo));
+	json_add_string(req->js, "index", "created");
+	json_add_u64(req->js, "start", bkpr->channelmoves_index + 1);
+	json_add_u64(req->js, "limit", 1000);
+	return send_outreq(req);
+}
+
 static struct command_result *listchainmoves_done(struct command *cmd,
 						  const char *method,
 						  const char *buf,
 						  const jsmntok_t *result,
 						  struct refresh_info *rinfo)
 {
-	struct out_req *req;
 	const jsmntok_t *moves, *t;
 	size_t i;
 	struct bkpr *bkpr = bkpr_of(cmd->plugin);
@@ -148,13 +173,7 @@ static struct command_result *listchainmoves_done(struct command *cmd,
 			 "create-or-replace",
 			 datastore_done, NULL, use_rinfo(rinfo));
 
-	req = jsonrpc_request_start(cmd, "listchannelmoves",
-				    listchannelmoves_done,
-				    plugin_broken_cb,
-				    use_rinfo(rinfo));
-	json_add_string(req->js, "index", "created");
-	json_add_u64(req->js, "start", bkpr->channelmoves_index + 1);
-	send_outreq(req);
+	limited_listchannelmoves(cmd, rinfo);
 
 	return rinfo_one_done(cmd, rinfo);
 }
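
A note on why PATCH 1/3 helps, assuming ccan/tal's destructors behave as the
commit message describes (a singly-linked per-node list searched on removal):
attaching one destructor per outstanding request to the same command makes
that command's list grow with the number of requests, and each later
tal_del_destructor2() has to search it, so the total work grows roughly
quadratically.  The following minimal, standalone sketch is not part of the
series; fake_cmd, fake_req, fake_disable and the timing harness are invented
stand-ins, and only the tal calls are real ccan/tal API.

/* Standalone illustration: fake_cmd, fake_req and the timing loop are
 * invented stand-ins; only the tal calls are real ccan/tal API. */
#include <ccan/tal/tal.h>
#include <stdio.h>
#include <time.h>

struct fake_cmd { int unused; };
struct fake_req { int unused; };

/* Stand-in for disable_request_cb: would only run if the command were freed. */
static void fake_disable(struct fake_cmd *cmd, struct fake_req *req)
{
}

int main(void)
{
	const int n = 100000;
	struct fake_cmd *cmd = tal(NULL, struct fake_cmd);
	struct fake_req **reqs = tal_arr(cmd, struct fake_req *, n);
	clock_t start;

	/* One destructor per request, all attached to the same command. */
	for (int i = 0; i < n; i++) {
		reqs[i] = tal(cmd, struct fake_req);
		tal_add_destructor2(cmd, fake_disable, reqs[i]);
	}

	/* Each removal searches the command's destructor list, so this
	 * loop is roughly O(n^2) overall: the cost the patch removes. */
	start = clock();
	for (int i = 0; i < n; i++)
		tal_del_destructor2(cmd, fake_disable, reqs[i]);
	printf("removed %d destructors in %.1fs\n", n,
	       (double)(clock() - start) / CLOCKS_PER_SEC);

	tal_free(cmd);
	return 0;
}

With the patch applied there is nothing to search: the request is allocated
off the command, so freeing the command frees the request, and the reply path
no longer needs the tal_del_destructor2() pass at all.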