Skip to content

Commit c15d9ed

Browse files
committed
gossip_store: make copy of corrupt gossip_store on failure.
This should help debugging vastly. Signed-off-by: Rusty Russell <[email protected]>
1 parent 8928f0b commit c15d9ed

File tree

2 files changed

+34
-27
lines changed

2 files changed

+34
-27
lines changed

gossipd/gossip_store.c

Lines changed: 28 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include <ccan/endian/endian.h>
66
#include <ccan/noerr/noerr.h>
77
#include <ccan/read_write_all/read_write_all.h>
8+
#include <ccan/tal/str/str.h>
89
#include <common/gossip_store.h>
910
#include <common/status.h>
1011
#include <common/utils.h>
@@ -546,13 +547,13 @@ bool gossip_store_load(struct routing_state *rstate, struct gossip_store *gs)
546547
msg = tal_arr(tmpctx, u8, msglen);
547548

548549
if (pread(gs->fd, msg, msglen, gs->len+sizeof(hdr)) != msglen) {
549-
status_unusual("gossip_store: truncated file?");
550-
goto truncate_nomsg;
550+
bad = "gossip_store: truncated file?";
551+
goto corrupt;
551552
}
552553

553554
if (checksum != crc32c(be32_to_cpu(hdr.timestamp), msg, msglen)) {
554555
bad = "Checksum verification failed";
555-
goto truncate;
556+
goto badmsg;
556557
}
557558

558559
/* Skip deleted entries */
@@ -568,7 +569,7 @@ bool gossip_store_load(struct routing_state *rstate, struct gossip_store *gs)
568569
if (!fromwire_gossip_store_channel_amount(msg,
569570
&satoshis)) {
570571
bad = "Bad gossip_store_channel_amount";
571-
goto truncate;
572+
goto badmsg;
572573
}
573574
/* Previous channel_announcement may have been deleted */
574575
if (!chan_ann)
@@ -578,15 +579,15 @@ bool gossip_store_load(struct routing_state *rstate, struct gossip_store *gs)
578579
satoshis,
579580
chan_ann_off)) {
580581
bad = "Bad channel_announcement";
581-
goto truncate;
582+
goto badmsg;
582583
}
583584
chan_ann = NULL;
584585
stats[0]++;
585586
break;
586587
case WIRE_CHANNEL_ANNOUNCEMENT:
587588
if (chan_ann) {
588589
bad = "channel_announcement without amount";
589-
goto truncate;
590+
goto badmsg;
590591
}
591592
/* Save for channel_amount (next msg) */
592593
chan_ann = tal_steal(gs, msg);
@@ -598,34 +599,34 @@ bool gossip_store_load(struct routing_state *rstate, struct gossip_store *gs)
598599
case WIRE_GOSSIP_STORE_PRIVATE_UPDATE:
599600
if (!fromwire_gossip_store_private_update(tmpctx, msg, &msg)) {
600601
bad = "invalid gossip_store_private_update";
601-
goto truncate;
602+
goto badmsg;
602603
}
603604
/* fall thru */
604605
case WIRE_CHANNEL_UPDATE:
605606
if (!routing_add_channel_update(rstate,
606607
take(msg), gs->len)) {
607608
bad = "Bad channel_update";
608-
goto truncate;
609+
goto badmsg;
609610
}
610611
stats[1]++;
611612
break;
612613
case WIRE_NODE_ANNOUNCEMENT:
613614
if (!routing_add_node_announcement(rstate,
614615
take(msg), gs->len)) {
615616
bad = "Bad node_announcement";
616-
goto truncate;
617+
goto badmsg;
617618
}
618619
stats[2]++;
619620
break;
620621
case WIRE_GOSSIPD_LOCAL_ADD_CHANNEL:
621622
if (!handle_local_add_channel(rstate, msg, gs->len)) {
622623
bad = "Bad local_add_channel";
623-
goto truncate;
624+
goto badmsg;
624625
}
625626
break;
626627
default:
627628
bad = "Unknown message";
628-
goto truncate;
629+
goto badmsg;
629630
}
630631

631632
gs->count++;
@@ -635,29 +636,33 @@ bool gossip_store_load(struct routing_state *rstate, struct gossip_store *gs)
635636
}
636637

637638
if (chan_ann) {
638-
status_unusual("gossip_store: dangling channel_announcement");
639-
goto truncate_nomsg;
639+
bad = "dangling channel_announcement";
640+
goto corrupt;
640641
}
641642

642643
bad = unfinalized_entries(tmpctx, rstate);
643-
if (bad) {
644-
status_unusual("gossip_store: %s", bad);
645-
goto truncate_nomsg;
646-
}
644+
if (bad)
645+
goto corrupt;
647646

648647
/* If last timestamp is within 24 hours, say we're OK. */
649648
contents_ok = (last_timestamp >= time_now().ts.tv_sec - 24*3600);
650649
goto out;
651650

652-
truncate:
653-
status_unusual("gossip_store: %s (%s) truncating",
654-
bad, tal_hex(msg, msg));
651+
badmsg:
652+
bad = tal_fmt(tmpctx, "%s (%s)", bad, tal_hex(tmpctx, msg));
653+
654+
corrupt:
655+
status_broken("gossip_store: %s. Moving to %s.corrupt and truncating",
656+
bad, GOSSIP_STORE_FILENAME);
655657

656-
truncate_nomsg:
657658
/* FIXME: Debug partial truncate case. */
658-
if (ftruncate(gs->fd, 1) != 0)
659+
rename(GOSSIP_STORE_FILENAME, GOSSIP_STORE_FILENAME ".corrupt");
660+
close(gs->fd);
661+
gs->fd = open(GOSSIP_STORE_FILENAME,
662+
O_RDWR|O_APPEND|O_TRUNC|O_CREAT, 0600);
663+
if (gs->fd < 0 || !write_all(gs->fd, &gs->version, sizeof(gs->version)))
659664
status_failed(STATUS_FAIL_INTERNAL_ERROR,
660-
"Truncating store: %s", strerror(errno));
665+
"Truncating new store file: %s", strerror(errno));
661666
remove_all_gossip(rstate);
662667
gs->count = gs->deleted = 0;
663668
gs->len = 1;

tests/test_gossip.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -945,9 +945,9 @@ def test_gossip_store_load_amount_truncated(node_factory):
945945

946946
l1.start()
947947
# May precede the Started msg waited for in 'start'.
948-
wait_for(lambda: l1.daemon.is_in_log(r'gossip_store: dangling channel_announcement'))
948+
wait_for(lambda: l1.daemon.is_in_log(r'gossip_store: dangling channel_announcement. Moving to gossip_store.corrupt and truncating'))
949949
wait_for(lambda: l1.daemon.is_in_log(r'gossip_store: Read 0/0/0/0 cannounce/cupdate/nannounce/cdelete from store \(0 deleted\) in 1 bytes'))
950-
assert not l1.daemon.is_in_log('gossip_store.*truncating')
950+
assert os.path.exists(os.path.join(l1.daemon.lightning_dir, 'gossip_store.corrupt'))
951951

952952
# Extra sanity check if we can.
953953
if DEVELOPER:
@@ -1274,10 +1274,12 @@ def test_gossip_store_load_no_channel_update(node_factory):
12741274

12751275
l1.start()
12761276

1277+
# May precede the Started msg waited for in 'start'.
1278+
wait_for(lambda: l1.daemon.is_in_log('gossip_store: Unupdated channel_announcement at 1. Moving to gossip_store.corrupt and truncating'))
1279+
assert os.path.exists(os.path.join(l1.daemon.lightning_dir, 'gossip_store.corrupt'))
1280+
12771281
# This should actually result in an empty store.
12781282
l1.rpc.call('dev-compact-gossip-store')
12791283

12801284
with open(os.path.join(l1.daemon.lightning_dir, 'gossip_store'), "rb") as f:
12811285
assert bytearray(f.read()) == bytearray.fromhex("07")
1282-
1283-
assert not l1.daemon.is_in_log('gossip_store.*truncating')

0 commit comments

Comments
 (0)