Skip to content

Commit 8928f0b

Browse files
committed
gossipd: remove gossip entirely if we hit a problem on load.
The crashes in #2750 are mostly caused by us trying to partially truncate the store. The simplest fix for release is to discard the whole thing if we detect a problem. This is a workaround: it'd be far nicer to try to recover. Fixes: #2750 Signed-off-by: Rusty Russell <[email protected]>
1 parent 8ce3b86 commit 8928f0b

File tree

4 files changed

+71
-81
lines changed

4 files changed

+71
-81
lines changed

gossipd/gossip_store.c

Lines changed: 19 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -525,44 +525,6 @@ int gossip_store_readonly_fd(struct gossip_store *gs)
525525
return fd;
526526
}
527527

528-
/* If we ever truncated, we might have dangling entries. */
529-
static void cleanup_truncated_store(struct routing_state *rstate,
530-
struct gossip_store *gs,
531-
u32 chan_ann_off)
532-
{
533-
size_t num;
534-
u32 index;
535-
536-
/* channel_announce with no channel_amount. */
537-
if (chan_ann_off) {
538-
status_unusual("Deleting un-amounted channel_announcement @%u",
539-
chan_ann_off);
540-
delete_by_index(gs, chan_ann_off, WIRE_CHANNEL_ANNOUNCEMENT);
541-
}
542-
543-
num = 0;
544-
while ((index = remove_unfinalized_node_announce(rstate)) != 0) {
545-
delete_by_index(gs, index, WIRE_NODE_ANNOUNCEMENT);
546-
num++;
547-
}
548-
if (num)
549-
status_unusual("Deleted %zu unfinalized node_announcements",
550-
num);
551-
552-
num = 0;
553-
while ((index = remove_unupdated_channel_announce(rstate)) != 0) {
554-
u32 next;
555-
556-
/* Delete announcement and channel amount, too */
557-
next = delete_by_index(gs, index, WIRE_CHANNEL_ANNOUNCEMENT);
558-
delete_by_index(gs, next, WIRE_GOSSIP_STORE_CHANNEL_AMOUNT);
559-
num++;
560-
}
561-
if (num)
562-
status_unusual("Deleted %zu unupdated channel_announcements",
563-
num);
564-
}
565-
566528
bool gossip_store_load(struct routing_state *rstate, struct gossip_store *gs)
567529
{
568530
struct gossip_hdr hdr;
@@ -672,25 +634,36 @@ bool gossip_store_load(struct routing_state *rstate, struct gossip_store *gs)
672634
clean_tmpctx();
673635
}
674636

637+
if (chan_ann) {
638+
status_unusual("gossip_store: dangling channel_announcement");
639+
goto truncate_nomsg;
640+
}
641+
642+
bad = unfinalized_entries(tmpctx, rstate);
643+
if (bad) {
644+
status_unusual("gossip_store: %s", bad);
645+
goto truncate_nomsg;
646+
}
647+
675648
/* If last timestamp is within 24 hours, say we're OK. */
676649
contents_ok = (last_timestamp >= time_now().ts.tv_sec - 24*3600);
677650
goto out;
678651

679652
truncate:
680-
status_unusual("gossip_store: %s (%s) truncating to %"PRIu64,
681-
bad, tal_hex(msg, msg), gs->len);
653+
status_unusual("gossip_store: %s (%s) truncating",
654+
bad, tal_hex(msg, msg));
655+
682656
truncate_nomsg:
683-
/* FIXME: We would like to truncate to known_good, except we would
684-
* miss channel_delete msgs. If we put block numbers into the store
685-
* as we process them, we can know how far we need to roll back if we
686-
* truncate the store */
687-
if (ftruncate(gs->fd, gs->len) != 0)
657+
/* FIXME: Debug partial truncate case. */
658+
if (ftruncate(gs->fd, 1) != 0)
688659
status_failed(STATUS_FAIL_INTERNAL_ERROR,
689660
"Truncating store: %s", strerror(errno));
661+
remove_all_gossip(rstate);
662+
gs->count = gs->deleted = 0;
663+
gs->len = 1;
690664
contents_ok = false;
691665
out:
692666
gs->writable = true;
693-
cleanup_truncated_store(rstate, gs, chan_ann ? chan_ann_off : 0);
694667
status_trace("total store load time: %"PRIu64" msec",
695668
time_to_msec(time_between(time_now(), start)));
696669
status_trace("gossip_store: Read %zu/%zu/%zu/%zu cannounce/cupdate/nannounce/cdelete from store (%zu deleted) in %"PRIu64" bytes",

gossipd/routing.c

Lines changed: 47 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -2583,42 +2583,63 @@ struct timeabs gossip_time_now(const struct routing_state *rstate)
25832583
return time_now();
25842584
}
25852585

2586-
/* gossip_store wants to delete any dangling node_announcement msgs */
2587-
u32 remove_unfinalized_node_announce(struct routing_state *rstate)
2586+
const char *unfinalized_entries(const tal_t *ctx, struct routing_state *rstate)
25882587
{
2589-
/* We're only interested in node_announcement we caught. */
2590-
for (;;) {
2591-
struct pending_node_announce *pna;
2592-
struct pending_node_map_iter it;
2588+
struct unupdated_channel *uc;
2589+
u64 index;
2590+
struct pending_node_announce *pna;
2591+
struct pending_node_map_iter it;
25932592

2594-
pna = pending_node_map_first(rstate->pending_node_map, &it);
2595-
if (!pna)
2596-
return 0;
2593+
uc = uintmap_first(&rstate->unupdated_chanmap, &index);
2594+
if (uc)
2595+
return tal_fmt(ctx, "Unupdated channel_announcement at %u",
2596+
uc->index);
25972597

2598-
/* This will be deleted by the associated unupdated_channel; just
2599-
* remove from map for now. */
2600-
pending_node_map_del(rstate->pending_node_map, pna);
2601-
if (!pna->node_announcement)
2602-
continue;
2598+
pna = pending_node_map_first(rstate->pending_node_map, &it);
2599+
if (pna)
2600+
return tal_fmt(ctx, "Waiting node_announcement at %u",
2601+
pna->index);
26032602

2604-
assert(pna->index);
2605-
return pna->index;
2606-
}
2603+
return NULL;
26072604
}
26082605

2609-
/* gossip_store wants to delete any dangling channel_announcement msgs */
2610-
u32 remove_unupdated_channel_announce(struct routing_state *rstate)
2606+
/* Gossip store was corrupt, forget anything we loaded. */
2607+
void remove_all_gossip(struct routing_state *rstate)
26112608
{
2609+
struct node *n;
2610+
struct node_map_iter nit;
2611+
struct chan *c;
26122612
struct unupdated_channel *uc;
26132613
u64 index;
2614+
struct pending_cannouncement *pca;
2615+
struct pending_cannouncement_map_iter pit;
2616+
struct pending_node_map_iter pnait;
2617+
2618+
/* We don't want them to try to delete from store, so do this
2619+
* manually. */
2620+
while ((n = node_map_first(rstate->nodes, &nit)) != NULL) {
2621+
tal_del_destructor2(n, destroy_node, rstate);
2622+
if (node_uses_chan_map(n))
2623+
chan_map_clear(&n->chans.map);
2624+
node_map_del(rstate->nodes, n);
2625+
tal_free(n);
2626+
}
26142627

2615-
uc = uintmap_first(&rstate->unupdated_chanmap, &index);
2616-
if (!uc)
2617-
return 0;
2628+
/* Now free all the channels. */
2629+
while ((c = uintmap_first(&rstate->chanmap, &index)) != NULL) {
2630+
uintmap_del(&rstate->chanmap, index);
2631+
2632+
/* Remove from local_disabled_map if it's there. */
2633+
chan_map_del(&rstate->local_disabled_map, c);
2634+
tal_free(c);
2635+
}
2636+
2637+
while ((uc = uintmap_first(&rstate->unupdated_chanmap, &index)) != NULL)
2638+
tal_free(uc);
26182639

2619-
assert(uc->index);
2620-
index = uc->index;
2640+
while ((pca = pending_cannouncement_map_first(&rstate->pending_cannouncements, &pit)) != NULL)
2641+
tal_free(pca);
26212642

2622-
tal_free(uc);
2623-
return index;
2643+
/* Freeing unupdated chanmaps should empty this */
2644+
assert(pending_node_map_first(rstate->pending_node_map, &pnait) == NULL);
26242645
}

gossipd/routing.h

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -437,12 +437,8 @@ struct wireaddr *read_addresses(const tal_t *ctx, const u8 *ser);
437437
void remove_channel_from_store(struct routing_state *rstate,
438438
struct chan *chan);
439439

440-
/* gossip_store wants to delete any dangling entries immediately after
441-
* load; return 0 if no more, otherwise index into store.
442-
*
443-
* Must call remove_unfinalized_node_announce first, because removing
444-
* unupdated channels may delete associated node_announcements. */
445-
u32 remove_unfinalized_node_announce(struct routing_state *rstate);
446-
u32 remove_unupdated_channel_announce(struct routing_state *rstate);
440+
/* Returns an error string if there are unfinalized entries after load */
441+
const char *unfinalized_entries(const tal_t *ctx, struct routing_state *rstate);
447442

443+
void remove_all_gossip(struct routing_state *rstate);
448444
#endif /* LIGHTNING_GOSSIPD_ROUTING_H */

tests/test_gossip.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -945,8 +945,8 @@ def test_gossip_store_load_amount_truncated(node_factory):
945945

946946
l1.start()
947947
# May precede the Started msg waited for in 'start'.
948-
wait_for(lambda: l1.daemon.is_in_log(r'Deleting un-amounted channel_announcement @1'))
949-
wait_for(lambda: l1.daemon.is_in_log(r'gossip_store: Read 0/0/0/0 cannounce/cupdate/nannounce/cdelete from store \(1 deleted\) in 445 bytes'))
948+
wait_for(lambda: l1.daemon.is_in_log(r'gossip_store: dangling channel_announcement'))
949+
wait_for(lambda: l1.daemon.is_in_log(r'gossip_store: Read 0/0/0/0 cannounce/cupdate/nannounce/cdelete from store \(0 deleted\) in 1 bytes'))
950950
assert not l1.daemon.is_in_log('gossip_store.*truncating')
951951

952952
# Extra sanity check if we can.

0 commit comments

Comments
 (0)