Skip to content

Commit a452050

Browse files
committed
common: Add a lame-duck mode for tracing
The tracing subsystem has some rather strong consistency requirements that are easily broken by incorrect use of its external API. For example not closing a span can cause an assertion on the next call to crash the daemon. Due to the separation in time and code, it is not always easy to debug there, and crashing the entire system is bad. The lame-duck mode is switched to as soon as the first error occurs, and we just stop emitting traces in that case. Changelog-None
1 parent beffd02 commit a452050

File tree

2 files changed

+51
-8
lines changed

2 files changed

+51
-8
lines changed

common/trace.c

Lines changed: 46 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,39 @@ struct span {
7676
static struct span *active_spans = NULL;
7777
static struct span *current;
7878

79+
/* Whether or not we're tracing things. We start in tracing mode, but
80+
* may deactivate ourselves, most likely due to an API usage
81+
* error. This is a sort of lame-duck mode, in which traces are
82+
* no-ops. It is safer than crashing the entire daemon.
83+
*/
84+
static bool active = true;
85+
86+
#define TRACE_ASSERT_RET(cond, msg, ret) \
87+
{ \
88+
if (!active) { \
89+
return ret; \
90+
} else if (!cond) { \
91+
fprintf(stderr, \
92+
"TRACE_ASSERT: Detected inconsistency in " \
93+
"tracing at %s:%d\n%s", \
94+
__FILE__, __LINE__, msg); \
95+
return ret; \
96+
} \
97+
}
98+
99+
#define TRACE_ASSERT(cond, msg) \
100+
{ \
101+
if (!active) { \
102+
return; \
103+
} else if (!cond) { \
104+
fprintf(stderr, \
105+
"TRACE_ASSERT: Detected inconsistency in " \
106+
"tracing at %s:%d\n%s", \
107+
__FILE__, __LINE__, msg); \
108+
return; \
109+
} \
110+
}
111+
79112
/* If the `CLN_TRACEPARENT` envvar is set, we inject that as the
80113
* parent for the startup. This allows us to integrate the startup
81114
* tracing with whatever tooling we build around it. This only has an
@@ -187,8 +220,8 @@ static struct span *trace_span_slot(void)
187220

188221
/* Might end up here if we have more than MAX_ACTIVE_SPANS
189222
* concurrent spans. */
190-
assert(s);
191-
assert(s->parent == NULL);
223+
TRACE_ASSERT_RET(s, "MAX_ACTIVE_SPANS exceeded", NULL);
224+
TRACE_ASSERT_RET(!s->parent, "New span already has a parent", NULL);
192225

193226
/* Be extra careful not to create cycles. If we return the
194227
* position that is pointed at by current then we can only
@@ -205,6 +238,11 @@ static void trace_emit(struct span *s)
205238
char trace_id[HEX_TRACE_ID_SIZE];
206239
char parent_span_id[HEX_SPAN_ID_SIZE];
207240

241+
/* If we're not active we cannot emit spans, they might be
242+
* half-initialized. */
243+
if (!active)
244+
return;
245+
208246
/* If this is a remote span it's not up to us to emit it. Make
209247
* this a no-op. `trace_span_end` will take care of cleaning
210248
* the in-memory span up. */
@@ -263,7 +301,7 @@ void trace_span_start(const char *name, const void *key)
263301
trace_init();
264302
trace_check_tree();
265303

266-
assert(trace_span_find(numkey) == NULL);
304+
TRACE_ASSERT((trace_span_find(numkey) == NULL), "Reused key");
267305
struct span *s = trace_span_slot();
268306
s->key = numkey;
269307
randombytes_buf(s->id, SPAN_ID_SIZE);
@@ -295,8 +333,8 @@ void trace_span_end(const void *key)
295333
{
296334
size_t numkey = trace_key(key);
297335
struct span *s = trace_span_find(numkey);
298-
assert(s && "Span to end not found");
299-
assert(s == current && "Ending a span that isn't the current one");
336+
TRACE_ASSERT(s, "Span to end not found");
337+
TRACE_ASSERT((s == current),"Ending a span that isn't the current one");
300338

301339
trace_check_tree();
302340

@@ -315,7 +353,7 @@ void trace_span_end(const void *key)
315353
* the root. And we should terminate that trace along with
316354
* this one. */
317355
if (current && current->remote) {
318-
assert(current->parent == NULL);
356+
TRACE_ASSERT((current->parent == NULL), "Remote parent trace, but not local root?");
319357
current = NULL;
320358
}
321359
trace_check_tree();
@@ -325,7 +363,7 @@ void trace_span_tag(const void *key, const char *name, const char *value)
325363
{
326364
size_t numkey = trace_key(key);
327365
struct span *span = trace_span_find(numkey);
328-
assert(span);
366+
TRACE_ASSERT(span, "Tagging a non-existent span.");
329367

330368
size_t s = tal_count(span->tags);
331369
tal_resize(&span->tags, s + 1);
@@ -344,7 +382,7 @@ void trace_span_suspend_(const void *key, const char *lbl)
344382
size_t numkey = trace_key(key);
345383
struct span *span = trace_span_find(numkey);
346384
TRACE_DBG("Suspending span %s (%zu)\n", current->name, current->key);
347-
assert(current == span);
385+
TRACE_ASSERT((current == span), "Suspending non-current span");
348386
current = NULL;
349387
DTRACE_PROBE1(lightningd, span_suspend, span->id);
350388
}

contrib/pyln-testing/pyln/testing/fixtures.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -497,6 +497,7 @@ def map_node_error(nodes, f, msg):
497497
map_node_error(nf.nodes, checkBroken, "had BROKEN messages")
498498
map_node_error(nf.nodes, lambda n: not n.allow_warning and n.daemon.is_in_log(r' WARNING:'), "had warning messages")
499499
map_node_error(nf.nodes, checkReconnect, "had unexpected reconnections")
500+
map_node_error(nf.nodes, get_trace_assert_errors, "had trace_assert errors")
500501

501502
# On any bad gossip complaints, print out every nodes' gossip_store
502503
if map_node_error(nf.nodes, checkBadGossip, "had bad gossip messages"):
@@ -532,6 +533,10 @@ def prinErrlog(node):
532533
return 1 if errors else 0
533534

534535

536+
def get_trace_assert_errors(node):
537+
return 1 if node.daemon.is_in_log('TRACE_ASSERT') else 0
538+
539+
535540
def getValgrindErrors(node):
536541
for error_file in os.listdir(node.daemon.lightning_dir):
537542
if not re.fullmatch(r"valgrind-errors.\d+", error_file):

0 commit comments

Comments
 (0)