Skip to content

Commit 668358c

Browse files
committed
log_event_decoder: improve robustness for invalid group markers
Enhance the log event decoder to handle corrupted group markers more robustly while preserving data integrity: 1. **Preserve group state on invalid markers**: Invalid group markers (negative timestamps not -1 or -2) no longer clear the active group state. This prevents data loss when corruption occurs mid-group, ensuring subsequent normal logs retain their group metadata. 2. **Add recursion depth guard**: Implement a safety limit (1000) to prevent potential stack overflow from excessive consecutive group markers or corruption. The decoder returns an error if the limit is reached. 3. **Add debug logging**: Log invalid group marker timestamps at debug level to aid in troubleshooting data corruption issues without cluttering normal operation logs. 4. **Improve documentation**: Enhanced comments explaining the negative timestamp contract and the improved error handling behavior. These changes make the decoder more resilient to data corruption while maintaining backward compatibility and ensuring data integrity is preserved even when encountering invalid group markers. Signed-off-by: Eduardo Silva <[email protected]>
1 parent 0e6513f commit 668358c

File tree

3 files changed

+53
-11
lines changed

3 files changed

+53
-11
lines changed

include/fluent-bit/flb_log_event.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,17 @@
3333
#define FLB_LOG_EVENT_FORMAT_FLUENT_BIT_V1 FLB_LOG_EVENT_FORMAT_FORWARD
3434
#define FLB_LOG_EVENT_FORMAT_FLUENT_BIT_V2 4
3535

36+
/*
37+
* Log event type identification via timestamp value:
38+
* - Non-negative timestamps (>= 0): Normal log records with actual timestamps
39+
* - -1 (FLB_LOG_EVENT_GROUP_START): Group marker indicating start of a log group
40+
* - -2 (FLB_LOG_EVENT_GROUP_END): Group marker indicating end of a log group
41+
* - Other negative values: Invalid/corrupted data (will be skipped by decoder)
42+
*
43+
* NOTE: Negative timestamps are RESERVED for group markers. Only -1 and -2 are valid.
44+
* Any other negative timestamp is considered invalid and will be skipped during decoding.
45+
* Encoders must respect this contract and only use -1/-2 for group markers.
46+
*/
3647
#define FLB_LOG_EVENT_NORMAL (int32_t) 0
3748
#define FLB_LOG_EVENT_GROUP_START (int32_t) -1
3849
#define FLB_LOG_EVENT_GROUP_END (int32_t) -2

include/fluent-bit/flb_log_event_decoder.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ struct flb_log_event_decoder {
6262
size_t length;
6363
int last_result;
6464
int read_groups;
65+
unsigned int recursion_depth; /* Safety guard for recursion limit */
6566
};
6667

6768
void flb_log_event_decoder_reset(struct flb_log_event_decoder *context,

src/flb_log_event_decoder.c

Lines changed: 41 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@
2020
#include <fluent-bit/flb_log_event_decoder.h>
2121
#include <fluent-bit/flb_byteswap.h>
2222
#include <fluent-bit/flb_compat.h>
23+
#include <fluent-bit/flb_log.h>
24+
25+
#define FLB_LOG_EVENT_DECODER_MAX_RECURSION_DEPTH 1000 /* Safety limit for recursion */
2326

2427
static int create_empty_map(struct flb_log_event_decoder *context) {
2528
msgpack_packer packer;
@@ -74,6 +77,7 @@ void flb_log_event_decoder_reset(struct flb_log_event_decoder *context,
7477
context->last_result = FLB_EVENT_DECODER_ERROR_INSUFFICIENT_DATA;
7578
context->current_group_metadata = NULL;
7679
context->current_group_attributes = NULL;
80+
context->recursion_depth = 0; /* Reset recursion counter */
7781

7882
msgpack_unpacked_destroy(&context->unpacked_group_record);
7983
msgpack_unpacked_init(&context->unpacked_group_record);
@@ -309,6 +313,7 @@ int flb_log_event_decoder_next(struct flb_log_event_decoder *context,
309313
int result;
310314
int record_type;
311315
size_t previous_offset;
316+
int32_t invalid_timestamp;
312317

313318
if (context == NULL) {
314319
return FLB_EVENT_DECODER_ERROR_INVALID_CONTEXT;
@@ -347,22 +352,43 @@ int flb_log_event_decoder_next(struct flb_log_event_decoder *context,
347352
&context->unpacked_event.data);
348353

349354
if (context->last_result == FLB_EVENT_DECODER_SUCCESS) {
355+
/* Check recursion depth limit to prevent stack overflow */
356+
if (context->recursion_depth >= FLB_LOG_EVENT_DECODER_MAX_RECURSION_DEPTH) {
357+
flb_warn("[decoder] Maximum recursion depth (%d) reached, possible corruption or excessive group markers",
358+
FLB_LOG_EVENT_DECODER_MAX_RECURSION_DEPTH);
359+
context->last_result = FLB_EVENT_DECODER_ERROR_DESERIALIZATION_FAILURE;
360+
return context->last_result;
361+
}
362+
350363
/* get log event type */
351364
ret = flb_log_event_decoder_get_record_type(event, &record_type);
352365
if (ret != 0) {
353-
context->current_group_metadata = NULL;
354-
context->current_group_attributes = NULL;
355-
356-
context->last_result = FLB_EVENT_DECODER_ERROR_DESERIALIZATION_FAILURE;
357-
return context->last_result;
366+
/* Invalid group marker (negative timestamp but not -1 or -2).
367+
* Log the invalid marker for debugging, but preserve group state
368+
* to avoid losing valid group metadata if corruption occurs mid-group.
369+
* Skip the record and continue processing.
370+
*/
371+
invalid_timestamp = (int32_t) event->timestamp.tm.tv_sec;
372+
flb_debug("[decoder] Invalid group marker timestamp (%d), skipping record. "
373+
"Group state preserved.", invalid_timestamp);
374+
375+
/* Increment recursion depth before recursive call */
376+
context->recursion_depth++;
377+
memset(event, 0, sizeof(struct flb_log_event));
378+
ret = flb_log_event_decoder_next(context, event);
379+
context->recursion_depth--; /* Restore after return */
380+
return ret;
358381
}
359382

360383
/* Meta records such as the group opener and closer are identified by negative
361-
* timestamp values. In these cases we track the current group metadata and
362-
* attributes in order to transparently provide them through the log_event
363-
* structure but we also want to allow the client code raw access to such
364-
* records which is why the read_groups decoder context property is used
365-
* to determine the behavior.
384+
* timestamp values (-1 for GROUP_START, -2 for GROUP_END). Only these two
385+
* negative values are valid; any other negative timestamp is considered
386+
* invalid and is skipped (see handling above).
387+
*
388+
* We track the current group metadata and attributes in order to transparently
389+
* provide them through the log_event structure, but we also want to allow the
390+
* client code raw access to such records, which is why the read_groups decoder
391+
* context property is used to determine the behavior.
366392
*/
367393
if (record_type != FLB_LOG_EVENT_NORMAL) {
368394
msgpack_unpacked_destroy(&context->unpacked_group_record);
@@ -411,9 +437,13 @@ int flb_log_event_decoder_next(struct flb_log_event_decoder *context,
411437
* Skip group markers by recursively calling to get next record.
412438
* msgpack_unpack_next will properly destroy and reinitialize
413439
* unpacked_event, so no explicit cleanup needed here.
440+
* Increment recursion depth before recursive call.
414441
*/
442+
context->recursion_depth++;
415443
memset(event, 0, sizeof(struct flb_log_event));
416-
return flb_log_event_decoder_next(context, event);
444+
ret = flb_log_event_decoder_next(context, event);
445+
context->recursion_depth--; /* Restore after return */
446+
return ret;
417447
}
418448
}
419449
else {

0 commit comments

Comments
 (0)