Skip to content

Commit 5178cca

Browse files
committed
http_server/health: Implement throughput health check using ring buffer
Use a ring buffer for storing samples as per Leonardo's suggestion. Signed-off-by: Thiago Padilha <[email protected]>
1 parent 2240019 commit 5178cca

File tree

1 file changed

+75
-78
lines changed

1 file changed

+75
-78
lines changed

src/http_server/api/v1/health.c

Lines changed: 75 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -37,17 +37,57 @@ struct flb_hs_throughput_sample {
3737
uint64_t in_records;
3838
uint64_t out_records;
3939
uint64_t timestamp_seconds;
40-
struct mk_list _head;
4140
};
4241

42+
/*
 * Fixed-capacity ring buffer of throughput samples, used together with the
 * samples_add / samples_translate_index / samples_get helpers. When the
 * buffer is full, adding a sample reuses the slot of the oldest one.
 */
43+
struct flb_hs_throughput_samples {
44+
struct flb_hs_throughput_sample *items; /* backing array holding `size` slots */
45+
int size; /* capacity: number of slots in `items` */
46+
int count; /* slots currently in use; never grows past `size` */
47+
int insert; /* slot the next sample is written to; wraps at `size` */
48+
};
49+
50+
static struct flb_hs_throughput_sample *samples_add(
51+
struct flb_hs_throughput_samples *samples)
52+
{
53+
struct flb_hs_throughput_sample *sample = samples->items + samples->insert;
54+
samples->insert = (samples->insert + 1) % samples->size;
55+
if (samples->count < samples->size) {
56+
samples->count++;
57+
}
58+
return sample;
59+
}
60+
61+
static int samples_translate_index(
62+
struct flb_hs_throughput_samples *samples, int index)
63+
{
64+
if (index >= samples->count || index < 0) {
65+
return -1;
66+
}
67+
int end_index = samples->insert;
68+
int start_index = end_index - samples->count;
69+
int modulo = (start_index + index) % samples->size;
70+
return modulo < 0 ? modulo + samples->size : modulo;
71+
}
72+
73+
static struct flb_hs_throughput_sample *samples_get(
74+
struct flb_hs_throughput_samples *samples, int index)
75+
{
76+
int real_index = samples_translate_index(samples, index);
77+
if (real_index < 0) {
78+
return NULL;
79+
}
80+
81+
return samples->items + samples_translate_index(samples, index);
82+
}
83+
4384
struct { /* file-level state of the throughput health check */
4485
int enabled; /* cleared at the start of configure_throughput_check, set once setup proceeds */
4586
struct mk_list *input_plugins; /* flb_utils_split() of config->hc_throughput_input_plugins on ',' */
4687
struct mk_list *output_plugins; /* flb_utils_split() of config->hc_throughput_output_plugins on ',' */
4788
double out_in_ratio_threshold; /* copied from config->hc_throughput_ratio_threshold */
48-
int min_failures;
4989

50-
struct mk_list *sample_list;
90+
struct flb_hs_throughput_samples samples; /* ring buffer sized from config->hc_throughput_min_failures */
5191
bool healthy; /* starts true; overwritten with each check_throughput_health() result */
5292
} throughput_check_state = {0};
5393

@@ -305,78 +345,45 @@ static int cleanup_metrics()
305345

306346
static int check_throughput_health(uint64_t in_records,
307347
uint64_t out_records,
308-
struct mk_list *sample_list,
309-
int sample_count,
348+
struct flb_hs_throughput_samples *samples,
310349
double out_in_ratio_threshold) {
350+
int i;
311351
struct flb_time tp;
312-
uint64_t timestamp_seconds;
313352
uint64_t in_rate;
314353
uint64_t out_rate;
315-
struct mk_list *tmp;
316-
struct mk_list *head;
317354
double out_in_ratio;
318355
struct flb_hs_throughput_sample *entry;
319356
struct flb_hs_throughput_sample *prev;
320357
struct flb_hs_throughput_sample *sample;
321-
struct flb_hs_throughput_sample *last_sample = NULL;
322-
int count;
323358
bool healthy;
324359
bool rv;
325360

326361
flb_time_get(&tp);
327-
timestamp_seconds = flb_time_to_seconds(&tp);
328-
329-
if (mk_list_is_empty(sample_list) != 0) {
330-
last_sample = mk_list_entry_last(sample_list,
331-
struct flb_hs_throughput_sample,
332-
_head);
333-
}
334362

335-
if (!last_sample ||
336-
in_records != last_sample->in_records ||
337-
out_records != last_sample->out_records) {
338-
339-
sample = flb_malloc(sizeof(struct flb_hs_throughput_sample));
340-
341-
if (sample) {
342-
sample->timestamp_seconds = timestamp_seconds;
343-
sample->in_records = in_records;
344-
sample->out_records = out_records;
345-
mk_list_add(&sample->_head, sample_list);
346-
} else {
347-
flb_error("[api/v1/health/throughput]: failed to allocate sample");
348-
}
349-
350-
} else {
351-
/* don't collect another sample unless either in_records or out_records have
352-
* changed since last check */
353-
flb_debug("[api/v1/health/throughput]: no changes since last check");
354-
}
363+
sample = samples_add(samples);
364+
sample->timestamp_seconds = flb_time_to_seconds(&tp);
365+
sample->in_records = in_records;
366+
sample->out_records = out_records;
355367

356368
flb_debug("[api/v1/health/throughput]: check samples start %d %f",
357-
sample_count,
369+
samples->size,
358370
out_in_ratio_threshold);
359371

360372
healthy = false;
361-
mk_list_foreach_safe_r(head, tmp, sample_list) {
362-
entry = mk_list_entry(head, struct flb_hs_throughput_sample, _head);
363-
if (entry == mk_list_entry_first(sample_list,
364-
struct flb_hs_throughput_sample,
365-
_head)) {
366-
break;
373+
for (i = samples->count - 1; i > 0; i--) {
374+
entry = samples_get(samples, i);
375+
prev = samples_get(samples, i - 1);
376+
uint64_t timestamp_delta = entry->timestamp_seconds - prev->timestamp_seconds;
377+
if (timestamp_delta == 0) {
378+
/* check against divide by zero */
379+
continue;
367380
}
368-
369-
prev = mk_list_entry(entry->_head.prev,
370-
struct flb_hs_throughput_sample,
371-
_head);
372-
in_rate = (entry->in_records - prev->in_records) /
373-
(entry->timestamp_seconds - prev->timestamp_seconds);
374-
out_rate = (entry->out_records - prev->out_records) /
375-
(entry->timestamp_seconds - prev->timestamp_seconds);
381+
in_rate = (entry->in_records - prev->in_records) / timestamp_delta;
382+
out_rate = (entry->out_records - prev->out_records) / timestamp_delta;
376383
out_in_ratio = (double)out_rate / (double)in_rate;
377384
healthy = healthy || out_in_ratio > out_in_ratio_threshold;
378385

379-
flb_debug("[api/v1/health/throughput]: out: %"PRIu64" in: %"PRIu64" ratio: %f\n",
386+
flb_debug("[api/v1/health/throughput]: out: %"PRIu64" in: %"PRIu64" ratio: %f",
380387
out_in_ratio,
381388
out_rate,
382389
in_rate);
@@ -386,19 +393,7 @@ static int check_throughput_health(uint64_t in_records,
386393
}
387394
}
388395

389-
count = 0;
390-
mk_list_foreach_safe_r(head, tmp, sample_list) {
391-
entry = mk_list_entry(head, struct flb_hs_throughput_sample, _head);
392-
if (count == sample_count) {
393-
mk_list_del(&entry->_head);
394-
flb_free(entry);
395-
}
396-
else {
397-
count++;
398-
}
399-
}
400-
401-
rv = count < sample_count || healthy;
396+
rv = samples->count < samples->size || healthy;
402397
flb_debug("checking throughput samples stop, result: %s",
403398
rv ? "healthy" :"unhealthy");
404399

@@ -463,8 +458,7 @@ static void cb_mq_health(mk_mq_t *queue, void *data, size_t size)
463458
throughput_check_state.healthy =
464459
check_throughput_health(input_records,
465460
output_records,
466-
throughput_check_state.sample_list,
467-
throughput_check_state.min_failures,
461+
&throughput_check_state.samples,
468462
throughput_check_state.out_in_ratio_threshold);
469463
}
470464

@@ -498,6 +492,7 @@ static void configure_throughput_check(struct flb_config *config)
498492
{
499493
bool enabled = config->hc_throughput;
500494

495+
memset(&throughput_check_state, 0, sizeof(throughput_check_state));
501496
throughput_check_state.enabled = false;
502497
throughput_check_state.healthy = true;
503498

@@ -522,18 +517,10 @@ static void configure_throughput_check(struct flb_config *config)
522517
return;
523518
}
524519

525-
throughput_check_state.sample_list = flb_malloc(sizeof(struct mk_list));
526-
if (!throughput_check_state.sample_list) {
527-
flb_errno();
528-
return;
529-
}
530-
mk_list_init(throughput_check_state.sample_list);
531-
532520
throughput_check_state.input_plugins =
533521
flb_utils_split(config->hc_throughput_input_plugins, ',', 0);
534522

535523
if (!throughput_check_state.input_plugins) {
536-
flb_free(throughput_check_state.sample_list);
537524
flb_errno();
538525
return;
539526
}
@@ -542,16 +529,26 @@ static void configure_throughput_check(struct flb_config *config)
542529
flb_utils_split(config->hc_throughput_output_plugins, ',', 0);
543530

544531
if (!throughput_check_state.output_plugins) {
545-
flb_free(throughput_check_state.sample_list);
546532
flb_free(throughput_check_state.input_plugins);
547533
flb_errno();
548534
return;
549535
}
550536

551537
throughput_check_state.out_in_ratio_threshold = config->hc_throughput_ratio_threshold;
552-
throughput_check_state.min_failures = config->hc_throughput_min_failures;
553538
throughput_check_state.enabled = true;
554539

540+
throughput_check_state.samples.items = flb_calloc(
541+
config->hc_throughput_min_failures,
542+
sizeof(struct flb_hs_throughput_sample));
543+
544+
if (!throughput_check_state.samples.items) {
545+
flb_free(throughput_check_state.input_plugins);
546+
flb_free(throughput_check_state.output_plugins);
547+
flb_errno();
548+
return;
549+
}
550+
throughput_check_state.samples.size = config->hc_throughput_min_failures;
551+
555552
flb_info("[api/v1/health/throughput]: configuration complete. "
556553
"input plugins: %s | "
557554
"output plugins: %s | "

0 commit comments

Comments
 (0)