Bug Report
Describe the bug
The Fluent Bit pod deadlocks: CPU usage rises and no further log processing happens.
I created a core dump of the stuck process and examined it; the full thread backtraces are below.
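For reference, a dump like this can be captured and inspected roughly as follows (binary/core paths and the PID are placeholders, not the exact commands used here):
# attach to the hung process and write a core file (creates /tmp/fluent-bit.core.<PID>)
gcore -o /tmp/fluent-bit.core <PID>
# load the core with the matching binary and print backtraces for all threads
gdb /fluent-bit/bin/fluent-bit /tmp/fluent-bit.core.<PID> -batch -ex "thread apply all bt"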
Thread 6 (Thread 0xffffa83ea9c0 (LWP 23)):
#0 0x0000ffffad247c74 in __GI_epoll_pwait (epfd=69, events=0xffffa740c000, maxevents=256, timeout=-1, set=0x0) at ../sysdeps/unix/sysv/linux/epoll_pwait.c:40
#1 0x0000aaaae1331e90 in _mk_event_wait_2 (loop=0xffffa7416000, timeout=-1) at /src/fluent-bit/lib/monkey/mk_core/mk_event_epoll.c:444
#2 0x0000aaaae13322f4 in mk_event_wait (loop=0xffffa7416000) at /src/fluent-bit/lib/monkey/mk_core/mk_event.c:207
#3 0x0000aaaae132a9b0 in mk_server_worker_loop (server=0xffffaaa65700) at /src/fluent-bit/lib/monkey/mk_server/mk_server.c:506
#4 0x0000aaaae1321aa0 in mk_sched_launch_worker_loop (data=0xffffa8e00008) at /src/fluent-bit/lib/monkey/mk_server/mk_scheduler.c:417
#5 0x0000ffffad1dee30 in start_thread (arg=0xffffa9d69f77) at ./nptl/pthread_create.c:442
#6 0x0000ffffad247adc in thread_start () at ../sysdeps/unix/sysv/linux/aarch64/clone.S:79
Thread 5 (Thread 0xffffa8dfa9c0 (LWP 22)):
#0 0x0000ffffad2136b0 in __GI___clock_nanosleep (clock_id=<optimized out>, clock_id@entry=0, flags=flags@entry=0, req=req@entry=0xffffa8dfa108, rem=rem@entry=0xffffa8dfa108) at ../sysdeps/unix/sysv/linux/clock_nanosleep.c:48
#1 0x0000ffffad21861c in __GI___nanosleep (req=req@entry=0xffffa8dfa108, rem=rem@entry=0xffffa8dfa108) at ../sysdeps/unix/sysv/linux/nanosleep.c:25
#2 0x0000ffffad2184e8 in __sleep (seconds=0) at ../sysdeps/posix/sleep.c:55
#3 0x0000aaaae132e240 in mk_clock_worker_init (data=0xffffaaa65700) at /src/fluent-bit/lib/monkey/mk_server/mk_clock.c:124
#4 0x0000ffffad1dee30 in start_thread (arg=0xffffa9d69fa7) at ./nptl/pthread_create.c:442
#5 0x0000ffffad247adc in thread_start () at ../sysdeps/unix/sysv/linux/aarch64/clone.S:79
Thread 4 (Thread 0xffffa9d6a9c0 (LWP 21)):
#0 0x0000ffffad247c74 in __GI_epoll_pwait (epfd=63, events=0xffffaaaaf000, maxevents=8, timeout=-1, set=0x0) at ../sysdeps/unix/sysv/linux/epoll_pwait.c:40
#1 0x0000aaaae1331e90 in _mk_event_wait_2 (loop=0xffffa9e547a0, timeout=-1) at /src/fluent-bit/lib/monkey/mk_core/mk_event_epoll.c:444
#2 0x0000aaaae13322f4 in mk_event_wait (loop=0xffffa9e547a0) at /src/fluent-bit/lib/monkey/mk_core/mk_event.c:207
#3 0x0000aaaae1319124 in mk_lib_worker (data=0xffffa9e547b8) at /src/fluent-bit/lib/monkey/mk_server/mk_lib.c:154
#4 0x0000ffffad1dee30 in start_thread (arg=0xffffab9f9e57) at ./nptl/pthread_create.c:442
#5 0x0000ffffad247adc in thread_start () at ../sysdeps/unix/sysv/linux/aarch64/clone.S:79
Thread 3 (Thread 0xffffaa9fa9c0 (LWP 20)):
#0 0x0000ffffad247c74 in __GI_epoll_pwait (epfd=11, events=0xffffaaa1d000, maxevents=32, timeout=-1, set=0x0) at ../sysdeps/unix/sysv/linux/epoll_pwait.c:40
#1 0x0000aaaae1331e90 in _mk_event_wait_2 (loop=0xffffaaa0b018, timeout=-1) at /src/fluent-bit/lib/monkey/mk_core/mk_event_epoll.c:444
#2 0x0000aaaae13322f4 in mk_event_wait (loop=0xffffaaa0b018) at /src/fluent-bit/lib/monkey/mk_core/mk_event.c:207
#3 0x0000aaaae0e54c2c in log_worker_collector (data=0xffffaaa16000) at /src/fluent-bit/src/flb_log.c:131
#4 0x0000aaaae0e7d70c in step_callback (data=<optimized out>) at /src/fluent-bit/src/flb_worker.c:43
#5 0x0000ffffad1dee30 in start_thread (arg=0xffffab9f9e67) at ./nptl/pthread_create.c:442
#6 0x0000ffffad247adc in thread_start () at ../sysdeps/unix/sysv/linux/aarch64/clone.S:79
Thread 2 (Thread 0xffffab9fa9c0 (LWP 19)):
#0 phn_next_set (offset=40, next=0x0, phn=0xffffaac1ed80) at /src/fluent-bit/lib/jemalloc-5.3.0/include/jemalloc/internal/ph.h:76
#1 phn_merge_siblings (cmp=<optimized out>, offset=40, phn=<optimized out>) at /src/fluent-bit/lib/jemalloc-5.3.0/include/jemalloc/internal/ph.h:206
#2 ph_merge_children (cmp=<optimized out>, offset=40, phn=0xffffaac20f00) at /src/fluent-bit/lib/jemalloc-5.3.0/include/jemalloc/internal/ph.h:244
#3 ph_remove (cmp=<optimized out>, offset=40, phn=phn@entry=0xffffaac20f00, ph=ph@entry=0xffffaac03d50) at /src/fluent-bit/lib/jemalloc-5.3.0/include/jemalloc/internal/ph.h:401
#4 je_edata_heap_remove (ph=ph@entry=0xffffaac03d50, phn=phn@entry=0xffffaac20f00) at /src/fluent-bit/lib/jemalloc-5.3.0/src/edata.c:6
#5 0x0000aaaae0e2595c in je_eset_remove (eset=eset@entry=0xffffaac03ab0, edata=edata@entry=0xffffaac20f00) at /src/fluent-bit/lib/jemalloc-5.3.0/src/eset.c:126
#6 0x0000aaaae0e266a8 in extent_coalesce (forward=false, outer=0xffffaac20f00, inner=0xffffaac25680, ecache=<optimized out>, ehooks=0xffffaac000c0, pac=0xffffaac03a00, tsdn=0xffffab9fb110) at /src/fluent-bit/lib/jemalloc-5.3.0/src/extent.c:806
#7 extent_try_coalesce_impl (tsdn=tsdn@entry=0xffffab9fb110, pac=pac@entry=0xffffaac03a00, ehooks=ehooks@entry=0xffffaac000c0, ecache=ecache@entry=0xffffaac03a38, edata=edata@entry=0xffffaac25680, coalesced=coalesced@entry=0xffffab9f9a4f) at /src/fluent-bit/lib/jemalloc-5.3.0/src/extent.c:855
#8 0x0000aaaae0e2803c in extent_try_coalesce_large (coalesced=0xffffab9f9a4f, edata=0xffffaac25680, ecache=0xffffaac03a38, ehooks=0xffffaac000c0, pac=0xffffaac03a00, tsdn=0xffffab9fb110) at /src/fluent-bit/lib/jemalloc-5.3.0/src/extent.c:884
#9 je_extent_record (tsdn=tsdn@entry=0xffffab9fb110, pac=pac@entry=0xffffaac03a00, ehooks=ehooks@entry=0xffffaac000c0, ecache=ecache@entry=0xffffaac03a38, edata=edata@entry=0xffffaac25680) at /src/fluent-bit/lib/jemalloc-5.3.0/src/extent.c:937
#10 0x0000aaaae0e281b4 in je_ecache_dalloc (tsdn=tsdn@entry=0xffffab9fb110, pac=pac@entry=0xffffaac03a00, ehooks=ehooks@entry=0xffffaac000c0, ecache=ecache@entry=0xffffaac03a38, edata=edata@entry=0xffffaac25680) at /src/fluent-bit/lib/jemalloc-5.3.0/src/extent.c:147
#11 0x0000aaaae0e33db8 in pac_dalloc_impl (tsdn=0xffffab9fb110, self=0xffffaac03a00, edata=0xffffaac25680, deferred_work_generated=0xffffab9f9adf) at /src/fluent-bit/lib/jemalloc-5.3.0/src/pac.c:277
#12 0x0000aaaae0e319a8 in large_dalloc_finish_impl (edata=<optimized out>, arena=0xffffaac01040, tsdn=0xffffab9fb110) at /src/fluent-bit/lib/jemalloc-5.3.0/src/large.c:253
#13 je_large_dalloc (tsdn=0xffffab9fb110, edata=<optimized out>) at /src/fluent-bit/lib/jemalloc-5.3.0/src/large.c:273
#14 0x0000aaaae0df2a34 in arena_dalloc_large (slow_path=false, szind=<optimized out>, tcache=0xffffab9fb478, ptr=<optimized out>, tsdn=0xffffab9fb110) at /src/fluent-bit/lib/jemalloc-5.3.0/include/jemalloc/internal/arena_inlines_b.h:297
#15 arena_dalloc (slow_path=false, caller_alloc_ctx=<synthetic pointer>, tcache=0xffffab9fb478, ptr=<optimized out>, tsdn=0xffffab9fb110) at /src/fluent-bit/lib/jemalloc-5.3.0/include/jemalloc/internal/arena_inlines_b.h:334
#16 idalloctm (slow_path=false, is_internal=false, alloc_ctx=<synthetic pointer>, tcache=0xffffab9fb478, ptr=<optimized out>, tsdn=0xffffab9fb110) at /src/fluent-bit/lib/jemalloc-5.3.0/include/jemalloc/internal/jemalloc_internal_inlines_c.h:120
#17 ifree (slow_path=false, tcache=0xffffab9fb478, ptr=<optimized out>, tsd=0xffffab9fb110) at /src/fluent-bit/lib/jemalloc-5.3.0/src/jemalloc.c:2887
#18 je_free_default (ptr=<optimized out>) at /src/fluent-bit/lib/jemalloc-5.3.0/src/jemalloc.c:3014
#19 0x0000aaaae0df3138 in free (ptr=ptr@entry=0xffffa687c900) at /src/fluent-bit/lib/jemalloc-5.3.0/src/jemalloc.c:3162
#20 0x0000aaaae0e630a8 in flb_free (ptr=0xffffa687c900) at /src/fluent-bit/include/fluent-bit/flb_mem.h:127
#21 flb_filter_do (ic=ic@entry=0xffffaaa3c660, data=0xffffa6a353c0, bytes=<optimized out>, out_data=out_data@entry=0xffffab9f9d50, out_bytes=out_bytes@entry=0xffffab9f9d58, tag=tag@entry=0xffffaabfc680 "kube.var.log.containers.<redacted>.log", tag_len=tag_len@entry=148, config=0xffffabe3edc0) at /src/fluent-bit/src/flb_filter.c:190
#22 0x0000aaaae0e60714 in input_chunk_append_raw (in=0xffffabea1400, event_type=0, n_records=<optimized out>, tag=0xffffaabfc680 "kube.var.log.containers.<redacted>.log", tag_len=<optimized out>, buf=0xffffa6a353c0, buf_size=47308) at /src/fluent-bit/src/flb_input_chunk.c:1588
#23 0x0000aaaae0e60f34 in flb_input_chunk_append_raw (in=in@entry=0xffffabea1400, event_type=event_type@entry=0, records=records@entry=4, tag=tag@entry=0xffffaabfc680 "kube.var.log.containers.<redacted>.log", tag_len=tag_len@entry=148, buf=<optimized out>, buf_size=<optimized out>) at /src/fluent-bit/src/flb_input_chunk.c:1929
#24 0x0000aaaae0ea13c8 in input_log_append (ins=0xffffabea1400, processor_starting_stage=processor_starting_stage@entry=0, records=4, tag=0xffffaabfc680 "kube.var.log.containers.<redacted>.log", tag_len=148, buf=0xffffa6a353c0, buf_size=47308) at /src/fluent-bit/src/flb_input_log.c:71
#25 0x0000aaaae0ea1464 in flb_input_log_append (ins=<optimized out>, tag=<optimized out>, tag_len=<optimized out>, buf=<optimized out>, buf_size=<optimized out>) at /src/fluent-bit/src/flb_input_log.c:90
#26 0x0000aaaae0ed363c in ml_stream_buffer_flush (file=0xffffaaab9980, ctx=0xffffaaa32400) at /src/fluent-bit/plugins/in_tail/tail_file.c:412
#27 process_content (bytes=<synthetic pointer>, file=0xffffaaab9980) at /src/fluent-bit/plugins/in_tail/tail_file.c:630
#28 flb_tail_file_chunk (file=file@entry=0xffffaaab9980) at /src/fluent-bit/plugins/in_tail/tail_file.c:1555
#29 0x0000aaaae0ece538 in in_tail_collect_pending (ins=<optimized out>, config=<optimized out>, in_context=0xffffaaa32400) at /src/fluent-bit/plugins/in_tail/tail.c:109
#30 0x0000aaaae0e5e20c in flb_input_collector_fd (fd=fd@entry=25, config=config@entry=0xffffabe3edc0) at /src/fluent-bit/src/flb_input.c:1970
#31 0x0000aaaae0e7472c in flb_engine_handle_event (config=0xffffabe3edc0, mask=<optimized out>, fd=25) at /src/fluent-bit/src/flb_engine.c:575
#32 flb_engine_start (config=config@entry=0xffffabe3edc0) at /src/fluent-bit/src/flb_engine.c:941
#33 0x0000aaaae0e53c94 in flb_lib_worker (data=0xffffabe183a0) at /src/fluent-bit/src/flb_lib.c:674
#34 0x0000ffffad1dee30 in start_thread (arg=0xffffc0932877) at ./nptl/pthread_create.c:442
#35 0x0000ffffad247adc in thread_start () at ../sysdeps/unix/sysv/linux/aarch64/clone.S:79
Thread 1 (Thread 0xffffada87020 (LWP 18)):
#0 0x0000ffffad2136b0 in __GI___clock_nanosleep (clock_id=<optimized out>, clock_id@entry=0, flags=flags@entry=0, req=req@entry=0xffffc0932a08, rem=rem@entry=0xffffc0932a08) at ../sysdeps/unix/sysv/linux/clock_nanosleep.c:48
#1 0x0000ffffad21861c in __GI___nanosleep (req=req@entry=0xffffc0932a08, rem=rem@entry=0xffffc0932a08) at ../sysdeps/unix/sysv/linux/nanosleep.c:25
#2 0x0000ffffad2184e8 in __sleep (seconds=0, seconds@entry=1) at ../sysdeps/posix/sleep.c:55
#3 0x0000aaaae0de6ee0 in flb_main (argc=<optimized out>, argv=<optimized out>) at /src/fluent-bit/src/fluent-bit.c:1388
#4 0x0000ffffad187740 in __libc_start_call_main (main=main@entry=0xaaaae0de5640 <main>, argc=argc@entry=4, argv=argv@entry=0xffffc0932cc8) at ../sysdeps/nptl/libc_start_call_main.h:58
#5 0x0000ffffad187818 in __libc_start_main_impl (main=0xaaaae0de5640 <main>, argc=4, argv=0xffffc0932cc8, init=<optimized out>, fini=<optimized out>, rtld_fini=<optimized out>, stack_end=<optimized out>) at ../csu/libc-start.c:360
#6 0x0000aaaae0de56f0 in _start ()
To Reproduce
I am not able to reproduce it at the moment; it does not happen frequently.
Expected behavior
No deadlocks. The only workaround at the moment is to monitor pods that hit their CPU limit and kill them (see the sketch below).
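For completeness, the manual workaround amounts to something like this (namespace, label selector, and pod name are illustrative, not our exact values):
# find fluent-bit pods whose CPU usage is pegged at the limit
kubectl top pods -n logging -l app.kubernetes.io/name=fluent-bit
# delete the stuck pod; the DaemonSet recreates it
kubectl delete pod -n logging <stuck-pod-name>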
Your Environment
- Version used: Built from master branch commit 41b90b7
- Configuration: Attaching tail config below
- Environment name and version (e.g. Kubernetes? What version?): Kubernetes EKS v1.29.8
- Server type and version: -
- Operating System and version: Amazon Linux
- Filters and plugins: Attaching full config below
Additional context
Full config:
[SERVICE]
    Daemon Off
    Flush 10
    Log_Level warn
    Parsers_File /fluent-bit/etc/parsers.conf
    Parsers_File /fluent-bit/etc/conf/custom_parsers.conf
    HTTP_Server On
    HTTP_Listen 0.0.0.0
    HTTP_Port 2020
    Health_Check On

[INPUT]
    Name tail
    multiline.parser docker, cri
    Path /var/log/containers/*.log
    Tag kube.*
    Skip_Long_Lines On
    Refresh_Interval 1
    DB /tmp/fluent/flb-kube.db
    DB.locking true

[FILTER]
    Name kubernetes
    Match kube.*
    Merge_Log On
    Keep_Log On
    K8S-Logging.Parser Off
    K8S-Logging.Exclude On
    Buffer_Size 0
    Merge_Log_Key parsed_message
    Kube_URL https://${KUBERNETES_SERVICE_HOST}:${KUBERNETES_SERVICE_PORT}
    tls.vhost kubernetes.default.svc

[FILTER]
    Name grep
    Match *
    Logical_Op or
    Exclude log should not be processed by cluster autoscaler \(no node group config\)
    Exclude log _logKeep":false

[FILTER]
    Name lua
    Match *
    Script /fluent-bit/scripts/filter_annotations.lua
    Call filter_annotations

[FILTER]
    Name record_modifier
    Match *
    Record cluster_name ${CLUSTER_NAME}
    Record environment ${env}
    Record aws_account_id ${aws_account_id}
    Record aws_account_name ${aws_account_name}
    Record aws_account_region ${aws_account_region}

# This parses logs that were not parsed by the multiline.parser parsers
[FILTER]
    Name parser
    Alias klog_parser
    Match kube.var.log.containers.*
    Key_Name log
    Parser klog_parser
    Reserve_Data On
    Preserve_Key On

# This parses logs that were not parsed by the multiline.parser parsers
[FILTER]
    Name parser
    Alias coredns_parser
    Match kube.var.log.containers.coredns*
    Key_Name log
    Parser coredns_parser
    Reserve_Data On
    Preserve_Key On

# We lift it because the modify plugin does not support nested fields. We want to make sure that we have only msg and not message
[FILTER]
    Name nest
    Match *
    Operation lift
    Wildcard ^(level|msg|message)$
    Nested_under parsed_message
    Add_prefix p_

[FILTER]
    Name modify
    Match *
    Condition Key_does_not_exist $p_msg
    Rename p_message p_msg

[FILTER]
    Name modify
    Match *
    Condition Key_does_not_exist $p_time
    Rename time p_time

# If we didn't match any of the above, we send the raw log as msg
[FILTER]
    Name modify
    Match *
    Condition Key_does_not_exist $p_msg
    Rename log p_msg

[FILTER]
    Name nest
    Match *
    Operation nest
    Wildcard p_*
    Nest_under parsed_message
    Remove_prefix p_

[FILTER]
    Name record_modifier
    Match *
    Allowlist_key parsed_message
    Allowlist_key kubernetes
    Allowlist_key cluster_name
    Allowlist_key environment
    Allowlist_key aws_account_id
    Allowlist_key aws_account_name
    Allowlist_key aws_account_region
    Uuid_key log_id

[OUTPUT]