@@ -43,6 +43,10 @@ def before_run
4343 @rpc_endpoint = nil
4444 @rpc_server = nil
4545 @counter = nil
46+ @socket_manager_server = nil
47+ @starting_new_supervisor_without_downtime = false
48+ @new_supervisor_pid = nil
49+ start_in_parallel = ENV . key? ( "FLUENT_RUNNING_IN_PARALLEL_WITH_OLD" )
4650
4751 @fluentd_lock_dir = Dir . mktmpdir ( "fluentd-lock-" )
4852 ENV [ 'FLUENTD_LOCK_DIR' ] = @fluentd_lock_dir
@@ -65,18 +69,31 @@ def before_run
6569
6670 if config [ :disable_shared_socket ]
6771 $log. info "shared socket for multiple workers is disabled"
72+ elsif start_in_parallel
73+ begin
74+ raise "[BUG] SERVERENGINE_SOCKETMANAGER_PATH env var must exist when starting in parallel" unless ENV . key? ( 'SERVERENGINE_SOCKETMANAGER_PATH' )
75+ @socket_manager_server = ServerEngine ::SocketManager ::Server . share_sockets_with_another_server ( ENV [ 'SERVERENGINE_SOCKETMANAGER_PATH' ] )
76+ $log. info "restart-without-downtime: took over the shared sockets" , path : ENV [ 'SERVERENGINE_SOCKETMANAGER_PATH' ]
77+ rescue => e
78+ $log. error "restart-without-downtime: cancel sequence because failed to take over the shared sockets" , error : e
79+ raise
80+ end
6881 else
69- server = ServerEngine ::SocketManager ::Server . open
70- ENV [ 'SERVERENGINE_SOCKETMANAGER_PATH' ] = server . path . to_s
82+ @socket_manager_server = ServerEngine ::SocketManager ::Server . open
83+ ENV [ 'SERVERENGINE_SOCKETMANAGER_PATH' ] = @socket_manager_server . path . to_s
7184 end
85+
86+ stop_parallel_old_supervisor_after_delay if start_in_parallel
7287 end
7388
# Tears down supervisor-side resources once the server has finished running.
#
# The Windows event thread, RPC server and counter server are stopped only
# when they are in use, then the lock directory is cleaned up.
#
# The last step depends on whether this supervisor has started a replacement
# supervisor (restart-without-downtime):
# * no replacement: remove the socket manager path;
# * replacement started: leave the socket manager path alone and notify the
#   new supervisor that this one has stopped.
def after_run
  stop_windows_event_thread if Fluent.windows?
  stop_rpc_server if @rpc_endpoint
  stop_counter_server if @counter
  cleanup_lock_dir

  handing_over = @starting_new_supervisor_without_downtime
  unless handing_over
    Fluent::Supervisor.cleanup_socketmanager_path
    return
  end

  notify_new_supervisor_that_old_one_has_stopped
end
8198
8299 def cleanup_lock_dir
@@ -138,7 +155,7 @@ def run_rpc_server
138155 @rpc_server . mount_proc ( '/api/config.gracefulReload' ) { |req , res |
139156 $log. debug "fluentd RPC got /api/config.gracefulReload request"
140157 if Fluent . windows?
141- supervisor_sigusr2_handler
158+ graceful_reload
142159 else
143160 Process . kill :USR2 , Process . pid
144161 end
@@ -172,6 +189,47 @@ def stop_counter_server
172189 @counter . stop
173190 end
174191
# Starts a background thread that, after a fixed delay, sends SIGTERM to the
# old supervisor whose pid is stored in the
# FLUENT_RUNNING_IN_PARALLEL_WITH_OLD environment variable.
#
# Nothing is sent when the variable is no longer set at the time the delay
# expires. Any failure is logged as a warning instead of being raised.
#
# @return [Thread] the thread that performs the delayed stop
def stop_parallel_old_supervisor_after_delay
  # TODO if the new supervisor fails to start and this is not called,
  # it would be necessary to update the pid in the PID file to the old one when daemonized.

  Thread.new do
    # Delay to wait the new workers to start up.
    # Even if it takes a long time to start the new workers and stop the old Fluentd first,
    # it is no problem because the socket buffer works, as long as the capacity is not exceeded.
    sleep 10
    raw_pid = ENV["FLUENT_RUNNING_IN_PARALLEL_WITH_OLD"]
    if raw_pid
      # String#to_i turns an empty or non-numeric value into 0, and signalling
      # pid 0 targets the whole process group of this process. Accept only a
      # strictly positive decimal pid.
      unless /\A[1-9]\d*\z/.match?(raw_pid)
        raise ArgumentError, "invalid pid of the old supervisor: #{raw_pid.inspect}"
      end

      $log.info "restart-without-downtime: stop the old supervisor"
      Process.kill :TERM, raw_pid.to_i
    end
  rescue => e
    $log.warn "restart-without-downtime: failed to stop the old supervisor." +
              " If the old one does not exist, please send SIGWINCH to this new process to start to work fully." +
              " If it exists, something went wrong. Please kill the old one manually.",
              error: e
  end
end
213+
# Sends SIGWINCH to the new supervisor to tell it that this (old) supervisor
# has stopped.
#
# The target pid is read from the PID file when config[:pid_path] is set;
# otherwise the pid saved in @new_supervisor_pid is used. Any failure is
# logged as an error instead of being raised.
def notify_new_supervisor_that_old_one_has_stopped
  if config[:pid_path]
    raw_pid = File.read(config[:pid_path]).strip
    # String#to_i turns an empty or garbled PID file into 0, and signalling
    # pid 0 targets the whole process group of this process. Accept only a
    # strictly positive decimal pid.
    unless /\A[1-9]\d*\z/.match?(raw_pid)
      raise "invalid pid in the PID file #{config[:pid_path]}: #{raw_pid.inspect}"
    end
    new_pid = raw_pid.to_i
  else
    raise "[BUG] new_supervisor_pid is not saved" unless @new_supervisor_pid
    new_pid = @new_supervisor_pid
  end

  $log.info "restart-without-downtime: notify the new supervisor (pid: #{new_pid}) that old one has stopped"
  Process.kill :WINCH, new_pid
rescue => e
  $log.error(
    "restart-without-downtime: failed to notify the new supervisor." +
    " Please send SIGWINCH to the new supervisor process manually" +
    " if it does not start to work fully.",
    error: e
  )
end
232+
175233 def install_supervisor_signal_handlers
176234 return if Fluent . windows?
177235
@@ -187,7 +245,11 @@ def install_supervisor_signal_handlers
187245
188246 trap :USR2 do
189247 $log. debug 'fluentd supervisor process got SIGUSR2'
190- supervisor_sigusr2_handler
248+ if Fluent . windows?
249+ graceful_reload
250+ else
251+ restart_without_downtime
252+ end
191253 end
192254
193255 trap :WINCH do
@@ -259,7 +321,7 @@ def install_windows_event_handler
259321 when :usr1
260322 supervisor_sigusr1_handler
261323 when :usr2
262- supervisor_sigusr2_handler
324+ graceful_reload
263325 when :cont
264326 supervisor_dump_handler_for_windows
265327 when :stop_event_thread
@@ -289,7 +351,7 @@ def supervisor_sigusr1_handler
289351 send_signal_to_workers ( :USR1 )
290352 end
291353
292- def supervisor_sigusr2_handler
354+ def graceful_reload
293355 conf = nil
294356 t = Thread . new do
295357 $log. info 'Reloading new config'
@@ -317,7 +379,38 @@ def supervisor_sigusr2_handler
317379 $log. error "Failed to reload config file: #{ e } "
318380 end
319381
# Starts a new supervisor process that runs in parallel with this one, as the
# first step of the restart-without-downtime sequence.
#
# The new process is launched with the same script and arguments
# ($0 and ARGV), plus two environment variables: the socket manager internal
# token and FLUENT_RUNNING_IN_PARALLEL_WITH_OLD set to this process's pid.
#
# The request is ignored (with a warning) when a new supervisor is already
# being started by this process, or when this process itself was started in
# parallel with an old one and that sequence has not finished yet.
# On failure the error is logged and the "starting" flag is reset.
def restart_without_downtime
  # TODO exclusive lock

  $log.info "start restart-without-downtime sequence"

  if @starting_new_supervisor_without_downtime
    $log.warn "restart-without-downtime: canceled because it is already starting"
    return
  end
  if ENV.key?("FLUENT_RUNNING_IN_PARALLEL_WITH_OLD")
    $log.warn "restart-without-downtime: canceled because the previous sequence is still running"
    return
  end

  @starting_new_supervisor_without_downtime = true
  commands = [ServerEngine.ruby_bin_path, $0] + ARGV
  env_to_add = {
    "SERVERENGINE_SOCKETMANAGER_INTERNAL_TOKEN" => ServerEngine::SocketManager::INTERNAL_TOKEN,
    "FLUENT_RUNNING_IN_PARALLEL_WITH_OLD" => Process.pid.to_s,
  }
  # Pass the command as an argument list, not as one joined string, so that
  # no shell is involved and arguments containing spaces or shell
  # metacharacters reach the new process unchanged.
  pid = Process.spawn(env_to_add, *commands)
  @new_supervisor_pid = pid unless config[:daemonize]
rescue => e
  $log.error "restart-without-downtime: failed", error: e
  @starting_new_supervisor_without_downtime = false
end
408+
# Forwards SIGWINCH to the workers.
#
# When this process was started in parallel with an old supervisor
# (FLUENT_RUNNING_IN_PARALLEL_WITH_OLD is set), the variable is removed first
# and the end of the restart-without-downtime sequence is logged.
def cancel_source_only
  parallel_env_key = "FLUENT_RUNNING_IN_PARALLEL_WITH_OLD"
  started_in_parallel = ENV.include?(parallel_env_key)

  if started_in_parallel
    $log.info "restart-without-downtime: done all sequences, now the new workers starts to work fully"
    ENV.delete(parallel_env_key)
  end

  send_signal_to_workers(:WINCH)
end
323416
@@ -510,12 +603,11 @@ def self.default_options
510603 }
511604 end
512605
513- def self . cleanup_resources
514- unless Fluent . windows?
515- if ENV . has_key? ( 'SERVERENGINE_SOCKETMANAGER_PATH' )
516- FileUtils . rm_f ( ENV [ 'SERVERENGINE_SOCKETMANAGER_PATH' ] )
517- end
518- end
# Removes the file named by the SERVERENGINE_SOCKETMANAGER_PATH environment
# variable. Does nothing on Windows or when the variable is not set.
def self.cleanup_socketmanager_path
  unless Fluent.windows?
    socket_path = ENV['SERVERENGINE_SOCKETMANAGER_PATH']
    FileUtils.rm_f(socket_path) if socket_path
  end
end
520612
521613 def initialize ( cl_opt )
@@ -583,7 +675,7 @@ def run_supervisor(dry_run: false)
583675 begin
584676 ServerEngine ::Privilege . change ( @chuser , @chgroup )
585677 MessagePackFactory . init ( enable_time_support : @system_config . enable_msgpack_time_support )
586- Fluent ::Engine . init ( @system_config , supervisor_mode : true )
678+ Fluent ::Engine . init ( @system_config , supervisor_mode : true , start_in_parallel : ENV . key? ( "FLUENT_RUNNING_IN_PARALLEL_WITH_OLD" ) )
587679 Fluent ::Engine . run_configure ( @conf , dry_run : dry_run )
588680 rescue Fluent ::ConfigError => e
589681 $log. error 'config error' , file : @config_path , error : e
@@ -632,10 +724,10 @@ def run_worker
632724 File . umask ( @chumask . to_i ( 8 ) )
633725 end
634726 MessagePackFactory . init ( enable_time_support : @system_config . enable_msgpack_time_support )
635- Fluent ::Engine . init ( @system_config )
727+ Fluent ::Engine . init ( @system_config , start_in_parallel : ENV . key? ( "FLUENT_RUNNING_IN_PARALLEL_WITH_OLD" ) )
636728 Fluent ::Engine . run_configure ( @conf )
637729 Fluent ::Engine . run
638- self . class . cleanup_resources if @standalone_worker
730+ self . class . cleanup_socketmanager_path if @standalone_worker
639731 exit 0
640732 end
641733 end
@@ -853,7 +945,8 @@ def install_main_process_signal_handlers
853945 end
854946
855947 trap :USR2 do
856- reload_config
948+ # Do nothing
949+ # TODO consider suitable code for this
857950 end
858951
859952 trap :CONT do
0 commit comments