@@ -39,11 +39,12 @@ using log_v2 = com::centreon::common::log_v2::log_v2;
3939 */
4040failover::failover (std::shared_ptr<io::endpoint> endp,
4141 std::shared_ptr<multiplexing::muxer> mux,
42- const std::string& name )
43- : endpoint(false , name),
42+ const config::endpoint& cfg )
43+ : endpoint(false , cfg. name),
4444 _should_exit(false ),
4545 _state(not_started),
4646 _logger{log_v2::instance ().get (log_v2::PROCESSING)},
47+ _max_retry_delay (30 ),
4748 _buffering_timeout(0 ),
4849 _endpoint(endp),
4950 _failover_launched(false ),
@@ -52,6 +53,14 @@ failover::failover(std::shared_ptr<io::endpoint> endp,
5253 _muxer(mux),
5354 _update(false ) {
5455 SPDLOG_LOGGER_TRACE (_logger, " failover '{}' construction." , _name);
56+
57+ auto search = cfg.params .find (" max_retry_delay" );
58+ if (search != cfg.params .end ()) {
59+ if (!absl::SimpleAtoi (search->second , &_max_retry_delay)) {
60+ throw msg_fmt (" max_retry_delay needs a numerical value and not {}" ,
61+ search->second );
62+ }
63+ }
5564}
5665
5766/* *
@@ -158,6 +167,27 @@ void failover::_run() {
158167 return ;
159168 }
160169
170+ unsigned error_retry_delay = 0 ;
171+
172+ auto increase_retry_delay_and_wait = [&]() {
173+ // in order to avoid a write infinite loop, we increase delay between two
174+ // failures
175+ if (!error_retry_delay) {
176+ error_retry_delay = 1 ;
177+ } else {
178+ error_retry_delay *= 2 ;
179+ }
180+ if (error_retry_delay > _max_retry_delay) {
181+ error_retry_delay = _max_retry_delay;
182+ }
183+ SPDLOG_LOGGER_ERROR (
184+ _logger, " {} Failed to send event to stream, we wait {s} before retry" ,
185+ _name, error_retry_delay);
186+ for (ssize_t i = 0 ; !should_exit () && i < error_retry_delay * 10 ; i++) {
187+ std::this_thread::sleep_for (std::chrono::milliseconds (100 ));
188+ }
189+ };
190+
161191 auto on_exception_handler = [&]() {
162192 if (_stream) {
163193 int32_t ack_events;
@@ -173,6 +203,7 @@ void failover::_run() {
173203 }
174204 set_state (" connecting" );
175205 if (!should_exit ()) {
206+ increase_retry_delay_and_wait ();
176207 _launch_failover ();
177208 _initialized = true ;
178209 }
@@ -355,8 +386,16 @@ void failover::_run() {
355386 int we (0 );
356387
357388 try {
358- std::lock_guard<std::timed_mutex> stream_lock (_stream_m);
359- we = _stream->write (d);
389+ {
390+ std::lock_guard<std::timed_mutex> stream_lock (_stream_m);
391+ we = _stream->write (d);
392+ }
393+ if (we < 0 ) { // stream write failure
394+ increase_retry_delay_and_wait ();
395+ } else {
396+ // no exception, no error => reset error_retry_delay
397+ error_retry_delay = 0 ;
398+ }
360399 } catch (exceptions::shutdown const & e) {
361400 SPDLOG_LOGGER_DEBUG (
362401 _logger,
@@ -625,7 +664,8 @@ bool failover::should_exit() const {
625664}
626665
627666bool failover::wait_for_all_events_written (unsigned ms_timeout) {
628- _logger->info (" processing::failover::wait_for_all_events_written" );
667+ SPDLOG_LOGGER_INFO (
668+ _logger, " {} processing::failover::wait_for_all_events_written" , _name);
629669 std::lock_guard<std::timed_mutex> stream_lock (_stream_m);
630670 if (_stream) {
631671 return _stream->wait_for_all_events_written (ms_timeout);
0 commit comments