Skip to content

Commit d108cf5

Browse files
Evan-Adampkippes
andauthored
feat(gorgone-autodisco): implement a retry for autodiscovery host cron job (#3121) (#3122)
* feat(gorgone-autodisco): Add retry mechanism for automatic host discovery. Manual discoveries are not changed. Automatic discoveries are set to status failed in this function, and the following code then runs the discovery just as normal * ci(gorgone-autodisco): Add new dep to parse date This is a pretty common lib, useful for ingesting dates from other sources Co-authored-by: Sylvain Cresto <scresto@centreon.com> Refs:MON-176449 Co-authored-by: pkippes <144150042+pkippes@users.noreply.github.com>
1 parent fc4a433 commit d108cf5

File tree

3 files changed

+198
-2
lines changed

3 files changed

+198
-2
lines changed

gorgone/gorgone/modules/centreon/autodiscovery/class.pm

Lines changed: 123 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ use gorgone::class::frame;
3535
use JSON::XS;
3636
use Time::HiRes;
3737
use POSIX qw(strftime);
38+
use DateTime;
39+
use DateTime::Format::Strptime;
3840
use Digest::MD5 qw(md5_hex);
3941
use Try::Tiny;
4042
use EV;
@@ -149,6 +151,125 @@ sub hdisco_is_running_job {
149151

150152
return 0;
151153
}
154+
=head3 $self->hdisco_can_start_job(job => $jobRef)
155+
156+
Check if we can start a host discovery job.
157+
If the job is in timeout, update the job status in db and run the job again
158+
159+
For now there is no real mutex on the execution except the db column status,
160+
so if the post execution command outlives the timeout, undefined behaviour may occur.
161+
162+
Parameters:
163+
164+
=over 4
165+
166+
=item * job: job information hash ref. Required data to work correctly are :
167+
168+
=over 4
169+
170+
=item * status: the job status (see JOB_FINISH and the other constants for possible states)
171+
172+
=item * job_id: the job unique identifier. Used to set the job as failure if timeout is reached.
173+
174+
=item * last_execution : hash containing 'timezone' and 'date' (ex 'Europe/Paris', '2026-01-30 20:31:00.000000')
175+
176+
=back
177+
178+
=back
179+
180+
Output : Bool
181+
182+
1 if the job should be started
183+
184+
0 if invalid data given, or job correctly running and should not be started.
185+
=cut
186+
sub hdisco_can_start_job {
    my ($self, %options) = @_;

    # Decide whether a host discovery job may be (re)started.
    #
    # Options:
    #   job     - job information hash ref (status, job_id, execution, last_execution)
    #   timeout - optional timeout in seconds; defaults to $self->{global_timeout}
    #
    # Returns 1 when the job should be started, 0 when the data is invalid
    # or the job must be left alone.
    my $job = $options{job};
    if (!$job || !defined($job->{status}) || !defined($job->{job_id})) {
        return 0;
    }

    if ($job->{status} != JOB_RUNNING && $job->{status} != SAVE_RUNNING) {
        # if job is not running, we can start it safely
        return 1;
    }

    my $last_execution = $job->{last_execution};
    if (!defined($last_execution) || !defined($last_execution->{date})) {
        # probably first run of the job
        return 1;
    }

    # Read the mode without autovivifying {execution} in the shared job hash
    # and without an "uninitialized value" warning when it is missing.
    my $mode = defined($job->{execution}) ? $job->{execution}->{mode} : undef;
    if (!defined($mode) || $mode != 1) {
        # never timeout manual or paused jobs, only retry automatic jobs
        return 0;
    }

    my $second_since_last_exec = $self->_get_duration_since_last_exec(
        date     => $last_execution->{date},
        timezone => $last_execution->{timezone} // 'UTC'
    );
    # undef means the date could not be used: don't try to start the job again.
    # A duration of 0 is valid and is handled by the threshold check below.
    return 0 if (!defined($second_since_last_exec));

    my $timeout = $options{timeout} // $self->{global_timeout};
    if ($second_since_last_exec <= $timeout * 2 + 10) {
        # job did not reach timeout, let it run, don't start it again.
        return 0;
    }

    # job is in timeout, restarting it.
    $self->{logger}->writeLogError(
        "[autodiscovery] job is timing out (last execution: '" . $last_execution->{date} . "'), we set it as failed and restart it"
    );
    my $update_status = $self->update_job_information(
        values => {
            status  => JOB_FAILED,
            message => 'Job timed out and will be restarted by Gorgone',
        },
        where_clause => [
            { id => $job->{job_id} }
        ]
    );
    # the database still says "running": do not start a second execution.
    return 0 if ($update_status == -1);

    # keep the in-memory job state in line with what was written to the database.
    $self->{hdisco_jobs_ids}->{ $job->{job_id} }->{status} = JOB_FAILED;
    return 1;
}
231+
=head3 $self->_get_duration_since_last_exec(date => $dateStr, timezone => $tzStr)
232+
233+
Calculate the duration in seconds since the last job execution.
234+
235+
Parameters:
236+
237+
=over 4
238+
239+
=item * date: execution date string in format 'YYYY-MM-DD HH:MM:SS.NNNNNN' (e.g., '2026-01-30 20:31:00.000000')
240+
241+
=item * timezone: timezone string for the date (e.g., 'Europe/Paris', 'UTC')
242+
243+
=back
244+
245+
Output : Int or undef
246+
247+
Number of seconds since the last execution.
248+
249+
Returns undef if the date cannot be parsed or if the date is in the future.
250+
251+
=cut
252+
sub _get_duration_since_last_exec {
    my ($self, %options) = @_;

    # Compute the number of seconds elapsed since the given execution date.
    #
    # Options:
    #   date     - string such as "2026-01-29 15:35:12.000000" (format given by php api)
    #   timezone - time zone name the date is expressed in
    #
    # Returns the duration in seconds, or undef when the date cannot be
    # parsed or lies in the future.

    # parse_datetime() yields undef for a malformed date. Building the parser
    # can also throw (NOTE(review): expected for an unknown time zone name -
    # confirm against the installed DateTime::Format::Strptime); eval maps
    # both cases to undef so the caller only has one failure mode to handle.
    my $last_exec = eval {
        DateTime::Format::Strptime->new(
            pattern   => '%Y-%m-%d %H:%M:%S.%6N',
            time_zone => $options{timezone},
        )->parse_datetime($options{date});
    };

    # avoid an "uninitialized value" warning inside the log messages.
    my $date_for_log = $options{date} // '';

    if (!defined($last_exec)) {
        $self->{logger}->writeLogWarning("[autodiscovery] can not parse last execution date '" . $date_for_log . "' from job, job won't start.");
        return undef;
    }

    # epoch values are time zone independent, so plain time() is enough here.
    my $duration = time() - $last_exec->epoch();
    if ($duration < 0) {
        $self->{logger}->writeLogWarning("[autodiscovery] last execution date '" . $date_for_log . "' is in the future, job won't start.");
        return undef;
    }
    return $duration;
}
152273

153274
sub hdisco_add_cron {
154275
my ($self, %options) = @_;
@@ -443,7 +564,7 @@ sub launchhostdiscovery {
443564
if (!defined($job_id) || !defined($self->{hdisco_jobs_ids}->{$job_id})) {
444565
return (1, 'trying to launch discovery for inexistant job');
445566
}
446-
if ($self->hdisco_is_running_job(status => $self->{hdisco_jobs_ids}->{$job_id}->{status})) {
567+
if (! $self->hdisco_can_start_job(job => $self->{hdisco_jobs_ids}->{$job_id})) {
447568
return (1, 'job is already running');
448569
}
449570
if ($self->{hdisco_jobs_ids}->{$job_id}->{execution}->{mode} == EXECUTION_MODE_PAUSE && $options{source} eq 'cron') {
@@ -927,7 +1048,7 @@ sub update_job_information {
9271048

9281049
my ($status) = $self->{class_object_centreon}->custom_execute(request => $query, bind_values => \@bind_values);
9291050
if ($status == -1) {
930-
$self->{logger}->writeLogError('[autodiscovery] Failed to update job information');
1051+
$self->{logger}->writeLogError('[autodiscovery] Failed to update job information for ' . join(', ', @bind_values));
9311052
return -1;
9321053
}
9331054

gorgone/packaging/centreon-gorgone.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,7 @@ overrides:
198198
- perl(lib)
199199
- perl(Safe)
200200
- perl(Tie::File) # required by MBI module
201+
- perl(DateTime::Format::Strptime)
201202
recommends:
202203
- logrotate
203204

@@ -234,6 +235,7 @@ overrides:
234235
- libzmq-ffi-perl
235236
- libclone-choose-perl
236237
- libjson-perl # gorgone_key_thumbprint.pl needs the json module, even when json::xs is already installed
238+
- libdatetime-format-strptime-perl # autodiscovery uses it to retry job scheduling
237239
- librrds-perl
238240
- perl-base
239241
- perl-modules
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
#!/usr/bin/perl
2+
3+
use strict;
4+
use warnings;
5+
use Test2::V0;
6+
use Test2::Plugin::NoWarnings echo => 1;
7+
use Test2::Tools::Compare qw{is like match};
8+
use FindBin;
9+
use lib "$FindBin::Bin/../../../../../";
10+
use lib "$FindBin::Bin/../../../../../../perl-libs/lib/";
11+
use Data::Dumper;
12+
use gorgone::modules::centreon::autodiscovery::class;
13+
use gorgone::modules::core::action::class;
14+
use tests::unit::lib::mockLogger;
15+
16+
sub main {
    # Run every test group of this script, then close the test plan.
    test_hdisco_can_start_job();

    return done_testing();
}
20+
sub test_hdisco_can_start_job {
    # Table-driven test of hdisco_can_start_job(). The duration computation and
    # the database update are mocked, so only the decision logic is exercised.
    my $mock_logger = mock 'centreon::common::logger';

    my $gorgone = bless
        { logger         => $mock_logger->class,
          global_timeout => 60 },
        "gorgone::modules::centreon::autodiscovery::class";

    # Each case may set arg->{duration} to choose what the mocked
    # _get_duration_since_last_exec returns (300 seconds when omitted).
    my @tests = (
        {
            name => "first",
            arg  => {
                status   => 4,
                timezone => 'Europe/Paris',
                date     => '2026-01-29 15:35:12.000000',
                timeout  => 60 },
            result => 1,
        },
        {
            name => "undef mean take default timeout",
            arg  => {
                status   => 4,
                timezone => 'Europe/Paris',
                date     => '2026-01-29 15:35:12.000000',
                timeout  => undef },
            result => 1,
        },
        {
            name => "running job below the timeout limit is not restarted",
            arg  => {
                # taken from the module so the case does not depend on the constant's numeric value
                status   => gorgone::modules::centreon::autodiscovery::class::JOB_RUNNING(),
                timezone => 'Europe/Paris',
                date     => '2026-01-29 15:35:12.000000',
                timeout  => 60,
                duration => 100 },
            result => 0,
        },
    );

    for my $test (@tests) {
        # The mock is rebuilt on every iteration because the closure captures $test.
        my $mock_gorgone = mock('gorgone::modules::centreon::autodiscovery::class' => (override => [
            _get_duration_since_last_exec => sub {
                return $test->{arg}->{duration} // 300; # limit is at 130 secs
            },
            update_job_information => sub {
                return 1;
            }]));

        my $res = $gorgone->hdisco_can_start_job(
            "timeout" => $test->{arg}->{timeout},
            "job"     => {
                job_id         => 2,
                execution      => { mode => 1 },
                status         => $test->{arg}->{status},
                last_execution => {
                    'timezone_type' => 3,
                    'timezone'      => $test->{arg}->{timezone},
                    'date'          => $test->{arg}->{date},
                }
            });
        is($res, $test->{result}, $test->{name});
    }
}
71+
72+
main();
73+

0 commit comments

Comments
 (0)