Skip to content

Commit d590c72

Browse files
LevKatssergepetrenko
authored andcommitted
gc: use *.snap st_mtime to schedule after restart
Introduce the `timestamp` field in `gc_checkpoint` so now `gc.{c,h}` are aware of actual times of checkpoints, which is important since this subsystem is responsible for scheduling. Now, one can track the unix time of a new checkpoint with new `timestamp` argument of `gc_add_checkpoint`. This change allows us to track previous checkpoints made before the server restart and even `checkpoint_interval` value reconfiguring. This approach was chosen instead of just scanning the `snap_dir` in `gc.c` because it was engine-independent. One may also notice that even if the actual time after the last snapshot before the restart is greater than `2 * checkpoint_interval` we won't start checkpointing immediately because that may cause high disk load in case of multiple instances. So in this case we just schedule a checkpoint at a random moment in the first `checkpoint_interval` seconds after the restart. It seems like even with this scheduling strategy a snapshot will be eventually created even during constant restarting. Fixes tarantool#9820 NO_DOC=bugfix
1 parent ae519ed commit d590c72

File tree

8 files changed

+116
-13
lines changed

8 files changed

+116
-13
lines changed
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
## bugfix/box
2+
3+
* Fixed a bug when the timestamps of snapshots created before the server restart
4+
were not taken into account with `checkpoint_interval` enabled (gh-9820).

src/box/checkpoint_schedule.c

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,17 +29,26 @@
2929
* SUCH DAMAGE.
3030
*/
3131
#include "checkpoint_schedule.h"
32+
#include "trivia/util.h"
3233

3334
#include <assert.h>
3435
#include <math.h>
3536
#include <stdlib.h>
3637

3738
void
3839
checkpoint_schedule_cfg(struct checkpoint_schedule *sched,
39-
double now, double interval)
40+
double now, double interval,
41+
double time_since_checkpoint)
4042
{
4143
sched->interval = interval;
42-
sched->start_time = now + interval;
44+
/*
45+
* If the last checkpoint was long enough ago, we should not allow
46+
* the start_time to be before now in any case so possible parallel
47+
* snapshots on different instances will not result in too many
48+
* writes at once.
49+
*/
50+
sched->start_time = now + interval -
51+
MIN(time_since_checkpoint, interval);
4352

4453
/*
4554
* Add a random offset to the start time so as to avoid

src/box/checkpoint_schedule.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,10 +53,12 @@ struct checkpoint_schedule {
5353
*
5454
* @now is the current time.
5555
* @interval is the configured interval between checkpoints.
56+
* @time_since_checkpoint is the time since the last checkpoint
5657
*/
5758
void
5859
checkpoint_schedule_cfg(struct checkpoint_schedule *sched,
59-
double now, double interval);
60+
double now, double interval,
61+
double time_since_checkpoint);
6062

6163
/**
6264
* Reset a checkpoint schedule.

src/box/gc.c

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ gc_init(on_garbage_collection_f on_garbage_collection)
135135
rlist_create(&gc.consumers);
136136
gc_tree_new(&gc.active_consumers);
137137
fiber_cond_create(&gc.cleanup_cond);
138-
checkpoint_schedule_cfg(&gc.checkpoint_schedule, 0, 0);
138+
checkpoint_schedule_cfg(&gc.checkpoint_schedule, 0, 0, 0);
139139

140140
gc.cleanup_fiber = fiber_new_system("gc", gc_cleanup_fiber_f);
141141
if (gc.cleanup_fiber == NULL)
@@ -477,14 +477,19 @@ gc_set_checkpoint_interval(double interval)
477477
* if it's waiting for checkpointing to complete, because
478478
* checkpointing code doesn't tolerate spurious wakeups.
479479
*/
480+
struct gc_checkpoint *last = gc_last_checkpoint();
481+
double time_since_checkpoint = 0;
482+
if (last != NULL && !gc.checkpoint_is_in_progress)
483+
time_since_checkpoint = ev_time() - last->timestamp;
480484
checkpoint_schedule_cfg(&gc.checkpoint_schedule,
481-
ev_monotonic_now(loop()), interval);
485+
ev_monotonic_now(loop()), interval,
486+
time_since_checkpoint);
482487
if (!gc.checkpoint_is_in_progress)
483488
fiber_wakeup(gc.checkpoint_fiber);
484489
}
485490

486491
void
487-
gc_add_checkpoint(const struct vclock *vclock)
492+
gc_add_checkpoint(const struct vclock *vclock, double timestamp)
488493
{
489494
struct gc_checkpoint *last_checkpoint = gc_last_checkpoint();
490495
if (last_checkpoint != NULL &&
@@ -511,6 +516,7 @@ gc_add_checkpoint(const struct vclock *vclock)
511516

512517
rlist_create(&checkpoint->refs);
513518
vclock_copy(&checkpoint->vclock, vclock);
519+
checkpoint->timestamp = timestamp;
514520
rlist_add_tail_entry(&gc.checkpoints, checkpoint, in_checkpoints);
515521
gc.checkpoint_count++;
516522

@@ -564,7 +570,7 @@ gc_do_checkpoint(bool is_scheduled)
564570
* Finally, track the newly created checkpoint in the garbage
565571
* collector state.
566572
*/
567-
gc_add_checkpoint(&checkpoint.vclock);
573+
gc_add_checkpoint(&checkpoint.vclock, ev_time());
568574
out:
569575
if (rc != 0)
570576
engine_abort_checkpoint();
@@ -593,7 +599,8 @@ gc_checkpoint(void)
593599
*/
594600
checkpoint_schedule_cfg(&gc.checkpoint_schedule,
595601
ev_monotonic_now(loop()),
596-
gc.checkpoint_schedule.interval);
602+
gc.checkpoint_schedule.interval,
603+
0);
597604
fiber_wakeup(gc.checkpoint_fiber);
598605

599606
if (gc_do_checkpoint(false) != 0)

src/box/gc.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,8 @@ struct gc_checkpoint {
6161
struct rlist in_checkpoints;
6262
/** VClock of the checkpoint. */
6363
struct vclock vclock;
64+
/** Timestamp of the checkpoint. */
65+
double timestamp;
6466
/**
6567
* List of checkpoint references, linked by
6668
* gc_checkpoint_ref::in_refs.
@@ -329,7 +331,7 @@ gc_set_checkpoint_interval(double interval);
329331
* old checkpoints.
330332
*/
331333
void
332-
gc_add_checkpoint(const struct vclock *vclock);
334+
gc_add_checkpoint(const struct vclock *vclock, double timestamp);
333335

334336
/**
335337
* Make a *manual* checkpoint.

src/box/memtx_engine.cc

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1703,7 +1703,18 @@ memtx_engine_new(const char *snap_dirname, bool force_recovery,
17031703
for (struct vclock *vclock = vclockset_first(&memtx->snap_dir.index);
17041704
vclock != NULL;
17051705
vclock = vclockset_next(&memtx->snap_dir.index, vclock)) {
1706-
gc_add_checkpoint(vclock);
1706+
const char *name = xdir_format_filename(&memtx->snap_dir,
1707+
vclock_sum(vclock),
1708+
NONE);
1709+
struct stat attr;
1710+
double timestamp = ev_time();
1711+
if (stat(name, &attr) != 0)
1712+
say_warn("failed to get modification time of %s, "
1713+
"set it to the current time",
1714+
name);
1715+
else
1716+
timestamp = (double)attr.st_mtime;
1717+
gc_add_checkpoint(vclock, timestamp);
17071718
}
17081719

17091720
stailq_create(&memtx->gc_queue);
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
local t = require('luatest')
2+
local server = require('luatest.server')
3+
4+
local g = t.group()
5+
6+
g.before_all(function(g)
7+
g.server = server:new({box_cfg = {
8+
checkpoint_interval = 5,
9+
checkpoint_count = 1e9,
10+
}})
11+
g.server:start()
12+
g.server:exec(function()
13+
local s = box.schema.space.create('test')
14+
s:create_index('pk', {sequence = true})
15+
end)
16+
end)
17+
18+
g.before_each(function(g)
19+
g.server:exec(function()
20+
box.space.test:truncate()
21+
box.cfg{checkpoint_interval=5}
22+
t.helpers.retrying({}, function()
23+
t.assert_not(box.info.gc().checkpoint_is_in_progress)
24+
end)
25+
end)
26+
end)
27+
28+
g.test_box_checkpoint_after_restart = function(g)
29+
local fiber = require('fiber')
30+
local checkpoint_count = g.server:exec(function()
31+
local s = box.space.test
32+
-- Insert something to make sure that box.snapshot() will do something.
33+
s:insert({box.NULL})
34+
-- Reschedule future snapshots.
35+
box.snapshot()
36+
local checkpoint_count = #box.info.gc().checkpoints
37+
s:insert({box.NULL})
38+
return checkpoint_count
39+
end)
40+
g.server:stop()
41+
fiber.sleep(5)
42+
g.server:start()
43+
g.server:exec(function(checkpoint_count)
44+
local fiber = require('fiber')
45+
fiber.sleep(7)
46+
t.assert_gt(#box.info.gc().checkpoints, checkpoint_count)
47+
end, {checkpoint_count})
48+
end
49+
50+
g.test_box_checkpoint_after_reconfigure = function(g)
51+
g.server:exec(function()
52+
box.cfg{checkpoint_interval = 25}
53+
local fiber = require('fiber')
54+
local s = box.space.test
55+
s:insert({box.NULL})
56+
box.snapshot()
57+
local checkpoint_count = #box.info.gc().checkpoints
58+
s:insert({box.NULL})
59+
fiber.sleep(5)
60+
box.cfg{checkpoint_interval = 5}
61+
fiber.sleep(7)
62+
t.assert_gt(#box.info.gc().checkpoints, checkpoint_count)
63+
end)
64+
end
65+
66+
g.after_all(function(g)
67+
g.server:drop()
68+
end)

test/unit/checkpoint_schedule.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ main()
2525
double now = rand();
2626

2727
struct checkpoint_schedule sched;
28-
checkpoint_schedule_cfg(&sched, now, 0);
28+
checkpoint_schedule_cfg(&sched, now, 0, 0);
2929

3030
is(checkpoint_schedule_timeout(&sched, now), 0,
3131
"checkpointing disabled - timeout after configuration");
@@ -43,15 +43,15 @@ main()
4343
for (int i = 0; i < intervals_len; i++) {
4444
double interval = intervals[i];
4545

46-
checkpoint_schedule_cfg(&sched, now, interval);
46+
checkpoint_schedule_cfg(&sched, now, interval, 0);
4747
double t = checkpoint_schedule_timeout(&sched, now);
4848
ok(t >= interval && t <= interval * 2,
4949
"checkpoint interval %.0lf - timeout after configuration",
5050
interval);
5151

5252
double t0;
5353
for (int j = 0; j < 100; j++) {
54-
checkpoint_schedule_cfg(&sched, now, interval);
54+
checkpoint_schedule_cfg(&sched, now, interval, 0);
5555
t0 = checkpoint_schedule_timeout(&sched, now);
5656
if (fabs(t - t0) > interval / 4)
5757
break;

0 commit comments

Comments
 (0)