Skip to content

Commit 8960f2a

Browse files
committed
PGPRO-1918: Consider lock_backup()'s result
1 parent 3b91712 commit 8960f2a

File tree

9 files changed

+81
-54
lines changed

9 files changed

+81
-54
lines changed

src/backup.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -976,8 +976,9 @@ do_backup(time_t start_time)
976976

977977
/* Create backup directory and BACKUP_CONTROL_FILE */
978978
if (pgBackupCreateDir(&current))
979-
elog(ERROR, "cannot create backup directory");
980-
lock_backup(&current);
979+
elog(ERROR, "Cannot create backup directory");
980+
if (!lock_backup(&current))
981+
elog(ERROR, "Cannot lock backup directory");
981982
write_backup(&current);
982983

983984
elog(LOG, "Backup destination is initialized");

src/catalog.c

Lines changed: 31 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -69,12 +69,13 @@ read_backup(time_t timestamp)
6969
* status.
7070
*/
7171
void
72-
write_backup_status(pgBackup *backup)
72+
write_backup_status(pgBackup *backup, BackupStatus status)
7373
{
7474
pgBackup *tmp;
7575

7676
tmp = read_backup(backup->start_time);
7777

78+
backup->status = status;
7879
tmp->status = backup->status;
7980
write_backup(tmp);
8081

@@ -84,7 +85,7 @@ write_backup_status(pgBackup *backup)
8485
/*
8586
* Create exclusive lockfile in the backup's directory.
8687
*/
87-
void
88+
bool
8889
lock_backup(pgBackup *backup)
8990
{
9091
char lock_file[MAXPGPATH];
@@ -149,7 +150,7 @@ lock_backup(pgBackup *backup)
149150
* Couldn't create the pid file. Probably it already exists.
150151
*/
151152
if ((errno != EEXIST && errno != EACCES) || ntries > 100)
152-
elog(ERROR, "could not create lock file \"%s\": %s",
153+
elog(ERROR, "Could not create lock file \"%s\": %s",
153154
lock_file, strerror(errno));
154155

155156
/*
@@ -161,22 +162,22 @@ lock_backup(pgBackup *backup)
161162
{
162163
if (errno == ENOENT)
163164
continue; /* race condition; try again */
164-
elog(ERROR, "could not open lock file \"%s\": %s",
165+
elog(ERROR, "Could not open lock file \"%s\": %s",
165166
lock_file, strerror(errno));
166167
}
167168
if ((len = read(fd, buffer, sizeof(buffer) - 1)) < 0)
168-
elog(ERROR, "could not read lock file \"%s\": %s",
169+
elog(ERROR, "Could not read lock file \"%s\": %s",
169170
lock_file, strerror(errno));
170171
close(fd);
171172

172173
if (len == 0)
173-
elog(ERROR, "lock file \"%s\" is empty", lock_file);
174+
elog(ERROR, "Lock file \"%s\" is empty", lock_file);
174175

175176
buffer[len] = '\0';
176177
encoded_pid = atoi(buffer);
177178

178179
if (encoded_pid <= 0)
179-
elog(ERROR, "bogus data in lock file \"%s\": \"%s\"",
180+
elog(ERROR, "Bogus data in lock file \"%s\": \"%s\"",
180181
lock_file, buffer);
181182

182183
/*
@@ -190,9 +191,21 @@ lock_backup(pgBackup *backup)
190191
*/
191192
if (encoded_pid != my_pid && encoded_pid != my_p_pid)
192193
{
193-
if (kill(encoded_pid, 0) == 0 ||
194-
(errno != ESRCH && errno != EPERM))
195-
elog(ERROR, "lock file \"%s\" already exists", lock_file);
194+
if (kill(encoded_pid, 0) == 0)
195+
{
196+
elog(WARNING, "Process %d is using backup %s and still is running",
197+
encoded_pid, base36enc(backup->start_time));
198+
return false;
199+
}
200+
else
201+
{
202+
if (errno == ESRCH)
203+
elog(WARNING, "Process %d which used backup %s no longer exists",
204+
encoded_pid, base36enc(backup->start_time));
205+
else
206+
elog(ERROR, "Failed to send signal 0 to a process %d: %s",
207+
encoded_pid, strerror(errno));
208+
}
196209
}
197210

198211
/*
@@ -201,7 +214,7 @@ lock_backup(pgBackup *backup)
201214
* would-be creators.
202215
*/
203216
if (unlink(lock_file) < 0)
204-
elog(ERROR, "could not remove old lock file \"%s\": %s",
217+
elog(ERROR, "Could not remove old lock file \"%s\": %s",
205218
lock_file, strerror(errno));
206219
}
207220

@@ -219,7 +232,7 @@ lock_backup(pgBackup *backup)
219232
unlink(lock_file);
220233
/* if write didn't set errno, assume problem is no disk space */
221234
errno = save_errno ? save_errno : ENOSPC;
222-
elog(ERROR, "could not write lock file \"%s\": %s",
235+
elog(ERROR, "Could not write lock file \"%s\": %s",
223236
lock_file, strerror(errno));
224237
}
225238
if (fsync(fd) != 0)
@@ -229,7 +242,7 @@ lock_backup(pgBackup *backup)
229242
close(fd);
230243
unlink(lock_file);
231244
errno = save_errno;
232-
elog(ERROR, "could not write lock file \"%s\": %s",
245+
elog(ERROR, "Could not write lock file \"%s\": %s",
233246
lock_file, strerror(errno));
234247
}
235248
if (close(fd) != 0)
@@ -238,7 +251,7 @@ lock_backup(pgBackup *backup)
238251

239252
unlink(lock_file);
240253
errno = save_errno;
241-
elog(ERROR, "could not write lock file \"%s\": %s",
254+
elog(ERROR, "Culd not write lock file \"%s\": %s",
242255
lock_file, strerror(errno));
243256
}
244257

@@ -255,6 +268,8 @@ lock_backup(pgBackup *backup)
255268
if (lock_files == NULL)
256269
lock_files = parray_new();
257270
parray_append(lock_files, pgut_strdup(lock_file));
271+
272+
return true;
258273
}
259274

260275
/*
@@ -418,7 +433,8 @@ catalog_lock_backup_list(parray *backup_list, int from_idx, int to_idx)
418433
end_idx = Min(from_idx, to_idx);
419434

420435
for (i = start_idx; i >= end_idx; i--)
421-
lock_backup((pgBackup *) parray_get(backup_list, i));
436+
if (!lock_backup((pgBackup *) parray_get(backup_list, i)))
437+
elog(ERROR, "Cannot lock backup directory");
422438
}
423439

424440
/*

src/delete.c

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,12 @@ do_retention_purge(void)
202202
continue;
203203
}
204204

205-
lock_backup(backup);
205+
/*
206+
* If the backup still is used do not interrupt go to the next
207+
* backup.
208+
*/
209+
if (!lock_backup(backup))
210+
continue;
206211

207212
/* Delete backup and update status to DELETED */
208213
delete_backup_files(backup);
@@ -238,7 +243,7 @@ do_retention_purge(void)
238243
if (backup_deleted)
239244
elog(INFO, "Purging finished");
240245
else
241-
elog(INFO, "Nothing to delete by retention policy");
246+
elog(INFO, "There are no backups to delete by retention policy");
242247

243248
return 0;
244249
}
@@ -275,8 +280,7 @@ delete_backup_files(pgBackup *backup)
275280
* Update STATUS to BACKUP_STATUS_DELETING in preparation for the case which
276281
* the error occurs before deleting all backup files.
277282
*/
278-
backup->status = BACKUP_STATUS_DELETING;
279-
write_backup_status(backup);
283+
write_backup_status(backup, BACKUP_STATUS_DELETING);
280284

281285
/* list files to be deleted */
282286
files = parray_new();

src/merge.c

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -227,11 +227,8 @@ merge_backups(pgBackup *to_backup, pgBackup *from_backup)
227227
if (from_backup->status == BACKUP_STATUS_DELETING)
228228
goto delete_source_backup;
229229

230-
to_backup->status = BACKUP_STATUS_MERGING;
231-
write_backup_status(to_backup);
232-
233-
from_backup->status = BACKUP_STATUS_MERGING;
234-
write_backup_status(from_backup);
230+
write_backup_status(to_backup, BACKUP_STATUS_MERGING);
231+
write_backup_status(from_backup, BACKUP_STATUS_MERGING);
235232

236233
create_data_directories(to_database_path, from_backup_path, false);
237234

src/parsexlog.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -484,8 +484,7 @@ validate_backup_wal_from_start_to_stop(pgBackup *backup,
484484
* If we don't have WAL between start_lsn and stop_lsn,
485485
* the backup is definitely corrupted. Update its status.
486486
*/
487-
backup->status = BACKUP_STATUS_CORRUPT;
488-
write_backup_status(backup);
487+
write_backup_status(backup, BACKUP_STATUS_CORRUPT);
489488

490489
elog(WARNING, "There are not enough WAL records to consistenly restore "
491490
"backup %s from START LSN: %X/%X to STOP LSN: %X/%X",

src/pg_probackup.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -459,8 +459,8 @@ extern int do_validate_all(void);
459459
/* in catalog.c */
460460
extern pgBackup *read_backup(time_t timestamp);
461461
extern void write_backup(pgBackup *backup);
462-
extern void write_backup_status(pgBackup *backup);
463-
extern void lock_backup(pgBackup *backup);
462+
extern void write_backup_status(pgBackup *backup, BackupStatus status);
463+
extern bool lock_backup(pgBackup *backup);
464464

465465
extern const char *pgBackupGetBackupMode(pgBackup *backup);
466466

src/restore.c

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -209,8 +209,7 @@ do_restore_or_validate(time_t target_backup_id, pgRecoveryTarget *rt,
209209
{
210210
if (backup->status == BACKUP_STATUS_OK)
211211
{
212-
backup->status = BACKUP_STATUS_ORPHAN;
213-
write_backup_status(backup);
212+
write_backup_status(backup, BACKUP_STATUS_ORPHAN);
214213

215214
elog(WARNING, "Backup %s is orphaned because his parent %s is missing",
216215
base36enc(backup->start_time), missing_backup_id);
@@ -242,8 +241,7 @@ do_restore_or_validate(time_t target_backup_id, pgRecoveryTarget *rt,
242241
{
243242
if (backup->status == BACKUP_STATUS_OK)
244243
{
245-
backup->status = BACKUP_STATUS_ORPHAN;
246-
write_backup_status(backup);
244+
write_backup_status(backup, BACKUP_STATUS_ORPHAN);
247245

248246
elog(WARNING,
249247
"Backup %s is orphaned because his parent %s has status: %s",
@@ -317,7 +315,10 @@ do_restore_or_validate(time_t target_backup_id, pgRecoveryTarget *rt,
317315
{
318316
tmp_backup = (pgBackup *) parray_get(parent_chain, i);
319317

320-
lock_backup(tmp_backup);
318+
/* Do not interrupt, validate the next backup */
319+
if (!lock_backup(tmp_backup))
320+
continue;
321+
321322
pgBackupValidate(tmp_backup);
322323
/* Maybe we should be more paranoid and check for !BACKUP_STATUS_OK? */
323324
if (tmp_backup->status == BACKUP_STATUS_CORRUPT)
@@ -360,8 +361,7 @@ do_restore_or_validate(time_t target_backup_id, pgRecoveryTarget *rt,
360361
{
361362
if (backup->status == BACKUP_STATUS_OK)
362363
{
363-
backup->status = BACKUP_STATUS_ORPHAN;
364-
write_backup_status(backup);
364+
write_backup_status(backup, BACKUP_STATUS_ORPHAN);
365365

366366
elog(WARNING, "Backup %s is orphaned because his parent %s has status: %s",
367367
base36enc(backup->start_time),
@@ -409,8 +409,8 @@ do_restore_or_validate(time_t target_backup_id, pgRecoveryTarget *rt,
409409
* Backup was locked during validation if no-validate wasn't
410410
* specified.
411411
*/
412-
if (rt->restore_no_validate)
413-
lock_backup(backup);
412+
if (rt->restore_no_validate && !lock_backup(backup))
413+
elog(ERROR, "Cannot lock backup directory");
414414

415415
restore_backup(backup);
416416
}

src/validate.c

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,15 @@ pgBackupValidate(pgBackup *backup)
5959
"Please upgrade pg_probackup binary.",
6060
PROGRAM_VERSION, base36enc(backup->start_time), backup->program_version);
6161

62+
if (backup->status == BACKUP_STATUS_RUNNING)
63+
{
64+
elog(WARNING, "Backup %s has status %s, change it to ERROR and skip validation",
65+
base36enc(backup->start_time), status2str(backup->status));
66+
write_backup_status(backup, BACKUP_STATUS_ERROR);
67+
corrupted_backup_found = true;
68+
return;
69+
}
70+
6271
/* Revalidation is attempted for DONE, ORPHAN and CORRUPT backups */
6372
if (backup->status != BACKUP_STATUS_OK &&
6473
backup->status != BACKUP_STATUS_DONE &&
@@ -143,8 +152,8 @@ pgBackupValidate(pgBackup *backup)
143152
parray_free(files);
144153

145154
/* Update backup status */
146-
backup->status = corrupted ? BACKUP_STATUS_CORRUPT : BACKUP_STATUS_OK;
147-
write_backup_status(backup);
155+
write_backup_status(backup, corrupted ? BACKUP_STATUS_CORRUPT :
156+
BACKUP_STATUS_OK);
148157

149158
if (corrupted)
150159
elog(WARNING, "Backup %s data files are corrupted", base36enc(backup->start_time));
@@ -385,8 +394,7 @@ do_validate_instance(void)
385394
/* orphanize current_backup */
386395
if (current_backup->status == BACKUP_STATUS_OK)
387396
{
388-
current_backup->status = BACKUP_STATUS_ORPHAN;
389-
write_backup_status(current_backup);
397+
write_backup_status(current_backup, BACKUP_STATUS_ORPHAN);
390398
elog(WARNING, "Backup %s is orphaned because his parent %s is missing",
391399
base36enc(current_backup->start_time),
392400
parent_backup_id);
@@ -410,8 +418,7 @@ do_validate_instance(void)
410418
/* orphanize current_backup */
411419
if (current_backup->status == BACKUP_STATUS_OK)
412420
{
413-
current_backup->status = BACKUP_STATUS_ORPHAN;
414-
write_backup_status(current_backup);
421+
write_backup_status(current_backup, BACKUP_STATUS_ORPHAN);
415422
elog(WARNING, "Backup %s is orphaned because his parent %s has status: %s",
416423
base36enc(current_backup->start_time), parent_backup_id,
417424
status2str(tmp_backup->status));
@@ -435,7 +442,9 @@ do_validate_instance(void)
435442
else
436443
base_full_backup = current_backup;
437444

438-
lock_backup(current_backup);
445+
/* Do not interrupt, validate the next backup */
446+
if (!lock_backup(current_backup))
447+
continue;
439448
/* Valiate backup files*/
440449
pgBackupValidate(current_backup);
441450

@@ -469,8 +478,7 @@ do_validate_instance(void)
469478
{
470479
if (backup->status == BACKUP_STATUS_OK)
471480
{
472-
backup->status = BACKUP_STATUS_ORPHAN;
473-
write_backup_status(backup);
481+
write_backup_status(backup, BACKUP_STATUS_ORPHAN);
474482

475483
elog(WARNING, "Backup %s is orphaned because his parent %s has status: %s",
476484
base36enc(backup->start_time),
@@ -522,7 +530,9 @@ do_validate_instance(void)
522530

523531
if (backup->status == BACKUP_STATUS_ORPHAN)
524532
{
525-
lock_backup(backup);
533+
/* Do not interrupt, validate the next backup */
534+
if (!lock_backup(backup))
535+
continue;
526536
/* Revaliate backup files*/
527537
pgBackupValidate(backup);
528538

tests/locking.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ class LockingTest(ProbackupTest, unittest.TestCase):
1212
# @unittest.skip("skip")
1313
# @unittest.expectedFailure
1414
def test_locking_running_1(self):
15-
"""
15+
"""
1616
make node, take full backup, stop it in the middle
1717
run validate, expect it to successfully executed,
1818
concurrect RUNNING backup with pid file and active process is legal
@@ -46,7 +46,7 @@ def test_locking_running_1(self):
4646
self.assertEqual(
4747
'RUNNING', self.show_pb(backup_dir, 'node')[1]['status'])
4848

49-
self.validate_pb(backup_dir)
49+
self.validate_pb(backup_dir, options=['--log-level-file=VERBOSE'])
5050

5151
self.assertEqual(
5252
'OK', self.show_pb(backup_dir, 'node')[0]['status'])
@@ -58,7 +58,7 @@ def test_locking_running_1(self):
5858
self.del_test_dir(module_name, fname)
5959

6060
def test_locking_running_2(self):
61-
"""
61+
"""
6262
make node, take full backup, stop it in the middle,
6363
kill process so no cleanup is done - pid file is in place,
6464
run validate, expect it to not successfully executed,
@@ -112,7 +112,7 @@ def test_locking_running_2(self):
112112
self.del_test_dir(module_name, fname)
113113

114114
def test_locking_running_3(self):
115-
"""
115+
"""
116116
make node, take full backup, stop it in the middle,
117117
terminate process, delete pid file,
118118
run validate, expect it to not successfully executed,
@@ -168,4 +168,4 @@ def test_locking_running_3(self):
168168
'ERROR', self.show_pb(backup_dir, 'node')[1]['status'])
169169

170170
# Clean after yourself
171-
self.del_test_dir(module_name, fname)
171+
self.del_test_dir(module_name, fname)

0 commit comments

Comments
 (0)