Skip to content

Commit 7d64d58

Browse files
committed
[Issue #308] Wait on empty exclusive lock file
1 parent 0fbf1a2 commit 7d64d58

File tree

2 files changed

+42
-14
lines changed

2 files changed

+42
-14
lines changed

src/catalog.c

Lines changed: 39 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -205,7 +205,7 @@ lock_backup(pgBackup *backup, bool strict, bool exclusive)
205205
{
206206
/* release exclusive lock */
207207
if (fio_unlink(lock_file, FIO_BACKUP_HOST) < 0)
208-
elog(ERROR, "Could not remove old lock file \"%s\": %s",
208+
elog(ERROR, "Could not remove exclusive lock file \"%s\": %s",
209209
lock_file, strerror(errno));
210210

211211
/* we are done */
@@ -261,7 +261,7 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
261261
int fd = 0;
262262
char buffer[MAXPGPATH * 2 + 256];
263263
int ntries = LOCK_TIMEOUT;
264-
int log_freq = ntries / 5;
264+
int empty_tries = LOCK_STALE_TIMEOUT;
265265
int len;
266266
int encoded_pid;
267267
pid_t my_p_pid;
@@ -351,13 +351,39 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
351351
fclose(fp_out);
352352

353353
/*
354-
* It should be possible only as a result of system crash,
355-
* so its hypothetical owner should be dead by now
354+
* There are several possible reasons for lock file
355+
* to be empty:
356+
* - system crash
357+
* - process crash
358+
* - race between writer and reader
359+
*
360+
* Consider empty file to stale after LOCK_STALE_TIMEOUT
361+
* attempts.
362+
*
363+
* TODO: alternatively we can write into temp file (lock_file_%pid),
364+
* rename it and then re-read lock file to make sure,
365+
* that we are successfully acquired the lock.
356366
*/
357367
if (len == 0)
358368
{
359-
elog(WARNING, "Lock file \"%s\" is empty", lock_file);
360-
goto grab_lock;
369+
if (empty_tries == 0)
370+
{
371+
elog(WARNING, "Lock file \"%s\" is empty", lock_file);
372+
goto grab_lock;
373+
}
374+
375+
if ((empty_tries % LOG_FREQ) == 0)
376+
elog(WARNING, "Waiting %u seconds on empty exclusive lock for backup %s",
377+
empty_tries, base36enc(backup->start_time));
378+
379+
sleep(1);
380+
/*
381+
* waiting on empty lock file should not affect
382+
* the timer for concurrent lockers (ntries).
383+
*/
384+
empty_tries--;
385+
ntries++;
386+
continue;
361387
}
362388

363389
encoded_pid = atoi(buffer);
@@ -383,12 +409,13 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
383409
if (kill(encoded_pid, 0) == 0)
384410
{
385411
/* complain every fifth interval */
386-
if ((ntries % log_freq) == 0)
412+
if ((ntries % LOG_FREQ) == 0)
387413
{
388414
elog(WARNING, "Process %d is using backup %s, and is still running",
389415
encoded_pid, base36enc(backup->start_time));
390416

391-
elog(WARNING, "Waiting %u seconds on lock for backup %s", ntries, base36enc(backup->start_time));
417+
elog(WARNING, "Waiting %u seconds on exclusive lock for backup %s",
418+
ntries, base36enc(backup->start_time));
392419
}
393420

394421
sleep(1);
@@ -435,7 +462,7 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
435462
errno = 0;
436463
if (fio_write(fd, buffer, strlen(buffer)) != strlen(buffer))
437464
{
438-
int save_errno = errno;
465+
int save_errno = errno;
439466

440467
fio_close(fd);
441468
fio_unlink(lock_file, FIO_BACKUP_HOST);
@@ -453,7 +480,7 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
453480

454481
if (fio_flush(fd) != 0)
455482
{
456-
int save_errno = errno;
483+
int save_errno = errno;
457484

458485
fio_close(fd);
459486
fio_unlink(lock_file, FIO_BACKUP_HOST);
@@ -471,7 +498,7 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
471498

472499
if (fio_close(fd) != 0)
473500
{
474-
int save_errno = errno;
501+
int save_errno = errno;
475502

476503
fio_unlink(lock_file, FIO_BACKUP_HOST);
477504

@@ -493,7 +520,6 @@ wait_read_only_owners(pgBackup *backup)
493520
char buffer[256];
494521
pid_t encoded_pid;
495522
int ntries = LOCK_TIMEOUT;
496-
int log_freq = ntries / 5;
497523
char lock_file[MAXPGPATH];
498524

499525
join_path_components(lock_file, backup->root_dir, BACKUP_RO_LOCK_FILE);
@@ -523,7 +549,7 @@ wait_read_only_owners(pgBackup *backup)
523549
{
524550
if (kill(encoded_pid, 0) == 0)
525551
{
526-
if ((ntries % log_freq) == 0)
552+
if ((ntries % LOG_FREQ) == 0)
527553
{
528554
elog(WARNING, "Process %d is using backup %s in read only mode, and is still running",
529555
encoded_pid, base36enc(backup->start_time));

src/pg_probackup.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,9 @@ extern const char *PROGRAM_EMAIL;
7979
/* Timeout defaults */
8080
#define ARCHIVE_TIMEOUT_DEFAULT 300
8181
#define REPLICA_TIMEOUT_DEFAULT 300
82-
#define LOCK_TIMEOUT 30
82+
#define LOCK_TIMEOUT 60
83+
#define LOCK_STALE_TIMEOUT 30
84+
#define LOG_FREQ 10
8385

8486
/* Directory/File permission */
8587
#define DIR_PERMISSION (0700)

0 commit comments

Comments
 (0)