@@ -205,7 +205,7 @@ lock_backup(pgBackup *backup, bool strict, bool exclusive)
205205 {
206206 /* release exclusive lock */
207207 if (fio_unlink (lock_file , FIO_BACKUP_HOST ) < 0 )
208- elog (ERROR , "Could not remove old lock file \"%s\": %s" ,
208+ elog (ERROR , "Could not remove exclusive lock file \"%s\": %s" ,
209209 lock_file , strerror (errno ));
210210
211211 /* we are done */
@@ -261,7 +261,7 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
261261 int fd = 0 ;
262262 char buffer [MAXPGPATH * 2 + 256 ];
263263 int ntries = LOCK_TIMEOUT ;
264- int log_freq = ntries / 5 ;
264+ int empty_tries = LOCK_STALE_TIMEOUT ;
265265 int len ;
266266 int encoded_pid ;
267267 pid_t my_p_pid ;
@@ -351,13 +351,39 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
351351 fclose (fp_out );
352352
353353 /*
354- * It should be possible only as a result of system crash,
355- * so its hypothetical owner should be dead by now
354+ * There are several possible reasons for lock file
355+ * to be empty:
356+ * - system crash
357+ * - process crash
358+ * - race between writer and reader
359+ *
360+ * Consider empty file to stale after LOCK_STALE_TIMEOUT
361+ * attempts.
362+ *
363+ * TODO: alternatively we can write into temp file (lock_file_%pid),
364+ * rename it and then re-read lock file to make sure,
365+ * that we are successfully acquired the lock.
356366 */
357367 if (len == 0 )
358368 {
359- elog (WARNING , "Lock file \"%s\" is empty" , lock_file );
360- goto grab_lock ;
369+ if (empty_tries == 0 )
370+ {
371+ elog (WARNING , "Lock file \"%s\" is empty" , lock_file );
372+ goto grab_lock ;
373+ }
374+
375+ if ((empty_tries % LOG_FREQ ) == 0 )
376+ elog (WARNING , "Waiting %u seconds on empty exclusive lock for backup %s" ,
377+ empty_tries , base36enc (backup -> start_time ));
378+
379+ sleep (1 );
380+ /*
381+ * waiting on empty lock file should not affect
382+ * the timer for concurrent lockers (ntries).
383+ */
384+ empty_tries -- ;
385+ ntries ++ ;
386+ continue ;
361387 }
362388
363389 encoded_pid = atoi (buffer );
@@ -383,12 +409,13 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
383409 if (kill (encoded_pid , 0 ) == 0 )
384410 {
385411 /* complain every fifth interval */
386- if ((ntries % log_freq ) == 0 )
412+ if ((ntries % LOG_FREQ ) == 0 )
387413 {
388414 elog (WARNING , "Process %d is using backup %s, and is still running" ,
389415 encoded_pid , base36enc (backup -> start_time ));
390416
391- elog (WARNING , "Waiting %u seconds on lock for backup %s" , ntries , base36enc (backup -> start_time ));
417+ elog (WARNING , "Waiting %u seconds on exclusive lock for backup %s" ,
418+ ntries , base36enc (backup -> start_time ));
392419 }
393420
394421 sleep (1 );
@@ -435,7 +462,7 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
435462 errno = 0 ;
436463 if (fio_write (fd , buffer , strlen (buffer )) != strlen (buffer ))
437464 {
438- int save_errno = errno ;
465+ int save_errno = errno ;
439466
440467 fio_close (fd );
441468 fio_unlink (lock_file , FIO_BACKUP_HOST );
@@ -453,7 +480,7 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
453480
454481 if (fio_flush (fd ) != 0 )
455482 {
456- int save_errno = errno ;
483+ int save_errno = errno ;
457484
458485 fio_close (fd );
459486 fio_unlink (lock_file , FIO_BACKUP_HOST );
@@ -471,7 +498,7 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
471498
472499 if (fio_close (fd ) != 0 )
473500 {
474- int save_errno = errno ;
501+ int save_errno = errno ;
475502
476503 fio_unlink (lock_file , FIO_BACKUP_HOST );
477504
@@ -493,7 +520,6 @@ wait_read_only_owners(pgBackup *backup)
493520 char buffer [256 ];
494521 pid_t encoded_pid ;
495522 int ntries = LOCK_TIMEOUT ;
496- int log_freq = ntries / 5 ;
497523 char lock_file [MAXPGPATH ];
498524
499525 join_path_components (lock_file , backup -> root_dir , BACKUP_RO_LOCK_FILE );
@@ -523,7 +549,7 @@ wait_read_only_owners(pgBackup *backup)
523549 {
524550 if (kill (encoded_pid , 0 ) == 0 )
525551 {
526- if ((ntries % log_freq ) == 0 )
552+ if ((ntries % LOG_FREQ ) == 0 )
527553 {
528554 elog (WARNING , "Process %d is using backup %s in read only mode, and is still running" ,
529555 encoded_pid , base36enc (backup -> start_time ));
0 commit comments