@@ -205,7 +205,7 @@ lock_backup(pgBackup *backup, bool strict, bool exclusive)
205
205
{
206
206
/* release exclusive lock */
207
207
if (fio_unlink (lock_file , FIO_BACKUP_HOST ) < 0 )
208
- elog (ERROR , "Could not remove old lock file \"%s\": %s" ,
208
+ elog (ERROR , "Could not remove exclusive lock file \"%s\": %s" ,
209
209
lock_file , strerror (errno ));
210
210
211
211
/* we are done */
@@ -261,7 +261,7 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
261
261
int fd = 0 ;
262
262
char buffer [MAXPGPATH * 2 + 256 ];
263
263
int ntries = LOCK_TIMEOUT ;
264
- int log_freq = ntries / 5 ;
264
+ int empty_tries = LOCK_STALE_TIMEOUT ;
265
265
int len ;
266
266
int encoded_pid ;
267
267
pid_t my_p_pid ;
@@ -351,13 +351,39 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
351
351
fclose (fp_out );
352
352
353
353
/*
354
- * It should be possible only as a result of system crash,
355
- * so its hypothetical owner should be dead by now
354
+ * There are several possible reasons for lock file
355
+ * to be empty:
356
+ * - system crash
357
+ * - process crash
358
+ * - race between writer and reader
359
+ *
360
+ * Consider empty file to be stale after LOCK_STALE_TIMEOUT
361
+ * attempts.
362
+ *
363
+ * TODO: alternatively we can write into temp file (lock_file_%pid),
364
+ * rename it and then re-read lock file to make sure,
365
+ * that we have successfully acquired the lock.
356
366
*/
357
367
if (len == 0 )
358
368
{
359
- elog (WARNING , "Lock file \"%s\" is empty" , lock_file );
360
- goto grab_lock ;
369
+ if (empty_tries == 0 )
370
+ {
371
+ elog (WARNING , "Lock file \"%s\" is empty" , lock_file );
372
+ goto grab_lock ;
373
+ }
374
+
375
+ if ((empty_tries % LOG_FREQ ) == 0 )
376
+ elog (WARNING , "Waiting %u seconds on empty exclusive lock for backup %s" ,
377
+ empty_tries , base36enc (backup -> start_time ));
378
+
379
+ sleep (1 );
380
+ /*
381
+ * waiting on empty lock file should not affect
382
+ * the timer for concurrent lockers (ntries).
383
+ */
384
+ empty_tries -- ;
385
+ ntries ++ ;
386
+ continue ;
361
387
}
362
388
363
389
encoded_pid = atoi (buffer );
@@ -383,12 +409,13 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
383
409
if (kill (encoded_pid , 0 ) == 0 )
384
410
{
385
411
/* complain every fifth interval */
386
- if ((ntries % log_freq ) == 0 )
412
+ if ((ntries % LOG_FREQ ) == 0 )
387
413
{
388
414
elog (WARNING , "Process %d is using backup %s, and is still running" ,
389
415
encoded_pid , base36enc (backup -> start_time ));
390
416
391
- elog (WARNING , "Waiting %u seconds on lock for backup %s" , ntries , base36enc (backup -> start_time ));
417
+ elog (WARNING , "Waiting %u seconds on exclusive lock for backup %s" ,
418
+ ntries , base36enc (backup -> start_time ));
392
419
}
393
420
394
421
sleep (1 );
@@ -435,7 +462,7 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
435
462
errno = 0 ;
436
463
if (fio_write (fd , buffer , strlen (buffer )) != strlen (buffer ))
437
464
{
438
- int save_errno = errno ;
465
+ int save_errno = errno ;
439
466
440
467
fio_close (fd );
441
468
fio_unlink (lock_file , FIO_BACKUP_HOST );
@@ -453,7 +480,7 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
453
480
454
481
if (fio_flush (fd ) != 0 )
455
482
{
456
- int save_errno = errno ;
483
+ int save_errno = errno ;
457
484
458
485
fio_close (fd );
459
486
fio_unlink (lock_file , FIO_BACKUP_HOST );
@@ -471,7 +498,7 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
471
498
472
499
if (fio_close (fd ) != 0 )
473
500
{
474
- int save_errno = errno ;
501
+ int save_errno = errno ;
475
502
476
503
fio_unlink (lock_file , FIO_BACKUP_HOST );
477
504
@@ -493,7 +520,6 @@ wait_read_only_owners(pgBackup *backup)
493
520
char buffer [256 ];
494
521
pid_t encoded_pid ;
495
522
int ntries = LOCK_TIMEOUT ;
496
- int log_freq = ntries / 5 ;
497
523
char lock_file [MAXPGPATH ];
498
524
499
525
join_path_components (lock_file , backup -> root_dir , BACKUP_RO_LOCK_FILE );
@@ -523,7 +549,7 @@ wait_read_only_owners(pgBackup *backup)
523
549
{
524
550
if (kill (encoded_pid , 0 ) == 0 )
525
551
{
526
- if ((ntries % log_freq ) == 0 )
552
+ if ((ntries % LOG_FREQ ) == 0 )
527
553
{
528
554
elog (WARNING , "Process %d is using backup %s in read only mode, and is still running" ,
529
555
encoded_pid , base36enc (backup -> start_time ));
0 commit comments