Skip to content

Commit eb9b234

Browse files
committed
[Issue #69] New algorithm of WAL purge, based on timelineInfo infrastructure
1 parent ec6a627 commit eb9b234

File tree

1 file changed

+193
-57
lines changed

1 file changed

+193
-57
lines changed

src/delete.c

Lines changed: 193 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -15,16 +15,20 @@
1515
#include <unistd.h>
1616

1717
static void delete_walfiles(XLogRecPtr oldest_lsn, TimeLineID oldest_tli,
18-
uint32 xlog_seg_size);
18+
uint32 xlog_seg_size);
19+
static void delete_walfiles_internal(XLogRecPtr keep_lsn, timelineInfo *tli,
20+
uint32 xlog_seg_size, bool dry_run);
1921
static void do_retention_internal(parray *backup_list, parray *to_keep_list,
2022
parray *to_purge_list);
2123
static void do_retention_merge(parray *backup_list, parray *to_keep_list,
2224
parray *to_purge_list);
2325
static void do_retention_purge(parray *to_keep_list, parray *to_purge_list);
24-
static void do_retention_wal(void);
26+
static void do_retention_wal(bool dry_run);
2527

28+
// TODO: more useful messages for dry run.
2629
static bool backup_deleted = false; /* At least one backup was deleted */
2730
static bool backup_merged = false; /* At least one merge was enacted */
31+
static bool wal_deleted = false; /* At least one WAL segment was deleted */
2832

2933
void
3034
do_delete(time_t backup_id)
@@ -33,8 +37,8 @@ do_delete(time_t backup_id)
3337
parray *backup_list,
3438
*delete_list;
3539
pgBackup *target_backup = NULL;
36-
XLogRecPtr oldest_lsn = InvalidXLogRecPtr;
37-
TimeLineID oldest_tli = 0;
40+
// XLogRecPtr oldest_lsn = InvalidXLogRecPtr;
41+
// TimeLineID oldest_tli = 0;
3842

3943
/* Get complete list of backups */
4044
backup_list = catalog_get_backup_list(instance_name, INVALID_BACKUP_ID);
@@ -86,24 +90,7 @@ do_delete(time_t backup_id)
8690

8791
/* Clean WAL segments */
8892
if (delete_wal)
89-
{
90-
Assert(target_backup);
91-
92-
/* Find oldest LSN, used by backups */
93-
for (i = (int) parray_num(backup_list) - 1; i >= 0; i--)
94-
{
95-
pgBackup *backup = (pgBackup *) parray_get(backup_list, (size_t) i);
96-
97-
if (backup->status == BACKUP_STATUS_OK || backup->status == BACKUP_STATUS_DONE)
98-
{
99-
oldest_lsn = backup->start_lsn;
100-
oldest_tli = backup->tli;
101-
break;
102-
}
103-
}
104-
105-
delete_walfiles(oldest_lsn, oldest_tli, instance_config.xlog_seg_size);
106-
}
93+
do_retention_wal(false);
10794

10895
/* cleanup */
10996
parray_walk(backup_list, pgBackupFree);
@@ -172,8 +159,8 @@ int do_retention(void)
172159
do_retention_purge(to_keep_list, to_purge_list);
173160

174161
/* TODO: some sort of dry run for delete_wal */
175-
if (delete_wal && !dry_run)
176-
do_retention_wal();
162+
if (delete_wal)
163+
do_retention_wal(dry_run);
177164

178165
/* TODO: consider dry-run flag */
179166

@@ -622,47 +609,44 @@ do_retention_purge(parray *to_keep_list, parray *to_purge_list)
622609
}
623610
}
624611

625-
/* Purge WAL */
612+
/* Purge WAL
613+
* Iterate over timelines
614+
* Look for closest_backup; if it exists, go to the next timeline
615+
* if it does not exist, look for the oldest backup on the timeline
616+
*/
626617
static void
627-
do_retention_wal(void)
618+
do_retention_wal(bool dry_run)
628619
{
629-
parray *backup_list = NULL;
630-
631-
XLogRecPtr oldest_lsn = InvalidXLogRecPtr;
632-
TimeLineID oldest_tli = 0;
633-
bool backup_list_is_empty = false;
620+
parray *tli_list;
634621
int i;
635622

636-
/* Get list of backups. */
637-
backup_list = catalog_get_backup_list(instance_name, INVALID_BACKUP_ID);
623+
tli_list = catalog_get_timelines(&instance_config);
638624

639-
if (parray_num(backup_list) == 0)
640-
backup_list_is_empty = true;
641-
642-
/* Save LSN and Timeline to remove unnecessary WAL segments */
643-
for (i = (int) parray_num(backup_list) - 1; i >= 0; i--)
625+
for (i = 0; i < parray_num(tli_list); i++)
644626
{
645-
pgBackup *backup = (pgBackup *) parray_get(backup_list, i);
646-
647-
/* Get LSN and TLI of the oldest backup with valid start_lsn and tli */
648-
if (backup->tli > 0 && !XLogRecPtrIsInvalid(backup->start_lsn))
649-
{
650-
oldest_tli = backup->tli;
651-
oldest_lsn = backup->start_lsn;
652-
break;
653-
}
654-
}
627+
timelineInfo *tlinfo = (timelineInfo *) parray_get(tli_list, i);
655628

656-
/* Be paranoid */
657-
if (!backup_list_is_empty && XLogRecPtrIsInvalid(oldest_lsn))
658-
elog(ERROR, "Not going to purge WAL because LSN is invalid");
629+
/* Empty timeline can be safely skipped */
630+
if (tlinfo->n_xlog_files == 0 &&
631+
parray_num(tlinfo->xlog_filelist) == 0)
632+
continue;
659633

660-
/* Purge WAL files */
661-
delete_walfiles(oldest_lsn, oldest_tli, instance_config.xlog_seg_size);
634+
/* If closest backup exists, then timeline can be safely skipped */
635+
if (tlinfo->closest_backup)
636+
continue;
662637

663-
/* Cleanup */
664-
parray_walk(backup_list, pgBackupFree);
665-
parray_free(backup_list);
638+
/*
639+
* Purge all WAL segments before START LSN of oldest backup.
640+
* If there are no backups on the timeline, then the whole timeline
641+
* can be safely purged.
642+
*/
643+
if (tlinfo->oldest_backup)
644+
delete_walfiles_internal(tlinfo->oldest_backup->start_lsn,
645+
tlinfo, instance_config.xlog_seg_size, dry_run);
646+
else
647+
delete_walfiles_internal(InvalidXLogRecPtr,
648+
tlinfo, instance_config.xlog_seg_size, dry_run);
649+
}
666650
}
667651

668652
/*
@@ -728,6 +712,158 @@ delete_backup_files(pgBackup *backup)
728712
return;
729713
}
730714

715+
/* Purge WAL archive.
716+
* If 'keep_lsn' is InvalidXLogRecPtr, then whole timeline can be purged
717+
* If 'keep_lsn' is valid LSN, then every lesser segment can be purged.
718+
* If 'dry_run' is set, then don`t actually delete anything.
719+
*
720+
* Case 1:
721+
* archive is not empty, 'keep_lsn' is valid and we can delete something.
722+
* Case 2:
723+
* archive is not empty, 'keep_lsn' is valid and prevening us from deleting anything.
724+
* Case 3:
725+
* archive is not empty, 'keep_lsn' is invalid, drop everyhing in archive.
726+
* Case 4:
727+
* archive is empty, 'keep_lsn' is valid, assume corruption of WAL archive.
728+
* Case 5:
729+
* archive is empty, 'keep_lsn' is invalid, drop backup history files
730+
* and partial WAL segments in archive.
731+
*
732+
* Q: Maybe we should stop treating partial WAL segments as second-class citizens?
733+
*/
734+
static void
735+
delete_walfiles_internal(XLogRecPtr keep_lsn, timelineInfo *tlinfo,
736+
uint32 xlog_seg_size, bool dry_run)
737+
{
738+
XLogSegNo StartSegNo; /* First segment to delete */
739+
XLogSegNo EndSegNo = 0; /* Oldest segment to keep */
740+
int rc;
741+
int i;
742+
int wal_size_logical = 0;
743+
int wal_size_actual = 0;
744+
char wal_pretty_size[20];
745+
bool purge_all = false;
746+
747+
/* Timeline is completely empty */
748+
if (parray_num(tlinfo->xlog_filelist) == 0)
749+
{
750+
elog(INFO, "Timeline %i is empty, nothing to remove", tlinfo->tli);
751+
return;
752+
}
753+
754+
if (XLogRecPtrIsInvalid(keep_lsn))
755+
{
756+
/* Drop all segments in timeline */
757+
elog(INFO, "All files on timeline %i will be removed", tlinfo->tli);
758+
StartSegNo = tlinfo->begin_segno;
759+
EndSegNo = tlinfo->end_segno;
760+
purge_all = true;
761+
}
762+
else
763+
{
764+
/* Drop all segments between begin_segno and segment with keep_lsn (excluding) */
765+
StartSegNo = tlinfo->begin_segno;
766+
GetXLogSegNo(keep_lsn, EndSegNo, xlog_seg_size);
767+
}
768+
769+
if (EndSegNo > 0 && EndSegNo > StartSegNo)
770+
elog(INFO, "WAL segments between %08X%08X and %08X%08X on timeline %i will be removed",
771+
(uint32) StartSegNo / xlog_seg_size, (uint32) StartSegNo % xlog_seg_size,
772+
(uint32) EndSegNo / xlog_seg_size, (uint32) EndSegNo % xlog_seg_size,
773+
tlinfo->tli);
774+
775+
if (EndSegNo > StartSegNo)
776+
/* typical scenario */
777+
wal_size_logical = (EndSegNo-StartSegNo) * xlog_seg_size;
778+
else if (EndSegNo < StartSegNo)
779+
{
780+
/* It is actually possible for EndSegNo to be less than StartSegNo
781+
* in case of :
782+
* 1. WAL archive corruption.
783+
* 2. There is no actual WAL archive to speak of and
784+
* 'keep_lsn' is coming from STREAM backup.
785+
*
786+
* Assume the worst.
787+
*/
788+
if (StartSegNo > 0 && EndSegNo > 0)
789+
elog(WARNING, "On timeline %i first segment %08X%08X is greater than "
790+
"oldest segment to keep %08X%08X. Possible WAL archive corruption.",
791+
tlinfo->tli,
792+
(uint32) StartSegNo / xlog_seg_size, (uint32) StartSegNo % xlog_seg_size,
793+
(uint32) EndSegNo / xlog_seg_size, (uint32) EndSegNo % xlog_seg_size);
794+
}
795+
else if (EndSegNo == StartSegNo && !purge_all)
796+
{
797+
/* 'Nothing to delete' scenario because of 'keep_lsn'
798+
* with possible exception of partial and backup history files.
799+
*/
800+
elog(INFO, "Nothing to remove on timeline %i", tlinfo->tli);
801+
}
802+
803+
/* Report the logical size to delete */
804+
if (wal_size_logical > 0)
805+
{
806+
pretty_size(wal_size_logical, wal_pretty_size, lengthof(wal_pretty_size));
807+
elog(INFO, "WAL size to remove on timeline %i: %s",
808+
tlinfo->tli, wal_pretty_size);
809+
}
810+
811+
/* Calculate the actual size to delete */
812+
for (i = 0; i < parray_num(tlinfo->xlog_filelist); i++)
813+
{
814+
xlogFile *wal_file = (xlogFile *) parray_get(tlinfo->xlog_filelist, i);
815+
816+
if (purge_all || wal_file->segno < EndSegNo)
817+
wal_size_actual += wal_file->file.size;
818+
}
819+
820+
/* Report the actual size to delete */
821+
if (wal_size_actual > 0)
822+
{
823+
pretty_size(wal_size_actual, wal_pretty_size, lengthof(wal_pretty_size));
824+
elog(INFO, "Resident data size to free on timeline %i: %s",
825+
tlinfo->tli, wal_pretty_size);
826+
}
827+
828+
if (dry_run)
829+
return;
830+
831+
for (i = 0; i < parray_num(tlinfo->xlog_filelist); i++)
832+
{
833+
xlogFile *wal_file = (xlogFile *) parray_get(tlinfo->xlog_filelist, i);
834+
835+
if (interrupted)
836+
elog(ERROR, "interrupted during WAL archive purge");
837+
838+
/* Any segment equal or greater than EndSegNo must be kept
839+
* unless it`s a 'purge all' scenario.
840+
*/
841+
if (purge_all || wal_file->segno < EndSegNo)
842+
{
843+
/* unlink segment */
844+
rc = unlink(wal_file->file.path);
845+
if (rc < 0)
846+
{
847+
/* Missing file is not considered as error condition */
848+
if (errno != ENOENT)
849+
elog(ERROR, "Could not remove file \"%s\": %s",
850+
wal_file->file.path, strerror(errno));
851+
}
852+
else
853+
{
854+
if (wal_file->type == SEGMENT)
855+
elog(VERBOSE, "Removed WAL segment \"%s\"", wal_file->file.path);
856+
else if (wal_file->type == PARTIAL_SEGMENT)
857+
elog(VERBOSE, "Removed partial WAL segment \"%s\"", wal_file->file.path);
858+
else if (wal_file->type == BACKUP_HISTORY_FILE)
859+
elog(VERBOSE, "Removed backup history file \"%s\"", wal_file->file.path);
860+
}
861+
862+
wal_deleted = true;
863+
}
864+
}
865+
}
866+
731867
/*
732868
* Deletes WAL segments up to oldest_lsn or all WAL segments (if all backups
733869
* were deleted and so oldest_lsn is invalid).
@@ -739,7 +875,7 @@ delete_backup_files(pgBackup *backup)
739875
*/
740876
static void
741877
delete_walfiles(XLogRecPtr oldest_lsn, TimeLineID oldest_tli,
742-
uint32 xlog_seg_size)
878+
uint32 xlog_seg_size)
743879
{
744880
XLogSegNo targetSegNo;
745881
char oldestSegmentNeeded[MAXFNAMELEN];

0 commit comments

Comments
 (0)