@@ -192,7 +192,7 @@ do_backup_instance(PGconn *backup_conn, PGNodeInfo *nodeInfo, bool no_sync)
192192 {
193193 /* try to setup multi-timeline backup chain */
194194 elog (WARNING , "Valid backup on current timeline %u is not found, "
195- "try to look up on previous timelines" ,
195+ "trying to look up on previous timelines" ,
196196 current .tli );
197197
198198 tli_list = catalog_get_timelines (& instance_config );
@@ -333,7 +333,7 @@ do_backup_instance(PGconn *backup_conn, PGNodeInfo *nodeInfo, bool no_sync)
333333
334334 /* list files with the logical path. omit $PGDATA */
335335 dir_list_file (backup_files_list , instance_config .pgdata ,
336- true, true, false, 0 , FIO_DB_HOST );
336+ true, true, false, true, 0 , FIO_DB_HOST );
337337
338338 /*
339339 * Get database_map (name to oid) for use in partial restore feature.
@@ -350,7 +350,7 @@ do_backup_instance(PGconn *backup_conn, PGNodeInfo *nodeInfo, bool no_sync)
350350 /* External dirs numeration starts with 1.
351351 * 0 value is not external dir */
352352 dir_list_file (backup_files_list , parray_get (external_dirs , i ),
353- false, true, false, i + 1 , FIO_DB_HOST );
353+ false, true, false, true, i + 1 , FIO_DB_HOST );
354354
355355 /* close ssh session in main thread */
356356 fio_disconnect ();
@@ -401,10 +401,10 @@ do_backup_instance(PGconn *backup_conn, PGNodeInfo *nodeInfo, bool no_sync)
401401
402402 if (current .backup_mode != BACKUP_MODE_FULL )
403403 {
404- elog (LOG , "current_tli: %X" , current .tli );
405- elog (LOG , "prev_backup-> start_lsn: %X/%X" ,
404+ elog (LOG , "Current tli: %X" , current .tli );
405+ elog (LOG , "Parent start_lsn: %X/%X" ,
406406 (uint32 ) (prev_backup -> start_lsn >> 32 ), (uint32 ) (prev_backup -> start_lsn ));
407- elog (LOG , "current. start_lsn: %X/%X" ,
407+ elog (LOG , "start_lsn: %X/%X" ,
408408 (uint32 ) (current .start_lsn >> 32 ), (uint32 ) (current .start_lsn ));
409409 }
410410
@@ -436,10 +436,11 @@ do_backup_instance(PGconn *backup_conn, PGNodeInfo *nodeInfo, bool no_sync)
436436 /*
437437 * Build the page map from ptrack information.
438438 */
439- if (nodeInfo -> ptrack_version_num = = 20 )
439+ if (nodeInfo -> ptrack_version_num > = 20 )
440440 make_pagemap_from_ptrack_2 (backup_files_list , backup_conn ,
441- nodeInfo -> ptrack_schema ,
442- prev_backup_start_lsn );
441+ nodeInfo -> ptrack_schema ,
442+ nodeInfo -> ptrack_version_num ,
443+ prev_backup_start_lsn );
443444 else if (nodeInfo -> ptrack_version_num == 15 ||
444445 nodeInfo -> ptrack_version_num == 16 ||
445446 nodeInfo -> ptrack_version_num == 17 )
@@ -582,9 +583,6 @@ do_backup_instance(PGconn *backup_conn, PGNodeInfo *nodeInfo, bool no_sync)
582583 /* Notify end of backup */
583584 pg_stop_backup (& current , pg_startbackup_conn , nodeInfo );
584585
585- elog (LOG , "current.stop_lsn: %X/%X" ,
586- (uint32 ) (stop_backup_lsn >> 32 ), (uint32 ) (stop_backup_lsn ));
587-
588586 /* In case of backup from replica >= 9.6 we must fix minRecPoint,
589587 * First we must find pg_control in backup_files_list.
590588 */
@@ -626,7 +624,7 @@ do_backup_instance(PGconn *backup_conn, PGNodeInfo *nodeInfo, bool no_sync)
626624 /* Scan backup PG_XLOG_DIR */
627625 xlog_files_list = parray_new ();
628626 join_path_components (pg_xlog_path , database_path , PG_XLOG_DIR );
629- dir_list_file (xlog_files_list , pg_xlog_path , false, true, false, 0 ,
627+ dir_list_file (xlog_files_list , pg_xlog_path , false, true, false, true, 0 ,
630628 FIO_BACKUP_HOST );
631629
632630 /* TODO: Drop streamed WAL segments greater than stop_lsn */
@@ -884,15 +882,10 @@ do_backup(time_t start_time, bool no_validate,
884882#endif
885883
886884 get_ptrack_version (backup_conn , & nodeInfo );
887- // elog(WARNING, "ptrack_version_num %d", ptrack_version_num);
885+ // elog(WARNING, "ptrack_version_num %d", ptrack_version_num);
888886
889887 if (nodeInfo .ptrack_version_num > 0 )
890- {
891- if (nodeInfo .ptrack_version_num >= 20 )
892- nodeInfo .is_ptrack_enable = pg_ptrack_enable2 (backup_conn );
893- else
894- nodeInfo .is_ptrack_enable = pg_ptrack_enable (backup_conn );
895- }
888+ nodeInfo .is_ptrack_enable = pg_ptrack_enable (backup_conn , nodeInfo .ptrack_version_num );
896889
897890 if (current .backup_mode == BACKUP_MODE_DIFF_PTRACK )
898891 {
@@ -1746,65 +1739,66 @@ pg_stop_backup(pgBackup *backup, PGconn *pg_startbackup_conn,
17461739 /* Calculate LSN */
17471740 stop_backup_lsn_tmp = ((uint64 ) lsn_hi ) << 32 | lsn_lo ;
17481741
1742+ /* It is ok for replica to return invalid STOP LSN
1743+ * UPD: Apparently it is ok even for a master.
1744+ */
17491745 if (!XRecOffIsValid (stop_backup_lsn_tmp ))
17501746 {
1751- /* It is ok for replica to return STOP LSN with NullXRecOff
1752- * UPD: Apparently it is ok even for master.
1753- */
1754- if (XRecOffIsNull (stop_backup_lsn_tmp ))
1755- {
1756- char * xlog_path ,
1757- stream_xlog_path [MAXPGPATH ];
1758- XLogSegNo segno = 0 ;
1759- XLogRecPtr lsn_tmp = InvalidXLogRecPtr ;
1747+ char * xlog_path ,
1748+ stream_xlog_path [MAXPGPATH ];
1749+ XLogSegNo segno = 0 ;
1750+ XLogRecPtr lsn_tmp = InvalidXLogRecPtr ;
17601751
1761- /*
1762- * Even though the value is invalid, it's expected postgres behaviour
1763- * and we're trying to fix it below.
1764- */
1765- elog (LOG , "Null offset in stop_backup_lsn value %X/%X, trying to fix" ,
1766- (uint32 ) (stop_backup_lsn_tmp >> 32 ), (uint32 ) (stop_backup_lsn_tmp ));
1752+ /*
1753+ * Even though the value is invalid, it's expected postgres behaviour
1754+ * and we're trying to fix it below.
1755+ */
1756+ elog (LOG , "Invalid offset in stop_lsn value %X/%X, trying to fix" ,
1757+ (uint32 ) (stop_backup_lsn_tmp >> 32 ), (uint32 ) (stop_backup_lsn_tmp ));
17671758
1768- /*
1769- * Note: even with gdb it is very hard to produce automated tests for
1770- * contrecord + NullXRecOff , so emulate it for manual testing.
1771- */
1772- //stop_backup_lsn_tmp = stop_backup_lsn_tmp - XLOG_SEG_SIZE;
1773- //elog(WARNING, "New Invalid stop_backup_lsn value %X/%X",
1774- // (uint32) (stop_backup_lsn_tmp >> 32), (uint32) (stop_backup_lsn_tmp));
1759+ /*
1760+ * Note: even with gdb it is very hard to produce automated tests for
1761+ * contrecord + invalid LSN , so emulate it for manual testing.
1762+ */
1763+ //stop_backup_lsn_tmp = stop_backup_lsn_tmp - XLOG_SEG_SIZE;
1764+ //elog(WARNING, "New Invalid stop_backup_lsn value %X/%X",
1765+ // (uint32) (stop_backup_lsn_tmp >> 32), (uint32) (stop_backup_lsn_tmp));
17751766
1776- if (stream_wal )
1777- {
1778- pgBackupGetPath2 (backup , stream_xlog_path ,
1779- lengthof (stream_xlog_path ),
1780- DATABASE_DIR , PG_XLOG_DIR );
1781- xlog_path = stream_xlog_path ;
1782- }
1783- else
1784- xlog_path = arclog_path ;
1767+ if (stream_wal )
1768+ {
1769+ pgBackupGetPath2 (backup , stream_xlog_path ,
1770+ lengthof (stream_xlog_path ),
1771+ DATABASE_DIR , PG_XLOG_DIR );
1772+ xlog_path = stream_xlog_path ;
1773+ }
1774+ else
1775+ xlog_path = arclog_path ;
17851776
1786- GetXLogSegNo (stop_backup_lsn_tmp , segno , instance_config .xlog_seg_size );
1777+ GetXLogSegNo (stop_backup_lsn_tmp , segno , instance_config .xlog_seg_size );
17871778
1788- /*
1789- * Note, that there is no guarantee that corresponding WAL file even exists.
1790- * Replica may return LSN from future and keep staying in present.
1791- * Or it can return LSN with NullXRecOff .
1792- *
1793- * That's bad, since we want to get real LSN to save it in backup label file
1794- * and to use it in WAL validation.
1795- *
1796- * So we try to do the following:
1797- * 1. Wait 'archive_timeout' seconds for segment containing stop_lsn and
1798- * look for the first valid record in it.
1799- * It solves the problem of occasional invalid XRecOff on write-busy system.
1800- * 2. Failing that, look for record in previous segment with endpoint
1801- * equal or greater than stop_lsn. It may(!) solve the problem of NullXRecOff
1802- * on write-idle system. If that fails too, error out.
1803- */
1779+ /*
1780+ * Note, that there is no guarantee that corresponding WAL file even exists.
1781+ * Replica may return LSN from future and keep staying in present.
1782+ * Or it can return invalid LSN .
1783+ *
1784+ * That's bad, since we want to get real LSN to save it in backup label file
1785+ * and to use it in WAL validation.
1786+ *
1787+ * So we try to do the following:
1788+ * 1. Wait 'archive_timeout' seconds for segment containing stop_lsn and
1789+ * look for the first valid record in it.
1790+ * It solves the problem of occasional invalid LSN on write-busy system.
1791+ * 2. Failing that, look for record in previous segment with endpoint
1792+ * equal or greater than stop_lsn. It may(!) solve the problem of invalid LSN
1793+ * on write-idle system. If that fails too, error out.
1794+ */
18041795
1796+ /* stop_lsn is pointing to a 0 byte of xlog segment */
1797+ if (stop_backup_lsn_tmp % instance_config .xlog_seg_size == 0 )
1798+ {
18051799 /* Wait for segment with current stop_lsn, it is ok for it to never arrive */
18061800 wait_wal_lsn (stop_backup_lsn_tmp , false, backup -> tli ,
1807- false, true, WARNING , stream_wal );
1801+ false, true, WARNING , stream_wal );
18081802
18091803 /* Get the first record in segment with current stop_lsn */
18101804 lsn_tmp = get_first_record_lsn (xlog_path , segno , backup -> tli ,
@@ -1840,17 +1834,39 @@ pg_stop_backup(pgBackup *backup, PGconn *pg_startbackup_conn,
18401834 (uint32 ) (stop_backup_lsn_tmp >> 32 ),
18411835 (uint32 ) (stop_backup_lsn_tmp ));
18421836 }
1837+ }
1838+ /* stop lsn is aligned to xlog block size, just find next lsn */
1839+ else if (stop_backup_lsn_tmp % XLOG_BLCKSZ == 0 )
1840+ {
1841+ /* Wait for segment with current stop_lsn */
1842+ wait_wal_lsn (stop_backup_lsn_tmp , false, backup -> tli ,
1843+ false, true, ERROR , stream_wal );
1844+
1845+ /* Get the next closest record in segment with current stop_lsn */
1846+ lsn_tmp = get_next_record_lsn (xlog_path , segno , backup -> tli ,
1847+ instance_config .xlog_seg_size ,
1848+ instance_config .archive_timeout ,
1849+ stop_backup_lsn_tmp );
18431850
1844- /* Setting stop_backup_lsn will set stop point for streaming */
1845- stop_backup_lsn = lsn_tmp ;
1846- stop_lsn_exists = true;
1851+ /* sanity */
1852+ if (!XRecOffIsValid (lsn_tmp ) || XLogRecPtrIsInvalid (lsn_tmp ))
1853+ elog (ERROR , "Failed to get WAL record next to %X/%X" ,
1854+ (uint32 ) (stop_backup_lsn_tmp >> 32 ),
1855+ (uint32 ) (stop_backup_lsn_tmp ));
18471856 }
18481857 /* PostgreSQL returned something very illegal as STOP_LSN, error out */
18491858 else
18501859 elog (ERROR , "Invalid stop_backup_lsn value %X/%X" ,
18511860 (uint32 ) (stop_backup_lsn_tmp >> 32 ), (uint32 ) (stop_backup_lsn_tmp ));
1861+
1862+ /* Setting stop_backup_lsn will set stop point for streaming */
1863+ stop_backup_lsn = lsn_tmp ;
1864+ stop_lsn_exists = true;
18521865 }
18531866
1867+ elog (LOG , "stop_lsn: %X/%X" ,
1868+ (uint32 ) (stop_backup_lsn >> 32 ), (uint32 ) (stop_backup_lsn ));
1869+
18541870 /* Write backup_label and tablespace_map */
18551871 if (!exclusive_backup )
18561872 {
0 commit comments