From ca77456c3fb4b43a173d6153004face1355e3132 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Wed, 12 Nov 2025 13:59:06 +0200 Subject: [PATCH 1/6] Cleanup: Merge recv_recovery_read_checkpoint() to srv_start() --- storage/innobase/include/log0recv.h | 5 ----- storage/innobase/log/log0recv.cc | 24 ------------------------ storage/innobase/srv/srv0start.cc | 18 +++++++++++++++--- 3 files changed, 15 insertions(+), 32 deletions(-) diff --git a/storage/innobase/include/log0recv.h b/storage/innobase/include/log0recv.h index 457218656f439..95847965e1d4d 100644 --- a/storage/innobase/include/log0recv.h +++ b/storage/innobase/include/log0recv.h @@ -44,11 +44,6 @@ ATTRIBUTE_COLD MY_ATTRIBUTE((nonnull, warn_unused_result)) @return whether the page was recovered correctly */ bool recv_recover_page(fil_space_t* space, buf_page_t* bpage); -/** Read the latest checkpoint information from log file -and store it in log_sys.next_checkpoint and recv_sys.file_checkpoint -@return error code or DB_SUCCESS */ -dberr_t recv_recovery_read_checkpoint(); - /** Start recovering from a redo log checkpoint. 
of first system tablespace page @return error code or DB_SUCCESS */ diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc index d206b3908ba9e..d806471f75382 100644 --- a/storage/innobase/log/log0recv.cc +++ b/storage/innobase/log/log0recv.cc @@ -4693,30 +4693,6 @@ static dberr_t recv_rename_files() return err; } -dberr_t recv_recovery_read_checkpoint() -{ - ut_ad(srv_operation <= SRV_OPERATION_EXPORT_RESTORED || - srv_operation == SRV_OPERATION_RESTORE || - srv_operation == SRV_OPERATION_RESTORE_EXPORT); - ut_ad(!recv_sys.recovery_on); - ut_d(mysql_mutex_lock(&buf_pool.mutex)); - ut_ad(UT_LIST_GET_LEN(buf_pool.LRU) == 0); - ut_ad(UT_LIST_GET_LEN(buf_pool.unzip_LRU) == 0); - ut_d(mysql_mutex_unlock(&buf_pool.mutex)); - - if (srv_force_recovery >= SRV_FORCE_NO_LOG_REDO) - { - sql_print_information("InnoDB: innodb_force_recovery=6" - " skips redo log apply"); - return DB_SUCCESS; - } - - log_sys.latch.wr_lock(SRW_LOCK_CALL); - dberr_t err= recv_sys.find_checkpoint(); - log_sys.latch.wr_unlock(); - return err; -} - inline void log_t::set_recovered() noexcept { ut_ad(get_flushed_lsn() == get_lsn()); diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc index 318357432cfdf..e92e0ff54e078 100644 --- a/storage/innobase/srv/srv0start.cc +++ b/storage/innobase/srv/srv0start.cc @@ -1458,9 +1458,21 @@ dberr_t srv_start(bool create_new_db) } recv_sys.debug_free(); } else { - err = recv_recovery_read_checkpoint(); - if (err != DB_SUCCESS) { - return srv_init_abort(err); + ut_ad(srv_operation <= SRV_OPERATION_EXPORT_RESTORED + || srv_operation == SRV_OPERATION_RESTORE + || srv_operation == SRV_OPERATION_RESTORE_EXPORT); + ut_ad(!recv_sys.recovery_on); + + if (srv_force_recovery >= SRV_FORCE_NO_LOG_REDO) { + sql_print_information("InnoDB: innodb_force_recovery=6" + " skips redo log apply"); + } else { + log_sys.latch.wr_lock(SRW_LOCK_CALL); + err = recv_sys.find_checkpoint(); + log_sys.latch.wr_unlock(); + if (err != 
DB_SUCCESS) { + return srv_init_abort(err); + } } } From 9496a625301e56a8e04a5c71778b77dd0050b719 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Wed, 12 Nov 2025 13:59:15 +0200 Subject: [PATCH 2/6] MDEV-37949: Introduce innodb_log_recovery_start, innodb_log_recovery_target innodb_log_recovery_start: The checkpoint LSN to start recovery from. This will be useful when recovering from an archived log. innodb_log_recovery_target: The requested LSN to end recovery at. This will be useful when recovering data files that were copied as of a time that is before end of the available log. --- .../r/innodb_encrypt_log_corruption.result | 7 +++ .../innodb/r/corrupted_during_recovery.result | 10 +++- .../innodb/r/innodb-wl5522,strict_crc32.rdiff | 25 ++++++++- .../suite/innodb/r/innodb-wl5522.result | 7 +++ .../innodb/r/innodb_force_recovery.result | 7 +++ .../suite/innodb/r/log_corruption.result | 7 +++ mysql-test/suite/innodb/r/rename_table.result | 10 ++++ .../innodb/t/corrupted_during_recovery.test | 34 +++++++++--- mysql-test/suite/innodb/t/innodb-wl5522.test | 18 ++++++- .../suite/innodb/t/innodb_force_recovery.test | 13 +++++ mysql-test/suite/innodb/t/log_corruption.test | 11 +++- mysql-test/suite/innodb/t/rename_table.test | 15 ++++++ .../suite/sys_vars/r/sysvars_innodb.result | 24 +++++++++ storage/innobase/handler/ha_innodb.cc | 12 +++++ storage/innobase/include/log0recv.h | 5 ++ storage/innobase/log/log0recv.cc | 52 +++++++++++++++++-- 16 files changed, 241 insertions(+), 16 deletions(-) diff --git a/mysql-test/suite/encryption/r/innodb_encrypt_log_corruption.result b/mysql-test/suite/encryption/r/innodb_encrypt_log_corruption.result index 3c3e4831d8a0f..b7bdee10daa90 100644 --- a/mysql-test/suite/encryption/r/innodb_encrypt_log_corruption.result +++ b/mysql-test/suite/encryption/r/innodb_encrypt_log_corruption.result @@ -20,6 +20,13 @@ AND support IN ('YES', 'DEFAULT', 'ENABLED'); ENGINE SUPPORT COMMENT TRANSACTIONS XA SAVEPOINTS FOUND 1 
/InnoDB: Upgrade after a crash is not supported. This redo log was created before MariaDB 10\.2\.2, and we did not find a valid checkpoint/ in mysqld.1.err # empty redo log from before MariaDB 10.2.2 +# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-recovery-target=12345 +SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES +WHERE engine = 'innodb' +AND support IN ('YES', 'DEFAULT', 'ENABLED'); +COUNT(*) +0 +FOUND 1 /InnoDB: cannot fulfill innodb_log_recovery_target=12345!=/ in mysqld.1.err # restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=4m SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES WHERE engine = 'innodb' diff --git a/mysql-test/suite/innodb/r/corrupted_during_recovery.result b/mysql-test/suite/innodb/r/corrupted_during_recovery.result index 593943b4951ea..d063fcb0132cb 100644 --- a/mysql-test/suite/innodb/r/corrupted_during_recovery.result +++ b/mysql-test/suite/innodb/r/corrupted_during_recovery.result @@ -1,14 +1,19 @@ -CREATE TABLE t1(a BIGINT PRIMARY KEY) ENGINE=InnoDB; +CREATE TABLE t1(a BIGINT PRIMARY KEY) ENGINE=InnoDB STATS_PERSISTENT=0; INSERT INTO t1 VALUES(1); +SET GLOBAL innodb_max_purge_lag_wait=0, innodb_log_checkpoint_now=ON; CREATE TABLE t2(a BIGINT PRIMARY KEY) ENGINE=InnoDB; INSERT INTO t1 VALUES(2); SET GLOBAL innodb_flush_log_at_trx_commit=1; INSERT INTO t2 VALUES(1); # Kill the server +SELECT * FROM t2; +Got one of the listed errors +SELECT * FROM t2; +ERROR 42000: Unknown storage engine 'InnoDB' +FOUND 1 /InnoDB: impossible innodb_log_recovery_start=/ in mysqld.1.err # Corrupt the pages SELECT * FROM t1; ERROR 42000: Unknown storage engine 'InnoDB' -FOUND 1 /InnoDB: Page \[page id: space=[1-9][0-9]*, page number=3\] log sequence number 1311768467463790320 is in the future!/ in mysqld.1.err SELECT * FROM t1; a 1 
@@ -18,6 +23,7 @@ a CHECK TABLE t2; Table Op Msg_type Msg_text test.t2 check status OK +FOUND 1 /InnoDB: Page \[page id: space=[1-9][0-9]*, page number=3\] log sequence number 1311768467463790320 is in the future!/ in mysqld.1.err DROP TABLE t1, t2; CREATE TABLE t1(pk SERIAL) ENGINE=InnoDB; INSERT INTO t1 VALUES (1),(2),(3); diff --git a/mysql-test/suite/innodb/r/innodb-wl5522,strict_crc32.rdiff b/mysql-test/suite/innodb/r/innodb-wl5522,strict_crc32.rdiff index 283bbe96aae97..e4c5128b0377f 100644 --- a/mysql-test/suite/innodb/r/innodb-wl5522,strict_crc32.rdiff +++ b/mysql-test/suite/innodb/r/innodb-wl5522,strict_crc32.rdiff @@ -1,6 +1,27 @@ --- innodb-wl5522.result +++ innodb-wl5522,strict_crc32.result~ -@@ -131,8 +131,7 @@ +@@ -1,9 +1,6 @@ + call mtr.add_suppression("InnoDB: Unable to import tablespace .* because it already exists. Please DISCARD the tablespace before IMPORT\\."); + call mtr.add_suppression("Index for table 't2' is corrupt; try to repair it"); + call mtr.add_suppression("InnoDB: Cannot save statistics for table `test`\\.`t1` because the \\.ibd file is missing"); +-call mtr.add_suppression("InnoDB: cannot fulfill innodb_log_recovery_target=123456<"); +-call mtr.add_suppression("InnoDB: Plugin initialization aborted"); +-call mtr.add_suppression("Plugin 'InnoDB' registration as a STORAGE ENGINE failed\\."); + FLUSH TABLES; + CREATE TABLE t1 + (a INT AUTO_INCREMENT PRIMARY KEY, +@@ -37,10 +34,6 @@ + t1.ibd + t2.frm + t2.ibd +-# restart: --innodb-log-recovery-target=123456 +-FOUND 1 /InnoDB: cannot fulfill innodb_log_recovery_target=123456::max(), 0); + +static MYSQL_SYSVAR_UINT64_T(log_recovery_target, recv_sys.rpo, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "recovery point objective (end LSN; 0=unlimited)", + nullptr, nullptr, 0, 0, std::numeric_limits::max(), 0); + static MYSQL_SYSVAR_ULONGLONG(log_file_size, srv_log_file_size, PLUGIN_VAR_RQCMDARG, "Redo log size in bytes.", @@ -19864,6 +19874,8 @@ static struct st_mysql_sys_var* 
innobase_system_variables[]= { MYSQL_SYSVAR(log_file_write_through), MYSQL_SYSVAR(data_file_buffering), MYSQL_SYSVAR(data_file_write_through), + MYSQL_SYSVAR(log_recovery_start), + MYSQL_SYSVAR(log_recovery_target), MYSQL_SYSVAR(log_file_size), MYSQL_SYSVAR(log_write_ahead_size), MYSQL_SYSVAR(log_spin_wait_delay), diff --git a/storage/innobase/include/log0recv.h b/storage/innobase/include/log0recv.h index 95847965e1d4d..42aee0bc2af97 100644 --- a/storage/innobase/include/log0recv.h +++ b/storage/innobase/include/log0recv.h @@ -244,6 +244,11 @@ struct recv_sys_t lsn_t scanned_lsn; /** log sequence number at the end of the FILE_CHECKPOINT record, or 0 */ lsn_t file_checkpoint; + /** recovery start checkpoint */ + lsn_t recovery_start; + /** recovery point objective (a limit for scanned_lsn) */ + lsn_t rpo; + /** the time when progress was last reported */ time_t progress_time; diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc index d806471f75382..cb0831e8b07c9 100644 --- a/storage/innobase/log/log0recv.cc +++ b/storage/innobase/log/log0recv.cc @@ -57,6 +57,8 @@ Created 9/20/1997 Heikki Tuuri /** The recovery system */ recv_sys_t recv_sys; +/** 0 or the first LSN that would conflict with innodb_log_recovery_target */ +static lsn_t recv_sys_rpo_exceeded; /** TRUE when recv_init_crash_recovery() has been called. 
*/ bool recv_needed_recovery; #ifdef UNIV_DEBUG @@ -1688,6 +1690,16 @@ static dberr_t recv_log_recover_10_5(lsn_t lsn_offset) return DB_SUCCESS; } +/** @return if the specified innodb_log_recovery_target is being violated */ +static bool recv_sys_invalid_rpo(lsn_t lsn) noexcept +{ + if (!recv_sys.rpo || recv_sys.rpo >= lsn) + return false; + sql_print_error("InnoDB: cannot fulfill innodb_log_recovery_target=%" + PRIu64 "<%" PRIu64, recv_sys.rpo, lsn); + return true; +} + dberr_t recv_sys_t::find_checkpoint() { bool wrong_size= false; @@ -1779,6 +1791,12 @@ dberr_t recv_sys_t::find_checkpoint() log_sys.last_checkpoint_lsn= log_sys.next_checkpoint_lsn; log_sys.set_recovered_lsn(log_sys.next_checkpoint_lsn); lsn= file_checkpoint= log_sys.next_checkpoint_lsn; + if (recv_sys.rpo && recv_sys.rpo != lsn) + { + sql_print_error("InnoDB: cannot fulfill innodb_log_recovery_target=%" + PRIu64 "!=%" PRIu64, recv_sys.rpo, lsn); + return DB_CORRUPTION; + } if (UNIV_LIKELY(lsn != 0)) scanned_lsn= lsn; log_sys.next_checkpoint_no= 0; @@ -1855,6 +1873,8 @@ dberr_t recv_sys_t::find_checkpoint() } if (!log_sys.next_checkpoint_lsn) goto got_no_checkpoint; + if (recv_sys_invalid_rpo(lsn)) + return DB_READ_ONLY; if (!memcmp(creator, "Backup ", 7)) srv_start_after_restore= true; @@ -2422,8 +2442,16 @@ recv_sys_t::parse_mtr_result log_parse_start(source &l, unsigned nonce) return recv_sys_t::PREMATURE_EOF; eom_found: - if (*l != log_sys.get_sequence_bit((l - begin) + recv_sys.lsn)) + const lsn_t end_lsn{(l - begin) + recv_sys.lsn}; + + if (*l != log_sys.get_sequence_bit(end_lsn)) + return recv_sys_t::GOT_EOF; + + if (recv_sys.rpo && recv_sys.rpo < end_lsn) + { + recv_sys_rpo_exceeded= end_lsn; return recv_sys_t::GOT_EOF; + } if (l.is_eof(5 + nonce)) return recv_sys_t::PREMATURE_EOF; @@ -4765,6 +4793,7 @@ dberr_t recv_recovery_from_checkpoint_start() } recv_sys.recovery_on = true; + recv_sys_rpo_exceeded = 0; log_sys.latch.wr_lock(SRW_LOCK_CALL); log_sys.set_capacity(); @@ -4782,9 +4811,20 
@@ dberr_t recv_recovery_from_checkpoint_start() recv_sys_t::parser parser[2]; if (log_sys.is_recoverable()) { + if (recv_sys.recovery_start > log_sys.next_checkpoint_lsn) { + sql_print_error("InnoDB: impossible " + "innodb_log_recovery_start=%" PRIu64 + ">%" PRIu64, + recv_sys.recovery_start, + log_sys.next_checkpoint_lsn); + goto err_exit; + } else { + log_sys.last_checkpoint_lsn = recv_sys.recovery_start + ? recv_sys.recovery_start + : log_sys.next_checkpoint_lsn; + } const bool rewind = recv_sys.lsn - != log_sys.next_checkpoint_lsn; - log_sys.last_checkpoint_lsn = log_sys.next_checkpoint_lsn; + != log_sys.last_checkpoint_lsn; parser[false] = get_parse_mmap(); parser[true] = get_parse_mmap(); recv_scan_log(false, parser); @@ -4792,6 +4832,7 @@ dberr_t recv_recovery_from_checkpoint_start() read_only_recovery: sql_print_warning("InnoDB: innodb_read_only" " prevents crash recovery"); +read_only_reported: err = DB_READ_ONLY; goto func_exit; } @@ -4812,8 +4853,11 @@ dberr_t recv_recovery_from_checkpoint_start() } rescan = recv_scan_log(false, parser); - if (srv_read_only_mode && recv_needed_recovery) { + if (!recv_needed_recovery) { + } else if (srv_read_only_mode) { goto read_only_recovery; + } else if (recv_sys_invalid_rpo(recv_sys_rpo_exceeded)) { + goto read_only_reported; } if ((recv_sys.is_corrupt_log() && !srv_force_recovery) From 9d14e2c6fbe13265f3a040399b8bc5d5b8b1185f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Wed, 12 Nov 2025 16:34:37 +0200 Subject: [PATCH 3/6] WIP MDEV-37949: Implement innodb_log_archive, innodb_lsn_archived The new setting SET GLOBAL innodb_log_archive=ON will enable log archiving as soon as the current ib_logfile0 is about to wrap around an odd number of times. The new status variable innodb_lsn_archived will return 0 when log archiving is not enabled, or a LSN since when a complete InnoDB log archive is available. 
When innodb_log_archive=ON, the setting SET GLOBAL innodb_log_file_size will affect subsequently created log files when the file that is being currently written is running out. log_t::archive_new_write(): Create and allocate a new log file, and write the outstanding data to both the current and the new file. log_t::archive_new_mmap(): Create and memory-map a new log file, and update file_size to resize_target. log_t::archive_set_size(): Ensure that resize_target is set for new archived log files. log_write_buf(): Add the parameter max_length, the file wrap limit. mtr_t::finish_writer(): Specialize for innodb_log_archive=ON TODO: Rename log files and set innodb_lsn_archived on checkpoint. TODO: Implement recovery from archived log. --- .../innodb/r/innodb_status_variables.result | 1 + .../innodb/r/log_file_size_online.result | 13 +- mysql-test/suite/innodb/t/innodb-master.opt | 2 - .../suite/innodb/t/log_file_size_online.opt | 1 + .../suite/innodb/t/log_file_size_online.test | 28 +++- .../suite/sys_vars/r/sysvars_innodb.result | 14 +- sql/upgrade_conf_file.cc | 1 - storage/innobase/handler/ha_innodb.cc | 32 +++- storage/innobase/include/log0log.h | 24 ++- storage/innobase/include/mtr0mtr.h | 14 +- storage/innobase/log/log0log.cc | 137 ++++++++++++++++-- storage/innobase/mtr/mtr0mtr.cc | 107 ++++++++------ storage/innobase/srv/srv0start.cc | 4 + 13 files changed, 306 insertions(+), 72 deletions(-) create mode 100644 mysql-test/suite/innodb/t/log_file_size_online.opt diff --git a/mysql-test/suite/innodb/r/innodb_status_variables.result b/mysql-test/suite/innodb/r/innodb_status_variables.result index 194528c99da5c..9a758a83fa4ae 100644 --- a/mysql-test/suite/innodb/r/innodb_status_variables.result +++ b/mysql-test/suite/innodb/r/innodb_status_variables.result @@ -51,6 +51,7 @@ INNODB_LOG_WRITES INNODB_LSN_CURRENT INNODB_LSN_FLUSHED INNODB_LSN_LAST_CHECKPOINT +INNODB_LSN_ARCHIVED INNODB_MASTER_THREAD_ACTIVE_LOOPS INNODB_MASTER_THREAD_IDLE_LOOPS INNODB_MAX_TRX_ID diff 
--git a/mysql-test/suite/innodb/r/log_file_size_online.result b/mysql-test/suite/innodb/r/log_file_size_online.result index 8dcd9a47b2f0b..41c4f358cfb8e 100644 --- a/mysql-test/suite/innodb/r/log_file_size_online.result +++ b/mysql-test/suite/innodb/r/log_file_size_online.result @@ -1,3 +1,4 @@ +SET GLOBAL innodb_log_archive=OFF; SET GLOBAL innodb_log_file_size=4194304; SHOW VARIABLES LIKE 'innodb_log_file_size'; Variable_name Value @@ -11,7 +12,7 @@ a INT PRIMARY KEY AUTO_INCREMENT, b CHAR(255) NOT NULL) ENGINE=INNODB; INSERT INTO t SELECT NULL, REPEAT('a', 255) FROM seq_1_to_20000; -# restart: --innodb-log-file-size=4194304 +# restart: --innodb-log-file-size=4194304 --skip-innodb-log-archive SELECT COUNT(*) FROM t; COUNT(*) 20000 @@ -28,8 +29,15 @@ Got one of the listed errors connect con1,localhost,root; SET GLOBAL innodb_log_file_size=7340032; connection default; +SET GLOBAL innodb_log_archive=ON; +SET GLOBAL innodb_log_archive=OFF; KILL QUERY @id; connection con1; +SET GLOBAL innodb_log_archive=ON, innodb_log_file_size=10485760; +SELECT @@GLOBAL.innodb_log_file_size!=10485760; +@@GLOBAL.innodb_log_file_size!=10485760 +1 +SET GLOBAL innodb_log_archive=OFF; connection default; SET GLOBAL innodb_log_file_size=5242880; connection con1; @@ -46,6 +54,8 @@ connection con1; disconnect con1; connection default; # restart +SET @save_archive=@@GLOBAL.innodb_log_archive; +SET GLOBAL innodb_log_archive=OFF; SELECT * FROM t WHERE a<10; a b 1 @@ -73,4 +83,5 @@ SHOW VARIABLES LIKE 'innodb_log_file_size'; Variable_name Value innodb_log_file_size 5242880 FOUND 1 /InnoDB: Resized log to 6\.000MiB/ in mysqld.1.err +SET GLOBAL innodb_log_archive=@save_archive; DROP TABLE t; diff --git a/mysql-test/suite/innodb/t/innodb-master.opt b/mysql-test/suite/innodb/t/innodb-master.opt index 2e71d62206dbd..5266978e4f0a7 100644 --- a/mysql-test/suite/innodb/t/innodb-master.opt +++ b/mysql-test/suite/innodb/t/innodb-master.opt @@ -2,5 +2,3 @@ --default-storage-engine=MyISAM 
--innodb-strict-mode=0 --innodb-file-per-table=0 ---loose-innodb-track-changed-pages ---loose-innodb-log-archive diff --git a/mysql-test/suite/innodb/t/log_file_size_online.opt b/mysql-test/suite/innodb/t/log_file_size_online.opt new file mode 100644 index 0000000000000..1f9a83fbbfbcf --- /dev/null +++ b/mysql-test/suite/innodb/t/log_file_size_online.opt @@ -0,0 +1 @@ +--skip-innodb-log-archive diff --git a/mysql-test/suite/innodb/t/log_file_size_online.test b/mysql-test/suite/innodb/t/log_file_size_online.test index fb6722ee8d76e..8ae7952db8a5c 100644 --- a/mysql-test/suite/innodb/t/log_file_size_online.test +++ b/mysql-test/suite/innodb/t/log_file_size_online.test @@ -4,6 +4,13 @@ let SEARCH_FILE = $MYSQLTEST_VARDIR/log/mysqld.1.err; +SET GLOBAL innodb_log_archive=OFF; +let $wait_condition= +SELECT variable_value = 0 +FROM information_schema.global_status +WHERE variable_name = 'innodb_lsn_archived'; +--source include/wait_condition.inc + SET GLOBAL innodb_log_file_size=4194304; SHOW VARIABLES LIKE 'innodb_log_file_size'; SELECT global_value FROM information_schema.system_variables @@ -16,7 +23,7 @@ ENGINE=INNODB; INSERT INTO t SELECT NULL, REPEAT('a', 255) FROM seq_1_to_20000; ---let $restart_parameters=--innodb-log-file-size=4194304 +--let $restart_parameters=--innodb-log-file-size=4194304 --skip-innodb-log-archive --source include/restart_mysqld.inc SELECT COUNT(*) FROM t; @@ -41,10 +48,20 @@ let $ID= `SELECT @id := CONNECTION_ID()`; send SET GLOBAL innodb_log_file_size=7340032; --connection default let $ignore= `SELECT @id := $ID`; +--error 0,ER_WRONG_USAGE +SET GLOBAL innodb_log_archive=ON; +SET GLOBAL innodb_log_archive=OFF; + KILL QUERY @id; --connection con1 reap; +# When innodb_log_archive=ON, SET GLOBAL innodb_log_file_size is instantaneous +# but will not reflect the file size. 
+SET GLOBAL innodb_log_archive=ON, innodb_log_file_size=10485760; +SELECT @@GLOBAL.innodb_log_file_size!=10485760; +SET GLOBAL innodb_log_archive=OFF; + --connection default send SET GLOBAL innodb_log_file_size=5242880; @@ -66,6 +83,14 @@ reap; --let $restart_parameters= --source include/restart_mysqld.inc +SET @save_archive=@@GLOBAL.innodb_log_archive; +SET GLOBAL innodb_log_archive=OFF; +let $wait_condition= +SELECT variable_value = 0 +FROM information_schema.global_status +WHERE variable_name = 'innodb_lsn_archived'; +--source include/wait_condition.inc + SELECT * FROM t WHERE a<10; SELECT COUNT(*),LENGTH(b) FROM t GROUP BY b; @@ -76,5 +101,6 @@ SET GLOBAL innodb_log_file_size=5242880; SHOW VARIABLES LIKE 'innodb_log_file_size'; let SEARCH_PATTERN = InnoDB: Resized log to 6\\.000MiB; --source include/search_pattern_in_file.inc +SET GLOBAL innodb_log_archive=@save_archive; DROP TABLE t; diff --git a/mysql-test/suite/sys_vars/r/sysvars_innodb.result b/mysql-test/suite/sys_vars/r/sysvars_innodb.result index d355c64c60817..d4a4c68b980b5 100644 --- a/mysql-test/suite/sys_vars/r/sysvars_innodb.result +++ b/mysql-test/suite/sys_vars/r/sysvars_innodb.result @@ -932,6 +932,18 @@ NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO COMMAND_LINE_ARGUMENT REQUIRED +VARIABLE_NAME INNODB_LOG_ARCHIVE +SESSION_VALUE NULL +DEFAULT_VALUE OFF +VARIABLE_SCOPE GLOBAL +VARIABLE_TYPE BOOLEAN +VARIABLE_COMMENT Whether log archiving is desired (innodb_lsn_archived>0 if enabled) +NUMERIC_MIN_VALUE NULL +NUMERIC_MAX_VALUE NULL +NUMERIC_BLOCK_SIZE NULL +ENUM_VALUE_LIST OFF,ON +READ_ONLY NO +COMMAND_LINE_ARGUMENT OPTIONAL VARIABLE_NAME INNODB_LOG_BUFFER_SIZE SESSION_VALUE NULL DEFAULT_VALUE 16777216 @@ -973,7 +985,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 100663296 VARIABLE_SCOPE GLOBAL VARIABLE_TYPE BIGINT UNSIGNED -VARIABLE_COMMENT Redo log size in bytes. 
+VARIABLE_COMMENT Desired size of ib_logfile0 in bytes NUMERIC_MIN_VALUE 4194304 NUMERIC_MAX_VALUE 18446744073709551615 NUMERIC_BLOCK_SIZE 4096 diff --git a/sql/upgrade_conf_file.cc b/sql/upgrade_conf_file.cc index 0d7bc6034685c..f1fa9aac6ba3d 100644 --- a/sql/upgrade_conf_file.cc +++ b/sql/upgrade_conf_file.cc @@ -97,7 +97,6 @@ static const char *removed_variables[] = "innodb_locks_unsafe_for_binlog", "innodb_log_arch_dir", "innodb_log_arch_expire_sec", -"innodb_log_archive", "innodb_log_block_size", "innodb_log_checksum_algorithm", "innodb_log_checksums", diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index d77934bd871f2..5c0aafa90dbf0 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -950,6 +950,7 @@ static SHOW_VAR innodb_status_variables[]= { {"lsn_flushed", &export_vars.innodb_lsn_flushed, SHOW_ULONGLONG}, {"lsn_last_checkpoint", &export_vars.innodb_lsn_last_checkpoint, SHOW_ULONGLONG}, + {"lsn_archived", &log_sys.archived_lsn, SHOW_ULONGLONG}, {"master_thread_active_loops", &srv_main_active_loops, SHOW_SIZE_T}, {"master_thread_idle_loops", &srv_main_idle_loops, SHOW_SIZE_T}, {"max_trx_id", &export_vars.innodb_max_trx_id, SHOW_ULONGLONG}, @@ -19430,6 +19431,34 @@ static MYSQL_SYSVAR_BOOL(data_file_write_through, fil_system.write_through, "Whether each write to data files writes through", nullptr, innodb_data_file_write_through_update, FALSE); +static void innodb_log_archive_update(THD *, st_mysql_sys_var*, + void *, const void *save) noexcept +{ + const my_bool archive= *static_cast(save); + log_sys.latch.wr_lock(SRW_LOCK_CALL); + const lsn_t resizing{log_sys.resize_in_progress()}; + if (archive && UNIV_UNLIKELY(resizing != 0)) + my_printf_error(ER_WRONG_USAGE, + "SET GLOBAL innodb_log_file_size is in progress", MYF(0)); + else + { + log_sys.archive= archive; + if (!resizing) + { + if (archive) + log_sys.archive_set_size(); + mtr_t::finisher_update(); + } + } + 
log_sys.archived_lsn= 0; // FIXME: move this to log_t::write_checkpoint() + log_sys.latch.wr_unlock(); +} + +static MYSQL_SYSVAR_BOOL(log_archive, log_sys.archive, + PLUGIN_VAR_OPCMDARG, + "Whether log archiving is desired (innodb_lsn_archived>0 if enabled)", + nullptr, innodb_log_archive_update, FALSE); + static MYSQL_SYSVAR_UINT64_T(log_recovery_start, recv_sys.recovery_start, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, "checkpoint LSN to start recovery from (0=automatic)", @@ -19442,7 +19471,7 @@ static MYSQL_SYSVAR_UINT64_T(log_recovery_target, recv_sys.rpo, static MYSQL_SYSVAR_ULONGLONG(log_file_size, srv_log_file_size, PLUGIN_VAR_RQCMDARG, - "Redo log size in bytes.", + "Desired size of ib_logfile0 in bytes", nullptr, innodb_log_file_size_update, 96 << 20, 4 << 20, std::numeric_limits::max(), 4096); @@ -19874,6 +19903,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(log_file_write_through), MYSQL_SYSVAR(data_file_buffering), MYSQL_SYSVAR(data_file_write_through), + MYSQL_SYSVAR(log_archive), MYSQL_SYSVAR(log_recovery_start), MYSQL_SYSVAR(log_recovery_target), MYSQL_SYSVAR(log_file_size), diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h index e80011a9c4c50..37ad66552e5e3 100644 --- a/storage/innobase/include/log0log.h +++ b/storage/innobase/include/log0log.h @@ -247,13 +247,16 @@ struct log_t lsn_t (*writer)() noexcept; /** next checkpoint LSN (protected by latch.wr_lock()) */ lsn_t next_checkpoint_lsn; + /** start of archived log, or 0 (protected by latch.wr_lock()) */ + lsn_t archived_lsn; /** Log file */ log_file_t log; private: /** Log file being constructed during resizing; protected by latch */ log_file_t resize_log; - /** size of resize_log; protected by latch */ + /** size of resize_log, or the requested innodb_log_file_size + of the next file created if archive==TRUE; protected by latch */ lsn_t resize_target; /** Buffer for writing to resize_log; @see buf */ byte *resize_buf; @@ -270,6 
+273,8 @@ struct log_t uint write_size; /** format of the redo log: e.g., FORMAT_10_8 */ uint32_t format; + /** the current value of innodb_log_archive; protected by latch.wr_lock() */ + my_bool archive; /** whether the memory-mapped interface is enabled for the log */ my_bool log_mmap; /** the default value of log_mmap */ @@ -455,7 +460,24 @@ struct log_t /** Persist the log. @param lsn desired new value of flushed_to_disk_lsn */ void persist(lsn_t lsn) noexcept; + /** Create, allocate and map a new log file. */ + ATTRIBUTE_COLD void archive_new_mmap() noexcept; #endif + /** Create a new log file when the current one will fill up. + @param buf log records to append + @param length size of the log records, in bytes + @param offset log file offset */ + ATTRIBUTE_COLD void archive_new_write(const byte *buf, size_t length, + lsn_t offset) noexcept; + + /** Ensure that innodb_log_archive=ON will default to the current + innodb_log_file_size if no size has been specified. */ + void archive_set_size() noexcept + { + ut_ad(!resize_in_progress()); + if (!resize_target) + resize_target= file_size; + } bool check_for_checkpoint() const { diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h index 82756ee43cf0d..accd5dd0932e5 100644 --- a/storage/innobase/include/mtr0mtr.h +++ b/storage/innobase/include/mtr0mtr.h @@ -701,12 +701,22 @@ struct mtr_t { @return {start_lsn,flush_ahead_lsn} */ std::pair do_write() noexcept; + /** How to write log */ + enum finish_writing { + /** circular memory-mapped writing when log_sys.is_mmap() */ + CIRCULAR_MMAP, + /** memory-mapped log for log_sys.archive */ + ARCHIVED_MMAP, + /** normal writing !log_sys.is_mmap() */ + WRITE_NORMAL + }; + /** Append the redo log records to the redo log buffer. 
- @tparam mmap log_sys.is_mmap() + @tparam how how to write @param mtr mini-transaction @param len number of bytes to write @return {start_lsn,flush_ahead_lsn} */ - template static + template static std::pair finish_writer(mtr_t *mtr, size_t len); /** The applicable variant of commit_log() */ diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc index eb5fc3fe721d5..03c4f6c772ea2 100644 --- a/storage/innobase/log/log0log.cc +++ b/storage/innobase/log/log0log.cc @@ -617,6 +617,14 @@ log_t::resize_start_status log_t::resize_start(os_offset_t size, void *thd) status= RESIZE_NO_CHANGE; else if (resize_in_progress()) status= RESIZE_IN_PROGRESS; + else if (archive) + { + status= RESIZE_NO_CHANGE; + /* When the current log becomes full and a new archivable log file + is being created, it will be of this size. At that point we will assign + file_size= resize_target, resize_target= 0; */ + resize_target= size; + } else { lsn_t start_lsn; @@ -758,10 +766,13 @@ void log_t::resize_abort(void *thd) noexcept } /** Write an aligned buffer to ib_logfile0. 
-@param buf buffer to be written -@param length length of data to be written -@param offset log file offset */ -static void log_write_buf(const byte *buf, size_t length, lsn_t offset) +@param max_length the maximum length that can be written to the file +@param buf buffer to be written +@param length length of data to be written +@param offset log file offset */ +static void log_write_buf(lsn_t max_length, + const byte *buf, size_t length, lsn_t offset) + noexcept { ut_ad(write_lock.is_owner()); ut_ad(!recv_no_log_write); @@ -770,21 +781,65 @@ static void log_write_buf(const byte *buf, size_t length, lsn_t offset) ut_ad(!(length & block_size_1)); ut_ad(!(size_t(buf) & block_size_1)); ut_ad(length); + ut_ad(max_length == log_sys.file_size - offset); - const lsn_t maximum_write_length{log_sys.file_size - offset}; - ut_ad(maximum_write_length <= log_sys.file_size - log_sys.START_OFFSET); - - if (UNIV_UNLIKELY(length > maximum_write_length)) + if (UNIV_UNLIKELY(length > max_length)) { - log_sys.log.write(offset, {buf, size_t(maximum_write_length)}); - length-= size_t(maximum_write_length); - buf+= size_t(maximum_write_length); + ut_ad(!log_sys.archive); + ut_ad(!log_sys.archived_lsn); + log_sys.log.write(offset, {buf, size_t(max_length)}); + length-= size_t(max_length); + buf+= size_t(max_length); ut_ad(log_sys.START_OFFSET + length < offset); offset= log_sys.START_OFFSET; } log_sys.log.write(offset, {buf, length}); } +static const char *const logfile_new= "ib_logfile_new"; + +ATTRIBUTE_COLD void log_t::archive_new_write(const byte *buf, size_t length, + lsn_t offset) noexcept +{ + ut_ad(latch_have_wr()); + ut_ad(write_lock.is_owner()); + ut_ad(archive); + ut_ad(length >= file_size - offset); + ut_ad(!resize_log.is_opened()); + ut_ad(!resize_buf); + ut_ad(!resize_in_progress()); + ut_ad(resize_target >= 4U << 20); + ut_ad(is_latest()); + + const size_t first{size_t(file_size - offset)}; + log.write(offset, {buf, first}); + length-= first; + buf+= first; + + std::string 
path{get_log_file_path(logfile_new)}; + bool success; + pfs_os_file_t file= + os_file_create_func(path.c_str(), OS_FILE_CREATE, OS_LOG_FILE, + false, &success); + ut_ad(success == (file != OS_FILE_CLOSED)); + if (file != OS_FILE_CLOSED) + { + if (os_file_set_size(path.c_str(), file, resize_target)) + { + log_sys.log.close(); + log_sys.log.m_file= file; + if (length) + log_sys.log.write(START_OFFSET, {buf, length}); + return; + } + os_file_close(file); + IF_WIN(DeleteFile(path.c_str()), unlink(path.c_str())); + } + sql_print_error("[FATAL] InnoDB: Failed to create %s of %" PRIu64 + " bytes", path.c_str(), resize_target); + abort(); +} + /** Invoke commit_checkpoint_notify_ha() to notify that outstanding log writes have been completed. */ void log_flush_notify(lsn_t flush_lsn); @@ -911,6 +966,44 @@ static size_t log_pad(lsn_t lsn, size_t pad, byte *begin, byte *extra) #endif #ifdef HAVE_PMEM +ATTRIBUTE_COLD void log_t::archive_new_mmap() noexcept +{ + ut_ad(latch_have_any()); + ut_ad(!resize_log.is_opened()); + ut_ad(!resize_buf); + ut_ad(!resize_in_progress()); + ut_ad(resize_target >= 4U << 20); + ut_ad(is_latest()); + std::string path{get_log_file_path(logfile_new)}; + bool success; + pfs_os_file_t file= + os_file_create_func(path.c_str(), OS_FILE_CREATE, OS_LOG_FILE, + false, &success); + ut_ad(success == (file != OS_FILE_CLOSED)); + if (file != OS_FILE_CLOSED) + { + if (os_file_set_size(path.c_str(), file, resize_target)) + { + bool is_pmem{false}; + void *ptr= ::log_mmap(file, is_pmem, resize_target); + os_file_close(file); + if (ptr != MAP_FAILED) + { + buf= static_cast(ptr); + file_size= resize_target; + return; + } + } + else + os_file_close(file); + + IF_WIN(DeleteFile(path.c_str()), unlink(path.c_str())); + } + sql_print_error("[FATAL] InnoDB: Failed to create and map %s of %" PRIu64 + " bytes", path.c_str(), resize_target); + abort(); +} + void log_t::persist(lsn_t lsn) noexcept { ut_ad(!is_opened()); @@ -1092,18 +1185,32 @@ lsn_t log_t::write_buf() 
noexcept ut_ad(base + (write_lsn_offset & (WRITE_TO_BUF - 1)) == lsn); write_to_log++; + DBUG_PRINT("ib_log", ("write " LSN_PF " to " LSN_PF " at " LSN_PF, + write_lsn, lsn, offset)); + + const lsn_t max_length{file_size - offset}; + ut_ad(max_length <= capacity()); + if (UNIV_UNLIKELY(length >= max_length)) + { + if (resizing != RESIZING && archive) + { + archive_new_write(write_buf, length, offset); + if (resizing != RETAIN_LATCH) + latch.wr_unlock(); + goto written; + } + archived_lsn= 0; + } if (resizing != RETAIN_LATCH) latch.wr_unlock(); - DBUG_PRINT("ib_log", ("write " LSN_PF " to " LSN_PF " at " LSN_PF, - write_lsn, lsn, offset)); - /* Do the write to the log file */ - log_write_buf(write_buf, length, offset); + log_write_buf(max_length, write_buf, length, offset); if (UNIV_LIKELY_NULL(re_write_buf)) resize_write_buf(re_write_buf, length); + written: write_lsn= lsn; if (UNIV_UNLIKELY(srv_shutdown_state > SRV_SHUTDOWN_INITIATED)) diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc index 952850b3f499d..bca459b9292ef 100644 --- a/storage/innobase/mtr/mtr0mtr.cc +++ b/storage/innobase/mtr/mtr0mtr.cc @@ -52,12 +52,14 @@ void mtr_t::finisher_update() if (log_sys.is_mmap()) { commit_logger= mtr_t::commit_log; - finisher= mtr_t::finish_writer; + finisher= log_sys.archive + ? 
mtr_t::finish_writer + : mtr_t::finish_writer; return; } commit_logger= mtr_t::commit_log; #endif - finisher= mtr_t::finish_writer; + finisher= mtr_t::finish_writer; } void mtr_memo_slot_t::release() const @@ -920,6 +922,7 @@ ATTRIBUTE_COLD void log_t::append_prepare_wait(bool late, bool ex) noexcept { ut_ad(lsn - get_flushed_lsn(std::memory_order_relaxed) < capacity() || overwrite_warned); + ut_a(!archive); // FIXME: create, allocate and attach a new file persist(lsn); } #endif @@ -1214,83 +1217,93 @@ inline void log_t::append(byte *&d, const void *s, size_t size) noexcept d+= size; } -template -std::pair mtr_t::finish_writer(mtr_t *mtr, size_t len) +template +std::pair +mtr_t::finish_writer(mtr_t *mtr, size_t len) { ut_ad(log_sys.is_latest()); ut_ad(!recv_no_log_write); ut_ad(mtr->is_logged()); ut_ad(mtr->m_latch_ex ? log_sys.latch_have_wr() : log_sys.latch_have_rd()); ut_ad(len < recv_sys.MTR_SIZE_MAX); + ut_ad(how == WRITE_NORMAL || log_sys.archive == (how == ARCHIVED_MMAP)); const size_t size{mtr->m_commit_lsn ? 
5U + 8U : 5U}; std::pair start= - log_sys.append_prepare(len, mtr->m_latch_ex); + log_sys.append_prepare(len, mtr->m_latch_ex); - if (!mmap) - { + if (how == WRITE_NORMAL || + UNIV_LIKELY(start.second + len <= &log_sys.buf[log_sys.file_size])) for (const mtr_buf_t::block_t &b : mtr->m_log) log_sys.append(start.second, b.begin(), b.used()); - - write_trailer: - *start.second++= log_sys.get_sequence_bit(start.first + len - size); - if (mtr->m_commit_lsn) - { - mach_write_to_8(start.second, mtr->m_commit_lsn); - mtr->m_crc= my_crc32c(mtr->m_crc, start.second, 8); - start.second+= 8; - } - mach_write_to_4(start.second, mtr->m_crc); - start.second+= 4; - } +#ifdef HAVE_PMEM else { - if (UNIV_LIKELY(start.second + len <= &log_sys.buf[log_sys.file_size])) - { - for (const mtr_buf_t::block_t &b : mtr->m_log) - log_sys.append(start.second, b.begin(), b.used()); - goto write_trailer; - } + byte *const end= &log_sys.buf[log_sys.file_size]; + if (how == ARCHIVED_MMAP) + log_sys.archive_new_mmap(); + else + log_sys.archived_lsn= 0; + byte *const begin= &log_sys.buf[log_sys.START_OFFSET]; for (const mtr_buf_t::block_t &b : mtr->m_log) { size_t size{b.used()}; - const size_t size_left(&log_sys.buf[log_sys.file_size] - start.second); + const size_t size_left(end - start.second); const byte *src= b.begin(); if (size > size_left) { ::memcpy(start.second, src, size_left); - start.second= &log_sys.buf[log_sys.START_OFFSET]; + start.second= begin; src+= size_left; size-= size_left; } ::memcpy(start.second, src, size); start.second+= size; } - const size_t size_left(&log_sys.buf[log_sys.file_size] - start.second); - if (size_left > size) - goto write_trailer; + const size_t size_left(end - start.second); + if (size_left <= size) + { + byte tail[5 + 8]; + tail[0]= log_sys.get_sequence_bit(start.first + len - size); - byte tail[5 + 8]; - tail[0]= log_sys.get_sequence_bit(start.first + len - size); + if (mtr->m_commit_lsn) + { + mach_write_to_8(tail + 1, mtr->m_commit_lsn); + mtr->m_crc= 
my_crc32c(mtr->m_crc, tail + 1, 8); + mach_write_to_4(tail + 9, mtr->m_crc); + } + else + mach_write_to_4(tail + 1, mtr->m_crc); - if (mtr->m_commit_lsn) - { - mach_write_to_8(tail + 1, mtr->m_commit_lsn); - mtr->m_crc= my_crc32c(mtr->m_crc, tail + 1, 8); - mach_write_to_4(tail + 9, mtr->m_crc); + ::memcpy(start.second, tail, size_left); + ::memcpy(begin, tail + size_left, size - size_left); + start.second= ((size >= size_left) ? begin : end) + (size - size_left); + goto wrote_trailer; } - else - mach_write_to_4(tail + 1, mtr->m_crc); - - ::memcpy(start.second, tail, size_left); - ::memcpy(log_sys.buf + log_sys.START_OFFSET, tail + size_left, - size - size_left); - start.second= log_sys.buf + - ((size >= size_left) ? log_sys.START_OFFSET : log_sys.file_size) + - (size - size_left); } +#endif - log_sys.resize_write(start.first, start.second, len, size); + *start.second++= log_sys.get_sequence_bit(start.first + len - size); + + if (mtr->m_commit_lsn) + { + mach_write_to_8(start.second, mtr->m_commit_lsn); + mtr->m_crc= my_crc32c(mtr->m_crc, start.second, 8); + start.second+= 8; + } + mach_write_to_4(start.second, mtr->m_crc); + start.second+= 4; + +#ifdef HAVE_PMEM +wrote_trailer: +#else + static_assert(how == WRITE_NORMAL, ""); +#endif + + if (how == ARCHIVED_MMAP) + ut_ad(!log_sys.resize_in_progress()); + else + log_sys.resize_write(start.first, start.second, len, size); mtr->m_commit_lsn= start.first + len; return {start.first, log_close(mtr->m_commit_lsn)}; diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc index e92e0ff54e078..806a4c334693b 100644 --- a/storage/innobase/srv/srv0start.cc +++ b/storage/innobase/srv/srv0start.cc @@ -1611,6 +1611,8 @@ dberr_t srv_start(bool create_new_db) if (log_sys.resize_rename()) { return(srv_init_abort(DB_ERROR)); } + + if (log_sys.archive) log_sys.archive_set_size(); } else { /* Suppress warnings in fil_space_t::create() for files that are being read before dict_boot() has recovered @@ -1732,6 
+1734,8 @@ dberr_t srv_start(bool create_new_db) recv_sys.debug_free(); + if (log_sys.archive) log_sys.archive_set_size(); + if (!srv_read_only_mode) { const uint32_t flags = FSP_FLAGS_PAGE_SSIZE(); for (uint32_t id = srv_undo_space_id_start; From 6d4d0b5eb24460a7e751d2ff7949dd15c25dad6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Fri, 14 Nov 2025 15:11:50 +0200 Subject: [PATCH 4/6] squash! 9d14e2c6fbe13265f3a040399b8bc5d5b8b1185f NOTE! innodb_log_archive=ON writes will not work yet! innodb_log_archive_update(): Assign log_sys.archived_lsn to the latest checkpoint; it should be usable. If the log buffer is about to wrap around, back off and retry. log_t::first_lsn: This should be the start of the current log file, to be consulted in log_t::write_checkpoint() when renaming files. log_t::append_prepare_archived_mmap(): Create and memory-map a new log file. TODO: Do we need a separate function from append_prepare_wait()? log_t::append_prepare(): Special case. log_t::archive_new_mmap(): Switch to the buffer that was created in append_prepare_archived_mmap(). 
--- .../suite/innodb/t/log_file_size_online.test | 10 -- storage/innobase/handler/ha_innodb.cc | 41 ++++--- storage/innobase/include/log0log.h | 38 +++++- storage/innobase/include/mtr0mtr.h | 12 +- storage/innobase/log/log0log.cc | 116 ++++++++++++++---- storage/innobase/log/log0recv.cc | 1 + storage/innobase/mtr/mtr0mtr.cc | 99 ++++++++++++--- 7 files changed, 234 insertions(+), 83 deletions(-) diff --git a/mysql-test/suite/innodb/t/log_file_size_online.test b/mysql-test/suite/innodb/t/log_file_size_online.test index 8ae7952db8a5c..f835e54e6719b 100644 --- a/mysql-test/suite/innodb/t/log_file_size_online.test +++ b/mysql-test/suite/innodb/t/log_file_size_online.test @@ -5,11 +5,6 @@ let SEARCH_FILE = $MYSQLTEST_VARDIR/log/mysqld.1.err; SET GLOBAL innodb_log_archive=OFF; -let $wait_condition= -SELECT variable_value = 0 -FROM information_schema.global_status -WHERE variable_name = 'innodb_lsn_archived'; ---source include/wait_condition.inc SET GLOBAL innodb_log_file_size=4194304; SHOW VARIABLES LIKE 'innodb_log_file_size'; @@ -85,11 +80,6 @@ reap; SET @save_archive=@@GLOBAL.innodb_log_archive; SET GLOBAL innodb_log_archive=OFF; -let $wait_condition= -SELECT variable_value = 0 -FROM information_schema.global_status -WHERE variable_name = 'innodb_lsn_archived'; ---source include/wait_condition.inc SELECT * FROM t WHERE a<10; SELECT COUNT(*),LENGTH(b) FROM t GROUP BY b; diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 5c0aafa90dbf0..538c5b742a028 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -19434,24 +19434,37 @@ static MYSQL_SYSVAR_BOOL(data_file_write_through, fil_system.write_through, static void innodb_log_archive_update(THD *, st_mysql_sys_var*, void *, const void *save) noexcept { - const my_bool archive= *static_cast(save); - log_sys.latch.wr_lock(SRW_LOCK_CALL); - const lsn_t resizing{log_sys.resize_in_progress()}; - if (archive && UNIV_UNLIKELY(resizing != 0)) 
- my_printf_error(ER_WRONG_USAGE, - "SET GLOBAL innodb_log_file_size is in progress", MYF(0)); - else + for (const my_bool archive= *static_cast(save);;) { - log_sys.archive= archive; - if (!resizing) + log_sys.latch.wr_lock(SRW_LOCK_CALL); + const lsn_t resizing{log_sys.resize_in_progress()}; + if (archive && UNIV_UNLIKELY(resizing != 0)) + my_printf_error(ER_WRONG_USAGE, + "SET GLOBAL innodb_log_file_size is in progress", MYF(0)); + else { - if (archive) - log_sys.archive_set_size(); - mtr_t::finisher_update(); + log_sys.archive= archive; + if (!resizing) + { +#ifdef HAVE_PMEM + if (log_sys.is_backoff() && log_sys.is_mmap()) + { + /* Prevent a race condition with log_t::append_prepare() */ + log_sys.latch.wr_unlock(); + continue; + } +#endif + if (archive) + { + log_sys.archived_lsn= log_sys.next_checkpoint_lsn; + log_sys.archive_set_size(); + } + mtr_t::finisher_update(); + } } + log_sys.latch.wr_unlock(); + return; } - log_sys.archived_lsn= 0; // FIXME: move this to log_t::write_checkpoint() - log_sys.latch.wr_unlock(); } static MYSQL_SYSVAR_BOOL(log_archive, log_sys.archive, diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h index 37ad66552e5e3..bb38cca63fe8e 100644 --- a/storage/innobase/include/log0log.h +++ b/storage/innobase/include/log0log.h @@ -258,7 +258,9 @@ struct log_t /** size of resize_log, or the requested innodb_log_file_size of the next file created if archive==TRUE; protected by latch */ lsn_t resize_target; - /** Buffer for writing to resize_log; @see buf */ + /** Buffer for writing to resize_log; @see buf + Also a spare buffer between append_prepare_archived_mmap() and + archive_new_mmap() */ byte *resize_buf; /** Buffer for writing to resize_log; @see flush_buf */ byte *resize_flush_buf; @@ -266,7 +268,7 @@ struct log_t /** log sequence number when log resizing was initiated; 0 if the log is not being resized, 1 if resize_start() is in progress */ std::atomic resize_lsn; - /** the log sequence number at the 
start of the log file */ + /** the log sequence number at the start of the current log file */ lsn_t first_lsn; public: /** current innodb_log_write_ahead_size */ @@ -439,6 +441,13 @@ struct log_t (write_lsn_offset & (WRITE_BACKOFF - 1)); } + /** @return whether a back-off in a log write is in progress */ + bool is_backoff() const noexcept + { + ut_ad(latch_have_wr()); + return write_lsn_offset & WRITE_BACKOFF; + } + lsn_t get_flushed_lsn(std::memory_order order= std::memory_order_acquire) const noexcept { return flushed_to_disk_lsn.load(order); } @@ -460,8 +469,8 @@ struct log_t /** Persist the log. @param lsn desired new value of flushed_to_disk_lsn */ void persist(lsn_t lsn) noexcept; - /** Create, allocate and map a new log file. */ - ATTRIBUTE_COLD void archive_new_mmap() noexcept; + /** Switch the log buffers. */ + inline void archive_new_mmap() noexcept; #endif /** Create a new log file when the current one will fill up. @param buf log records to append @@ -511,13 +520,30 @@ struct log_t @param late whether the WRITE_BACKOFF flag had already been set @param ex whether log_sys.latch is exclusively locked */ ATTRIBUTE_COLD void append_prepare_wait(bool late, bool ex) noexcept; +#ifdef HAVE_PMEM + /** Wait in append_prepare() for buffer to become available + @param late whether the WRITE_BACKOFF flag had already been set + @param ex whether log_sys.latch is exclusively locked */ + ATTRIBUTE_COLD void append_prepare_archived_mmap(bool late, bool ex) + noexcept; +#endif public: + /** How to write log */ + enum write { + /** normal writing !log_sys.is_mmap() */ + WRITE_NORMAL, + /** circular memory-mapped writing when log_sys.is_mmap() */ + CIRCULAR_MMAP, + /** memory-mapped log for log_sys.archive */ + ARCHIVED_MMAP + }; + /** Reserve space in the log buffer for appending data. 
- @tparam mmap log_sys.is_mmap() + @tparam mode how to write log @param size total length of the data to append(), in bytes @param ex whether log_sys.latch is exclusively locked @return the start LSN and the buffer position for append() */ - template + template std::pair append_prepare(size_t size, bool ex) noexcept; /** Append a string of bytes to the redo log. diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h index accd5dd0932e5..567fad8a160e9 100644 --- a/storage/innobase/include/mtr0mtr.h +++ b/storage/innobase/include/mtr0mtr.h @@ -701,22 +701,12 @@ struct mtr_t { @return {start_lsn,flush_ahead_lsn} */ std::pair do_write() noexcept; - /** How to write log */ - enum finish_writing { - /** circular memory-mapped writing when log_sys.is_mmap() */ - CIRCULAR_MMAP, - /** memory-mapped log for log_sys.archive */ - ARCHIVED_MMAP, - /** normal writing !log_sys.is_mmap() */ - WRITE_NORMAL - }; - /** Append the redo log records to the redo log buffer. @tparam how how to write @param mtr mini-transaction @param len number of bytes to write @return {start_lsn,flush_ahead_lsn} */ - template static + template static std::pair finish_writer(mtr_t *mtr, size_t len); /** The applicable variant of commit_log() */ diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc index 03c4f6c772ea2..5a658aec79101 100644 --- a/storage/innobase/log/log0log.cc +++ b/storage/innobase/log/log0log.cc @@ -436,6 +436,7 @@ void log_t::create(lsn_t lsn) noexcept flushed_to_disk_lsn.store(lsn, std::memory_order_relaxed); first_lsn= lsn; write_lsn= lsn; + archived_lsn= lsn; last_checkpoint_lsn= 0; @@ -966,42 +967,111 @@ static size_t log_pad(lsn_t lsn, size_t pad, byte *begin, byte *extra) #endif #ifdef HAVE_PMEM -ATTRIBUTE_COLD void log_t::archive_new_mmap() noexcept +ATTRIBUTE_COLD void log_t::append_prepare_archived_mmap(bool late, bool ex) + noexcept { - ut_ad(latch_have_any()); + ut_ad(archive); + ut_ad(is_mmap()); 
ut_ad(!resize_log.is_opened()); ut_ad(!resize_buf); ut_ad(!resize_in_progress()); ut_ad(resize_target >= 4U << 20); ut_ad(is_latest()); - std::string path{get_log_file_path(logfile_new)}; - bool success; - pfs_os_file_t file= - os_file_create_func(path.c_str(), OS_FILE_CREATE, OS_LOG_FILE, - false, &success); - ut_ad(success == (file != OS_FILE_CLOSED)); - if (file != OS_FILE_CLOSED) + + if (UNIV_LIKELY(!ex)) { - if (os_file_set_size(path.c_str(), file, resize_target)) + latch.rd_unlock(); + if (!late) { - bool is_pmem{false}; - void *ptr= ::log_mmap(file, is_pmem, resize_target); - os_file_close(file); - if (ptr != MAP_FAILED) + /* Wait for all threads to back off. */ + latch.wr_lock(SRW_LOCK_CALL); + goto got_ex; + } + + const auto delay= my_cpu_relax_multiplier / 4 * srv_spin_wait_delay; + const auto rounds= srv_n_spin_wait_rounds; + + for (;;) + { + HMT_low(); + for (auto r= rounds + 1; r--; ) { - buf= static_cast(ptr); - file_size= resize_target; - return; + if (write_lsn_offset.load(std::memory_order_relaxed) & WRITE_BACKOFF) + { + for (auto d= delay; d--; ) + MY_RELAX_CPU(); + } + else + { + HMT_medium(); + goto done; + } } + HMT_medium(); + std::this_thread::sleep_for(std::chrono::microseconds(100)); } - else - os_file_close(file); + } + else + { + got_ex: + const uint64_t l= write_lsn_offset.load(std::memory_order_relaxed); + const lsn_t lsn= base_lsn.load(std::memory_order_relaxed) + + (l & (WRITE_BACKOFF - 1)); + waits++; + ut_ad(archive); + ut_ad(!resize_log.is_opened()); + ut_ad(!resize_buf); + ut_ad(!resize_in_progress()); + ut_ad(resize_target >= 4U << 20); + ut_ad(is_latest()); - IF_WIN(DeleteFile(path.c_str()), unlink(path.c_str())); + do + { + std::string path{get_log_file_path(logfile_new)}; + bool success; + pfs_os_file_t file= + os_file_create_func(path.c_str(), OS_FILE_CREATE, OS_LOG_FILE, + false, &success); + ut_ad(success == (file != OS_FILE_CLOSED)); + if (file != OS_FILE_CLOSED) + { + if (os_file_set_size(path.c_str(), file, 
resize_target)) + { + bool is_pmem{false}; + resize_buf= static_cast(::log_mmap(file, is_pmem, + resize_target)); + os_file_close(file); + if (resize_buf != MAP_FAILED) + continue; + resize_buf= nullptr; + } + else /* os_file_set_size() failed; the file is still open */ + os_file_close(file); + } + + IF_WIN(DeleteFile(path.c_str()), unlink(path.c_str())); + sql_print_error("[FATAL] InnoDB: Failed to create and map %s of %" PRIu64 + " bytes", path.c_str(), resize_target); + abort(); + } + while (false); + + // TODO: adjust this, and clear the WRITE_BACKOFF flag + ut_ad(lsn - get_flushed_lsn(std::memory_order_relaxed) < capacity() || + overwrite_warned); + persist(lsn); // TODO: pmem_persist() + latch.wr_unlock(); + /* Above we cleared the WRITE_BACKOFF flag, + which our caller will recheck. */ + if (ex) + { + latch.wr_lock(SRW_LOCK_CALL); + return; + } } - sql_print_error("[FATAL] InnoDB: Failed to create and map %s of %" PRIu64 - " bytes", path.c_str(), resize_target); - abort(); + +done: + latch.rd_lock(SRW_LOCK_CALL); } void log_t::persist(lsn_t lsn) noexcept diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc index cb0831e8b07c9..49fca94ea918f 100644 --- a/storage/innobase/log/log0recv.cc +++ b/storage/innobase/log/log0recv.cc @@ -1811,6 +1811,7 @@ dberr_t recv_sys_t::find_checkpoint() const lsn_t first_lsn{mach_read_from_8(buf + LOG_HEADER_START_LSN)}; log_sys.set_first_lsn(first_lsn); + log_sys.archived_lsn= first_lsn; char creator[LOG_HEADER_CREATOR_END - LOG_HEADER_CREATOR + 1]; memcpy(creator, buf + LOG_HEADER_CREATOR, sizeof creator); /* Ensure that the string is NUL-terminated. */ diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc index bca459b9292ef..798d273df9a68 100644 --- a/storage/innobase/mtr/mtr0mtr.cc +++ b/storage/innobase/mtr/mtr0mtr.cc @@ -53,13 +53,13 @@ void mtr_t::finisher_update() { commit_logger= mtr_t::commit_log; finisher= log_sys.archive - ? mtr_t::finish_writer - : mtr_t::finish_writer; + ?
mtr_t::finish_writer + : mtr_t::finish_writer; return; } commit_logger= mtr_t::commit_log; #endif - finisher= mtr_t::finish_writer; + finisher= mtr_t::finish_writer; } void mtr_memo_slot_t::release() const @@ -854,6 +854,8 @@ static time_t log_close_warn_time; making the server crash-unsafe. */ ATTRIBUTE_COLD static void log_overwrite_warning(lsn_t lsn) { + ut_ad(!log_sys.archive); + if (log_sys.overwrite_warned) return; @@ -874,8 +876,61 @@ ATTRIBUTE_COLD static void log_overwrite_warning(lsn_t lsn) ? ". Shutdown is in progress" : ""); } + +#ifdef HAVE_PMEM +template<> +inline std::pair +log_t::append_prepare(size_t size, bool ex) noexcept +{ + ut_ad(ex ? latch_have_wr() : latch_have_rd()); + ut_ad(is_mmap()); + ut_ad(archive); + ut_ad(archived_lsn); + + uint64_t l, lsn; + static_assert(WRITE_TO_BUF == WRITE_BACKOFF << 1, ""); + while (UNIV_UNLIKELY((l= write_lsn_offset.fetch_add(size + WRITE_TO_BUF) & + (WRITE_TO_BUF - 1)) >= + size_t(capacity() - + ((lsn= base_lsn.load(std::memory_order_relaxed)) - + first_lsn)) - size)) + { + /* The following is inlined here instead of being part of + append_prepare_wait(), in order to increase the locality of reference + and to set the WRITE_BACKOFF flag as soon as possible. */ + bool late(write_lsn_offset.fetch_or(WRITE_BACKOFF) & WRITE_BACKOFF); + /* Subtract our LSN overshoot. 
*/ + write_lsn_offset.fetch_sub(size); + append_prepare_archived_mmap(late, ex); + } + + lsn+= l; + return {lsn, buf + FIRST_LSN + (lsn - first_lsn)}; +} + +inline void log_t::archive_new_mmap() noexcept +{ + ut_ad(latch_have_any()); + ut_ad(!resize_log.is_opened()); + ut_ad(!resize_in_progress()); + ut_ad(resize_target >= 4U << 20); + ut_ad(is_latest()); + + resize_wrap_mutex.wr_lock(); + if (resize_buf) + { + my_munmap(buf, size_t(file_size)); + buf= resize_buf; + resize_buf= nullptr; + file_size= resize_target; + } + resize_wrap_mutex.wr_unlock(); +} +#endif + ATTRIBUTE_COLD void log_t::append_prepare_wait(bool late, bool ex) noexcept { + ut_ad(!archive); if (UNIV_LIKELY(!ex)) { latch.rd_unlock(); @@ -920,9 +975,9 @@ ATTRIBUTE_COLD void log_t::append_prepare_wait(bool late, bool ex) noexcept const bool is_pmem{is_mmap()}; if (is_pmem) { + ut_ad(!archive); ut_ad(lsn - get_flushed_lsn(std::memory_order_relaxed) < capacity() || overwrite_warned); - ut_a(!archive); // FIXME: create, allocate and attach a new file persist(lsn); } #endif @@ -945,17 +1000,20 @@ ATTRIBUTE_COLD void log_t::append_prepare_wait(bool late, bool ex) noexcept } /** Reserve space in the log buffer for appending data. -@tparam mmap log_sys.is_mmap() +@tparam mode how to write log @param size total length of the data to append(), in bytes @param ex whether log_sys.latch is exclusively locked @return the start LSN and the buffer position for append() */ -template +template inline std::pair log_t::append_prepare(size_t size, bool ex) noexcept { ut_ad(ex ? 
latch_have_wr() : latch_have_rd()); - ut_ad(mmap == is_mmap()); - ut_ad(!mmap || buf_size == std::min(capacity(), buf_size_max)); + static_assert(!bool(WRITE_NORMAL), ""); + static_assert(bool(CIRCULAR_MMAP), ""); + static_assert(mode == WRITE_NORMAL || mode == CIRCULAR_MMAP, ""); + ut_ad(bool(mode) == is_mmap()); + ut_ad(!mode || buf_size == std::min(capacity(), buf_size_max)); const size_t buf_size{this->buf_size - size}; uint64_t l; static_assert(WRITE_TO_BUF == WRITE_BACKOFF << 1, ""); @@ -978,7 +1036,7 @@ std::pair log_t::append_prepare(size_t size, bool ex) noexcept set_check_for_checkpoint(true); return {lsn, - buf + size_t(mmap ? FIRST_LSN + (lsn - first_lsn) % capacity() : l)}; + buf + size_t(mode ? FIRST_LSN + (lsn - first_lsn) % capacity() : l)}; } /** Finish appending data to the log. @@ -1217,7 +1275,7 @@ inline void log_t::append(byte *&d, const void *s, size_t size) noexcept d+= size; } -template +template std::pair mtr_t::finish_writer(mtr_t *mtr, size_t len) { @@ -1226,24 +1284,27 @@ mtr_t::finish_writer(mtr_t *mtr, size_t len) ut_ad(mtr->is_logged()); ut_ad(mtr->m_latch_ex ? log_sys.latch_have_wr() : log_sys.latch_have_rd()); ut_ad(len < recv_sys.MTR_SIZE_MAX); - ut_ad(how == WRITE_NORMAL || log_sys.archive == (how == ARCHIVED_MMAP)); + ut_ad(mode == log_t::WRITE_NORMAL || + log_sys.archive == (mode == log_t::ARCHIVED_MMAP)); const size_t size{mtr->m_commit_lsn ? 
5U + 8U : 5U}; std::pair start= - log_sys.append_prepare(len, mtr->m_latch_ex); + log_sys.append_prepare(len, mtr->m_latch_ex); - if (how == WRITE_NORMAL || - UNIV_LIKELY(start.second + len <= &log_sys.buf[log_sys.file_size])) + if (mode == log_t::WRITE_NORMAL) + write_normal: for (const mtr_buf_t::block_t &b : mtr->m_log) log_sys.append(start.second, b.begin(), b.used()); #ifdef HAVE_PMEM else { byte *const end= &log_sys.buf[log_sys.file_size]; - if (how == ARCHIVED_MMAP) - log_sys.archive_new_mmap(); - else + if (UNIV_LIKELY(start.second + len <= end)) + goto write_normal; + if (mode == log_t::CIRCULAR_MMAP) log_sys.archived_lsn= 0; + else + log_sys.archive_new_mmap(); byte *const begin= &log_sys.buf[log_sys.START_OFFSET]; for (const mtr_buf_t::block_t &b : mtr->m_log) { @@ -1297,10 +1358,10 @@ mtr_t::finish_writer(mtr_t *mtr, size_t len) #ifdef HAVE_PMEM wrote_trailer: #else - static_assert(how == WRITE_NORMAL, ""); + static_assert(mode == log_t::WRITE_NORMAL, ""); #endif - if (how == ARCHIVED_MMAP) + if (mode == log_t::ARCHIVED_MMAP) ut_ad(!log_sys.resize_in_progress()); else log_sys.resize_write(start.first, start.second, len, size); From 6458cc2003744ce8b94d6972731ddd32bf5a603d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Mon, 17 Nov 2025 17:03:46 +0200 Subject: [PATCH 5/6] squash! 6d4d0b5eb24460a7e751d2ff7949dd15c25dad6b mtr_t::finish_writer(): Invoke log_close() only if innodb_log_archive=OFF. log_t::append_prepare_archived_mmap(): Simplify a bit, and try to ensure that a checkpoint will be completed inside the new log file. Still does not work! 
--- storage/innobase/include/log0log.h | 17 ++++++++++- storage/innobase/log/log0log.cc | 6 ++-- storage/innobase/mtr/mtr0mtr.cc | 45 +++++++++++++++++++----------- 3 files changed, 46 insertions(+), 22 deletions(-) diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h index bb38cca63fe8e..6a203b4a8ff38 100644 --- a/storage/innobase/include/log0log.h +++ b/storage/innobase/include/log0log.h @@ -369,14 +369,29 @@ struct log_t { return thd == resize_initiator; } /** Replicate a write to the log. + @tparam mmap whether the memory-mapped interface is enabled @param lsn start LSN @param end end of the mini-transaction @param len length of the mini-transaction @param seq offset of the sequence bit from the end */ + template inline void resize_write(lsn_t lsn, const byte *end, - size_t len, size_t seq) noexcept; + size_t len, size_t seq) noexcept + { + if (UNIV_LIKELY_NULL(resize_buf)) + resize_write_low(lsn, end, len, seq); + } private: + /** Replicate a write to the log. + @tparam mmap whether the memory-mapped interface is enabled + @param lsn start LSN + @param end end of the mini-transaction + @param len length of the mini-transaction + @param seq offset of the sequence bit from the end */ + template + ATTRIBUTE_COLD void resize_write_low(lsn_t lsn, const byte *end, + size_t len, size_t seq) noexcept; /** Write resize_buf to resize_log. @param b resize_buf or resize_flush_buf @param length the used length of b */ diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc index 5a658aec79101..7a07bd1cfe607 100644 --- a/storage/innobase/log/log0log.cc +++ b/storage/innobase/log/log0log.cc @@ -1060,14 +1060,12 @@ ATTRIBUTE_COLD void log_t::append_prepare_archived_mmap(bool late, bool ex) ut_ad(lsn - get_flushed_lsn(std::memory_order_relaxed) < capacity() || overwrite_warned); persist(lsn); // TODO: pmem_persist() - latch.wr_unlock(); /* Above we cleared the WRITE_BACKOFF flag, which our caller will recheck. 
*/ if (ex) - { - latch.wr_lock(SRW_LOCK_CALL); return; - } + latch.wr_unlock(); + buf_flush_ahead(lsn, false); } done: diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc index 798d273df9a68..2fee384307bba 100644 --- a/storage/innobase/mtr/mtr0mtr.cc +++ b/storage/innobase/mtr/mtr0mtr.cc @@ -854,9 +854,9 @@ static time_t log_close_warn_time; making the server crash-unsafe. */ ATTRIBUTE_COLD static void log_overwrite_warning(lsn_t lsn) { - ut_ad(!log_sys.archive); + ut_ad(!log_sys.archive); /* we hope that this is unreachable */ - if (log_sys.overwrite_warned) + if (log_sys.overwrite_warned || log_sys.archive) return; time_t t= time(nullptr); @@ -891,9 +891,9 @@ log_t::append_prepare(size_t size, bool ex) noexcept static_assert(WRITE_TO_BUF == WRITE_BACKOFF << 1, ""); while (UNIV_UNLIKELY((l= write_lsn_offset.fetch_add(size + WRITE_TO_BUF) & (WRITE_TO_BUF - 1)) >= - size_t(capacity() - - ((lsn= base_lsn.load(std::memory_order_relaxed)) - - first_lsn)) - size)) + capacity() - + ((lsn= base_lsn.load(std::memory_order_relaxed)) - + first_lsn) - size)) { /* The following is inlined here instead of being part of append_prepare_wait(), in order to increase the locality of reference @@ -1029,10 +1029,9 @@ std::pair log_t::append_prepare(size_t size, bool ex) noexcept append_prepare_wait(late, ex); } - const lsn_t lsn{l + base_lsn.load(std::memory_order_relaxed)}, - end_lsn{lsn + size}; + const lsn_t lsn{l + base_lsn.load(std::memory_order_relaxed)}; - if (UNIV_UNLIKELY(end_lsn >= last_checkpoint_lsn + log_capacity)) + if (UNIV_UNLIKELY(lsn + size >= last_checkpoint_lsn + log_capacity)) set_check_for_checkpoint(true); return {lsn, @@ -1172,12 +1171,14 @@ std::pair mtr_t::do_write() noexcept return finish_write(len); } -inline void log_t::resize_write(lsn_t lsn, const byte *end, size_t len, - size_t seq) noexcept +template +ATTRIBUTE_COLD +void log_t::resize_write_low(lsn_t lsn, const byte *end, + size_t len, size_t seq) noexcept {
ut_ad(latch_have_any()); + ut_ad(resize_buf); - if (UNIV_LIKELY_NULL(resize_buf)) { ut_ad(end >= buf); end-= len; @@ -1361,13 +1362,23 @@ mtr_t::finish_writer(mtr_t *mtr, size_t len) static_assert(mode == log_t::WRITE_NORMAL, ""); #endif - if (mode == log_t::ARCHIVED_MMAP) - ut_ad(!log_sys.resize_in_progress()); - else - log_sys.resize_write(start.first, start.second, len, size); - mtr->m_commit_lsn= start.first + len; - return {start.first, log_close(mtr->m_commit_lsn)}; + + switch (mode) { + case log_t::ARCHIVED_MMAP: + ut_ad(!log_sys.resize_in_progress()); + return {start.first, (log_sys.get_first_lsn() > log_sys.last_checkpoint_lsn + ? log_sys.get_first_lsn() : 0)}; + case log_t::CIRCULAR_MMAP: + log_sys.resize_write(start.first, start.second, len, size); + return {start.first, log_close(mtr->m_commit_lsn)}; + case log_t::WRITE_NORMAL: + log_sys.resize_write(start.first, start.second, len, size); + return {start.first, log_sys.archive + ? (log_sys.get_first_lsn() > log_sys.last_checkpoint_lsn + ? log_sys.get_first_lsn() : 0) + : log_close(mtr->m_commit_lsn)}; + } } bool mtr_t::have_x_latch(const buf_block_t &block) const From a03505054dcf6cec39b1c58322f387cc44b15511 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Tue, 18 Nov 2025 16:05:33 +0200 Subject: [PATCH 6/6] squash! 6458cc2003744ce8b94d6972731ddd32bf5a603d Fix the ARCHIVED_MMAP write path, except for renaming files at log checkpoint, and implementing crash recovery. log_t::archived_mmap_switch_complete(): Attempt to complete log_t::archived_mmap_switch_prepare(). 
--- storage/innobase/buf/buf0flu.cc | 3 +- storage/innobase/include/log0log.h | 31 ++++++++----- storage/innobase/include/mtr0mtr.h | 8 ++-- storage/innobase/log/log0log.cc | 11 ++--- storage/innobase/mtr/mtr0mtr.cc | 72 ++++++++++++++++++------------ 5 files changed, 73 insertions(+), 52 deletions(-) diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index 4d8ebee1b0d22..a89e99ffdb07a 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -2172,7 +2172,8 @@ ATTRIBUTE_COLD void buf_flush_ahead(lsn_t lsn, bool furious) noexcept if (recv_recovery_is_on()) recv_sys.apply(true); - DBUG_EXECUTE_IF("ib_log_checkpoint_avoid_hard", return;); + DBUG_EXECUTE_IF("ib_log_checkpoint_avoid_hard", + if (!log_sys.archive) return;); Atomic_relaxed &limit= furious ? buf_flush_sync_lsn : buf_flush_async_lsn; diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h index 6a203b4a8ff38..4ebf5f50f493f 100644 --- a/storage/innobase/include/log0log.h +++ b/storage/innobase/include/log0log.h @@ -259,8 +259,8 @@ struct log_t of the next file created if archive==TRUE; protected by latch */ lsn_t resize_target; /** Buffer for writing to resize_log; @see buf - Also a spare buffer between append_prepare_archived_mmap() and - archive_new_mmap() */ + Also a spare buffer between archived_mmap_switch_prepare() + and archived_mmap_switch_complete() */ byte *resize_buf; /** Buffer for writing to resize_log; @see flush_buf */ byte *resize_flush_buf; @@ -402,13 +402,6 @@ struct log_t @return whether an error occurred */ static bool resize_rename() noexcept; - /** @return pointer for writing to resize_buf - @retval nullptr if no is_mmap() based resizing is active */ - inline byte *resize_buf_begin(lsn_t lsn) const noexcept; - /** @return end of resize_buf */ - inline const byte *resize_buf_end() const noexcept - { return resize_buf + resize_target; } - /** Initialise the redo log subsystem. 
*/ void create() noexcept; @@ -484,9 +477,23 @@ struct log_t /** Persist the log. @param lsn desired new value of flushed_to_disk_lsn */ void persist(lsn_t lsn) noexcept; - /** Switch the log buffers. */ - inline void archive_new_mmap() noexcept; + /** @return the overflow buffer when ARCHIVED_MMAP is wrapping around */ + byte *get_archived_mmap_switch() const noexcept + { + ut_ad(archived_mmap_switch()); + return resize_buf + START_OFFSET; + } #endif + /** @return whether archived_mmap_switch_complete() needs to be called */ + bool archived_mmap_switch() const noexcept + { + ut_ad(latch_have_any()); + return UNIV_UNLIKELY(archive && resize_buf); + } + /** Attempt to finish archived_mmap_switch_prepare(). + @return the current LSN in the new file + @retval 0 if no switch took place */ + ATTRIBUTE_COLD lsn_t archived_mmap_switch_complete() noexcept; /** Create a new log file when the current one will fill up. @param buf log records to append @param length size of the log records, in bytes @@ -539,7 +546,7 @@ struct log_t /** Wait in append_prepare() for buffer to become available @param late whether the WRITE_BACKOFF flag had already been set @param ex whether log_sys.latch is exclusively locked */ - ATTRIBUTE_COLD void append_prepare_archived_mmap(bool late, bool ex) + ATTRIBUTE_COLD void archived_mmap_switch_prepare(bool late, bool ex) noexcept; #endif public: diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h index 567fad8a160e9..06229dc190981 100644 --- a/storage/innobase/include/mtr0mtr.h +++ b/storage/innobase/include/mtr0mtr.h @@ -688,13 +688,15 @@ struct mtr_t { ATTRIBUTE_NOINLINE size_t crc32c() noexcept; /** Commit the mini-transaction log. - @tparam pmem log_sys.is_mmap() + @tparam mmap log_sys.is_mmap() @param mtr mini-transaction @param lsns {start_lsn,flush_ahead_lsn} */ - template + template static void commit_log(mtr_t *mtr, std::pair lsns) noexcept; - /** Release log_sys.latch. */ + /** Release log_sys.latch. 
+ @tparam mmap log_sys.is_mmap() */ + template void commit_log_release() noexcept; /** Append the redo log records to the redo log buffer. diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc index 7a07bd1cfe607..6bbe93e51ff9c 100644 --- a/storage/innobase/log/log0log.cc +++ b/storage/innobase/log/log0log.cc @@ -967,8 +967,8 @@ static size_t log_pad(lsn_t lsn, size_t pad, byte *begin, byte *extra) #endif #ifdef HAVE_PMEM -ATTRIBUTE_COLD void log_t::append_prepare_archived_mmap(bool late, bool ex) - noexcept +ATTRIBUTE_COLD +void log_t::archived_mmap_switch_prepare(bool late, bool ex) noexcept { ut_ad(archive); ut_ad(is_mmap()); @@ -1056,16 +1056,13 @@ ATTRIBUTE_COLD void log_t::append_prepare_archived_mmap(bool late, bool ex) } while (false); - // TODO: adjust this, and clear the WRITE_BACKOFF flag - ut_ad(lsn - get_flushed_lsn(std::memory_order_relaxed) < capacity() || - overwrite_warned); - persist(lsn); // TODO: pmem_persist() + ut_ad(lsn - get_flushed_lsn(std::memory_order_relaxed) < capacity()); + persist(lsn); /* Above we cleared the WRITE_BACKOFF flag, which our caller will recheck. 
*/ if (ex) return; latch.wr_unlock(); - buf_flush_ahead(lsn, false); } done: diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc index 2fee384307bba..60057f07a3fec 100644 --- a/storage/innobase/mtr/mtr0mtr.cc +++ b/storage/innobase/mtr/mtr0mtr.cc @@ -338,15 +338,48 @@ void mtr_t::release() m_memo.clear(); } +#ifdef HAVE_PMEM +ATTRIBUTE_COLD lsn_t log_t::archived_mmap_switch_complete() noexcept +{ + ut_ad(latch_have_wr()); + if (!archive || !resize_buf) + return 0; + const lsn_t lsn{get_lsn()}, end_lsn{first_lsn + capacity()}; + if (lsn < end_lsn) + return 0; + persist(lsn); + my_munmap(buf, file_size); + /* TODO: make the file read-only */ + buf= resize_buf; + resize_buf= nullptr; + first_lsn= end_lsn; + file_size= resize_target; + return lsn; +} +#endif + +template ATTRIBUTE_NOINLINE void mtr_t::commit_log_release() noexcept { if (m_latch_ex) { + completed: + const lsn_t lsn{mmap ? log_sys.archived_mmap_switch_complete() : 0}; log_sys.latch.wr_unlock(); m_latch_ex= false; + if (mmap && lsn) + buf_flush_ahead(lsn, true); } else + { + const bool retry{mmap && log_sys.archived_mmap_switch()}; log_sys.latch.rd_unlock(); + if (retry) + { + log_sys.latch.wr_lock(SRW_LOCK_CALL); + goto completed; + } + } } static ATTRIBUTE_NOINLINE ATTRIBUTE_COLD @@ -397,12 +430,12 @@ void mtr_t::commit_log(mtr_t *mtr, std::pair lsns) noexcept buf_pool.page_cleaner_wakeup(); mysql_mutex_unlock(&buf_pool.flush_list_mutex); - mtr->commit_log_release(); + mtr->commit_log_release(); mtr->release(); } else { - mtr->commit_log_release(); + mtr->commit_log_release(); for (auto it= mtr->m_memo.rbegin(); it != mtr->m_memo.rend(); ) { @@ -893,7 +926,7 @@ log_t::append_prepare(size_t size, bool ex) noexcept (WRITE_TO_BUF - 1)) >= capacity() - (lsn= base_lsn.load(std::memory_order_relaxed)) - - first_lsn - size)) + first_lsn - size) && !resize_buf) { /* The following is inlined here instead of being part of append_prepare_wait(), in order to increase the locality of 
reference @@ -901,31 +934,12 @@ log_t::append_prepare(size_t size, bool ex) noexcept bool late(write_lsn_offset.fetch_or(WRITE_BACKOFF) & WRITE_BACKOFF); /* Subtract our LSN overshoot. */ write_lsn_offset.fetch_sub(size); - append_prepare_archived_mmap(late, ex); + archived_mmap_switch_prepare(late, ex); } lsn+= l; return {lsn, buf + FIRST_LSN + (lsn - first_lsn)}; } - -inline void log_t::archive_new_mmap() noexcept -{ - ut_ad(latch_have_any()); - ut_ad(!resize_log.is_opened()); - ut_ad(!resize_in_progress()); - ut_ad(resize_target >= 4U << 20); - ut_ad(is_latest()); - - resize_wrap_mutex.wr_lock(); - if (resize_buf) - { - my_munmap(buf, size_t(file_size)); - buf= resize_buf; - resize_buf= nullptr; - file_size= resize_target; - } - resize_wrap_mutex.wr_unlock(); -} #endif ATTRIBUTE_COLD void log_t::append_prepare_wait(bool late, bool ex) noexcept @@ -1299,14 +1313,14 @@ mtr_t::finish_writer(mtr_t *mtr, size_t len) #ifdef HAVE_PMEM else { - byte *const end= &log_sys.buf[log_sys.file_size]; + const size_t file_size= log_sys.file_size; + byte *const buf{log_sys.buf}; + byte *const end= &buf[file_size]; if (UNIV_LIKELY(start.second + len <= end)) goto write_normal; - if (mode == log_t::CIRCULAR_MMAP) - log_sys.archived_lsn= 0; - else - log_sys.archive_new_mmap(); - byte *const begin= &log_sys.buf[log_sys.START_OFFSET]; + byte *const begin= mode == log_t::ARCHIVED_MMAP + ? log_sys.get_archived_mmap_switch() + : buf + log_sys.START_OFFSET; for (const mtr_buf_t::block_t &b : mtr->m_log) { size_t size{b.used()};