diff --git a/include/my_sys.h b/include/my_sys.h index 4148ef0ba0408..b59dcdf4110f1 100644 --- a/include/my_sys.h +++ b/include/my_sys.h @@ -60,6 +60,7 @@ C_MODE_START #define MY_IGNORE_ENOENT 32U /* my_delete() ignores ENOENT (no such file) */ #define MY_ENCRYPT 64U /* Encrypt IO_CACHE temporary files */ #define MY_TEMPORARY 64U /* create_temp_file(): delete file at once */ +#define MY_OPEN_FOR_ASYNC_IO 128U /* my_open() open file for async io */ #define MY_NOSYMLINKS 512U /* my_open(): don't follow symlinks */ #define MY_FULL_IO 512U /* my_read(): loop until I/O is complete */ #define MY_DONT_CHECK_FILESIZE 128U /* Option to init_io_cache() */ diff --git a/mysql-test/suite/innodb/r/ext_buf_pool.result b/mysql-test/suite/innodb/r/ext_buf_pool.result new file mode 100644 index 0000000000000..1ecbab38815bc --- /dev/null +++ b/mysql-test/suite/innodb/r/ext_buf_pool.result @@ -0,0 +1,512 @@ +connect prevent_purge,localhost,root; +START TRANSACTION WITH CONSISTENT SNAPSHOT; +connection default; +SET GLOBAL innodb_limit_optimistic_insert_debug = 3; +SET GLOBAL DEBUG_DBUG='+d,ib_ext_bp_count_io_only_for_t'; +SET GLOBAL DEBUG_DBUG='+d,ib_ext_bp_disable_LRU_eviction_for_t'; +################################################################### +# Testing for encrypted ROW_FORMAT=COMPRESSED table # +################################################################### +CREATE TABLE t ( +`a` INT NOT NULL, +PRIMARY KEY (`a`) +) ENGINE=InnoDB STATS_PERSISTENT=0 ROW_FORMAT=COMPRESSED encrypted=yes +encryption_key_id=1; +SELECT variable_value INTO @prev_flushed_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; +SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL INTO @prev_written_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +SELECT variable_value INTO @prev_reads_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_READS'; +SELECT NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL 
INTO @prev_reads_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +SET @start_val = 6*100; +INSERT INTO t SET a = @start_val+1; +INSERT INTO t SET a = @start_val+2; +INSERT INTO t SET a = @start_val+3; +INSERT INTO t SET a = @start_val+4; +INSERT INTO t SET a = @start_val+5; +INSERT INTO t SET a = @start_val+6; +INSERT INTO t SET a = @start_val+7; +INSERT INTO t SET a = @start_val+8; +INSERT INTO t SET a = @start_val+9; +INSERT INTO t SET a = @start_val+10; +INSERT INTO t SET a = @start_val+11; +INSERT INTO t SET a = @start_val+12; +SET GLOBAL DEBUG_DBUG='-d,ib_ext_bp_disable_LRU_eviction_for_t'; +SET GLOBAL innodb_force_LRU_eviction = TRUE; +SELECT variable_value-@prev_flushed_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; +variable_value-@prev_flushed_gs +9 +SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +9 +SELECT variable_value-@prev_reads_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_READS'; +variable_value-@prev_reads_gs +0 +SELECT NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +0 +SELECT * FROM t; +a +601 +602 +603 +604 +605 +606 +607 +608 +609 +610 +611 +612 +SELECT variable_value-@prev_flushed_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; +variable_value-@prev_flushed_gs +9 +SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +9 +SELECT variable_value-@prev_reads_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_READS'; +variable_value-@prev_reads_gs +7 +SELECT 
NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +7 +DROP TABLE t; +SET GLOBAL DEBUG_DBUG='+d,ib_ext_bp_disable_LRU_eviction_for_t'; +################################################################### +# Testing for unencrypted ROW_FORMAT=COMPRESSED table # +################################################################### +CREATE TABLE t ( +`a` INT NOT NULL, +PRIMARY KEY (`a`) +) ENGINE=InnoDB STATS_PERSISTENT=0 ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=1; +SELECT variable_value INTO @prev_flushed_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; +SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL INTO @prev_written_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +SELECT variable_value INTO @prev_reads_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_READS'; +SELECT NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL INTO @prev_reads_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +SET @start_val = 5*100; +INSERT INTO t SET a = @start_val+1; +INSERT INTO t SET a = @start_val+2; +INSERT INTO t SET a = @start_val+3; +INSERT INTO t SET a = @start_val+4; +INSERT INTO t SET a = @start_val+5; +INSERT INTO t SET a = @start_val+6; +INSERT INTO t SET a = @start_val+7; +INSERT INTO t SET a = @start_val+8; +INSERT INTO t SET a = @start_val+9; +INSERT INTO t SET a = @start_val+10; +INSERT INTO t SET a = @start_val+11; +INSERT INTO t SET a = @start_val+12; +SET GLOBAL DEBUG_DBUG='-d,ib_ext_bp_disable_LRU_eviction_for_t'; +SET GLOBAL innodb_force_LRU_eviction = TRUE; +SELECT variable_value-@prev_flushed_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; +variable_value-@prev_flushed_gs +9 +SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +FROM 
INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +9 +SELECT variable_value-@prev_reads_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_READS'; +variable_value-@prev_reads_gs +0 +SELECT NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +0 +SELECT * FROM t; +a +501 +502 +503 +504 +505 +506 +507 +508 +509 +510 +511 +512 +SELECT variable_value-@prev_flushed_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; +variable_value-@prev_flushed_gs +9 +SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +9 +SELECT variable_value-@prev_reads_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_READS'; +variable_value-@prev_reads_gs +7 +SELECT NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +7 +DROP TABLE t; +SET GLOBAL DEBUG_DBUG='+d,ib_ext_bp_disable_LRU_eviction_for_t'; +################################################################### +# Testing for unencrypted uncompressed table # +################################################################### +CREATE TABLE t ( +`a` INT NOT NULL, +PRIMARY KEY (`a`) +) ENGINE=InnoDB STATS_PERSISTENT=0; +SELECT variable_value INTO @prev_flushed_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; +SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL INTO @prev_written_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +SELECT variable_value INTO @prev_reads_gs +FROM information_schema.global_status +WHERE variable_name 
LIKE 'INNODB_EXT_BUFFER_POOL_READS'; +SELECT NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL INTO @prev_reads_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +SET @start_val = 4*100; +INSERT INTO t SET a = @start_val+1; +INSERT INTO t SET a = @start_val+2; +INSERT INTO t SET a = @start_val+3; +INSERT INTO t SET a = @start_val+4; +INSERT INTO t SET a = @start_val+5; +INSERT INTO t SET a = @start_val+6; +INSERT INTO t SET a = @start_val+7; +INSERT INTO t SET a = @start_val+8; +INSERT INTO t SET a = @start_val+9; +INSERT INTO t SET a = @start_val+10; +INSERT INTO t SET a = @start_val+11; +INSERT INTO t SET a = @start_val+12; +SET GLOBAL DEBUG_DBUG='-d,ib_ext_bp_disable_LRU_eviction_for_t'; +SET GLOBAL innodb_force_LRU_eviction = TRUE; +SELECT variable_value-@prev_flushed_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; +variable_value-@prev_flushed_gs +9 +SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +9 +SELECT variable_value-@prev_reads_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_READS'; +variable_value-@prev_reads_gs +0 +SELECT NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +0 +SELECT * FROM t; +a +401 +402 +403 +404 +405 +406 +407 +408 +409 +410 +411 +412 +SELECT variable_value-@prev_flushed_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; +variable_value-@prev_flushed_gs +9 +SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +9 +SELECT variable_value-@prev_reads_gs +FROM information_schema.global_status +WHERE 
variable_name LIKE 'INNODB_EXT_BUFFER_POOL_READS'; +variable_value-@prev_reads_gs +7 +SELECT NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +7 +DROP TABLE t; +SET GLOBAL DEBUG_DBUG='+d,ib_ext_bp_disable_LRU_eviction_for_t'; +################################################################### +# Testing for encrypted uncompressed table # +################################################################### +CREATE TABLE t ( +`a` INT NOT NULL, +PRIMARY KEY (`a`) +) ENGINE=InnoDB STATS_PERSISTENT=0 encrypted=yes encryption_key_id=1; +SELECT variable_value INTO @prev_flushed_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; +SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL INTO @prev_written_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +SELECT variable_value INTO @prev_reads_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_READS'; +SELECT NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL INTO @prev_reads_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +SET @start_val = 3*100; +INSERT INTO t SET a = @start_val+1; +INSERT INTO t SET a = @start_val+2; +INSERT INTO t SET a = @start_val+3; +INSERT INTO t SET a = @start_val+4; +INSERT INTO t SET a = @start_val+5; +INSERT INTO t SET a = @start_val+6; +INSERT INTO t SET a = @start_val+7; +INSERT INTO t SET a = @start_val+8; +INSERT INTO t SET a = @start_val+9; +INSERT INTO t SET a = @start_val+10; +INSERT INTO t SET a = @start_val+11; +INSERT INTO t SET a = @start_val+12; +SET GLOBAL DEBUG_DBUG='-d,ib_ext_bp_disable_LRU_eviction_for_t'; +SET GLOBAL innodb_force_LRU_eviction = TRUE; +SELECT variable_value-@prev_flushed_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; +variable_value-@prev_flushed_gs +9 +SELECT 
NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +9 +SELECT variable_value-@prev_reads_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_READS'; +variable_value-@prev_reads_gs +0 +SELECT NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +0 +SELECT * FROM t; +a +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +SELECT variable_value-@prev_flushed_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; +variable_value-@prev_flushed_gs +9 +SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +9 +SELECT variable_value-@prev_reads_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_READS'; +variable_value-@prev_reads_gs +7 +SELECT NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +7 +DROP TABLE t; +SET GLOBAL DEBUG_DBUG='+d,ib_ext_bp_disable_LRU_eviction_for_t'; +################################################################### +# Testing for unencrypted PAGE_COMPRESSED=1 table # +################################################################### +CREATE TABLE t ( +`a` INT NOT NULL, +PRIMARY KEY (`a`) +) ENGINE=InnoDB STATS_PERSISTENT=0 PAGE_COMPRESSED=1; +SELECT variable_value INTO @prev_flushed_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; +SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL INTO @prev_written_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +SELECT 
variable_value INTO @prev_reads_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_READS'; +SELECT NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL INTO @prev_reads_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +SET @start_val = 2*100; +INSERT INTO t SET a = @start_val+1; +INSERT INTO t SET a = @start_val+2; +INSERT INTO t SET a = @start_val+3; +INSERT INTO t SET a = @start_val+4; +INSERT INTO t SET a = @start_val+5; +INSERT INTO t SET a = @start_val+6; +INSERT INTO t SET a = @start_val+7; +INSERT INTO t SET a = @start_val+8; +INSERT INTO t SET a = @start_val+9; +INSERT INTO t SET a = @start_val+10; +INSERT INTO t SET a = @start_val+11; +INSERT INTO t SET a = @start_val+12; +SET GLOBAL DEBUG_DBUG='-d,ib_ext_bp_disable_LRU_eviction_for_t'; +SET GLOBAL innodb_force_LRU_eviction = TRUE; +SELECT variable_value-@prev_flushed_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; +variable_value-@prev_flushed_gs +9 +SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +9 +SELECT variable_value-@prev_reads_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_READS'; +variable_value-@prev_reads_gs +0 +SELECT NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +0 +SELECT * FROM t; +a +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +SELECT variable_value-@prev_flushed_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; +variable_value-@prev_flushed_gs +9 +SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; 
+NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +9 +SELECT variable_value-@prev_reads_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_READS'; +variable_value-@prev_reads_gs +7 +SELECT NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +7 +DROP TABLE t; +SET GLOBAL DEBUG_DBUG='+d,ib_ext_bp_disable_LRU_eviction_for_t'; +################################################################### +# Testing for encrypted PAGE_COMPRESSED=1 table # +################################################################### +CREATE TABLE t ( +`a` INT NOT NULL, +PRIMARY KEY (`a`) +) ENGINE=InnoDB STATS_PERSISTENT=0 PAGE_COMPRESSED=1 encrypted=yes +encryption_key_id=1; +SELECT variable_value INTO @prev_flushed_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; +SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL INTO @prev_written_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +SELECT variable_value INTO @prev_reads_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_READS'; +SELECT NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL INTO @prev_reads_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +SET @start_val = 1*100; +INSERT INTO t SET a = @start_val+1; +INSERT INTO t SET a = @start_val+2; +INSERT INTO t SET a = @start_val+3; +INSERT INTO t SET a = @start_val+4; +INSERT INTO t SET a = @start_val+5; +INSERT INTO t SET a = @start_val+6; +INSERT INTO t SET a = @start_val+7; +INSERT INTO t SET a = @start_val+8; +INSERT INTO t SET a = @start_val+9; +INSERT INTO t SET a = @start_val+10; +INSERT INTO t SET a = @start_val+11; +INSERT INTO t SET a = @start_val+12; +SET GLOBAL DEBUG_DBUG='-d,ib_ext_bp_disable_LRU_eviction_for_t'; +SET GLOBAL innodb_force_LRU_eviction = TRUE; +SELECT variable_value-@prev_flushed_gs 
+FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; +variable_value-@prev_flushed_gs +9 +SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +9 +SELECT variable_value-@prev_reads_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_READS'; +variable_value-@prev_reads_gs +0 +SELECT NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +0 +SELECT * FROM t; +a +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +SELECT variable_value-@prev_flushed_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; +variable_value-@prev_flushed_gs +9 +SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +9 +SELECT variable_value-@prev_reads_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_READS'; +variable_value-@prev_reads_gs +7 +SELECT NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +7 +DROP TABLE t; +disconnect prevent_purge; diff --git a/mysql-test/suite/innodb/r/innodb_information_schema_buffer.result b/mysql-test/suite/innodb/r/innodb_information_schema_buffer.result index e87b35383a70e..7684c96f82b87 100644 --- a/mysql-test/suite/innodb/r/innodb_information_schema_buffer.result +++ b/mysql-test/suite/innodb/r/innodb_information_schema_buffer.result @@ -1,6 +1,6 @@ SELECT * FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; -POOL_ID POOL_SIZE FREE_BUFFERS DATABASE_PAGES OLD_DATABASE_PAGES 
MODIFIED_DATABASE_PAGES PENDING_DECOMPRESS PENDING_READS PENDING_FLUSH_LRU PENDING_FLUSH_LIST PAGES_MADE_YOUNG PAGES_NOT_MADE_YOUNG PAGES_MADE_YOUNG_RATE PAGES_MADE_NOT_YOUNG_RATE NUMBER_PAGES_READ NUMBER_PAGES_CREATED NUMBER_PAGES_WRITTEN PAGES_READ_RATE PAGES_CREATE_RATE PAGES_WRITTEN_RATE NUMBER_PAGES_GET HIT_RATE YOUNG_MAKE_PER_THOUSAND_GETS NOT_YOUNG_MAKE_PER_THOUSAND_GETS NUMBER_PAGES_READ_AHEAD NUMBER_READ_AHEAD_EVICTED READ_AHEAD_RATE READ_AHEAD_EVICTED_RATE LRU_IO_TOTAL LRU_IO_CURRENT UNCOMPRESS_TOTAL UNCOMPRESS_CURRENT -# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # +POOL_ID POOL_SIZE FREE_BUFFERS DATABASE_PAGES OLD_DATABASE_PAGES MODIFIED_DATABASE_PAGES PENDING_DECOMPRESS PENDING_READS PENDING_FLUSH_LRU PENDING_FLUSH_LIST PAGES_MADE_YOUNG PAGES_NOT_MADE_YOUNG PAGES_MADE_YOUNG_RATE PAGES_MADE_NOT_YOUNG_RATE NUMBER_PAGES_READ NUMBER_PAGES_CREATED NUMBER_PAGES_WRITTEN PAGES_READ_RATE PAGES_CREATE_RATE PAGES_WRITTEN_RATE NUMBER_PAGES_GET HIT_RATE YOUNG_MAKE_PER_THOUSAND_GETS NOT_YOUNG_MAKE_PER_THOUSAND_GETS NUMBER_PAGES_READ_AHEAD NUMBER_READ_AHEAD_EVICTED READ_AHEAD_RATE READ_AHEAD_EVICTED_RATE LRU_IO_TOTAL LRU_IO_CURRENT UNCOMPRESS_TOTAL UNCOMPRESS_CURRENT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL +# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # CREATE TABLE infoschema_buffer_test (col1 INT) ENGINE = INNODB; INSERT INTO infoschema_buffer_test VALUES(9); SELECT * FROM INFORMATION_SCHEMA.INNODB_BUFFER_PAGE diff --git a/mysql-test/suite/innodb/r/innodb_status_variables.result b/mysql-test/suite/innodb/r/innodb_status_variables.result index c6f4d4f27c45a..cc3843f283aaa 100644 --- a/mysql-test/suite/innodb/r/innodb_status_variables.result +++ b/mysql-test/suite/innodb/r/innodb_status_variables.result @@ -41,8 +41,10 @@ INNODB_BUFFER_POOL_READ_AHEAD INNODB_BUFFER_POOL_READ_AHEAD_EVICTED INNODB_BUFFER_POOL_READ_REQUESTS INNODB_BUFFER_POOL_READS +INNODB_EXT_BUFFER_POOL_READS 
INNODB_BUFFER_POOL_WAIT_FREE INNODB_BUFFER_POOL_WRITE_REQUESTS +INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED INNODB_CHECKPOINT_AGE INNODB_CHECKPOINT_MAX_AGE INNODB_DATA_FSYNCS diff --git a/mysql-test/suite/innodb/t/ext_buf_pool.opt b/mysql-test/suite/innodb/t/ext_buf_pool.opt new file mode 100644 index 0000000000000..2178b845a2fb7 --- /dev/null +++ b/mysql-test/suite/innodb/t/ext_buf_pool.opt @@ -0,0 +1 @@ +--innodb-buffer-pool-size=21M --innodb-extended-buffer-pool-size=1M diff --git a/mysql-test/suite/innodb/t/ext_buf_pool.test b/mysql-test/suite/innodb/t/ext_buf_pool.test new file mode 100644 index 0000000000000..e172686cf179a --- /dev/null +++ b/mysql-test/suite/innodb/t/ext_buf_pool.test @@ -0,0 +1,167 @@ +--source include/have_innodb.inc +--source include/have_debug.inc +--source include/have_debug_sync.inc +--source include/count_sessions.inc +--source ../encryption/include/have_file_key_management_plugin.inc +#--source include/innodb_page_size.inc + +--let $encrypted_row_compressed=6 +--let $unencrypted_row_compressed=5 +--let $unencrypted_uncompressed=4 +--let $encrypted_uncompressed=3 +--let $unencrypted_page_compressed=2 +--let $encrypted_page_compressed=1 +--let $i = $encrypted_row_compressed + +--let $page_size=`SELECT @@GLOBAL.innodb_page_size` +if ($page_size != 16384) { + --let $i=$unencrypted_uncompressed +} + +--connect (prevent_purge,localhost,root) +START TRANSACTION WITH CONSISTENT SNAPSHOT; + +--connection default + +--let $DATADIR = `select @@datadir` + +--disable_query_log +--error 0,ER_UNKNOWN_SYSTEM_VARIABLE +SET @old_innodb_limit_optimistic_insert_debug = @@innodb_limit_optimistic_insert_debug; +SET @old_debug_dbug = @@debug_dbug; +--enable_query_log + +--error 0,ER_UNKNOWN_SYSTEM_VARIABLE +SET GLOBAL innodb_limit_optimistic_insert_debug = 3; +SET GLOBAL DEBUG_DBUG='+d,ib_ext_bp_count_io_only_for_t'; + +while($i) { + + SET GLOBAL DEBUG_DBUG='+d,ib_ext_bp_disable_LRU_eviction_for_t'; + if ($i == $unencrypted_uncompressed) { + --echo 
################################################################### + --echo # Testing for unencrypted uncompressed table # + --echo ################################################################### + CREATE TABLE t ( + `a` INT NOT NULL, + PRIMARY KEY (`a`) + ) ENGINE=InnoDB STATS_PERSISTENT=0; + } + if ($i == $encrypted_uncompressed) { + --echo ################################################################### + --echo # Testing for encrypted uncompressed table # + --echo ################################################################### + CREATE TABLE t ( + `a` INT NOT NULL, + PRIMARY KEY (`a`) + ) ENGINE=InnoDB STATS_PERSISTENT=0 encrypted=yes encryption_key_id=1; + } + if ($i == $unencrypted_page_compressed) { + --echo ################################################################### + --echo # Testing for unencrypted PAGE_COMPRESSED=1 table # + --echo ################################################################### + CREATE TABLE t ( + `a` INT NOT NULL, + PRIMARY KEY (`a`) + ) ENGINE=InnoDB STATS_PERSISTENT=0 PAGE_COMPRESSED=1; + } + if ($i == $unencrypted_row_compressed) { + --echo ################################################################### + --echo # Testing for unencrypted ROW_FORMAT=COMPRESSED table # + --echo ################################################################### + CREATE TABLE t ( + `a` INT NOT NULL, + PRIMARY KEY (`a`) + ) ENGINE=InnoDB STATS_PERSISTENT=0 ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=1; + } + if ($i == $encrypted_page_compressed) { + --echo ################################################################### + --echo # Testing for encrypted PAGE_COMPRESSED=1 table # + --echo ################################################################### + CREATE TABLE t ( + `a` INT NOT NULL, + PRIMARY KEY (`a`) + ) ENGINE=InnoDB STATS_PERSISTENT=0 PAGE_COMPRESSED=1 encrypted=yes + encryption_key_id=1; + } + if ($i == $encrypted_row_compressed) { + --echo ################################################################### + 
--echo # Testing for encrypted ROW_FORMAT=COMPRESSED table # + --echo ################################################################### + CREATE TABLE t ( + `a` INT NOT NULL, + PRIMARY KEY (`a`) + ) ENGINE=InnoDB STATS_PERSISTENT=0 ROW_FORMAT=COMPRESSED encrypted=yes + encryption_key_id=1; + } + + SELECT variable_value INTO @prev_flushed_gs + FROM information_schema.global_status + WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; + SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL INTO @prev_written_ps + FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; + SELECT variable_value INTO @prev_reads_gs + FROM information_schema.global_status + WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_READS'; + SELECT NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL INTO @prev_reads_ps + FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; + + --eval SET @start_val = $i*100 + INSERT INTO t SET a = @start_val+1; + INSERT INTO t SET a = @start_val+2; + INSERT INTO t SET a = @start_val+3; + INSERT INTO t SET a = @start_val+4; + INSERT INTO t SET a = @start_val+5; + INSERT INTO t SET a = @start_val+6; + INSERT INTO t SET a = @start_val+7; + INSERT INTO t SET a = @start_val+8; + INSERT INTO t SET a = @start_val+9; + INSERT INTO t SET a = @start_val+10; + INSERT INTO t SET a = @start_val+11; + INSERT INTO t SET a = @start_val+12; + + SET GLOBAL DEBUG_DBUG='-d,ib_ext_bp_disable_LRU_eviction_for_t'; + SET GLOBAL innodb_force_LRU_eviction = TRUE; + + let $wait_condition = + SELECT (variable_value-@prev_flushed_gs) >= 9 + FROM information_schema.global_status + WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; + --source include/wait_condition.inc + + SELECT variable_value-@prev_flushed_gs + FROM information_schema.global_status + WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; + SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps + FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; + SELECT variable_value-@prev_reads_gs + FROM 
information_schema.global_status + WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_READS'; + SELECT NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps + FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; + + SELECT * FROM t; + + SELECT variable_value-@prev_flushed_gs + FROM information_schema.global_status + WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; + SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps + FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; + SELECT variable_value-@prev_reads_gs + FROM information_schema.global_status + WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_READS'; + SELECT NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps + FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; + + DROP TABLE t; + --dec $i +} + +--disable_query_log +SET GLOBAL DEBUG_DBUG=@old_debug_dbug; +--error 0,ER_UNKNOWN_SYSTEM_VARIABLE +SET GLOBAL innodb_limit_optimistic_insert_debug = @old_innodb_limit_optimistic_insert_debug; +--enable_query_log + +--disconnect prevent_purge +--source include/wait_until_count_sessions.inc diff --git a/mysql-test/suite/innodb_i_s/innodb_buffer_pool_stats.result b/mysql-test/suite/innodb_i_s/innodb_buffer_pool_stats.result index f7fdd38f63111..c60caf02fb6be 100644 --- a/mysql-test/suite/innodb_i_s/innodb_buffer_pool_stats.result +++ b/mysql-test/suite/innodb_i_s/innodb_buffer_pool_stats.result @@ -32,5 +32,7 @@ INNODB_BUFFER_POOL_STATS CREATE TEMPORARY TABLE `INNODB_BUFFER_POOL_STATS` ( `LRU_IO_TOTAL` bigint(21) unsigned NOT NULL, `LRU_IO_CURRENT` bigint(21) unsigned NOT NULL, `UNCOMPRESS_TOTAL` bigint(21) unsigned NOT NULL, - `UNCOMPRESS_CURRENT` bigint(21) unsigned NOT NULL + `UNCOMPRESS_CURRENT` bigint(21) unsigned NOT NULL, + `NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL` bigint(21) unsigned NOT NULL, + `NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL` bigint(21) unsigned NOT NULL ) ENGINE=MEMORY DEFAULT CHARSET=utf8mb3 COLLATE=utf8mb3_general_ci diff --git 
a/mysql-test/suite/sys_vars/r/sysvars_innodb,32bit.rdiff b/mysql-test/suite/sys_vars/r/sysvars_innodb,32bit.rdiff index a2b062a586036..975409646c58e 100644 --- a/mysql-test/suite/sys_vars/r/sysvars_innodb,32bit.rdiff +++ b/mysql-test/suite/sys_vars/r/sysvars_innodb,32bit.rdiff @@ -1,6 +1,6 @@ --- a/mysql-test/suite/sys_vars/r/sysvars_innodb.result +++ b/mysql-test/suite/sys_vars/r/sysvars_innodb.result -@@ -51,7 +51,7 @@ +@@ -52,7 +52,7 @@ VARIABLE_TYPE INT UNSIGNED VARIABLE_COMMENT Number of adaptive hash table cells in each partition; 16381 at start defaults to being derived from innodb_buffer_pool_size NUMERIC_MIN_VALUE 16381 @@ -9,7 +9,7 @@ NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO -@@ -60,7 +60,7 @@ +@@ -61,7 +61,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 8 VARIABLE_SCOPE GLOBAL @@ -18,7 +18,7 @@ VARIABLE_COMMENT Number of InnoDB Adaptive Hash Index Partitions (default 8) NUMERIC_MIN_VALUE 1 NUMERIC_MAX_VALUE 512 -@@ -96,7 +96,7 @@ +@@ -97,7 +97,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 1 VARIABLE_SCOPE GLOBAL @@ -27,7 +27,7 @@ VARIABLE_COMMENT The AUTOINC lock modes supported by InnoDB: 0 => Old style AUTOINC locking (for backward compatibility); 1 => New style AUTOINC locking; 2 => No AUTOINC locking (unsafe for SBR) NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 2 -@@ -108,10 +108,10 @@ +@@ -109,10 +109,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 0 VARIABLE_SCOPE GLOBAL @@ -40,7 +40,7 @@ NUMERIC_BLOCK_SIZE 1048576 ENUM_VALUE_LIST NULL READ_ONLY YES -@@ -144,7 +144,7 @@ +@@ -145,7 +145,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 25 VARIABLE_SCOPE GLOBAL @@ -49,7 +49,7 @@ VARIABLE_COMMENT Dump only the hottest N% of each buffer pool, defaults to 25 NUMERIC_MIN_VALUE 1 NUMERIC_MAX_VALUE 100 -@@ -216,10 +216,10 @@ +@@ -217,10 +217,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 134217728 VARIABLE_SCOPE GLOBAL @@ -62,7 +62,7 @@ NUMERIC_BLOCK_SIZE 1048576 ENUM_VALUE_LIST NULL READ_ONLY NO -@@ -228,11 +228,11 @@ +@@ -229,11 +229,11 @@ SESSION_VALUE NULL DEFAULT_VALUE 0 VARIABLE_SCOPE 
GLOBAL @@ -77,7 +77,7 @@ ENUM_VALUE_LIST NULL READ_ONLY NO COMMAND_LINE_ARGUMENT REQUIRED -@@ -240,11 +240,11 @@ +@@ -241,11 +241,11 @@ SESSION_VALUE NULL DEFAULT_VALUE 0 VARIABLE_SCOPE GLOBAL @@ -92,7 +92,7 @@ ENUM_VALUE_LIST NULL READ_ONLY YES COMMAND_LINE_ARGUMENT REQUIRED -@@ -252,7 +252,7 @@ +@@ -253,7 +253,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 0 VARIABLE_SCOPE GLOBAL @@ -101,7 +101,7 @@ VARIABLE_COMMENT A number that tells how often buffer pool dump status in percentages should be printed. E.g. 10 means that buffer pool dump status is printed when every 10% of number of buffer pool pages are dumped. Default is 0 (only start and end status is printed) NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 100 -@@ -324,7 +324,7 @@ +@@ -325,7 +325,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 5 VARIABLE_SCOPE GLOBAL @@ -110,7 +110,7 @@ VARIABLE_COMMENT If the compression failure rate of a table is greater than this number more padding is added to the pages to reduce the failures. A value of zero implies no padding NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 100 -@@ -348,7 +348,7 @@ +@@ -349,7 +349,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 50 VARIABLE_SCOPE GLOBAL @@ -119,7 +119,20 @@ VARIABLE_COMMENT Percentage of empty space on a data page that can be reserved to make the page compressible NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 75 -@@ -588,7 +588,7 @@ +@@ -589,10 +589,10 @@ + SESSION_VALUE NULL + DEFAULT_VALUE 0 + VARIABLE_SCOPE GLOBAL +-VARIABLE_TYPE BIGINT UNSIGNED ++VARIABLE_TYPE INT UNSIGNED + VARIABLE_COMMENT The extended buffer pool file size + NUMERIC_MIN_VALUE 0 +-NUMERIC_MAX_VALUE 18446744073709551615 ++NUMERIC_MAX_VALUE 4294967295 + NUMERIC_BLOCK_SIZE 0 + ENUM_VALUE_LIST NULL + READ_ONLY NO +@@ -613,7 +613,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 600 VARIABLE_SCOPE GLOBAL @@ -128,7 +141,7 @@ VARIABLE_COMMENT Maximum number of seconds that semaphore times out in InnoDB NUMERIC_MIN_VALUE 1 NUMERIC_MAX_VALUE 4294967295 -@@ -636,7 +636,7 @@ +@@ -661,7 +661,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 
30 VARIABLE_SCOPE GLOBAL @@ -137,7 +150,7 @@ VARIABLE_COMMENT Number of iterations over which the background flushing is averaged NUMERIC_MIN_VALUE 1 NUMERIC_MAX_VALUE 1000 -@@ -660,7 +660,7 @@ +@@ -685,7 +685,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 1 VARIABLE_SCOPE GLOBAL @@ -146,7 +159,7 @@ VARIABLE_COMMENT Controls the durability/speed trade-off for commits. Set to 0 (write and flush redo log to disk only once per second), 1 (flush to disk at each commit), 2 (write to log at commit but flush to disk only once per second) or 3 (flush to disk at prepare and at commit, slower and usually redundant). 1 and 3 guarantees that after a crash, committed transactions will not be lost and will be consistent with the binlog and other transactional engines. 2 can get inconsistent and lose transactions if there is a power failure or kernel crash but not if mysqld crashes. 0 has no guarantees in case of crash. 0 and 2 can be faster than 1 or 3 NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 3 -@@ -684,7 +684,7 @@ +@@ -709,7 +709,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 1 VARIABLE_SCOPE GLOBAL @@ -155,7 +168,7 @@ VARIABLE_COMMENT Set to 0 (don't flush neighbors from buffer pool), 1 (flush contiguous neighbors from buffer pool) or 2 (flush neighbors from buffer pool), when flushing a block NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 2 -@@ -720,7 +720,7 @@ +@@ -745,7 +745,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 0 VARIABLE_SCOPE GLOBAL @@ -164,7 +177,7 @@ VARIABLE_COMMENT Helps to save your data in case the disk image of the database becomes corrupt. 
Value 5 can return bogus data, and 6 can permanently corrupt data NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 6 -@@ -744,10 +744,10 @@ +@@ -769,10 +769,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 8000000 VARIABLE_SCOPE GLOBAL @@ -177,7 +190,7 @@ NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO -@@ -780,7 +780,7 @@ +@@ -805,7 +805,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 84 VARIABLE_SCOPE GLOBAL @@ -186,16 +199,16 @@ VARIABLE_COMMENT InnoDB Fulltext search maximum token size in characters NUMERIC_MIN_VALUE 10 NUMERIC_MAX_VALUE 84 -@@ -792,7 +792,7 @@ +@@ -817,7 +817,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 3 VARIABLE_SCOPE GLOBAL -VARIABLE_TYPE BIGINT UNSIGNED +VARIABLE_TYPE INT UNSIGNED VARIABLE_COMMENT InnoDB Fulltext search minimum token size in characters - NUMERIC_MIN_VALUE 0 + NUMERIC_MIN_VALUE 1 NUMERIC_MAX_VALUE 16 -@@ -804,7 +804,7 @@ +@@ -829,7 +829,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 2000 VARIABLE_SCOPE GLOBAL @@ -204,7 +217,7 @@ VARIABLE_COMMENT InnoDB Fulltext search number of words to optimize for each optimize table call NUMERIC_MIN_VALUE 1000 NUMERIC_MAX_VALUE 10000 -@@ -816,10 +816,10 @@ +@@ -841,10 +841,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 2000000000 VARIABLE_SCOPE GLOBAL @@ -217,7 +230,7 @@ NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO -@@ -840,7 +840,7 @@ +@@ -865,7 +865,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 2 VARIABLE_SCOPE GLOBAL @@ -226,7 +239,7 @@ VARIABLE_COMMENT InnoDB Fulltext search parallel sort degree, will round up to nearest power of 2 number NUMERIC_MIN_VALUE 1 NUMERIC_MAX_VALUE 16 -@@ -852,10 +852,10 @@ +@@ -877,10 +877,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 640000000 VARIABLE_SCOPE GLOBAL @@ -239,7 +252,7 @@ NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO -@@ -900,7 +900,7 @@ +@@ -925,7 +925,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 200 VARIABLE_SCOPE GLOBAL @@ -248,7 +261,7 @@ VARIABLE_COMMENT Number of IOPs the server can do. 
Tunes the background IO rate NUMERIC_MIN_VALUE 100 NUMERIC_MAX_VALUE 4294967295 -@@ -912,7 +912,7 @@ +@@ -937,7 +937,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 4294967295 VARIABLE_SCOPE GLOBAL @@ -257,7 +270,7 @@ VARIABLE_COMMENT Limit to which innodb_io_capacity can be inflated NUMERIC_MIN_VALUE 100 NUMERIC_MAX_VALUE 4294967295 -@@ -1044,10 +1044,10 @@ +@@ -1069,10 +1069,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 32 VARIABLE_SCOPE GLOBAL @@ -270,7 +283,7 @@ NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO -@@ -1056,10 +1056,10 @@ +@@ -1081,10 +1081,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 1536 VARIABLE_SCOPE GLOBAL @@ -283,7 +296,7 @@ NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO -@@ -1092,10 +1092,10 @@ +@@ -1117,10 +1117,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 0 VARIABLE_SCOPE GLOBAL @@ -296,7 +309,7 @@ NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO -@@ -1104,7 +1104,7 @@ +@@ -1129,7 +1129,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 0 VARIABLE_SCOPE GLOBAL @@ -305,7 +318,7 @@ VARIABLE_COMMENT Maximum delay of user threads in micro-seconds NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 10000000 -@@ -1236,10 +1236,10 @@ +@@ -1261,10 +1261,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 0 VARIABLE_SCOPE GLOBAL @@ -318,7 +331,7 @@ NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY YES -@@ -1260,7 +1260,7 @@ +@@ -1285,7 +1285,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 16384 VARIABLE_SCOPE GLOBAL @@ -327,7 +340,7 @@ VARIABLE_COMMENT Page size to use for all InnoDB tablespaces NUMERIC_MIN_VALUE 4096 NUMERIC_MAX_VALUE 65536 -@@ -1296,7 +1296,7 @@ +@@ -1321,7 +1321,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 127 VARIABLE_SCOPE GLOBAL @@ -336,7 +349,7 @@ VARIABLE_COMMENT Number of UNDO log pages to purge in one batch from the history list NUMERIC_MIN_VALUE 1 NUMERIC_MAX_VALUE 5000 -@@ -1308,7 +1308,7 @@ +@@ -1333,7 +1333,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 128 VARIABLE_SCOPE GLOBAL @@ -345,7 +358,7 @@ VARIABLE_COMMENT Unused NUMERIC_MIN_VALUE 1 NUMERIC_MAX_VALUE 128 -@@ -1344,7 +1344,7 @@ +@@ 
-1369,7 +1369,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 56 VARIABLE_SCOPE GLOBAL @@ -354,7 +367,7 @@ VARIABLE_COMMENT Number of pages that must be accessed sequentially for InnoDB to trigger a readahead NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 64 -@@ -1428,7 +1428,7 @@ +@@ -1453,7 +1453,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 1048576 VARIABLE_SCOPE GLOBAL @@ -363,7 +376,7 @@ VARIABLE_COMMENT Memory buffer size for index creation NUMERIC_MIN_VALUE 65536 NUMERIC_MAX_VALUE 67108864 -@@ -1596,10 +1596,10 @@ +@@ -1621,10 +1621,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 30 VARIABLE_SCOPE GLOBAL diff --git a/mysql-test/suite/sys_vars/r/sysvars_innodb.result b/mysql-test/suite/sys_vars/r/sysvars_innodb.result index b9f1e8b68f60c..6cd87097f71bf 100644 --- a/mysql-test/suite/sys_vars/r/sysvars_innodb.result +++ b/mysql-test/suite/sys_vars/r/sysvars_innodb.result @@ -6,7 +6,8 @@ variable_name not in ( 'innodb_use_native_aio', # default value depends on OS 'innodb_log_file_buffering', # only available on Linux and Windows 'innodb_linux_aio', # existence depends on OS -'innodb_buffer_pool_load_pages_abort') # debug build only, and is only for testing +'innodb_buffer_pool_load_pages_abort', # debug build only, and is only for testing +'innodb_force_lru_eviction') # debug build only, and is only for testing order by variable_name; VARIABLE_NAME INNODB_ADAPTIVE_FLUSHING SESSION_VALUE NULL @@ -572,6 +573,30 @@ NUMERIC_BLOCK_SIZE NULL ENUM_VALUE_LIST OFF,ON READ_ONLY YES COMMAND_LINE_ARGUMENT OPTIONAL +VARIABLE_NAME INNODB_EXTENDED_BUFFER_POOL_PATH +SESSION_VALUE NULL +DEFAULT_VALUE +VARIABLE_SCOPE GLOBAL +VARIABLE_TYPE VARCHAR +VARIABLE_COMMENT Path to extended buffer pool file +NUMERIC_MIN_VALUE NULL +NUMERIC_MAX_VALUE NULL +NUMERIC_BLOCK_SIZE NULL +ENUM_VALUE_LIST NULL +READ_ONLY YES +COMMAND_LINE_ARGUMENT REQUIRED +VARIABLE_NAME INNODB_EXTENDED_BUFFER_POOL_SIZE +SESSION_VALUE NULL +DEFAULT_VALUE 0 +VARIABLE_SCOPE GLOBAL +VARIABLE_TYPE BIGINT UNSIGNED +VARIABLE_COMMENT The extended buffer 
pool file size +NUMERIC_MIN_VALUE 0 +NUMERIC_MAX_VALUE 18446744073709551615 +NUMERIC_BLOCK_SIZE 0 +ENUM_VALUE_LIST NULL +READ_ONLY NO +COMMAND_LINE_ARGUMENT REQUIRED VARIABLE_NAME INNODB_FAST_SHUTDOWN SESSION_VALUE NULL DEFAULT_VALUE 1 diff --git a/mysql-test/suite/sys_vars/t/sysvars_innodb.test b/mysql-test/suite/sys_vars/t/sysvars_innodb.test index 250eb8b5c8f1b..1d38d8244d66f 100644 --- a/mysql-test/suite/sys_vars/t/sysvars_innodb.test +++ b/mysql-test/suite/sys_vars/t/sysvars_innodb.test @@ -17,5 +17,6 @@ select VARIABLE_NAME, SESSION_VALUE, DEFAULT_VALUE, VARIABLE_SCOPE, VARIABLE_TYP 'innodb_use_native_aio', # default value depends on OS 'innodb_log_file_buffering', # only available on Linux and Windows 'innodb_linux_aio', # existence depends on OS - 'innodb_buffer_pool_load_pages_abort') # debug build only, and is only for testing + 'innodb_buffer_pool_load_pages_abort', # debug build only, and is only for testing + 'innodb_force_lru_eviction') # debug build only, and is only for testing order by variable_name; diff --git a/mysys/my_create.c b/mysys/my_create.c index 32cc73a53c476..93da3d0a06399 100644 --- a/mysys/my_create.c +++ b/mysys/my_create.c @@ -41,7 +41,7 @@ File my_create(const char *FileName, mode_t CreateFlags, int access_flags, DBUG_PRINT("my",("Name: '%s' CreateFlags: %u AccessFlags: %d MyFlags: %lu", FileName, CreateFlags, access_flags, MyFlags)); #if defined(_WIN32) - fd= my_win_open(FileName, access_flags | O_CREAT); + fd= my_win_open(FileName, access_flags | O_CREAT, MyFlags); #else fd= open((char *) FileName, access_flags | O_CREAT | O_CLOEXEC, CreateFlags ? 
CreateFlags : my_umask); diff --git a/mysys/my_open.c b/mysys/my_open.c index 182bb14927743..1d229530bf169 100644 --- a/mysys/my_open.c +++ b/mysys/my_open.c @@ -50,7 +50,7 @@ File my_open(const char *FileName, int Flags, myf MyFlags) if (!(MyFlags & (MY_WME | MY_FAE | MY_FFNF))) MyFlags|= my_global_flags; #if defined(_WIN32) - fd= my_win_open(FileName, Flags); + fd= my_win_open(FileName, Flags, MyFlags); #else if (MyFlags & MY_NOSYMLINKS) fd = open_nosymlinks(FileName, Flags | O_CLOEXEC, my_umask); diff --git a/mysys/my_winfile.c b/mysys/my_winfile.c index 7a1e3e60b12ef..f01b970331f62 100644 --- a/mysys/my_winfile.c +++ b/mysys/my_winfile.c @@ -166,13 +166,16 @@ LPSECURITY_ATTRIBUTES my_win_file_secattr() oflag operation flags shflag share flag pmode permission flags + MyFlags flags, used to open files, currently only MY_OPEN_FOR_ASYNC_IO + is used RETURN VALUE File descriptor of opened file if success -1 and sets errno if fails. */ -File my_win_sopen(const char *path, int oflag, int shflag, int pmode) +File my_win_sopen(const char *path, int oflag, int shflag, int pmode, + myf MyFlags) { int fh; /* handle of opened file */ int mask; @@ -285,6 +288,11 @@ File my_win_sopen(const char *path, int oflag, int shflag, int pmode) fileaccess|= DELETE; } + if (MyFlags & MY_OPEN_FOR_ASYNC_IO) + { + fileattrib|= FILE_FLAG_OVERLAPPED; + } + /* Set temporary file (delay-flush-to-disk) attribute if requested.*/ if (oflag & _O_SHORT_LIVED) fileattrib|= FILE_ATTRIBUTE_TEMPORARY; @@ -317,11 +325,11 @@ File my_win_sopen(const char *path, int oflag, int shflag, int pmode) } -File my_win_open(const char *path, int flags) +File my_win_open(const char *path, int flags, myf MyFlags) { DBUG_ENTER("my_win_open"); DBUG_RETURN(my_win_sopen((char *) path, flags | _O_BINARY, _SH_DENYNO, - _S_IREAD | S_IWRITE)); + _S_IREAD | S_IWRITE, MyFlags)); } diff --git a/mysys/mysys_priv.h b/mysys/mysys_priv.h index efeb0c65af3bb..8a3558f5150f0 100644 --- a/mysys/mysys_priv.h +++ b/mysys/mysys_priv.h @@ 
-182,7 +182,7 @@ static int PROTO { NOSYMLINK_FUNCTION_BODY(AT,NOAT) } #ifdef _WIN32 #include /* my_winfile.c exports, should not be used outside mysys */ -extern File my_win_open(const char *path, int oflag); +extern File my_win_open(const char *path, int oflag, myf MyFlags); extern int my_win_close(File fd); extern size_t my_win_read(File fd, uchar *buffer, size_t count); extern size_t my_win_write(File fd, const uchar *buffer, size_t count); @@ -200,7 +200,7 @@ extern int my_win_stat(const char *path, struct _stati64 *buf); extern int my_win_fstat(File fd, struct _stati64 *buf); extern int my_win_fsync(File fd); extern File my_win_dup(File fd); -extern File my_win_sopen(const char *path, int oflag, int shflag, int perm); +extern File my_win_sopen(const char *path, int oflag, int shflag, int perm, myf MyFlags); extern File my_open_osfhandle(HANDLE handle, int oflag); diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc index a22af4a07dfaa..c8499494dd6ff 100644 --- a/storage/innobase/buf/buf0buf.cc +++ b/storage/innobase/buf/buf0buf.cc @@ -1067,8 +1067,10 @@ inline void buf_pool_t::garbage_collect() noexcept size_in_bytes_requested= size; mysql_mutex_unlock(&mutex); mysql_mutex_lock(&flush_list_mutex); + ++done_flush_list_waiters_count; page_cleaner_wakeup(true); my_cond_wait(&done_flush_list, &flush_list_mutex.m_mutex); + --done_flush_list_waiters_count; mysql_mutex_unlock(&flush_list_mutex); # ifdef BTR_CUR_HASH_ADAPT bool ahi_disabled= btr_search.disable(); @@ -1285,6 +1287,34 @@ buf_block_t *buf_pool_t::allocate() noexcept return nullptr; } +ext_buf_page_t *buf_pool_t::alloc_ext_page(page_id_t page_id) noexcept +{ + mysql_mutex_assert_owner(&mutex); + ext_buf_page_t *p; + if ((p= UT_LIST_GET_FIRST(ext_free))) + UT_LIST_REMOVE(ext_free, p); + else if ((p= UT_LIST_GET_LAST(ext_LRU))) { + for (; p; p= UT_LIST_GET_PREV(ext_LRU_list, p)) { + hash_chain &hash_chain= page_hash.cell_get(p->id_.fold()); + page_hash_latch &hash_lock= 
page_hash.lock_get(hash_chain); + if (!hash_lock.try_lock()) + continue; + UT_LIST_REMOVE(ext_LRU, p); + page_hash.remove(hash_chain, reinterpret_cast(p)); + hash_lock.unlock(); + break; + } + if (!p) + return nullptr; + } + else + return nullptr; + p->id_= page_id; + ut_d(p->in_LRU_list= p->in_free_list= false); + ut_d(p->in_page_hash= true); + return p; +} + /** Create the hash table. @param n the lower bound of n_cells */ void buf_pool_t::page_hash_table::create(ulint n) noexcept @@ -1436,6 +1466,9 @@ bool buf_pool_t::create() noexcept n_blocks= get_n_blocks(actual_size); n_blocks_to_withdraw= 0; UT_LIST_INIT(free, &buf_page_t::list); + UT_LIST_INIT(ext_free, &ext_buf_page_t::free_list); + ut_d(force_LRU_eviction_to_ebp= 0); + const size_t ssize= srv_page_size_shift - UNIV_PAGE_SIZE_SHIFT_MIN; for (char *extent= memory, @@ -1459,6 +1492,19 @@ bool buf_pool_t::create() noexcept } } + size_t ext_buf_pages_array_size= extended_pages * sizeof(ext_buf_page_t); + ext_buf_pages_array= static_cast( + my_malloc(PSI_NOT_INSTRUMENTED, ext_buf_pages_array_size, MYF(0))); + UT_LIST_INIT(ext_free, &ext_buf_page_t::free_list); + for (ext_buf_page_t *page= ext_buf_pages_array, + *end= ext_buf_pages_array + extended_pages; + page != end; ++page) { + page->frame= reinterpret_cast(ext_buf_page_t::EXT_BUF_FRAME); + ut_d(page->in_free_list= true); + ut_d(page->in_LRU_list= page->in_free_list= false); + UT_LIST_ADD_LAST(ext_free, page); + } + #if defined(__aarch64__) mysql_mutex_init(buf_pool_mutex_key, &mutex, MY_MUTEX_INIT_FAST); #else @@ -1467,6 +1513,7 @@ bool buf_pool_t::create() noexcept UT_LIST_INIT(withdrawn, &buf_page_t::list); UT_LIST_INIT(LRU, &buf_page_t::LRU); + UT_LIST_INIT(ext_LRU, &ext_buf_page_t::ext_LRU_list); UT_LIST_INIT(flush_list, &buf_page_t::list); UT_LIST_INIT(unzip_LRU, &buf_block_t::unzip_LRU); @@ -1594,6 +1641,8 @@ void buf_pool_t::close() noexcept memory_unaligned= nullptr; } + my_free(ext_buf_pages_array); + pthread_cond_destroy(&done_flush_LRU); 
pthread_cond_destroy(&done_flush_list); pthread_cond_destroy(&do_flush_list); @@ -1877,8 +1926,10 @@ ATTRIBUTE_COLD buf_pool_t::shrink_status buf_pool_t::shrink(size_t size) try_LRU_scan= false; mysql_mutex_unlock(&mutex); + ++done_flush_list_waiters_count; page_cleaner_wakeup(true); my_cond_wait(&done_flush_list, &flush_list_mutex.m_mutex); + --done_flush_list_waiters_count; mysql_mutex_unlock(&flush_list_mutex); mysql_mutex_lock(&mutex); @@ -2095,8 +2146,10 @@ ATTRIBUTE_COLD void buf_pool_t::resize(size_t size, THD *thd) noexcept mysql_mutex_unlock(&mutex); DEBUG_SYNC_C("buf_pool_shrink_before_wakeup"); mysql_mutex_lock(&flush_list_mutex); + ++done_flush_list_waiters_count; page_cleaner_wakeup(true); my_cond_wait(&done_flush_list, &flush_list_mutex.m_mutex); + --done_flush_list_waiters_count; mysql_mutex_unlock(&flush_list_mutex); #ifdef BTR_CUR_HASH_ADAPT ahi_disabled= btr_search.disable(); @@ -3126,7 +3179,7 @@ buf_pool_t::page_hash_table::append(buf_pool_t::hash_chain &chain, *prev= bpage; } -inline void +void buf_pool_t::page_hash_table::replace(buf_pool_t::hash_chain &chain, buf_page_t *old, buf_page_t *bpage) noexcept @@ -3158,7 +3211,20 @@ static buf_block_t *buf_page_create_low(page_id_t page_id, ulint zip_size, retry: mysql_mutex_lock(&buf_pool.mutex); - buf_page_t *bpage= buf_pool.page_hash.get(page_id, chain); + buf_page_t *bpage= buf_pool.page_hash.get(page_id, chain); + + if (bpage && bpage->external()) { + page_hash_latch &hash_lock= buf_pool.page_hash.lock_get(chain); + hash_lock.lock(); + buf_pool.page_hash.remove(chain, bpage); + hash_lock.unlock(); + ut_ad(!bpage->in_page_hash); + ext_buf_page_t *ext_buf_page= + reinterpret_cast(bpage); + buf_pool.remove_ext_page_from_LRU(*ext_buf_page); + buf_pool.free_ext_page(*ext_buf_page); + bpage= nullptr; + } if (bpage) { @@ -4046,6 +4112,8 @@ void buf_pool_t::get_info(buf_pool_info_t *pool_info) noexcept double(stat.n_pages_read - old_stat.n_pages_read) / elapsed; pool_info->pages_created_rate= 
double(stat.n_pages_created - old_stat.n_pages_created) / elapsed; + pool_info->n_pages_read_from_ebp= stat.n_pages_read_from_ebp; + pool_info->n_pages_written_to_ebp= stat.n_pages_written_to_ebp; pool_info->pages_written_rate= double(stat.n_pages_written - old_stat.n_pages_written) / elapsed; pool_info->n_page_get_delta= pool_info->n_page_gets - diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc index d8b08b4254a87..0c862d9f81467 100644 --- a/storage/innobase/buf/buf0dblwr.cc +++ b/storage/innobase/buf/buf0dblwr.cc @@ -34,6 +34,7 @@ Created 2011/12/19 #include "fil0crypt.h" #include "fil0pagecompress.h" #include "log.h" +#include "scope.h" using st_::span; @@ -86,7 +87,10 @@ bool buf_dblwr_t::create() noexcept { if (is_created()) return true; - + /* Disable external buffer pool flushing for the duration of double write + buffer creation, as double write pages will be removed from LRU */ + ++buf_pool.done_flush_list_waiters_count; + SCOPE_EXIT([]() { --buf_pool.done_flush_list_waiters_count; }); mtr_t mtr{nullptr}; const ulint size= block_size; @@ -680,7 +684,7 @@ bool buf_dblwr_t::flush_buffered_writes(const ulint size) noexcept #ifdef UNIV_DEBUG for (ulint len2= 0, i= 0; i < old_first_free; len2 += srv_page_size, i++) { - buf_page_t *bpage= flush_slot->buf_block_arr[i].request.bpage; + buf_page_t *bpage= flush_slot->buf_block_arr[i].request.bpage(); if (bpage->zip.data) /* No simple validate for ROW_FORMAT=COMPRESSED pages exists. */ @@ -716,7 +720,7 @@ static void *get_frame(const IORequest &request) noexcept { if (request.slot) return request.slot->out_buf; - const buf_page_t *bpage= request.bpage; + const buf_page_t *bpage= request.bpage(); return bpage->zip.data ?
bpage->zip.data : bpage->frame; } @@ -726,8 +730,8 @@ void buf_dblwr_t::flush_buffered_writes_completed(const IORequest &request) ut_ad(this == &buf_dblwr); ut_ad(is_created()); ut_ad(!srv_read_only_mode); - ut_ad(!request.bpage); - ut_ad(request.node == fil_system.sys_space->chain.start); + ut_ad(!request.bpage()); + ut_ad(request.node() == fil_system.sys_space->chain.start); ut_ad(request.type == IORequest::DBLWR_BATCH); mysql_mutex_lock(&mutex); ut_ad(batch_running); @@ -753,14 +757,14 @@ void buf_dblwr_t::flush_buffered_writes_completed(const IORequest &request) log_checkpoint(). Writes to the system tablespace should be rare, except when executing DDL or using the non-default settings innodb_file_per_table=OFF or innodb_undo_tablespaces=0. */ - os_file_flush(request.node->handle); + os_file_flush(request.node()->handle); /* The writes have been flushed to disk now and in recovery we will find them in the doublewrite buffer blocks. Next, write the data pages. */ for (ulint i= 0, first_free= flush_slot->first_free; i < first_free; i++) { auto e= flush_slot->buf_block_arr[i]; - buf_page_t* bpage= e.request.bpage; + buf_page_t* bpage= e.request.bpage(); ut_ad(bpage->in_file()); void *frame= get_frame(e.request); @@ -785,10 +789,10 @@ void buf_dblwr_t::flush_buffered_writes_completed(const IORequest &request) ut_ad(lsn); ut_ad(lsn >= bpage->oldest_modification()); log_write_up_to(lsn, true); - ut_ad(!e.request.node->space->full_crc32() || + ut_ad(!e.request.node()->space->full_crc32() || !buf_page_is_corrupted(true, static_cast(frame), - e.request.node->space->flags)); - e.request.node->space->io(e.request, bpage->physical_offset(), e_size, + e.request.node()->space->flags)); + e.request.node()->space->io(e.request, bpage->physical_offset(), e_size, frame, bpage); } } @@ -820,13 +824,13 @@ flush_buffered_writes() will be invoked to make space. 
@param size payload size in bytes */ void buf_dblwr_t::add_to_batch(const IORequest &request, size_t size) noexcept { - ut_ad(request.bpage); - ut_ad(request.bpage->in_file()); - ut_ad(request.node); - ut_ad(!request.node->space->is_temporary()); - ut_ad(!request.node->space->is_being_imported()); - ut_ad(request.node->space->id == request.bpage->id().space()); - ut_ad(request.node->space->referenced()); + ut_ad(request.bpage()); + ut_ad(request.bpage()->in_file()); + ut_ad(request.node()); + ut_ad(!request.node()->space->is_temporary()); + ut_ad(!request.node()->space->is_being_imported()); + ut_ad(request.node()->space->id == request.bpage()->id().space()); + ut_ad(request.node()->space->referenced()); ut_ad(!srv_read_only_mode); const ulint buf_size= 2 * block_size; @@ -854,7 +858,7 @@ void buf_dblwr_t::add_to_batch(const IORequest &request, size_t size) noexcept are integer multiples of 256, so the above can translate into simple SIMD instructions. Currently, we make no such assumptions about the non-pointer parameters that are passed to the _aligned templates. */ - ut_ad(!request.bpage->zip_size() || request.bpage->zip_size() == size); + ut_ad(!request.bpage()->zip_size() || request.bpage()->zip_size() == size); ut_ad(active_slot->reserved == active_slot->first_free); ut_ad(active_slot->reserved < buf_size); new (active_slot->buf_block_arr + active_slot->first_free++) diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index 2f10c3ded1215..8df9ba5e26ff3 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -43,6 +43,7 @@ Created 11/11/1995 Heikki Tuuri #include "fil0pagecompress.h" #include "lzo/lzo1x.h" #include "snappy-c.h" +#include "scope.h" /** Number of pages flushed via LRU. Protected by buf_pool.mutex. Also included in buf_pool.stat.n_pages_written. 
*/ @@ -276,28 +277,29 @@ buf_flush_relocate_on_flush_list( ut_d(buf_flush_validate_low()); } -void buf_page_t::write_complete(bool persistent, bool error, uint32_t state) - noexcept +void buf_page_t::write_complete(space_type type, bool error, + uint32_t state) noexcept { - ut_ad(!persistent == fsp_is_system_temporary(id().space())); + ut_ad(type == EXT_BUF || + (type == TEMPORARY) == fsp_is_system_temporary(id().space())); ut_ad(state >= WRITE_FIX); ut_ad(!frame || frame == reinterpret_cast(this)->frame_address()); if (UNIV_LIKELY(!error)) { + bool persistent = (type == PERSISTENT); ut_d(lsn_t om= oldest_modification()); - ut_ad(om >= 2); + ut_ad(type == EXT_BUF || om >= 2); ut_ad(persistent == (om > 2)); + ut_ad(type != EXT_BUF || !oldest_modification()); /* We use release memory order to guarantee that callers of oldest_modification_acquire() will observe the block as being detached from buf_pool.flush_list, after reading the value 0. */ - oldest_modification_.store(persistent, std::memory_order_release); + if (type != EXT_BUF) + oldest_modification_.store(persistent, std::memory_order_release); } - zip.fix.fetch_sub((state >= WRITE_FIX_REINIT) - ? 
(WRITE_FIX_REINIT - UNFIXED) - : (WRITE_FIX - UNFIXED)); - lock.u_unlock(true); + write_complete_release(state); } inline void buf_pool_t::n_flush_inc() noexcept @@ -321,14 +323,14 @@ void buf_page_write_complete(const IORequest &request, bool error) noexcept { ut_ad(request.is_write()); ut_ad(!srv_read_only_mode); - buf_page_t *bpage= request.bpage; + buf_page_t *bpage= request.bpage(); ut_ad(bpage); const auto state= bpage->state(); /* io-fix can only be cleared by buf_page_t::write_complete() and buf_page_t::read_complete() */ ut_ad(state >= buf_page_t::WRITE_FIX); ut_ad(!buf_dblwr.is_inside(bpage->id())); - ut_ad(request.node->space->id == bpage->id().space()); + ut_ad(request.ext_buf() || request.node()->space->id == bpage->id().space()); if (request.slot) request.slot->release(); @@ -341,24 +343,53 @@ void buf_page_write_complete(const IORequest &request, bool error) noexcept mysql_mutex_assert_not_owner(&buf_pool.mutex); mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex); - const bool persistent= bpage->oldest_modification() != 2; + buf_page_t::space_type type= request.ext_buf() + ? buf_page_t::EXT_BUF + : static_cast( + bpage->oldest_modification() == 2); - if (UNIV_UNLIKELY(!persistent) && UNIV_LIKELY(!error)) + if (UNIV_UNLIKELY(type != buf_page_t::PERSISTENT) && UNIV_LIKELY(!error)) { + if (type == buf_page_t::EXT_BUF) + { + ut_d(if (DBUG_IF("ib_ext_bp_count_io_only_for_t")) { + if (fil_space_t *space= fil_space_t::get(bpage->id_.space())) + { + auto space_name= space->name(); + if (fil_page_get_type(bpage->frame) == FIL_PAGE_INDEX && + space_name.data() && + !strncmp(space_name.data(), "test/t.ibd", space_name.size())) + { + ++buf_pool.stat.n_pages_written_to_ebp; + } + space->release(); + } + } else) + ++buf_pool.stat.n_pages_written_to_ebp; + } /* We must hold buf_pool.mutex while releasing the block, so that no other thread can access it before we have freed it. 
*/ mysql_mutex_lock(&buf_pool.mutex); - bpage->write_complete(persistent, error, state); - buf_LRU_free_page(bpage, true); + bpage->write_complete(type, error, state); + buf_LRU_free_page(bpage, true, + request.ext_buf() ? request.ext_buf_page() : nullptr); mysql_mutex_unlock(&buf_pool.mutex); } else { - bpage->write_complete(persistent, error, state); + if (error && type == buf_page_t::EXT_BUF && + fil_system.ext_buf_pool_enabled()) + { + sql_print_warning("InnoDB: There was IO error during writing to " + "external buffer pool file, external buffer pool is " + "disabled."); + fil_system.ext_buf_pool_disable(); + } + bpage->write_complete(type, error, state); if (request.is_doublewritten()) { ut_ad(state < buf_page_t::WRITE_FIX_REINIT); - ut_ad(persistent); + ut_ad(type == buf_page_t::PERSISTENT); buf_dblwr.write_completed(); } } @@ -728,8 +759,9 @@ ATTRIBUTE_COLD void buf_pool_t::release_freed_page(buf_page_t *bpage) noexcept /** Write a flushable page to a file or free a freeable block. @param space tablespace +@param to_ext_buf whether to write the page to external buffer pool file @return whether a page write was initiated and buf_pool.mutex released */ -bool buf_page_t::flush(fil_space_t *space) noexcept +bool buf_page_t::flush(fil_space_t *space, bool to_ext_buf) noexcept { mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex); ut_ad(in_file()); @@ -742,13 +774,15 @@ bool buf_page_t::flush(fil_space_t *space) noexcept const lsn_t lsn= mach_read_from_8(my_assume_aligned<8> (FIL_PAGE_LSN + (zip.data ? zip.data : frame))); - ut_ad(lsn - ? lsn >= oldest_modification() || oldest_modification() == 2 - : (space->is_temporary() || space->is_being_imported())); + ut_ad(to_ext_buf || + (lsn ?
lsn >= oldest_modification() || oldest_modification() == 2 + : (space->is_temporary() || space->is_being_imported()))); if (s < UNFIXED) { ut_a(s >= FREED); + if (to_ext_buf) + return false; if (!space->is_temporary() && !space->is_being_imported()) { freed: @@ -767,15 +801,21 @@ bool buf_page_t::flush(fil_space_t *space) noexcept { ut_ad(!space->is_temporary()); ut_ad(!space->is_being_imported()); + if (to_ext_buf) + return false; goto freed; } + ext_buf_page_t *ext_page= nullptr; + if (to_ext_buf && !(ext_page= buf_pool.alloc_ext_page(id()))) + return false; + ut_d(const auto f=) zip.fix.fetch_add(WRITE_FIX - UNFIXED); ut_ad(f >= UNFIXED); ut_ad(f < READ_FIX); - ut_ad((space == fil_system.temp_space) + ut_ad(to_ext_buf || ((space == fil_system.temp_space) ? oldest_modification() == 2 - : oldest_modification() > 2); + : oldest_modification() > 2)); /* Increment the I/O operation count used for selecting LRU policy. */ buf_LRU_stat_inc_io(); @@ -790,7 +830,8 @@ bool buf_page_t::flush(fil_space_t *space) noexcept buf_block_t *block= reinterpret_cast(this); page_t *write_frame= zip.data; - space->reacquire(); + if (!to_ext_buf) + space->reacquire(); size_t size; #if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32 size_t orig_size; @@ -848,7 +889,12 @@ bool buf_page_t::flush(fil_space_t *space) noexcept write_frame= page; } - if ((s & LRU_MASK) == REINIT || !space->use_doublewrite()) + if (to_ext_buf) { + ut_ad(ext_page); + fil_system.ext_bp_io(*this, *ext_page, IORequest::WRITE_ASYNC, slot, size, + write_frame); + } + else if ((s & LRU_MASK) == REINIT || !space->use_doublewrite()) { if (!space->is_temporary() && !space->is_being_imported() && lsn > log_sys.get_flushed_lsn()) @@ -1260,9 +1306,11 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n, const size_t buf_lru_min_len= std::min((buf_pool.usable_size()) / 20 - 1, size_t{BUF_LRU_MIN_LEN}); + ulint free_or_flush= 0; for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.LRU); bpage && 
- ((UT_LIST_GET_LEN(buf_pool.LRU) > buf_lru_min_len && + (ut_d(buf_pool.force_LRU_eviction_to_ebp ||) + (UT_LIST_GET_LEN(buf_pool.LRU) > buf_lru_min_len && UT_LIST_GET_LEN(buf_pool.free) < free_limit) || to_withdraw || recv_recovery_is_on()); @@ -1274,6 +1322,7 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n, ut_ad(state >= buf_page_t::FREED); ut_ad(bpage->in_LRU_list); + bool flush_to_ebp= false; if (!bpage->oldest_modification()) { evict: @@ -1282,8 +1331,41 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n, continue; if (UNIV_UNLIKELY(to_withdraw != 0)) to_withdraw= buf_flush_LRU_to_withdraw(to_withdraw, *bpage); - buf_LRU_free_page(bpage, true); - ++n->evicted; + DBUG_EXECUTE_IF( + "ib_ext_bp_disable_LRU_eviction_for_t", + if (fil_space_t *space= fil_space_t::get(bpage->id_.space())) { + SCOPE_EXIT([space]() { space->release(); }); + auto space_name= space->name(); + if (space_name.data() && + !strncmp(space_name.data(), "test/t.ibd", space_name.size())) + continue; + }); + DBUG_EXECUTE_IF( + "ib_ext_bp_count_io_only_for_t", + if (fil_space_t *space= fil_space_t::get(bpage->id_.space())) { + SCOPE_EXIT([space]() { space->release(); }); + auto space_name= space->name(); + if (!space_name.data() || + strncmp(space_name.data(), "test/t.ibd", space_name.size())) + goto free_page;; + }); + // FIXME: currently every second page is flushed, consider more + // suitable algorithm there + if (!recv_recovery_is_on() && state != buf_page_t::FREED && + fil_system.ext_bp_size && !buf_pool.done_flush_list_waiters_count && + (ut_d(buf_pool.force_LRU_eviction_to_ebp ||)((++free_or_flush) & 1))) + { + flush_to_ebp= true; + goto flush_to_ebp; + } + else + { + #if !defined(DBUG_OFF) + free_page: + #endif + buf_LRU_free_page(bpage, true); + ++n->evicted; + } if (UNIV_LIKELY(scanned & 31)) continue; mysql_mutex_unlock(&buf_pool.mutex); @@ -1292,68 +1374,79 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n, continue; } 
- if (state < buf_page_t::READ_FIX && bpage->lock.u_lock_try(true)) +flush_to_ebp: + if ((flush_to_ebp || state < buf_page_t::READ_FIX) && + bpage->lock.u_lock_try(true)) + { + ut_ad(!bpage->is_io_fixed()); + switch (bpage->oldest_modification()) { - ut_ad(!bpage->is_io_fixed()); - switch (bpage->oldest_modification()) { - case 2: - /* LRU flushing will always evict pages of the temporary tablespace, - in buf_page_write_complete(). */ - ++n->evicted; - break; - case 1: - mysql_mutex_lock(&buf_pool.flush_list_mutex); - if (ut_d(lsn_t lsn=) bpage->oldest_modification()) - { - ut_ad(lsn == 1); /* It must be clean while we hold bpage->lock */ - buf_pool.delete_from_flush_list(bpage); - } - mysql_mutex_unlock(&buf_pool.flush_list_mutex); - /* fall through */ - case 0: + case 2: + /* LRU flushing will always evict pages of the temporary tablespace, + in buf_page_write_complete(). */ + ++n->evicted; + break; + case 1: + mysql_mutex_lock(&buf_pool.flush_list_mutex); + if (ut_d(lsn_t lsn=) bpage->oldest_modification()) + { + ut_ad(lsn == 1); /* It must be clean while we hold bpage->lock */ + buf_pool.delete_from_flush_list(bpage); + } + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + /* fall through */ + case 0: + if (!flush_to_ebp) + { bpage->lock.u_unlock(true); goto evict; } - /* Block is ready for flush. Dispatch an IO request. */ - const page_id_t page_id(bpage->id()); - const uint32_t space_id= page_id.space(); - if (!space || space->id != space_id) + } + /* Block is ready for flush. Dispatch an IO request. 
*/ + const page_id_t page_id(bpage->id()); + const uint32_t space_id= page_id.space(); + if (!space || space->id != space_id) + { + if (last_space_id != space_id) { - if (last_space_id != space_id) + buf_pool.lru_hp.set(bpage); + mysql_mutex_unlock(&buf_pool.mutex); + if (space) + space->release(); + auto p= buf_flush_space(space_id); + space= p.first; + last_space_id= space_id; + if (!space) { - buf_pool.lru_hp.set(bpage); - mysql_mutex_unlock(&buf_pool.mutex); - if (space) - space->release(); - auto p= buf_flush_space(space_id); - space= p.first; - last_space_id= space_id; - if (!space) - { - mysql_mutex_lock(&buf_pool.mutex); - goto no_space; - } mysql_mutex_lock(&buf_pool.mutex); - buf_pool.stat.n_pages_written+= p.second; - } - else - { - ut_ad(!space); goto no_space; } + mysql_mutex_lock(&buf_pool.mutex); + buf_pool.stat.n_pages_written+= p.second; } - else if (space->is_stopping_writes()) + else { - space->release(); - space= nullptr; - no_space: + ut_ad(!space); + goto no_space; + } + } + else if (space->is_stopping_writes()) + { + space->release(); + space= nullptr; + no_space: + if (flush_to_ebp && !bpage->oldest_modification()) { + bpage->lock.u_unlock(true); + buf_LRU_free_page(bpage, true); + } else { mysql_mutex_lock(&buf_pool.flush_list_mutex); buf_flush_discard_page(bpage); - ++n->evicted; - continue; } + ++n->evicted; + continue; + } - if (state < buf_page_t::UNFIXED) + if (!flush_to_ebp && state < buf_page_t::UNFIXED) goto flush; if (n->flushed >= max && !recv_recovery_is_on()) @@ -1362,13 +1455,24 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n, break; } - if (neighbors && space->is_rotational() && UNIV_LIKELY(!to_withdraw) && - /* Skip neighbourhood flush from LRU list if we haven't yet reached - half of the free page target. 
*/ - UT_LIST_GET_LEN(buf_pool.free) * 2 >= free_limit) + if (flush_to_ebp) + { + /* The page latch will be released in io callback */ + if (!bpage->flush(space, true)) + { + buf_LRU_free_page(bpage, true); + bpage->lock.u_unlock(true); + ++n->evicted; + continue; + } + } + else if (neighbors && space->is_rotational() && + UNIV_LIKELY(!to_withdraw) && + /* Skip neighbourhood flush from LRU list if we haven't yet + reached half of the free page target. */ + UT_LIST_GET_LEN(buf_pool.free) * 2 >= free_limit) n->flushed+= buf_flush_try_neighbors(space, page_id, bpage, - neighbors == 1, - n->flushed, max); + neighbors == 1, n->flushed, max); else { flush: @@ -2095,9 +2199,11 @@ static void buf_flush_wait(lsn_t lsn) noexcept { buf_flush_sync_lsn= lsn; buf_pool.page_cleaner_set_idle(false); + ++buf_pool.done_flush_list_waiters_count; pthread_cond_signal(&buf_pool.do_flush_list); my_cond_wait(&buf_pool.done_flush_list, &buf_pool.flush_list_mutex.m_mutex); + --buf_pool.done_flush_list_waiters_count; oldest_lsn= buf_pool.get_oldest_modification(lsn); if (oldest_lsn >= lsn) break; @@ -2509,7 +2615,7 @@ bool buf_pool_t::need_LRU_eviction() const noexcept { /* try_LRU_scan==false means that buf_LRU_get_free_block() is waiting for buf_flush_page_cleaner() to evict some blocks */ - return UNIV_UNLIKELY(!try_LRU_scan || + return UNIV_UNLIKELY(ut_d(force_LRU_eviction_to_ebp ||) !try_LRU_scan || (UT_LIST_GET_LEN(LRU) > BUF_LRU_MIN_LEN && UT_LIST_GET_LEN(free) < LRU_scan_depth / 2)); } @@ -2874,11 +2980,13 @@ void buf_flush_sync() noexcept { log_sys.latch.wr_unlock(); mysql_mutex_lock(&buf_pool.flush_list_mutex); + ++buf_pool.done_flush_list_waiters_count; buf_flush_wait(lsn); /* Wait for the page cleaner to be idle (for log resizing at startup) */ while (buf_flush_sync_lsn) my_cond_wait(&buf_pool.done_flush_list, &buf_pool.flush_list_mutex.m_mutex); + --buf_pool.done_flush_list_waiters_count; mysql_mutex_unlock(&buf_pool.flush_list_mutex); log_sys.latch.wr_lock(SRW_LOCK_CALL); lsn_t 
new_lsn= log_sys.get_lsn(); diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc index 95e2a6cbe0972..419cd320bba54 100644 --- a/storage/innobase/buf/buf0lru.cc +++ b/storage/innobase/buf/buf0lru.cc @@ -105,10 +105,11 @@ uint buf_LRU_old_threshold_ms; If !bpage->frame && bpage->oldest_modification() <= 1, the object will be freed. -@param bpage buffer block -@param id page identifier -@param chain locked buf_pool.page_hash chain (will be released here) -@param zip whether bpage->zip of BUF_BLOCK_FILE_PAGE should be freed +@param bpage buffer block +@param id page identifier +@param chain locked buf_pool.page_hash chain (will be released here) +@param zip whether bpage->zip of BUF_BLOCK_FILE_PAGE should be freed +@param ext_buf_page external buffer page to replace bpage in page hash If a compressed page is freed other compressed pages may be relocated. @retval true if bpage with bpage->frame was removed from page_hash. The @@ -117,7 +118,8 @@ caller needs to free the page to the free list this case the block is already returned to the buddy allocator. */ static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id, buf_pool_t::hash_chain &chain, - bool zip); + bool zip, + ext_buf_page_t *ext_buf_page= nullptr); /** Free a block to buf_pool */ static void buf_LRU_block_free_hashed_page(buf_block_t *block) @@ -736,7 +738,8 @@ The caller must hold buf_pool.mutex. 
@param zip whether to remove both copies of a ROW_FORMAT=COMPRESSED page @retval true if freed and buf_pool.mutex may have been temporarily released @retval false if the page was not freed */ -bool buf_LRU_free_page(buf_page_t *bpage, bool zip) +bool buf_LRU_free_page(buf_page_t *bpage, bool zip, + ext_buf_page_t *ext_buf_page) { const page_id_t id{bpage->id()}; buf_page_t* b = nullptr; @@ -820,7 +823,7 @@ bool buf_LRU_free_page(buf_page_t *bpage, bool zip) ut_ad(bpage->can_relocate()); - if (!buf_LRU_block_remove_hashed(bpage, id, chain, zip)) { + if (!buf_LRU_block_remove_hashed(bpage, id, chain, zip, ext_buf_page)) { ut_ad(!b); mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex); return(true); @@ -993,7 +996,7 @@ ATTRIBUTE_COLD void buf_pool_t::free_block(buf_block_t *block) noexcept mysql_mutex_unlock(&mutex); } -inline void +void buf_pool_t::page_hash_table::remove(buf_pool_t::hash_chain &chain, buf_page_t *bpage) noexcept { @@ -1019,6 +1022,7 @@ If !bpage->frame && !bpage->oldest_modification(), the object will be freed. @param id page identifier @param chain locked buf_pool.page_hash chain (will be released here) @param zip whether bpage->zip of BUF_BLOCK_FILE_PAGE should be freed +@param ext_buf_page external buffer page to replace bpage in page hash If a compressed page is freed other compressed pages may be relocated. @retval true if BUF_BLOCK_FILE_PAGE was removed from page_hash. The @@ -1027,7 +1031,8 @@ caller needs to free the page to the free list this case the block is already returned to the buddy allocator. 
*/ static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id, buf_pool_t::hash_chain &chain, - bool zip) + bool zip, + ext_buf_page_t *ext_buf_page) { ut_a(bpage->can_relocate()); ut_ad(buf_pool.page_hash.lock_get(chain).is_write_locked()); @@ -1091,7 +1096,16 @@ static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id, MEM_CHECK_ADDRESSABLE(bpage->zip.data, bpage->zip_size()); } - buf_pool.page_hash.remove(chain, bpage); + if (ext_buf_page) { + buf_pool.push_ext_page_to_LRU(*ext_buf_page); + ut_ad(ext_buf_page->id_ == bpage->id()); + ext_buf_page->hash= bpage->hash; + buf_pool.page_hash.replace(chain, bpage, + reinterpret_cast(ext_buf_page)); + } + else + buf_pool.page_hash.remove(chain, bpage); + page_hash_latch& hash_lock = buf_pool.page_hash.lock_get(chain); if (UNIV_UNLIKELY(!bpage->frame)) { @@ -1140,10 +1154,12 @@ static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id, } /** Release and evict a corrupted page. -@param bpage x-latched page that was found corrupted -@param state expected current state of the page */ +@param bpage x-latched page that was found corrupted +@param state expected current state of the page +@param set_corrupt_id true to call bpage->set_corrupt_id() */ ATTRIBUTE_COLD -void buf_pool_t::corrupted_evict(buf_page_t *bpage, uint32_t state) noexcept +void buf_pool_t::corrupted_evict(buf_page_t *bpage, uint32_t state, + bool set_corrupt_id) noexcept { const page_id_t id{bpage->id()}; buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id.fold()); @@ -1153,7 +1169,8 @@ void buf_pool_t::corrupted_evict(buf_page_t *bpage, uint32_t state) noexcept hash_lock.lock(); ut_ad(!bpage->oldest_modification()); - bpage->set_corrupt_id(); + if (set_corrupt_id) + bpage->set_corrupt_id(); auto unfix= state - buf_page_t::FREED; auto s= bpage->zip.fix.fetch_sub(unfix) - unfix; bpage->lock.x_unlock(true); diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc index 
59bb3c5c2327e..b59a2eab817f1 100644 --- a/storage/innobase/buf/buf0rea.cc +++ b/storage/innobase/buf/buf0rea.cc @@ -57,6 +57,15 @@ read-ahead is not done: this is to prevent flooding the buffer pool with i/o-fixed buffer blocks */ #define BUF_READ_AHEAD_PEND_LIMIT 2 +/** The result returned by buf_page_init_for_read() */ +struct page_init_result { +/* page_init_result() : bpage(nullptr), ext_buf_page(nullptr) {} */ + bool in_ext_buffer_pool() const noexcept { return ext_buf_page; } + buf_page_t* bpage; /* Initialized page */ + ext_buf_page_t *ext_buf_page; /* External buffer pool page if bpage can be + read from the external buffer pool file */ +}; + /** Initialize a page for read to the buffer buf_pool. If the page is (1) already in buf_pool, or (2) if the tablespace has been or is being deleted, @@ -69,39 +78,59 @@ and the lock released later. bitwise-ORed with 1 in recovery @param chain buf_pool.page_hash cell for page_id @param block preallocated buffer block (set to nullptr if consumed) -@return pointer to the block -@retval nullptr in case of an error -@retval pointer to block | 1 if the page already exists in buf_pool */ -static buf_page_t *buf_page_init_for_read(const page_id_t page_id, - ulint zip_size, - buf_pool_t::hash_chain &chain, - buf_block_t *&block) noexcept +@retval page_init_result::bpage == nullptr in case of an error, + otherwise + page_init_result::bpage points to the initialized page and + the lowest bit of page_init_result::bpage is set only if the + page already exists in buf_pool, + or + page_init_result::ext_buf_page points to the external buffer pool + page if page_init_result::bpage can be read from the external + buffer pool file */ +static page_init_result buf_page_init_for_read(const page_id_t page_id, + ulint zip_size, + buf_pool_t::hash_chain &chain, + buf_block_t *&block) noexcept { buf_page_t *bpage= !zip_size || (zip_size & 1) ?
&block->page : nullptr; + ext_buf_page_t *ext_buf_page= nullptr; constexpr uint32_t READ_BUF_FIX{buf_page_t::READ_FIX + 1}; page_hash_latch &hash_lock= buf_pool.page_hash.lock_get(chain); hash_lock.lock(); - buf_page_t *hash_page= buf_pool.page_hash.get(page_id, chain); + buf_page_t *hash_page= buf_pool.page_hash.get(page_id, chain); if (hash_page) { - page_exists: - /* The page is already in the buffer pool. */ - ut_d(const uint32_t state=) hash_page->fix(); - ut_ad(state >= buf_page_t::FREED); - hash_lock.unlock(); - return reinterpret_cast(uintptr_t(hash_page) | 1); + if (hash_page->external()) + ext_buf_page= reinterpret_cast(hash_page); + else + { + page_exists: + /* The page is already in the buffer pool. */ + ut_d(const uint32_t state=) hash_page->fix(); + ut_ad(state >= buf_page_t::FREED); + hash_lock.unlock(); + return page_init_result{ + reinterpret_cast(uintptr_t(hash_page) | 1), nullptr}; + } } if (UNIV_UNLIKELY(mysql_mutex_trylock(&buf_pool.mutex))) { hash_lock.unlock(); + /* ext_buf_page can be set previously, and should be zeroed out to prevent + wrong value usage afterwards */ + ext_buf_page= nullptr; mysql_mutex_lock(&buf_pool.mutex); hash_lock.lock(); - hash_page= buf_pool.page_hash.get(page_id, chain); + hash_page= buf_pool.page_hash.get(page_id, chain); if (hash_page) { - mysql_mutex_unlock(&buf_pool.mutex); - goto page_exists; + if (hash_page->external()) + ext_buf_page= reinterpret_cast(hash_page); + else { + mysql_mutex_unlock(&buf_pool.mutex); + goto page_exists; + } } } @@ -116,7 +145,18 @@ static buf_page_t *buf_page_init_for_read(const page_id_t page_id, in buf_page_t::read_complete() by the io-handler thread. 
*/ bpage->lock.x_lock(true); /* Insert into the hash table of file pages */ - buf_pool.page_hash.append(chain, bpage); + if (ext_buf_page) + { + bpage->hash= ext_buf_page->hash; + ut_d(bpage->in_page_hash= true); + buf_pool.page_hash.replace( + chain, reinterpret_cast(ext_buf_page), + bpage); + ut_ad(!ext_buf_page->in_page_hash); + buf_pool.remove_ext_page_from_LRU(*ext_buf_page); + } + else + buf_pool.page_hash.append(chain, bpage); hash_lock.unlock(); /* The block must be put to the LRU list, to the old blocks */ @@ -135,7 +175,8 @@ static buf_page_t *buf_page_init_for_read(const page_id_t page_id, we have to add this block to unzip_LRU after block->page.zip.data is set. */ ut_ad(bpage->belongs_to_unzip_LRU()); - buf_unzip_LRU_add_block(reinterpret_cast(bpage), TRUE); + buf_unzip_LRU_add_block(reinterpret_cast(bpage), + TRUE); } } else @@ -153,20 +194,29 @@ static buf_page_t *buf_page_init_for_read(const page_id_t page_id, check the page_hash again, as it may have been modified. */ if (UNIV_UNLIKELY(lru)) { - hash_page= buf_pool.page_hash.get(page_id, chain); + hash_page= buf_pool.page_hash.get(page_id, chain); if (UNIV_LIKELY_NULL(hash_page)) { - /* The block was added by some other thread. */ - ut_d(const uint32_t state=) hash_page->fix(); - ut_ad(state >= buf_page_t::FREED); - buf_buddy_free(data, zip_size); - mysql_mutex_unlock(&buf_pool.mutex); - return reinterpret_cast(uintptr_t(hash_page) | 1); + if (UNIV_UNLIKELY(hash_page->external())) + ext_buf_page= reinterpret_cast(hash_page); + else + { + /* The block was added by some other thread. */ + ut_d(const uint32_t state=) hash_page->fix(); + ut_ad(state >= buf_page_t::FREED); + buf_buddy_free(data, zip_size); + mysql_mutex_unlock(&buf_pool.mutex); + return page_init_result{ + reinterpret_cast(uintptr_t(hash_page) | 1), + nullptr}; + } } } - bpage= static_cast(ut_zalloc_nokey(sizeof *bpage)); + bpage= + static_cast(ut_zalloc_nokey(sizeof *bpage)); + // TODO: do we need to init it for compressed pages? 
I think no. page_zip_des_init(&bpage->zip); page_zip_set_size(&bpage->zip, zip_size); bpage->zip.data = (page_zip_t*) data; @@ -179,19 +229,39 @@ static buf_page_t *buf_page_init_for_read(const page_id_t page_id, bpage->init(READ_BUF_FIX, page_id); bpage->lock.x_lock(true); - hash_lock.lock(); - buf_pool.page_hash.append(chain, bpage); - hash_lock.unlock(); + + if (ext_buf_page) { + bpage->hash= ext_buf_page->hash; + ut_d(bpage->in_page_hash= true); + hash_lock.lock(); + buf_pool.page_hash.replace( + chain, reinterpret_cast(ext_buf_page), bpage); + hash_lock.unlock(); + ut_ad(!ext_buf_page->in_page_hash); + buf_pool.remove_ext_page_from_LRU(*ext_buf_page); + } + else + { + hash_lock.lock(); + buf_pool.page_hash.append(chain, bpage); + hash_lock.unlock(); + } /* The block must be put to the LRU list, to the old blocks. The zip size is already set into the page zip */ buf_LRU_add_block(bpage, true/* to old blocks */); } - buf_pool.stat.n_pages_read++; + if (!ext_buf_page) + buf_pool.stat.n_pages_read++; ut_ad(!bpage || bpage->in_file()); + if (ext_buf_page && !fil_system.ext_buf_pool_enabled()) + { + buf_pool.free_ext_page(*ext_buf_page); + ext_buf_page= nullptr; + } mysql_mutex_unlock(&buf_pool.mutex); - return bpage; + return page_init_result{bpage, ext_buf_page}; } inline ulonglong mariadb_measure() noexcept @@ -273,14 +343,20 @@ buf_read_page_low( return nullptr; } - buf_page_t *bpage= buf_page_init_for_read(page_id, zip_size, chain, block); + auto init_page_result= + buf_page_init_for_read(page_id, zip_size, chain, block); + buf_page_t *bpage= init_page_result.bpage; if (UNIV_UNLIKELY(!bpage)) + { + ut_ad(!init_page_result.ext_buf_page); goto fail; + } const bool exist(uintptr_t(bpage) & 1); bpage= reinterpret_cast(uintptr_t(bpage) & ~uintptr_t{1}); trx_t *const trx= thd ? thd_to_trx(thd) : nullptr; if (exist) { + ut_ad(!init_page_result.ext_buf_page); if (!err) { bpage->unfix(); @@ -320,16 +396,28 @@ buf_read_page_low( void* dst= zip_size > 1 ? 
bpage->zip.data : bpage->frame; const size_t len= zip_size & ~1 ? zip_size & ~1 : srv_page_size; - + /* Synchronous read */ if (err != nullptr) { thd_wait_begin(thd, THD_WAIT_DISKIO); ha_handler_stats *const stats= trx ? trx->active_handler_stats : nullptr; const ulonglong start= stats ? mariadb_measure() : 0; - auto fio= space->io(IORequest(IORequest::READ_SYNC), + auto fio= + init_page_result.in_ext_buffer_pool() + ? fil_io_t{fil_system.ext_bp_io( + *bpage, *init_page_result.ext_buf_page, + IORequest::READ_SYNC, nullptr, len, dst), + nullptr} + : space->io(IORequest(IORequest::READ_SYNC), os_offset_t{page_id.page_no()} * len, len, dst, bpage); *err= fio.err; thd_wait_end(thd); + if (init_page_result.in_ext_buffer_pool()) + { + mysql_mutex_lock(&buf_pool.mutex); + buf_pool.free_ext_page(*init_page_result.ext_buf_page); + mysql_mutex_unlock(&buf_pool.mutex); + } if (stats) { stats->pages_read_count++; @@ -338,22 +426,44 @@ buf_read_page_low( } if (UNIV_LIKELY(*err == DB_SUCCESS)) { - *err= bpage->read_complete(*fio.node, recv_sys.recovery_on); + *err= bpage->read_complete(init_page_result.in_ext_buffer_pool() + ? *UT_LIST_GET_FIRST(space->chain) + : *fio.node, + recv_sys.recovery_on); if (*err) bpage= nullptr; space->release(); - + if (init_page_result.in_ext_buffer_pool()) + { + ut_d(if (DBUG_IF("ib_ext_bp_count_io_only_for_t")) { + auto space_name= space->name(); + if (fil_page_get_type(bpage->frame) == FIL_PAGE_INDEX && + space_name.data() && + !strncmp(space_name.data(), "test/t.ibd", space_name.size())) + ++buf_pool.stat.n_pages_read_from_ebp; + } else)++ buf_pool.stat.n_pages_read_from_ebp; + } /* FIXME: Remove this, and accumulate stats->pages_read_count to global statistics somewhere! 
*/ buf_LRU_stat_inc_io(); return bpage; } } + else if (init_page_result.in_ext_buffer_pool()) + { + auto err= fil_system.ext_bp_io(*bpage, *init_page_result.ext_buf_page, + IORequest::READ_ASYNC, nullptr, len, dst); + space->release(); + if (UNIV_LIKELY(DB_SUCCESS == err)) + return reinterpret_cast(-1); + } else if (UNIV_LIKELY(DB_SUCCESS == - space->io(IORequest(IORequest::READ_ASYNC), - os_offset_t{page_id.page_no()} * len, len, - dst, bpage).err)) - return reinterpret_cast(-1); + space + ->io(IORequest(IORequest::READ_ASYNC), + os_offset_t{page_id.page_no()} * len, len, dst, + bpage) + .err)) + return reinterpret_cast(-1); recv_sys.free_corrupted_page(page_id, *space->chain.start); buf_pool.corrupted_evict(bpage, buf_page_t::READ_FIX + 1); @@ -791,7 +901,10 @@ void buf_read_recover(fil_space_t *space, const page_id_t page_id, if (init_lsn) { - buf_page_t *bpage= buf_page_init_for_read(page_id, zip_size, chain, block); + auto init_page_result= + buf_page_init_for_read(page_id, zip_size, chain, block); + ut_ad(!init_page_result.ext_buf_page); + buf_page_t *bpage= init_page_result.bpage; if (UNIV_UNLIKELY(!bpage)) goto fail; const bool exist(uintptr_t(bpage) & 1); diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index f6f2331f6d258..9d9c82a77b08a 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -59,6 +59,9 @@ Created 10/25/1995 Heikki Tuuri #include "bzlib.h" #include "snappy-c.h" +/* External buffer pool file name */ +const char *ext_bp_file_name= "ext_buffer_pool"; + ATTRIBUTE_COLD bool fil_space_t::set_corrupted() const noexcept { if (!is_stopping() && !is_corrupted.test_and_set()) @@ -497,7 +500,8 @@ pfs_os_file_t fil_node_t::detach() noexcept void fil_node_t::prepare_to_close_or_detach() noexcept { mysql_mutex_assert_owner(&fil_system.mutex); - ut_ad(space->is_ready_to_close() || srv_operation == SRV_OPERATION_BACKUP || + ut_ad(space->is_ready_to_close() || + srv_operation == 
SRV_OPERATION_BACKUP || srv_operation == SRV_OPERATION_RESTORE_DELTA); ut_a(is_open()); ut_a(!being_extended); @@ -1295,6 +1299,16 @@ void fil_system_t::close() noexcept if (is_initialised()) { + if (ext_bp_file != OS_FILE_CLOSED) + { + if (srv_thread_pool) + srv_thread_pool->unbind(ext_bp_file.m_file); + int res= mysql_file_close( + IF_WIN(my_win_handle2File((os_file_t) ext_bp_file), ext_bp_file), + MYF(MY_WME)); + ut_a(res != -1); + ext_bp_file= OS_FILE_CLOSED; + } spaces.free(); mysql_mutex_destroy(&mutex); fil_space_crypt_cleanup(); @@ -2917,18 +2931,77 @@ fil_io_t fil_space_t::io(const IORequest &type, os_offset_t offset, size_t len, return {err, node}; } +bool fil_system_t::create_ext_file() noexcept { + bool ret; + ext_bp_file= pfs_create_temp_file( + ext_bp_path ? ext_bp_path : fil_path_to_mysql_datadir, + "/Extended buffer pool file", "ext_buf_", true); + if (ext_bp_file == OS_FILE_CLOSED) + { + sql_print_error("Cannot open/create extended buffer pool file"); + /* Report OS error in error log */ + (void)os_file_get_last_error(true, false); + return false; + } + ret= os_file_set_size(ext_bp_file_name, ext_bp_file.m_file, ext_bp_size); + if (!ret) + { + os_file_close_func(ext_bp_file.m_file); + sql_print_error("Cannot set extended buffer pool file size to %zum", + ext_bp_size); + return false; + } + if (srv_thread_pool && srv_thread_pool->bind(ext_bp_file.m_file) != 0) + { + sql_print_error("Cannot set async io for extended buffer pool file"); + /* Report OS error in error log */ + (void) os_file_get_last_error(true, false); + return false; + } + return true; +} + +dberr_t fil_system_t::ext_bp_io(buf_page_t &bpage, ext_buf_page_t &ext_page, + IORequest::Type io_request_type, + buf_tmp_buffer_t *slot, size_t len, + void *buf) noexcept +{ + ut_ad(len % 512 == 0); /* page_compressed */ + ut_ad(io_request_type == IORequest::WRITE_ASYNC || + io_request_type == IORequest::READ_SYNC || + io_request_type == IORequest::READ_ASYNC) ; + /* Queue the aio request */ + 
return os_aio(IORequest{&bpage, slot, &ext_page, io_request_type}, buf, + buf_pool.ext_page_offset(ext_page), len, ext_bp_file, + ext_bp_file_name); +} + #include void IORequest::write_complete(int io_error) const noexcept { ut_ad(fil_validate_skip()); - ut_ad(node); - fil_space_t *space= node->space; + ut_ad(node_ptr); + buf_page_t *buf_page= bpage(); ut_ad(is_write()); - if (!bpage) + fil_space_t *space; + if (ext_buf()) + { + space= fil_space_t::get(buf_page->id().space()); + if (!space) + { + buf_page->write_complete_release(buf_page->state()); + return; + } + } + else + space= node_ptr->space; + + if (!buf_page) { ut_ad(!srv_read_only_mode); + ut_ad(!ext_buf()); if (type == IORequest::DBLWR_BATCH) { buf_dblwr.flush_buffered_writes_completed(*this); @@ -2942,30 +3015,65 @@ void IORequest::write_complete(int io_error) const noexcept else buf_page_write_complete(*this, io_error); - space->complete_write(); + if (!ext_buf()) + space->complete_write(); func_exit: space->release(); } void IORequest::read_complete(int io_error) const noexcept { + buf_page_t *buf_page= bpage(); ut_ad(fil_validate_skip()); - ut_ad(node); + ut_ad(node_ptr); ut_ad(is_read()); - ut_ad(bpage); - ut_d(auto s= bpage->state()); + ut_ad(bpage()); + ut_d(auto s= bpage()->state()); ut_ad(s > buf_page_t::READ_FIX); ut_ad(s <= buf_page_t::WRITE_FIX); - const page_id_t id(bpage->id()); + fil_space_t *space; + if (ext_buf()) { + ut_ad(ext_buf_page()->id_ == buf_page->id()); + mysql_mutex_lock(&buf_pool.mutex); + buf_pool.free_ext_page(*ext_buf_page()); + mysql_mutex_unlock(&buf_pool.mutex); + /* The space will be released at the end of this function */ + space= fil_space_t::get(buf_page->id().space()); + if (!space) { + buf_pool.corrupted_evict(buf_page, buf_page_t::READ_FIX + 1, false); + ++buf_pool.stat.n_pages_read_from_ebp; + return; + } + ut_d(if (DBUG_IF("ib_ext_bp_count_io_only_for_t")) { + auto space_name= space->name(); + if (fil_page_get_type(buf_page->frame) == FIL_PAGE_INDEX && + 
space_name.data() && + !strncmp(space_name.data(), "test/t.ibd", space_name.size())) + { + ++buf_pool.stat.n_pages_read_from_ebp; + } + } else) + ++buf_pool.stat.n_pages_read_from_ebp; + } + else + space= node_ptr->space; + + const page_id_t id(buf_page->id()); const bool in_recovery{recv_sys.recovery_on}; if (UNIV_UNLIKELY(io_error != 0)) { sql_print_error("InnoDB: Read error %d of page " UINT32PF " in file %s", - io_error, id.page_no(), node->name); - recv_sys.free_corrupted_page(id, *node); - buf_pool.corrupted_evict(bpage, buf_page_t::READ_FIX + 1); + io_error, id.page_no(), + ext_buf() ? "of external buffer pool, external buffer " + "pool is disabled" + : node_ptr->name); + if (ext_buf()) + fil_system.ext_buf_pool_disable(); + else + recv_sys.free_corrupted_page(id, *node_ptr); + buf_pool.corrupted_evict(buf_page, buf_page_t::READ_FIX + 1); corrupted: if (in_recovery && !srv_force_recovery) { @@ -2974,12 +3082,14 @@ void IORequest::read_complete(int io_error) const noexcept mysql_mutex_unlock(&recv_sys.mutex); } } - else if (bpage->read_complete(*node, in_recovery)) + else if (bpage()->read_complete(ext_buf() ? 
*UT_LIST_GET_FIRST(space->chain) + : *node_ptr, + in_recovery)) goto corrupted; else - bpage->unfix(); + bpage()->unfix(); - node->space->release(); + space->release(); } /** Flush to disk the writes in file spaces of the given type @@ -3347,3 +3457,45 @@ fil_space_t *fil_space_t::prev_in_unflushed_spaces() noexcept } #endif + +pfs_os_file_t pfs_create_temp_file(const char *path, const char *label, + const char *prefix, bool async_io) +{ + if (!path) + { + path= mysql_tmpdir; + } +#ifdef UNIV_PFS_IO + /* This temp file open does not go through normal + file APIs, add instrumentation to register with + performance schema */ + struct PSI_file_locker *locker; + PSI_file_locker_state state; + char *name= + static_cast(ut_malloc_nokey(strlen(path) + strlen(label) + 1)); + strcpy(name, path); + strcat(name, label); + + register_pfs_file_open_begin(&state, locker, innodb_temp_file_key, + PSI_FILE_CREATE, path ? name : label, __FILE__, + __LINE__); + +#endif + DBUG_ASSERT(strlen(path) + 2 <= FN_REFLEN); + char filename[FN_REFLEN]; + File f= create_temp_file( + filename, path, prefix, O_BINARY | O_SEQUENTIAL, + MYF(MY_WME | MY_TEMPORARY | (async_io ? MY_OPEN_FOR_ASYNC_IO : 0))); + pfs_os_file_t fd= IF_WIN((os_file_t) my_get_osfhandle(f), f); + +#ifdef UNIV_PFS_IO + register_pfs_file_open_end(locker, fd, (fd == OS_FILE_CLOSED) ? 
NULL : &fd); + ut_free(name); +#endif + + if (fd == OS_FILE_CLOSED) + { + ib::error() << "Cannot create temporary merge file"; + } + return (fd); +} diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index bb325313d8b57..0a332f628e290 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -963,8 +963,12 @@ static SHOW_VAR innodb_status_variables[]= { &buf_pool.stat.n_ra_pages_evicted, SHOW_SIZE_T}, {"buffer_pool_read_requests", &buf_pool.stat.n_page_gets, SHOW_SIZE_T}, {"buffer_pool_reads", &buf_pool.stat.n_pages_read, SHOW_SIZE_T}, + {"ext_buffer_pool_reads", &buf_pool.stat.n_pages_read_from_ebp, + SHOW_SIZE_T}, {"buffer_pool_wait_free", &buf_pool.stat.LRU_waits, SHOW_SIZE_T}, {"buffer_pool_write_requests", &buf_pool.flush_list_requests, SHOW_SIZE_T}, + {"ext_buffer_pool_pages_flushed", &buf_pool.stat.n_pages_written_to_ebp, + SHOW_SIZE_T}, {"checkpoint_age", &export_vars.innodb_checkpoint_age, SHOW_SIZE_T}, {"checkpoint_max_age", &export_vars.innodb_checkpoint_max_age, SHOW_SIZE_T}, {"data_fsyncs", (size_t*) &os_n_fsyncs, SHOW_SIZE_T}, @@ -3651,6 +3655,40 @@ static void innodb_buffer_pool_size_update(THD* thd,st_mysql_sys_var*,void*, buf_pool.resize(*static_cast(save), thd); } +static void innodb_extended_buffer_pool_size_update(THD *thd, + st_mysql_sys_var *, void *, + const void *save) +{ + buf_pool.extended_pages= + (*static_cast(save) >> srv_page_size_shift); + fil_system.ext_bp_size= buf_pool.extended_pages << srv_page_size_shift; +} + +#ifdef UNIV_DEBUG +static void innodb_force_LRU_eviction_set(THD *, st_mysql_sys_var *, void *, + const void *save) +{ + buf_pool.force_LRU_eviction_to_ebp= *static_cast(save); + if (buf_pool.force_LRU_eviction_to_ebp) + { + /* Wake up page cleaner twice, the first one is to flush dirty pages to + data files, the second one is to flush clean pages to external buffer pool. 
+ */ + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_pool.page_cleaner_wakeup(true); + my_cond_wait(&buf_pool.done_flush_list, + &buf_pool.flush_list_mutex.m_mutex); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_pool.page_cleaner_wakeup(true); + my_cond_wait(&buf_pool.done_flush_list, + &buf_pool.flush_list_mutex.m_mutex); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + } + buf_pool.force_LRU_eviction_to_ebp= false; +} +#endif /* UNIV_DEBUG */ + static MYSQL_SYSVAR_SIZE_T(buffer_pool_size, buf_pool.size_in_bytes_requested, PLUGIN_VAR_RQCMDARG, "The size of the memory buffer InnoDB uses to cache data" @@ -3689,6 +3727,24 @@ static MYSQL_SYSVAR_UINT(log_write_ahead_size, log_sys.write_size, "Redo log write size to avoid read-on-write; must be a power of two", nullptr, nullptr, 512, 512, 4096, 1); +static MYSQL_SYSVAR_SIZE_T(extended_buffer_pool_size, fil_system.ext_bp_size, + PLUGIN_VAR_RQCMDARG, + "The extended buffer pool file size", + nullptr, innodb_extended_buffer_pool_size_update, + // TODO: set correct min and max values here. 
+ 0, 0, SIZE_T_MAX, 0); + +static MYSQL_SYSVAR_STR(extended_buffer_pool_path, fil_system.ext_bp_path, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Path to extended buffer pool file", + nullptr, nullptr, nullptr); + +#ifdef UNIV_DEBUG +static MYSQL_SYSVAR_BOOL(force_LRU_eviction, buf_pool.force_LRU_eviction_to_ebp, + PLUGIN_VAR_OPCMDARG, + "Wake up page cleaner and wait for pages flushing end, used for testing only", + NULL, innodb_force_LRU_eviction_set, FALSE); +#endif #ifdef BTR_CUR_HASH_ADAPT static void innodb_adaptive_hash_index_update(THD*, st_mysql_sys_var*, void*, @@ -3821,6 +3877,8 @@ static int innodb_init_params() buf_pool.size_in_bytes_max; #endif + buf_pool.extended_pages = fil_system.ext_bp_size >> srv_page_size_shift; + if (innodb_buffer_pool_size < min) { sql_print_error("InnoDB: innodb_page_size=%lu requires " @@ -18650,9 +18708,11 @@ static void innodb_log_file_size_update(THD *thd, st_mysql_sys_var*, lsn_t resizing= log_sys.resize_in_progress(); if (resizing > buf_pool.get_oldest_modification(0)) { + ++buf_pool.done_flush_list_waiters_count; buf_pool.page_cleaner_wakeup(true); my_cond_timedwait(&buf_pool.done_flush_list, &buf_pool.flush_list_mutex.m_mutex, &abstime); + --buf_pool.done_flush_list_waiters_count; resizing= log_sys.resize_in_progress(); } mysql_mutex_unlock(&buf_pool.flush_list_mutex); @@ -19881,6 +19941,11 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(buffer_pool_size_auto_min), #endif MYSQL_SYSVAR(buffer_pool_size_max), + MYSQL_SYSVAR(extended_buffer_pool_size), + MYSQL_SYSVAR(extended_buffer_pool_path), +#ifdef UNIV_DEBUG + MYSQL_SYSVAR(force_LRU_eviction), +#endif MYSQL_SYSVAR(buffer_pool_chunk_size), MYSQL_SYSVAR(buffer_pool_filename), MYSQL_SYSVAR(buffer_pool_dump_now), diff --git a/storage/innobase/handler/i_s.cc b/storage/innobase/handler/i_s.cc index 5eed69d1f72e5..105619630534c 100644 --- a/storage/innobase/handler/i_s.cc +++ b/storage/innobase/handler/i_s.cc @@ -3361,6 +3361,12 @@ static 
ST_FIELD_INFO i_s_innodb_buffer_stats_fields_info[]= #define IDX_BUF_STATS_UNZIP_CUR 31 Column("UNCOMPRESS_CURRENT", ULonglong(), NOT_NULL), +#define IDX_BUF_STATS_PAGE_WRITTEN_TO_EBP 32 + Column("NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL",ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_PAGE_READ_FROM_EBP 33 + Column("NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL",ULonglong(), NOT_NULL), + CEnd() }; } // namespace Show @@ -3428,12 +3434,18 @@ static int i_s_innodb_stats_fill(THD *thd, TABLE_LIST * tables, Item *) OK(fields[IDX_BUF_STATS_PAGE_READ]->store(info.n_pages_read, true)); + OK(fields[IDX_BUF_STATS_PAGE_READ_FROM_EBP]->store( + info.n_pages_read_from_ebp, true)); + OK(fields[IDX_BUF_STATS_PAGE_CREATED]->store( info.n_pages_created, true)); OK(fields[IDX_BUF_STATS_PAGE_WRITTEN]->store( info.n_pages_written, true)); + OK(fields[IDX_BUF_STATS_PAGE_WRITTEN_TO_EBP]->store( + info.n_pages_written_to_ebp, true)); + OK(fields[IDX_BUF_STATS_GET]->store(info.n_page_gets, true)); OK(fields[IDX_BUF_STATS_PAGE_READ_RATE]->store( diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index 231df868166f8..fec01096ff236 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -85,8 +85,12 @@ struct buf_pool_info_t ulint n_pages_made_young; /*!< number of pages made young */ ulint n_pages_not_made_young; /*!< number of pages not made young */ ulint n_pages_read; /*!< buf_pool.n_pages_read */ + /** buf_pool.n_pages_read_from_ebp */ + ulint n_pages_read_from_ebp; ulint n_pages_created; /*!< buf_pool.n_pages_created */ ulint n_pages_written; /*!< buf_pool.n_pages_written */ + /** buf_pool.n_pages_written_to_ebp */ + ulint n_pages_written_to_ebp; ulint n_page_gets; /*!< buf_pool.n_page_gets */ ulint n_ra_pages_read_rnd; /*!< buf_pool.n_ra_pages_read_rnd, number of pages readahead */ @@ -457,15 +461,11 @@ for compressed and uncompressed frames */ class buf_pool_t; -class buf_page_t +struct buf_page_base_t { - 
friend buf_pool_t; - friend buf_block_t; - - /** @name General fields */ - /* @{ */ - -public: // FIXME: fix fil_iterate() + /** ext_buf_page_t identifier */ + static constexpr std::uintptr_t EXT_BUF_FRAME{1}; + // FIXME: fix fil_iterate() /** Page id. Protected by buf_pool.page_hash.lock_get() when the page is in buf_pool.page_hash. */ page_id_t id_; @@ -483,7 +483,57 @@ class buf_page_t uint16_t free_offset; }; }; -private: + /** pointer to aligned, uncompressed page frame of innodb_page_size */ + byte *frame; +#ifdef UNIV_DEBUG + /** whether this->LRU is in buf_pool.LRU (in_file()); + protected by buf_pool.mutex */ + bool in_LRU_list; + /** whether this is in buf_pool.page_hash (in_file()); + protected by buf_pool.mutex */ + bool in_page_hash; + /** whether this->list is in buf_pool.free (state() == NOT_USED); + protected by buf_pool.flush_list_mutex */ + bool in_free_list; +#endif /* UNIV_DEBUG */ + buf_page_base_t() : id_{0} {} + buf_page_base_t(const buf_page_base_t &b) + : id_(b.id_), hash(b.hash), frame(b.frame) +#ifdef UNIV_DEBUG + , + in_LRU_list(b.in_LRU_list), in_page_hash(b.in_page_hash), + in_free_list(b.in_free_list) +#endif /* UNIV_DEBUG */ + { + } + + bool external() const noexcept + { + /* TODO: we could just compare the address of the page, as it is done for + sentinel pages, and use *frame for something else */ + return reinterpret_cast(frame) == EXT_BUF_FRAME; + } +}; + +/* External buffer pool page. The first 3 members (6 for debug build) must be +the same as in buf_page_t. The "frame" member must always be equal to +EXT_BUF_FRAME, this is how we determine if some page is an external one.
*/ +struct ext_buf_page_t : public buf_page_base_t { +public: + /** Node of buf_pool_t::ext_free */ + UT_LIST_NODE_T(ext_buf_page_t) free_list; + /** Node of buf_pool_t::ext_LRU */ + UT_LIST_NODE_T(ext_buf_page_t) ext_LRU_list; +}; + +class buf_page_t : public buf_page_base_t +{ + friend buf_pool_t; + friend buf_block_t; + + /** @name General fields */ + /* @{ */ + /** log sequence number of the START of the log entry written of the oldest modification to this block which has not yet been written to the data file; @@ -520,23 +570,10 @@ class buf_page_t /** lock covering the contents of frame() */ block_lock lock; - /** pointer to aligned, uncompressed page frame of innodb_page_size */ - byte *frame; /* @} */ /** ROW_FORMAT=COMPRESSED page; zip.data (but not the data it points to) is also protected by buf_pool.mutex */ page_zip_des_t zip; -#ifdef UNIV_DEBUG - /** whether this->LRU is in buf_pool.LRU (in_file()); - protected by buf_pool.mutex */ - bool in_LRU_list; - /** whether this is in buf_pool.page_hash (in_file()); - protected by buf_pool.mutex */ - bool in_page_hash; - /** whether this->list is in buf_pool.free (state() == NOT_USED); - protected by buf_pool.flush_list_mutex */ - bool in_free_list; -#endif /* UNIV_DEBUG */ /** list member in one of the lists of buf_pool; protected by buf_pool.mutex or buf_pool.flush_list_mutex @@ -569,21 +606,17 @@ class buf_page_t Atomic_counter access_time; /*!< time of first access, or 0 if the block was never accessed in the buffer pool. 
*/ - buf_page_t() : id_{0} + buf_page_t() : buf_page_base_t() { static_assert(NOT_USED == 0, "compatibility"); memset((void*) this, 0, sizeof *this); } buf_page_t(const buf_page_t &b) : - id_(b.id_), hash(b.hash), + buf_page_base_t(b), oldest_modification_(b.oldest_modification_), - lock() /* not copied */, - frame(b.frame), zip(b.zip), -#ifdef UNIV_DEBUG - in_LRU_list(b.in_LRU_list), - in_page_hash(b.in_page_hash), in_free_list(b.in_free_list), -#endif /* UNIV_DEBUG */ + lock(), /* not copied */ + zip(b.zip), list(b.list), LRU(b.LRU), old(b.old), freed_page_clock(b.freed_page_clock), access_time(b.access_time) { @@ -724,16 +757,36 @@ class buf_page_t @param trx transaction (for updating trx->active_handler_stats) */ void read_wait(trx_t *trx) noexcept; + /** Space type for write complete */ + enum space_type + { + PERSISTENT, /* Persistent space */ + TEMPORARY, /* Temporary space */ + EXT_BUF /* External buffer pool space */ + }; + /** Release a write fix after a page write was completed. - @param persistent whether the page belongs to a persistent tablespace + @param type the type of space which the page was written to @param error whether an error may have occurred while writing @param state recently read state() value with the correct io-fix */ - void write_complete(bool persistent, bool error, uint32_t state) noexcept; + void write_complete(space_type type, bool error, + uint32_t state) noexcept; + + /** Set correct state and unlock the page on write completion. + @param state current page's state */ + void write_complete_release(uint32_t state) noexcept + { + zip.fix.fetch_sub((state >= WRITE_FIX_REINIT) + ? (WRITE_FIX_REINIT - UNFIXED) + : (WRITE_FIX - UNFIXED)); + lock.u_unlock(true); + } /** Write a flushable page to a file or free a freeable block. 
@param space tablespace + @param to_ext_buf whether to write the page to external buffer pool file @return whether a page write was initiated and buf_pool.mutex released */ - bool flush(fil_space_t *space) noexcept; + bool flush(fil_space_t *space, bool to_ext_buf= false) noexcept; /** Notify that a page in a temporary tablespace has been modified. */ void set_temp_modified() noexcept @@ -1051,6 +1104,19 @@ struct buf_pool_stat_t{ }; ulint n_pages_read; /*!< number read operations */ ulint n_pages_written;/*!< number write operations */ + /* Make external buffer pool counters to be atomic for debug build to + avoid race conditions during MTR test case execution */ +#if defined(UNIV_DEBUG) || !defined(DBUG_OFF) + /** Number of pages, read from external buffer pool file */ + Atomic_counter n_pages_read_from_ebp; + /** Number of pages, written to external buffer pool file */ + Atomic_counter n_pages_written_to_ebp; +#else + /** Number of pages, read from external buffer pool file */ + ulint n_pages_read_from_ebp; + /** Number of pages, written to external buffer pool file */ + ulint n_pages_written_to_ebp; +#endif ulint n_pages_created;/*!< number of pages created in the pool with no read */ ulint n_ra_pages_read_rnd;/*!< number of pages read in @@ -1108,6 +1174,11 @@ class buf_pool_t protected by mutex */ Atomic_relaxed size_in_bytes; + /** External buffer pool pages array*/ + ext_buf_page_t *ext_buf_pages_array; + /** External buffer pool pages free list, protected with buf_pool.mutex */ + UT_LIST_BASE_NODE_T(ext_buf_page_t) ext_free; + public: /** The requested innodb_buffer_pool_size */ size_t size_in_bytes_requested; @@ -1117,6 +1188,71 @@ class buf_pool_t #endif /** The maximum allowed innodb_buffer_pool_size */ size_t size_in_bytes_max; + /** Amount of pages in extended buffer pool file and the size of + ext_buf_pages_array */ + size_t extended_pages; +#ifdef UNIV_DEBUG + /** Shows if force LRU eviction to external buffer pool is currently on. + Debug only.
*/ + my_bool force_LRU_eviction_to_ebp; +#endif + /** Hash cell chain in page_hash_table */ + struct hash_chain + { + /** pointer to the first block */ + buf_page_t *first; + }; + + /** Allocates external buffer pool page. Tries to get a page for external + buffer pool free list. If the list is empty, tries to get page from the tail + of external buffer pool LRU list, if the corresponding page hash chain is not + locked, removes the page from the chain. + @param page_id page id which will be assigned to allocated page + @return allocated external buffer pool page or nullptr if free list is empty + and all page hash chains were locked */ + ext_buf_page_t *alloc_ext_page(page_id_t page_id) noexcept; + + /** Frees external buffer pool page. Pushes a page to the head of external + buffer pool free list. + @param p page to free. */ + void free_ext_page(ext_buf_page_t &p) noexcept + { + ut_ad(&p >= ext_buf_pages_array && + &p < ext_buf_pages_array + extended_pages); + mysql_mutex_assert_owner(&mutex); + UT_LIST_ADD_FIRST(ext_free, &p); + ut_d(p.in_free_list= true); + } + + /** Pushes external buffer pool page to the head of external buffer pool LRU + list. + @param ext_page page to push */ + void push_ext_page_to_LRU(ext_buf_page_t &ext_page) noexcept { + ut_ad(&ext_page >= ext_buf_pages_array && + &ext_page < ext_buf_pages_array + extended_pages); + mysql_mutex_assert_owner(&mutex); + UT_LIST_ADD_FIRST(ext_LRU, &ext_page); + ut_d(ext_page.in_LRU_list= true); + } + + /** Removes external buffer pool page from external buffer pool LRU list. + @param ext_page page to remove */ + void remove_ext_page_from_LRU(ext_buf_page_t &ext_page) noexcept { + ut_ad(&ext_page >= ext_buf_pages_array && + &ext_page < ext_buf_pages_array + extended_pages); + mysql_mutex_assert_owner(&mutex); + UT_LIST_REMOVE(ext_LRU, &ext_page); + ut_d(ext_page.in_LRU_list= false); + } + + /** Calculates external buffer pool page offset in external buffer pool file. 
+ @param ext_page page for which offset is calculated + @return offset in external buffer pool file */ + os_offset_t ext_page_offset(const ext_buf_page_t &ext_page) const noexcept { + ut_ad(&ext_page >= ext_buf_pages_array && + &ext_page < ext_buf_pages_array + extended_pages); + return (&ext_page - ext_buf_pages_array) << srv_page_size_shift; + } /** @return the current size of the buffer pool, in bytes */ size_t curr_pool_size() const noexcept { return size_in_bytes; } @@ -1144,12 +1280,6 @@ class buf_pool_t static int madvise_do_dump() noexcept; #endif - /** Hash cell chain in page_hash_table */ - struct hash_chain - { - /** pointer to the first block */ - buf_page_t *first; - }; private: /** Determine the number of blocks in a buffer pool of a particular size. @param size_in_bytes innodb_buffer_pool_size in bytes @@ -1205,10 +1335,11 @@ class buf_pool_t ATTRIBUTE_COLD bool withdraw(buf_page_t &bpage) noexcept; /** Release and evict a corrupted page. - @param bpage x-latched page that was found corrupted - @param state expected current state of the page */ - ATTRIBUTE_COLD void corrupted_evict(buf_page_t *bpage, uint32_t state) - noexcept; + @param bpage x-latched page that was found corrupted + @param state expected current state of the page + @param set_corrupt_id true to call bpage->set_corrupt_id() */ + ATTRIBUTE_COLD void corrupted_evict(buf_page_t *bpage, uint32_t state, + bool set_corrupt_id= true) noexcept; /** Release a memory block to the buffer pool. */ ATTRIBUTE_COLD void free_block(buf_block_t *block) noexcept; @@ -1452,14 +1583,17 @@ class buf_pool_t void append(hash_chain &chain, buf_page_t *bpage) noexcept; /** Remove a block descriptor from a hash bucket chain. */ - inline void remove(hash_chain &chain, buf_page_t *bpage) noexcept; + void remove(hash_chain &chain, buf_page_t *bpage) noexcept; /** Replace a block descriptor with another.
*/ - inline void replace(hash_chain &chain, buf_page_t *old, buf_page_t *bpage) + void replace(hash_chain &chain, buf_page_t *old, buf_page_t *bpage) noexcept; - /** Look up a page in a hash bucket chain. */ - inline buf_page_t *get(const page_id_t id, const hash_chain &chain) const - noexcept; + /** Look up a page in a hash bucket chain. + @tparam show_ext_pages false if external buffer pool pages must be ignored, + true otherwise */ + template + inline buf_page_t *get(const page_id_t id, + const hash_chain &chain) const noexcept; }; /** Buffer pool mutex */ @@ -1528,6 +1662,11 @@ class buf_pool_t /** broadcast when a batch completes; protected by flush_list_mutex */ pthread_cond_t done_flush_list; + /** The number of threads waiting for done_flush_list, must be set before + page cleaner wake up and reset after done_flush_list waiting is finished, + protected with flush_list_mutex */ + size_t done_flush_list_waiters_count; + /** @return number of pending LRU flush */ unsigned n_flush() const noexcept { @@ -1658,7 +1797,8 @@ class buf_pool_t UT_LIST_BASE_NODE_T(buf_block_t) unzip_LRU; /*!< base node of the unzip_LRU list */ - + /** base node of external LRU list */ + UT_LIST_BASE_NODE_T(ext_buf_page_t) ext_LRU; /* @} */ /** free ROW_FORMAT=COMPRESSED page frames */ UT_LIST_BASE_NODE_T(buf_buddy_free_t) zip_free[BUF_BUDDY_SIZES_MAX]; @@ -1731,6 +1871,7 @@ class buf_pool_t /** The InnoDB buffer pool */ extern buf_pool_t buf_pool; +template inline buf_page_t *buf_pool_t::page_hash_table::get(const page_id_t id, const hash_chain &chain) const noexcept @@ -1742,9 +1883,11 @@ inline buf_page_t *buf_pool_t::page_hash_table::get(const page_id_t id, for (buf_page_t *bpage= chain.first; bpage; bpage= bpage->hash) { ut_ad(bpage->in_page_hash); - ut_ad(bpage->in_file()); - if (bpage->id() == id) + ut_ad(bpage->external() || bpage->in_file()); + if (bpage->id() == id && (show_ext_pages || !bpage->external())) return bpage; + /* There can be sentinel pages, don't break the loop 
if external page + was found and ignored. */ } return nullptr; } diff --git a/storage/innobase/include/buf0lru.h b/storage/innobase/include/buf0lru.h index a45fc72665289..6c9d0e55ab4bf 100644 --- a/storage/innobase/include/buf0lru.h +++ b/storage/innobase/include/buf0lru.h @@ -47,8 +47,10 @@ The caller must hold buf_pool.mutex. @param zip whether to remove both copies of a ROW_FORMAT=COMPRESSED page @retval true if freed and buf_pool.mutex may have been temporarily released @retval false if the page was not freed */ -bool buf_LRU_free_page(buf_page_t *bpage, bool zip) - MY_ATTRIBUTE((nonnull)); +bool buf_LRU_free_page(buf_page_t *bpage/* TODO: use reference instead of + pointer */, + bool zip, + ext_buf_page_t *ext_buf_page= nullptr); /** Try to free a replaceable block. @param limit maximum number of blocks to scan diff --git a/storage/innobase/include/buf0types.h b/storage/innobase/include/buf0types.h index cf5ab38df5f9f..1881b2b93c5ec 100644 --- a/storage/innobase/include/buf0types.h +++ b/storage/innobase/include/buf0types.h @@ -29,6 +29,7 @@ Created 11/17/1995 Heikki Tuuri /** Buffer page (uncompressed or compressed) */ class buf_page_t; +struct ext_buf_page_t; /** Buffer block for which an uncompressed page exists */ struct buf_block_t; /** Buffer pool statistics struct */ diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h index 73bbc79f665c3..322bb8a3926c4 100644 --- a/storage/innobase/include/fil0fil.h +++ b/storage/innobase/include/fil0fil.h @@ -1412,9 +1412,13 @@ struct fil_system_t fil_system.mutex. 
*/ fil_space_t *space_list_last_opened= nullptr; + /** External buffer pool file handler */ + pfs_os_file_t ext_bp_file; + #ifdef __linux__ /** available block devices that reside on non-rotational storage */ std::vector ssd; + public: /** @return whether a file system device is on non-rotational storage */ bool is_ssd(dev_t dev) const noexcept @@ -1440,6 +1444,15 @@ struct fil_system_t mysql_mutex_t mutex; fil_space_t* sys_space; /*!< The innodb_system tablespace */ fil_space_t* temp_space; /*!< The innodb_temporary tablespace */ +public: + + /** Extended buffer pool file path */ + char *ext_bp_path; + + /** Extended buffer pool file size, equals to 0 if extended buffer pool is + not used. */ + size_t ext_bp_size; + /** Map of fil_space_t::id to fil_space_t* */ hash_table_t spaces; @@ -1497,6 +1510,33 @@ struct fil_system_t potential space_id reuse */ bool space_id_reuse_warned; + /** Create external buffer pool file. + @return whether the creation failed */ + bool create_ext_file() noexcept; + + /** External buffer pool os_aio() wrapper. + @param bpage buffer pool page for read/write + @param ext_buf_page external buffer pool page which will be freed on read + completion and replace bpage in buffer pool on write + completion + @param io_request_type IORequest::WRITE_ASYNC, IORequest::READ_SYNC or + IORequest::READ_ASYNC + @param slot memory to be used for encrypted or page_compressed + pages + @param len length to read/write + @param buf buffer + @retval DB_SUCCESS if request was queued successfully + @retval DB_IO_ERROR on I/O error */ + dberr_t ext_bp_io(buf_page_t &bpage, ext_buf_page_t &ext_buf_page, + IORequest::Type io_request_type, buf_tmp_buffer_t *slot, + size_t len, void *buf) noexcept; + + /** Returns if external buffer pool is enabled.
*/ + bool ext_buf_pool_enabled() const { return ext_bp_size; } + + /** Disable external buffer pool */ + void ext_buf_pool_disable() { ext_bp_size= 0; } + /** Add the file to the end of opened spaces list in fil_system.space_list, so that fil_space_t::try_to_close() should close it as a last resort. @@ -1833,4 +1873,13 @@ ulint fil_space_get_block_size(const fil_space_t* space, unsigned offset) bool fil_crypt_check(fil_space_crypt_t *crypt_data, const char *f_name) noexcept; +/** Create temporary files in the given parameter path, and if +UNIV_PFS_IO defined, register the file descriptor with Performance Schema. +@param path location for creating temporary merge files, or NULL +@param label label for registration in Performance Schema if path == nullptr +@param prefix temporary file name prefix +@param async_io true if the file is going to be used with asynchronous IO +@return File descriptor */ +pfs_os_file_t pfs_create_temp_file(const char *path, const char *label, + const char *prefix, bool async_io); #endif /* UNIV_INNOCHECKSUM */ diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h index 5e3b208d85f6d..091dec5c2ed78 100644 --- a/storage/innobase/include/os0file.h +++ b/storage/innobase/include/os0file.h @@ -64,6 +64,7 @@ extern bool os_has_said_disk_full; typedef ib_uint64_t os_offset_t; class buf_tmp_buffer_t; +struct ext_buf_page_t; #ifdef _WIN32 @@ -206,13 +207,28 @@ class IORequest PUNCH_RANGE= WRITE_SYNC | 32, }; + /* This ctor is used inside of fil_space_t::io(...) */ constexpr IORequest(buf_page_t *bpage, buf_tmp_buffer_t *slot, - fil_node_t *node, Type type) : - bpage(bpage), slot(slot), node(node), type(type) {} + fil_node_t *node, Type type) + : node_ptr{node}, bpage_ptr(bpage), slot(slot), type(type) + { + } + /* This ctor is used by the callers of fil_space_t::io(...)
*/ constexpr IORequest(Type type= READ_SYNC, buf_page_t *bpage= nullptr, - buf_tmp_buffer_t *slot= nullptr) : - bpage(bpage), slot(slot), type(type) {} + buf_tmp_buffer_t *slot= nullptr) + : bpage_ptr(bpage), slot(slot), type(type) + { + } + + IORequest(buf_page_t *bpage, buf_tmp_buffer_t *slot, + ext_buf_page_t *ext_buf_page, Type type) + : ext_buf_page_ptr(ext_buf_page), + bpage_ptr(reinterpret_cast( + reinterpret_cast(bpage) | 1)), + slot(slot), type(type) + { + } bool is_read() const noexcept { return (type & READ_SYNC) != 0; } bool is_write() const noexcept { return (type & WRITE_SYNC) != 0; } @@ -224,7 +240,7 @@ class IORequest IORequest doublewritten() const noexcept { ut_ad(type == WRITE_ASYNC || type == PUNCH); - return IORequest{bpage, slot, node, Type(type | 4)}; + return IORequest{bpage(), slot, node(), Type(type | 4)}; } void write_complete(int io_error) const noexcept; @@ -237,9 +253,9 @@ class IORequest @return DB_SUCCESS or error code */ dberr_t maybe_punch_hole(os_offset_t off, ulint len) noexcept { - return off && len && node && (type & (PUNCH ^ WRITE_ASYNC)) - ? punch_hole(off, len) - : DB_SUCCESS; + return off && len && (type & (PUNCH ^ WRITE_ASYNC)) && node() + ? 
punch_hole(off, len) + : DB_SUCCESS; } private: @@ -249,18 +265,48 @@ class IORequest @return DB_SUCCESS or error code */ dberr_t punch_hole(os_offset_t off, ulint len) const noexcept; -public: + union + { + /** File descriptor */ + fil_node_t *const node_ptr= nullptr; + /** External buffer pool page if the request is for external buffer pool + file, nullptr otherwise */ + ext_buf_page_t *const ext_buf_page_ptr; + }; + /** Page to be written on write operation */ - buf_page_t *const bpage= nullptr; + buf_page_t *const bpage_ptr= nullptr; + +public: /** Memory to be used for encrypted or page_compressed pages */ buf_tmp_buffer_t *const slot= nullptr; - /** File descriptor */ - fil_node_t *const node= nullptr; + buf_page_t *bpage() const + { + return reinterpret_cast( + reinterpret_cast(bpage_ptr) & ~ptrdiff_t(1)); + }; + + bool ext_buf() const + { + return reinterpret_cast(bpage_ptr) & 1; + } + + fil_node_t *node() const + { + ut_ad(!ext_buf()); + return node_ptr; + } + + ext_buf_page_t *ext_buf_page() const { + ut_ad(ext_buf()); + return ext_buf_page_ptr; + }; /** Request type bit flags */ const Type type; + }; constexpr IORequest IORequestRead(IORequest::READ_SYNC); @@ -999,6 +1045,17 @@ void os_aio_free() noexcept; @param offset additional context */ void os_fake_read(const IORequest &type, os_offset_t offset) noexcept; +/** Request a read or write. +@param type I/O request +@param buf buffer +@param offset file offset +@param n number of bytes +@retval DB_SUCCESS if request was queued successfully +@retval DB_IO_ERROR on I/O error */ +dberr_t os_aio(const IORequest &type, void *buf, os_offset_t offset, + size_t n, pfs_os_file_t handle, + const char *file_name) noexcept; + /** Request a read or write. 
@param type I/O request @param buf buffer diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc index 6198cb04f9931..881c1f3fd241a 100644 --- a/storage/innobase/log/log0recv.cc +++ b/storage/innobase/log/log0recv.cc @@ -3584,10 +3584,10 @@ bool recv_recover_page(fil_space_t* space, buf_page_t* bpage) void IORequest::fake_read_complete(os_offset_t offset) const noexcept { - ut_ad(node); + ut_ad(node()); ut_ad(is_read()); - ut_ad(bpage); - ut_ad(bpage->frame); + ut_ad(bpage_ptr); + ut_ad(bpage_ptr->frame); ut_ad(recv_recovery_is_on()); ut_ad(offset); @@ -3595,30 +3595,30 @@ void IORequest::fake_read_complete(os_offset_t offset) const noexcept mtr.start(); mtr.set_log_mode(MTR_LOG_NO_REDO); - ut_ad(bpage->frame); + ut_ad(bpage_ptr->frame); /* Move the ownership of the x-latch on the page to this OS thread, so that we can acquire a second x-latch on it. This is needed for the operations to the page to pass the debug checks. */ - bpage->lock.claim_ownership(); - bpage->lock.x_lock_recursive(); - bpage->fix_on_recovery(); - mtr.memo_push(reinterpret_cast(bpage), MTR_MEMO_PAGE_X_FIX); + bpage_ptr->lock.claim_ownership(); + bpage_ptr->lock.x_lock_recursive(); + bpage_ptr->fix_on_recovery(); + mtr.memo_push(reinterpret_cast(bpage_ptr), MTR_MEMO_PAGE_X_FIX); page_recv_t &recs= *reinterpret_cast(slot); ut_ad(recs.being_processed == 1); const lsn_t init_lsn{offset}; ut_ad(init_lsn > 1); - if (recv_recover_page(reinterpret_cast(bpage), - mtr, recs, node->space, init_lsn)) + if (recv_recover_page(reinterpret_cast(bpage_ptr), + mtr, recs, node()->space, init_lsn)) { - ut_ad(bpage->oldest_modification() || bpage->is_freed()); - bpage->lock.x_unlock(true); + ut_ad(bpage_ptr->oldest_modification() || bpage_ptr->is_freed()); + bpage_ptr->lock.x_unlock(true); } recs.being_processed= -1; ut_ad(mtr.has_committed()); - node->space->release(); + node()->space->release(); } /** @return whether a page has been freed */ @@ -3982,9 +3982,12 @@ static void 
log_sort_flush_list() noexcept { os_aio_wait_until_no_pending_writes(false); mysql_mutex_lock(&buf_pool.flush_list_mutex); - if (buf_pool.page_cleaner_active()) + if (buf_pool.page_cleaner_active()) { + ++buf_pool.done_flush_list_waiters_count; my_cond_wait(&buf_pool.done_flush_list, &buf_pool.flush_list_mutex.m_mutex); + --buf_pool.done_flush_list_waiters_count; + } else if (!os_aio_pending_writes()) break; mysql_mutex_unlock(&buf_pool.flush_list_mutex); diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc index f3c04027077b3..4c6764edcaa33 100644 --- a/storage/innobase/os/os0file.cc +++ b/storage/innobase/os/os0file.cc @@ -2566,8 +2566,8 @@ os_file_io( " Retrying.", n, type.is_read() ? "read" : "written", offset, - type.node - ? type.node->name + type.node() + ? type.node()->name : "(unknown file)", bytes_returned); } @@ -2719,7 +2719,7 @@ os_file_read_func( ulint n, ulint* o) noexcept { - ut_ad(!type.node || type.node->handle == file); + ut_ad(type.ext_buf() || !type.node() || type.node()->handle == file); ut_ad(n); os_bytes_read_since_printout+= n; @@ -2733,11 +2733,11 @@ os_file_read_func( if (ulint(n_bytes) == n || err != DB_SUCCESS) return err; - os_file_handle_error_no_exit(type.node ? type.node->name : nullptr, "read", - false); + os_file_handle_error_no_exit(type.node() ? type.node()->name : nullptr, + "read", false); sql_print_error("InnoDB: Tried to read %zu bytes at offset %" PRIu64 " of file %s, but was only able to read %zd", - n, offset, type.node ? type.node->name : "(unknown)", + n, offset, type.node() ? type.node()->name : "(unknown)", n_bytes); return err ? err : DB_IO_ERROR; @@ -2930,7 +2930,8 @@ os_file_punch_hole( @return DB_SUCCESS or error code */ dberr_t IORequest::punch_hole(os_offset_t off, ulint len) const noexcept { - ulint trim_len = bpage ? bpage->physical_size() - len : 0; + ut_ad(!ext_buf()); + ulint trim_len = bpage_ptr ? 
bpage_ptr->physical_size() - len : 0; if (trim_len == 0) { return(DB_SUCCESS); @@ -2940,18 +2941,18 @@ dberr_t IORequest::punch_hole(os_offset_t off, ulint len) const noexcept /* Check does file system support punching holes for this tablespace. */ - if (!node->punch_hole) { + if (!node()->punch_hole) { return DB_IO_NO_PUNCH_HOLE; } - dberr_t err = os_file_punch_hole(node->handle, off, trim_len); + dberr_t err = os_file_punch_hole(node()->handle, off, trim_len); switch (err) { case DB_SUCCESS: srv_stats.page_compressed_trim_op.inc(); return err; case DB_IO_NO_PUNCH_HOLE: - node->punch_hole = false; + node()->punch_hole = false; err = DB_SUCCESS; /* fall through */ default: @@ -3060,7 +3061,7 @@ static void write_io_callback(void *c) ib::info () << "IO Error: " << cb->m_err << " during write of " << cb->m_len << " bytes, for file " - << request.node->name << "(" << cb->m_fh << "), returned " + << request.node()->name << "(" << cb->m_fh << "), returned " << cb->m_ret_len; request.write_complete(cb->m_err); @@ -3247,7 +3248,7 @@ void os_fake_read(const IORequest &type, os_offset_t offset) noexcept tpool::aiocb *cb= read_slots->acquire(); cb->m_group= read_slots->get_task_group(); - cb->m_fh= type.node->handle.m_file; + cb->m_fh= type.node()->handle.m_file; cb->m_buffer= nullptr; cb->m_len= 0; cb->m_offset= offset; @@ -3268,16 +3269,15 @@ void os_fake_read(const IORequest &type, os_offset_t offset) noexcept @param n number of bytes @retval DB_SUCCESS if request was queued successfully @retval DB_IO_ERROR on I/O error */ -dberr_t os_aio(const IORequest &type, void *buf, os_offset_t offset, size_t n) - noexcept +dberr_t os_aio(const IORequest &type, void *buf, os_offset_t offset, + size_t n, pfs_os_file_t handle, + const char *file_name) noexcept { ut_ad(n > 0); ut_ad(!(n & 511)); /* payload of page_compressed tables */ ut_ad((offset % UNIV_ZIP_SIZE_MIN) == 0); ut_ad((reinterpret_cast(buf) % UNIV_ZIP_SIZE_MIN) == 0); ut_ad(type.is_read() || type.is_write()); - 
ut_ad(type.node); - ut_ad(type.node->is_open()); #ifdef WIN_ASYNC_IO ut_ad((n & 0xFFFFFFFFUL) == n); @@ -3286,7 +3286,7 @@ dberr_t os_aio(const IORequest &type, void *buf, os_offset_t offset, size_t n) #ifdef UNIV_PFS_IO PSI_file_locker_state state; PSI_file_locker* locker= nullptr; - register_pfs_file_io_begin(&state, locker, type.node->handle, n, + register_pfs_file_io_begin(&state, locker, handle, n, type.is_write() ? PSI_FILE_WRITE : PSI_FILE_READ, __FILE__, __LINE__); @@ -3295,10 +3295,10 @@ dberr_t os_aio(const IORequest &type, void *buf, os_offset_t offset, size_t n) if (!type.is_async()) { err = type.is_read() - ? os_file_read_func(type, type.node->handle, + ? os_file_read_func(type, handle, buf, offset, n, nullptr) - : os_file_write_func(type, type.node->name, - type.node->handle, + : os_file_write_func(type, file_name, + handle, buf, offset, n); func_exit: #ifdef UNIV_PFS_IO @@ -3329,7 +3329,7 @@ dberr_t os_aio(const IORequest &type, void *buf, os_offset_t offset, size_t n) cb->m_buffer = buf; cb->m_callback = callback; cb->m_group = slots->get_task_group(); - cb->m_fh = type.node->handle.m_file; + cb->m_fh = handle.m_file; cb->m_len = (int)n; cb->m_offset = offset; cb->m_opcode = opcode; @@ -3337,16 +3337,34 @@ dberr_t os_aio(const IORequest &type, void *buf, os_offset_t offset, size_t n) if (srv_thread_pool->submit_io(cb)) { slots->release(cb); - os_file_handle_error_no_exit(type.node->name, type.is_read() + os_file_handle_error_no_exit(file_name, type.is_read() ? "aio read" : "aio write", false); err = DB_IO_ERROR; - type.node->space->release(); } goto func_exit; } +/** Request a read or write. 
+@param type I/O request +@param buf buffer +@param offset file offset +@param n number of bytes +@retval DB_SUCCESS if request was queued successfully +@retval DB_IO_ERROR on I/O error */ +dberr_t os_aio(const IORequest &type, void *buf, os_offset_t offset, + size_t n) noexcept +{ + ut_ad(type.node()); + ut_ad(type.node()->is_open()); + dberr_t err= + os_aio(type, buf, offset, n, type.node()->handle, type.node()->name); + if (err == DB_IO_ERROR) + type.node()->space->release(); + return err; +} + void os_aio_print(FILE *file) noexcept { time_t current_time; diff --git a/storage/innobase/row/row0merge.cc b/storage/innobase/row/row0merge.cc index efb89cb069f3b..2c4f7b7c7dc45 100644 --- a/storage/innobase/row/row0merge.cc +++ b/storage/innobase/row/row0merge.cc @@ -61,6 +61,9 @@ Completed by Sunny Bains and Marko Makela /* Whether to disable file system cache */ char srv_disable_sort_file_cache; +static const char *merge_temp_file_label= "/Innodb Merge Temp File"; +static const char *merge_temp_file_prefix= "ib"; + /** Class that caches spatial index row tuples made from a single cluster index page scan, and then insert into corresponding index tree */ class spatial_index_info { @@ -4339,57 +4342,18 @@ void row_merge_drop_temp_indexes() } -/** Create temporary merge files in the given paramater path, and if -UNIV_PFS_IO defined, register the file descriptor with Performance Schema. 
-@param[in] path location for creating temporary merge files, or NULL -@return File descriptor */ -static pfs_os_file_t row_merge_file_create_mode(const char *path, int mode) -{ - if (!path) { - path = mysql_tmpdir; - } -#ifdef UNIV_PFS_IO - /* This temp file open does not go through normal - file APIs, add instrumentation to register with - performance schema */ - struct PSI_file_locker* locker; - PSI_file_locker_state state; - static const char label[] = "/Innodb Merge Temp File"; - char* name = static_cast( - ut_malloc_nokey(strlen(path) + sizeof label)); - strcpy(name, path); - strcat(name, label); - - register_pfs_file_open_begin( - &state, locker, innodb_temp_file_key, - PSI_FILE_CREATE, path ? name : label, __FILE__, __LINE__); - -#endif - DBUG_ASSERT(strlen(path) + 2 <= FN_REFLEN); - char filename[FN_REFLEN]; - File f = create_temp_file(filename, path, "ib", - O_BINARY | O_SEQUENTIAL, - MYF(MY_WME | MY_TEMPORARY)); - pfs_os_file_t fd = IF_WIN((os_file_t)my_get_osfhandle(f), f); - -#ifdef UNIV_PFS_IO - register_pfs_file_open_end(locker, fd, - (fd == OS_FILE_CLOSED)?NULL:&fd); - ut_free(name); -#endif - - if (fd == OS_FILE_CLOSED) { - ib::error() << "Cannot create temporary merge file"; - } - return(fd); -} - /** Create a temporary file at the specified path. @param path location for creating temporary merge files, or nullptr @return File descriptor */ pfs_os_file_t row_merge_file_create_low(const char *path) { - return row_merge_file_create_mode(path, O_BINARY | O_SEQUENTIAL); + auto fd= pfs_create_temp_file(path, merge_temp_file_label, + merge_temp_file_prefix, false); + if (fd == OS_FILE_CLOSED) + { + ib::error() << "Cannot create temporary merge file"; + } + return fd; } /** Create a merge file in the given location. @@ -4404,13 +4368,13 @@ row_merge_file_create( merge_file->offset = 0; merge_file->n_rec = 0; merge_file->fd = - row_merge_file_create_mode(path, -#if !defined _WIN32 && defined O_DIRECT - srv_disable_sort_file_cache - ? 
O_DIRECT | O_BINARY | O_SEQUENTIAL - : -#endif - O_BINARY | O_SEQUENTIAL); + pfs_create_temp_file(path, + merge_temp_file_label, + merge_temp_file_prefix, false); + if (merge_file->fd == OS_FILE_CLOSED) + { + ib::error() << "Cannot create temporary merge file"; + } return(merge_file->fd); } diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc index 24b41f7602fdf..8c81e1a558da4 100644 --- a/storage/innobase/srv/srv0start.cc +++ b/storage/innobase/srv/srv0start.cc @@ -1426,10 +1426,18 @@ dberr_t srv_start(bool create_new_db) fil_system.create(srv_file_per_table ? 50000 : 5000); + if (!fil_system.is_initialised()) { + return srv_init_abort(DB_ERROR); + } + if (buf_pool.create()) { return(srv_init_abort(DB_ERROR)); } + if (srv_operation == SRV_OPERATION_NORMAL + && fil_system.ext_bp_size && !fil_system.create_ext_file()) + return(srv_init_abort(DB_ERROR)); + log_sys.create(); recv_sys.create(); lock_sys.create(srv_lock_table_size = 5 * buf_pool.curr_size()); @@ -2088,9 +2096,11 @@ void innodb_shutdown() mysql_mutex_lock(&buf_pool.flush_list_mutex); srv_shutdown_state = SRV_SHUTDOWN_CLEANUP; while (buf_page_cleaner_is_active) { + ++buf_pool.done_flush_list_waiters_count; pthread_cond_signal(&buf_pool.do_flush_list); my_cond_wait(&buf_pool.done_flush_list, &buf_pool.flush_list_mutex.m_mutex); + --buf_pool.done_flush_list_waiters_count; } mysql_mutex_unlock(&buf_pool.flush_list_mutex); break; diff --git a/tpool/aio_liburing.cc b/tpool/aio_liburing.cc index c3176adcf8cf4..1fba7874006c4 100644 --- a/tpool/aio_liburing.cc +++ b/tpool/aio_liburing.cc @@ -138,8 +138,9 @@ class aio_uring final : public aio auto it= std::lower_bound(files_.begin(), files_.end(), fd); assert(it == files_.end() || *it != fd); files_.insert(it, fd); - return io_uring_register_files_update(&uring_, 0, files_.data(), - files_.size()); + int err= io_uring_register_files_update(&uring_, 0, files_.data(), + files_.size()); + return err < 0 ? 
err : 0; } int unbind(const native_file_handle &fd) final diff --git a/tpool/tpool.h b/tpool/tpool.h index f2cf96b268922..3905f8bb7c7e8 100644 --- a/tpool/tpool.h +++ b/tpool/tpool.h @@ -197,9 +197,11 @@ class aio On completion, cb->m_callback is executed. */ virtual int submit_io(aiocb *cb)= 0; - /** "Bind" file to AIO handler (used on Windows only) */ + /** "Bind" file to AIO handler. Used at least with Windows and liburing. + @param fd file handle + @return 0 on success and error code on error */ virtual int bind(native_file_handle &fd)= 0; - /** "Unind" file to AIO handler (used on Windows only) */ + /** "Unbind" file to AIO handler. Used at least with Windows and liburing. */ virtual int unbind(const native_file_handle &fd)= 0; virtual const char *get_implementation() const=0; virtual ~aio(){};