Skip to content

Commit 14d9323

Browse files
committed
MDEV-37994 Race condition between checkpoint and .ibd file creation
It was possible that a log checkpoint was completed and the server killed between the time that fil_ibd_create() durably wrote a FILE_CREATE record, and the initialization of the tablespace. This could lead to a failure to start up after the server was killed during TRUNCATE TABLE or any table-rebuilding operation such as OPTIMIZE TABLE. In the case of TRUNCATE TABLE, an attempt to rename a file #sql-ibNNN.ibd (the contents of the table before truncation) to tablename.ibd would fail, because both files existed and the file tablename.ibd would have been filled with NUL bytes. It was possible to resume from this error by deleting the file tablename.ibd and restarting the server. We will prevent this class of errors by ensuring that both the FILE_CREATE record and the records written by fsp_header_init() will be part of the same atomic transaction, which must be durably written before any file is created or allocated. NOTE: There is another possible crash recovery problem, which we are not attempting to solve here and which will be covered by the subsequent change (MDEV-38026). If fil_ibd_create() fails to create the file and the server were killed, recovery would not even attempt to create the file at all. fil_space_t::create(): Remove the DBUG_EXECUTE_IF fault injection that was the only cause of return nullptr. This allows us to simplify several callers. fil_space_t::set_stopped(), fil_space_t::clear_stopped(): Accessor functions for fil_ibd_create() for preventing any concurrent access to an incompletely created tablespace. fil_ibd_create(): In a single atomic mini-transaction, write the FILE_CREATE record as well as the log for initializing the tablespace. After durably writing the log, create the file in the file system Only after the file has been successfully created and allocated, open the tablespace for business. Finally, release the exclusive page latches so that the header pages may be written to the file. fil_ibd_open(): Move some fault injection from fil_space_t::create() to a higher level, to the place where an existing file is being opened. Reviewed by: Thirunarayanan Balathandayuthapani Tested by: Saahil Alam
1 parent 9fa25e8 commit 14d9323

File tree

7 files changed

+120
-106
lines changed

7 files changed

+120
-106
lines changed

extra/mariabackup/xtrabackup.cc

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5461,16 +5461,12 @@ xb_delta_open_matching_space(
54615461
ut_ad(fil_space_t::physical_size(flags) == info.page_size);
54625462

54635463
mysql_mutex_lock(&fil_system.mutex);
5464-
fil_space_t* space = fil_space_t::create(uint32_t(info.space_id),
5465-
flags, false, 0,
5466-
FIL_ENCRYPTION_DEFAULT, true);
5464+
std::ignore = fil_space_t::create(uint32_t(info.space_id),
5465+
flags, false, 0,
5466+
FIL_ENCRYPTION_DEFAULT, true);
54675467
mysql_mutex_unlock(&fil_system.mutex);
5468-
if (space) {
5469-
*success = xb_space_create_file(real_name, info.space_id,
5470-
flags, &file);
5471-
} else {
5472-
msg("Can't create tablespace %s\n", dest_space_name);
5473-
}
5468+
*success = xb_space_create_file(real_name, info.space_id,
5469+
flags, &file);
54745470

54755471
goto exit;
54765472
}

mysql-test/suite/innodb_fts/r/innodb-fts-ddl.result

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,7 @@ call mtr.add_suppression("InnoDB: Operating system error number .* in a file ope
272272
call mtr.add_suppression("InnoDB: Error number .* means");
273273
call mtr.add_suppression("InnoDB: Cannot create file");
274274
call mtr.add_suppression("InnoDB: Failed to create");
275+
call mtr.add_suppression("InnoDB: The file '.*test/FTS_[0-9a-f]*_BEING_DELETED\\.ibd' already exists");
275276
CREATE TABLE t1(a TEXT, FTS_DOC_ID BIGINT UNSIGNED NOT NULL UNIQUE) ENGINE=InnoDB;
276277
ALTER TABLE t1 ADD FULLTEXT(a), ALGORITHM=INPLACE;
277278
ERROR HY000: Got error 11 "Resource temporarily unavailable" from storage engine InnoDB

mysql-test/suite/innodb_fts/t/innodb-fts-ddl.test

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -365,6 +365,7 @@ call mtr.add_suppression("InnoDB: Operating system error number .* in a file ope
365365
call mtr.add_suppression("InnoDB: Error number .* means");
366366
call mtr.add_suppression("InnoDB: Cannot create file");
367367
call mtr.add_suppression("InnoDB: Failed to create");
368+
call mtr.add_suppression("InnoDB: The file '.*test/FTS_[0-9a-f]*_BEING_DELETED\\.ibd' already exists");
368369

369370
let MYSQLD_DATADIR=`select @@datadir`;
370371
CREATE TABLE t1(a TEXT, FTS_DOC_ID BIGINT UNSIGNED NOT NULL UNIQUE) ENGINE=InnoDB;

storage/innobase/fil/fil0fil.cc

Lines changed: 93 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -930,8 +930,6 @@ fil_space_t *fil_space_t::create(uint32_t id, ulint flags,
930930
ut_ad(fil_space_t::is_valid_flags(flags & ~FSP_FLAGS_MEM_MASK, id));
931931
ut_ad(srv_page_size == UNIV_PAGE_SIZE_ORIG || flags != 0);
932932

933-
DBUG_EXECUTE_IF("fil_space_create_failure", return nullptr;);
934-
935933
fil_space_t** after= fil_system.spaces.cell_get(id)->search
936934
(&fil_space_t::hash, [id](const fil_space_t *space)
937935
{ return !space || space->id == id; });
@@ -1840,16 +1838,12 @@ fil_ibd_create(
18401838
uint32_t key_id,
18411839
dberr_t* err) noexcept
18421840
{
1843-
pfs_os_file_t file;
1844-
bool success;
1845-
mtr_t mtr;
1846-
bool has_data_dir = FSP_FLAGS_HAS_DATA_DIR(flags) != 0;
1847-
18481841
ut_ad(!is_system_tablespace(space_id));
18491842
ut_ad(!srv_read_only_mode);
18501843
ut_a(space_id < SRV_SPACE_ID_UPPER_BOUND);
18511844
ut_a(size >= FIL_IBD_FILE_INITIAL_SIZE);
1852-
ut_a(fil_space_t::is_valid_flags(flags & ~FSP_FLAGS_MEM_MASK, space_id));
1845+
ut_a(fil_space_t::is_valid_flags(flags & ~FSP_FLAGS_MEM_MASK,
1846+
space_id));
18531847

18541848
/* Create the subdirectories in the path, if they are
18551849
not there already. */
@@ -1858,11 +1852,6 @@ fil_ibd_create(
18581852
return NULL;
18591853
}
18601854

1861-
mtr.start();
1862-
mtr.log_file_op(FILE_CREATE, space_id, path);
1863-
mtr.commit();
1864-
log_write_up_to(mtr.commit_lsn(), true);
1865-
18661855
static_assert(((UNIV_ZIP_SIZE_MIN >> 1) << 3) == 4096,
18671856
"compatibility");
18681857
#if defined _WIN32 || defined HAVE_FCNTL_DIRECT
@@ -1878,35 +1867,95 @@ fil_ibd_create(
18781867
#else
18791868
constexpr auto type = OS_DATA_FILE;
18801869
#endif
1870+
const bool is_compressed = fil_space_t::is_compressed(flags);
18811871

1882-
file = os_file_create(
1872+
if (fil_space_t::full_crc32(flags)) {
1873+
flags |= FSP_FLAGS_FCRC32_PAGE_SSIZE();
1874+
} else {
1875+
flags |= FSP_FLAGS_PAGE_SSIZE();
1876+
}
1877+
1878+
/* Create crypt data if the tablespace is either encrypted or user has
1879+
requested it to remain unencrypted. */
1880+
fil_space_crypt_t* crypt_data = (mode != FIL_ENCRYPTION_DEFAULT
1881+
|| srv_encrypt_tables)
1882+
? fil_space_create_crypt_data(mode, key_id)
1883+
: nullptr;
1884+
1885+
mysql_mutex_lock(&fil_system.mutex);
1886+
fil_space_t* space = fil_space_t::create(uint32_t(space_id),
1887+
flags, false,
1888+
crypt_data, mode, true);
1889+
fil_node_t* node = space->add(path, OS_FILE_CLOSED, size, false, true);
1890+
space->set_stopped();
1891+
mysql_mutex_unlock(&fil_system.mutex);
1892+
1893+
buf_block_t *header[2];
1894+
{
1895+
mtr_t mtr;
1896+
mtr.start();
1897+
mtr.log_file_op(FILE_CREATE, space_id, path);
1898+
mtr.set_named_space(space);
1899+
log_free_check();
1900+
ut_a(fsp_header_init(space, size, &mtr) == DB_SUCCESS);
1901+
ut_ad(mtr.get_savepoint() == 3);
1902+
header[0] = mtr.at_savepoint(1);
1903+
header[1] = mtr.at_savepoint(2);
1904+
ut_ad(header[0]->page.id() == page_id_t(space_id, 0));
1905+
ut_ad(header[1]->page.id() == page_id_t(space_id, 1));
1906+
/* Block any page writes before the file has been created. */
1907+
header[0]->page.lock.x_lock();
1908+
header[1]->page.lock.x_lock();
1909+
mtr.commit();
1910+
/* Durably write the FILE_CREATE record (as well as records for
1911+
the fsp_header_init() above before creating the file. */
1912+
log_write_up_to(mtr.commit_lsn(), true);
1913+
}
1914+
1915+
bool success;
1916+
pfs_os_file_t file = os_file_create(
18831917
innodb_data_file_key, path,
18841918
OS_FILE_CREATE | OS_FILE_ON_ERROR_NO_EXIT,
18851919
type, srv_read_only_mode, &success);
1920+
ut_ad(!success == (file == OS_FILE_CLOSED));
18861921

18871922
if (!success) {
1923+
*err = DB_ERROR;
18881924
/* The following call will print an error message */
18891925
switch (os_file_get_last_error(true)) {
18901926
case OS_FILE_ALREADY_EXISTS:
1891-
ib::info() << "The file '" << path << "'"
1892-
" already exists though the"
1893-
" corresponding table did not exist"
1894-
" in the InnoDB data dictionary."
1895-
" You can resolve the problem by removing"
1896-
" the file.";
1927+
sql_print_error("InnoDB: The file '%s'"
1928+
" already exists though the"
1929+
" corresponding table did not exist"
1930+
" in the InnoDB data dictionary."
1931+
" You can resolve the problem"
1932+
" by removing"
1933+
" the file.", path);
18971934
*err = DB_TABLESPACE_EXISTS;
18981935
break;
18991936
case OS_FILE_DISK_FULL:
19001937
*err = DB_OUT_OF_FILE_SPACE;
1901-
break;
1938+
/* fall through */
19021939
default:
1903-
*err = DB_ERROR;
1940+
sql_print_error("InnoDB: Cannot create file '%s'",
1941+
path);
19041942
}
1905-
ib::error() << "Cannot create file '" << path << "'";
1906-
return NULL;
1943+
goto err_exit_after_cleanup;
1944+
err_exit:
1945+
os_file_delete(innodb_data_file_key, path);
1946+
os_file_close(file);
1947+
err_exit_after_cleanup:
1948+
space->release();
1949+
fil_space_free(space_id, false);
1950+
space = nullptr;
1951+
func_exit:
1952+
header[0]->page.lock.x_unlock();
1953+
header[1]->page.lock.x_unlock();
1954+
DBUG_EXECUTE_IF("checkpoint_after_file_create",
1955+
log_make_checkpoint(););
1956+
return space;
19071957
}
19081958

1909-
const bool is_compressed = fil_space_t::is_compressed(flags);
19101959
#ifdef _WIN32
19111960
const bool is_sparse = is_compressed;
19121961
if (is_compressed) {
@@ -1917,68 +1966,38 @@ fil_ibd_create(
19171966
&& DB_SUCCESS == os_file_punch_hole(file, 0, 4096)
19181967
&& !my_test_if_thinly_provisioned(file);
19191968
#endif
1920-
1921-
if (fil_space_t::full_crc32(flags)) {
1922-
flags |= FSP_FLAGS_FCRC32_PAGE_SSIZE();
1923-
} else {
1924-
flags |= FSP_FLAGS_PAGE_SSIZE();
1925-
}
1926-
1927-
/* Create crypt data if the tablespace is either encrypted or user has
1928-
requested it to remain unencrypted. */
1929-
fil_space_crypt_t* crypt_data = (mode != FIL_ENCRYPTION_DEFAULT
1930-
|| srv_encrypt_tables)
1931-
? fil_space_create_crypt_data(mode, key_id)
1932-
: nullptr;
1933-
19341969
if (!os_file_set_size(path, file,
19351970
os_offset_t(size) << srv_page_size_shift,
19361971
is_sparse)) {
19371972
*err = DB_OUT_OF_FILE_SPACE;
1938-
err_exit:
1939-
os_file_close(file);
1940-
os_file_delete(innodb_data_file_key, path);
1941-
free(crypt_data);
1942-
return nullptr;
1973+
goto err_exit;
19431974
}
19441975

1945-
fil_space_t::name_type space_name;
1976+
DBUG_EXECUTE_IF("fil_space_create_failure",
1977+
*err = DB_OUT_OF_FILE_SPACE; goto err_exit;);
19461978

1947-
if (has_data_dir) {
1979+
if (FSP_FLAGS_HAS_DATA_DIR(flags)) {
19481980
/* Make the ISL file if the IBD file is not
19491981
in the default location. */
1950-
space_name = {name.m_name, strlen(name.m_name)};
1982+
fil_space_t::name_type space_name{name.m_name,
1983+
strlen(name.m_name)};
19511984
*err = RemoteDatafile::create_link_file(space_name, path);
19521985
if (*err != DB_SUCCESS) {
19531986
goto err_exit;
19541987
}
19551988
}
19561989

1957-
DBUG_EXECUTE_IF("checkpoint_after_file_create",
1958-
log_make_checkpoint(););
1959-
19601990
mysql_mutex_lock(&fil_system.mutex);
1961-
if (fil_space_t* space = fil_space_t::create(uint32_t(space_id),
1962-
flags, false,
1963-
crypt_data, mode, true)) {
1964-
fil_node_t* node = space->add(path, file, size, false, true);
1965-
node->find_metadata(IF_WIN(,true));
1966-
mysql_mutex_unlock(&fil_system.mutex);
1967-
mtr.start();
1968-
mtr.set_named_space(space);
1969-
ut_a(fsp_header_init(space, size, &mtr) == DB_SUCCESS);
1970-
mtr.commit();
1971-
return space;
1972-
} else {
1973-
mysql_mutex_unlock(&fil_system.mutex);
1974-
}
1975-
1976-
if (space_name.data()) {
1977-
RemoteDatafile::delete_link_file(space_name);
1991+
space->clear_stopped();
1992+
node->handle = file;
1993+
node->find_metadata(IF_WIN(,true));
1994+
if (++fil_system.n_open >= srv_max_n_open_files) {
1995+
space->reacquire();
1996+
fil_space_t::try_to_close(space, true);
1997+
space->release();
19781998
}
1979-
1980-
*err = DB_ERROR;
1981-
goto err_exit;
1999+
mysql_mutex_unlock(&fil_system.mutex);
2000+
goto func_exit;
19822001
}
19832002

19842003
fil_space_t *fil_ibd_open(ulint id, ulint flags,
@@ -2162,7 +2181,9 @@ fil_space_t *fil_ibd_open(ulint id, ulint flags,
21622181
|| df_remote.is_open() != df_remote.is_valid()) {
21632182
goto corrupted;
21642183
}
2184+
#ifndef DBUG_OFF
21652185
error:
2186+
#endif
21662187
local_err = DB_ERROR;
21672188
goto func_exit;
21682189
}
@@ -2186,6 +2207,8 @@ fil_space_t *fil_ibd_open(ulint id, ulint flags,
21862207
ut_a(valid_tablespaces_found == 1);
21872208

21882209
skip_validate:
2210+
DBUG_EXECUTE_IF("fil_space_create_failure", goto error;);
2211+
21892212
const byte* first_page =
21902213
df_default.is_open() ? df_default.get_first_page() :
21912214
df_remote.get_first_page();
@@ -2199,11 +2222,6 @@ fil_space_t *fil_ibd_open(ulint id, ulint flags,
21992222
space = fil_space_t::create(uint32_t(id), flags,
22002223
validate == fil_space_t::VALIDATE_IMPORT,
22012224
crypt_data);
2202-
if (!space) {
2203-
mysql_mutex_unlock(&fil_system.mutex);
2204-
goto error;
2205-
}
2206-
22072225
/* We do not measure the size of the file, that is why
22082226
we pass the 0 below */
22092227

@@ -2508,11 +2526,6 @@ fil_ibd_load(
25082526
space = fil_space_t::create(uint32_t(space_id), flags, false,
25092527
crypt_data);
25102528

2511-
if (space == NULL) {
2512-
mysql_mutex_unlock(&fil_system.mutex);
2513-
return(FIL_LOAD_INVALID);
2514-
}
2515-
25162529
ut_ad(space->id == file.space_id());
25172530
ut_ad(space->id == space_id);
25182531

storage/innobase/fsp/fsp0space.cc

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ Tablespace::open_or_create(bool is_temp)
112112
/* We can close the handle now and open the tablespace
113113
the proper way. */
114114
it->close();
115+
mysql_mutex_lock(&fil_system.mutex);
115116

116117
if (it == begin()) {
117118
/* First data file. */
@@ -130,16 +131,9 @@ Tablespace::open_or_create(bool is_temp)
130131
fsp_flags = FSP_FLAGS_PAGE_SSIZE();
131132
}
132133

133-
mysql_mutex_lock(&fil_system.mutex);
134134
space = fil_space_t::create(
135135
uint32_t(m_space_id), fsp_flags,
136136
false, nullptr);
137-
if (!space) {
138-
mysql_mutex_unlock(&fil_system.mutex);
139-
return DB_ERROR;
140-
}
141-
} else {
142-
mysql_mutex_lock(&fil_system.mutex);
143137
}
144138
space->add(it->m_filepath, OS_FILE_CLOSED, it->m_size,
145139
false, true);

storage/innobase/fsp/fsp0sysspace.cc

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -938,21 +938,13 @@ SysTablespace::open_or_create(
938938
space = fil_space_t::create(
939939
SRV_TMP_SPACE_ID, flags(), false, nullptr);
940940
ut_ad(space == fil_system.temp_space);
941-
if (!space) {
942-
err = DB_ERROR;
943-
break;
944-
}
945941
ut_ad(!space->is_compressed());
946942
ut_ad(space->full_crc32());
947943
} else {
948944
ut_ad(space_id() == TRX_SYS_SPACE);
949945
space = fil_space_t::create(
950946
TRX_SYS_SPACE, it->flags(), false, nullptr);
951947
ut_ad(space == fil_system.sys_space);
952-
if (!space) {
953-
err = DB_ERROR;
954-
break;
955-
}
956948
}
957949

958950
uint32_t max_size = (++node_counter == m_files.size()

storage/innobase/include/fil0fil.h

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -593,6 +593,24 @@ struct fil_space_t final
593593
n_pending.fetch_and(~NEEDS_FSYNC, std::memory_order_release);
594594
}
595595

596+
/** Set the STOPPING flags when creating a file,
597+
to block premature fil_crypt_find_space_to_rotate() */
598+
inline void set_stopped() noexcept
599+
{
600+
ut_d(uint32_t n=)
601+
n_pending.fetch_add(STOPPING + 1, std::memory_order_relaxed);
602+
ut_ad(n == CLOSING);
603+
}
604+
605+
/** Clear the STOPPING flags after creating a file */
606+
inline void clear_stopped() noexcept
607+
{
608+
ut_d(uint32_t n=)
609+
n_pending.fetch_sub(STOPPING + CLOSING + 1, std::memory_order_relaxed);
610+
ut_ad(n & PENDING);
611+
ut_ad((n & ~PENDING) == (STOPPING | CLOSING));
612+
}
613+
596614
private:
597615
/** Clear the CLOSING flag */
598616
void clear_closing() noexcept
@@ -959,8 +977,7 @@ struct fil_space_t final
959977
@param crypt_data encryption information
960978
@param mode encryption mode
961979
@param opened whether the tablespace files are open
962-
@return pointer to created tablespace, to be filled in with add()
963-
@retval nullptr on failure (such as when the same tablespace exists) */
980+
@return pointer to created tablespace, to be filled in with add() */
964981
static fil_space_t *create(uint32_t id, ulint flags, bool being_imported,
965982
fil_space_crypt_t *crypt_data,
966983
fil_encryption_t mode= FIL_ENCRYPTION_DEFAULT,

0 commit comments

Comments
 (0)