diff --git a/format_spec/FORMAT_SPEC.md b/format_spec/FORMAT_SPEC.md index ed952a9dbbf..9545772458c 100644 --- a/format_spec/FORMAT_SPEC.md +++ b/format_spec/FORMAT_SPEC.md @@ -6,6 +6,8 @@ title: Format Specification * The current TileDB array format version number is **22** (`uint32_t`). * Other structures might be versioned separately. + * Readers should ignore files that do not follow the respective + format specification. * Data written by TileDB and referenced in this document is **little-endian** with the following exceptions: diff --git a/format_spec/metadata.md b/format_spec/metadata.md index 7be46fc3301..7f8b88a5a0b 100644 --- a/format_spec/metadata.md +++ b/format_spec/metadata.md @@ -17,7 +17,7 @@ my_array # array folder ``` The metadata folder can contain any number of [timestamped](./timestamped_name.md): -* [metadata files](#array-metadata-file) +* [metadata files](#metadata-file) * [vacuum files](./vacuum_file.md) * **Note**: the timestamped names do _not_ include the format version. diff --git a/test/src/unit-cppapi-metadata.cc b/test/src/unit-cppapi-metadata.cc index 7095daf8146..ae237a0f6d1 100644 --- a/test/src/unit-cppapi-metadata.cc +++ b/test/src/unit-cppapi-metadata.cc @@ -30,6 +30,8 @@ * Tests the C++ API for array metadata. */ +#include +#include #include "test/support/src/helpers.h" #include "test/support/src/vfs_helpers.h" #include "tiledb/sm/c_api/tiledb.h" @@ -180,6 +182,38 @@ TEST_CASE_METHOD( array.close(); } +TEST_CASE_METHOD( + CPPMetadataFx, + "C++ API: Metadata write / read multithread", + "[cppapi][metadata][multithread]") { + create_default_array_1d(); + Context ctx; + for (int i = 1; i <= 100; i++) { + // Grow the size of metadata each write. + std::vector b(100 * i); + std::iota(b.begin(), b.end(), 0); + std::latch get_metadata(2); + std::thread t1([&]() { + Array array(ctx, std::string(array_name_), TILEDB_WRITE); + array.put_metadata("a", TILEDB_UINT64, (uint32_t)b.size(), b.data()); + get_metadata.count_down(); + array.close(); + }); + + std::thread t2([&]() { + get_metadata.arrive_and_wait(); + Array read_array(ctx, std::string(array_name_), TILEDB_READ); + tiledb_datatype_t type; + uint32_t value_num = 0; + const void* data; + read_array.get_metadata("a", &type, &value_num, &data); + read_array.close(); + }); + + t1.join(), t2.join(); + } +} + TEST_CASE_METHOD( CPPMetadataFx, "C++ API: Metadata, write/read", diff --git a/tiledb/sm/array/array_directory.cc b/tiledb/sm/array/array_directory.cc index 6d515f0fb12..f215b7fd0ad 100644 --- a/tiledb/sm/array/array_directory.cc +++ b/tiledb/sm/array/array_directory.cc @@ -594,7 +594,7 @@ std::vector ArrayDirectory::ls(const URI& uri) const { std::vector uris; uris.reserve(dir_entries.size()); - for (auto entry : dir_entries) { + for (const auto& entry : dir_entries) { auto entry_uri = URI(entry.path().native()); // Always list directories @@ -837,8 +837,18 @@ void ArrayDirectory::load_array_meta_uris() { std::vector array_meta_dir_uris; { auto timer_se = stats_->start_timer("list_array_meta_uris"); - array_meta_dir_uris = - ls(uri_.join_path(constants::array_metadata_dir_name)); + std::vector unfiltered_uris; + throw_if_not_ok(resources_.get().vfs().ls( + uri_.join_path(constants::array_metadata_dir_name), &unfiltered_uris)); + for (const auto& unfiltered_uri : unfiltered_uris) { + const auto& uri = unfiltered_uri.to_string(); + if (!unfiltered_uri.is_timestamped_name()) { + resources_.get().logger()->debug( + "Skipping partial array metadata file: '{}'", uri); + continue; + } + array_meta_dir_uris.emplace_back(uri); + } } // Compute array metadata URIs and vacuum URIs to vacuum. */ diff --git a/tiledb/sm/filesystem/filesystem_base.h b/tiledb/sm/filesystem/filesystem_base.h index 20f821828e9..769f93196f5 100644 --- a/tiledb/sm/filesystem/filesystem_base.h +++ b/tiledb/sm/filesystem/filesystem_base.h @@ -147,6 +147,7 @@ class FilesystemBase { * Retrieves all the entries contained in the parent. * * @param parent The target directory to list. + * @param get_sizes Flag to toggle retrieving file sizes. * @return All entries that are contained in the parent */ virtual std::vector diff --git a/tiledb/sm/filesystem/local.cc b/tiledb/sm/filesystem/local.cc index 7cfedb209bb..3b0ae256f51 100644 --- a/tiledb/sm/filesystem/local.cc +++ b/tiledb/sm/filesystem/local.cc @@ -124,6 +124,20 @@ LsObjects LocalFilesystem::ls_filtered_v2( return qualifyingPaths; } +Status LocalFilesystem::ls( + const std::string& path, std::vector* paths) const { + for (auto& fs : ls_with_sizes(URI(path), false)) { + paths->emplace_back(fs.path().native()); + } + + return Status::Ok(); +} + +std::vector LocalFilesystem::ls_with_sizes( + const URI& uri) const { + return ls_with_sizes(uri, true); +} + void LocalFilesystem::copy_file(const URI& old_uri, const URI& new_uri) { std::filesystem::copy_file( old_uri.to_path(), diff --git a/tiledb/sm/filesystem/local.h b/tiledb/sm/filesystem/local.h index c686cc7a837..94d88c8da68 100644 --- a/tiledb/sm/filesystem/local.h +++ b/tiledb/sm/filesystem/local.h @@ -53,6 +53,21 @@ class LocalFilesystem : public FilesystemBase { ResultFilterV2 result_filter, bool recursive) const override; + /** + * Lists files one level deep under a given path. + * + * @param path The parent path to list sub-paths. + * @param paths Pointer to a vector of strings to store the retrieved paths. + * @return Status + */ + Status ls(const std::string& path, std::vector* paths) const; + + std::vector ls_with_sizes( + const URI& uri) const override; + + virtual std::vector + ls_with_sizes(const URI& uri, bool get_sizes) const = 0; + void copy_file(const URI& old_uri, const URI& new_uri) override; void copy_dir(const URI& old_uri, const URI& new_uri) override; diff --git a/tiledb/sm/filesystem/posix.cc b/tiledb/sm/filesystem/posix.cc index c29aa26d86b..74dd0739b94 100644 --- a/tiledb/sm/filesystem/posix.cc +++ b/tiledb/sm/filesystem/posix.cc @@ -361,7 +361,13 @@ void Posix::write( } } -std::vector Posix::ls_with_sizes(const URI& uri) const { +std::vector Posix::ls_with_sizes( + const URI& uri, bool get_sizes) const { + // Noop if the parent uri is not a directory, do not error out. + if (!is_dir(uri)) { + return {}; + } + std::string path = uri.to_path(); struct dirent* next_path = nullptr; auto dir = PosixDIR::open(path); @@ -376,29 +382,16 @@ std::vector Posix::ls_with_sizes(const URI& uri) const { continue; std::string abspath = path + "/" + next_path->d_name; - // Getting the file size here incurs an additional system call - // via file_size() and ls() calls will feel this too. - // If this penalty becomes noticeable, we should just duplicate - // this implementation in ls() and don't get the size if (next_path->d_type == DT_DIR) { entries.emplace_back(abspath, 0, true); } else { - uint64_t size = file_size(URI(abspath)); - entries.emplace_back(abspath, size, false); + entries.emplace_back( + abspath, get_sizes ? file_size(URI(abspath)) : 0, false); } } return entries; } -Status Posix::ls( - const std::string& path, std::vector* paths) const { - for (auto& fs : ls_with_sizes(URI(path))) { - paths->emplace_back(fs.path().native()); - } - - return Status::Ok(); -} - std::string Posix::abs_path(std::string_view path) { std::string resolved_path = abs_path_internal(path); diff --git a/tiledb/sm/filesystem/posix.h b/tiledb/sm/filesystem/posix.h index bddb4b84284..dca4a3ff297 100644 --- a/tiledb/sm/filesystem/posix.h +++ b/tiledb/sm/filesystem/posix.h @@ -228,16 +228,7 @@ class Posix : public LocalFilesystem { * @return A list of directory_entry objects */ std::vector ls_with_sizes( - const URI& uri) const override; - - /** - * Lists files one level deep under a given path. - * - * @param path The parent path to list sub-paths. - * @param paths Pointer to a vector of strings to store the retrieved paths. - * @return Status - */ - Status ls(const std::string& path, std::vector* paths) const; + const URI& uri, bool get_sizes) const override; /** * Returns the absolute posix (string) path of the input in the diff --git a/tiledb/sm/filesystem/test/unit_uri.cc b/tiledb/sm/filesystem/test/unit_uri.cc index 2fc70070952..0e57ac451be 100644 --- a/tiledb/sm/filesystem/test/unit_uri.cc +++ b/tiledb/sm/filesystem/test/unit_uri.cc @@ -311,6 +311,33 @@ TEST_CASE("URI: Test REST components, invalid", "[uri]") { } } +TEST_CASE("URI: Test is_timestamped_name", "[uri][is_timestamped_name]") { + std::vector> test_uris = { + {"__1764100213547_1764100213550_035477e475b011ac8c2f01a13532ccad.vac", + true}, + {"__1764100213547_1764100213550_035477e475b011ac8c2f01a13532ccad", true}, + {"__1_1_035477e475b011ac8c2f01a13532ccad", true}, + {"__1_1_035477e475b011ac8c2f01a13532ccad_22", true}, + {"__1_1_035477e475b011ac8c2f01a13532ccad_22.vac", true}, + {"__1_1_035477e475b011ac8c2f01a13532ccad_22.tmp", false}, + {"__1_1_035477e475b011ac8c2f01a13532ccad.tmp", false}, + {"__1_1_035477e475b011ac8c2f01a13532cca", false}, + {"___1_035477e475b011ac8c2f01a13532cca", false}, + {"_1_1_035477e475b011ac8c2f01a13532ccad", false}, + {"__1_1_035477e475b011ac8c2f01a13532ccad.", false}, + {"__1_1_035477e475b011a_c8c2f01a13532ccad.vaC", false}, + {"", false}, + {"______", false}, + {"__1_2_3", false}, + }; + for (const auto& test : test_uris) { + URI uri(test.first, false); + DYNAMIC_SECTION("URI: " << test.first) { + CHECK(uri.is_timestamped_name() == test.second); + } + } +} + TEST_CASE("URI: Test get_fragment_name", "[uri][get_fragment_name]") { std::vector>> cases = { {URI("a randomish string"), std::nullopt}, diff --git a/tiledb/sm/filesystem/uri.cc b/tiledb/sm/filesystem/uri.cc index 766af22c83a..426fdb3dc4a 100644 --- a/tiledb/sm/filesystem/uri.cc +++ b/tiledb/sm/filesystem/uri.cc @@ -191,6 +191,71 @@ bool URI::is_tiledb() const { return utils::parse::starts_with(uri_, "tiledb://"); } +bool URI::is_timestamped_name() const { + std::string part = last_path_part(); + // __1_2_<32-digit-UUID> must be at minimum 38 characters long. + if (!part.starts_with("__") || part.size() < 38) { + return false; + } + + // Separator between t1_t2. + size_t t1_separator = part.find('_', 2); + // Separator between t2_uuid. + size_t t2_separator = part.find('_', t1_separator + 1); + if (t1_separator == std::string::npos || t2_separator == std::string::npos) { + return false; + } + + // Validate the timestamps are formatted correctly. + std::string t1 = part.substr(2, t1_separator - 2); + std::string t2 = + part.substr(t1_separator + 1, t2_separator - t1_separator - 1); + for (const auto& t : {t1, t2}) { + if (!std::all_of( + t.begin(), t.end(), [](const char c) { return std::isdigit(c); })) { + return false; + } + } + auto get_suffix = [](const std::string& p) -> std::string { + size_t suffix_separator = p.find('.'); + if (suffix_separator == std::string::npos) { + return ""; + } + return p.substr(suffix_separator); + }; + + // Separator between uuid[_v]. + size_t uuid_separator = part.find('_', t2_separator + 1); + std::string uuid; + std::string suffix; + if (uuid_separator == std::string::npos) { + // There is no version; Check the UUID for the final suffix. + uuid = part.substr(t2_separator + 1); + suffix = get_suffix(uuid); + uuid.resize(uuid.size() - suffix.size()); + } else { + uuid = part.substr(t2_separator + 1, uuid_separator - t2_separator - 1); + std::string version = part.substr(uuid_separator + 1); + // Check the version for the final suffix. + suffix = get_suffix(version); + version.resize(version.size() - suffix.size()); + if (version.size() > std::to_string(constants::format_version).size()) { + return false; + } + } + + // UUIDs generated for timestamped names are 32 characters long. + if (uuid.size() != 32) { + return false; + } + + if (!suffix.empty() && suffix != constants::vacuum_file_suffix) { + return false; + } + + return true; +} + std::optional URI::get_storage_component_index( size_t start_index) const { // Find '://' between path and array name. iff it exists diff --git a/tiledb/sm/filesystem/uri.h b/tiledb/sm/filesystem/uri.h index 97a1aa3c871..3f00f7ea9a2 100644 --- a/tiledb/sm/filesystem/uri.h +++ b/tiledb/sm/filesystem/uri.h @@ -226,6 +226,13 @@ class URI { */ bool is_tiledb() const; + /** + * Checks if the last part of the URI is a valid timestamped name. + * + * @return True if the last part if a timestamped name, else false. + */ + bool is_timestamped_name() const; + /** * Get the position in the input URI where the embedded storage URI starts, if * one can be found. diff --git a/tiledb/sm/filesystem/vfs.cc b/tiledb/sm/filesystem/vfs.cc index 482aca3a1c4..ec89d8a18cb 100644 --- a/tiledb/sm/filesystem/vfs.cc +++ b/tiledb/sm/filesystem/vfs.cc @@ -364,10 +364,21 @@ bool VFS::is_bucket(const URI& uri) const { Status VFS::ls(const URI& parent, std::vector* uris) const { stats_->add_counter("ls_num", 1); - for (auto& fs : ls_with_sizes(parent)) { + auto entries = parent.is_file() ? local_.ls_with_sizes(parent, false) : + ls_with_sizes(parent); + if (parent.is_file()) { + parallel_sort( + compute_tp_, + entries.begin(), + entries.end(), + [](const directory_entry& l, const directory_entry& r) { + return l.path().native() < r.path().native(); + }); + } + + for (const auto& fs : entries) { uris->emplace_back(fs.path().native()); } - return Status::Ok(); } diff --git a/tiledb/sm/filesystem/win.cc b/tiledb/sm/filesystem/win.cc index c0e8a51f6f9..dd2bf97ab92 100644 --- a/tiledb/sm/filesystem/win.cc +++ b/tiledb/sm/filesystem/win.cc @@ -348,15 +348,13 @@ bool Win::is_file(const URI& uri) const { return PathFileExists(path.c_str()) && !PathIsDirectory(path.c_str()); } -Status Win::ls(const std::string& path, std::vector* paths) const { - for (auto& fs : ls_with_sizes(URI(path))) { - paths->emplace_back(fs.path().native()); +std::vector Win::ls_with_sizes( + const URI& uri, bool get_sizes) const { + // Noop if the parent path is not a directory, do not error out. + if (!is_dir(uri)) { + return {}; } - return Status::Ok(); -} - -std::vector Win::ls_with_sizes(const URI& uri) const { auto path = uri.to_path(); bool ends_with_slash = path.length() > 0 && path[path.length() - 1] == '\\'; const std::string glob = path + (ends_with_slash ? "*" : "\\*"); @@ -383,13 +381,14 @@ std::vector Win::ls_with_sizes(const URI& uri) const { strcmp(find_data.cFileName, "..") != 0) { std::string file_path = path + (ends_with_slash ? "" : "\\") + find_data.cFileName; + if (find_data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) { entries.emplace_back(file_path, 0, true); } else { ULARGE_INTEGER size; size.LowPart = find_data.nFileSizeLow; size.HighPart = find_data.nFileSizeHigh; - entries.emplace_back(file_path, size.QuadPart, false); + entries.emplace_back(file_path, get_sizes ? size.QuadPart : 0, false); } } diff --git a/tiledb/sm/filesystem/win.h b/tiledb/sm/filesystem/win.h index 134d74c58e1..6a07bb572bd 100644 --- a/tiledb/sm/filesystem/win.h +++ b/tiledb/sm/filesystem/win.h @@ -174,16 +174,6 @@ class Win : public LocalFilesystem { */ bool is_file(const URI& uri) const override; - /** - * - * Lists files one level deep under a given path. - * - * @param path The parent path to list sub-paths. - * @param paths Pointer to a vector of strings to store the retrieved paths. - * @return Status - */ - Status ls(const std::string& path, std::vector* paths) const; - /** * * Lists files and file information under a given path. @@ -192,7 +182,7 @@ class Win : public LocalFilesystem { * @return A list of directory_entry objects */ std::vector ls_with_sizes( - const URI& path) const override; + const URI& path, bool get_sizes) const override; /** * Move a given filesystem path. diff --git a/tiledb/sm/metadata/metadata.cc b/tiledb/sm/metadata/metadata.cc index 9f0bb2a2a26..dcef2d8f94b 100644 --- a/tiledb/sm/metadata/metadata.cc +++ b/tiledb/sm/metadata/metadata.cc @@ -210,8 +210,15 @@ void Metadata::store( resources.stats().add_counter( "write_meta_size", size_computation_serializer.size()); - // Create a metadata file name - GenericTileIO::store_data(resources, get_uri(uri), tile, encryption_key); + if (uri.is_file()) { + // For LocalFS use a .tmp extension until write is flushed. + URI metadata_uri(get_uri(uri).to_string() + ".tmp"); + GenericTileIO::store_data(resources, metadata_uri, tile, encryption_key); + resources.vfs().move_file(metadata_uri, get_uri(uri)); + } else { + // Create a metadata file name + GenericTileIO::store_data(resources, get_uri(uri), tile, encryption_key); + } } void Metadata::del(const char* key) {