Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions format_spec/FORMAT_SPEC.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ title: Format Specification

* The current TileDB array format version number is **22** (`uint32_t`).
* Other structures might be versioned separately.
* Readers should ignore files that do not follow the respective
format specification.
Comment on lines +9 to +10
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A better phrasing would be:

When listing directories, readers should ignore files that do not follow the expected format for each respective case.

* Data written by TileDB and referenced in this document is **little-endian**
with the following exceptions:

Expand Down
2 changes: 1 addition & 1 deletion format_spec/metadata.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ my_array # array folder
```

The metadata folder can contain any number of [timestamped](./timestamped_name.md):
* [metadata files](#array-metadata-file)
* [metadata files](#metadata-file)
* [vacuum files](./vacuum_file.md)
* **Note**: the timestamped names do _not_ include the format version.

Expand Down
34 changes: 34 additions & 0 deletions test/src/unit-cppapi-metadata.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@
* Tests the C++ API for array metadata.
*/

#include <tiledb/api/c_api/array/array_api_internal.h>
#include <latch>
#include "test/support/src/helpers.h"
#include "test/support/src/vfs_helpers.h"
#include "tiledb/sm/c_api/tiledb.h"
Expand Down Expand Up @@ -180,6 +182,38 @@ TEST_CASE_METHOD(
array.close();
}

TEST_CASE_METHOD(
CPPMetadataFx,
"C++ API: Metadata write / read multithread",
"[cppapi][metadata][multithread]") {
create_default_array_1d();
Context ctx;
for (int i = 1; i <= 100; i++) {
// Grow the size of metadata each write.
std::vector<uint64_t> b(100 * i);
std::iota(b.begin(), b.end(), 0);
std::latch get_metadata(2);
std::thread t1([&]() {
Array array(ctx, std::string(array_name_), TILEDB_WRITE);
array.put_metadata("a", TILEDB_UINT64, (uint32_t)b.size(), b.data());
get_metadata.count_down();
array.close();
});

std::thread t2([&]() {
get_metadata.arrive_and_wait();
Array read_array(ctx, std::string(array_name_), TILEDB_READ);
tiledb_datatype_t type;
uint32_t value_num = 0;
const void* data;
read_array.get_metadata("a", &type, &value_num, &data);
read_array.close();
});

t1.join(), t2.join();
}
}

TEST_CASE_METHOD(
CPPMetadataFx,
"C++ API: Metadata, write/read",
Expand Down
16 changes: 13 additions & 3 deletions tiledb/sm/array/array_directory.cc
Original file line number Diff line number Diff line change
Expand Up @@ -594,7 +594,7 @@ std::vector<URI> ArrayDirectory::ls(const URI& uri) const {
std::vector<URI> uris;
uris.reserve(dir_entries.size());

for (auto entry : dir_entries) {
for (const auto& entry : dir_entries) {
auto entry_uri = URI(entry.path().native());

// Always list directories
Expand Down Expand Up @@ -837,8 +837,18 @@ void ArrayDirectory::load_array_meta_uris() {
std::vector<URI> array_meta_dir_uris;
{
auto timer_se = stats_->start_timer("list_array_meta_uris");
array_meta_dir_uris =
ls(uri_.join_path(constants::array_metadata_dir_name));
std::vector<URI> unfiltered_uris;
throw_if_not_ok(resources_.get().vfs().ls(
uri_.join_path(constants::array_metadata_dir_name), &unfiltered_uris));
for (const auto& unfiltered_uri : unfiltered_uris) {
const auto& uri = unfiltered_uri.to_string();
if (!unfiltered_uri.is_timestamped_name()) {
resources_.get().logger()->debug(
"Skipping partial array metadata file: '{}'", uri);
continue;
}
array_meta_dir_uris.emplace_back(uri);
}
}

// Compute array metadata URIs and vacuum URIs to vacuum. */
Expand Down
1 change: 1 addition & 0 deletions tiledb/sm/filesystem/filesystem_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ class FilesystemBase {
* Retrieves all the entries contained in the parent.
*
* @param parent The target directory to list.
* @param get_sizes Flag to toggle retrieving file sizes.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Leftover?

* @return All entries that are contained in the parent
*/
virtual std::vector<tiledb::common::filesystem::directory_entry>
Expand Down
14 changes: 14 additions & 0 deletions tiledb/sm/filesystem/local.cc
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,20 @@ LsObjects LocalFilesystem::ls_filtered_v2(
return qualifyingPaths;
}

Status LocalFilesystem::ls(
const std::string& path, std::vector<std::string>* paths) const {
for (auto& fs : ls_with_sizes(URI(path), false)) {
paths->emplace_back(fs.path().native());
}

return Status::Ok();
}

std::vector<filesystem::directory_entry> LocalFilesystem::ls_with_sizes(
const URI& uri) const {
return ls_with_sizes(uri, true);
}

void LocalFilesystem::copy_file(const URI& old_uri, const URI& new_uri) {
std::filesystem::copy_file(
old_uri.to_path(),
Expand Down
15 changes: 15 additions & 0 deletions tiledb/sm/filesystem/local.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,21 @@ class LocalFilesystem : public FilesystemBase {
ResultFilterV2 result_filter,
bool recursive) const override;

/**
* Lists files one level deep under a given path.
*
* @param path The parent path to list sub-paths.
* @param paths Pointer to a vector of strings to store the retrieved paths.
* @return Status
*/
Status ls(const std::string& path, std::vector<std::string>* paths) const;

std::vector<tiledb::common::filesystem::directory_entry> ls_with_sizes(
const URI& uri) const override;

virtual std::vector<tiledb::common::filesystem::directory_entry>
ls_with_sizes(const URI& uri, bool get_sizes) const = 0;

void copy_file(const URI& old_uri, const URI& new_uri) override;

void copy_dir(const URI& old_uri, const URI& new_uri) override;
Expand Down
25 changes: 9 additions & 16 deletions tiledb/sm/filesystem/posix.cc
Original file line number Diff line number Diff line change
Expand Up @@ -361,7 +361,13 @@ void Posix::write(
}
}

std::vector<directory_entry> Posix::ls_with_sizes(const URI& uri) const {
std::vector<directory_entry> Posix::ls_with_sizes(
const URI& uri, bool get_sizes) const {
// Noop if the parent uri is not a directory, do not error out.
if (!is_dir(uri)) {
return {};
}

std::string path = uri.to_path();
struct dirent* next_path = nullptr;
auto dir = PosixDIR::open(path);
Expand All @@ -376,29 +382,16 @@ std::vector<directory_entry> Posix::ls_with_sizes(const URI& uri) const {
continue;
std::string abspath = path + "/" + next_path->d_name;

// Getting the file size here incurs an additional system call
// via file_size() and ls() calls will feel this too.
// If this penalty becomes noticeable, we should just duplicate
// this implementation in ls() and don't get the size
if (next_path->d_type == DT_DIR) {
entries.emplace_back(abspath, 0, true);
} else {
uint64_t size = file_size(URI(abspath));
entries.emplace_back(abspath, size, false);
entries.emplace_back(
abspath, get_sizes ? file_size(URI(abspath)) : 0, false);
}
}
return entries;
}

Status Posix::ls(
const std::string& path, std::vector<std::string>* paths) const {
for (auto& fs : ls_with_sizes(URI(path))) {
paths->emplace_back(fs.path().native());
}

return Status::Ok();
}

std::string Posix::abs_path(std::string_view path) {
std::string resolved_path = abs_path_internal(path);

Expand Down
11 changes: 1 addition & 10 deletions tiledb/sm/filesystem/posix.h
Original file line number Diff line number Diff line change
Expand Up @@ -228,16 +228,7 @@ class Posix : public LocalFilesystem {
* @return A list of directory_entry objects
*/
std::vector<tiledb::common::filesystem::directory_entry> ls_with_sizes(
const URI& uri) const override;

/**
* Lists files one level deep under a given path.
*
* @param path The parent path to list sub-paths.
* @param paths Pointer to a vector of strings to store the retrieved paths.
* @return Status
*/
Status ls(const std::string& path, std::vector<std::string>* paths) const;
const URI& uri, bool get_sizes) const override;

/**
* Returns the absolute posix (string) path of the input in the
Expand Down
27 changes: 27 additions & 0 deletions tiledb/sm/filesystem/test/unit_uri.cc
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,33 @@ TEST_CASE("URI: Test REST components, invalid", "[uri]") {
}
}

TEST_CASE("URI: Test is_timestamped_name", "[uri][is_timestamped_name]") {
std::vector<std::pair<std::string, bool>> test_uris = {
{"__1764100213547_1764100213550_035477e475b011ac8c2f01a13532ccad.vac",
true},
{"__1764100213547_1764100213550_035477e475b011ac8c2f01a13532ccad", true},
{"__1_1_035477e475b011ac8c2f01a13532ccad", true},
{"__1_1_035477e475b011ac8c2f01a13532ccad_22", true},
{"__1_1_035477e475b011ac8c2f01a13532ccad_22.vac", true},
{"__1_1_035477e475b011ac8c2f01a13532ccad_22.tmp", false},
{"__1_1_035477e475b011ac8c2f01a13532ccad.tmp", false},
{"__1_1_035477e475b011ac8c2f01a13532cca", false},
{"___1_035477e475b011ac8c2f01a13532cca", false},
{"_1_1_035477e475b011ac8c2f01a13532ccad", false},
{"__1_1_035477e475b011ac8c2f01a13532ccad.", false},
{"__1_1_035477e475b011a_c8c2f01a13532ccad.vaC", false},
{"", false},
{"______", false},
{"__1_2_3", false},
};
for (const auto& test : test_uris) {
URI uri(test.first, false);
DYNAMIC_SECTION("URI: " << test.first) {
CHECK(uri.is_timestamped_name() == test.second);
}
}
}

TEST_CASE("URI: Test get_fragment_name", "[uri][get_fragment_name]") {
std::vector<std::pair<URI, std::optional<URI>>> cases = {
{URI("a randomish string"), std::nullopt},
Expand Down
65 changes: 65 additions & 0 deletions tiledb/sm/filesystem/uri.cc
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,71 @@ bool URI::is_tiledb() const {
return utils::parse::starts_with(uri_, "tiledb://");
}

bool URI::is_timestamped_name() const {
std::string part = last_path_part();
// __1_2_<32-digit-UUID> must be at minimum 38 characters long.
if (!part.starts_with("__") || part.size() < 38) {
return false;
}

// Separator between t1_t2.
size_t t1_separator = part.find('_', 2);
// Separator between t2_uuid.
size_t t2_separator = part.find('_', t1_separator + 1);
if (t1_separator == std::string::npos || t2_separator == std::string::npos) {
return false;
}

// Validate the timestamps are formatted correctly.
std::string t1 = part.substr(2, t1_separator - 2);
std::string t2 =
part.substr(t1_separator + 1, t2_separator - t1_separator - 1);
for (const auto& t : {t1, t2}) {
if (!std::all_of(
t.begin(), t.end(), [](const char c) { return std::isdigit(c); })) {
return false;
}
}
auto get_suffix = [](const std::string& p) -> std::string {
size_t suffix_separator = p.find('.');
if (suffix_separator == std::string::npos) {
return "";
}
return p.substr(suffix_separator);
};

// Separator between uuid[_v].
size_t uuid_separator = part.find('_', t2_separator + 1);
std::string uuid;
std::string suffix;
if (uuid_separator == std::string::npos) {
// There is no version; Check the UUID for the final suffix.
uuid = part.substr(t2_separator + 1);
suffix = get_suffix(uuid);
uuid.resize(uuid.size() - suffix.size());
} else {
uuid = part.substr(t2_separator + 1, uuid_separator - t2_separator - 1);
std::string version = part.substr(uuid_separator + 1);
// Check the version for the final suffix.
suffix = get_suffix(version);
version.resize(version.size() - suffix.size());
if (version.size() > std::to_string(constants::format_version).size()) {
return false;
}
}

// UUIDs generated for timestamped names are 32 characters long.
if (uuid.size() != 32) {
return false;
}

if (!suffix.empty() && suffix != constants::vacuum_file_suffix) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why special-case .vac extension?

return false;
}

return true;
}

Comment on lines +194 to +258
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Share this logic with FragmentID?

std::optional<size_t> URI::get_storage_component_index(
size_t start_index) const {
// Find '://' between path and array name. iff it exists
Expand Down
7 changes: 7 additions & 0 deletions tiledb/sm/filesystem/uri.h
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,13 @@ class URI {
*/
bool is_tiledb() const;

/**
* Checks if the last part of the URI is a valid timestamped name.
*
* @return True if the last part if a timestamped name, else false.
*/
bool is_timestamped_name() const;

/**
* Get the position in the input URI where the embedded storage URI starts, if
* one can be found.
Expand Down
15 changes: 13 additions & 2 deletions tiledb/sm/filesystem/vfs.cc
Original file line number Diff line number Diff line change
Expand Up @@ -364,10 +364,21 @@ bool VFS::is_bucket(const URI& uri) const {
Status VFS::ls(const URI& parent, std::vector<URI>* uris) const {
stats_->add_counter("ls_num", 1);

for (auto& fs : ls_with_sizes(parent)) {
auto entries = parent.is_file() ? local_.ls_with_sizes(parent, false) :
ls_with_sizes(parent);
if (parent.is_file()) {
parallel_sort(
compute_tp_,
entries.begin(),
entries.end(),
[](const directory_entry& l, const directory_entry& r) {
return l.path().native() < r.path().native();
});
}
Comment on lines +369 to +377
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is pre-existing, but curious; is there code in this repo that assumes that paths are sorted? Sorting paths when listing is not a good idea in general, and precludes incrementally reading them.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The only code expecting sorted results that I'm aware of is test code. For example here's the CI run with the failures that this produced when I accidentally bypassed sorting in c2e1816

https://github.com/TileDB-Inc/TileDB/actions/runs/19717674222/job/56493579308#step:14:32027


for (const auto& fs : entries) {
uris->emplace_back(fs.path().native());
}

return Status::Ok();
}

Expand Down
15 changes: 7 additions & 8 deletions tiledb/sm/filesystem/win.cc
Original file line number Diff line number Diff line change
Expand Up @@ -348,15 +348,13 @@ bool Win::is_file(const URI& uri) const {
return PathFileExists(path.c_str()) && !PathIsDirectory(path.c_str());
}

Status Win::ls(const std::string& path, std::vector<std::string>* paths) const {
for (auto& fs : ls_with_sizes(URI(path))) {
paths->emplace_back(fs.path().native());
std::vector<directory_entry> Win::ls_with_sizes(
const URI& uri, bool get_sizes) const {
// Noop if the parent path is not a directory, do not error out.
if (!is_dir(uri)) {
return {};
}

return Status::Ok();
}

std::vector<directory_entry> Win::ls_with_sizes(const URI& uri) const {
auto path = uri.to_path();
bool ends_with_slash = path.length() > 0 && path[path.length() - 1] == '\\';
const std::string glob = path + (ends_with_slash ? "*" : "\\*");
Expand All @@ -383,13 +381,14 @@ std::vector<directory_entry> Win::ls_with_sizes(const URI& uri) const {
strcmp(find_data.cFileName, "..") != 0) {
std::string file_path =
path + (ends_with_slash ? "" : "\\") + find_data.cFileName;

if (find_data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) {
entries.emplace_back(file_path, 0, true);
} else {
ULARGE_INTEGER size;
size.LowPart = find_data.nFileSizeLow;
size.HighPart = find_data.nFileSizeHigh;
entries.emplace_back(file_path, size.QuadPart, false);
entries.emplace_back(file_path, get_sizes ? size.QuadPart : 0, false);
}
}

Expand Down
Loading
Loading