Skip to content

Commit 371ae03

Browse files
Add support for Azure Data Lake Storage. (#5652)
This PR updates the Azure VFS to support storage accounts with [hierarchical namespace](https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-namespace) enabled, also known as Azure Data Lake Storage Gen2. Before the first Azure operation, we check whether the account supports hierarchical namespace, and if it does, specialize some of the operations to use a `DatalakeServiceClient`. A dependency on `azure-storage-datalake-cpp` was also added. Automated testing is not currently implemented, due to lack of support by Azurite. Let me know if you want to test this on a real Azure Storage account. --- TYPE: FEATURE DESC: Added support for Azure Data Lake Storage.
1 parent d3d2cc8 commit 371ae03

File tree

13 files changed

+520
-186
lines changed

13 files changed

+520
-186
lines changed

cmake/inputs/Config.cmake.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ if(NOT @BUILD_SHARED_LIBS@) # NOT BUILD_SHARED_LIBS
2828
if(@TILEDB_AZURE@) # TILEDB_AZURE
2929
find_dependency(azure-identity-cpp)
3030
find_dependency(azure-storage-blobs-cpp)
31+
find_dependency(azure-storage-files-datalake-cpp)
3132
endif()
3233
if(@TILEDB_GCS@) # TILEDB_GCS
3334
find_dependency(google_cloud_cpp_storage)

test/src/unit-capi-config.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -725,6 +725,7 @@ TEST_CASE("C API: Test config iter", "[capi][config]") {
725725
all_param_values["vfs.azure.storage_account_key"] = "";
726726
all_param_values["vfs.azure.storage_sas_token"] = "";
727727
all_param_values["vfs.azure.blob_endpoint"] = "";
728+
all_param_values["vfs.azure.is_data_lake_endpoint"] = "";
728729
all_param_values["vfs.azure.block_list_block_size"] = "5242880";
729730
all_param_values["vfs.azure.max_parallel_ops"] =
730731
std::to_string(std::thread::hardware_concurrency());
@@ -797,6 +798,7 @@ TEST_CASE("C API: Test config iter", "[capi][config]") {
797798
vfs_param_values["azure.storage_account_key"] = "";
798799
vfs_param_values["azure.storage_sas_token"] = "";
799800
vfs_param_values["azure.blob_endpoint"] = "";
801+
vfs_param_values["azure.is_data_lake_endpoint"] = "";
800802
vfs_param_values["azure.block_list_block_size"] = "5242880";
801803
vfs_param_values["azure.max_parallel_ops"] =
802804
std::to_string(std::thread::hardware_concurrency());
@@ -863,6 +865,7 @@ TEST_CASE("C API: Test config iter", "[capi][config]") {
863865
azure_param_values["storage_account_key"] = "";
864866
azure_param_values["storage_sas_token"] = "";
865867
azure_param_values["blob_endpoint"] = "";
868+
azure_param_values["is_data_lake_endpoint"] = "";
866869
azure_param_values["block_list_block_size"] = "5242880";
867870
azure_param_values["max_parallel_ops"] =
868871
std::to_string(std::thread::hardware_concurrency());

test/src/unit-cppapi-config.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ TEST_CASE("C++ API: Config iterator", "[cppapi][config]") {
7777
names.push_back(it->first);
7878
}
7979
// Check number of VFS params in default config object.
80-
CHECK(names.size() == 67);
80+
CHECK(names.size() == 68);
8181
}
8282

8383
TEST_CASE("C++ API: Config Environment Variables", "[cppapi][config]") {

test/src/unit-vfs.cc

Lines changed: 2 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -205,7 +205,7 @@ TEST_CASE("VFS: Test long local paths", "[vfs][long-paths]") {
205205
using AllBackends = std::tuple<LocalFsTest, GCSTest, GSTest, S3Test, AzureTest>;
206206
TEMPLATE_LIST_TEST_CASE(
207207
"VFS: URI semantics and file management", "[vfs][uri]", AllBackends) {
208-
TestType fs({0});
208+
TestType fs({});
209209
if (!fs.is_supported()) {
210210
return;
211211
}
@@ -218,19 +218,6 @@ TEMPLATE_LIST_TEST_CASE(
218218

219219
URI path = fs.temp_dir_.add_trailing_slash();
220220

221-
// Set up
222-
if (path.is_gcs() || path.is_s3() || path.is_azure()) {
223-
if (vfs.is_bucket(path)) {
224-
REQUIRE_NOTHROW(vfs.remove_bucket(path));
225-
}
226-
REQUIRE_NOTHROW(vfs.create_bucket(path));
227-
} else {
228-
if (vfs.is_dir(path)) {
229-
REQUIRE_NOTHROW(vfs.remove_dir(path));
230-
}
231-
REQUIRE_NOTHROW(vfs.create_dir(path));
232-
}
233-
234221
/* Create the following file hierarchy:
235222
*
236223
* path/dir1/subdir/file1
@@ -480,21 +467,6 @@ TEMPLATE_LIST_TEST_CASE("VFS: File I/O", "[vfs][uri][file_io]", AllBackends) {
480467
CHECK_THROWS(vfs.file_size(non_existent));
481468
}
482469

483-
// Set up
484-
if (path.is_gcs() || path.is_s3() || path.is_azure()) {
485-
if (vfs.is_bucket(path)) {
486-
REQUIRE_NOTHROW(vfs.remove_bucket(path));
487-
}
488-
REQUIRE_NOTHROW(vfs.create_bucket(path));
489-
} else {
490-
if (vfs.is_dir(path)) {
491-
REQUIRE_NOTHROW(vfs.remove_dir(path));
492-
}
493-
REQUIRE_NOTHROW(vfs.create_dir(path));
494-
// Bucket-specific operations are only valid for object store filesystems.
495-
CHECK_THROWS(vfs.create_bucket(path));
496-
}
497-
498470
// Prepare buffers
499471
uint64_t buffer_size = multiplier * max_parallel_ops * chunk_size;
500472
auto write_buffer = new char[buffer_size];
@@ -801,6 +773,7 @@ TEST_CASE("VFS: Construct Azure Blob Storage endpoint URIs", "[azure][uri]") {
801773
config.set("vfs.azure.storage_account_name", "exampleaccount"));
802774
require_tiledb_ok(config.set("vfs.azure.blob_endpoint", custom_endpoint));
803775
require_tiledb_ok(config.set("vfs.azure.storage_sas_token", sas_token));
776+
require_tiledb_ok(config.set("vfs.azure.is_data_lake_endpoint", "false"));
804777
if (sas_token.empty()) {
805778
// If the SAS token is empty, the VFS will try to connect to Microsoft Entra
806779
// ID to obtain credentials, which can take a long time because of retries.

tiledb/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -524,10 +524,12 @@ if (TILEDB_AZURE)
524524

525525
find_package(azure-identity-cpp CONFIG REQUIRED)
526526
find_package(azure-storage-blobs-cpp CONFIG REQUIRED)
527+
find_package(azure-storage-files-datalake-cpp CONFIG REQUIRED)
527528
target_link_libraries(TILEDB_CORE_OBJECTS_ILIB
528529
INTERFACE
529530
Azure::azure-identity
530531
Azure::azure-storage-blobs
532+
Azure::azure-storage-files-datalake
531533
)
532534
endif()
533535

tiledb/api/c_api/config/config_api_external.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -442,6 +442,12 @@ TILEDB_EXPORT void tiledb_config_free(tiledb_config_t** config) TILEDB_NOEXCEPT;
442442
* that at least one of these two options must be set (or both if shared
443443
* key authentication is used). <br>
444444
* **Default**: ""
445+
* - `vfs.azure.is_data_lake_endpoint` <br>
446+
* Sets whether the Azure Storage account is known to have hierarchical
447+
* namespace enabled or disabled. This option can be used to reduce latency
448+
* when performing the first Azure request. If not set, the account's
449+
* capabilities will be automatically detected. <br>
450+
* **Default**: <unset>
445451
* - `vfs.azure.block_list_block_size` <br>
446452
* The block size (in bytes) used in Azure blob block list writes.
447453
* Any `uint64_t` value is acceptable. Note: `vfs.azure.block_list_block_size

tiledb/sm/config/config.cc

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,7 @@ const std::string Config::VFS_AZURE_STORAGE_ACCOUNT_NAME = "";
188188
const std::string Config::VFS_AZURE_STORAGE_ACCOUNT_KEY = "";
189189
const std::string Config::VFS_AZURE_STORAGE_SAS_TOKEN = "";
190190
const std::string Config::VFS_AZURE_BLOB_ENDPOINT = "";
191+
const std::string Config::VFS_AZURE_IS_DATA_LAKE_ENDPOINT = "";
191192
const std::string Config::VFS_AZURE_MAX_PARALLEL_OPS =
192193
Config::SM_IO_CONCURRENCY_LEVEL;
193194
const std::string Config::VFS_AZURE_BLOCK_LIST_BLOCK_SIZE = "5242880";
@@ -438,6 +439,9 @@ const std::map<std::string, std::string> default_config_values = {
438439
std::make_pair(
439440
"vfs.azure.storage_sas_token", Config::VFS_AZURE_STORAGE_SAS_TOKEN),
440441
std::make_pair("vfs.azure.blob_endpoint", Config::VFS_AZURE_BLOB_ENDPOINT),
442+
std::make_pair(
443+
"vfs.azure.is_data_lake_endpoint",
444+
Config::VFS_AZURE_IS_DATA_LAKE_ENDPOINT),
441445
std::make_pair(
442446
"vfs.azure.max_parallel_ops", Config::VFS_AZURE_MAX_PARALLEL_OPS),
443447
std::make_pair(
@@ -878,6 +882,10 @@ Status Config::sanity_check(
878882
msg << "value " << param << " invalid canned acl for " << param;
879883
return Status_Error(msg.str());
880884
}
885+
} else if (param == "vfs.azure.is_data_lake_endpoint") {
886+
if (!value.empty()) {
887+
RETURN_NOT_OK(utils::parse::convert(value, &v));
888+
}
881889
}
882890

883891
return Status::Ok();

tiledb/sm/config/config.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -488,6 +488,10 @@ class Config {
488488
/** Azure blob endpoint. */
489489
static const std::string VFS_AZURE_BLOB_ENDPOINT;
490490

491+
/** Whether the Azure storage account is known to support hierarchical
492+
* namespace or not. */
493+
static const std::string VFS_AZURE_IS_DATA_LAKE_ENDPOINT;
494+
491495
/** Azure max parallel ops. */
492496
static const std::string VFS_AZURE_MAX_PARALLEL_OPS;
493497

tiledb/sm/cpp_api/config.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -613,6 +613,12 @@ class Config {
613613
* that at least one of these two options must be set (or both if shared
614614
* key authentication is used). <br>
615615
* **Default**: ""
616+
* - `vfs.azure.is_data_lake_endpoint` <br>
617+
* Sets whether the Azure Storage account is known to have hierarchical
618+
* namespace enabled or disabled. This option can be used to reduce latency
619+
* when performing the first Azure request. If not specified, the account's
620+
* capabilities will be automatically detected. <br>
621+
* **Default**: <unset>
616622
* - `vfs.azure.block_list_block_size` <br>
617623
* The block size (in bytes) used in Azure blob block list writes.
618624
* Any `uint64_t` value is acceptable. Note:

0 commit comments

Comments
 (0)