Skip to content

Commit 68774bf

Browse files
authored
Make HNS check and client instantiation more reliable (#478)
* Fix HNS check and credentials passing * add default credential test * add azure-identity for tests * Add config for environment credentials in azure * Use root dir to check HNS fallback * Update history + prep for release * mypy pass * Make history more explicit about issue fixed
1 parent 8207b3d commit 68774bf

File tree

8 files changed

+88
-22
lines changed

8 files changed

+88
-22
lines changed

.env.example

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,3 +30,9 @@ LIVE_GS_BUCKET=a-bucket-you-can-access
3030
# Custom S3, e.g. MinIO
3131
CUSTOM_S3_BUCKET=a-bucket-you-can-access
3232
CUSTOM_S3_ENDPOINT=your_custom_s3_endpoint
33+
34+
# From a registered Azure App as a service principal; currently used just for
35+
# live tests with DefaultCredentials
36+
AZURE_CLIENT_ID=
37+
AZURE_TENANT_ID=
38+
AZURE_CLIENT_SECRET=

.github/workflows/tests.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,9 @@ jobs:
103103
LIVE_AZURE_CONTAINER: ${{ secrets.LIVE_AZURE_CONTAINER }}
104104
AZURE_STORAGE_CONNECTION_STRING: ${{ secrets.AZURE_STORAGE_CONNECTION_STRING }}
105105
AZURE_STORAGE_GEN2_CONNECTION_STRING: ${{ secrets.AZURE_STORAGE_GEN2_CONNECTION_STRING }}
106+
AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }}
107+
AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }}
108+
AZURE_CLIENT_SECRET: ${{ secrets.AZURE_CLIENT_SECRET }}
106109
LIVE_GS_BUCKET: ${{ secrets.LIVE_GS_BUCKET }}
107110
LIVE_S3_BUCKET: ${{ secrets.LIVE_S3_BUCKET }}
108111
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}

HISTORY.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
# cloudpathlib Changelog
22

3-
## UNRELEASED
3+
## v0.20.0 (2024-10-18)
44

55
- Added support for custom schemes in CloudPath and Client subclasses. (Issue [#466](https://github.com/drivendataorg/cloudpathlib/issues/466), PR [#467](https://github.com/drivendataorg/cloudpathlib/pull/467))
6+
- Fixed `ResourceNotFoundError` on Azure gen2 storage accounts with HNS enabled and an issue where some Azure credentials do not have `account_name`. (Issue [#470](https://github.com/drivendataorg/cloudpathlib/issues/470), Issue [#476](https://github.com/drivendataorg/cloudpathlib/issues/476), PR [#478](https://github.com/drivendataorg/cloudpathlib/pull/478))
67

78
## v0.19.0 (2024-08-29)
89

cloudpathlib/azure/azblobclient.py

Lines changed: 59 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
try:
1616
from azure.core.exceptions import ResourceNotFoundError
1717
from azure.core.credentials import AzureNamedKeyCredential
18+
1819
from azure.storage.blob import (
1920
BlobPrefix,
2021
BlobSasPermissions,
@@ -24,7 +25,15 @@
2425
generate_blob_sas,
2526
)
2627

28+
from azure.storage.blob._shared.authentication import (
29+
SharedKeyCredentialPolicy as BlobSharedKeyCredentialPolicy,
30+
)
31+
2732
from azure.storage.filedatalake import DataLakeServiceClient, FileProperties
33+
from azure.storage.filedatalake._shared.authentication import (
34+
SharedKeyCredentialPolicy as DataLakeSharedKeyCredentialPolicy,
35+
)
36+
2837
except ModuleNotFoundError:
2938
implementation_registry["azure"].dependencies_loaded = False
3039

@@ -104,19 +113,29 @@ def __init__(
104113
if connection_string is None:
105114
connection_string = os.getenv("AZURE_STORAGE_CONNECTION_STRING", None)
106115

107-
self.data_lake_client = None # only needs to end up being set if HNS is enabled
116+
self.data_lake_client: Optional[DataLakeServiceClient] = (
117+
None # only needs to end up being set if HNS is enabled
118+
)
108119

109120
if blob_service_client is not None:
110121
self.service_client = blob_service_client
111122

112123
# create from blob service client if not passed
113124
if data_lake_client is None:
114-
self.data_lake_client = DataLakeServiceClient(
115-
account_url=self.service_client.url.replace(".blob.", ".dfs.", 1),
116-
credential=AzureNamedKeyCredential(
125+
credential = (
126+
blob_service_client.credential
127+
if not isinstance(
128+
blob_service_client.credential, BlobSharedKeyCredentialPolicy
129+
)
130+
else AzureNamedKeyCredential(
117131
blob_service_client.credential.account_name,
118132
blob_service_client.credential.account_key,
119-
),
133+
)
134+
)
135+
136+
self.data_lake_client = DataLakeServiceClient(
137+
account_url=self.service_client.url.replace(".blob.", ".dfs.", 1),
138+
credential=credential,
120139
)
121140
else:
122141
self.data_lake_client = data_lake_client
@@ -125,12 +144,21 @@ def __init__(
125144
self.data_lake_client = data_lake_client
126145

127146
if blob_service_client is None:
128-
self.service_client = BlobServiceClient(
129-
account_url=self.data_lake_client.url.replace(".dfs.", ".blob.", 1),
130-
credential=AzureNamedKeyCredential(
147+
148+
credential = (
149+
data_lake_client.credential
150+
if not isinstance(
151+
data_lake_client.credential, DataLakeSharedKeyCredentialPolicy
152+
)
153+
else AzureNamedKeyCredential(
131154
data_lake_client.credential.account_name,
132155
data_lake_client.credential.account_key,
133-
),
156+
)
157+
)
158+
159+
self.service_client = BlobServiceClient(
160+
account_url=self.data_lake_client.url.replace(".dfs.", ".blob.", 1),
161+
credential=credential,
134162
)
135163

136164
elif connection_string is not None:
@@ -167,19 +195,31 @@ def __init__(
167195
"Credentials are required; see docs for options."
168196
)
169197

170-
self._hns_enabled = None
198+
self._hns_enabled: Optional[bool] = None
171199

172-
def _check_hns(self) -> Optional[bool]:
200+
def _check_hns(self, cloud_path: AzureBlobPath) -> Optional[bool]:
173201
if self._hns_enabled is None:
174-
account_info = self.service_client.get_account_information() # type: ignore
175-
self._hns_enabled = account_info.get("is_hns_enabled", False) # type: ignore
202+
try:
203+
account_info = self.service_client.get_account_information() # type: ignore
204+
self._hns_enabled = account_info.get("is_hns_enabled", False) # type: ignore
205+
except ResourceNotFoundError:
206+
# get_account_information() not supported with this credential; we have to fall back to
207+
# checking if the root directory exists and has 'metadata': {'hdi_isfolder': 'true'}
208+
root_dir = self.service_client.get_blob_client(
209+
container=cloud_path.container, blob="/"
210+
)
211+
self._hns_enabled = (
212+
root_dir.exists()
213+
and root_dir.get_blob_properties().metadata.get("hdi_isfolder", False)
214+
== "true"
215+
)
176216

177217
return self._hns_enabled
178218

179219
def _get_metadata(
180220
self, cloud_path: AzureBlobPath
181221
) -> Union["BlobProperties", "FileProperties", Dict[str, Any]]:
182-
if self._check_hns():
222+
if self._check_hns(cloud_path):
183223

184224
# works on both files and directories
185225
fsc = self.data_lake_client.get_file_system_client(cloud_path.container) # type: ignore
@@ -292,7 +332,7 @@ def _list_dir(
292332
if prefix and not prefix.endswith("/"):
293333
prefix += "/"
294334

295-
if self._check_hns():
335+
if self._check_hns(cloud_path):
296336
file_system_client = self.data_lake_client.get_file_system_client(cloud_path.container) # type: ignore
297337
paths = file_system_client.get_paths(path=cloud_path.blob, recursive=recursive)
298338

@@ -334,7 +374,7 @@ def _move_file(
334374
)
335375

336376
# we can use rename API when the same account on adls gen2
337-
elif remove_src and (src.client is dst.client) and self._check_hns():
377+
elif remove_src and (src.client is dst.client) and self._check_hns(src):
338378
fsc = self.data_lake_client.get_file_system_client(src.container) # type: ignore
339379

340380
if src.is_dir():
@@ -358,7 +398,7 @@ def _move_file(
358398
def _mkdir(
359399
self, cloud_path: AzureBlobPath, parents: bool = False, exist_ok: bool = False
360400
) -> None:
361-
if self._check_hns():
401+
if self._check_hns(cloud_path):
362402
file_system_client = self.data_lake_client.get_file_system_client(cloud_path.container) # type: ignore
363403
directory_client = file_system_client.get_directory_client(cloud_path.blob)
364404

@@ -379,7 +419,7 @@ def _mkdir(
379419
def _remove(self, cloud_path: AzureBlobPath, missing_ok: bool = True) -> None:
380420
file_or_dir = self._is_file_or_dir(cloud_path)
381421
if file_or_dir == "dir":
382-
if self._check_hns():
422+
if self._check_hns(cloud_path):
383423
_hns_rmtree(self.data_lake_client, cloud_path.container, cloud_path.blob)
384424
return
385425

@@ -432,7 +472,7 @@ def _generate_presigned_url(
432472
self, cloud_path: AzureBlobPath, expire_seconds: int = 60 * 60
433473
) -> str:
434474
sas_token = generate_blob_sas(
435-
self.service_client.account_name,
475+
self.service_client.account_name, # type: ignore[arg-type]
436476
container_name=cloud_path.container,
437477
blob_name=cloud_path.blob,
438478
account_key=self.service_client.credential.account_key,

cloudpathlib/azure/azblobpath.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ def replace(self, target: "AzureBlobPath") -> "AzureBlobPath":
9191

9292
# we can rename directories on ADLS Gen2
9393
except CloudPathIsADirectoryError:
94-
if self.client._check_hns():
94+
if self.client._check_hns(self):
9595
return self.client._move_file(self, target)
9696
else:
9797
raise

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "flit_core.buildapi"
44

55
[project]
66
name = "cloudpathlib"
7-
version = "0.19.0"
7+
version = "0.20.0"
88
description = "pathlib-style classes for cloud storage services."
99
readme = "README.md"
1010
authors = [{ name = "DrivenData", email = "[email protected]" }]

requirements-dev.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
-e .[all]
22

3+
azure-identity
34
black[jupyter]>=24.1.0;python_version>='3.8'
45
build
56
flake8

tests/test_azure_specific.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import os
22

33
from azure.core.credentials import AzureNamedKeyCredential
4+
from azure.identity import DefaultAzureCredential
45
from azure.storage.blob import (
56
BlobServiceClient,
67
StorageStreamDownloader,
@@ -134,6 +135,20 @@ def _check_access(az_client, gen2=False):
134135
)
135136
_check_access(cl, gen2=azure_rigs.is_adls_gen2)
136137

138+
# discover and use credentials for service principal by having set:
139+
# AZURE_CLIENT_ID, AZURE_CLIENT_SECRET, AZURE_TENANT_ID
140+
credential = DefaultAzureCredential()
141+
cl: AzureBlobClient = azure_rigs.client_class(credential=credential, account_url=bsc.url)
142+
_check_access(cl, gen2=azure_rigs.is_adls_gen2)
143+
144+
# add basic checks for gen2 to exercise limited-privilege access scenarios
145+
p = azure_rigs.create_cloud_path("new_dir/new_file.txt", client=cl)
146+
assert cl._check_hns(p) == azure_rigs.is_adls_gen2
147+
p.write_text("Hello")
148+
assert p.exists()
149+
assert p.read_text() == "Hello"
150+
assert list(p.parent.iterdir()) == [p]
151+
137152

138153
def test_adls_gen2_mkdir(azure_gen2_rig):
139154
"""Since directories can be created on gen2, we should test mkdir, rmdir, rmtree, and unlink

0 commit comments

Comments
 (0)