Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
126 changes: 53 additions & 73 deletions adlfs/spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,13 @@ def __init__(
max_concurrency = batch_size
self.max_concurrency = max_concurrency

@property
def account_url(self) -> str:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Technically, to keep the existing behavior, we will need to check whether the connection string is set and if so return None. We should add that since if a connection_string is provided, an account_name would not be and we could end up encoding None into the account url.

While it is possible to piece together an account url from a connection string, that would be quite a bit of logic for the purpose of this PR.

if hasattr(self, "account_host"):
return f"https://{self.account_host}"
else:
return f"https://{self.account_name}.blob.core.windows.net"

@classmethod
def _strip_protocol(cls, path: str):
"""
Expand Down Expand Up @@ -464,55 +471,15 @@ def _get_default_azure_credential(self, **kwargs):

def do_connect(self):
"""Connect to the BlobServiceClient, using user-specified connection details.
Tries credentials first, then connection string and finally account key
Tries connection string first, then credential and finally account key

Raises
------
ValueError if none of the connection details are available
"""

try:
if self.connection_string is not None:
self.service_client = AIOBlobServiceClient.from_connection_string(
conn_str=self.connection_string
)
elif self.account_name is not None:
if hasattr(self, "account_host"):
self.account_url: str = f"https://{self.account_host}"
else:
self.account_url: str = (
f"https://{self.account_name}.blob.core.windows.net"
)

creds = [self.credential, self.account_key]
if any(creds):
self.service_client = [
AIOBlobServiceClient(
account_url=self.account_url,
credential=cred,
_location_mode=self.location_mode,
)
for cred in creds
if cred is not None
][0]
elif self.sas_token is not None:
if not self.sas_token.startswith("?"):
self.sas_token = f"?{self.sas_token}"
self.service_client = AIOBlobServiceClient(
account_url=self.account_url + self.sas_token,
credential=None,
_location_mode=self.location_mode,
)
else:
# Fall back to anonymous login, and assume public container
self.service_client = AIOBlobServiceClient(
account_url=self.account_url
)
else:
raise ValueError(
"Must provide either a connection_string or account_name with credentials!!"
)

self.service_client = self._get_service_client()
except RuntimeError:
loop = get_loop()
asyncio.set_event_loop(loop)
Expand All @@ -521,6 +488,46 @@ def do_connect(self):
except Exception as e:
raise ValueError(f"unable to connect to account for {e}") from e

def _get_service_client(self) -> AIOBlobServiceClient:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we hoist this helper internal method to the module level instead of making it a method on the class? Mainly that helps better avoid accessing internal methods of one class from another class.

"""Connect to the Asynchronous BlobServiceClient.

Tries connection string first, then credential and finally account key
"""
# Shortcut for connection string
if self.connection_string is not None:
logger.info("Connect using connection string")
return AIOBlobServiceClient.from_connection_string(
conn_str=self.connection_string
)

if self.account_name is not None:
kwargs = {
"account_url": self.account_url,
"credential": None,
"_location_mode": self.location_mode,
}
creds = ["credential", "sync_credential", "account_key"]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any ideas on why the file system's client and the file-object's client were out of sync in terms of credential resolution here in the first place? I need to go through the git history on how these became out of sync, but I'd probably lean toward even removing sync_credential from that list so that:

  1. We have complete parity with the do_connect() method. Typically this is the method that most people will use to authenticate since by default even in the AzureBlobFile class, it will just reuse the client from the file system unless connect_client() is explicitly called.
  2. While technically it is possible to provide a sync credential to an async client, it does not seem like a practice that we should be continuing, instead async clients should be using async credentials.

I'll look more into this and circle back on what we do here.

for name in creds:
if (cred := getattr(self, name, None)) is not None:
logger.info("Connect using %s", name.replace("_", " "))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For these logging statements, let's set the level to debug. Mainly debug is the prevalent level used in the codebase and these statements are intended for debugging which credentials are used.

kwargs["credential"] = cred
break
else:
if self.sas_token is not None:
logger.info("Connect using SAS token")
if not self.sas_token.startswith("?"):
self.sas_token = f"?{self.sas_token}"
Comment on lines +518 to +519
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be interesting if we could hoist this logic to the __init__. Mainly, going over this method now this is the last place where any mutation is happening on the file system. So if we move it, it give the nice property that this method does not mutate state.

kwargs["account_url"] = f'{kwargs["account_url"]}{self.sas_token}'
else:
logger.info("Connect using anonymous login")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the previous logic, the location_mode was never used in anonymous mode, meaning if a value of secondary was set adlfs would still use the primary endpoint. I'd lean toward retaining that behavior for now since the PR is more about refactoring and logging. But I'm open to respecting location_mode for anonymous connections if it supported for GRS accounts but would need to double check to confirm if it is.

# Fall back to anonymous login, and assume public container

return AIOBlobServiceClient(**kwargs)

raise ValueError(
"Must provide either a connection_string or account_name with credentials!!"
)

def split_path(
self, path, delimiter="/", return_container: bool = False, **kwargs
) -> Tuple[str, str, Optional[str]]:
Expand Down Expand Up @@ -2030,43 +2037,16 @@ def close(self):

def connect_client(self):
"""Connect to the Asynchronous BlobServiceClient, using user-specified connection details.
Tries credentials first, then connection string and finally account key
Tries connection string first, then credential and finally account key

Raises
------
ValueError if none of the connection details are available
"""
try:
if hasattr(self.fs, "account_host"):
self.fs.account_url: str = f"https://{self.fs.account_host}"
else:
self.fs.account_url: str = (
f"https://{self.fs.account_name}.blob.core.windows.net"
)

creds = [self.fs.sync_credential, self.fs.account_key, self.fs.credential]
if any(creds):
self.container_client = [
AIOBlobServiceClient(
account_url=self.fs.account_url,
credential=cred,
_location_mode=self.fs.location_mode,
).get_container_client(self.container_name)
for cred in creds
if cred is not None
][0]
elif self.fs.connection_string is not None:
self.container_client = AIOBlobServiceClient.from_connection_string(
conn_str=self.fs.connection_string
).get_container_client(self.container_name)
elif self.fs.sas_token is not None:
self.container_client = AIOBlobServiceClient(
account_url=self.fs.account_url + self.fs.sas_token, credential=None
).get_container_client(self.container_name)
else:
self.container_client = AIOBlobServiceClient(
account_url=self.fs.account_url
).get_container_client(self.container_name)
self.container_client = self.fs._get_service_client().get_container_client(
self.container_name
)

except Exception as e:
raise ValueError(
Expand Down
Loading