diff --git a/Cargo.toml b/Cargo.toml index e02d3b35..805ab348 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -40,6 +40,7 @@ humantime = "2.1" itertools = "0.14.0" parking_lot = { version = "0.12" } percent-encoding = "2.1" +regex = "1.11.1" thiserror = "2.0.2" tracing = { version = "0.1" } url = "2.2" diff --git a/src/azure/builder.rs b/src/azure/builder.rs index e824217f..578318a2 100644 --- a/src/azure/builder.rs +++ b/src/azure/builder.rs @@ -28,6 +28,8 @@ use percent_encoding::percent_decode_str; use serde::{Deserialize, Serialize}; use std::str::FromStr; use std::sync::Arc; +use std::sync::OnceLock; +use regex::Regex; use url::Url; /// The well-known account used by Azurite and the legacy Azure Storage Emulator. @@ -671,36 +673,92 @@ impl MicrosoftAzureBuilder { self.container_name = Some(validate(parsed.username())?); self.account_name = Some(validate(a)?); self.use_fabric_endpoint = true.into(); + } else if let Some(a) = host.strip_suffix(".blob.core.windows.net") { + self.container_name = Some(validate(parsed.username())?); + self.account_name = Some(validate(a)?); + } else if let Some(a) = host.strip_suffix(".blob.fabric.microsoft.com") { + self.container_name = Some(validate(parsed.username())?); + self.account_name = Some(validate(a)?); + self.use_fabric_endpoint = true.into(); + } else if let Some(a) = host.strip_suffix("-api.onelake.fabric.microsoft.com") { + self.container_name = Some(validate(parsed.username())?); + self.account_name = Some(validate(a)?); + self.use_fabric_endpoint = true.into(); } else { return Err(Error::UrlNotRecognised { url: url.into() }.into()); } } - "https" => match host.split_once('.') { - Some((a, "dfs.core.windows.net")) | Some((a, "blob.core.windows.net")) => { - self.account_name = Some(validate(a)?); - let container = parsed.path_segments().unwrap().next().expect( - "iterator always contains at least one string (which may be empty)", - ); - if !container.is_empty() { - self.container_name = Some(validate(container)?); + "https" => { + // Regex to match WS-PL FQDN: + // "{workspaceid}.z??.(onelake|dfs|blob).fabric.microsoft.com" + static WS_PL_REGEX: OnceLock = OnceLock::new(); + + let ws_pl_regex = WS_PL_REGEX.get_or_init(|| { + Regex::new( + r"(?i)^(?P[0-9a-f]{32})\.z(?P[0-9a-f]{2})\.(onelake|dfs|blob)\.fabric\.microsoft\.com$" + ).unwrap() + }); + + // WS-PL Fabric endpoint, eg- 1) c047b3e34e89407a98d7cf9949ae92a3.zc0.dfs.fabric.microsoft.com, 2) c047b3e34e89407a98d7cf9949ae92a3.zc0.blob.fabric.microsoft.com + if let Some(captures) = ws_pl_regex.captures(host) { + let workspaceid = captures.name("workspaceid").unwrap().as_str(); + let xy = captures.name("xy").unwrap().as_str(); + + if !workspaceid.get(0..2).is_some_and(|pfx| pfx.eq_ignore_ascii_case(xy)) { + return Err(Error::UrlNotRecognised { url: url.into() }.into()); } + + self.account_name = Some(format!("{workspaceid}.z{xy}")); + self.container_name = Some(validate(workspaceid)?); + self.use_fabric_endpoint = true.into(); + return Ok(()); } - Some((a, "dfs.fabric.microsoft.com")) | Some((a, "blob.fabric.microsoft.com")) => { - self.account_name = Some(validate(a)?); - // Attempt to infer the container name from the URL - // - https://onelake.dfs.fabric.microsoft.com///Files/test.csv - // - https://onelake.dfs.fabric.microsoft.com//.// - // - // See - let workspace = parsed.path_segments().unwrap().next().expect( - "iterator always contains at least one string (which may be empty)", - ); + + // Api Onelake Fabric endpoint + if host.ends_with("-api.onelake.fabric.microsoft.com") { + let account = host.strip_suffix("-api.onelake.fabric.microsoft.com").unwrap(); + self.account_name = Some(validate(account)?); + let workspace = parsed.path_segments().unwrap().next() + .expect("iterator always contains at least one string (which may be empty)"); + if !workspace.is_empty() { - self.container_name = Some(workspace.to_string()) + self.container_name = Some(workspace.to_string()); } + self.use_fabric_endpoint = true.into(); + return Ok(()); + } + + match host.split_once('.') { + // Azure Storage public + Some((a, "dfs.core.windows.net")) | Some((a, "blob.core.windows.net")) => { + self.account_name = Some(validate(a)?); + + // Attempt to infer the container name from the URL + let container = parsed.path_segments().unwrap().next() + .expect("iterator always contains at least one string (which may be empty)"); + + if !container.is_empty() { + self.container_name = Some(validate(container)?); + } + } + + // Fabric endpoints + Some((a, "dfs.fabric.microsoft.com")) | Some((a, "blob.fabric.microsoft.com")) => { + self.account_name = Some(validate(a)?); + + let workspace = parsed.path_segments().unwrap().next() + .expect("iterator always contains at least one string (which may be empty)"); + + if !workspace.is_empty() { + self.container_name = Some(workspace.to_string()); + } + + self.use_fabric_endpoint = true.into(); + } + + _ => return Err(Error::UrlNotRecognised { url: url.into() }.into()), } - _ => return Err(Error::UrlNotRecognised { url: url.into() }.into()), }, scheme => { let scheme = scheme.into(); @@ -1119,6 +1177,14 @@ mod tests { assert_eq!(builder.container_name, Some("file_system".to_string())); assert!(builder.use_fabric_endpoint.get().unwrap()); + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("abfss://file_system@account-api.onelake.fabric.microsoft.com/") + .unwrap(); + assert_eq!(builder.account_name, Some("account".to_string())); + assert_eq!(builder.container_name, Some("file_system".to_string())); + assert!(builder.use_fabric_endpoint.get().unwrap()); + let mut builder = MicrosoftAzureBuilder::new(); builder.parse_url("abfs://container/path").unwrap(); assert_eq!(builder.container_name, Some("container".to_string())); @@ -1166,6 +1232,14 @@ mod tests { assert_eq!(builder.container_name, None); assert!(builder.use_fabric_endpoint.get().unwrap()); + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("https://account-api.onelake.fabric.microsoft.com/") + .unwrap(); + assert_eq!(builder.account_name, Some("account".to_string())); + assert_eq!(builder.container_name, None); + assert!(builder.use_fabric_endpoint.get().unwrap()); + let mut builder = MicrosoftAzureBuilder::new(); builder .parse_url("https://account.dfs.fabric.microsoft.com/container") @@ -1190,6 +1264,38 @@ mod tests { assert_eq!(builder.container_name.as_deref(), Some("container")); assert!(builder.use_fabric_endpoint.get().unwrap()); + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("https://Ab000000000000000000000000000000.zAb.dfs.fabric.microsoft.com/") + .unwrap(); + assert_eq!(builder.account_name, Some("ab000000000000000000000000000000.zab".to_string())); + assert_eq!(builder.container_name.as_deref(), Some("ab000000000000000000000000000000")); + assert!(builder.use_fabric_endpoint.get().unwrap()); + + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("https://ab000000000000000000000000000000.zab.dfs.fabric.microsoft.com/") + .unwrap(); + assert_eq!(builder.account_name, Some("ab000000000000000000000000000000.zab".to_string())); + assert_eq!(builder.container_name.as_deref(), Some("ab000000000000000000000000000000")); + assert!(builder.use_fabric_endpoint.get().unwrap()); + + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("https://ab000000000000000000000000000000.zab.blob.fabric.microsoft.com/") + .unwrap(); + assert_eq!(builder.account_name, Some("ab000000000000000000000000000000.zab".to_string())); + assert_eq!(builder.container_name.as_deref(), Some("ab000000000000000000000000000000")); + assert!(builder.use_fabric_endpoint.get().unwrap()); + + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("https://ab000000000000000000000000000000.zab.onelake.fabric.microsoft.com/") + .unwrap(); + assert_eq!(builder.account_name, Some("ab000000000000000000000000000000.zab".to_string())); + assert_eq!(builder.container_name.as_deref(), Some("ab000000000000000000000000000000")); + assert!(builder.use_fabric_endpoint.get().unwrap()); + let err_cases = [ "mailto://account.blob.core.windows.net/", "az://blob.mydomain/", @@ -1256,4 +1362,4 @@ mod tests { panic!("{key} not propagated as ClientConfigKey"); } } -} +} \ No newline at end of file