From 3960a7ab152c8e8bfe19a466a0ddef5ad7d51ee7 Mon Sep 17 00:00:00 2001 From: Smriti Agrawal Date: Tue, 25 Nov 2025 14:41:42 +0530 Subject: [PATCH 1/4] Whitelisting Onelake API & Workspace PL FQDNs --- src/azure/builder.rs | 83 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 67 insertions(+), 16 deletions(-) diff --git a/src/azure/builder.rs b/src/azure/builder.rs index e824217f..6d5bfafa 100644 --- a/src/azure/builder.rs +++ b/src/azure/builder.rs @@ -25,6 +25,7 @@ use crate::client::{HttpConnector, TokenCredentialProvider, http_connector}; use crate::config::ConfigValue; use crate::{ClientConfigKey, ClientOptions, Result, RetryConfig, StaticCredentialProvider}; use percent_encoding::percent_decode_str; +use regex::Regex; use serde::{Deserialize, Serialize}; use std::str::FromStr; use std::sync::Arc; @@ -657,6 +658,9 @@ impl MicrosoftAzureBuilder { false => Ok(s.to_string()), }; + const DFS_FABRIC_SUFFIX: &str = "dfs.fabric.microsoft.com"; + const BLOB_FABRIC_SUFFIX: &str = "blob.fabric.microsoft.com"; + match parsed.scheme() { "adl" | "azure" => self.container_name = Some(validate(host)?), "az" | "abfs" | "abfss" => { @@ -675,32 +679,79 @@ impl MicrosoftAzureBuilder { return Err(Error::UrlNotRecognised { url: url.into() }.into()); } } - "https" => match host.split_once('.') { - Some((a, "dfs.core.windows.net")) | Some((a, "blob.core.windows.net")) => { - self.account_name = Some(validate(a)?); - let container = parsed.path_segments().unwrap().next().expect( + "https" => { + const DFS_FABRIC_SUFFIX: &str = "dfs.fabric.microsoft.com"; + const BLOB_FABRIC_SUFFIX: &str = "blob.fabric.microsoft.com"; + const DFS_AZURE_SUFFIX: &str = "dfs.core.windows.net"; + const BLOB_AZURE_SUFFIX: &str = "blob.core.windows.net"; + + // Regex to match WS-PL FQDN: "{workspaceid}.z??.dfs.fabric.microsoft.com" + // workspaceid = 32 hex chars, z?? = z + first two chars of workspaceid + lazy_static::lazy_static! { + static ref WS_PL_REGEX: Regex = Regex::new(r"^(?P[0-9a-f]{32})\.z(?P[0-9a-f]{2})\.(dfs|blob)\.fabric\.microsoft\.com$").unwrap(); + } + + if let Some(captures) = WS_PL_REGEX.captures(host) { + let workspaceid = captures.name("workspaceid").unwrap().as_str(); + let xy = captures.name("xy").unwrap().as_str(); + + // Validate z?? matches first 2 chars of workspaceid + if &workspaceid[0..2] != xy { + return Err(Error::UrlNotRecognised { url: url.into() }.into()); + } + + self.account_name = Some(validate(workspaceid)?); + self.use_fabric_endpoint = true; + + let container = parsed + .path_segments() + .and_then(|mut s| s.next()) + .unwrap_or(""); + if !container.is_empty() { + self.container_name = Some(validate(container)?); + } + + return Ok(()); + } + + // Otherwise, check Fabric global / Onelake API FQDN + if host.ends_with(DFS_FABRIC_SUFFIX) || host.ends_with(BLOB_FABRIC_SUFFIX) { + let labels: Vec<&str> = host.split('.').collect(); + let account_name = if labels.len() >= 2 && labels[0].contains("api") && labels[1] == "onelake" { + format!("{}-{}", labels[0], labels[1]) + } else { + labels[0].to_string() + }; + + self.account_name = Some(validate(&account_name)?); + self.use_fabric_endpoint = true; + + let container = parsed.path_segments().unwrap().next().expect( "iterator always contains at least one string (which may be empty)", ); if !container.is_empty() { self.container_name = Some(validate(container)?); } + + return Ok(()); } - Some((a, "dfs.fabric.microsoft.com")) | Some((a, "blob.fabric.microsoft.com")) => { - self.account_name = Some(validate(a)?); - // Attempt to infer the container name from the URL - // - https://onelake.dfs.fabric.microsoft.com///Files/test.csv - // - https://onelake.dfs.fabric.microsoft.com//.// - // - // See - let workspace = parsed.path_segments().unwrap().next().expect( + + // Azure Storage public + if host.ends_with(DFS_AZURE_SUFFIX) || host.ends_with(BLOB_AZURE_SUFFIX) { + let first_label = host.split('.').next().unwrap_or_default(); + self.account_name = Some(validate(first_label)?); + + let container = parsed.path_segments().unwrap().next().expect( "iterator always contains at least one string (which may be empty)", ); - if !workspace.is_empty() { - self.container_name = Some(workspace.to_string()) + if !container.is_empty() { + self.container_name = Some(validate(container)?); } - self.use_fabric_endpoint = true.into(); + + return Ok(()); } - _ => return Err(Error::UrlNotRecognised { url: url.into() }.into()), + + return Err(Error::UrlNotRecognised { url: url.into() }.into()); }, scheme => { let scheme = scheme.into(); From 5b9dd47ef11c002016a75500cfc51fa1fbdfcf27 Mon Sep 17 00:00:00 2001 From: Smriti Agrawal Date: Thu, 8 Jan 2026 12:34:57 +0530 Subject: [PATCH 2/4] Addressing comments to whitelist api-onelake fqdns and add UTs --- Cargo.toml | 1 + src/azure/builder.rs | 166 +++++++++++++++++++++++++++---------------- 2 files changed, 105 insertions(+), 62 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index e02d3b35..805ab348 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -40,6 +40,7 @@ humantime = "2.1" itertools = "0.14.0" parking_lot = { version = "0.12" } percent-encoding = "2.1" +regex = "1.11.1" thiserror = "2.0.2" tracing = { version = "0.1" } url = "2.2" diff --git a/src/azure/builder.rs b/src/azure/builder.rs index 6d5bfafa..b5918743 100644 --- a/src/azure/builder.rs +++ b/src/azure/builder.rs @@ -25,10 +25,11 @@ use crate::client::{HttpConnector, TokenCredentialProvider, http_connector}; use crate::config::ConfigValue; use crate::{ClientConfigKey, ClientOptions, Result, RetryConfig, StaticCredentialProvider}; use percent_encoding::percent_decode_str; -use regex::Regex; use serde::{Deserialize, Serialize}; use std::str::FromStr; use std::sync::Arc; +use std::sync::OnceLock; +use regex::Regex; use url::Url; /// The well-known account used by Azurite and the legacy Azure Storage Emulator. @@ -658,9 +659,6 @@ impl MicrosoftAzureBuilder { false => Ok(s.to_string()), }; - const DFS_FABRIC_SUFFIX: &str = "dfs.fabric.microsoft.com"; - const BLOB_FABRIC_SUFFIX: &str = "blob.fabric.microsoft.com"; - match parsed.scheme() { "adl" | "azure" => self.container_name = Some(validate(host)?), "az" | "abfs" | "abfss" => { @@ -675,83 +673,87 @@ impl MicrosoftAzureBuilder { self.container_name = Some(validate(parsed.username())?); self.account_name = Some(validate(a)?); self.use_fabric_endpoint = true.into(); + } else if let Some(a) = host.strip_suffix(".blob.core.windows.net") { + self.container_name = Some(validate(parsed.username())?); + self.account_name = Some(validate(a)?); + } else if let Some(a) = host.strip_suffix(".blob.fabric.microsoft.com") { + self.container_name = Some(validate(parsed.username())?); + self.account_name = Some(validate(a)?); + self.use_fabric_endpoint = true.into(); + } else if let Some(a) = host.strip_suffix("-api.onelake.fabric.microsoft.com") { + self.container_name = Some(validate(parsed.username())?); + self.account_name = Some(validate(a)?); + self.use_fabric_endpoint = true.into(); } else { return Err(Error::UrlNotRecognised { url: url.into() }.into()); } } "https" => { - const DFS_FABRIC_SUFFIX: &str = "dfs.fabric.microsoft.com"; - const BLOB_FABRIC_SUFFIX: &str = "blob.fabric.microsoft.com"; - const DFS_AZURE_SUFFIX: &str = "dfs.core.windows.net"; - const BLOB_AZURE_SUFFIX: &str = "blob.core.windows.net"; - - // Regex to match WS-PL FQDN: "{workspaceid}.z??.dfs.fabric.microsoft.com" - // workspaceid = 32 hex chars, z?? = z + first two chars of workspaceid - lazy_static::lazy_static! { - static ref WS_PL_REGEX: Regex = Regex::new(r"^(?P[0-9a-f]{32})\.z(?P[0-9a-f]{2})\.(dfs|blob)\.fabric\.microsoft\.com$").unwrap(); - } - - if let Some(captures) = WS_PL_REGEX.captures(host) { + // Regex to match WS-PL FQDN: + // "{workspaceid}.z??.(onelake|dfs|blob).fabric.microsoft.com" + static WS_PL_REGEX: OnceLock = OnceLock::new(); + let ws_pl_regex = WS_PL_REGEX.get_or_init(|| { + Regex::new( + r"^(?P[0-9a-f]{32})\.z(?P[0-9a-f]{2})\.(onelake|dfs|blob)\.fabric\.microsoft\.com$" + ).unwrap() + }); + + // WS-PL Fabric endpoint + if let Some(captures) = ws_pl_regex.captures(host) { let workspaceid = captures.name("workspaceid").unwrap().as_str(); let xy = captures.name("xy").unwrap().as_str(); - // Validate z?? matches first 2 chars of workspaceid - if &workspaceid[0..2] != xy { - return Err(Error::UrlNotRecognised { url: url.into() }.into()); - } + self.account_name = Some(format!("{workspaceid}.z{xy}")); + self.container_name = Some(validate(workspaceid)?); + self.use_fabric_endpoint = true.into(); + return Ok(()); + } - self.account_name = Some(validate(workspaceid)?); - self.use_fabric_endpoint = true; + // Api Onelake Fabric endpoint + if host.ends_with("-api.onelake.fabric.microsoft.com") { + let account = host.strip_suffix("-api.onelake.fabric.microsoft.com").unwrap(); + self.account_name = Some(validate(account)?); + let workspace = parsed.path_segments().unwrap().next() + .expect("iterator always contains at least one string (which may be empty)"); - let container = parsed - .path_segments() - .and_then(|mut s| s.next()) - .unwrap_or(""); - if !container.is_empty() { - self.container_name = Some(validate(container)?); + if !workspace.is_empty() { + self.container_name = Some(workspace.to_string()); } + self.use_fabric_endpoint = true.into(); return Ok(()); } - // Otherwise, check Fabric global / Onelake API FQDN - if host.ends_with(DFS_FABRIC_SUFFIX) || host.ends_with(BLOB_FABRIC_SUFFIX) { - let labels: Vec<&str> = host.split('.').collect(); - let account_name = if labels.len() >= 2 && labels[0].contains("api") && labels[1] == "onelake" { - format!("{}-{}", labels[0], labels[1]) - } else { - labels[0].to_string() - }; - - self.account_name = Some(validate(&account_name)?); - self.use_fabric_endpoint = true; - - let container = parsed.path_segments().unwrap().next().expect( - "iterator always contains at least one string (which may be empty)", - ); - if !container.is_empty() { - self.container_name = Some(validate(container)?); + match host.split_once('.') { + // Azure Storage public + Some((a, "dfs.core.windows.net")) | Some((a, "blob.core.windows.net")) => { + self.account_name = Some(validate(a)?); + + let container = parsed.path_segments().unwrap().next() + .expect("iterator always contains at least one string (which may be empty)"); + + if !container.is_empty() { + self.container_name = Some(validate(container)?); + } } - return Ok(()); - } + // Fabric endpoints + Some((a, "dfs.fabric.microsoft.com")) | Some((a, "blob.fabric.microsoft.com")) => { + self.account_name = Some(validate(a)?); - // Azure Storage public - if host.ends_with(DFS_AZURE_SUFFIX) || host.ends_with(BLOB_AZURE_SUFFIX) { - let first_label = host.split('.').next().unwrap_or_default(); - self.account_name = Some(validate(first_label)?); + // Attempt to infer the container name from the URL + let workspace = parsed.path_segments().unwrap().next() + .expect("iterator always contains at least one string (which may be empty)"); - let container = parsed.path_segments().unwrap().next().expect( - "iterator always contains at least one string (which may be empty)", - ); - if !container.is_empty() { - self.container_name = Some(validate(container)?); + if !workspace.is_empty() { + self.container_name = Some(workspace.to_string()); + } + + self.use_fabric_endpoint = true.into(); } - return Ok(()); + _ => return Err(Error::UrlNotRecognised { url: url.into() }.into()), } - - return Err(Error::UrlNotRecognised { url: url.into() }.into()); }, scheme => { let scheme = scheme.into(); @@ -1170,6 +1172,14 @@ mod tests { assert_eq!(builder.container_name, Some("file_system".to_string())); assert!(builder.use_fabric_endpoint.get().unwrap()); + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("abfss://file_system@account-api.onelake.fabric.microsoft.com/") + .unwrap(); + assert_eq!(builder.account_name, Some("account".to_string())); + assert_eq!(builder.container_name, Some("file_system".to_string())); + assert!(builder.use_fabric_endpoint.get().unwrap()); + let mut builder = MicrosoftAzureBuilder::new(); builder.parse_url("abfs://container/path").unwrap(); assert_eq!(builder.container_name, Some("container".to_string())); @@ -1217,6 +1227,14 @@ mod tests { assert_eq!(builder.container_name, None); assert!(builder.use_fabric_endpoint.get().unwrap()); + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("https://account-api.onelake.fabric.microsoft.com/") + .unwrap(); + assert_eq!(builder.account_name, Some("account".to_string())); + assert_eq!(builder.container_name, None); + assert!(builder.use_fabric_endpoint.get().unwrap()); + let mut builder = MicrosoftAzureBuilder::new(); builder .parse_url("https://account.dfs.fabric.microsoft.com/container") @@ -1235,10 +1253,34 @@ mod tests { let mut builder = MicrosoftAzureBuilder::new(); builder - .parse_url("https://account.blob.fabric.microsoft.com/container") + .parse_url("https://account.blob.fabric.microsoft.com/") .unwrap(); assert_eq!(builder.account_name, Some("account".to_string())); - assert_eq!(builder.container_name.as_deref(), Some("container")); + assert_eq!(builder.container_name, None); + assert!(builder.use_fabric_endpoint.get().unwrap()); + + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("https://ab000000000000000000000000000000.zab.dfs.fabric.microsoft.com/") + .unwrap(); + assert_eq!(builder.account_name, Some("ab000000000000000000000000000000.zab".to_string())); + assert_eq!(builder.container_name.as_deref(), Some("ab000000000000000000000000000000")); + assert!(builder.use_fabric_endpoint.get().unwrap()); + + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("https://ab000000000000000000000000000000.zab.blob.fabric.microsoft.com/") + .unwrap(); + assert_eq!(builder.account_name, Some("ab000000000000000000000000000000.zab".to_string())); + assert_eq!(builder.container_name.as_deref(), Some("ab000000000000000000000000000000")); + assert!(builder.use_fabric_endpoint.get().unwrap()); + + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("https://ab000000000000000000000000000000.zab.onelake.fabric.microsoft.com/") + .unwrap(); + assert_eq!(builder.account_name, Some("ab000000000000000000000000000000.zab".to_string())); + assert_eq!(builder.container_name.as_deref(), Some("ab000000000000000000000000000000")); assert!(builder.use_fabric_endpoint.get().unwrap()); let err_cases = [ @@ -1307,4 +1349,4 @@ mod tests { panic!("{key} not propagated as ClientConfigKey"); } } -} +} \ No newline at end of file From 4ba4321853dc5f7543f9e038a8f0f6827ae10cb1 Mon Sep 17 00:00:00 2001 From: Smriti Agrawal Date: Mon, 12 Jan 2026 11:10:04 +0530 Subject: [PATCH 3/4] Reverting the mistakenly modified UT --- src/azure/builder.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/azure/builder.rs b/src/azure/builder.rs index b5918743..077cafb5 100644 --- a/src/azure/builder.rs +++ b/src/azure/builder.rs @@ -741,7 +741,6 @@ impl MicrosoftAzureBuilder { Some((a, "dfs.fabric.microsoft.com")) | Some((a, "blob.fabric.microsoft.com")) => { self.account_name = Some(validate(a)?); - // Attempt to infer the container name from the URL let workspace = parsed.path_segments().unwrap().next() .expect("iterator always contains at least one string (which may be empty)"); @@ -1253,10 +1252,10 @@ mod tests { let mut builder = MicrosoftAzureBuilder::new(); builder - .parse_url("https://account.blob.fabric.microsoft.com/") + .parse_url("https://account.blob.fabric.microsoft.com/container") .unwrap(); assert_eq!(builder.account_name, Some("account".to_string())); - assert_eq!(builder.container_name, None); + assert_eq!(builder.container_name.as_deref(), Some("container")); assert!(builder.use_fabric_endpoint.get().unwrap()); let mut builder = MicrosoftAzureBuilder::new(); From 3826e5bb85a8a4f319a1a2f02415291074ecc85e Mon Sep 17 00:00:00 2001 From: Smriti Agrawal Date: Tue, 13 Jan 2026 15:11:36 +0530 Subject: [PATCH 4/4] Adding validation for xy in WS-PL URL & case insensitive regex --- src/azure/builder.rs | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/src/azure/builder.rs b/src/azure/builder.rs index 077cafb5..578318a2 100644 --- a/src/azure/builder.rs +++ b/src/azure/builder.rs @@ -692,16 +692,21 @@ impl MicrosoftAzureBuilder { // Regex to match WS-PL FQDN: // "{workspaceid}.z??.(onelake|dfs|blob).fabric.microsoft.com" static WS_PL_REGEX: OnceLock = OnceLock::new(); + let ws_pl_regex = WS_PL_REGEX.get_or_init(|| { Regex::new( - r"^(?P[0-9a-f]{32})\.z(?P[0-9a-f]{2})\.(onelake|dfs|blob)\.fabric\.microsoft\.com$" + r"(?i)^(?P[0-9a-f]{32})\.z(?P[0-9a-f]{2})\.(onelake|dfs|blob)\.fabric\.microsoft\.com$" ).unwrap() }); - // WS-PL Fabric endpoint + // WS-PL Fabric endpoint, eg- 1) c047b3e34e89407a98d7cf9949ae92a3.zc0.dfs.fabric.microsoft.com, 2) c047b3e34e89407a98d7cf9949ae92a3.zc0.blob.fabric.microsoft.com if let Some(captures) = ws_pl_regex.captures(host) { let workspaceid = captures.name("workspaceid").unwrap().as_str(); let xy = captures.name("xy").unwrap().as_str(); + + if !workspaceid.get(0..2).is_some_and(|pfx| pfx.eq_ignore_ascii_case(xy)) { + return Err(Error::UrlNotRecognised { url: url.into() }.into()); + } self.account_name = Some(format!("{workspaceid}.z{xy}")); self.container_name = Some(validate(workspaceid)?); @@ -728,7 +733,8 @@ impl MicrosoftAzureBuilder { // Azure Storage public Some((a, "dfs.core.windows.net")) | Some((a, "blob.core.windows.net")) => { self.account_name = Some(validate(a)?); - + + // Attempt to infer the container name from the URL let container = parsed.path_segments().unwrap().next() .expect("iterator always contains at least one string (which may be empty)"); @@ -1258,6 +1264,14 @@ mod tests { assert_eq!(builder.container_name.as_deref(), Some("container")); assert!(builder.use_fabric_endpoint.get().unwrap()); + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("https://Ab000000000000000000000000000000.zAb.dfs.fabric.microsoft.com/") + .unwrap(); + assert_eq!(builder.account_name, Some("ab000000000000000000000000000000.zab".to_string())); + assert_eq!(builder.container_name.as_deref(), Some("ab000000000000000000000000000000")); + assert!(builder.use_fabric_endpoint.get().unwrap()); + let mut builder = MicrosoftAzureBuilder::new(); builder .parse_url("https://ab000000000000000000000000000000.zab.dfs.fabric.microsoft.com/")