|
| 1 | +use std::any::Any; |
| 2 | +use std::sync::Arc; |
| 3 | +use std::time::Duration; |
| 4 | + |
| 5 | +use arrow::array::{ArrayRef, AsArray, LargeStringArray, StringArray, StringViewArray}; |
| 6 | +use arrow::datatypes::DataType; |
| 7 | +use datafusion::common::{Result, exec_err, not_impl_err, plan_datafusion_err, plan_err}; |
| 8 | +use datafusion::config::ConfigOptions; |
| 9 | +use datafusion::execution::object_store::ObjectStoreUrl; |
| 10 | +use datafusion::logical_expr::{ |
| 11 | + ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility, |
| 12 | + async_udf::AsyncScalarUDFImpl, |
| 13 | +}; |
| 14 | +use datafusion_macros::user_doc; |
| 15 | +use datafusion_session::SessionStore; |
| 16 | +use http::Method; |
| 17 | +use itertools::Itertools; |
| 18 | +use object_store::ObjectStore; |
| 19 | +use object_store::aws::AmazonS3; |
| 20 | +use object_store::azure::MicrosoftAzure; |
| 21 | +use object_store::gcp::GoogleCloudStorage; |
| 22 | +use object_store::local::LocalFileSystem; |
| 23 | +use object_store::memory::InMemory; |
| 24 | +use object_store::path::Path; |
| 25 | +use object_store::signer::Signer; |
| 26 | +use url::Url; |
| 27 | + |
| 28 | +/// SignStorageUrl is a scalar user-defined function that signs a URL. |
| 29 | +/// |
| 30 | +/// The resulting URL can be used to access the signed resource (file, directory/key). |
| 31 | +#[user_doc( |
| 32 | + doc_section(label = "String Functions"), |
| 33 | + description = "Sign a URL", |
| 34 | + syntax_example = "sign_storage_url('https://example.com')" |
| 35 | +)] |
| 36 | +#[derive(Debug)] |
| 37 | +pub struct SignStorageUrl { |
| 38 | + signature: Signature, |
| 39 | + session_store: Arc<SessionStore>, |
| 40 | +} |
| 41 | + |
| 42 | +impl SignStorageUrl { |
| 43 | + pub fn new(session_store: Arc<SessionStore>) -> Self { |
| 44 | + Self { |
| 45 | + session_store, |
| 46 | + signature: Signature::uniform( |
| 47 | + 1, |
| 48 | + vec![DataType::Utf8, DataType::Utf8View, DataType::LargeUtf8], |
| 49 | + Volatility::Volatile, |
| 50 | + ), |
| 51 | + } |
| 52 | + } |
| 53 | +} |
| 54 | + |
| 55 | +/// Implement the ScalarUDFImpl trait for AddOne |
| 56 | +impl ScalarUDFImpl for SignStorageUrl { |
| 57 | + fn as_any(&self) -> &dyn Any { |
| 58 | + self |
| 59 | + } |
| 60 | + |
| 61 | + fn name(&self) -> &str { |
| 62 | + "sign_storage_url" |
| 63 | + } |
| 64 | + |
| 65 | + fn signature(&self) -> &Signature { |
| 66 | + &self.signature |
| 67 | + } |
| 68 | + |
| 69 | + fn return_type(&self, args: &[DataType]) -> Result<DataType> { |
| 70 | + if !matches!( |
| 71 | + args.first(), |
| 72 | + Some(&DataType::Utf8) | Some(&DataType::LargeUtf8) | Some(&DataType::Utf8View) |
| 73 | + ) { |
| 74 | + return plan_err!("sign_storage_url only accepts string-like arguments"); |
| 75 | + } |
| 76 | + // safety: we just checked above that the argument is a Some(..) variant |
| 77 | + Ok(args.first().unwrap().clone()) |
| 78 | + } |
| 79 | + |
| 80 | + // The actual implementation would add one to the argument |
| 81 | + fn invoke_with_args(&self, _: ScalarFunctionArgs) -> Result<ColumnarValue> { |
| 82 | + not_impl_err!("SignStorageUrl can only be called from async contexts") |
| 83 | + } |
| 84 | + |
| 85 | + fn documentation(&self) -> Option<&Documentation> { |
| 86 | + self.doc() |
| 87 | + } |
| 88 | +} |
| 89 | + |
| 90 | +#[async_trait::async_trait] |
| 91 | +impl AsyncScalarUDFImpl for SignStorageUrl { |
| 92 | + async fn invoke_async_with_args( |
| 93 | + &self, |
| 94 | + args: ScalarFunctionArgs, |
| 95 | + _options: &ConfigOptions, |
| 96 | + ) -> Result<ArrayRef> { |
| 97 | + let args = ColumnarValue::values_to_arrays(&args.args)?; |
| 98 | + |
| 99 | + // we parse the url and split it into the base URL for the storage bucket |
| 100 | + // and the path within the bucket. We track the indices of the original values |
| 101 | + // in order to send only valid values to the storage provider and later |
| 102 | + // recinstruct the proper response. |
| 103 | + let parse_str = |(idx, value): (usize, Option<&str>)| { |
| 104 | + value.and_then(|v| { |
| 105 | + url::Url::parse(v).ok().and_then(|url| { |
| 106 | + Some(( |
| 107 | + ObjectStoreUrl::parse(&url[..url::Position::BeforePath]).ok()?, |
| 108 | + (idx, Path::from_url_path(url.path()).ok()?), |
| 109 | + )) |
| 110 | + }) |
| 111 | + }) |
| 112 | + }; |
| 113 | + let urls: Vec<_> = if let Some(vals) = args[0].as_string_opt::<i32>() { |
| 114 | + vals.iter().enumerate().flat_map(parse_str).collect() |
| 115 | + } else if let Some(vals) = args[0].as_string_opt::<i64>() { |
| 116 | + vals.iter().enumerate().flat_map(parse_str).collect() |
| 117 | + } else if let Some(vals) = args[0].as_string_view_opt() { |
| 118 | + vals.iter().enumerate().flat_map(parse_str).collect() |
| 119 | + } else { |
| 120 | + return plan_err!("sign_storage_url only accepts string arguments"); |
| 121 | + }; |
| 122 | + |
| 123 | + let registry = self |
| 124 | + .session_store |
| 125 | + .get_session() |
| 126 | + .upgrade() |
| 127 | + .ok_or_else(|| plan_datafusion_err!("session store is not available"))? |
| 128 | + .read() |
| 129 | + .runtime_env() |
| 130 | + .object_store_registry |
| 131 | + .clone(); |
| 132 | + // TODO: allow passing the desired duration as method argument |
| 133 | + let expires_in = Duration::new(60 * 60, 0); |
| 134 | + |
| 135 | + // we group all valid urls by their store and generate signed urls |
| 136 | + // for all urls under that store. The signers usually need to communicate |
| 137 | + // with the storage service only once to generate the signing key, after which |
| 138 | + // they can sign any number of urls without further communication. |
| 139 | + let store_map = urls.into_iter().into_group_map(); |
| 140 | + let mut result_buffer = Vec::with_capacity(args[0].len()); |
| 141 | + for (store_url, paths_and_idx) in store_map { |
| 142 | + let store = registry.get_store(store_url.as_ref())?; |
| 143 | + let (indices, paths): (Vec<_>, Vec<_>) = paths_and_idx.into_iter().unzip(); |
| 144 | + let signed_urls = |
| 145 | + signed_urls(&store_url, store, &paths, Method::GET, expires_in).await?; |
| 146 | + result_buffer.extend(indices.into_iter().zip(signed_urls)); |
| 147 | + } |
| 148 | + |
| 149 | + // construct a result vector from the individual store results. |
| 150 | + let mut results = vec![None; args[0].len()]; |
| 151 | + for (i, url) in result_buffer.into_iter() { |
| 152 | + results[i] = Some(url.to_string()); |
| 153 | + } |
| 154 | + |
| 155 | + // return the results as the same data type as the input array |
| 156 | + match args[0].data_type() { |
| 157 | + DataType::Utf8 => Ok(Arc::new(StringArray::from(results))), |
| 158 | + DataType::LargeUtf8 => Ok(Arc::new(LargeStringArray::from(results))), |
| 159 | + DataType::Utf8View => Ok(Arc::new(StringViewArray::from(results))), |
| 160 | + // safety: We limited the data types when we are reading the paths from the array. |
| 161 | + _ => unreachable!(), |
| 162 | + } |
| 163 | + } |
| 164 | +} |
| 165 | + |
| 166 | +// auxiliary trait to cast `ObjectStore` to `Any`. |
| 167 | +trait DowncastableStore: ObjectStore + Any { |
| 168 | + fn as_any(&self) -> &dyn Any; |
| 169 | +} |
| 170 | + |
| 171 | +impl<T: ObjectStore + Any> DowncastableStore for T { |
| 172 | + fn as_any(&self) -> &dyn Any { |
| 173 | + self |
| 174 | + } |
| 175 | +} |
| 176 | + |
| 177 | +async fn signed_urls( |
| 178 | + store_url: &ObjectStoreUrl, |
| 179 | + store: Arc<dyn ObjectStore>, |
| 180 | + paths: &[Path], |
| 181 | + method: Method, |
| 182 | + expires_in: Duration, |
| 183 | +) -> Result<Vec<Url>> { |
| 184 | + if let Some(signer) = store.as_any().downcast_ref::<MicrosoftAzure>() { |
| 185 | + return Ok(signer.signed_urls(method, paths, expires_in).await?); |
| 186 | + } |
| 187 | + |
| 188 | + if let Some(signer) = store.as_any().downcast_ref::<AmazonS3>() { |
| 189 | + return Ok(signer.signed_urls(method, paths, expires_in).await?); |
| 190 | + } |
| 191 | + |
| 192 | + if let Some(signer) = store.as_any().downcast_ref::<GoogleCloudStorage>() { |
| 193 | + return Ok(signer.signed_urls(method, paths, expires_in).await?); |
| 194 | + } |
| 195 | + |
| 196 | + if store.as_any().downcast_ref::<LocalFileSystem>().is_some() { |
| 197 | + return Ok(paths |
| 198 | + .iter() |
| 199 | + .map(|path| AsRef::<Url>::as_ref(store_url).join(path.as_ref()).unwrap()) |
| 200 | + .collect()); |
| 201 | + } |
| 202 | + |
| 203 | + if store.as_any().downcast_ref::<InMemory>().is_some() { |
| 204 | + return Ok(paths |
| 205 | + .iter() |
| 206 | + .map(|path| AsRef::<Url>::as_ref(store_url).join(path.as_ref()).unwrap()) |
| 207 | + .collect()); |
| 208 | + } |
| 209 | + |
| 210 | + exec_err!("not a signing store") |
| 211 | +} |
0 commit comments