Skip to content

Commit d844f86

Browse files
authored
Add:arrow_metadata() UDF (#19435)
## Which issue does this PR close? Closes #19356 ## Rationale for this change This PR implements the arrow_metadata UDF as requested in issue #19356. ## What changes are included in this PR? Added arrow_metadata UDF Refactored Tests ## Are these changes tested? Yes. ## Are there any user-facing changes? Yes.
1 parent 258e18c commit d844f86

File tree

5 files changed

+198
-60
lines changed

5 files changed

+198
-60
lines changed
Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use arrow::array::{MapBuilder, StringBuilder};
19+
use arrow::datatypes::{DataType, Field, Fields};
20+
use datafusion_common::{Result, ScalarValue, exec_err};
21+
use datafusion_expr::{
22+
ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
23+
Volatility,
24+
};
25+
use datafusion_macros::user_doc;
26+
use std::any::Any;
27+
use std::sync::Arc;
28+
29+
#[user_doc(
30+
doc_section(label = "Other Functions"),
31+
description = "Returns the metadata of the input expression. If a key is provided, returns the value for that key. If no key is provided, returns a Map of all metadata.",
32+
syntax_example = "arrow_metadata(expression, [key])",
33+
sql_example = r#"```sql
34+
> select arrow_metadata(col) from table;
35+
+----------------------------+
36+
| arrow_metadata(table.col) |
37+
+----------------------------+
38+
| {k: v} |
39+
+----------------------------+
40+
> select arrow_metadata(col, 'k') from table;
41+
+-------------------------------+
42+
| arrow_metadata(table.col, 'k')|
43+
+-------------------------------+
44+
| v |
45+
+-------------------------------+
46+
```"#,
47+
argument(
48+
name = "expression",
49+
description = "The expression to retrieve metadata from. Can be a column or other expression."
50+
),
51+
argument(
52+
name = "key",
53+
description = "Optional. The specific metadata key to retrieve."
54+
)
55+
)]
56+
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
57+
pub struct ArrowMetadataFunc {
58+
signature: Signature,
59+
}
60+
61+
impl ArrowMetadataFunc {
62+
pub fn new() -> Self {
63+
Self {
64+
signature: Signature::variadic_any(Volatility::Immutable),
65+
}
66+
}
67+
}
68+
69+
impl Default for ArrowMetadataFunc {
70+
fn default() -> Self {
71+
Self::new()
72+
}
73+
}
74+
75+
impl ScalarUDFImpl for ArrowMetadataFunc {
76+
fn as_any(&self) -> &dyn Any {
77+
self
78+
}
79+
80+
fn name(&self) -> &str {
81+
"arrow_metadata"
82+
}
83+
84+
fn signature(&self) -> &Signature {
85+
&self.signature
86+
}
87+
88+
fn documentation(&self) -> Option<&Documentation> {
89+
self.doc()
90+
}
91+
92+
fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
93+
if arg_types.len() == 2 {
94+
Ok(DataType::Utf8)
95+
} else if arg_types.len() == 1 {
96+
Ok(DataType::Map(
97+
Arc::new(Field::new(
98+
"entries",
99+
DataType::Struct(Fields::from(vec![
100+
Field::new("keys", DataType::Utf8, false),
101+
Field::new("values", DataType::Utf8, true),
102+
])),
103+
false,
104+
)),
105+
false,
106+
))
107+
} else {
108+
exec_err!("arrow_metadata requires 1 or 2 arguments")
109+
}
110+
}
111+
112+
fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
113+
let metadata = args.arg_fields[0].metadata();
114+
115+
if args.args.len() == 2 {
116+
let key = match &args.args[1] {
117+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(k))) => k,
118+
_ => {
119+
return exec_err!(
120+
"Second argument to arrow_metadata must be a string literal key"
121+
);
122+
}
123+
};
124+
let value = metadata.get(key).cloned();
125+
Ok(ColumnarValue::Scalar(ScalarValue::Utf8(value)))
126+
} else if args.args.len() == 1 {
127+
let mut map_builder =
128+
MapBuilder::new(None, StringBuilder::new(), StringBuilder::new());
129+
130+
let mut entries: Vec<_> = metadata.iter().collect();
131+
entries.sort_by_key(|(k, _)| *k);
132+
133+
for (k, v) in entries {
134+
map_builder.keys().append_value(k);
135+
map_builder.values().append_value(v);
136+
}
137+
map_builder.append(true)?;
138+
139+
let map_array = map_builder.finish();
140+
141+
Ok(ColumnarValue::Scalar(ScalarValue::try_from_array(
142+
&map_array, 0,
143+
)?))
144+
} else {
145+
exec_err!("arrow_metadata requires 1 or 2 arguments")
146+
}
147+
}
148+
}

datafusion/functions/src/core/mod.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ use datafusion_expr::ScalarUDF;
2121
use std::sync::Arc;
2222

2323
pub mod arrow_cast;
24+
pub mod arrow_metadata;
2425
pub mod arrowtypeof;
2526
pub mod coalesce;
2627
pub mod expr_ext;
@@ -55,6 +56,7 @@ make_udf_function!(least::LeastFunc, least);
5556
make_udf_function!(union_extract::UnionExtractFun, union_extract);
5657
make_udf_function!(union_tag::UnionTagFunc, union_tag);
5758
make_udf_function!(version::VersionFunc, version);
59+
make_udf_function!(arrow_metadata::ArrowMetadataFunc, arrow_metadata);
5860

5961
pub mod expr_fn {
6062
use datafusion_expr::{Expr, Literal};
@@ -83,6 +85,10 @@ pub mod expr_fn {
8385
arrow_typeof,
8486
"Returns the Arrow type of the input expression.",
8587
arg1
88+
),(
89+
arrow_metadata,
90+
"Returns the metadata of the input expression",
91+
args,
8692
),(
8793
r#struct,
8894
"Returns a struct with the given arguments",
@@ -127,6 +133,7 @@ pub fn functions() -> Vec<Arc<ScalarUDF>> {
127133
vec![
128134
nullif(),
129135
arrow_cast(),
136+
arrow_metadata(),
130137
nvl(),
131138
nvl2(),
132139
overlay(),

datafusion/sqllogictest/src/test_context.rs

Lines changed: 1 addition & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ use arrow::record_batch::RecordBatch;
3232
use datafusion::catalog::{
3333
CatalogProvider, MemoryCatalogProvider, MemorySchemaProvider, Session,
3434
};
35-
use datafusion::common::{DataFusionError, Result, ScalarValue, exec_err, not_impl_err};
35+
use datafusion::common::{DataFusionError, Result, not_impl_err};
3636
use datafusion::functions::math::abs;
3737
use datafusion::logical_expr::async_udf::{AsyncScalarUDF, AsyncScalarUDFImpl};
3838
use datafusion::logical_expr::{
@@ -398,60 +398,6 @@ pub async fn register_metadata_tables(ctx: &SessionContext) {
398398
.unwrap();
399399

400400
ctx.register_batch("table_with_metadata", batch).unwrap();
401-
402-
// Register the get_metadata UDF for testing metadata preservation
403-
ctx.register_udf(ScalarUDF::from(GetMetadataUdf::new()));
404-
}
405-
406-
/// UDF to extract metadata from a field for testing purposes
407-
/// Usage: get_metadata(expr, 'key') -> returns the metadata value or NULL
408-
#[derive(Debug, PartialEq, Eq, Hash)]
409-
struct GetMetadataUdf {
410-
signature: Signature,
411-
}
412-
413-
impl GetMetadataUdf {
414-
fn new() -> Self {
415-
Self {
416-
signature: Signature::any(2, Volatility::Immutable),
417-
}
418-
}
419-
}
420-
421-
impl ScalarUDFImpl for GetMetadataUdf {
422-
fn as_any(&self) -> &dyn Any {
423-
self
424-
}
425-
426-
fn name(&self) -> &str {
427-
"get_metadata"
428-
}
429-
430-
fn signature(&self) -> &Signature {
431-
&self.signature
432-
}
433-
434-
fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
435-
Ok(DataType::Utf8)
436-
}
437-
438-
fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
439-
// Get the metadata key from the second argument (must be a string literal)
440-
let key = match &args.args[1] {
441-
ColumnarValue::Scalar(ScalarValue::Utf8(Some(k))) => k.clone(),
442-
_ => {
443-
return exec_err!(
444-
"get_metadata second argument must be a string literal"
445-
);
446-
}
447-
};
448-
449-
// Get metadata from the first argument's field
450-
let metadata_value = args.arg_fields[0].metadata().get(&key).cloned();
451-
452-
// Return as a scalar (same value for all rows)
453-
Ok(ColumnarValue::Scalar(ScalarValue::Utf8(metadata_value)))
454-
}
455401
}
456402

457403
/// Create a UDF function named "example". See the `sample_udf.rs` example

datafusion/sqllogictest/test_files/metadata.slt

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -237,21 +237,21 @@ NULL 1
237237

238238
# Regression test: first_value should preserve metadata
239239
query IT
240-
select first_value(id order by id asc nulls last), get_metadata(first_value(id order by id asc nulls last), 'metadata_key')
240+
select first_value(id order by id asc nulls last), arrow_metadata(first_value(id order by id asc nulls last), 'metadata_key')
241241
from table_with_metadata;
242242
----
243243
1 the id field
244244

245245
# Regression test: last_value should preserve metadata
246246
query IT
247-
select last_value(id order by id asc nulls first), get_metadata(last_value(id order by id asc nulls first), 'metadata_key')
247+
select last_value(id order by id asc nulls first), arrow_metadata(last_value(id order by id asc nulls first), 'metadata_key')
248248
from table_with_metadata;
249249
----
250250
3 the id field
251251

252252
# Regression test: DISTINCT ON should preserve metadata (uses first_value internally)
253253
query ITTT
254-
select distinct on (id) id, get_metadata(id, 'metadata_key'), name, get_metadata(name, 'metadata_key')
254+
select distinct on (id) id, arrow_metadata(id, 'metadata_key'), name, arrow_metadata(name, 'metadata_key')
255255
from table_with_metadata order by id asc nulls last;
256256
----
257257
1 the id field NULL the name field
@@ -263,7 +263,7 @@ query ITTT
263263
with res AS (
264264
select distinct id, name from table_with_metadata
265265
)
266-
select id, get_metadata(id, 'metadata_key'), name, get_metadata(name, 'metadata_key')
266+
select id, arrow_metadata(id, 'metadata_key'), name, arrow_metadata(name, 'metadata_key')
267267
from res
268268
order by id asc nulls last;
269269
----
@@ -278,13 +278,19 @@ with res AS (
278278
from table_with_metadata
279279
group by id, name
280280
)
281-
select id, get_metadata(id, 'metadata_key'), name, get_metadata(name, 'metadata_key')
281+
select id, arrow_metadata(id, 'metadata_key'), name, arrow_metadata(name, 'metadata_key')
282282
from res
283283
order by id asc nulls last, name asc nulls last
284284
----
285285
1 the id field NULL the name field
286286
3 the id field baz the name field
287287
NULL the id field bar the name field
288288

289+
# Test arrow_metadata with single argument (returns Map)
290+
query ?
291+
select arrow_metadata(id) from table_with_metadata limit 1;
292+
----
293+
{metadata_key: the id field}
294+
289295
statement ok
290296
drop table table_with_metadata;

docs/source/user-guide/sql/scalar_functions.md

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4988,6 +4988,7 @@ union_tag(union_expression)
49884988
## Other Functions
49894989

49904990
- [arrow_cast](#arrow_cast)
4991+
- [arrow_metadata](#arrow_metadata)
49914992
- [arrow_typeof](#arrow_typeof)
49924993
- [get_field](#get_field)
49934994
- [version](#version)
@@ -5030,6 +5031,36 @@ arrow_cast(expression, datatype)
50305031
+---------------------------+---------------------+
50315032
```
50325033

5034+
### `arrow_metadata`
5035+
5036+
Returns the metadata of the input expression. If a key is provided, returns the value for that key. If no key is provided, returns a Map of all metadata.
5037+
5038+
```sql
5039+
arrow_metadata(expression, [key])
5040+
```
5041+
5042+
#### Arguments
5043+
5044+
- **expression**: The expression to retrieve metadata from. Can be a column or other expression.
5045+
- **key**: Optional. The specific metadata key to retrieve.
5046+
5047+
#### Example
5048+
5049+
```sql
5050+
> select arrow_metadata(col) from table;
5051+
+----------------------------+
5052+
| arrow_metadata(table.col) |
5053+
+----------------------------+
5054+
| {k: v} |
5055+
+----------------------------+
5056+
> select arrow_metadata(col, 'k') from table;
5057+
+-------------------------------+
5058+
| arrow_metadata(table.col, 'k')|
5059+
+-------------------------------+
5060+
| v |
5061+
+-------------------------------+
5062+
```
5063+
50335064
### `arrow_typeof`
50345065

50355066
Returns the name of the underlying [Arrow data type](https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html) of the expression.

0 commit comments

Comments
 (0)