Skip to content

Commit 14a7ade

Browse files
xanderbaileyxbaileyalamb
authored
Fix ambiguous column names in substrait conversion as a result of literals having the same name during conversion. (#17299)
* Fix ambigious column names in substrate conversion as a result of literals having the same names * move it to the project * do it only for projects * comment * fmt * comments --------- Co-authored-by: Xander Bailey <xbailey@palantir.com> Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent a281b25 commit 14a7ade

File tree

6 files changed

+368
-17
lines changed

6 files changed

+368
-17
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

datafusion/substrait/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ prost = { workspace = true }
4242
substrait = { version = "0.58", features = ["serde"] }
4343
url = { workspace = true }
4444
tokio = { workspace = true, features = ["fs"] }
45+
uuid = { version = "1.17.0", features = ["v4"] }
4546

4647
[dev-dependencies]
4748
datafusion = { workspace = true, features = ["nested_expressions"] }

datafusion/substrait/src/logical_plan/consumer/rel/project_rel.rs

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,20 @@ pub async fn from_project_rel(
6262
// to transform it into a column reference
6363
window_exprs.insert(e.clone());
6464
}
65-
explicit_exprs.push(name_tracker.get_uniquely_named_expr(e)?);
65+
// Substrait plans are ordinal based, so they do not provide names for columns.
66+
// Names for columns are generated by Datafusion during conversion, and for literals
67+
// Datafusion produces names based on the literal value. It is possible to construct
68+
// valid Substrait plans that result in duplicated names if the same literal value is
69+
// used in multiple relations. To avoid this issue, we alias literals with unique names.
70+
// The name tracker will ensure that two literals in the same project would have
71+
// unique names but, it does not ensure that if a literal column exists in a previous
72+
// project say before a join that it is deduplicated with respect to those columns.
73+
// See: https://github.com/apache/datafusion/pull/17299
74+
let maybe_apply_alias = match e {
75+
lit @ Expr::Literal(_, _) => lit.alias(uuid::Uuid::new_v4().to_string()),
76+
_ => e,
77+
};
78+
explicit_exprs.push(name_tracker.get_uniquely_named_expr(maybe_apply_alias)?);
6679
}
6780

6881
let input = if !window_exprs.is_empty() {

datafusion/substrait/tests/cases/consumer_integration.rs

Lines changed: 24 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -647,23 +647,31 @@ mod tests {
647647
#[tokio::test]
648648
async fn test_multiple_unions() -> Result<()> {
649649
let plan_str = test_plan_to_string("multiple_unions.json").await?;
650-
assert_snapshot!(
651-
plan_str,
652-
@r#"
653-
Projection: Utf8("people") AS product_category, Utf8("people")__temp__0 AS product_type, product_key
654-
Union
655-
Projection: Utf8("people"), Utf8("people") AS Utf8("people")__temp__0, sales.product_key
656-
Left Join: sales.product_key = food.@food_id
657-
TableScan: sales
658-
TableScan: food
659-
Union
660-
Projection: people.$f3, people.$f5, people.product_key0
661-
Left Join: people.product_key0 = food.@food_id
662-
TableScan: people
663-
TableScan: food
664-
TableScan: more_products
665-
"#
650+
651+
let mut settings = insta::Settings::clone_current();
652+
settings.add_filter(
653+
r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}",
654+
"[UUID]",
655+
);
656+
settings.bind(|| {
657+
assert_snapshot!(
658+
plan_str,
659+
@r#"
660+
Projection: [UUID] AS product_category, [UUID] AS product_type, product_key
661+
Union
662+
Projection: Utf8("people") AS [UUID], Utf8("people") AS [UUID], sales.product_key
663+
Left Join: sales.product_key = food.@food_id
664+
TableScan: sales
665+
TableScan: food
666+
Union
667+
Projection: people.$f3, people.$f5, people.product_key0
668+
Left Join: people.product_key0 = food.@food_id
669+
TableScan: people
670+
TableScan: food
671+
TableScan: more_products
672+
"#
666673
);
674+
});
667675

668676
Ok(())
669677
}

datafusion/substrait/tests/cases/logical_plans.rs

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,47 @@ mod tests {
144144
Ok(())
145145
}
146146

147+
#[tokio::test]
148+
async fn null_literal_before_and_after_joins() -> Result<()> {
149+
// Confirms that literals used before and after a join but for different columns
150+
// are correctly handled.
151+
152+
// File generated with substrait-java's Isthmus:
153+
// ./isthmus-cli/build/graal/isthmus --create "create table A (a int); create table B (a int, c int); create table C (a int, d int)" "select t.*, C.d, CAST(NULL AS VARCHAR) as e from (select a, CAST(NULL AS VARCHAR) as c from A UNION ALL select a, c from B) t LEFT JOIN C ON t.a = C.a"
154+
let proto_plan =
155+
read_json("tests/testdata/test_plans/disambiguate_literals_with_same_name.substrait.json");
156+
let ctx = add_plan_schemas_to_ctx(SessionContext::new(), &proto_plan)?;
157+
let plan = from_substrait_plan(&ctx.state(), &proto_plan).await?;
158+
159+
let mut settings = insta::Settings::clone_current();
160+
settings.add_filter(
161+
r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}",
162+
"[UUID]",
163+
);
164+
settings.bind(|| {
165+
assert_snapshot!(
166+
plan,
167+
@r#"
168+
Projection: left.A, left.[UUID] AS C, right.D, Utf8(NULL) AS [UUID] AS E
169+
Left Join: left.A = right.A
170+
SubqueryAlias: left
171+
Union
172+
Projection: A.A, Utf8(NULL) AS [UUID]
173+
TableScan: A
174+
Projection: B.A, CAST(B.C AS Utf8)
175+
TableScan: B
176+
SubqueryAlias: right
177+
TableScan: C
178+
"#
179+
);
180+
});
181+
182+
// Trigger execution to ensure plan validity
183+
DataFrame::new(ctx.state(), plan).show().await?;
184+
185+
Ok(())
186+
}
187+
147188
#[tokio::test]
148189
async fn non_nullable_lists() -> Result<()> {
149190
// DataFusion's Substrait consumer treats all lists as nullable, even if the Substrait plan specifies them as non-nullable.

0 commit comments

Comments
 (0)