Skip to content

Commit 3fb84ff

Browse files
authored
GH-48629: [R] Add tests for duplicate column names and incompatible types in joins (#48630)
### Rationale for this change

Two TODOs in `test-dplyr-join.R` requested test coverage for edge cases in join operations:

1. Testing duplicate column names in joins (with suffixes)
2. Testing type casting behavior when joining on columns with incompatible types

### What changes are included in this PR?

1. Duplicated column names test
   - Default suffixes (`.x` and `.y`) when no explicit suffix is provided
   - Custom suffixes (`_left` and `_right`)
   - Merged the existing "suffix" test into this test block
2. Join key cast failure test
   - Tests that joining on columns with incompatible types (int32 vs double) correctly errors

### Are these changes tested?

Yes, corresponding tests were added.

### Are there any user-facing changes?

No, test-only changes.

* GitHub Issue: #48629
1 parent a0f3d7b commit 3fb84ff

File tree

1 file changed

+89
-22
lines changed

1 file changed

+89
-22
lines changed

r/tests/testthat/test-dplyr-join.R

Lines changed: 89 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -188,8 +188,95 @@ test_that("Error handling for unsupported expressions in join_by", {
188188
)
189189
})
190190

191-
# TODO: test duplicate col names
192-
# TODO: casting: int and float columns?
191+
test_that("joins with duplicate column names", {
192+
# Non-key columns (not in `by`) present in both tables get suffixes added
193+
left_dup <- tibble::tibble(
194+
key = 1:5,
195+
shared = 1:5,
196+
shared_float = c(1.1, 2.2, 3.3, 4.4, 5.5),
197+
left_unique = letters[1:5]
198+
)
199+
right_dup <- tibble::tibble(
200+
key = 1:5,
201+
shared = 6:10,
202+
shared_float = c(6.1, 7.2, 8.3, 9.4, 10.5),
203+
right_unique = LETTERS[1:5]
204+
)
205+
206+
# Test with default suffixes (.x and .y)
207+
compare_dplyr_binding(
208+
.input |>
209+
left_join(right_dup, by = "key") |>
210+
collect(),
211+
left_dup
212+
)
213+
214+
compare_dplyr_binding(
215+
.input |>
216+
inner_join(right_dup, by = "key") |>
217+
collect(),
218+
left_dup
219+
)
220+
221+
# Test with custom suffixes
222+
compare_dplyr_binding(
223+
.input |>
224+
left_join(right_dup, by = "key", suffix = c("_left", "_right")) |>
225+
collect(),
226+
left_dup
227+
)
228+
229+
compare_dplyr_binding(
230+
.input |>
231+
inner_join(right_dup, by = "key", suffix = c("_left", "_right")) |>
232+
collect(),
233+
left_dup
234+
)
235+
236+
# Test that column names are correctly suffixed
237+
# Verify exact column names match expected pattern using the same fixture
238+
result <- arrow_table(left_dup) |>
239+
inner_join(
240+
arrow_table(right_dup),
241+
by = "key",
242+
suffix = c("_left", "_right")
243+
) |>
244+
collect()
245+
res_col_names <- names(result)
246+
# Column order: join key first, then left table columns (with suffixes),
247+
# then right table columns (with suffixes)
248+
expected_col_names <- c(
249+
"key",
250+
"shared_left",
251+
"shared_float_left",
252+
"left_unique",
253+
"shared_right",
254+
"shared_float_right",
255+
"right_unique"
256+
)
257+
expect_equal(expected_col_names, res_col_names)
258+
})
259+
260+
test_that("joins with incompatible types for join keys", {
261+
# Test that joining on columns with incompatible types (int vs float) fails
262+
# Arrow requires join keys to have compatible types - type casting is not
263+
# automatically performed for join keys
264+
left_int <- Table$create(
265+
x = c(1L, 2L),
266+
shared = c(10L, 20L)
267+
)
268+
right_float <- Table$create(
269+
x = c(1.0, 2.0),
270+
shared = c(10.1, 20.2)
271+
)
272+
273+
expect_error(
274+
left_int |>
275+
left_join(right_float, by = "x") |>
276+
collect(),
277+
"Incompatible data types for corresponding join field keys"
278+
)
279+
})
193280

194281
test_that("right_join", {
195282
compare_dplyr_binding(
@@ -317,26 +404,6 @@ test_that("arrow dplyr query correctly filters then joins", {
317404
)
318405
})
319406

320-
test_that("suffix", {
321-
left_suf <- Table$create(
322-
key = c(1, 2),
323-
left_unique = c(2.1, 3.1),
324-
shared = c(10.1, 10.3)
325-
)
326-
327-
right_suf <- Table$create(
328-
key = c(1, 2, 3, 10, 20),
329-
right_unique = c(1.1, 1.2, 3.1, 4.1, 4.3),
330-
shared = c(20.1, 30, 40, 50, 60)
331-
)
332-
333-
join_op <- inner_join(left_suf, right_suf, by = "key", suffix = c("_left", "_right"))
334-
output <- collect(join_op)
335-
res_col_names <- names(output)
336-
expected_col_names <- c("key", "left_unique", "shared_left", "right_unique", "shared_right")
337-
expect_equal(expected_col_names, res_col_names)
338-
})
339-
340407
test_that("suffix and implicit schema", {
341408
left_suf <- Table$create(
342409
key = c(1, 2),

0 commit comments

Comments
 (0)