From 37b175a3e21aa2f1af95dc206dc98fe102ef0d3c Mon Sep 17 00:00:00 2001
From: G26Karthik
Date: Sun, 5 Oct 2025 23:00:22 +0530
Subject: [PATCH 1/3] TST: Add regression test for pyarrow datetime merge with duplicates

Add test for GH#61926 to ensure merge operations work correctly with
pyarrow datetime columns when there are duplicate values on the right
side. This was fixed by PR#62276 which improved Index._get_join_target
handling for pyarrow datetime types.
---
 pandas/tests/reshape/merge/test_merge.py | 25 ++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
index d3bef4c863b28..3fd99ff399b0c 100644
--- a/pandas/tests/reshape/merge/test_merge.py
+++ b/pandas/tests/reshape/merge/test_merge.py
@@ -3097,3 +3097,28 @@ def test_merge_categorical_key_recursion():
         right.astype("float64"), on="key", how="outer"
     )
     tm.assert_frame_equal(result, expected)
+
+
+def test_merge_pyarrow_datetime_duplicates():
+    # GH#61926
+    # Regression test for merge failing on pyarrow datetime columns with duplicates
+    pytest.importorskip("pyarrow")
+
+    # Create datetime index
+    t = pd.date_range("2025-07-06", periods=3, freq="h")
+
+    # Left dataframe: one row per timestamp
+    df1 = DataFrame({"time": t, "val1": [1, 2, 3]})
+    df1 = df1.convert_dtypes(dtype_backend="pyarrow")
+
+    # Right dataframe: two rows per timestamp (duplicates)
+    df2 = DataFrame({"time": t.repeat(2), "val2": [10, 20, 30, 40, 50, 60]})
+    df2 = df2.convert_dtypes(dtype_backend="pyarrow")
+
+    # This should work without raising ValueError
+    result = merge(df1, df2, on="time", how="left")
+
+    # Should return 6 rows (df1's 3 timestamps × 2 matches each from df2)
+    assert len(result) == 6
+    assert result["val1"].tolist() == [1, 1, 2, 2, 3, 3]
+    assert result["val2"].tolist() == [10, 20, 30, 40, 50, 60]

From ec3521ee89d7cc4b59090e6cefcf337ce08917b6 Mon Sep 17 00:00:00 2001
From: G26Karthik
Date: Sun, 5 Oct 2025 23:57:53 +0530
Subject: [PATCH 2/3] Fix ambiguous multiplication character in comment

---
 pandas/tests/reshape/merge/test_merge.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
index 3fd99ff399b0c..7f56cb0a00732 100644
--- a/pandas/tests/reshape/merge/test_merge.py
+++ b/pandas/tests/reshape/merge/test_merge.py
@@ -3118,7 +3118,7 @@ def test_merge_pyarrow_datetime_duplicates():
     # This should work without raising ValueError
     result = merge(df1, df2, on="time", how="left")
 
-    # Should return 6 rows (df1's 3 timestamps × 2 matches each from df2)
+    # Should return 6 rows (df1's 3 timestamps x 2 matches each from df2)
     assert len(result) == 6
     assert result["val1"].tolist() == [1, 1, 2, 2, 3, 3]
     assert result["val2"].tolist() == [10, 20, 30, 40, 50, 60]

From 6f859a6d87a1ee3c6bff19ad64f97a3c3a166f3d Mon Sep 17 00:00:00 2001
From: G26Karthik
Date: Mon, 6 Oct 2025 15:01:58 +0530
Subject: [PATCH 3/3] Address review feedback: remove comments and use tm.assert_frame_equal

---
 pandas/tests/reshape/merge/test_merge.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
index 7f56cb0a00732..c38ee32cb7226 100644
--- a/pandas/tests/reshape/merge/test_merge.py
+++ b/pandas/tests/reshape/merge/test_merge.py
@@ -3101,24 +3101,23 @@ def test_merge_categorical_key_recursion():
 
 def test_merge_pyarrow_datetime_duplicates():
     # GH#61926
-    # Regression test for merge failing on pyarrow datetime columns with duplicates
     pytest.importorskip("pyarrow")
 
-    # Create datetime index
     t = pd.date_range("2025-07-06", periods=3, freq="h")
-
-    # Left dataframe: one row per timestamp
     df1 = DataFrame({"time": t, "val1": [1, 2, 3]})
     df1 = df1.convert_dtypes(dtype_backend="pyarrow")
 
-    # Right dataframe: two rows per timestamp (duplicates)
     df2 = DataFrame({"time": t.repeat(2), "val2": [10, 20, 30, 40, 50, 60]})
     df2 = df2.convert_dtypes(dtype_backend="pyarrow")
 
-    # This should work without raising ValueError
     result = merge(df1, df2, on="time", how="left")
 
-    # Should return 6 rows (df1's 3 timestamps x 2 matches each from df2)
-    assert len(result) == 6
-    assert result["val1"].tolist() == [1, 1, 2, 2, 3, 3]
-    assert result["val2"].tolist() == [10, 20, 30, 40, 50, 60]
+    expected = DataFrame(
+        {
+            "time": t.repeat(2),
+            "val1": [1, 1, 2, 2, 3, 3],
+            "val2": [10, 20, 30, 40, 50, 60],
+        }
+    )
+    expected = expected.convert_dtypes(dtype_backend="pyarrow")
+    tm.assert_frame_equal(result, expected)