Skip to content

Commit 76b252f

Browse files
authored
fix: only do row identity based joins when joining by index (#356)
Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly:

- [x] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea
- [x] Ensure the tests and linter pass
- [x] Code coverage does not decrease (if any source code was changed)
- [x] Appropriate docs were updated (if necessary)

Fixes #355 🦕
1 parent 0b34402 commit 76b252f

File tree

5 files changed

+81
-4
lines changed

5 files changed

+81
-4
lines changed

bigframes/core/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -349,7 +349,7 @@ def join(
349349
self,
350350
other: ArrayValue,
351351
join_def: join_def.JoinDefinition,
352-
allow_row_identity_join: bool = True,
352+
allow_row_identity_join: bool = False,
353353
):
354354
return ArrayValue(
355355
nodes.JoinNode(

bigframes/core/compile/single_column.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def join_by_column_ordered(
3333
left: compiled.OrderedIR,
3434
right: compiled.OrderedIR,
3535
join: join_defs.JoinDefinition,
36-
allow_row_identity_join: bool = True,
36+
allow_row_identity_join: bool = False,
3737
) -> compiled.OrderedIR:
3838
"""Join two expressions by column equality.
3939
@@ -134,7 +134,7 @@ def join_by_column_unordered(
134134
left: compiled.UnorderedIR,
135135
right: compiled.UnorderedIR,
136136
join: join_defs.JoinDefinition,
137-
allow_row_identity_join: bool = True,
137+
allow_row_identity_join: bool = False,
138138
) -> compiled.UnorderedIR:
139139
"""Join two expressions by column equality.
140140

bigframes/core/nodes.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ class JoinNode(BigFrameNode):
115115
left_child: BigFrameNode
116116
right_child: BigFrameNode
117117
join: JoinDefinition
118-
allow_row_identity_join: bool = True
118+
allow_row_identity_join: bool = False
119119

120120
@property
121121
def row_preserving(self) -> bool:

tests/system/conftest.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,13 @@ def scalars_table_id(test_data_tables) -> str:
285285
return test_data_tables["scalars"]
286286

287287

288+
@pytest.fixture(scope="session")
def baseball_schedules_df(session: bigframes.Session) -> bigframes.dataframe.DataFrame:
    """Session-scoped DataFrame read from the public baseball schedules table."""
    # The table is read once per test session and shared by every test that
    # requests this fixture.
    return session.read_gbq("bigquery-public-data.baseball.schedules")
293+
294+
288295
@pytest.fixture(scope="session")
289296
def hockey_table_id(test_data_tables) -> str:
290297
return test_data_tables["hockey_players"]
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pandas as pd
16+
import pytest
17+
18+
from tests.system.utils import assert_pandas_df_equal
19+
20+
21+
@pytest.mark.parametrize(
    ("merge_how",),
    [
        ("inner",),
        ("outer",),
        ("left",),
        ("right",),
    ],
)
def test_merge_after_filter(baseball_schedules_df, merge_how):
    """Merge two differently-filtered views of one table and compare to pandas.

    Both sides are derived from the same source DataFrame but keep different
    rows, and they are merged on a column rather than on the index. The
    result must match what pandas produces for the same inputs.

    Args:
        baseball_schedules_df: Fixture providing the baseball schedules
            DataFrame.
        merge_how: Join type passed to ``merge`` ("inner", "outer", "left",
            or "right").
    """
    on = ["awayTeamName"]
    left_columns = [
        "gameId",
        "year",
        "homeTeamName",
        "awayTeamName",
        "duration_minutes",
    ]
    right_columns = [
        "gameId",
        "year",
        "homeTeamName",
        "awayTeamName",
        "duration_minutes",
    ]

    left = baseball_schedules_df[left_columns]
    left = left[left["homeTeamName"] == "Rays"]
    # Offset the rows somewhat so that outer join can have an effect.
    right = baseball_schedules_df[right_columns]
    right = right[right["homeTeamName"] == "White Sox"]

    df = left.merge(right, on=on, how=merge_how)
    bf_result = df.to_pandas()

    # Materialize the source table once and derive both pandas sides from it,
    # instead of downloading the same table twice.
    schedules_pandas = baseball_schedules_df.to_pandas()

    left_pandas = schedules_pandas[left_columns]
    left_pandas = left_pandas[left_pandas["homeTeamName"] == "Rays"]

    right_pandas = schedules_pandas[right_columns]
    right_pandas = right_pandas[right_pandas["homeTeamName"] == "White Sox"]

    # Pass ``how`` and ``on`` by keyword so the call does not depend on the
    # positional order of ``pd.merge`` parameters.
    pd_result = pd.merge(
        left_pandas,
        right_pandas,
        how=merge_how,
        on=on,
        sort=True,
    )

    assert_pandas_df_equal(bf_result, pd_result, ignore_order=True)

0 commit comments

Comments
 (0)