Skip to content

Commit c8d16c0

Browse files
authored
feat: include index columns in DataFrame.sql if they are named (#788)
1 parent a4ac82e commit c8d16c0

File tree

2 files changed

+99
-1
lines changed

2 files changed

+99
-1
lines changed

bigframes/dataframe.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -379,7 +379,8 @@ def _to_sql_query(
379379
@property
def sql(self) -> str:
    """Compiles this DataFrame's expression tree to SQL.

    Index columns are part of the generated query when the index has a
    name or when it has more than one level; otherwise the index is left
    out of the query.

    Returns:
        str: The SQL text, i.e. the first element of the tuple returned by
        ``_to_sql_query``.
    """
    index = self.index
    # A single-level index with no name is omitted from the query.
    include_index = index.name is not None or len(index.names) > 1
    # Compile exactly once; the remaining tuple elements are not needed here.
    sql, _, _ = self._to_sql_query(include_index=include_index)
    return sql
384385

385386
@property

tests/system/small/test_dataframe_io.py

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
import google.api_core.exceptions
1818
import pandas as pd
19+
import pandas.testing
1920
import pyarrow as pa
2021
import pytest
2122

@@ -35,6 +36,102 @@
3536
import bigframes.pandas as bpd
3637

3738

39+
def test_sql_executes(scalars_df_default_index, bigquery_client):
    """Test that DataFrame.sql returns executable SQL without an unnamed index.

    DF.sql is used in public documentation such as
    https://cloud.google.com/blog/products/data-analytics/using-bigquery-dataframes-with-carto-geospatial-tools
    as a way to pass a DataFrame on to carto without executing the SQL
    immediately.

    Make sure that this SQL can be run outside of BigQuery DataFrames (assuming
    similar credentials / access to the referenced tables).

    The index name is cleared before reading ``df.sql``, so both results are
    compared with their indexes dropped.
    """
    # Do some operations to make for more complex SQL.
    df = (
        scalars_df_default_index.drop(columns=["geography_col"])
        .groupby("string_col")
        .max()
    )
    df.index.name = None  # Don't include unnamed indexes.
    query = df.sql

    # Sort both sides on the same column so the comparison does not depend on
    # the order in which rows come back.
    bf_result = df.to_pandas().sort_values("rowindex").reset_index(drop=True)
    bq_result = (
        bigquery_client.query_and_wait(query)
        .to_dataframe()
        .sort_values("rowindex")
        .reset_index(drop=True)
    )
    pandas.testing.assert_frame_equal(bf_result, bq_result, check_dtype=False)
67+
68+
69+
def test_sql_executes_and_includes_named_index(
    scalars_df_default_index, bigquery_client
):
    """Test that DataFrame.sql returns executable SQL that keeps a named index.

    DF.sql is used in public documentation such as
    https://cloud.google.com/blog/products/data-analytics/using-bigquery-dataframes-with-carto-geospatial-tools
    as a way to pass a DataFrame on to carto without executing the SQL
    immediately.

    Make sure that this SQL can be run outside of BigQuery DataFrames (assuming
    similar credentials / access to the referenced tables).

    The query result is re-indexed on ``string_col`` before comparison, so the
    test fails if that column is missing from the generated SQL.
    """
    # Do some operations to make for more complex SQL.
    df = (
        scalars_df_default_index.drop(columns=["geography_col"])
        .groupby("string_col")
        .max()
    )
    query = df.sql

    # Sort both sides on the same column so the comparison does not depend on
    # the order in which rows come back.
    bf_result = df.to_pandas().sort_values("rowindex")
    bq_result = (
        bigquery_client.query_and_wait(query)
        .to_dataframe()
        .set_index("string_col")
        .sort_values("rowindex")
    )
    pandas.testing.assert_frame_equal(
        bf_result, bq_result, check_dtype=False, check_index_type=False
    )
100+
101+
102+
def test_sql_executes_and_includes_named_multiindex(
    scalars_df_default_index, bigquery_client
):
    """Test that DataFrame.sql returns executable SQL that keeps MultiIndex levels.

    DF.sql is used in public documentation such as
    https://cloud.google.com/blog/products/data-analytics/using-bigquery-dataframes-with-carto-geospatial-tools
    as a way to pass a DataFrame on to carto without executing the SQL
    immediately.

    Make sure that this SQL can be run outside of BigQuery DataFrames (assuming
    similar credentials / access to the referenced tables).

    The query result is re-indexed on ``string_col`` and ``bool_col`` before
    comparison, so the test fails if either column is missing from the
    generated SQL.
    """
    # Do some operations to make for more complex SQL.
    df = (
        scalars_df_default_index.drop(columns=["geography_col"])
        .groupby(["string_col", "bool_col"])
        .max()
    )
    query = df.sql

    # Sort both sides on the same column so the comparison does not depend on
    # the order in which rows come back.
    bf_result = df.to_pandas().sort_values("rowindex")
    bq_result = (
        bigquery_client.query_and_wait(query)
        .to_dataframe()
        .set_index(["string_col", "bool_col"])
        .sort_values("rowindex")
    )
    pandas.testing.assert_frame_equal(
        bf_result, bq_result, check_dtype=False, check_index_type=False
    )
133+
134+
38135
def test_to_pandas_w_correct_dtypes(scalars_df_default_index):
39136
"""Verify to_pandas() APIs returns the expected dtypes."""
40137
actual = scalars_df_default_index.to_pandas().dtypes

0 commit comments

Comments
 (0)