
Commit f192324

chore: add guidance for read_parquet using wildcard paths (#1149)
1 parent: b4d17ff

2 files changed: +44 -11 lines changed

bigframes/session/__init__.py

Lines changed: 8 additions & 0 deletions
```diff
@@ -946,6 +946,14 @@ def read_parquet(
                 path, table, job_config=job_config
             )
         else:
+            if "*" in path:
+                raise ValueError(
+                    "The provided path contains a wildcard character (*), which is not "
+                    "supported by the current engine. To read files from wildcard paths, "
+                    "please use the 'bigquery' engine by setting `engine='bigquery'` in "
+                    "your configuration."
+                )
+
             read_parquet_kwargs: Dict[str, Any] = {}
             if pandas.__version__.startswith("1."):
                 read_parquet_kwargs["use_nullable_dtypes"] = True
```
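For reference, a minimal sketch of what this guard means for callers. The bucket path is made up, and the session setup assumes the standard `bigframes.pandas` global session; neither is part of this commit:

```python
import bigframes.pandas as bpd

session = bpd.get_global_session()

# Hypothetical GCS path; any "*" in the path now triggers the check above.
wildcard_path = "gs://my-bucket/exports/part-*.parquet"

# Local parsing engines (e.g. "auto") fail fast with the new ValueError
# instead of handing an unreadable wildcard path to pandas.
try:
    session.read_parquet(wildcard_path, engine="auto")
except ValueError as exc:
    print(exc)

# The BigQuery engine expands the wildcard server-side, so this works.
df = session.read_parquet(wildcard_path, engine="bigquery")
```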

tests/system/small/test_session.py

Lines changed: 36 additions & 11 deletions
```diff
@@ -1126,32 +1126,57 @@ def test_read_pickle_gcs(session, penguins_pandas_df_default_index, gcs_folder):


 @pytest.mark.parametrize(
-    ("engine",),
+    ("engine", "filename"),
     (
-        ("auto",),
-        ("bigquery",),
+        pytest.param(
+            "auto",
+            "000000000000.parquet",
+            id="auto",
+        ),
+        pytest.param(
+            "pyarrow",
+            "000000000000.parquet",
+            id="pyarrow",
+        ),
+        pytest.param(
+            "bigquery",
+            "000000000000.parquet",
+            id="bigquery",
+        ),
+        pytest.param(
+            "bigquery",
+            "*.parquet",
+            id="bigquery_wildcard",
+        ),
+        pytest.param(
+            "auto",
+            "*.parquet",
+            id="auto_wildcard",
+            marks=pytest.mark.xfail(
+                raises=ValueError,
+            ),
+        ),
     ),
 )
-def test_read_parquet_gcs(session: bigframes.Session, scalars_dfs, gcs_folder, engine):
+def test_read_parquet_gcs(
+    session: bigframes.Session, scalars_dfs, gcs_folder, engine, filename
+):
     scalars_df, _ = scalars_dfs
     # Include wildcard so that multiple files can be written/read if > 1 GB.
     # https://cloud.google.com/bigquery/docs/exporting-data#exporting_data_into_one_or_more_files
-    path = gcs_folder + test_read_parquet_gcs.__name__ + "*.parquet"
+    write_path = gcs_folder + test_read_parquet_gcs.__name__ + "*.parquet"
+    read_path = gcs_folder + test_read_parquet_gcs.__name__ + filename

     df_in: bigframes.dataframe.DataFrame = scalars_df.copy()
     # GEOGRAPHY not supported in parquet export.
     df_in = df_in.drop(columns="geography_col")
     # Make sure we can also serialize the order.
     df_write = df_in.reset_index(drop=False)
     df_write.index.name = f"ordering_id_{random.randrange(1_000_000)}"
-    df_write.to_parquet(path, index=True)
-
-    # Only bigquery engine for reads supports wildcards in path name.
-    if engine != "bigquery":
-        path = utils.get_first_file_from_wildcard(path)
+    df_write.to_parquet(write_path, index=True)

     df_out = (
-        session.read_parquet(path, engine=engine)
+        session.read_parquet(read_path, engine=engine)
         # Restore order.
         .set_index(df_write.index.name).sort_index()
         # Restore index.
```