Skip to content

Commit 79a4f24

Browse files
committed
fix: read_csv fails when check file size for wildcard gcs files
1 parent 935af10 commit 79a4f24

File tree

4 files changed

+50
-43
lines changed

4 files changed

+50
-43
lines changed

bigframes/session/__init__.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1344,12 +1344,13 @@ def read_json(
13441344
def _check_file_size(self, filepath: str):
13451345
max_size = 1024 * 1024 * 1024 # 1 GB in bytes
13461346
if filepath.startswith("gs://"): # GCS file path
1347+
bucket_name, blob_path = filepath.split("/", 3)[2:]
1348+
13471349
client = storage.Client()
1348-
bucket_name, blob_name = filepath.split("/", 3)[2:]
13491350
bucket = client.bucket(bucket_name)
1350-
blob = bucket.blob(blob_name)
1351-
blob.reload()
1352-
file_size = blob.size
1351+
1352+
matching_blobs = bucket.list_blobs(match_glob=blob_path)
1353+
file_size = sum(blob.size for blob in matching_blobs)
13531354
elif os.path.exists(filepath): # local file path
13541355
file_size = os.path.getsize(filepath)
13551356
else:

tests/system/small/test_dataframe_io.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -423,11 +423,14 @@ def test_to_csv_index(
423423
dtype.pop("rowindex")
424424
# read_csv will decode into bytes improperly, convert_pandas_dtypes will encode properly from string
425425
dtype.pop("bytes_col")
426+
# `expand=True` is needed to read from wildcard paths. See details:
427+
# https://github.com/fsspec/gcsfs/issues/616
426428
gcs_df = pd.read_csv(
427-
utils.get_first_file_from_wildcard(path),
429+
path,
428430
dtype=dtype,
429431
date_format={"timestamp_col": "YYYY-MM-DD HH:MM:SS Z"},
430432
index_col=index_col,
433+
storage_options={"expand": True},
431434
)
432435
utils.convert_pandas_dtypes(gcs_df, bytes_col=True)
433436
gcs_df.index.name = scalars_df.index.name
@@ -462,11 +465,14 @@ def test_to_csv_tabs(
462465
# read_csv will decode into bytes improperly, convert_pandas_dtypes will encode properly from string
463466
dtype.pop("bytes_col")
464467
gcs_df = pd.read_csv(
465-
utils.get_first_file_from_wildcard(path),
468+
path,
466469
sep="\t",
467470
dtype=dtype,
468471
date_format={"timestamp_col": "YYYY-MM-DD HH:MM:SS Z"},
469472
index_col=index_col,
473+
# `expand=True` is needed to read from wildcard paths. See details:
474+
# https://github.com/fsspec/gcsfs/issues/616
475+
storage_options={"expand": True},
470476
)
471477
utils.convert_pandas_dtypes(gcs_df, bytes_col=True)
472478
gcs_df.index.name = scalars_df.index.name
@@ -959,10 +965,13 @@ def test_to_json_index_records_orient(
959965

960966
scalars_df.to_json(path, index=index, orient="records", lines=True)
961967

968+
# `expand=True` is needed to read from wildcard paths. See details:
969+
# https://github.com/fsspec/gcsfs/issues/616
962970
gcs_df = pd.read_json(
963-
utils.get_first_file_from_wildcard(path),
971+
path,
964972
lines=True,
965973
convert_dates=["datetime_col"],
974+
storage_options={"expand": True},
966975
)
967976
utils.convert_pandas_dtypes(gcs_df, bytes_col=True)
968977
if index and scalars_df.index.name is not None:

tests/system/small/test_series.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,7 @@
3232
import bigframes.features
3333
import bigframes.pandas
3434
import bigframes.series as series
35-
from bigframes.testing.utils import (
36-
assert_pandas_df_equal,
37-
assert_series_equal,
38-
get_first_file_from_wildcard,
39-
)
35+
from bigframes.testing.utils import assert_pandas_df_equal, assert_series_equal
4036

4137

4238
def test_series_construct_copy(scalars_dfs):
@@ -3344,7 +3340,9 @@ def test_to_frame_no_name(scalars_dfs):
33443340
def test_to_json(gcs_folder, scalars_df_index, scalars_pandas_df_index):
33453341
path = gcs_folder + "test_series_to_json*.jsonl"
33463342
scalars_df_index["int64_col"].to_json(path, lines=True, orient="records")
3347-
gcs_df = pd.read_json(get_first_file_from_wildcard(path), lines=True)
3343+
# `expand=True` is needed to read from wildcard paths. See details:
3344+
# https://github.com/fsspec/gcsfs/issues/616
3345+
gcs_df = pd.read_json(path, lines=True, storage_options={"expand": True})
33483346

33493347
pd.testing.assert_series_equal(
33503348
gcs_df["int64_col"].astype(pd.Int64Dtype()),
@@ -3357,7 +3355,9 @@ def test_to_json(gcs_folder, scalars_df_index, scalars_pandas_df_index):
33573355
def test_to_csv(gcs_folder, scalars_df_index, scalars_pandas_df_index):
33583356
path = gcs_folder + "test_series_to_csv*.csv"
33593357
scalars_df_index["int64_col"].to_csv(path)
3360-
gcs_df = pd.read_csv(get_first_file_from_wildcard(path))
3358+
# `expand=True` is needed to read from wildcard paths. See details:
3359+
# https://github.com/fsspec/gcsfs/issues/616
3360+
gcs_df = pd.read_csv(path, storage_options={"expand": True})
33613361

33623362
pd.testing.assert_series_equal(
33633363
gcs_df["int64_col"].astype(pd.Int64Dtype()),

tests/system/small/test_session.py

Lines changed: 26 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1041,15 +1041,7 @@ def test_read_pandas_w_nested_json_fails(session, write_engine):
10411041
session.read_pandas(pd_s, write_engine=write_engine)
10421042

10431043

1044-
@pytest.mark.parametrize(
1045-
("write_engine"),
1046-
[
1047-
pytest.param("default"),
1048-
pytest.param("bigquery_inline"),
1049-
pytest.param("bigquery_streaming"),
1050-
pytest.param("bigquery_write"),
1051-
],
1052-
)
1044+
@all_write_engines
10531045
def test_read_pandas_w_nested_json(session, write_engine):
10541046
# TODO: supply a reason why this isn't compatible with pandas 1.x
10551047
pytest.importorskip("pandas", minversion="2.0.0")
@@ -1074,15 +1066,7 @@ def test_read_pandas_w_nested_json(session, write_engine):
10741066
pd.testing.assert_series_equal(bq_s, pd_s)
10751067

10761068

1077-
@pytest.mark.parametrize(
1078-
("write_engine"),
1079-
[
1080-
pytest.param("default"),
1081-
pytest.param("bigquery_inline"),
1082-
pytest.param("bigquery_load"),
1083-
pytest.param("bigquery_streaming"),
1084-
],
1085-
)
1069+
@all_write_engines
10861070
def test_read_pandas_w_nested_invalid_json(session, write_engine):
10871071
# TODO: supply a reason why this isn't compatible with pandas 1.x
10881072
pytest.importorskip("pandas", minversion="2.0.0")
@@ -1127,15 +1111,7 @@ def test_read_pandas_w_nested_json_index_fails(session, write_engine):
11271111
session.read_pandas(pd_idx, write_engine=write_engine)
11281112

11291113

1130-
@pytest.mark.parametrize(
1131-
("write_engine"),
1132-
[
1133-
pytest.param("default"),
1134-
pytest.param("bigquery_inline"),
1135-
pytest.param("bigquery_streaming"),
1136-
pytest.param("bigquery_write"),
1137-
],
1138-
)
1114+
@all_write_engines
11391115
def test_read_pandas_w_nested_json_index(session, write_engine):
11401116
# TODO: supply a reason why this isn't compatible with pandas 1.x
11411117
pytest.importorskip("pandas", minversion="2.0.0")
@@ -1287,6 +1263,28 @@ def test_read_csv_raises_error_for_invalid_index_col(
12871263
session.read_csv(path, engine="bigquery", index_col=index_col)
12881264

12891265

1266+
def test_read_csv_for_gcs_wildcard_path(session, df_and_gcs_csv):
1267+
scalars_pandas_df, path = df_and_gcs_csv
1268+
path = path.replace(".csv", "*.csv")
1269+
1270+
index_col = "rowindex"
1271+
bf_df = session.read_csv(path, engine="bigquery", index_col=index_col)
1272+
1273+
# Convert default pandas dtypes to match BigQuery DataFrames dtypes.
1274+
# Also, `expand=True` is needed to read from wildcard paths. See details:
1275+
# https://github.com/fsspec/gcsfs/issues/616
1276+
pd_df = session.read_csv(
1277+
path,
1278+
index_col=index_col,
1279+
dtype=scalars_pandas_df.dtypes.to_dict(),
1280+
storage_options={"expand": True},
1281+
)
1282+
1283+
assert bf_df.shape == pd_df.shape
1284+
assert bf_df.columns.tolist() == pd_df.columns.tolist()
1285+
pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas())
1286+
1287+
12901288
def test_read_csv_for_names(session, df_and_gcs_csv_for_two_columns):
12911289
_, path = df_and_gcs_csv_for_two_columns
12921290

@@ -1566,10 +1564,9 @@ def test_read_csv_default_engine_throws_not_implemented_error(
15661564
gcs_folder
15671565
+ "test_read_csv_gcs_default_engine_throws_not_implemented_error*.csv"
15681566
)
1569-
read_path = utils.get_first_file_from_wildcard(path)
15701567
scalars_df_index.to_csv(path)
15711568
with pytest.raises(NotImplementedError, match=match):
1572-
session.read_csv(read_path, **kwargs)
1569+
session.read_csv(path, **kwargs)
15731570

15741571

15751572
@pytest.mark.parametrize(

0 commit comments

Comments
 (0)