Commit fee82dd

fix: read_csv fails when checking file size for wildcard GCS files
1 parent 209d0d4 commit fee82dd

File tree

2 files changed: +44 −4 lines

  bigframes/session/__init__.py
  tests/system/small/test_session.py

bigframes/session/__init__.py

Lines changed: 18 additions & 4 deletions
@@ -18,6 +18,8 @@
 
 from collections import abc
 import datetime
+import fnmatch
+import inspect
 import logging
 import os
 import secrets
@@ -1344,12 +1346,24 @@ def read_json(
     def _check_file_size(self, filepath: str):
         max_size = 1024 * 1024 * 1024  # 1 GB in bytes
         if filepath.startswith("gs://"):  # GCS file path
+            bucket_name, blob_path = filepath.split("/", 3)[2:]
+
             client = storage.Client()
-            bucket_name, blob_name = filepath.split("/", 3)[2:]
             bucket = client.bucket(bucket_name)
-            blob = bucket.blob(blob_name)
-            blob.reload()
-            file_size = blob.size
+
+            list_blobs_params = inspect.signature(bucket.list_blobs).parameters
+            if "match_glob" in list_blobs_params:
+                # Modern, efficient method for new library versions
+                matching_blobs = bucket.list_blobs(match_glob=blob_path)
+                file_size = sum(blob.size for blob in matching_blobs)
+            else:
+                # Fallback method for older library versions
+                prefix = blob_path.split("*", 1)[0]
+                all_blobs = bucket.list_blobs(prefix=prefix)
+                matching_blobs = [
+                    blob for blob in all_blobs if fnmatch.fnmatch(blob.name, blob_path)
+                ]
+                file_size = sum(blob.size for blob in matching_blobs)
         elif os.path.exists(filepath):  # local file path
             file_size = os.path.getsize(filepath)
         else:
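Why the change: the old code passed the whole wildcard path to bucket.blob() and called blob.reload(), which treats a path like "gs://bucket/data*.csv" as a literal object name and fails because no object with a "*" in its name exists. The new code lists the matching objects and sums their sizes, feature-detecting the match_glob keyword that newer google-cloud-storage releases accept on list_blobs, and falling back on older versions to a prefix listing filtered client-side with fnmatch. A minimal standalone sketch of the same pattern (the function name and the gs:// path are hypothetical, not part of the commit):

import fnmatch
import inspect

from google.cloud import storage


def gcs_wildcard_total_size(filepath: str) -> int:
    """Sum the sizes of all GCS objects matching a wildcard path,
    e.g. the hypothetical "gs://my-bucket/data*.csv"."""
    bucket_name, blob_path = filepath.split("/", 3)[2:]
    bucket = storage.Client().bucket(bucket_name)

    if "match_glob" in inspect.signature(bucket.list_blobs).parameters:
        # Newer google-cloud-storage: the server filters objects by glob.
        matching_blobs = bucket.list_blobs(match_glob=blob_path)
    else:
        # Older versions: list objects under the literal prefix before the
        # first "*" and filter client-side with fnmatch.
        prefix = blob_path.split("*", 1)[0]
        matching_blobs = (
            blob
            for blob in bucket.list_blobs(prefix=prefix)
            if fnmatch.fnmatch(blob.name, blob_path)
        )
    return sum(blob.size for blob in matching_blobs)

The inspect.signature check keeps the fix working against pinned older versions of google-cloud-storage, presumably to avoid bumping the minimum dependency.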

tests/system/small/test_session.py

Lines changed: 26 additions & 0 deletions
@@ -1287,6 +1287,32 @@ def test_read_csv_raises_error_for_invalid_index_col(
         session.read_csv(path, engine="bigquery", index_col=index_col)
 
 
+def test_read_csv_for_gcs_wildcard_path(session, df_and_gcs_csv):
+    scalars_pandas_df, path = df_and_gcs_csv
+    path = path.replace(".csv", "*.csv")
+
+    index_col = "rowindex"
+    bf_df = session.read_csv(path, engine="bigquery", index_col=index_col)
+
+    # Convert default pandas dtypes to match BigQuery DataFrames dtypes.
+    # Also, `expand=True` is needed to read from wildcard paths. See details:
+    # https://github.com/fsspec/gcsfs/issues/616
+    if not pd.__version__.startswith("1."):
+        storage_options = {"expand": True}
+    else:
+        storage_options = None
+    pd_df = session.read_csv(
+        path,
+        index_col=index_col,
+        dtype=scalars_pandas_df.dtypes.to_dict(),
+        storage_options=storage_options,
+    )
+
+    assert bf_df.shape == pd_df.shape
+    assert bf_df.columns.tolist() == pd_df.columns.tolist()
+    pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas())
+
+
 def test_read_csv_for_names(session, df_and_gcs_csv_for_two_columns):
     _, path = df_and_gcs_csv_for_two_columns
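With the size check fixed, a wildcard read works end to end. A hedged usage sketch (the bucket and object names are hypothetical; bigframes.pandas.read_csv goes through the default session's read_csv, the method exercised by the test above):

import bigframes.pandas as bpd

# Hypothetical wildcard path: every object matching the glob is counted
# toward the library's pre-read size check and loaded into one DataFrame.
df = bpd.read_csv("gs://my-bucket/scalars*.csv", engine="bigquery")
print(df.shape)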
