Skip to content

Commit 79a4f24

Browse files
committed
fix: read_csv fails when check file size for wildcard gcs files
1 parent 935af10 commit 79a4f24

File tree

4 files changed

+50
-43
lines changed

4 files changed

+50
-43
lines changed

bigframes/session/__init__.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1344,12 +1344,13 @@ def read_json(
13441344
def _check_file_size(self, filepath: str):
13451345
max_size = 1024 * 1024 * 1024 # 1 GB in bytes
13461346
if filepath.startswith("gs://"): # GCS file path
1347+
bucket_name, blob_path = filepath.split("/", 3)[2:]
1348+
13471349
client = storage.Client()
1348-
bucket_name, blob_name = filepath.split("/", 3)[2:]
13491350
bucket = client.bucket(bucket_name)
1350-
blob = bucket.blob(blob_name)
1351-
blob.reload()
1352-
file_size = blob.size
1351+
1352+
matching_blobs = bucket.list_blobs(match_glob=blob_path)
1353+
file_size = sum(blob.size for blob in matching_blobs)
13531354
elif os.path.exists(filepath): # local file path
13541355
file_size = os.path.getsize(filepath)
13551356
else:

tests/system/small/test_dataframe_io.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -423,11 +423,14 @@ def test_to_csv_index(
423423
dtype.pop("rowindex")
424424
# read_csv will decode into bytes improperly, convert_pandas_dtypes will encode properly from string
425425
dtype.pop("bytes_col")
426+
# `expand=True` is needed to read from wildcard paths. See details:
427+
# https://github.com/fsspec/gcsfs/issues/616
426428
gcs_df = pd.read_csv(
427-
utils.get_first_file_from_wildcard(path),
429+
path,
428430
dtype=dtype,
429431
date_format={"timestamp_col": "YYYY-MM-DD HH:MM:SS Z"},
430432
index_col=index_col,
433+
storage_options={"expand": True},
431434
)
432435
utils.convert_pandas_dtypes(gcs_df, bytes_col=True)
433436
gcs_df.index.name = scalars_df.index.name
@@ -462,11 +465,14 @@ def test_to_csv_tabs(
462465
# read_csv will decode into bytes improperly, convert_pandas_dtypes will encode properly from string
463466
dtype.pop("bytes_col")
464467
gcs_df = pd.read_csv(
465-
utils.get_first_file_from_wildcard(path),
468+
path,
466469
sep="\t",
467470
dtype=dtype,
468471
date_format={"timestamp_col": "YYYY-MM-DD HH:MM:SS Z"},
469472
index_col=index_col,
473+
# `expand=True` is needed to read from wildcard paths. See details:
474+
# https://github.com/fsspec/gcsfs/issues/616
475+
storage_options={"expand": True},
470476
)
471477
utils.convert_pandas_dtypes(gcs_df, bytes_col=True)
472478
gcs_df.index.name = scalars_df.index.name
@@ -959,10 +965,13 @@ def test_to_json_index_records_orient(
959965

960966
scalars_df.to_json(path, index=index, orient="records", lines=True)
961967

968+
# `expand=True` is needed to read from wildcard paths. See details:
969+
# https://github.com/fsspec/gcsfs/issues/616
962970
gcs_df = pd.read_json(
963-
utils.get_first_file_from_wildcard(path),
971+
path,
964972
lines=True,
965973
convert_dates=["datetime_col"],
974+
storage_options={"expand": True},
966975
)
967976
utils.convert_pandas_dtypes(gcs_df, bytes_col=True)
968977
if index and scalars_df.index.name is not None:

tests/system/small/test_series.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,7 @@
3232
import bigframes.features
3333
import bigframes.pandas
3434
import bigframes.series as series
35-
from bigframes.testing.utils import (
36-
assert_pandas_df_equal,
37-
assert_series_equal,
38-
get_first_file_from_wildcard,
39-
)
35+
from bigframes.testing.utils import assert_pandas_df_equal, assert_series_equal
4036

4137

4238
def test_series_construct_copy(scalars_dfs):
@@ -3344,7 +3340,9 @@ def test_to_frame_no_name(scalars_dfs):
33443340
def test_to_json(gcs_folder, scalars_df_index, scalars_pandas_df_index):
33453341
path = gcs_folder + "test_series_to_json*.jsonl"
33463342
scalars_df_index["int64_col"].to_json(path, lines=True, orient="records")
3347-
gcs_df = pd.read_json(get_first_file_from_wildcard(path), lines=True)
3343+
# `expand=True` is needed to read from wildcard paths. See details:
3344+
# https://github.com/fsspec/gcsfs/issues/616
3345+
gcs_df = pd.read_json(path, lines=True, storage_options={"expand": True})
33483346

33493347
pd.testing.assert_series_equal(
33503348
gcs_df["int64_col"].astype(pd.Int64Dtype()),
@@ -3357,7 +3355,9 @@ def test_to_json(gcs_folder, scalars_df_index, scalars_pandas_df_index):
33573355
def test_to_csv(gcs_folder, scalars_df_index, scalars_pandas_df_index):
33583356
path = gcs_folder + "test_series_to_csv*.csv"
33593357
scalars_df_index["int64_col"].to_csv(path)
3360-
gcs_df = pd.read_csv(get_first_file_from_wildcard(path))
3358+
# `expand=True` is needed to read from wildcard paths. See details:
3359+
# https://github.com/fsspec/gcsfs/issues/616
3360+
gcs_df = pd.read_csv(path, storage_options={"expand": True})
33613361

33623362
pd.testing.assert_series_equal(
33633363
gcs_df["int64_col"].astype(pd.Int64Dtype()),

tests/system/small/test_session.py

Lines changed: 26 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1041,15 +1041,7 @@ def test_read_pandas_w_nested_json_fails(session, write_engine):
10411041
session.read_pandas(pd_s, write_engine=write_engine)
10421042

10431043

1044-
@pytest.mark.parametrize(
1045-
("write_engine"),
1046-
[
1047-
pytest.param("default"),
1048-
pytest.param("bigquery_inline"),
1049-
pytest.param("bigquery_streaming"),
1050-
pytest.param("bigquery_write"),
1051-
],
1052-
)
1044+
@all_write_engines
10531045
def test_read_pandas_w_nested_json(session, write_engine):
10541046
# TODO: supply a reason why this isn't compatible with pandas 1.x
10551047
pytest.importorskip("pandas", minversion="2.0.0")
@@ -1074,15 +1066,7 @@ def test_read_pandas_w_nested_json(session, write_engine):
10741066
pd.testing.assert_series_equal(bq_s, pd_s)
10751067

10761068

1077-
@pytest.mark.parametrize(
1078-
("write_engine"),
1079-
[
1080-
pytest.param("default"),
1081-
pytest.param("bigquery_inline"),
1082-
pytest.param("bigquery_load"),
1083-
pytest.param("bigquery_streaming"),
1084-
],
1085-
)
1069+
@all_write_engines
10861070
def test_read_pandas_w_nested_invalid_json(session, write_engine):
10871071
# TODO: supply a reason why this isn't compatible with pandas 1.x
10881072
pytest.importorskip("pandas", minversion="2.0.0")
@@ -1127,15 +1111,7 @@ def test_read_pandas_w_nested_json_index_fails(session, write_engine):
11271111
session.read_pandas(pd_idx, write_engine=write_engine)
11281112

11291113

1130-
@pytest.mark.parametrize(
1131-
("write_engine"),
1132-
[
1133-
pytest.param("default"),
1134-
pytest.param("bigquery_inline"),
1135-
pytest.param("bigquery_streaming"),
1136-
pytest.param("bigquery_write"),
1137-
],
1138-
)
1114+
@all_write_engines
11391115
def test_read_pandas_w_nested_json_index(session, write_engine):
11401116
# TODO: supply a reason why this isn't compatible with pandas 1.x
11411117
pytest.importorskip("pandas", minversion="2.0.0")
@@ -1287,6 +1263,28 @@ def test_read_csv_raises_error_for_invalid_index_col(
12871263
session.read_csv(path, engine="bigquery", index_col=index_col)
12881264

12891265

1266+
def test_read_csv_for_gcs_wildcard_path(session, df_and_gcs_csv):
1267+
scalars_pandas_df, path = df_and_gcs_csv
1268+
path = path.replace(".csv", "*.csv")
1269+
1270+
index_col = "rowindex"
1271+
bf_df = session.read_csv(path, engine="bigquery", index_col=index_col)
1272+
1273+
# Convert default pandas dtypes to match BigQuery DataFrames dtypes.
1274+
# Also, `expand=True` is needed to read from wildcard paths. See details:
1275+
# https://github.com/fsspec/gcsfs/issues/616
1276+
pd_df = session.read_csv(
1277+
path,
1278+
index_col=index_col,
1279+
dtype=scalars_pandas_df.dtypes.to_dict(),
1280+
storage_options={"expand": True},
1281+
)
1282+
1283+
assert bf_df.shape == pd_df.shape
1284+
assert bf_df.columns.tolist() == pd_df.columns.tolist()
1285+
pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas())
1286+
1287+
12901288
def test_read_csv_for_names(session, df_and_gcs_csv_for_two_columns):
12911289
_, path = df_and_gcs_csv_for_two_columns
12921290

@@ -1566,10 +1564,9 @@ def test_read_csv_default_engine_throws_not_implemented_error(
15661564
gcs_folder
15671565
+ "test_read_csv_gcs_default_engine_throws_not_implemented_error*.csv"
15681566
)
1569-
read_path = utils.get_first_file_from_wildcard(path)
15701567
scalars_df_index.to_csv(path)
15711568
with pytest.raises(NotImplementedError, match=match):
1572-
session.read_csv(read_path, **kwargs)
1569+
session.read_csv(path, **kwargs)
15731570

15741571

15751572
@pytest.mark.parametrize(

0 commit comments

Comments
 (0)