Skip to content

Commit 624d381

Browse files
authored
chore: add experimental bpd.from_glob_path method (#1230)
* chore: add experimental bpd.from_glob_path method * fix mypy * fix mypy
1 parent bcbc732 commit 624d381

File tree

3 files changed

+73
-0
lines changed

3 files changed

+73
-0
lines changed

bigframes/pandas/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
import bigframes.enums
3737
import bigframes.functions._utils as functions_utils
3838
from bigframes.pandas.io.api import (
39+
from_glob_path,
3940
read_csv,
4041
read_gbq,
4142
read_gbq_function,
@@ -311,6 +312,7 @@ def reset_session():
311312
"read_pickle",
312313
"remote_function",
313314
"to_datetime",
315+
"from_glob_path",
314316
# pandas dtype attributes
315317
"NA",
316318
"BooleanDtype",

bigframes/pandas/io/api.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -307,6 +307,20 @@ def read_gbq_function(function_name: str, is_row_processor: bool = False):
307307
read_gbq_function.__doc__ = inspect.getdoc(bigframes.session.Session.read_gbq_function)
308308

309309

310+
def from_glob_path(
    path: str, *, connection: Optional[str] = None, name: Optional[str] = None
) -> bigframes.dataframe.DataFrame:
    # Module-level convenience wrapper: forwards every argument, by keyword,
    # to Session.from_glob_path through the default global session. No inline
    # docstring is written here because __doc__ is assigned from the Session
    # method right after this definition.
    session_method = bigframes.session.Session.from_glob_path
    forwarded = dict(path=path, connection=connection, name=name)
    return global_session.with_default_session(session_method, **forwarded)
319+
320+
321+
# Reuse the Session method's docstring so the module-level wrapper is
# documented with the same text as Session.from_glob_path.
from_glob_path.__doc__ = inspect.getdoc(bigframes.session.Session.from_glob_path)
322+
323+
310324
def _set_default_session_location_if_possible(query):
311325
# Set the location as per the query if this is the first query the user is
312326
# running and:

bigframes/session/__init__.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1475,6 +1475,63 @@ def _start_query_ml_ddl(
14751475
self.bqclient, sql, job_config, metrics=self._metrics
14761476
)
14771477

1478+
def _create_object_table(self, path: str, connection: str) -> str:
    """Create a random id Object Table from the input path and connection.

    Args:
        path (str):
            URI pattern placed in the table's ``uris`` option.
        connection (str):
            Connection name placed in the ``WITH CONNECTION`` clause.

    Returns:
        str:
            The id of the newly created table, as produced by the storage
            manager's random-table helper.
    """
    import textwrap

    table = str(self._loader._storage_manager._random_table())

    # ``path`` is caller-supplied and ends up inside a single-quoted SQL
    # string literal. Escape backslashes first (so the escapes added for
    # quotes are not themselves doubled), then single quotes; otherwise a
    # quote in the path would terminate the literal early and corrupt, or
    # inject into, the DDL statement.
    escaped_path = path.replace("\\", "\\\\").replace("'", "\\'")

    sql = textwrap.dedent(
        f"""
        CREATE EXTERNAL TABLE `{table}`
        WITH CONNECTION `{connection}`
        OPTIONS(
            object_metadata = 'SIMPLE',
            uris = ['{escaped_path}']);
        """
    )
    bf_io_bigquery.start_query_with_client(
        self.bqclient,
        sql,
        job_config=bigquery.QueryJobConfig(),
        metrics=self._metrics,
    )

    return table
1501+
1502+
def from_glob_path(
    self, path: str, *, connection: Optional[str] = None, name: Optional[str] = None
) -> dataframe.DataFrame:
    r"""Create a BigFrames DataFrame that contains a BigFrames Blob column from a glob wildcard path.

    Args:
        path (str):
            The wildcard glob path, such as "gs://<bucket>/<folder>/\*".
        connection (str or None, default None):
            Connection to connect with remote service. str of the format <PROJECT_NUMBER/PROJECT_ID>.<LOCATION>.<CONNECTION_ID>.
            If None, use default connection in session context. BigQuery DataFrame will try to create the connection and attach
            permission if the connection isn't fully set up.
        name (str or None, default None):
            The column name of the Blob column.

    Returns:
        bigframes.pandas.DataFrame:
            Result BigFrames DataFrame.

    Raises:
        NotImplementedError:
            If ``bigframes.options.experiments.blob`` is not enabled.
    """
    if not bigframes.options.experiments.blob:
        # Same exception type as before, but tell the user which flag gates
        # this experimental feature instead of raising with no message.
        raise NotImplementedError(
            "from_glob_path is experimental and requires "
            "bigframes.options.experiments.blob to be enabled."
        )

    # Fall back to the session's configured connection when none is given,
    # then expand it to a fully qualified name using the session's project
    # and location as defaults.
    connection = connection or self._bq_connection
    connection = bigframes.clients.resolve_full_bq_connection_name(
        connection,
        default_project=self._project,
        default_location=self._location,
    )

    table = self._create_object_table(path, connection)

    # Read the object table back, turn its "uri" column into a blob series,
    # and return it as a single-column DataFrame named ``name``.
    s = self.read_gbq(table)["uri"].str.to_blob(connection)
    return s.rename(name).to_frame()
1534+
14781535

14791536
def connect(context: Optional[bigquery_options.BigQueryOptions] = None) -> Session:
14801537
return Session(context)

0 commit comments

Comments
 (0)