Skip to content

Commit 2d18815

Browse files
refactor: move reader functions from __init__.py to a separate file under the pandas package (#1023)
* refactor: move reader functions from __init__.py to a separate file under pandas package
* fix typo in __all__
* move read functions under bigframes.pandas.io.api to reflect the structure of pandas
* move read functions under bigframes.pandas.io.api to reflect the structure of pandas
* 🦉 Updates from OwlBot post-processor
  See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md
* update function import
* fix missing comma
* try to fix dup doc by directly importing types from pandas
* fix doc generation
---------
Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
1 parent ff11ac8 commit 2d18815

File tree

5 files changed

+382
-318
lines changed

5 files changed

+382
-318
lines changed

bigframes/pandas/__init__.py

Lines changed: 19 additions & 316 deletions
Original file line numberDiff line numberDiff line change
@@ -21,37 +21,15 @@
2121
import inspect
2222
import sys
2323
import typing
24-
from typing import (
25-
Any,
26-
Callable,
27-
Dict,
28-
IO,
29-
Iterable,
30-
List,
31-
Literal,
32-
MutableSequence,
33-
Optional,
34-
Sequence,
35-
Tuple,
36-
Union,
37-
)
24+
from typing import Any, Iterable, List, Literal, Optional, Sequence, Tuple, Union
3825

3926
import bigframes_vendored.constants as constants
4027
import bigframes_vendored.pandas.core.reshape.concat as vendored_pandas_concat
4128
import bigframes_vendored.pandas.core.reshape.encoding as vendored_pandas_encoding
4229
import bigframes_vendored.pandas.core.reshape.merge as vendored_pandas_merge
4330
import bigframes_vendored.pandas.core.reshape.tile as vendored_pandas_tile
4431
import bigframes_vendored.pandas.core.tools.datetimes as vendored_pandas_datetimes
45-
import bigframes_vendored.pandas.io.gbq as vendored_pandas_gbq
46-
from google.cloud import bigquery
47-
import numpy
4832
import pandas
49-
from pandas._typing import (
50-
CompressionOptions,
51-
FilePath,
52-
ReadPickleBuffer,
53-
StorageOptions,
54-
)
5533

5634
import bigframes._config as config
5735
import bigframes.core.blocks
@@ -65,6 +43,18 @@
6543
import bigframes.enums
6644
import bigframes.functions._utils as functions_utils
6745
import bigframes.operations as ops
46+
from bigframes.pandas.io.api import (
47+
read_csv,
48+
read_gbq,
49+
read_gbq_function,
50+
read_gbq_model,
51+
read_gbq_query,
52+
read_gbq_table,
53+
read_json,
54+
read_pandas,
55+
read_parquet,
56+
read_pickle,
57+
)
6858
import bigframes.series
6959
import bigframes.session
7060
import bigframes.session._io.bigquery
@@ -373,286 +363,6 @@ def merge(
373363
merge.__doc__ = vendored_pandas_merge.merge.__doc__
374364

375365

376-
def _set_default_session_location_if_possible(query):
377-
# Set the location as per the query if this is the first query the user is
378-
# running and:
379-
# (1) Default session has not started yet, and
380-
# (2) Location is not set yet, and
381-
# (3) Use of regional endpoints is not set.
382-
# If query is a table name, then it would be the location of the table.
383-
# If query is a SQL with a table, then it would be table's location.
384-
# If query is a SQL with no table, then it would be the BQ default location.
385-
if (
386-
options.bigquery._session_started
387-
or options.bigquery.location
388-
or options.bigquery.use_regional_endpoints
389-
):
390-
return
391-
392-
clients_provider = bigframes.session.clients.ClientsProvider(
393-
project=options.bigquery.project,
394-
location=options.bigquery.location,
395-
use_regional_endpoints=options.bigquery.use_regional_endpoints,
396-
credentials=options.bigquery.credentials,
397-
application_name=options.bigquery.application_name,
398-
bq_kms_key_name=options.bigquery.kms_key_name,
399-
)
400-
401-
bqclient = clients_provider.bqclient
402-
403-
if bigframes.session._io.bigquery.is_query(query):
404-
# Intentionally run outside of the session so that we can detect the
405-
# location before creating the session. Since it's a dry_run, labels
406-
# aren't necessary.
407-
job = bqclient.query(query, bigquery.QueryJobConfig(dry_run=True))
408-
options.bigquery.location = job.location
409-
else:
410-
table = bqclient.get_table(query)
411-
options.bigquery.location = table.location
412-
413-
414-
# Note: the following methods are duplicated from Session. This duplication
415-
# enables the following:
416-
#
417-
# 1. Static type checking knows the argument and return types, which is
418-
# difficult to do with decorators. Aside: When we require Python 3.10, we
419-
# can use Concatenate for generic typing in decorators. See:
420-
# https://stackoverflow.com/a/68290080/101923
421-
# 2. docstrings get processed by static processing tools, such as VS Code's
422-
# autocomplete.
423-
# 3. Positional arguments function as expected. If we were to pull in the
424-
# methods directly from Session, a Session object would need to be the first
425-
# argument, even if we allow a default value.
426-
# 4. Allows to set BigQuery options for the BigFrames session based on the
427-
# method and its arguments.
428-
429-
430-
def read_csv(
431-
filepath_or_buffer: str | IO["bytes"],
432-
*,
433-
sep: Optional[str] = ",",
434-
header: Optional[int] = 0,
435-
names: Optional[
436-
Union[MutableSequence[Any], numpy.ndarray[Any, Any], Tuple[Any, ...], range]
437-
] = None,
438-
index_col: Optional[
439-
Union[
440-
int,
441-
str,
442-
Sequence[Union[str, int]],
443-
bigframes.enums.DefaultIndexKind,
444-
Literal[False],
445-
]
446-
] = None,
447-
usecols: Optional[
448-
Union[
449-
MutableSequence[str],
450-
Tuple[str, ...],
451-
Sequence[int],
452-
pandas.Series,
453-
pandas.Index,
454-
numpy.ndarray[Any, Any],
455-
Callable[[Any], bool],
456-
]
457-
] = None,
458-
dtype: Optional[Dict] = None,
459-
engine: Optional[
460-
Literal["c", "python", "pyarrow", "python-fwf", "bigquery"]
461-
] = None,
462-
encoding: Optional[str] = None,
463-
**kwargs,
464-
) -> bigframes.dataframe.DataFrame:
465-
return global_session.with_default_session(
466-
bigframes.session.Session.read_csv,
467-
filepath_or_buffer=filepath_or_buffer,
468-
sep=sep,
469-
header=header,
470-
names=names,
471-
index_col=index_col,
472-
usecols=usecols,
473-
dtype=dtype,
474-
engine=engine,
475-
encoding=encoding,
476-
**kwargs,
477-
)
478-
479-
480-
read_csv.__doc__ = inspect.getdoc(bigframes.session.Session.read_csv)
481-
482-
483-
def read_json(
484-
path_or_buf: str | IO["bytes"],
485-
*,
486-
orient: Literal[
487-
"split", "records", "index", "columns", "values", "table"
488-
] = "columns",
489-
dtype: Optional[Dict] = None,
490-
encoding: Optional[str] = None,
491-
lines: bool = False,
492-
engine: Literal["ujson", "pyarrow", "bigquery"] = "ujson",
493-
**kwargs,
494-
) -> bigframes.dataframe.DataFrame:
495-
return global_session.with_default_session(
496-
bigframes.session.Session.read_json,
497-
path_or_buf=path_or_buf,
498-
orient=orient,
499-
dtype=dtype,
500-
encoding=encoding,
501-
lines=lines,
502-
engine=engine,
503-
**kwargs,
504-
)
505-
506-
507-
read_json.__doc__ = inspect.getdoc(bigframes.session.Session.read_json)
508-
509-
510-
def read_gbq(
511-
query_or_table: str,
512-
*,
513-
index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = (),
514-
columns: Iterable[str] = (),
515-
configuration: Optional[Dict] = None,
516-
max_results: Optional[int] = None,
517-
filters: vendored_pandas_gbq.FiltersType = (),
518-
use_cache: Optional[bool] = None,
519-
col_order: Iterable[str] = (),
520-
) -> bigframes.dataframe.DataFrame:
521-
_set_default_session_location_if_possible(query_or_table)
522-
return global_session.with_default_session(
523-
bigframes.session.Session.read_gbq,
524-
query_or_table,
525-
index_col=index_col,
526-
columns=columns,
527-
configuration=configuration,
528-
max_results=max_results,
529-
filters=filters,
530-
use_cache=use_cache,
531-
col_order=col_order,
532-
)
533-
534-
535-
read_gbq.__doc__ = inspect.getdoc(bigframes.session.Session.read_gbq)
536-
537-
538-
def read_gbq_model(model_name: str):
539-
return global_session.with_default_session(
540-
bigframes.session.Session.read_gbq_model,
541-
model_name,
542-
)
543-
544-
545-
read_gbq_model.__doc__ = inspect.getdoc(bigframes.session.Session.read_gbq_model)
546-
547-
548-
def read_gbq_query(
549-
query: str,
550-
*,
551-
index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = (),
552-
columns: Iterable[str] = (),
553-
configuration: Optional[Dict] = None,
554-
max_results: Optional[int] = None,
555-
use_cache: Optional[bool] = None,
556-
col_order: Iterable[str] = (),
557-
filters: vendored_pandas_gbq.FiltersType = (),
558-
) -> bigframes.dataframe.DataFrame:
559-
_set_default_session_location_if_possible(query)
560-
return global_session.with_default_session(
561-
bigframes.session.Session.read_gbq_query,
562-
query,
563-
index_col=index_col,
564-
columns=columns,
565-
configuration=configuration,
566-
max_results=max_results,
567-
use_cache=use_cache,
568-
col_order=col_order,
569-
filters=filters,
570-
)
571-
572-
573-
read_gbq_query.__doc__ = inspect.getdoc(bigframes.session.Session.read_gbq_query)
574-
575-
576-
def read_gbq_table(
577-
query: str,
578-
*,
579-
index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = (),
580-
columns: Iterable[str] = (),
581-
max_results: Optional[int] = None,
582-
filters: vendored_pandas_gbq.FiltersType = (),
583-
use_cache: bool = True,
584-
col_order: Iterable[str] = (),
585-
) -> bigframes.dataframe.DataFrame:
586-
_set_default_session_location_if_possible(query)
587-
return global_session.with_default_session(
588-
bigframes.session.Session.read_gbq_table,
589-
query,
590-
index_col=index_col,
591-
columns=columns,
592-
max_results=max_results,
593-
filters=filters,
594-
use_cache=use_cache,
595-
col_order=col_order,
596-
)
597-
598-
599-
read_gbq_table.__doc__ = inspect.getdoc(bigframes.session.Session.read_gbq_table)
600-
601-
602-
@typing.overload
603-
def read_pandas(pandas_dataframe: pandas.DataFrame) -> bigframes.dataframe.DataFrame:
604-
...
605-
606-
607-
@typing.overload
608-
def read_pandas(pandas_dataframe: pandas.Series) -> bigframes.series.Series:
609-
...
610-
611-
612-
@typing.overload
613-
def read_pandas(pandas_dataframe: pandas.Index) -> bigframes.core.indexes.Index:
614-
...
615-
616-
617-
def read_pandas(pandas_dataframe: Union[pandas.DataFrame, pandas.Series, pandas.Index]):
618-
return global_session.with_default_session(
619-
bigframes.session.Session.read_pandas,
620-
pandas_dataframe,
621-
)
622-
623-
624-
read_pandas.__doc__ = inspect.getdoc(bigframes.session.Session.read_pandas)
625-
626-
627-
def read_pickle(
628-
filepath_or_buffer: FilePath | ReadPickleBuffer,
629-
compression: CompressionOptions = "infer",
630-
storage_options: StorageOptions = None,
631-
):
632-
return global_session.with_default_session(
633-
bigframes.session.Session.read_pickle,
634-
filepath_or_buffer=filepath_or_buffer,
635-
compression=compression,
636-
storage_options=storage_options,
637-
)
638-
639-
640-
read_pickle.__doc__ = inspect.getdoc(bigframes.session.Session.read_pickle)
641-
642-
643-
def read_parquet(
644-
path: str | IO["bytes"], *, engine: str = "auto"
645-
) -> bigframes.dataframe.DataFrame:
646-
return global_session.with_default_session(
647-
bigframes.session.Session.read_parquet,
648-
path,
649-
engine=engine,
650-
)
651-
652-
653-
read_parquet.__doc__ = inspect.getdoc(bigframes.session.Session.read_parquet)
654-
655-
656366
def remote_function(
657367
input_types: Union[None, type, Sequence[type]] = None,
658368
output_type: Optional[type] = None,
@@ -697,17 +407,6 @@ def remote_function(
697407
remote_function.__doc__ = inspect.getdoc(bigframes.session.Session.remote_function)
698408

699409

700-
def read_gbq_function(function_name: str, is_row_processor: bool = False):
701-
return global_session.with_default_session(
702-
bigframes.session.Session.read_gbq_function,
703-
function_name=function_name,
704-
is_row_processor=is_row_processor,
705-
)
706-
707-
708-
read_gbq_function.__doc__ = inspect.getdoc(bigframes.session.Session.read_gbq_function)
709-
710-
711410
@typing.overload
712411
def to_datetime(
713412
arg: Union[
@@ -893,15 +592,19 @@ def reset_session():
893592
pass
894593

895594
# Use __all__ to let type checkers know what is part of the public API.
896-
__all___ = [
595+
__all__ = [
897596
# Functions
898597
"concat",
899598
"merge",
900599
"read_csv",
901600
"read_gbq",
902601
"read_gbq_function",
903602
"read_gbq_model",
603+
"read_gbq_query",
604+
"read_gbq_table",
605+
"read_json",
904606
"read_pandas",
607+
"read_parquet",
905608
"read_pickle",
906609
"remote_function",
907610
"to_datetime",
@@ -911,7 +614,7 @@ def reset_session():
911614
"Float64Dtype",
912615
"Int64Dtype",
913616
"StringDtype",
914-
"ArrowDtype"
617+
"ArrowDtype",
915618
# Class aliases
916619
"DataFrame",
917620
"Index",

0 commit comments

Comments (0)