Skip to content

Commit 7e78b99

Browse files
authored
ENH: Add max_results argument to read_gbq. (#286)
ENH: Add ``max_results`` argument to ``read_gbq``. This argument allows you to set a maximum number of rows in the resulting DataFrame; set it to 0 to ignore any results.

* Call ``get_table`` on the destination so listing rows works on older versions of the client library.
* Add ``max_results`` to the changelog.
* Fix unit tests for the extra call to ``get_table``.
* Add unit tests for ``max_results``.
* Adjust the coverage fail-under percentage.
1 parent 9990047 commit 7e78b99

File tree

6 files changed

+85
-24
lines changed

6 files changed

+85
-24
lines changed

docs/source/changelog.rst

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,16 @@
11
Changelog
22
=========
33

.. _changelog-0.12.0:

0.12.0 / TBD
------------

- Add ``max_results`` argument to :func:`~pandas_gbq.read_gbq`. Use this
  argument to limit the number of rows in the results DataFrame. Set
  ``max_results`` to 0 to ignore query outputs, such as for DML or DDL
  queries. (:issue:`102`)
414
.. _changelog-0.11.0:
515

616
0.11.0 / 2019-07-29

noxfile.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ def unit(session):
4848
@nox.session
4949
def cover(session, python=latest_python):
5050
session.install("coverage", "pytest-cov")
51-
session.run("coverage", "report", "--show-missing", "--fail-under=40")
51+
session.run("coverage", "report", "--show-missing", "--fail-under=74")
5252
session.run("coverage", "erase")
5353

5454

pandas_gbq/gbq.py

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -436,7 +436,7 @@ def process_http_error(ex):
436436

437437
raise GenericGBQException("Reason: {0}".format(ex))
438438

439-
def run_query(self, query, **kwargs):
439+
def run_query(self, query, max_results=None, **kwargs):
440440
from concurrent.futures import TimeoutError
441441
from google.auth.exceptions import RefreshError
442442
from google.cloud import bigquery
@@ -526,15 +526,33 @@ def run_query(self, query, **kwargs):
526526
)
527527
)
528528

529+
return self._download_results(query_reply, max_results=max_results)
530+
531+
def _download_results(self, query_job, max_results=None):
532+
# No results are desired, so don't bother downloading anything.
533+
if max_results == 0:
534+
return None
535+
536+
if max_results is None:
537+
# Only use the BigQuery Storage API if the full result set is requested.
538+
bqstorage_client = self.bqstorage_client
539+
else:
540+
bqstorage_client = None
541+
529542
try:
530-
rows_iter = query_reply.result()
543+
query_job.result()
544+
# Get the table schema, so that we can list rows.
545+
destination = self.client.get_table(query_job.destination)
546+
rows_iter = self.client.list_rows(
547+
destination, max_results=max_results
548+
)
531549
except self.http_error as ex:
532550
self.process_http_error(ex)
533551

534552
schema_fields = [field.to_api_repr() for field in rows_iter.schema]
535553
nullsafe_dtypes = _bqschema_to_nullsafe_dtypes(schema_fields)
536554
df = rows_iter.to_dataframe(
537-
dtypes=nullsafe_dtypes, bqstorage_client=self.bqstorage_client
555+
dtypes=nullsafe_dtypes, bqstorage_client=bqstorage_client
538556
)
539557

540558
if df.empty:
@@ -812,6 +830,7 @@ def read_gbq(
812830
configuration=None,
813831
credentials=None,
814832
use_bqstorage_api=False,
833+
max_results=None,
815834
verbose=None,
816835
private_key=None,
817836
):
@@ -907,9 +926,16 @@ def read_gbq(
907926
``configuration`` dictionary.
908927
909928
This feature requires the ``google-cloud-bigquery-storage`` and
910-
``fastavro`` packages.
929+
``pyarrow`` packages.
930+
931+
This value is ignored if ``max_results`` is set.
911932
912933
.. versionadded:: 0.10.0
934+
max_results : int, optional
935+
If set, limit the maximum number of rows to fetch from the query
936+
results.
937+
938+
.. versionadded:: 0.12.0
913939
verbose : None, deprecated
914940
Deprecated in Pandas-GBQ 0.4.0. Use the `logging module
915941
to adjust verbosity instead
@@ -969,7 +995,9 @@ def read_gbq(
969995
use_bqstorage_api=use_bqstorage_api,
970996
)
971997

972-
final_df = connector.run_query(query, configuration=configuration)
998+
final_df = connector.run_query(
999+
query, configuration=configuration, max_results=max_results
1000+
)
9731001

9741002
# Reindex the DataFrame on the provided column
9751003
if index_col is not None:

tests/system/test_gbq.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -584,6 +584,30 @@ def test_download_dataset_larger_than_200k_rows(self, project_id):
584584
)
585585
assert len(df.drop_duplicates()) == test_size
586586

587+
def test_ddl(self, random_dataset, project_id):
588+
# Bug fix for https://github.com/pydata/pandas-gbq/issues/45
589+
df = gbq.read_gbq(
590+
"CREATE OR REPLACE TABLE {}.test_ddl (x INT64)".format(
591+
random_dataset.dataset_id
592+
)
593+
)
594+
assert len(df) == 0
595+
596+
def test_ddl_w_max_results(self, random_dataset, project_id):
597+
df = gbq.read_gbq(
598+
"CREATE OR REPLACE TABLE {}.test_ddl (x INT64)".format(
599+
random_dataset.dataset_id
600+
),
601+
max_results=0,
602+
)
603+
assert df is None
604+
605+
def test_max_results(self, random_dataset, project_id):
606+
df = gbq.read_gbq(
607+
"SELECT * FROM UNNEST(GENERATE_ARRAY(1, 100))", max_results=10
608+
)
609+
assert len(df) == 10
610+
587611
def test_zero_rows(self, project_id):
588612
# Bug fix for https://github.com/pandas-dev/pandas/issues/10273
589613
df = gbq.read_gbq(

tests/unit/conftest.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@ def reset_context():
1515

1616
@pytest.fixture(autouse=True)
1717
def mock_bigquery_client(monkeypatch):
18-
from google.api_core.exceptions import NotFound
1918
import google.cloud.bigquery
2019
import google.cloud.bigquery.table
2120

@@ -35,7 +34,6 @@ def mock_bigquery_client(monkeypatch):
3534
mock_query.result.return_value = mock_rows
3635
mock_client.query.return_value = mock_query
3736
# Mock table creation.
38-
mock_client.get_table.side_effect = NotFound("nope")
3937
monkeypatch.setattr(google.cloud.bigquery, "Client", mock_client)
4038
mock_client.reset_mock()
4139
return mock_client

tests/unit/test_gbq.py

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,6 @@ def min_bq_version():
3434
return pkg_resources.parse_version("1.9.0")
3535

3636

37-
def mock_none_credentials(*args, **kwargs):
38-
return None, None
39-
40-
4137
def mock_get_credentials_no_project(*args, **kwargs):
4238
import google.auth.credentials
4339

@@ -76,16 +72,6 @@ def mock_compute_engine_credentials():
7672
return mock_credentials
7773

7874

79-
@pytest.fixture
80-
def mock_get_user_credentials(*args, **kwargs):
81-
import google.auth.credentials
82-
83-
mock_credentials = mock.create_autospec(
84-
google.auth.credentials.Credentials
85-
)
86-
return mock_credentials
87-
88-
8975
@pytest.fixture(autouse=True)
9076
def no_auth(monkeypatch):
9177
import pydata_google_auth
@@ -351,6 +337,17 @@ def test_read_gbq_without_inferred_project_id_from_compute_engine_credentials(
351337
)
352338

353339

340+
def test_read_gbq_with_max_results_zero(monkeypatch):
341+
df = gbq.read_gbq("SELECT 1", dialect="standard", max_results=0)
342+
assert df is None
343+
344+
345+
def test_read_gbq_with_max_results_ten(monkeypatch, mock_bigquery_client):
346+
df = gbq.read_gbq("SELECT 1", dialect="standard", max_results=10)
347+
assert df is not None
348+
mock_bigquery_client.list_rows.assert_called_with(mock.ANY, max_results=10)
349+
350+
354351
def test_read_gbq_with_invalid_private_key_json_should_fail():
355352
with pytest.raises(pandas_gbq.exceptions.InvalidPrivateKeyFormat):
356353
gbq.read_gbq(
@@ -511,8 +508,12 @@ def test_generate_bq_schema_deprecated():
511508
gbq.generate_bq_schema(df)
512509

513510

514-
def test_load_does_not_modify_schema_arg():
515-
# Test of Issue # 277
511+
def test_load_does_not_modify_schema_arg(mock_bigquery_client):
512+
"""Test of Issue # 277."""
513+
from google.api_core.exceptions import NotFound
514+
515+
# Create table with new schema.
516+
mock_bigquery_client.get_table.side_effect = NotFound("nope")
516517
df = DataFrame(
517518
{
518519
"field1": ["a", "b"],

0 commit comments

Comments (0)