
Commit 428845d

Faster dataframe construction (#128)
* push df construction to pandas
* lint
* gitignore additions
* `.astype` conversion
* ignore warnings from Google Cloud SDK auth
* handle repeated fields in schema
* remove comment
* whats new
* remove duplicate test
1 parent d6b7507 commit 428845d

File tree: 5 files changed (38 additions, 31 deletions)


.gitignore

Lines changed: 2 additions & 0 deletions

@@ -69,6 +69,8 @@ dist
 **/wheelhouse/*
 # coverage
 .coverage
+.testmondata
+.pytest_cache
 .nox

 # OS generated files #

docs/source/changelog.rst

Lines changed: 10 additions & 0 deletions

@@ -1,6 +1,16 @@
 Changelog
 =========

+.. _changelog-0.6.1:
+
+0.6.1 / [unreleased]
+--------------------
+
+- Improved ``read_gbq`` performance and memory consumption by delegating
+  ``DataFrame`` construction to the Pandas library, radically reducing
+  the number of loops that execute in python
+  (:issue:`128`)
+
 .. _changelog-0.6.0:

 0.6.0 / 2018-08-15
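
The changelog entry above describes the core of the change: instead of filling a preallocated NumPy structured array one cell at a time in Python, the raw rows are handed to the pandas ``DataFrame`` constructor in a single call and each column is then cast with a vectorized ``.astype``. A minimal before/after sketch of that idea, using made-up column names and rows purely for illustration:

    import numpy as np
    from pandas import DataFrame

    rows = [['alice', 1, 0.5], ['bob', 2, 1.5]]   # illustrative rows, not real query output
    columns = ['name', 'id', 'score']
    dtypes = {'name': object, 'id': object, 'score': np.dtype(float)}

    # Before: preallocate a structured array and fill it cell by cell in Python.
    page_array = np.zeros((len(rows),), dtype=list(zip(columns, dtypes.values())))
    for row_num, entries in enumerate(rows):
        for col_num, value in enumerate(entries):
            page_array[row_num][col_num] = value
    df_old = DataFrame(page_array, columns=columns)

    # After: build the frame in one constructor call, then cast each column
    # with a single vectorized .astype.
    df_new = DataFrame(data=rows, columns=columns)
    for column in df_new:
        df_new[column] = df_new[column].astype(dtypes[column])

The per-cell loop executes roughly ``len(rows) * len(columns)`` iterations of interpreted Python, while the second form does the heavy lifting inside pandas/NumPy, which is where the performance and memory win comes from.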

pandas_gbq/gbq.py

Lines changed: 19 additions & 19 deletions

@@ -3,15 +3,14 @@
 import os
 import time
 import warnings
+from collections import OrderedDict
 from datetime import datetime

 import numpy as np
 from pandas import DataFrame
-from pandas.compat import lzip

 from pandas_gbq.exceptions import AccessDenied

-
 logger = logging.getLogger(__name__)


@@ -444,29 +443,30 @@ def _get_credentials_file():
         'PANDAS_GBQ_CREDENTIALS_FILE')


-def _parse_data(schema, rows):
+def _parse_schema(schema_fields):
     # see:
     # http://pandas.pydata.org/pandas-docs/dev/missing_data.html
     # #missing-data-casting-rules-and-indexing
     dtype_map = {'FLOAT': np.dtype(float),
                  'TIMESTAMP': 'M8[ns]'}

-    fields = schema['fields']
-    col_types = [field['type'] for field in fields]
-    col_names = [str(field['name']) for field in fields]
-    col_dtypes = [
-        dtype_map.get(field['type'].upper(), object)
-        if field['mode'].lower() != 'repeated'
-        else object
-        for field in fields
-    ]
-    page_array = np.zeros((len(rows),), dtype=lzip(col_names, col_dtypes))
-    for row_num, entries in enumerate(rows):
-        for col_num in range(len(col_types)):
-            field_value = entries[col_num]
-            page_array[row_num][col_num] = field_value
-
-    return DataFrame(page_array, columns=col_names)
+    for field in schema_fields:
+        name = str(field['name'])
+        if field['mode'].upper() == 'REPEATED':
+            yield name, object
+        else:
+            dtype = dtype_map.get(field['type'].upper(), object)
+            yield name, dtype
+
+
+def _parse_data(schema, rows):
+
+    column_dtypes = OrderedDict(_parse_schema(schema['fields']))
+
+    df = DataFrame(data=(iter(r) for r in rows), columns=column_dtypes.keys())
+    for column in df:
+        df[column] = df[column].astype(column_dtypes[column])
+    return df


 def read_gbq(query, project_id=None, index_col=None, col_order=None,
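
For context, the reworked helpers still consume the schema/rows shape that ``read_gbq`` collects from the BigQuery API: ``_parse_schema`` yields one ``(name, dtype)`` pair per field (``FLOAT`` maps to float64, ``TIMESTAMP`` to ``M8[ns]``, repeated fields and everything else to ``object``), and ``_parse_data`` builds the whole frame at once before casting each column. A rough usage sketch with a hypothetical schema and rows invented for illustration (assumes the pandas-gbq module at this commit is importable):

    from collections import OrderedDict

    from pandas_gbq.gbq import _parse_data, _parse_schema

    # Hypothetical response fragments shaped like the BigQuery API output.
    schema = {'fields': [
        {'name': 'title', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': 'views', 'type': 'FLOAT', 'mode': 'NULLABLE'},
        {'name': 'ts', 'type': 'TIMESTAMP', 'mode': 'NULLABLE'},
    ]}
    rows = [
        ['Hamlet', 1.5, '2018-08-15 00:00:00'],
        ['Macbeth', 2.0, '2018-08-16 00:00:00'],
    ]

    # Column order is preserved by the OrderedDict of (name, dtype) pairs.
    column_dtypes = OrderedDict(_parse_schema(schema['fields']))
    # column_dtypes -> title: object, views: float64, ts: 'M8[ns]'

    df = _parse_data(schema, rows)
    # df.dtypes -> title: object, views: float64, ts: datetime64[ns]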

tests/system/test_gbq.py

Lines changed: 4 additions & 12 deletions

@@ -13,16 +13,14 @@

 from pandas_gbq import gbq

-
 TABLE_ID = 'new_test'


 def _get_dataset_prefix_random():
     return ''.join(['pandas_gbq_', str(randint(1, 100000))])


-@pytest.fixture(autouse=True, scope='module')
-def _test_imports():
+def test_imports():
     try:
         import pkg_resources  # noqa
     except ImportError:
@@ -143,13 +141,6 @@ def setup(self, project, credentials):
             project, private_key=credentials)
         self.credentials = credentials

-    def test_should_properly_handle_valid_strings(self, project_id):
-        query = 'SELECT "PI" AS valid_string'
-        df = gbq.read_gbq(query, project_id=project_id,
-                          private_key=self.credentials,
-                          dialect='legacy')
-        tm.assert_frame_equal(df, DataFrame({'valid_string': ['PI']}))
-
     def test_should_properly_handle_empty_strings(self, project_id):
         query = 'SELECT "" AS empty_string'
         df = gbq.read_gbq(query, project_id=project_id,
@@ -392,7 +383,8 @@ def test_bad_project_id(self):
         with pytest.raises(gbq.GenericGBQException):
             gbq.read_gbq('SELCET * FROM [publicdata:samples.shakespeare]',
                          project_id='not-my-project',
-                         private_key=self.credentials)
+                         private_key=self.credentials,
+                         dialect='legacy')

     def test_bad_table_name(self, project_id):
         with pytest.raises(gbq.GenericGBQException):
@@ -427,7 +419,7 @@ def test_zero_rows(self, project_id):
             ('is_bot', np.dtype(bool)), ('ts', 'M8[ns]')])
         expected_result = DataFrame(
             page_array, columns=['title', 'id', 'is_bot', 'ts'])
-        tm.assert_frame_equal(df, expected_result)
+        tm.assert_frame_equal(df, expected_result, check_index_type=False)

     def test_legacy_sql(self, project_id):
         legacy_sql = "SELECT id FROM [publicdata.samples.wikipedia] LIMIT 10"

tests/unit/test_gbq.py

Lines changed: 3 additions & 0 deletions

@@ -13,6 +13,9 @@
 except ImportError:  # pragma: NO COVER
     from unittest import mock

+pytestmark = pytest.mark.filter_warnings(
+    "ignore:credentials from Google Cloud SDK")
+

 @pytest.fixture
 def min_bq_version():
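
One nuance worth noting about the warning filter added above: a module-level ``pytestmark`` applies a mark to every test collected from the file, and pytest's built-in mark for suppressing warnings is spelled ``filterwarnings``. A minimal sketch of that mechanism, with an invented test name and warning text purely for illustration:

    import warnings

    import pytest

    # Applies to every test in this module; an "ignore:<prefix>" filter
    # suppresses warnings whose message starts with the given prefix.
    pytestmark = pytest.mark.filterwarnings(
        "ignore:credentials from Google Cloud SDK")


    def test_quiet_auth_warning():
        # This UserWarning matches the prefix above, so it is ignored
        # rather than cluttering test output or failing under -W error.
        warnings.warn("credentials from Google Cloud SDK were used")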
