
Commit 428845d

Faster dataframe construction (#128)
* push df construction to pandas
* lint
* gitignore additions
* `.astype` conversion
* ignore warnings from Google Cloud SDK auth
* handle repeated fields in schema
* remove comment
* whats new
* remove duplicate test
1 parent d6b7507 commit 428845d

File tree: 5 files changed (38 additions, 31 deletions)


.gitignore

Lines changed: 2 additions & 0 deletions

@@ -69,6 +69,8 @@ dist
 **/wheelhouse/*
 # coverage
 .coverage
+.testmondata
+.pytest_cache
 .nox

 # OS generated files #

docs/source/changelog.rst

Lines changed: 10 additions & 0 deletions

@@ -1,6 +1,16 @@
 Changelog
 =========

+.. _changelog-0.6.1:
+
+0.6.1 / [unreleased]
+--------------------
+
+- Improved ``read_gbq`` performance and memory consumption by delegating
+  ``DataFrame`` construction to the Pandas library, radically reducing
+  the number of loops that execute in python
+  (:issue:`128`)
+
 .. _changelog-0.6.0:

 0.6.0 / 2018-08-15
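
The changelog entry above describes the core of the change: instead of filling a preallocated NumPy structured array one cell at a time in Python, the raw rows are handed to the pandas ``DataFrame`` constructor in a single call and each column is then cast with a vectorized ``.astype``. A minimal before/after sketch of that idea, using made-up column names and rows purely for illustration:

    import numpy as np
    from pandas import DataFrame

    rows = [['alice', 1, 0.5], ['bob', 2, 1.5]]   # illustrative rows, not real query output
    columns = ['name', 'id', 'score']
    dtypes = {'name': object, 'id': object, 'score': np.dtype(float)}

    # Before: preallocate a structured array and fill it cell by cell in Python.
    page_array = np.zeros((len(rows),), dtype=list(zip(columns, dtypes.values())))
    for row_num, entries in enumerate(rows):
        for col_num, value in enumerate(entries):
            page_array[row_num][col_num] = value
    df_old = DataFrame(page_array, columns=columns)

    # After: build the frame in one constructor call, then cast each column
    # with a single vectorized .astype.
    df_new = DataFrame(data=rows, columns=columns)
    for column in df_new:
        df_new[column] = df_new[column].astype(dtypes[column])

The per-cell loop executes roughly ``len(rows) * len(columns)`` iterations of interpreted Python, while the second form does the heavy lifting inside pandas/NumPy, which is where the performance and memory win comes from.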

pandas_gbq/gbq.py

Lines changed: 19 additions & 19 deletions

@@ -3,15 +3,14 @@
 import os
 import time
 import warnings
+from collections import OrderedDict
 from datetime import datetime

 import numpy as np
 from pandas import DataFrame
-from pandas.compat import lzip

 from pandas_gbq.exceptions import AccessDenied

-
 logger = logging.getLogger(__name__)


@@ -444,29 +443,30 @@ def _get_credentials_file():
         'PANDAS_GBQ_CREDENTIALS_FILE')


-def _parse_data(schema, rows):
+def _parse_schema(schema_fields):
     # see:
     # http://pandas.pydata.org/pandas-docs/dev/missing_data.html
     # #missing-data-casting-rules-and-indexing
     dtype_map = {'FLOAT': np.dtype(float),
                  'TIMESTAMP': 'M8[ns]'}

-    fields = schema['fields']
-    col_types = [field['type'] for field in fields]
-    col_names = [str(field['name']) for field in fields]
-    col_dtypes = [
-        dtype_map.get(field['type'].upper(), object)
-        if field['mode'].lower() != 'repeated'
-        else object
-        for field in fields
-    ]
-    page_array = np.zeros((len(rows),), dtype=lzip(col_names, col_dtypes))
-    for row_num, entries in enumerate(rows):
-        for col_num in range(len(col_types)):
-            field_value = entries[col_num]
-            page_array[row_num][col_num] = field_value
-
-    return DataFrame(page_array, columns=col_names)
+    for field in schema_fields:
+        name = str(field['name'])
+        if field['mode'].upper() == 'REPEATED':
+            yield name, object
+        else:
+            dtype = dtype_map.get(field['type'].upper(), object)
+            yield name, dtype
+
+
+def _parse_data(schema, rows):
+
+    column_dtypes = OrderedDict(_parse_schema(schema['fields']))
+
+    df = DataFrame(data=(iter(r) for r in rows), columns=column_dtypes.keys())
+    for column in df:
+        df[column] = df[column].astype(column_dtypes[column])
+    return df


 def read_gbq(query, project_id=None, index_col=None, col_order=None,
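
For context, the reworked helpers still consume the schema/rows shape that ``read_gbq`` collects from the BigQuery API: ``_parse_schema`` yields one ``(name, dtype)`` pair per field (``FLOAT`` maps to float64, ``TIMESTAMP`` to ``M8[ns]``, repeated fields and everything else to ``object``), and ``_parse_data`` builds the whole frame at once before casting each column. A rough usage sketch with a hypothetical schema and rows invented for illustration (assumes the pandas-gbq module at this commit is importable):

    from collections import OrderedDict

    from pandas_gbq.gbq import _parse_data, _parse_schema

    # Hypothetical response fragments shaped like the BigQuery API output.
    schema = {'fields': [
        {'name': 'title', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': 'views', 'type': 'FLOAT', 'mode': 'NULLABLE'},
        {'name': 'ts', 'type': 'TIMESTAMP', 'mode': 'NULLABLE'},
    ]}
    rows = [
        ['Hamlet', 1.5, '2018-08-15 00:00:00'],
        ['Macbeth', 2.0, '2018-08-16 00:00:00'],
    ]

    # Column order is preserved by the OrderedDict of (name, dtype) pairs.
    column_dtypes = OrderedDict(_parse_schema(schema['fields']))
    # column_dtypes -> title: object, views: float64, ts: 'M8[ns]'

    df = _parse_data(schema, rows)
    # df.dtypes -> title: object, views: float64, ts: datetime64[ns]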

tests/system/test_gbq.py

Lines changed: 4 additions & 12 deletions

@@ -13,16 +13,14 @@

 from pandas_gbq import gbq

-
 TABLE_ID = 'new_test'


 def _get_dataset_prefix_random():
     return ''.join(['pandas_gbq_', str(randint(1, 100000))])


-@pytest.fixture(autouse=True, scope='module')
-def _test_imports():
+def test_imports():
     try:
         import pkg_resources  # noqa
     except ImportError:
@@ -143,13 +141,6 @@ def setup(self, project, credentials):
             project, private_key=credentials)
         self.credentials = credentials

-    def test_should_properly_handle_valid_strings(self, project_id):
-        query = 'SELECT "PI" AS valid_string'
-        df = gbq.read_gbq(query, project_id=project_id,
-                          private_key=self.credentials,
-                          dialect='legacy')
-        tm.assert_frame_equal(df, DataFrame({'valid_string': ['PI']}))
-
     def test_should_properly_handle_empty_strings(self, project_id):
         query = 'SELECT "" AS empty_string'
         df = gbq.read_gbq(query, project_id=project_id,
@@ -392,7 +383,8 @@ def test_bad_project_id(self):
         with pytest.raises(gbq.GenericGBQException):
             gbq.read_gbq('SELCET * FROM [publicdata:samples.shakespeare]',
                          project_id='not-my-project',
-                         private_key=self.credentials)
+                         private_key=self.credentials,
+                         dialect='legacy')

     def test_bad_table_name(self, project_id):
         with pytest.raises(gbq.GenericGBQException):
@@ -427,7 +419,7 @@ def test_zero_rows(self, project_id):
             ('is_bot', np.dtype(bool)), ('ts', 'M8[ns]')])
         expected_result = DataFrame(
             page_array, columns=['title', 'id', 'is_bot', 'ts'])
-        tm.assert_frame_equal(df, expected_result)
+        tm.assert_frame_equal(df, expected_result, check_index_type=False)

     def test_legacy_sql(self, project_id):
         legacy_sql = "SELECT id FROM [publicdata.samples.wikipedia] LIMIT 10"

tests/unit/test_gbq.py

Lines changed: 3 additions & 0 deletions

@@ -13,6 +13,9 @@
 except ImportError:  # pragma: NO COVER
     from unittest import mock

+pytestmark = pytest.mark.filter_warnings(
+    "ignore:credentials from Google Cloud SDK")
+

 @pytest.fixture
 def min_bq_version():
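
One nuance worth noting about the warning filter added above: a module-level ``pytestmark`` applies a mark to every test collected from the file, and pytest's built-in mark for suppressing warnings is spelled ``filterwarnings``. A minimal sketch of that mechanism, with an invented test name and warning text purely for illustration:

    import warnings

    import pytest

    # Applies to every test in this module; an "ignore:<prefix>" filter
    # suppresses warnings whose message starts with the given prefix.
    pytestmark = pytest.mark.filterwarnings(
        "ignore:credentials from Google Cloud SDK")


    def test_quiet_auth_warning():
        # This UserWarning matches the prefix above, so it is ignored
        # rather than cluttering test output or failing under -W error.
        warnings.warn("credentials from Google Cloud SDK were used")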
