Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion doc/source/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4522,6 +4522,7 @@ See the documentation for `pyarrow <http://arrow.apache.org/docs/python/>`__ and
.. note::

These engines are very similar and should read/write nearly identical parquet format files.
Currently ``pyarrow`` does not support timedelta data, and ``fastparquet`` does not support timezone aware datetimes (they are coerced to UTC).
These libraries differ by having different underlying dependencies (``fastparquet`` by using ``numba``, while ``pyarrow`` uses a C library).

.. ipython:: python
Expand All @@ -4548,8 +4549,8 @@ Read from a parquet file.

.. ipython:: python

result = pd.read_parquet('example_pa.parquet', engine='pyarrow')
result = pd.read_parquet('example_fp.parquet', engine='fastparquet')
result = pd.read_parquet('example_pa.parquet', engine='pyarrow')

result.dtypes

Expand Down
97 changes: 54 additions & 43 deletions pandas/tests/io/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

import numpy as np
import pandas as pd
from pandas.compat import PY3
from pandas.compat import PY3, is_platform_windows
from pandas.io.parquet import (to_parquet, read_parquet, get_engine,
PyArrowImpl, FastParquetImpl)
from pandas.util import testing as tm
Expand Down Expand Up @@ -80,16 +80,36 @@ def df_compat():
def df_cross_compat():
df = pd.DataFrame({'a': list('abc'),
'b': list(range(1, 4)),
'c': np.arange(3, 6).astype('u1'),
# 'c': np.arange(3, 6).astype('u1'),
'd': np.arange(4.0, 7.0, dtype='float64'),
'e': [True, False, True],
'f': pd.date_range('20130101', periods=3),
'g': pd.date_range('20130101', periods=3,
tz='US/Eastern'),
'h': pd.date_range('20130101', periods=3, freq='ns')})
# 'g': pd.date_range('20130101', periods=3,
# tz='US/Eastern'),
# 'h': pd.date_range('20130101', periods=3, freq='ns')
})
return df


@pytest.fixture
def df_full():
    """DataFrame covering the column dtypes shared by the engine tests.

    Engine-specific columns (e.g. tz-aware datetimes for pyarrow,
    timedeltas for fastparquet) are appended by the individual tests.
    """
    data = {}
    data['string'] = list('abc')
    data['string_with_nan'] = ['a', np.nan, 'c']
    data['string_with_none'] = ['a', None, 'c']
    data['bytes'] = [b'foo', b'bar', b'baz']
    data['unicode'] = [u'foo', u'bar', u'baz']
    data['int'] = list(range(1, 4))
    data['uint'] = np.arange(3, 6).astype('u1')
    data['float'] = np.arange(4.0, 7.0, dtype='float64')
    data['float_with_nan'] = [2., np.nan, 3.]
    data['bool'] = [True, False, True]
    data['datetime'] = pd.date_range('20130101', periods=3)
    data['datetime_with_nat'] = [pd.Timestamp('20130101'),
                                 pd.NaT,
                                 pd.Timestamp('20130103')]
    return pd.DataFrame(data)


def test_invalid_engine(df_compat):

with pytest.raises(ValueError):
Expand Down Expand Up @@ -154,7 +174,8 @@ def test_options_get_engine(fp, pa):
assert isinstance(get_engine('fastparquet'), FastParquetImpl)


@pytest.mark.xfail(reason="fp does not ignore pa index __index_level_0__")
@pytest.mark.xfail(is_platform_windows(),
reason="reading pa metadata failing on Windows")
def test_cross_engine_pa_fp(df_cross_compat, pa, fp):
# cross-compat with differing reading/writing engines

Expand All @@ -166,7 +187,6 @@ def test_cross_engine_pa_fp(df_cross_compat, pa, fp):
tm.assert_frame_equal(result, df)


@pytest.mark.xfail(reason="pyarrow reading fp in some cases")
def test_cross_engine_fp_pa(df_cross_compat, pa, fp):
# cross-compat with differing reading/writing engines

Expand Down Expand Up @@ -300,27 +320,31 @@ def test_read_columns(self, engine):

class TestParquetPyArrow(Base):

def test_basic(self, pa):
def test_basic(self, pa, df_full):

df = pd.DataFrame({'string': list('abc'),
'string_with_nan': ['a', np.nan, 'c'],
'string_with_none': ['a', None, 'c'],
'bytes': [b'foo', b'bar', b'baz'],
'unicode': [u'foo', u'bar', u'baz'],
'int': list(range(1, 4)),
'uint': np.arange(3, 6).astype('u1'),
'float': np.arange(4.0, 7.0, dtype='float64'),
'float_with_nan': [2., np.nan, 3.],
'bool': [True, False, True],
'bool_with_none': [True, None, True],
'datetime_ns': pd.date_range('20130101', periods=3),
'datetime_with_nat': [pd.Timestamp('20130101'),
pd.NaT,
pd.Timestamp('20130103')]
})
df = df_full

# additional supported types for pyarrow
import pyarrow
if LooseVersion(pyarrow.__version__) >= LooseVersion('0.7.0'):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can remove this after we change the dep (@dhirschfeld PR)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yep. For the rest any comments?

df['datetime_tz'] = pd.date_range('20130101', periods=3,
tz='Europe/Brussels')
df['bool_with_none'] = [True, None, True]

self.check_round_trip(df, pa)

@pytest.mark.xfail(reason="pyarrow fails on this (ARROW-1883)")
def test_basic_subset_columns(self, pa, df_full):
# GH18628

df = df_full
# additional supported types for pyarrow
df['datetime_tz'] = pd.date_range('20130101', periods=3,
tz='Europe/Brussels')

self.check_round_trip(df, pa, expected=df[['string', 'int']],
read_kwargs={'columns': ['string', 'int']})

def test_duplicate_columns(self, pa):

# not currently able to handle duplicate columns
Expand Down Expand Up @@ -363,25 +387,12 @@ def test_categorical_unsupported(self, pa_lt_070):

class TestParquetFastParquet(Base):

def test_basic(self, fp):

df = pd.DataFrame(
{'string': list('abc'),
'string_with_nan': ['a', np.nan, 'c'],
'string_with_none': ['a', None, 'c'],
'bytes': [b'foo', b'bar', b'baz'],
'unicode': [u'foo', u'bar', u'baz'],
'int': list(range(1, 4)),
'uint': np.arange(3, 6).astype('u1'),
'float': np.arange(4.0, 7.0, dtype='float64'),
'float_with_nan': [2., np.nan, 3.],
'bool': [True, False, True],
'datetime': pd.date_range('20130101', periods=3),
'datetime_with_nat': [pd.Timestamp('20130101'),
pd.NaT,
pd.Timestamp('20130103')],
'timedelta': pd.timedelta_range('1 day', periods=3),
})
def test_basic(self, fp, df_full):

df = df_full

# additional supported types for fastparquet
df['timedelta'] = pd.timedelta_range('1 day', periods=3)

self.check_round_trip(df, fp, write_kwargs={'compression': None})

Expand Down