diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt index 0eeee8ccfddf6..721a49f5d58ce 100644 --- a/doc/source/whatsnew/v0.16.0.txt +++ b/doc/source/whatsnew/v0.16.0.txt @@ -14,11 +14,13 @@ Highlights include: - Backwards incompatible change to ``Timedelta`` to conform the ``.seconds`` attribute with ``datetime.timedelta``, see :ref:`here ` - Changes to the ``.loc`` slicing API to conform with the behavior of ``.ix``, see :ref:`here ` - Changes to the default for ordering in the ``Categorical`` constructor, see :ref:`here ` +- Enhancement to the ``.str`` accessor to make string operations easier, see :ref:`here <whatsnew_0160.enhancements.string>` Check the :ref:`API Changes ` and :ref:`deprecations ` before updating. .. contents:: What's new in v0.16.0 :local: + :backlinks: none .. _whatsnew_0160.enhancements: @@ -120,6 +122,45 @@ from a ``scipy.sparse.coo_matrix``: ss = SparseSeries.from_coo(A) ss +.. _whatsnew_0160.enhancements.string: + +String Methods Enhancements +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +- The following new methods are accessible via the ``.str`` accessor to apply the function to each value. This is intended to make string operations more consistent with standard methods on strings. (:issue:`9282`, :issue:`9352`, :issue:`9386`, :issue:`9387`, :issue:`9439`) + +============= ============= ============= =============== =============== +.. .. Methods .. .. +============= ============= ============= =============== =============== +``isalnum()`` ``isalpha()`` ``isdigit()`` .. ``isspace()`` +``islower()`` ``isupper()`` ``istitle()`` ``isnumeric()`` ``isdecimal()`` +``find()`` ``rfind()`` ``ljust()`` ``rjust()`` ``zfill()`` +============= ============= ============= =============== =============== + +.. ipython:: python + + s = Series(['abcd', '3456', 'EFGH']) + s.str.isalpha() + s.str.find('ab') + + +- :meth:`Series.str.pad` and :meth:`Series.str.center` now accept a ``fillchar`` option to specify the filling character (:issue:`9352`) + +.. ipython:: python + + s = Series(['12', '300', '25']) + s.str.pad(5, fillchar='_') + + +- Added :meth:`Series.str.slice_replace`, which previously raised ``NotImplementedError`` (:issue:`8888`) + +.. ipython:: python + + s = Series(['ABCD', 'EFGH', 'IJK']) + s.str.slice_replace(1, 3, 'X') + # omitting the replacement deletes the slice + s.str.slice_replace(0, 1) .. _whatsnew_0160.enhancements.other: Other enhancements @@ -137,7 +178,6 @@ Other enhancements - Allow Stata files to be read incrementally with an iterator; support for long strings in Stata files. See the docs :ref:`here`.
(:issue:`9493`) - Paths beginning with ~ will now be expanded to begin with the user's home directory (:issue:`9066`) - Added time interval selection in ``get_data_yahoo`` (:issue:`9071`) -- Added ``Series.str.slice_replace()``, which previously raised ``NotImplementedError`` (:issue:`8888`) - Added ``Timestamp.to_datetime64()`` to complement ``Timedelta.to_timedelta64()`` (:issue:`9255`) - ``tseries.frequencies.to_offset()`` now accepts ``Timedelta`` as input (:issue:`9064`) - Lag parameter was added to the autocorrelation method of ``Series``, defaults to lag-1 autocorrelation (:issue:`9192`) @@ -145,15 +185,8 @@ Other enhancements - SQL code now safely escapes table and column names (:issue:`8986`) - Added auto-complete for ``Series.str.``, ``Series.dt.`` and ``Series.cat.`` (:issue:`9322`) -- Added ``StringMethods.isalnum()``, ``isalpha()``, ``isdigit()``, ``isspace()``, ``islower()``, - ``isupper()``, ``istitle()`` which behave as the same as standard ``str`` (:issue:`9282`) - -- Added ``StringMethods.find()`` and ``rfind()`` which behave as the same as standard ``str`` (:issue:`9386`) - - ``Index.get_indexer`` now supports ``method='pad'`` and ``method='backfill'`` for any target array, not just monotonic targets. These methods also work for monotonic decreasing as well as monotonic increasing indexes (:issue:`9258`). - ``Index.asof`` now works on all index types (:issue:`9258`). - -- Added ``StringMethods.isnumeric`` and ``isdecimal`` which behave as the same as standard ``str`` (:issue:`9439`) - The ``read_excel()`` function's :ref:`sheetname <_io.specifying_sheets>` argument now accepts a list and ``None``, to get multiple or all sheets respectively. If more than one sheet is specified, a dictionary is returned. (:issue:`9450`) .. code-block:: python pd.read_excel('path_to_file.xls',sheetname=['Sheet1',3]) - A ``verbose`` argument has been added to ``io.read_excel()``, defaults to False. Set to True to print sheet names as they are parsed. (:issue:`9450`) -- Added ``StringMethods.ljust()`` and ``rjust()`` which behave as the same as standard ``str`` (:issue:`9352`) -- ``StringMethods.pad()`` and ``center()`` now accept ``fillchar`` option to specify filling character (:issue:`9352`) -- Added ``StringMethods.zfill()`` which behave as the same as standard ``str`` (:issue:`9387`) - Added ``days_in_month`` (compatibility alias ``daysinmonth``) property to ``Timestamp``, ``DatetimeIndex``, ``Period``, ``PeriodIndex``, and ``Series.dt`` (:issue:`9572`) - Added ``decimal`` option in ``to_csv`` to provide formatting for non-'.' decimal separators (:issue:`781`) - Added ``normalize`` option for ``Timestamp`` to normalize to midnight (:issue:`8794`) @@ -454,6 +484,15 @@ Other API Changes To reproduce the old behavior, simply add more precision to the label (e.g., use ``2000-02-01`` instead of ``2000-02``). +- A spurious ``SettingWithCopy`` warning was generated when setting a new item in a frame in some cases (:issue:`8730`) + + The following would previously report a ``SettingWithCopy`` warning. + + .. ipython:: python + + df1 = DataFrame({'x': Series(['a','b','c']), 'y': Series(['d','e','f'])}) + df2 = df1[['x']] + df2['y'] = ['g', 'h', 'i'] ..
_whatsnew_0160.deprecations: @@ -505,6 +544,7 @@ Performance Improvements - Performance improvements in ``MultiIndex.sortlevel`` (:issue:`9445`) - Performance and memory usage improvements in ``DataFrame.duplicated`` (:issue:`9398`) - Cythonized ``Period`` (:issue:`9440`) +- Decreased memory usage in ``to_hdf`` (:issue:`9648`) .. _whatsnew_0160.bug_fixes: @@ -567,3 +607,39 @@ Bug Fixes - Bug in ``Series.value_counts`` with excluding ``NaN`` for categorical type ``Series`` with ``dropna=True`` (:issue:`9443`) - Fixed missing ``numeric_only`` option for ``DataFrame.std/var/sem`` (:issue:`9201`) - Support constructing ``Panel`` or ``Panel4D`` with scalar data (:issue:`8285`) +- ``Series`` text representation was disconnected from the ``max_rows``/``max_columns`` display options (:issue:`7508`). +- ``Series`` number formatting was inconsistent when truncated (:issue:`8532`). + + Previous Behavior + + .. code-block:: python + + In [2]: pd.options.display.max_rows = 10 + In [3]: s = pd.Series([1,1,1,1,1,1,1,1,1,1,0.9999,1,1]*10) + In [4]: s + Out[4]: + 0 1 + 1 1 + 2 1 + ... + 127 0.9999 + 128 1.0000 + 129 1.0000 + Length: 130, dtype: float64 + + New Behavior + + .. code-block:: python + + 0 1.0000 + 1 1.0000 + 2 1.0000 + 3 1.0000 + 4 1.0000 + ... + 125 1.0000 + 126 1.0000 + 127 0.9999 + 128 1.0000 + 129 1.0000 + dtype: float64 diff --git a/pandas/core/format.py b/pandas/core/format.py index 3efcfec254591..b21ca9050ffd0 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -129,45 +129,63 @@ def to_string(self): class SeriesFormatter(object): - def __init__(self, series, buf=None, header=True, length=True, - na_rep='NaN', name=False, float_format=None, dtype=True): + def __init__(self, series, buf=None, length=True, header=True, + na_rep='NaN', name=False, float_format=None, dtype=True, + max_rows=None): self.series = series self.buf = buf if buf is not None else StringIO() self.name = name self.na_rep = na_rep - self.length = length self.header = header + self.length = length + self.max_rows = max_rows if float_format is None: float_format = get_option("display.float_format") self.float_format = float_format self.dtype = dtype + self._chk_truncate() + + def _chk_truncate(self): + from pandas.tools.merge import concat + max_rows = self.max_rows + truncate_v = max_rows and (len(self.series) > max_rows) + series = self.series + if truncate_v: + if max_rows == 1: + row_num = max_rows + series = series.iloc[:max_rows] + else: + row_num = max_rows // 2 + series = concat((series.iloc[:row_num], series.iloc[-row_num:])) + self.tr_row_num = row_num + self.tr_series = series + self.truncate_v = truncate_v + def _get_footer(self): + name = self.series.name footer = u('') - if self.name: - if getattr(self.series.index, 'freq', None): - footer += 'Freq: %s' % self.series.index.freqstr + if getattr(self.series.index, 'freq', None) is not None: + footer += 'Freq: %s' % self.series.index.freqstr - if footer and self.series.name is not None: - # categories have already a comma + linebreak - if not com.is_categorical_dtype(self.series.dtype): - footer += ', ' + if self.name is not False and name is not None: + if footer: + footer += ', ' - series_name = com.pprint_thing(self.series.name, + series_name = com.pprint_thing(name, escape_chars=('\t', '\r', '\n')) footer += ("Name: %s" % - series_name) if self.series.name is not None else "" + series_name) if name is not None else "" if self.length: if footer: footer += ', ' footer += 'Length: %d' % len(self.series) - # TODO: in tidy_repr, with freq index, no dtype is shown -> also include a
guard here? - if self.dtype: - name = getattr(self.series.dtype, 'name', None) + if self.dtype is not False and self.dtype is not None: + name = getattr(self.tr_series.dtype, 'name', None) if name: if footer: footer += ', ' @@ -175,8 +193,8 @@ def _get_footer(self): # level infos are added to the end and in a new line, like it is done for Categoricals # Only added when we request a name - if self.name and com.is_categorical_dtype(self.series.dtype): - level_info = self.series.values._repr_categories_info() + if name and com.is_categorical_dtype(self.tr_series.dtype): + level_info = self.tr_series.values._repr_categories_info() if footer: footer += "\n" footer += level_info @@ -184,7 +202,7 @@ def _get_footer(self): return compat.text_type(footer) def _get_formatted_index(self): - index = self.series.index + index = self.tr_series.index is_multi = isinstance(index, MultiIndex) if is_multi: @@ -196,35 +214,44 @@ def _get_formatted_index(self): return fmt_index, have_header def _get_formatted_values(self): - return format_array(self.series.get_values(), None, + return format_array(self.tr_series.get_values(), None, float_format=self.float_format, na_rep=self.na_rep) def to_string(self): - series = self.series + series = self.tr_series + footer = self._get_footer() if len(series) == 0: - return u('') + return 'Series([], ' + footer + ')' fmt_index, have_header = self._get_formatted_index() fmt_values = self._get_formatted_values() - maxlen = max(len(x) for x in fmt_index) + maxlen = max(len(x) for x in fmt_index) # max index len pad_space = min(maxlen, 60) - result = ['%s %s'] * len(fmt_values) - for i, (k, v) in enumerate(zip(fmt_index[1:], fmt_values)): - idx = k.ljust(pad_space) - result[i] = result[i] % (idx, v) + if self.truncate_v: + n_header_rows = 0 + row_num = self.tr_row_num + width = len(fmt_values[row_num-1]) + if width > 3: + dot_str = '...' + else: + dot_str = '..' + dot_str = dot_str.center(width) + fmt_values.insert(row_num + n_header_rows, dot_str) + fmt_index.insert(row_num + 1, '') + + result = adjoin(3, *[fmt_index[1:], fmt_values]) if self.header and have_header: - result.insert(0, fmt_index[0]) + result = fmt_index[0] + '\n' + result - footer = self._get_footer() if footer: - result.append(footer) + result += '\n' + footer - return compat.text_type(u('\n').join(result)) + return compat.text_type(u('').join(result)) def _strlen_func(): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 90a6cf60fa76b..e05709d7a180f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1265,6 +1265,14 @@ def _check_setitem_copy(self, stacklevel=4, t='setting', force=False): except: pass + # we might be a false positive + try: + if self.is_copy().shape == self.shape: + self.is_copy = None + return + except: + pass + # a custom message if isinstance(self.is_copy, string_types): t = self.is_copy @@ -1344,8 +1352,9 @@ def take(self, indices, axis=0, convert=True, is_copy=True): result = self._constructor(new_data).__finalize__(self) # maybe set copy if we didn't actually change the index - if is_copy and not result._get_axis(axis).equals(self._get_axis(axis)): - result._set_is_copy(self) + if is_copy: + if not result._get_axis(axis).equals(self._get_axis(axis)): + result._set_is_copy(self) return result @@ -2005,6 +2014,14 @@ def __setattr__(self, name, value): #---------------------------------------------------------------------- # Consolidation of internals + def _protect_consolidate(self, f): + """ consolidate _data. 
if the blocks have changed, then clear the cache """ + blocks_before = len(self._data.blocks) + result = f() + if len(self._data.blocks) != blocks_before: + self._clear_item_cache() + return result + def _consolidate_inplace(self): f = lambda: self._data.consolidate() self._data = self._protect_consolidate(f) @@ -2029,8 +2046,6 @@ def consolidate(self, inplace=False): else: f = lambda: self._data.consolidate() cons_data = self._protect_consolidate(f) - if cons_data is self._data: - cons_data = cons_data.copy() return self._constructor(cons_data).__finalize__(self) @property @@ -2066,13 +2081,6 @@ def _check_inplace_setting(self, value): return True - def _protect_consolidate(self, f): - blocks_before = len(self._data.blocks) - result = f() - if len(self._data.blocks) != blocks_before: - self._clear_item_cache() - return result - def _get_numeric_data(self): return self._constructor( self._data.get_numeric_data()).__finalize__(self) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 5cb032521d51a..7a16fb2b6b0d7 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1752,7 +1752,7 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None, if self.is_categorical_astype(dtype): values = self.values else: - values = np.array(self.values).astype(dtype) + values = np.asarray(self.values).astype(dtype, copy=False) if copy: values = values.copy() diff --git a/pandas/core/series.py b/pandas/core/series.py index d34657f0dc256..7e3b21be13525 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -36,7 +36,7 @@ from pandas.tseries.period import PeriodIndex, Period from pandas import compat from pandas.util.terminal import get_terminal_size -from pandas.compat import zip, u, OrderedDict +from pandas.compat import zip, u, OrderedDict, StringIO import pandas.core.ops as ops from pandas.core.algorithms import select_n @@ -883,43 +883,16 @@ def __unicode__(self): Invoked by unicode(df) in py2 only. Yields a Unicode String in both py2/py3. 
""" + buf = StringIO(u("")) width, height = get_terminal_size() max_rows = (height if get_option("display.max_rows") == 0 else get_option("display.max_rows")) - if max_rows and len(self.index) > max_rows: - result = self._tidy_repr(min(30, max_rows - 4)) - elif len(self.index) > 0: - result = self._get_repr(print_header=True, - length=len(self) > 50, - name=True, - dtype=True) - elif self.name is None: - result = u('Series([], dtype: %s)') % (self.dtype) - else: - result = u('Series([], name: %s, dtype: %s)') % (self.name, - self.dtype) - return result - def _tidy_repr(self, max_vals=20): - """ + self.to_string(buf=buf, name=self.name, dtype=self.dtype, + max_rows=max_rows) + result = buf.getvalue() - Internal function, should always return unicode string - """ - if max_vals > 1: - num = max_vals // 2 - else: - num = 1 - max_vals = 2 - head = self.iloc[:num]._get_repr(print_header=True, length=False, - dtype=False, name=False) - tail = self.iloc[-(max_vals - num):]._get_repr(print_header=False, - length=False, - name=False, - dtype=False) - result = head + '\n...\n' + tail - result = '%s\n%s' % (result, self._repr_footer()) - - return compat.text_type(result) + return result def _repr_footer(self): @@ -948,8 +921,8 @@ def _repr_footer(self): len(self), str(self.dtype.name)) - def to_string(self, buf=None, na_rep='NaN', float_format=None, - length=False, dtype=False, name=False): + def to_string(self, buf=None, na_rep='NaN', float_format=None, header=True, + length=False, dtype=False, name=False, max_rows=None): """ Render a string representation of the Series @@ -962,12 +935,17 @@ def to_string(self, buf=None, na_rep='NaN', float_format=None, float_format : one-parameter function, optional formatter function to apply to columns' elements if they are floats default None + header: boolean, default True + Add the Series header (index name) length : boolean, default False Add the Series length dtype : boolean, default False Add the Series dtype name : boolean, default False - Add the Series name (which may be None) + Add the Series name if not None + max_rows : int, optional + Maximum number of rows to show before truncating. If None, show + all. Returns ------- @@ -975,7 +953,8 @@ def to_string(self, buf=None, na_rep='NaN', float_format=None, """ the_repr = self._get_repr(float_format=float_format, na_rep=na_rep, - length=length, dtype=dtype, name=name) + header=header, length=length, dtype=dtype, + name=name, max_rows=max_rows) # catch contract violations if not isinstance(the_repr, compat.text_type): @@ -993,17 +972,18 @@ def to_string(self, buf=None, na_rep='NaN', float_format=None, f.write(the_repr) def _get_repr( - self, name=False, print_header=False, length=True, dtype=True, - na_rep='NaN', float_format=None): + self, name=False, header=True, length=True, dtype=True, na_rep='NaN', + float_format=None, max_rows=None): """ Internal function, should always return unicode string """ - - formatter = fmt.SeriesFormatter(self, name=name, header=print_header, - length=length, dtype=dtype, + formatter = fmt.SeriesFormatter(self, name=name, + length=length, header=header, + dtype=dtype, na_rep=na_rep, - float_format=float_format) + float_format=float_format, + max_rows=max_rows) result = formatter.to_string() # TODO: following check prob. not neces. 
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index e784934ea28b2..d95465d524e27 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -18,6 +18,7 @@ from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel from pandas.sparse.array import BlockIndex, IntIndex from pandas.tseries.api import PeriodIndex, DatetimeIndex +from pandas.tseries.tdi import TimedeltaIndex from pandas.core.base import StringMixin from pandas.core.common import adjoin, pprint_thing from pandas.core.algorithms import match, unique @@ -1782,13 +1783,13 @@ def set_atom(self, block, block_items, existing_col, min_itemsize, return self.set_atom_timedelta64(block) dtype = block.dtype.name - rvalues = block.values.ravel() - inferred_type = lib.infer_dtype(rvalues) + inferred_type = lib.infer_dtype(block.values) if inferred_type == 'date': raise TypeError( "[date] is not implemented as a table column") elif inferred_type == 'datetime': + rvalues = block.values.ravel() if getattr(rvalues[0], 'tzinfo', None) is not None: # if this block has more than one timezone, raise @@ -1917,7 +1918,7 @@ def get_atom_data(self, block, kind=None): def set_atom_data(self, block): self.kind = block.dtype.name self.typ = self.get_atom_data(block) - self.set_data(block.values.astype(self.typ.type)) + self.set_data(block.values.astype(self.typ.type, copy=False)) def set_atom_categorical(self, block, items, info=None, values=None): # currently only supports a 1-D categorical @@ -2016,7 +2017,7 @@ def convert(self, values, nan_rep, encoding): index = DatetimeIndex( self.data.ravel(), tz='UTC').tz_convert(self.tz) - self.data = np.array( + self.data = np.asarray( index.tolist(), dtype=object).reshape(self.data.shape) else: @@ -2026,14 +2027,14 @@ def convert(self, values, nan_rep, encoding): self.data = np.asarray(self.data, dtype='m8[ns]') elif dtype == u('date'): try: - self.data = np.array( + self.data = np.asarray( [date.fromordinal(v) for v in self.data], dtype=object) except ValueError: - self.data = np.array( + self.data = np.asarray( [date.fromtimestamp(v) for v in self.data], dtype=object) elif dtype == u('datetime'): - self.data = np.array( + self.data = np.asarray( [datetime.fromtimestamp(v) for v in self.data], dtype=object) @@ -2048,9 +2049,9 @@ def convert(self, values, nan_rep, encoding): else: try: - self.data = self.data.astype(dtype) + self.data = self.data.astype(dtype, copy=False) except: - self.data = self.data.astype('O') + self.data = self.data.astype('O', copy=False) # convert nans / decode if _ensure_decoded(self.kind) == u('string'): @@ -2337,9 +2338,9 @@ def read_array(self, key): ret = data if dtype == u('datetime64'): - ret = np.array(ret, dtype='M8[ns]') + ret = np.asarray(ret, dtype='M8[ns]') elif dtype == u('timedelta64'): - ret = np.array(ret, dtype='m8[ns]') + ret = np.asarray(ret, dtype='m8[ns]') if transposed: return ret.T @@ -3793,7 +3794,7 @@ def write_data(self, chunksize, dropna=True): # figure the mask: only do if we can successfully process this # column, otherwise ignore the mask mask = com.isnull(a.data).all(axis=0) - masks.append(mask.astype('u1')) + masks.append(mask.astype('u1', copy=False)) # consolidate masks mask = masks[0] @@ -3803,8 +3804,7 @@ def write_data(self, chunksize, dropna=True): else: - mask = np.empty(nrows, dtype='u1') - mask.fill(False) + mask = None # broadcast the indexes if needed indexes = [a.cvalues for a in self.index_axes] @@ -3833,12 +3833,13 @@ def write_data(self, chunksize, dropna=True): bvalues = [] for i, v in enumerate(values): 
new_shape = (nrows,) + self.dtype[names[nindexes + i]].shape - bvalues.append(values[i].ravel().reshape(new_shape)) + bvalues.append(values[i].reshape(new_shape)) # write the chunks if chunksize is None: chunksize = 100000 + rows = np.empty(min(chunksize,nrows), dtype=self.dtype) chunks = int(nrows / chunksize) + 1 for i in range(chunks): start_i = i * chunksize @@ -3847,11 +3848,20 @@ def write_data(self, chunksize, dropna=True): break self.write_data_chunk( + rows, indexes=[a[start_i:end_i] for a in bindexes], - mask=mask[start_i:end_i], + mask=mask[start_i:end_i] if mask is not None else None, values=[v[start_i:end_i] for v in bvalues]) - def write_data_chunk(self, indexes, mask, values): + def write_data_chunk(self, rows, indexes, mask, values): + """ + Parameters + ---------- + rows : an empty memory space where we are putting the chunk + indexes : an array of the indexes + mask : an array of the masks + values : an array of the values + """ # 0 len for v in values: @@ -3860,7 +3870,8 @@ def write_data_chunk(self, indexes, mask, values): try: nrows = indexes[0].shape[0] - rows = np.empty(nrows, dtype=self.dtype) + if nrows != len(rows): + rows = np.empty(nrows, dtype=self.dtype) names = self.dtype.names nindexes = len(indexes) @@ -3873,7 +3884,10 @@ def write_data_chunk(self, indexes, mask, values): rows[names[i + nindexes]] = v # mask - rows = rows[~mask.ravel().astype(bool)] + if mask is not None: + m = ~mask.ravel().astype(bool, copy=False) + if not m.all(): + rows = rows[m] except Exception as detail: raise Exception("cannot create row-data -> %s" % detail) @@ -4220,6 +4234,11 @@ def _convert_index(index, encoding=None, format_type=None): freq=getattr(index, 'freq', None), tz=getattr(index, 'tz', None), index_name=index_name) + elif isinstance(index, TimedeltaIndex): + converted = index.asi8 + return IndexCol(converted, 'timedelta64', _tables().Int64Col(), + freq=getattr(index, 'freq', None), + index_name=index_name) elif isinstance(index, (Int64Index, PeriodIndex)): atom = _tables().Int64Col() return IndexCol( @@ -4239,15 +4258,20 @@ def _convert_index(index, encoding=None, format_type=None): freq=getattr(index, 'freq', None), tz=getattr(index, 'tz', None), index_name=index_name) + elif inferred_type == 'timedelta64': + converted = values.view('i8') + return IndexCol(converted, 'timedelta64', _tables().Int64Col(), + freq=getattr(index, 'freq', None), + index_name=index_name) elif inferred_type == 'datetime': - converted = np.array([(time.mktime(v.timetuple()) + - v.microsecond / 1E6) for v in values], - dtype=np.float64) + converted = np.asarray([(time.mktime(v.timetuple()) + + v.microsecond / 1E6) for v in values], + dtype=np.float64) return IndexCol(converted, 'datetime', _tables().Time64Col(), index_name=index_name) elif inferred_type == 'date': - converted = np.array([v.toordinal() for v in values], - dtype=np.int32) + converted = np.asarray([v.toordinal() for v in values], + dtype=np.int32) return IndexCol(converted, 'date', _tables().Time32Col(), index_name=index_name) elif inferred_type == 'string': @@ -4289,22 +4313,24 @@ def _unconvert_index(data, kind, encoding=None): kind = _ensure_decoded(kind) if kind == u('datetime64'): index = DatetimeIndex(data) + elif kind == u('timedelta64'): + index = TimedeltaIndex(data) elif kind == u('datetime'): - index = np.array([datetime.fromtimestamp(v) for v in data], - dtype=object) + index = np.asarray([datetime.fromtimestamp(v) for v in data], + dtype=object) elif kind == u('date'): try: - index = np.array( + index = np.asarray( 
[date.fromordinal(v) for v in data], dtype=object) except (ValueError): - index = np.array( + index = np.asarray( [date.fromtimestamp(v) for v in data], dtype=object) elif kind in (u('integer'), u('float')): - index = np.array(data) + index = np.asarray(data) elif kind in (u('string')): index = _unconvert_string_array(data, nan_rep=None, encoding=encoding) elif kind == u('object'): - index = np.array(data[0]) + index = np.asarray(data[0]) else: # pragma: no cover raise ValueError('unrecognized index type %s' % kind) return index @@ -4315,7 +4341,7 @@ def _unconvert_index_legacy(data, kind, legacy=False, encoding=None): if kind == u('datetime'): index = lib.time64_to_datetime(data) elif kind in (u('integer')): - index = np.array(data, dtype=object) + index = np.asarray(data, dtype=object) elif kind in (u('string')): index = _unconvert_string_array(data, nan_rep=None, encoding=encoding) else: # pragma: no cover @@ -4334,13 +4360,13 @@ def _convert_string_array(data, encoding, itemsize=None): if itemsize is None: itemsize = lib.max_len_string_array(com._ensure_object(data.ravel())) - data = np.array(data, dtype="S%d" % itemsize) + data = np.asarray(data, dtype="S%d" % itemsize) return data def _unconvert_string_array(data, nan_rep=None, encoding=None): """ deserialize a string array, possibly decoding """ shape = data.shape - data = np.array(data.ravel(), dtype=object) + data = np.asarray(data.ravel(), dtype=object) # guard against a None encoding in PY3 (because of a legacy # where the passed encoding is actually None) @@ -4353,7 +4379,7 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None): dtype = "U{0}".format(itemsize) else: dtype = "S{0}".format(itemsize) - data = data.astype(dtype).astype(object) + data = data.astype(dtype, copy=False).astype(object, copy=False) except (Exception) as e: f = np.vectorize(lambda x: x.decode(encoding), otypes=[np.object]) data = f(data) @@ -4376,7 +4402,7 @@ def _maybe_convert(values, val_kind, encoding): def _get_converter(kind, encoding): kind = _ensure_decoded(kind) if kind == 'datetime64': - return lambda x: np.array(x, dtype='M8[ns]') + return lambda x: np.asarray(x, dtype='M8[ns]') elif kind == 'datetime': return lib.convert_timestamps elif kind == 'string': @@ -4421,7 +4447,7 @@ def __init__(self, table, where=None, start=None, stop=None, **kwargs): try: inferred = lib.infer_dtype(where) if inferred == 'integer' or inferred == 'boolean': - where = np.array(where) + where = np.asarray(where) if where.dtype == np.bool_: start, stop = self.start, self.stop if start is None: diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index e95d46f66f17f..ea30bb9c9ae44 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -10,7 +10,7 @@ import pandas from pandas import (Series, DataFrame, Panel, MultiIndex, Categorical, bdate_range, - date_range, Index, DatetimeIndex, isnull) + date_range, timedelta_range, Index, DatetimeIndex, TimedeltaIndex, isnull) from pandas.io.pytables import _tables try: @@ -4593,12 +4593,17 @@ def test_categorical(self): with ensure_clean_store(self.path) as store: - s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'], categories=['a','b','c','d'])) - + # basic + s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'], categories=['a','b','c','d'], ordered=False)) store.append('s', s, format='table') result = store.select('s') tm.assert_series_equal(s, result) + s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'], categories=['a','b','c','d'], ordered=True)) + 
store.append('s_ordered', s, format='table') + result = store.select('s_ordered') + tm.assert_series_equal(s, result) + df = DataFrame({"s":s, "vals":[1,2,3,4,5,6]}) store.append('df', df, format='table') result = store.select('df') @@ -4639,6 +4644,10 @@ def test_categorical(self): result = store.select('df3', where = ['s in ["b","c"]']) tm.assert_frame_equal(result, expected) + expected = df[df.s.isin(['b','c'])] + result = store.select('df3', where = ['s = ["b","c"]']) + tm.assert_frame_equal(result, expected) + expected = df[df.s.isin(['d'])] result = store.select('df3', where = ['s in ["d"]']) tm.assert_frame_equal(result, expected) @@ -4678,6 +4687,18 @@ def test_duplicate_column_name(self): other = read_hdf(path, 'df') tm.assert_frame_equal(df, other) + def test_preserve_timedeltaindex_type(self): + # GH9635 + # Storing TimedeltaIndexed DataFrames in fixed stores did not preserve + # the type of the index. + df = DataFrame(np.random.normal(size=(10,5))) + df.index = timedelta_range(start='0s',periods=10,freq='1s',name='example') + + with ensure_clean_store(self.path) as store: + + store['df'] = df + assert_frame_equal(store['df'], df) + def _test_sort(obj): if isinstance(obj, DataFrame): diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index cd78fd22e64ca..7f4b3fcb94dfa 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -13,6 +13,7 @@ from pandas import Categorical, Index, Series, DataFrame, PeriodIndex, Timestamp +from pandas.core.config import option_context import pandas.core.common as com import pandas.compat as compat import pandas.util.testing as tm @@ -1559,12 +1560,12 @@ def test_repr(self): self.assertEqual(exp, a.__unicode__()) - a = pd.Series(pd.Categorical(["a","b"] *25, name="a", ordered=True)) - exp = u("".join(["%s a\n%s b\n"%(i,i+1) for i in range(0,10,2)]) + "...\n" + - "".join(["%s a\n%s b\n"%(i,i+1) for i in range(40,50,2)]) + - "Name: a, Length: 50, dtype: category\n" + - "Categories (2, object): [a < b]") - self.assertEqual(exp,a._tidy_repr()) + a = pd.Series(pd.Categorical(["a","b"] *25, name="a")) + exp = u("0 a\n1 b\n" + " ..\n" + + "48 a\n49 b\n" + + "Name: a, dtype: category\nCategories (2, object): [a, b]") + with option_context("display.max_rows", 5): + self.assertEqual(exp, repr(a)) levs = list("abcdefghijklmnopqrstuvwxyz") a = pd.Series(pd.Categorical(["a","b"], name="a", categories=levs, ordered=True)) @@ -1573,7 +1574,6 @@ def test_repr(self): "Categories (26, object): [a < b < c < d ... 
w < x < y < z]") self.assertEqual(exp,a.__unicode__()) - def test_info(self): # make sure it works diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py index b52e4f7e3947b..94a7dd4dd9e87 100644 --- a/pandas/tests/test_format.py +++ b/pandas/tests/test_format.py @@ -2438,16 +2438,16 @@ def test_to_string(self): # pass float_format format = '%.4f'.__mod__ result = self.ts.to_string(float_format=format) - result = [x.split()[1] for x in result.split('\n')] + result = [x.split()[1] for x in result.split('\n')[:-1]] expected = [format(x) for x in self.ts] self.assertEqual(result, expected) # empty string result = self.ts[:0].to_string() - self.assertEqual(result, '') + self.assertEqual(result, 'Series([], Freq: B)') result = self.ts[:0].to_string(length=0) - self.assertEqual(result, '') + self.assertEqual(result, 'Series([], Freq: B)') # name and length cp = self.ts.copy() @@ -2623,7 +2623,7 @@ def test_max_multi_index_display(self): with option_context("display.max_rows", 2): self.assertEqual(len(str(s).split('\n')),5) with option_context("display.max_rows", 1): - self.assertEqual(len(str(s).split('\n')),5) + self.assertEqual(len(str(s).split('\n')),4) with option_context("display.max_rows", 0): self.assertEqual(len(str(s).split('\n')),10) @@ -2637,10 +2637,137 @@ def test_max_multi_index_display(self): with option_context("display.max_rows", 2): self.assertEqual(len(str(s).split('\n')),4) with option_context("display.max_rows", 1): - self.assertEqual(len(str(s).split('\n')),4) + self.assertEqual(len(str(s).split('\n')),3) with option_context("display.max_rows", 0): self.assertEqual(len(str(s).split('\n')),9) + # Make sure #8532 is fixed + def test_consistent_format(self): + s = pd.Series([1,1,1,1,1,1,1,1,1,1,0.9999,1,1]*10) + with option_context("display.max_rows", 10): + res = repr(s) + exp = ('0 1.0000\n1 1.0000\n2 1.0000\n3 ' + '1.0000\n4 1.0000\n ... \n125 ' + '1.0000\n126 1.0000\n127 0.9999\n128 ' + '1.0000\n129 1.0000\ndtype: float64') + self.assertEqual(res, exp) + + @staticmethod + def gen_test_series(): + s1 = pd.Series(['a']*100) + s2 = pd.Series(['ab']*100) + s3 = pd.Series(['a', 'ab', 'abc', 'abcd', 'abcde', 'abcdef']) + s4 = s3[::-1] + test_sers = {'onel': s1, 'twol': s2, 'asc': s3, 'desc': s4} + return test_sers + + def chck_ncols(self, s): + with option_context("display.max_rows", 10): + res = repr(s) + lines = res.split('\n') + lines = [line for line in repr(s).split('\n') \ + if not re.match('[^\.]*\.+', line)][:-1] + ncolsizes = len(set(len(line.strip()) for line in lines)) + self.assertEqual(ncolsizes, 1) + + def test_format_explicit(self): + test_sers = self.gen_test_series() + with option_context("display.max_rows", 4): + res = repr(test_sers['onel']) + exp = '0 a\n1 a\n ..\n98 a\n99 a\ndtype: object' + self.assertEqual(exp, res) + res = repr(test_sers['twol']) + exp = ('0 ab\n1 ab\n ..\n98 ab\n99 ab\ndtype:' + ' object') + self.assertEqual(exp, res) + res = repr(test_sers['asc']) + exp = ('0 a\n1 ab\n ... \n4 abcde\n5' + ' abcdef\ndtype: object') + self.assertEqual(exp, res) + res = repr(test_sers['desc']) + exp = ('5 abcdef\n4 abcde\n ... 
\n1 ab\n0' + ' a\ndtype: object') + self.assertEqual(exp, res) + + def test_ncols(self): + test_sers = self.gen_test_series() + for s in test_sers.values(): + self.chck_ncols(s) + + def test_max_rows_eq_one(self): + s = Series(range(10)) + with option_context("display.max_rows", 1): + strrepr = repr(s).split('\n') + exp1 = ['0', '0'] + res1 = strrepr[0].split() + self.assertEqual(exp1, res1) + exp2 = ['..'] + res2 = strrepr[1].split() + self.assertEqual(exp2, res2) + + def test_truncate_ndots(self): + def getndots(s): + return len(re.match('[^\.]*(\.*)', s).groups()[0]) + + s = Series([0, 2, 3, 6]) + with option_context("display.max_rows", 2): + strrepr = repr(s).replace('\n', '') + self.assertEqual(getndots(strrepr), 2) + + s = Series([0, 100, 200, 400]) + with option_context("display.max_rows", 2): + strrepr = repr(s).replace('\n', '') + self.assertEqual(getndots(strrepr), 3) + + def test_to_string_name(self): + s = Series(range(100)) + s.name = 'myser' + res = s.to_string(max_rows=2, name=True) + exp = '0 0\n ..\n99 99\nName: myser' + self.assertEqual(res, exp) + res = s.to_string(max_rows=2, name=False) + exp = '0 0\n ..\n99 99' + self.assertEqual(res, exp) + + def test_to_string_dtype(self): + s = Series(range(100)) + res = s.to_string(max_rows=2, dtype=True) + exp = '0 0\n ..\n99 99\ndtype: int64' + self.assertEqual(res, exp) + res = s.to_string(max_rows=2, dtype=False) + exp = '0 0\n ..\n99 99' + self.assertEqual(res, exp) + + def test_to_string_length(self): + s = Series(range(100)) + res = s.to_string(max_rows=2, length=True) + exp = '0 0\n ..\n99 99\nLength: 100' + self.assertEqual(res, exp) + + def test_to_string_na_rep(self): + s = pd.Series(index=range(100)) + res = s.to_string(na_rep='foo', max_rows=2) + exp = '0 foo\n ..\n99 foo' + self.assertEqual(res, exp) + + def test_to_string_float_format(self): + s = pd.Series(range(10), dtype=float) + res = s.to_string(float_format=lambda x: '{0:2.1f}'.format(x), + max_rows=2) + exp = '0 0.0\n ..\n9 9.0' + self.assertEqual(res, exp) + + def test_to_string_header(self): + s = pd.Series(range(10)) + s.index.name = 'foo' + res = s.to_string(header=True, max_rows=2) + exp = 'foo\n0 0\n ..\n9 9' + self.assertEqual(res, exp) + res = s.to_string(header=False, max_rows=2) + exp = '0 0\n ..\n9 9' + self.assertEqual(res, exp) + + class TestEngFormatter(tm.TestCase): _multiprocess_can_split_ = True diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index 18baa941b814a..ee6140828882c 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -3751,14 +3751,6 @@ def f(): assert_series_equal(s,df.iloc[:,0].order()) assert_series_equal(s,df[0].order()) - # operating on a copy - df = pd.DataFrame({'a': list(range(4)), 'b': list('ab..'), 'c': ['a', 'b', np.nan, 'd']}) - mask = pd.isnull(df.c) - - def f(): - df[['c']][mask] = df[['b']][mask] - self.assertRaises(com.SettingWithCopyError, f) - # false positives GH6025 df = DataFrame ({'column1':['a', 'a', 'a'], 'column2': [4,8,9] }) str(df) @@ -3790,6 +3782,24 @@ def f(): df['C'][2] = 'foo' self.assertRaises(com.SettingWithCopyError, f) + def test_setting_with_copy_bug(self): + + # operating on a copy + df = pd.DataFrame({'a': list(range(4)), 'b': list('ab..'), 'c': ['a', 'b', np.nan, 'd']}) + mask = pd.isnull(df.c) + + def f(): + df[['c']][mask] = df[['b']][mask] + self.assertRaises(com.SettingWithCopyError, f) + + # invalid warning as we are returning a new object + # GH 8730 + df1 = DataFrame({'x': Series(['a','b','c']), 'y': Series(['d','e','f'])}) + df2 
= df1[['x']] + + # this should not raise + df2['y'] = ['g', 'h', 'i'] + def test_detect_chained_assignment_warnings(self): # warnings diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 7e0dbaa735456..ae2ed4eaca2f4 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -2046,7 +2046,7 @@ def test_repr(self): # with empty series (#4651) s = Series([], dtype=np.int64, name='foo') - self.assertEqual(repr(s), 'Series([], name: foo, dtype: int64)') + self.assertEqual(repr(s), 'Series([], Name: foo, dtype: int64)') s = Series([], dtype=np.int64, name=None) self.assertEqual(repr(s), 'Series([], dtype: int64)')
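The ``SettingWithCopy`` change in ``pandas/core/generic.py`` can be exercised end to end. A short sketch, assuming a build that includes this patch and the default ``mode.chained_assignment = 'warn'``: the patched ``_check_setitem_copy`` treats a referenced frame whose shape matches the current frame as a false positive and clears ``is_copy`` instead of warning.

.. code-block:: python

    import warnings

    import pandas as pd

    df1 = pd.DataFrame({'x': ['a', 'b', 'c'], 'y': ['d', 'e', 'f']})
    df2 = df1[['x']]  # column selection returns a new frame, not a view

    with warnings.catch_warnings():
        warnings.simplefilter('error')  # any warning would raise here
        df2['y'] = ['g', 'h', 'i']      # GH8730: no spurious SettingWithCopyWarning

    # the parent frame is untouched
    assert list(df1['y']) == ['d', 'e', 'f']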