diff --git a/doc/source/io.rst b/doc/source/io.rst
index d123e8fcbc8a1..653b01f1011ca 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -915,8 +915,8 @@ a subset of the data. This allows one to have a very large on-disk table and ret
 
 A query is specified using the ``Term`` class under the hood.
 
-  - 'index' and 'column' are supported indexers of a DataFrame
-  - 'major_axis' and 'minor_axis' are supported indexers of the Panel
+  - 'index' and 'columns' are supported indexers of a DataFrame
+  - 'major_axis', 'minor_axis', and 'items' are supported indexers of the Panel
 
 Valid terms can be created from ``dict, list, tuple, or string``. Objects can be embedded as values. Allowed operations are: ``<, <=, >, >=, =``. ``=`` will be inferred as an implicit set operation (e.g. if 2 or more values are provided). The following are all valid terms.
 
@@ -925,7 +925,7 @@ Valid terms can be created from ``dict, list, tuple, or string``. Objects can be
   - ``'index>20121114'``
   - ``('index', '>', datetime(2012,11,14))``
   - ``('index', ['20121114','20121115'])``
-  - ``('major', '=', Timestamp('2012/11/14'))``
+  - ``('major_axis', '=', Timestamp('2012/11/14'))``
   - ``('minor_axis', ['A','B'])``
 
 Queries are built up using a list of ``Terms`` (currently only **anding** of terms is supported). An example query for a panel might be specified as follows.
 
@@ -934,6 +934,7 @@ Queries are built up using a list of ``Terms`` (currently only **anding** of ter
 .. ipython:: python
 
    store.append('wp',wp)
+   store
    store.select('wp',[ 'major_axis>20000102', ('minor_axis', '=', ['A','B']) ])
 
 Delete from a Table
@@ -941,7 +942,7 @@
 
 .. ipython:: python
 
-   store.remove('wp', 'index>20000102' )
+   store.remove('wp', 'major_axis>20000102' )
    store.select('wp')
 
 Notes & Caveats
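To make the renamed indexers concrete, here is a minimal self-contained session in the spirit of the docs above; the file name 'query_example.h5' and the use of tm.makePanel() are illustrative assumptions, not part of the patch:

import pandas.util.testing as tm
from pandas.io.pytables import HDFStore, Term

# hypothetical store, only to exercise the new indexer names
store = HDFStore('query_example.h5')
wp = tm.makePanel()
store.append('wp', wp)

# Panel queries now name the axes directly ('major_axis', 'minor_axis', 'items');
# the old 'major'/'minor'/'column' aliases are gone
subset = store.select('wp', [Term('major_axis', '>', '20000102'),
                             Term('minor_axis', '=', ['A', 'B'])])
store.remove('wp', 'major_axis>20000102')
store.close()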
diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py
index 7ad9147faa955..1848ec8e7a2c3 100644
--- a/pandas/core/reshape.py
+++ b/pandas/core/reshape.py
@@ -759,3 +759,37 @@ def block2d_to_block3d(values, items, shape, major_labels, minor_labels,
         ref_items = items
 
     return make_block(pvalues, items, ref_items)
+
+def block2d_to_blocknd(values, items, shape, labels, ref_items=None):
+    """ pivot to the labels shape """
+    from pandas.core.internals import make_block
+    panel_shape = (len(items),) + shape
+
+    # TODO: lexsort depth needs to be 2!!
+
+    # Create observation selection vector using major and minor
+    # labels, for converting to panel format.
+    selector = factor_indexer(shape[1:],labels)
+    mask = np.zeros(np.prod(shape), dtype=bool)
+    mask.put(selector, True)
+
+    pvalues = np.empty(panel_shape, dtype=values.dtype)
+    if not issubclass(pvalues.dtype.type, (np.integer, np.bool_)):
+        pvalues.fill(np.nan)
+    elif not mask.all():
+        pvalues = com._maybe_upcast(pvalues)
+        pvalues.fill(np.nan)
+
+    values = values
+    for i in xrange(len(items)):
+        pvalues[i].flat[mask] = values[:, i]
+
+    if ref_items is None:
+        ref_items = items
+
+    return make_block(pvalues, items, ref_items)
+
+def factor_indexer(shape, labels):
+    """ given a tuple of shape and a list of Factor labels, return the expanded label indexer """
+    mult = np.array(shape)[::-1].cumprod()[::-1]
+    return np.sum(np.array(labels).T * np.append(mult,[1]), axis=1).T
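As a standalone sanity check (not part of the patch), the following sketch shows what factor_indexer computes: given label arrays indexing an array of full shape (J, K, L), passing only the trailing dimensions yields the same flat offsets as numpy's ravel_multi_index:

import numpy as np

def factor_indexer(shape, labels):
    # local copy of the helper added above, for the check below
    mult = np.array(shape)[::-1].cumprod()[::-1]
    return np.sum(np.array(labels).T * np.append(mult, [1]), axis=1).T

J, K, L = 2, 3, 4
labels = [np.array([0, 1]), np.array([2, 0]), np.array([3, 1])]

flat = factor_indexer((K, L), labels)                # trailing dims only, as block2d_to_blocknd does
expected = np.ravel_multi_index(labels, (J, K, L))   # i*K*L + j*L + k
assert (flat == expected).all()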
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 2e75d3f067b3a..86627563854b3 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -13,7 +13,7 @@
 import numpy as np
 from pandas import (
-    Series, TimeSeries, DataFrame, Panel, Index, MultiIndex, Int64Index
+    Series, TimeSeries, DataFrame, Panel, Panel4D, Index, MultiIndex, Int64Index
 )
 from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel
 from pandas.sparse.array import BlockIndex, IntIndex
@@ -24,7 +24,7 @@
 from pandas.core.categorical import Factor
 from pandas.core.common import _asarray_tuplesafe, _try_sort
 from pandas.core.internals import BlockManager, make_block, form_blocks
-from pandas.core.reshape import block2d_to_block3d
+from pandas.core.reshape import block2d_to_block3d, block2d_to_blocknd, factor_indexer
 import pandas.core.common as com
 from pandas.tools.merge import concat
 
@@ -42,6 +42,7 @@
     DataFrame: 'frame',
     SparseDataFrame: 'sparse_frame',
     Panel: 'wide',
+    Panel4D : 'ndim',
     SparsePanel: 'sparse_panel'
 }
 
@@ -632,10 +633,19 @@ def _write_wide_table(self, group, panel, append=False, comp=None, **kwargs):
         t.write(axes_to_index=[1,2], obj=panel,
                 append=append, compression=comp, **kwargs)
 
+    def _write_ndim_table(self, group, obj, append=False, comp=None, axes_to_index=None, **kwargs):
+        if axes_to_index is None:
+            axes_to_index=[1,2,3]
+        t = create_table(self, group, typ = 'appendable_ndim')
+        t.write(axes_to_index=axes_to_index, obj=obj,
+                append=append, compression=comp, **kwargs)
+
     def _read_wide_table(self, group, where=None):
         t = create_table(self, group)
         return t.read(where)
 
+    _read_ndim_table = _read_wide_table
+
     def _write_index(self, group, key, index):
         if isinstance(index, MultiIndex):
             setattr(group._v_attrs, '%s_variety' % key, 'multi')
@@ -1098,6 +1108,7 @@ class Table(object):
     """
     table_type = None
+    obj_type = None
     ndim = None
 
     def __init__(self, parent, group):
@@ -1108,13 +1119,17 @@ def __init__(self, parent, group):
         self.values_axes = []
         self.selection = None
 
+    @property
+    def table_type_short(self):
+        return self.table_type.split('_')[0]
+
     @property
     def pandas_type(self):
         return getattr(self.group._v_attrs,'pandas_type',None)
 
     def __repr__(self):
         """ return a pretty representation of myself """
-        return "%s (typ->%s,nrows->%s)" % (self.pandas_type,self.table_type,self.nrows)
+        return "%s (typ->%s,nrows->%s)" % (self.pandas_type,self.table_type_short,self.nrows)
 
     __str__ = __repr__
 
@@ -1163,9 +1178,9 @@ def description(self):
     def axes(self):
         return itertools.chain(self.index_axes, self.values_axes)
 
-    def kinds_map(self):
-        """ return a list of the kinds for each columns """
-        return [ (a.cname,a.kind) for a in self.index_axes ]
+    def queryables(self):
+        """ return a dict of the allowable columns (and their kinds) for this object """
+        return dict([ (a.cname,a.kind) for a in self.index_axes ] + [ (self.obj_type._AXIS_NAMES[axis],None) for axis, values in self.non_index_axes ])
 
     def index_cols(self):
         """ return a list of my index cols """
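With Panel4D mapped to the 'ndim' kind, HDFStore dispatches put/append to the new _write_ndim_table handler; a small illustrative session follows (the file name is a placeholder, and the behaviour described in the comments is a reading of the patch, not output from it):

import pandas.util.testing as tm
from pandas.io.pytables import HDFStore

store = HDFStore('ndim_example.h5')
p4d = tm.makePanel4D()

# Panel4D -> 'ndim' in _TYPE_MAP, so table=True routes through _write_ndim_table,
# which indexes axes 1, 2 and 3 (items, major_axis, minor_axis) by default
store.put('p4d', p4d, table=True)
store['p4d']     # reading goes through _read_ndim_table (an alias for _read_wide_table)
store.close()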
@@ -1386,38 +1401,37 @@ def write(self, **kwargs):
         raise Exception("write operations are not allowed on legacy tables!")
 
     def read(self, where=None):
-        """ we have 2 indexable columns, with an arbitrary number of data axes """
+        """ we have n indexable columns, with an arbitrary number of data axes """
         self.read_axes(where)
 
-        index = self.index_axes[0].values
-        column = self.index_axes[1].values
-
-        major = Factor.from_array(index)
-        minor = Factor.from_array(column)
+        indicies = [ i.values for i in self.index_axes ]
+        factors = [ Factor.from_array(i) for i in indicies ]
+        levels = [ f.levels for f in factors ]
+        N = [ len(f.levels) for f in factors ]
+        labels = [ f.labels for f in factors ]
 
-        J, K = len(major.levels), len(minor.levels)
-        key = major.labels * K + minor.labels
+        # compute the key
+        key = factor_indexer(N[1:], labels)
 
-        panels = []
+        objs = []
         if len(unique(key)) == len(key):
-            sorter, _ = algos.groupsort_indexer(com._ensure_int64(key), J * K)
+
+            sorter, _ = algos.groupsort_indexer(com._ensure_int64(key), np.prod(N))
             sorter = com._ensure_platform_int(sorter)
 
-            # create the panels
+            # create the objs
             for c in self.values_axes:
 
                 # the data need to be sorted
                 sorted_values = c.data.take(sorter, axis=0)
 
-                major_labels = major.labels.take(sorter)
-                minor_labels = minor.labels.take(sorter)
-                items = Index(c.values)
-
-                block = block2d_to_block3d(sorted_values, items, (J, K),
-                                           major_labels, minor_labels)
+
+                take_labels = [ l.take(sorter) for l in labels ]
+                items = Index(c.values)
 
-                mgr = BlockManager([block], [items, major.levels, minor.levels])
-                panels.append(Panel(mgr))
+                block = block2d_to_blocknd(sorted_values, items, tuple(N), take_labels)
+                mgr = BlockManager([block], [items] + levels)
+                objs.append(self.obj_type(mgr))
 
         else:
             if not self._quiet:  # pragma: no cover
@@ -1425,9 +1439,8 @@
                           'appended')
 
             # reconstruct
-            long_index = MultiIndex.from_arrays([index, column])
+            long_index = MultiIndex.from_arrays(indicies)
 
-            panels = []
             for c in self.values_axes:
                 lp = DataFrame(c.data, index=long_index, columns=c.values)
@@ -1444,10 +1457,10 @@
                 new_values = lp.values.take(indexer, axis=0)
                 lp = DataFrame(new_values, index=new_index, columns=lp.columns)
-                panels.append(lp.to_panel())
+                objs.append(lp.to_panel())
 
-        # append the panels
-        wp = concat(panels, axis = 0, verify_integrity = True)
+        # create the composite object
+        wp = concat(objs, axis = 0, verify_integrity = True)
 
         # reorder by any non_index_axes
         for axis,labels in self.non_index_axes:
@@ -1462,12 +1475,14 @@ class LegacyFrameTable(LegacyTable):
     """ support the legacy frame table """
     table_type = 'legacy_frame'
+    obj_type = Panel
 
     def read(self, *args, **kwargs):
         return super(LegacyFrameTable, self).read(*args, **kwargs)['value']
 
 class LegacyPanelTable(LegacyTable):
     """ support the legacy panel table """
     table_type = 'legacy_panel'
+    obj_type = Panel
 
 class AppendableTable(LegacyTable):
     """ support the new appendable table formats """
@@ -1586,6 +1601,7 @@ class AppendableFrameTable(AppendableTable):
     """ support the new appendable table formats """
     table_type = 'appendable_frame'
     ndim = 2
+    obj_type = DataFrame
 
     def read(self, where=None):
 
@@ -1620,11 +1636,19 @@ class AppendablePanelTable(AppendableTable):
     """ support the new appendable table formats """
     table_type = 'appendable_panel'
     ndim = 3
+    obj_type = Panel
+
+class AppendableNDimTable(AppendablePanelTable):
+    """ support the new appendable table formats """
+    table_type = 'appendable_ndim'
+    ndim = 4
+    obj_type = Panel4D
 
 # table maps
 _TABLE_MAP = {
     'appendable_frame' : AppendableFrameTable,
     'appendable_panel' : AppendablePanelTable,
+    'appendable_ndim' : AppendableNDimTable,
     'worm' : WORMTable,
     'legacy_frame' : LegacyFrameTable,
     'legacy_panel' : LegacyPanelTable,
@@ -1818,7 +1842,7 @@ class Term(object):
     op   : a valid op (defaults to '=') (optional)
            >, >=, <, <=, =, != (not equal) are allowed
     value : a value or list of values (required)
-    kinds : the kinds map (dict of column name -> kind)
+    queryables : a kinds map (dict of column name -> kind), or None if the column is non-indexable
 
     Returns
     -------
@@ -1831,24 +1855,19 @@ class Term(object):
     Term('index', '>', '20121114')
     Term('index', ['20121114','20121114'])
     Term('index', datetime(2012,11,14))
-    Term('major>20121114')
-    Term('minor', ['A','B'])
+    Term('major_axis>20121114')
+    Term('minor_axis', ['A','B'])
 
     """
 
     _ops = ['<=','<','>=','>','!=','=']
     _search = re.compile("^(?P<field>\w+)(?P<op>%s)(?P<value>.+)$" % '|'.join(_ops))
-    _index = ['index','major_axis','major']
-    _column = ['column','columns','minor_axis','minor']
 
-    def __init__(self, field, op = None, value = None, kinds = None):
+    def __init__(self, field, op = None, value = None, queryables = None):
         self.field = None
         self.op = None
         self.value = None
-
-        if kinds is None:
-            kinds = []
-        self.kinds = dict(kinds)
+        self.q = queryables or dict()
         self.filter = None
         self.condition = None
@@ -1901,12 +1920,6 @@ def __init__(self, field, op = None, value = None, kinds = None):
         if self.field is None or self.op is None or self.value is None:
             raise Exception("Could not create this term [%s]" % str(self))
 
-        # map alias for field names
-        if self.field in self._index and len(kinds) > 0:
-            self.field = kinds[0][0]
-        elif self.field in self._column and len(kinds) > 1:
-            self.field = kinds[1][0]
-
         # we have valid conditions
         if self.op in ['>','>=','<','<=']:
             if hasattr(self.value,'__iter__') and len(self.value) > 1:
@@ -1915,26 +1928,35 @@ def __init__(self, field, op = None, value = None, kinds = None):
         if not hasattr(self.value,'__iter__'):
             self.value = [ self.value ]
 
-        self.eval()
+        if len(self.q):
+            self.eval()
 
     def __str__(self):
         return "field->%s,op->%s,value->%s" % (self.field,self.op,self.value)
 
     __repr__ = __str__
 
+    @property
+    def is_valid(self):
+        """ return True if this is a valid field """
+        return self.field in self.q
+
     @property
     def is_in_table(self):
         """ return True if this is a valid column name for generation (e.g. an actual column in the table) """
-        return self.field in self.kinds
+        return self.q.get(self.field) is not None
 
     @property
     def kind(self):
         """ the kind of my field """
-        return self.kinds.get(self.field)
+        return self.q.get(self.field)
 
     def eval(self):
         """ set the numexpr expression for this term """
 
+        if not self.is_valid:
+            raise Exception("query term is not valid [%s]" % str(self))
+
         # convert values
         values = [ self.convert_value(v) for v in self.value ]
@@ -2014,7 +2036,8 @@ def generate(self, where):
         if not any([ isinstance(w, (list,tuple,Term)) for w in where ]):
             where = [ where ]
 
-        return [ Term(c, kinds = self.table.kinds_map()) for c in where ]
+        queryables = self.table.queryables()
+        return [ Term(c, queryables = queryables) for c in where ]
 
     def select(self):
         """
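Since Term no longer translates the 'index'/'column'/'major'/'minor' aliases, a field must match one of the table's queryables; the sketch below illustrates the new behaviour (the queryables dict is a hypothetical example of what Table.queryables() might return for a panel table, with the kinds assumed):

from pandas.io.pytables import Term

# hypothetical queryables mapping: indexed axes map to a kind,
# non-indexed (data) axes map to None
q = {'major_axis': 'datetime64', 'minor_axis': 'object', 'items': None}

# without queryables, construction only parses the term; evaluation is deferred
t = Term('major_axis', '>', '20121114')

# with queryables, an unknown field (including the removed aliases) now raises
try:
    Term('column', '=', ['A', 'B'], queryables=q)
except Exception as e:
    print(e)    # roughly: query term is not valid [...]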
diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
index 6f7d348caa266..7ecb0bc2fd5ee 100644
--- a/pandas/io/tests/test_pytables.py
+++ b/pandas/io/tests/test_pytables.py
@@ -185,11 +185,18 @@ def test_append(self):
             tm.assert_frame_equal(store['df3 foo'], df)
             warnings.filterwarnings('always', category=tables.NaturalNameWarning)
 
+            # panel
             wp = tm.makePanel()
             store.append('wp1', wp.ix[:,:10,:])
             store.append('wp1', wp.ix[:,10:,:])
             tm.assert_panel_equal(store['wp1'], wp)
 
+            # ndim
+            p4d = tm.makePanel4D()
+            store.append('p4d', p4d.ix[:,:,:10,:])
+            store.append('p4d', p4d.ix[:,:,10:,:])
+            tm.assert_panel4d_equal(store['p4d'], p4d)
+
         except:
             raise
         finally:
@@ -351,7 +358,7 @@ def test_remove_where(self):
         # non-table ok (where = None)
         wp = tm.makePanel()
         self.store.put('wp', wp, table=True)
-        self.store.remove('wp', [('column', ['A', 'D'])])
+        self.store.remove('wp', [('minor_axis', ['A', 'D'])])
         rs = self.store.select('wp')
         expected = wp.reindex(minor_axis = ['B','C'])
         tm.assert_panel_equal(rs,expected)
 
@@ -378,8 +385,8 @@ def test_remove_crit(self):
         self.store.put('wp', wp, table=True)
         date = wp.major_axis[len(wp.major_axis) // 2]
 
-        crit1 = Term('index','>',date)
-        crit2 = Term('column',['A', 'D'])
+        crit1 = Term('major_axis','>',date)
+        crit2 = Term('minor_axis',['A', 'D'])
         self.store.remove('wp', where=[crit1])
         self.store.remove('wp', where=[crit2])
         result = self.store['wp']
@@ -394,9 +401,9 @@ def test_remove_crit(self):
         date2 = wp.major_axis[5]
         date3 = [wp.major_axis[7],wp.major_axis[9]]
 
-        crit1 = Term('index',date1)
-        crit2 = Term('index',date2)
-        crit3 = Term('index',date3)
+        crit1 = Term('major_axis',date1)
+        crit2 = Term('major_axis',date2)
+        crit3 = Term('major_axis',date3)
 
         self.store.remove('wp2', where=[crit1])
         self.store.remove('wp2', where=[crit2])
@@ -415,7 +422,9 @@ def test_remove_crit(self):
 
     def test_terms(self):
         wp = tm.makePanel()
+        p4d = tm.makePanel4D()
         self.store.put('wp', wp, table=True)
+        self.store.put('p4d', p4d, table=True)
 
         # some invalid terms
         terms = [
@@ -432,28 +441,34 @@ def test_terms(self):
         self.assertRaises(Exception, Term.__init__, 'index', '==')
         self.assertRaises(Exception, Term.__init__, 'index', '>', 5)
 
+        # panel
         result = self.store.select('wp',[ Term('major_axis<20000108'), Term('minor_axis', '=', ['A','B']) ])
         expected = wp.truncate(after='20000108').reindex(minor=['A', 'B'])
         tm.assert_panel_equal(result, expected)
 
+        # p4d
+        result = self.store.select('p4d',[ Term('major_axis<20000108'), Term('minor_axis', '=', ['A','B']) ])
+        expected = p4d.truncate(after='20000108').reindex(minor=['A', 'B'])
+        tm.assert_panel4d_equal(result, expected)
+
         # valid terms
         terms = [
-            dict(field = 'index', op = '>', value = '20121114'),
-            ('index', '20121114'),
-            ('index', '>', '20121114'),
-            (('index', ['20121114','20121114']),),
-            ('index', datetime(2012,11,14)),
-            'index>20121114',
-            'major>20121114',
+            dict(field = 'major_axis', op = '>', value = '20121114'),
+            ('major_axis', '20121114'),
+            ('major_axis', '>', '20121114'),
+            (('major_axis', ['20121114','20121114']),),
+            ('major_axis', datetime(2012,11,14)),
+            'major_axis>20121114',
             'major_axis>20121114',
-            (('minor', ['A','B']),),
+            'major_axis>20121114',
+            (('minor_axis', ['A','B']),),
             (('minor_axis', ['A','B']),),
             ((('minor_axis', ['A','B']),),),
-            (('column', ['A','B']),),
         ]
 
         for t in terms:
             self.store.select('wp', t)
+            self.store.select('p4d', t)
 
     def test_series(self):
         s = tm.makeStringSeries()
@@ -790,8 +805,8 @@ def test_panel_select(self):
         self.store.put('wp', wp, table=True)
         date = wp.major_axis[len(wp.major_axis) // 2]
 
-        crit1 = ('index','>=',date)
-        crit2 = ('column', '=', ['A', 'D'])
+        crit1 = ('major_axis','>=',date)
+        crit2 = ('minor_axis', '=', ['A', 'D'])
 
         result = self.store.select('wp', [crit1, crit2])
         expected = wp.truncate(before=date).reindex(minor=['A', 'D'])
@@ -807,8 +822,8 @@ def test_frame_select(self):
 
         date = df.index[len(df) // 2]
         crit1 = ('index','>=',date)
-        crit2 = ('column',['A', 'D'])
-        crit3 = ('column','A')
+        crit2 = ('columns',['A', 'D'])
+        crit3 = ('columns','A')
 
         result = self.store.select('frame', [crit1, crit2])
         expected = df.ix[date:, ['A', 'D']]
@@ -829,7 +844,7 @@ def test_select_filter_corner(self):
         df.columns = ['%.3d' % c for c in df.columns]
         self.store.put('frame', df, table=True)
 
-        crit = Term('column', df.columns[:75])
+        crit = Term('columns', df.columns[:75])
         result = self.store.select('frame', [crit])
         tm.assert_frame_equal(result, df.ix[:, df.columns[:75]])
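The new tests above boil down to the following round-trip; this standalone version (with an illustrative file name, not part of the patch) may be handy when experimenting:

import pandas.util.testing as tm
from pandas.io.pytables import HDFStore, Term

store = HDFStore('p4d_roundtrip.h5')
p4d = tm.makePanel4D()

# append in two chunks along major_axis, then read back the whole object
store.append('p4d', p4d.ix[:, :, :10, :])
store.append('p4d', p4d.ix[:, :, 10:, :])
tm.assert_panel4d_equal(store['p4d'], p4d)

# term-based selection works the same way as for a Panel
result = store.select('p4d', [Term('major_axis<20000108'),
                              Term('minor_axis', '=', ['A', 'B'])])
store.close()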
diff --git a/pandas/lib.pyx b/pandas/lib.pyx
index e776eb9df7265..6570ce7abb0ae 100644
--- a/pandas/lib.pyx
+++ b/pandas/lib.pyx
@@ -825,6 +825,56 @@ def create_hdf_rows_3d(ndarray indexer0, ndarray indexer1,
 
     return l
 
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def create_hdf_rows_4d(ndarray indexer0, ndarray indexer1, ndarray indexer2,
+                       ndarray[np.uint8_t, ndim=3] mask, list values):
+    """ return a list of objects ready to be converted to rec-array format """
+
+    cdef:
+        unsigned int i, j, k, b, n_indexer0, n_indexer1, n_indexer2, n_blocks, tup_size
+        ndarray v
+        list l
+        object tup, val
+
+    n_indexer0 = indexer0.shape[0]
+    n_indexer1 = indexer1.shape[0]
+    n_indexer2 = indexer2.shape[0]
+    n_blocks = len(values)
+    tup_size = n_blocks+3
+    l = []
+    for i from 0 <= i < n_indexer0:
+
+        for j from 0 <= j < n_indexer1:
+
+            for k from 0 <= k < n_indexer2:
+
+                if not mask[i, j, k]:
+
+                    tup = PyTuple_New(tup_size)
+
+                    val = indexer0[i]
+                    PyTuple_SET_ITEM(tup, 0, val)
+                    Py_INCREF(val)
+
+                    val = indexer1[j]
+                    PyTuple_SET_ITEM(tup, 1, val)
+                    Py_INCREF(val)
+
+                    val = indexer2[k]
+                    PyTuple_SET_ITEM(tup, 2, val)
+                    Py_INCREF(val)
+
+                    for b from 0 <= b < n_blocks:
+
+                        v = values[b][:, i, j, k]
+                        PyTuple_SET_ITEM(tup, b+3, v)
+                        Py_INCREF(v)
+
+                    l.append(tup)
+
+    return l
+
 #-------------------------------------------------------------------------------
 # Groupby-related functions
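For readers not fluent in Cython, here is a rough pure-Python equivalent of what create_hdf_rows_4d assembles; it is illustrative only and not the implementation that ships:

import numpy as np

def create_hdf_rows_4d_py(indexer0, indexer1, indexer2, mask, values):
    # one tuple per unmasked (i, j, k) cell: the three axis labels followed by
    # the column of values taken from each block in `values`
    rows = []
    for i in range(len(indexer0)):
        for j in range(len(indexer1)):
            for k in range(len(indexer2)):
                if not mask[i, j, k]:
                    row = (indexer0[i], indexer1[j], indexer2[k])
                    row += tuple(v[:, i, j, k] for v in values)
                    rows.append(row)
    return rows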