|
| 1 | +#!/usr/bin/env python |
| 2 | +'''dataset implements a poor man's version of pandas data frames. It is |
| 3 | + primarily intended to illustrate a number of concepts about iterators, |
| 4 | + but it can also be used when dependencies on third-party libraries |
| 5 | + should be avoided.''' |
| 6 | + |
| 7 | +import collections |
| 8 | +import operator |
| 9 | + |
# Definition of one column: its header `name` and the `type` callable that
# Dataset applies to every value appended to that column.  The namedtuple's
# typename matches the name it is bound to, so repr() output is not
# misleading and instances can be pickled by name.
ColumnDef = collections.namedtuple('ColumnDef', ['name', 'type'])
| 11 | + |
| 12 | + |
class DatasetError(Exception):
    '''Root of the exception hierarchy used by Dataset; catching this
    type catches every error class derived from it.'''
| 17 | + |
| 18 | + |
class DataLenError(DatasetError):
    '''Raised when a row is appended whose number of items differs from
    the number of headers of the data set.'''
| 24 | + |
| 25 | + |
class ConversionError(DatasetError):
    '''Raised when an appended value can not be converted to the type
    of the column it is destined for.'''
| 32 | + |
| 33 | + |
class ColumnOverwriteError(DataLenError):
    '''Raised when adding a column whose name is already taken by an
    existing column.  Derives from DataLenError, so handlers for that
    type catch this error as well.'''
| 39 | + |
| 40 | + |
class UndefinedColumnError(DataLenError):
    '''Raised when a column is referred to by a name that is not among
    the headers of the data set.  Derives from DataLenError, so handlers
    for that type catch this error as well.'''
| 45 | + |
| 46 | + |
class ComputeError(DataLenError):
    '''Raised when the function applied to compute new columns fails.
    Derives from DataLenError, so handlers for that type catch this
    error as well.'''
| 51 | + |
| 52 | + |
class Dataset(object):
    '''Column-oriented data set with named, typed columns.

    Values are stored per column; rows are exposed as named tuples whose
    fields are the column headers, in header order.  Each `for` loop over
    a data set gets its own independent iterator, so loops may be nested.
    '''

    def __init__(self, col_defs):
        '''Create an empty data set.

        col_defs: iterable of column definitions, i.e., objects with a
            `name` attribute (the header) and a `type` attribute (the
            callable applied to each value appended to that column).

        Raises ColumnOverwriteError if two definitions share a name.
        '''
        self._headers = []
        self._type_map = {}
        # Single pass, so one-shot iterables such as generators work too.
        for col_def in col_defs:
            if col_def.name in self._type_map:
                # Duplicate headers can never be iterated (the row tuple
                # can not hold two fields of the same name): fail early.
                msg = 'column {0} already exists'.format(col_def.name)
                raise ColumnOverwriteError(msg)
            self._headers.append(col_def.name)
            self._type_map[col_def.name] = col_def.type
        self._data = {header: [] for header in self._headers}
        self._nr_data = 0
        # State of the legacy next(dataset) cursor, see __next__.
        self._next = 0
        self._RowTuple = None

    @property
    def headers(self):
        '''get a copy of the list of headers for the data set'''
        return list(self._headers)

    @property
    def nr_columns(self):
        '''returns number of columns in the dataset'''
        return len(self._headers)

    @property
    def column_defs(self):
        '''retrieve the column definitions of the dataset as a list of
        ColumnDef, in header order'''
        return [ColumnDef(header, self._type_map[header])
                for header in self._headers]

    def __len__(self):
        '''retrieve the length of the data set, i.e., its number of rows'''
        return self._nr_data

    def _convert(self, header, value):
        '''convert the value to the data type of the column `header`'''
        return self._type_map[header](value)

    def _make_row_type(self):
        '''build the named tuple type for rows with the current headers'''
        return collections.namedtuple('RowTuple', self._headers)

    def append(self, data):
        '''Append a row of data to the set.

        data: sized iterable holding one value per column, in header
            order; each value is converted to its column's type.

        Raises DataLenError if the number of items differs from the
        number of headers, ConversionError if a value can not be
        converted.  When an error is raised, the data set is unchanged.
        '''
        if len(data) != len(self._headers):
            msg = '{0:d} headers, {1:d} items'.format(len(self._headers),
                                                      len(data))
            raise DataLenError(msg)
        # Convert the complete row before storing anything: a failure
        # halfway must not leave columns with different lengths.
        values = []
        for header, item in zip(self._headers, data):
            try:
                values.append(self._convert(header, item))
            except (ValueError, TypeError) as error:
                msg = 'type conversion failed: {0}'.format(str(error))
                raise ConversionError(msg) from error
        for header, value in zip(self._headers, values):
            self._data[header].append(value)
        self._nr_data += 1

    def __iter__(self):
        '''Return a fresh iterator over the rows of the data set, each
        row a named tuple ordered according to the headers.

        Every call yields an independent iterator, so nested loops over
        the same data set work.  The legacy next(dataset) cursor is reset
        as well.
        '''
        row_type = self._make_row_type()
        self._RowTuple = row_type
        self._next = 0
        return self._iter_rows(row_type, tuple(self._headers))

    def _iter_rows(self, row_type, headers):
        '''generate the rows as `row_type` instances for the given
        snapshot of headers'''
        columns = [self._data[header] for header in headers]
        idx = 0
        # The length is re-read on every step so rows appended while
        # iterating are seen.
        while idx < self._nr_data:
            yield row_type._make(column[idx] for column in columns)
            idx += 1

    def __next__(self):
        '''Return the next row using the data set's own, single cursor.

        Kept for code that calls next(dataset) directly; `for` loops use
        the independent iterators created by __iter__.  Raises
        StopIteration, and rewinds the cursor, once all rows are seen.
        '''
        if self._RowTuple is None:
            self._RowTuple = self._make_row_type()
        if self._next >= self._nr_data:
            self._next = 0
            raise StopIteration
        row = self._RowTuple._make(self._data[header][self._next]
                                   for header in self._headers)
        self._next += 1
        return row

    def compute(self, col_defs, args, function):
        '''Add columns computed row by row from existing columns.

        col_defs: iterable of column definitions for the new columns.
        args: iterable of names of the existing columns whose values are
            passed, in that order, as positional arguments to `function`.
        function: callable returning an iterable with one value per new
            column, in the order of `col_defs`.

        Raises ColumnOverwriteError if a new column name is already in
        use, UndefinedColumnError if an argument column does not exist,
        ComputeError if the function fails or returns too few values.
        When an error is raised, the data set is unchanged.
        '''
        # Materialise: both iterables are traversed more than once.
        col_defs = list(col_defs)
        args = list(args)
        names = []
        for col_def in col_defs:
            if col_def.name in self._type_map or col_def.name in names:
                msg = 'column {0} already exists'.format(col_def.name)
                raise ColumnOverwriteError(msg)
            names.append(col_def.name)
        for name in args:
            if name not in self._type_map:
                msg = 'no column {0} in dataset'.format(name)
                raise UndefinedColumnError(msg)
        arg_columns = [self._data[name] for name in args]
        # Results are collected on the side and only committed once every
        # row succeeded, so a failure leaves no half-filled columns.
        new_columns = [[] for _ in names]
        for row_nr in range(self._nr_data):
            # Always a tuple, also for zero or one argument column.
            arg_values = tuple(column[row_nr] for column in arg_columns)
            try:
                values = tuple(function(*arg_values))
            except Exception as error:
                args_str = ', '.join(str(arg) for arg in arg_values)
                msg = "computation for '{0}' failed: {1}".format(args_str,
                                                                 str(error))
                raise ComputeError(msg) from error
            if len(values) < len(names):
                # Too few values would create columns of unequal length.
                args_str = ', '.join(str(arg) for arg in arg_values)
                reason = 'expected {0:d} values, got {1:d}'.format(
                    len(names), len(values))
                msg = "computation for '{0}' failed: {1}".format(args_str,
                                                                 reason)
                raise ComputeError(msg)
            for column, value in zip(new_columns, values):
                column.append(value)
        for col_def, column in zip(col_defs, new_columns):
            self._data[col_def.name] = column
            self._headers.append(col_def.name)
            self._type_map[col_def.name] = col_def.type
        # The headers changed, so the cached row type is stale.
        self._RowTuple = None

    def __str__(self):
        '''create string representation of the data set: a line with the
        headers followed by one line per row, items separated by ", "'''
        lines = [', '.join(self._headers)]
        lines.extend(', '.join(str(item) for item in row) for row in self)
        return '\n'.join(lines)
| 160 | + |
| 161 | + |
if __name__ == '__main__':
    # Demo: three integer columns holding x, x^2 and x^3 for x = 0..9.
    data = Dataset([ColumnDef(name, int) for name in ('x', 'y', 'z')])
    for x in range(10):
        data.append((x, x**2, x**3))
    print(data)

    # Derive two columns at once from x and y.
    derived = [ColumnDef('sum', int), ColumnDef('prod', int)]
    data.compute(derived, ['x', 'y'], lambda x, y: (x + y, x*y))
    print(data)
    for row in data:
        print('{0:d} + {1:d} = {2:d}'.format(row.x, row.y, row.sum))
    print('{0:d} data items'.format(len(data)))

    # A single derived column: the function returns a one-element tuple.
    data.compute([ColumnDef('substr', int)], ['x', 'y'],
                 lambda x, y: (y - x, ))
    print(data)

    # Strings that are not numbers can not go into integer columns; show
    # the resulting error instead of crashing.
    try:
        data.append(['bla'] * data.nr_columns)
    except Exception as error:
        print('### error: {0}'.format(error))
0 commit comments