From 2ae7c93549d393837eb7bde0b369c5a27cfe623e Mon Sep 17 00:00:00 2001 From: avsthiago Date: Sat, 24 Oct 2020 19:07:58 +0200 Subject: [PATCH 1/4] Fixing typos and pep8 issues --- tests/test_dataframe.py | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/tests/test_dataframe.py b/tests/test_dataframe.py index 481c0b2..31d238e 100644 --- a/tests/test_dataframe.py +++ b/tests/test_dataframe.py @@ -1,6 +1,6 @@ import numpy as np -from numpy.testing import assert_array_equal import pytest +from numpy.testing import assert_array_equal import pandas_cub as pdc from tests import assert_df_equals @@ -28,20 +28,20 @@ def test_input_types(self): pdc.DataFrame({'a': np.array([1]), 'b': 10}) with pytest.raises(ValueError): - pdc.DataFrame({'a': np.array([1]), + pdc.DataFrame({'a': np.array([1]), 'b': np.array([[1]])}) # correct construction. no error - pdc.DataFrame({'a': np.array([1]), + pdc.DataFrame({'a': np.array([1]), 'b': np.array([1])}) def test_array_length(self): with pytest.raises(ValueError): - pdc.DataFrame({'a': np.array([1, 2]), + pdc.DataFrame({'a': np.array([1, 2]), 'b': np.array([1])}) # correct construction. no error - pdc.DataFrame({'a': np.array([1, 2]), - 'b': np.array([5, 10])}) + pdc.DataFrame({'a': np.array([1, 2]), + 'b': np.array([5, 10])}) def test_unicode_to_object(self): a_object = a.astype('O') @@ -110,8 +110,8 @@ def test_simple_boolean(self): bool_arr = np.array([True, False, False]) df_bool = pdc.DataFrame({'col': bool_arr}) df_result = df[df_bool] - df_answer = pdc.DataFrame({'a': a[bool_arr], 'b': b[bool_arr], - 'c': c[bool_arr], 'd': d[bool_arr], + df_answer = pdc.DataFrame({'a': a[bool_arr], 'b': b[bool_arr], + 'c': c[bool_arr], 'd': d[bool_arr], 'e': e[bool_arr]}) assert_df_equals(df_result, df_answer) @@ -131,7 +131,7 @@ def test_multiple_columns_tuple(self): df_answer = pdc.DataFrame({'a': a, 'c': c}) assert_df_equals(df_result, df_answer) - def test_int_selcetion(self): + def test_int_selection(self): assert_df_equals(df[:, 3], pdc.DataFrame({'d': d})) def test_simultaneous_tuple(self): @@ -221,7 +221,7 @@ def test_new_column(self): with pytest.raises(NotImplementedError): df[['a', 'b']] = 5 - + with pytest.raises(ValueError): df['a'] = np.random.rand(5, 5) @@ -246,7 +246,7 @@ def test_head_tail(self): df_result = df.tail(2) df_answer = pdc.DataFrame({'a': a[-2:], 'b': b[-2:], 'c': c[-2:], - 'd':d[-2:], 'e': e[-2:]}) + 'd': d[-2:], 'e': e[-2:]}) assert_df_equals(df_result, df_answer) @@ -263,7 +263,6 @@ def test_head_tail(self): class TestAggregation: - def test_min(self): df_result = df1.min() df_answer = pdc.DataFrame({'a': np.array(['a'], dtype='O'), @@ -555,7 +554,7 @@ def test_sort_values(self): def test_sort_values_desc(self): df_result = df6.sort_values('a', asc=False) a = np.array(['c', 'b', 'b', 'a', 'a']) - b = np.array([5.1, 6, 3.4, 1,2]) + b = np.array([5.1, 6, 3.4, 1, 2]) df_answer = pdc.DataFrame({'a': a, 'b': b}) assert_df_equals(df_result, df_answer) @@ -600,8 +599,9 @@ def test_sample(self): class TestGrouping: def test_value_counts(self): - df_temp = pdc.DataFrame({'state': np.array(['texas', 'texas', 'texas', 'florida', 'florida', 'florida', 'florida', 'ohio']), - 'fruit': np.array(['a', 'a', 'a', 'a', 'b', 'b', 'b', 'a'])}) + df_temp = pdc.DataFrame( + {'state': np.array(['texas', 'texas', 'texas', 'florida', 'florida', 'florida', 'florida', 'ohio']), + 'fruit': np.array(['a', 'a', 'a', 'a', 'b', 'b', 'b', 'a'])}) df_results = df_temp.value_counts() df_answer = pdc.DataFrame({'state': np.array(['florida', 'texas', 'ohio'], dtype=object), 'count': np.array([4, 3, 1])}) @@ -612,8 +612,9 @@ def test_value_counts(self): assert_df_equals(df_results[1], df_answer) def test_value_counts_normalize(self): - df_temp = pdc.DataFrame({'state': np.array(['texas', 'texas', 'texas', 'florida', 'florida', 'florida', 'florida', 'ohio']), - 'fruit': np.array(['a', 'a', 'a', 'a', 'b', 'b', 'b', 'a'])}) + df_temp = pdc.DataFrame( + {'state': np.array(['texas', 'texas', 'texas', 'florida', 'florida', 'florida', 'florida', 'ohio']), + 'fruit': np.array(['a', 'a', 'a', 'a', 'b', 'b', 'b', 'a'])}) df_results = df_temp.value_counts(normalize=True) df_answer = pdc.DataFrame({'state': np.array(['florida', 'texas', 'ohio'], dtype=object), 'count': np.array([.5, .375, .125])}) @@ -841,4 +842,4 @@ def test_head(self): 'salary': np.array([45279, 63166, 66614, 71680, 42390])} result = df_emp.head() answer = pdc.DataFrame(data) - assert_df_equals(result, answer) \ No newline at end of file + assert_df_equals(result, answer) From 6db227c7480350b04622e57a5f372fccbb0fc28b Mon Sep 17 00:00:00 2001 From: avsthiago Date: Sat, 24 Oct 2020 19:31:50 +0200 Subject: [PATCH 2/4] Adding nb_conda_kernels as a dependency for for loading up the environment on jupyter automatically --- environment.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index ada51a9..28ddd9d 100644 --- a/environment.yml +++ b/environment.yml @@ -3,4 +3,5 @@ dependencies: - python=3.6 - pandas - jupyter -- pytest \ No newline at end of file +- pytest +- nb_conda_kernels \ No newline at end of file From 64db054af99503b9405e5f77cd21815ecd6f6388 Mon Sep 17 00:00:00 2001 From: avsthiago Date: Sun, 1 Nov 2020 23:12:35 +0100 Subject: [PATCH 3/4] Creating new methods and validations for init --- pandas_cub/__init__.py | 57 ++++++++++++++++++++++++++++++++---------- 1 file changed, 44 insertions(+), 13 deletions(-) diff --git a/pandas_cub/__init__.py b/pandas_cub/__init__.py index 029fc52..b743010 100644 --- a/pandas_cub/__init__.py +++ b/pandas_cub/__init__.py @@ -30,13 +30,31 @@ def __init__(self, data): self._add_docs() def _check_input_types(self, data): - pass + if not isinstance(data, dict): + raise TypeError('`data must be a dictionary') + for key, value in data.items(): + if not isinstance(key, str): + raise TypeError('The keys o `data` must be strings.') + if not isinstance(value, np.ndarray): + raise TypeError('values of `data` must be Numpy arrays.') + if value.ndim != 1: + raise ValueError('Value of `data` must be 1d ndarray') def _check_array_lengths(self, data): - pass + for i, value in enumerate(data.values()): + if i == 0: + length = len(value) + elif length != len(value): + raise ValueError('values of `data` must be a one-dimensional array') def _convert_unicode_to_object(self, data): new_data = {} + for key, value in data.items(): + if value.dtype.kind == 'U': + new_data[key] = value.astype('object') + else: + new_data[key] = value + return new_data def __len__(self): @@ -47,7 +65,8 @@ def __len__(self): ------- int: the number of rows in the dataframe """ - pass + for value in self._data.values(): + return len(value) @property def columns(self): @@ -60,7 +79,7 @@ def columns(self): ------- list of column names """ - pass + return list(self._data) @columns.setter def columns(self, columns): @@ -76,7 +95,16 @@ def columns(self, columns): ------- None """ - pass + if not isinstance(columns, list): + raise TypeError('`columns` must be a list') + if len(columns) != len(self._data): + raise ValueError('New `columns` must be same length as current DataFrame') + if any(i for i in columns if not isinstance(i, str)): + raise TypeError('All column names must be strings') + if len(columns) != len(set(columns)): + raise ValueError('Your columns have duplicates') + self._data = dict(zip(columns, self._data.values())) + @property def shape(self): @@ -437,8 +465,10 @@ def diff(self, n=1): ------- A DataFrame """ + def func(): pass + return self._non_agg(func) def pct_change(self, n=1): @@ -454,8 +484,10 @@ def pct_change(self, n=1): ------- A DataFrame """ + def func(): pass + return self._non_agg(func) #### Arithmetic and Comparison Operators #### @@ -588,14 +620,13 @@ def pivot_table(self, rows=None, columns=None, values=None, aggfunc=None): def _add_docs(self): agg_names = ['min', 'max', 'mean', 'median', 'sum', 'var', 'std', 'any', 'all', 'argmax', 'argmin'] - agg_doc = \ - """ - Find the {} of each column - - Returns - ------- - DataFrame - """ + agg_doc = (""" + Find the {} of each column + + Returns + ------- + DataFrame + """) for name in agg_names: getattr(DataFrame, name).__doc__ = agg_doc.format(name) From a9c8a8a2541928f80f1967df9eee50a24270e5ea Mon Sep 17 00:00:00 2001 From: avsthiago Date: Sun, 8 Nov 2020 12:57:53 +0100 Subject: [PATCH 4/4] Adding new functions for __init__ --- Test Notebook.ipynb | 136 ++++++++++++++++--- pandas_cub/__init__.py | 301 +++++++++++++++++++++++++++++++++++++---- 2 files changed, 393 insertions(+), 44 deletions(-) diff --git a/Test Notebook.ipynb b/Test Notebook.ipynb index 955ce40..b77e80e 100644 --- a/Test Notebook.ipynb +++ b/Test Notebook.ipynb @@ -11,9 +11,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'/Users/thiagoalves/miniconda3/envs/pandas_cub/bin/python'" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import sys\n", "sys.executable" @@ -21,9 +32,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], "source": [ "%load_ext autoreload\n", "%autoreload 2" @@ -31,7 +51,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -43,7 +63,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -63,27 +83,111 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ - "df" + "\n", + "df = pdc.DataFrame({'a': name,'b': name,'thiahitiathia': name,'dshuadsdhai': name,'dsodkasokdpa': name,'daysurdusduasda': name,'dsijdtaosjdaosijdosad': name,'hdusahdisaiasihdia': name,'djssaijdsaojdiasjodaoa': name, 'dsijdaosjdaosijddosad': name,'dsijdaosjdaosijdoslad': name,'dsijdaosjdposijdosad': name,'dsijdaosjdaosijdoosad': name,'dsijdaosjdyaosijdosad': name})" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
a b thiahitiathiadshuadsdhaidsodkasokdpadaysurdusduasdadsijdtaosjdaosijdosadhdusahdisaiasihdiadjssaijdsaojdiasjodaoadsijdaosjdaosijddosaddsijdaosjdaosijdosladdsijdaosjdposijdosaddsijdaosjdaosijdoosaddsijdaosjdyaosijdosad
0Penelope Penelope Penelope Penelope Penelope Penelope Penelope Penelope Penelope Penelope Penelope Penelope Penelope Penelope
1Niko Niko Niko Niko Niko Niko Niko Niko Niko Niko Niko Niko Niko Niko
2Eleni Eleni Eleni Eleni Eleni Eleni Eleni Eleni Eleni Eleni Eleni Eleni Eleni Eleni
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "df_final" + "df" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namestateheightschoolweight
0PenelopeTexas3.6True45
1NikoCalifornia3.5False40
2EleniTexas5.2True130
\n", + "
" + ], + "text/plain": [ + " name state height school weight\n", + "0 Penelope Texas 3.6 True 45\n", + "1 Niko California 3.5 False 40\n", + "2 Eleni Texas 5.2 True 130" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df_pandas" ] @@ -98,9 +202,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python [conda env:pandas_cub] *", "language": "python", - "name": "python3" + "name": "conda-env-pandas_cub-py" }, "language_info": { "codemirror_mode": { @@ -112,7 +216,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.6.12" } }, "nbformat": 4, diff --git a/pandas_cub/__init__.py b/pandas_cub/__init__.py index b743010..b6b98c0 100644 --- a/pandas_cub/__init__.py +++ b/pandas_cub/__init__.py @@ -105,7 +105,6 @@ def columns(self, columns): raise ValueError('Your columns have duplicates') self._data = dict(zip(columns, self._data.values())) - @property def shape(self): """ @@ -113,7 +112,7 @@ def shape(self): ------- two-item tuple of number of rows and columns """ - pass + return len(self), len(self._data) def _repr_html_(self): """ @@ -147,7 +146,61 @@ def _repr_html_(self): """ - pass + html = '' + for col in self.columns: + html += f"" + + html += '' + html += "" + + only_head = False + num_head = 10 + num_tail = 10 + if len(self) <= 20: + only_head = True + num_head = len(self) + + for i in range(num_head): + html += f'' + for col, values in self._data.items(): + kind = values.dtype.kind + if kind == 'f': + html += f'' + elif kind == 'b': + html += f'' + elif kind == 'O': + v = values[i] + if v is None: + v = 'None' + html += f'' + else: + html += f'' + html += '' + + if not only_head: + html += '' + for i in range(len(self.columns)): + html += '' + html += '' + for i in range(-num_tail, 0): + html += f'' + for col, values in self._data.items(): + kind = values.dtype.kind + if kind == 'f': + html += f'' + elif kind == 'b': + html += f'' + elif kind == 'O': + v = values[i] + if v is None: + v = 'None' + html += f'' + else: + html += f'' + html += '' + + html += '
{col:10}
{i}{values[i]:10.3f}{values[i]}{v:10}{values[i]:10}
......
{len(self) + i}{values[i]:10.3f}{values[i]}{v:10}{values[i]:10}
' + return html @property def values(self): @@ -156,7 +209,7 @@ def values(self): ------- A single 2D NumPy array of the underlying data """ - pass + return np.column_stack(list(self._data.values())) @property def dtypes(self): @@ -167,7 +220,10 @@ def dtypes(self): their data type in the other """ DTYPE_NAME = {'O': 'string', 'i': 'int', 'f': 'float', 'b': 'bool'} - pass + col_names = np.array(list((self._data.keys()))) + dtypes = np.array([DTYPE_NAME[i.dtype.kind] for i in self._data.values()]) + new_data = {'Column Name': col_names, 'Data Type': dtypes} + return DataFrame(new_data) def __getitem__(self, item): """ @@ -183,19 +239,100 @@ def __getitem__(self, item): ------- A subset of the original DataFrame """ - pass + if isinstance(item, str): + return DataFrame({item: self._data[item]}) + if isinstance(item, list): + return DataFrame({col: self._data[col] for col in item}) + if isinstance(item, DataFrame): + if item.shape[1] != 1: + raise ValueError('item must be one-column DataFrame') + arr = next(iter(item._data.values())) + if arr.dtype.kind != 'b': + raise ValueError('item must be a one-column boolean DataFrame') + return DataFrame({col: value[arr] for col, value in self._data.items()}) + if isinstance(item, tuple): + return self._getitem_tuple(item) + + raise TypeError('You must pass either a string, list, DataFrame, or tuple to the selection operator') def _getitem_tuple(self, item): # simultaneous selection of rows and cols -> df[rs, cs] - pass + if len(item) != 2: + raise ValueError('item must have length 2') + row_selection, col_selection = item + + if isinstance(row_selection, int): + row_selection = [row_selection] + elif isinstance(row_selection, DataFrame): + if row_selection.shape[1] != 1: + raise ValueError('row selection DataFrame must be one column') + row_selection = next(iter(row_selection._data.values())) + if row_selection.dtype.kind != 'b': + raise TypeError('row selection DataFrame must be a boolean') + elif not isinstance(row_selection, (list, slice)): + raise TypeError('row selection must be an int, list, slice or DataFrame') + + if isinstance(col_selection, int): + col_selection = [self.columns[col_selection]] + elif isinstance(col_selection, str): + col_selection = [col_selection] + elif isinstance(col_selection, list): + new_col_selection = [] + for col in col_selection: + if isinstance(col, int): + new_col_selection.append(self.columns[col]) + else: + new_col_selection.append(col) + col_selection = new_col_selection + elif isinstance(col_selection, slice): + start = col_selection.start + stop = col_selection.stop + step = col_selection.step + + if isinstance(start, str): + start = self.columns.index(start) + + if isinstance(stop, str): + stop = self.columns.index(stop) + 1 + + col_selection = self.columns[start:stop:step] + else: + raise TypeError('column selection must be int, str, list or DataFrame') + + new_data = {} + for col in col_selection: + new_data[col] = self._data[col][row_selection] + + return DataFrame(new_data) def _ipython_key_completions_(self): # allows for tab completion when doing df['c - pass + return self.columns def __setitem__(self, key, value): # adds a new column or a overwrites an old column - pass + if not isinstance(key, str): + raise NotImplementedError('Setting columns is only done with a numpy array') + if isinstance(value, np.ndarray): + if value.ndim != 1: + raise ValueError('The numpy array must be 1D') + if len(value) != len(self): + raise ValueError('Length of setting array must be length of the DataFrame') + elif isinstance(value, DataFrame): + if value.shape[1] != 1: + raise ValueError('Setting DataFrame must be one column') + if len(value) != len(self): + raise ValueError('Setting DataFrame must be same length') + value = next(iter(value._data.values())) + elif isinstance(value, (int, bool, str, float)): + value = np.repeat(value, len(self)) + else: + raise TypeError('Setting DataFrame must be a int, bool, str, float or DataFrame') + + if value.dtype.kind == 'U': + value = value.astype('object') + + self._data[key] = value def head(self, n=5): """ @@ -209,7 +346,7 @@ def head(self, n=5): ------- DataFrame """ - pass + return self[:n, :] def tail(self, n=5): """ @@ -223,7 +360,7 @@ def tail(self, n=5): ------- DataFrame """ - pass + return self[-n:, :] #### Aggregation Methods #### @@ -273,7 +410,13 @@ def _agg(self, aggfunc): ------- A DataFrame """ - pass + new_data = {} + for col, value in self._data.items(): + try: + new_data[col] = np.array([aggfunc(value)]) + except TypeError: + pass + return DataFrame(new_data) def isna(self): """ @@ -283,7 +426,13 @@ def isna(self): ------- A DataFrame of booleans the same size as the calling DataFrame """ - pass + new_data = {} + for col, value in self._data.items(): + if value.dtype.kind == 'O': + new_data[col] = value == None + else: + new_data[col] = np.isnan(value) + return DataFrame(new_data) def count(self): """ @@ -293,7 +442,12 @@ def count(self): ------- A DataFrame """ - pass + df = self.isna() + new_data = {} + length = len(df) + for col, value in df._data.items(): + new_data[col] = np.array([length - value.sum()]) + return DataFrame(new_data) def unique(self): """ @@ -303,7 +457,13 @@ def unique(self): ------- A list of one-column DataFrames """ - pass + dfs = [] + for col, value in self._data.items(): + new_data = {col: np.unique(value)} + dfs.append(DataFrame(new_data)) + if len(dfs) == 1: + return dfs[0] + return dfs def nunique(self): """ @@ -313,7 +473,10 @@ def nunique(self): ------- A DataFrame """ - pass + new_data = {} + for col, value in self._data.items(): + new_data[col] = np.array([len(np.unique(value))]) + return DataFrame(new_data) def value_counts(self, normalize=False): """ @@ -328,7 +491,19 @@ def value_counts(self, normalize=False): ------- A list of DataFrames or a single DataFrame if one column """ - pass + dfs = [] + for col, value in self._data.items(): + uniques, counts = np.unique(value, return_counts=True) + order = np.argsort(-counts) + uniques = uniques[order] + counts = counts[order] + if normalize: + counts = counts / len(self) + new_data = {col: uniques, 'count': counts} + dfs.append(DataFrame(new_data)) + if len(dfs) == 1: + return dfs[0] + return dfs def rename(self, columns): """ @@ -343,7 +518,14 @@ def rename(self, columns): ------- A DataFrame """ - pass + if not isinstance(columns, dict): + return TypeError('`column` must be a dict') + + new_data = {} + for col, value in self._data.items(): + new_col = columns.get(col, col) + new_data[new_col] = value + return DataFrame(new_data) def drop(self, columns): """ @@ -357,7 +539,16 @@ def drop(self, columns): ------- A DataFrame """ - pass + if isinstance(columns, str): + columns = [columns] + elif not isinstance(columns, list): + raise TypeError('`columns` must be either a string or a list') + + new_data = {} + for col, value in self._data.items(): + if col not in columns: + new_data[col] = value + return DataFrame(new_data) #### Non-Aggregation Methods #### @@ -450,7 +641,13 @@ def _non_agg(self, funcname, **kwargs): ------- A DataFrame """ - pass + new_data = {} + for col, value in self._data.items(): + if value.dtype.kind == 'O': + new_data[col] = value.copy() + else: + new_data[col] = funcname(value, **kwargs) + return DataFrame(new_data) def diff(self, n=1): """ @@ -466,8 +663,15 @@ def diff(self, n=1): A DataFrame """ - def func(): - pass + def func(value): + value = value.astype('float') + value_shifted = np.roll(value, n) + value = value - value_shifted + if n >= 0: + value[:n] = np.nan + else: + value[n:] = np.nan + return value return self._non_agg(func) @@ -485,12 +689,19 @@ def pct_change(self, n=1): A DataFrame """ - def func(): - pass + def func(value): + value = value.astype('float') + value_shifted = np.roll(value, n) + value = (value - value_shifted) / value_shifted + if n >= 0: + value[:n] = np.nan + else: + value[n:] = np.nan + return value return self._non_agg(func) - #### Arithmetic and Comparison Operators #### + #### Arithmetic and Comparison Operators #### def __add__(self, other): return self._oper('__add__', other) @@ -559,7 +770,18 @@ def _oper(self, op, other): ------- A DataFrame """ - pass + if isinstance(other, DataFrame): + if other.shape[1] != 1: + raise ValueError('DataFrame must be single column') + else: + other = next(iter(other._data.values())) + + new_data = {} + for col, value in self._data.items(): + method = getattr(value, op) + new_data[col] = method(other) + + return DataFrame(new_data) def sort_values(self, by, asc=True): """ @@ -574,7 +796,18 @@ def sort_values(self, by, asc=True): ------- A DataFrame """ - pass + if isinstance(by, str): + order = np.argsort(self._data[by]) + elif isinstance(by, list): + by = [self._data[col] for col in by[::-1]] + order = np.lexsort(by) + else: + raise TypeError('`by` must be either a list or a string') + + if not asc: + order = order[::-1] + + return self[order.tolist(), :] def sample(self, n=None, frac=None, replace=False, seed=None): """ @@ -595,7 +828,19 @@ def sample(self, n=None, frac=None, replace=False, seed=None): ------- A DataFrame """ - pass + if seed: + np.random.seed(seed=seed) + + if frac: + if frac <= 0: + raise ValueError('`frac` must be positive') + n = int(frac * len(self)) + + if not isinstance(n, int): + raise TypeError('`n` must be an integer') + + rows = np.random.choice(range(len(self)), n, replace=replace) + return self[rows.tolist(), :] def pivot_table(self, rows=None, columns=None, values=None, aggfunc=None): """