
Commit 0d1bea1

Merge pull request #2 from maxhumber/easygather
Easygather
2 parents d99c7d1 + d92c83f commit 0d1bea1

9 files changed, +169 -42 lines changed

CHANGELOG

Lines changed: 3 additions & 0 deletions
@@ -1,3 +1,6 @@
+- 1.3b1
+- NEW: `gather(beside=...)` argument!
+- IMPROVED: `sample` errors are more explicit
 - 1.2
 - NEW: `cross` join verb!
 - NEW: `join(..., postfix=("_lhs, "_rhs"))` argument
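
The two new changelog items can be seen together in a short usage sketch (illustrative only; the frame contents are made up, but the calls and error messages match the diffs below):

```python
import redframes as rf

# NEW: gather(beside=...) keeps the listed columns fixed and melts the rest
df = rf.DataFrame({"id": [1, 2], "a": [3, 4], "b": [5, 6]})
long = df.gather(beside=["id"])  # "id" stays; "a"/"b" become variable/value rows

# IMPROVED: sample now raises a more explicit error for ambiguous arguments
try:
    df.sample(1.0)  # a float >= 1 is neither a row count nor a fraction
except ValueError as error:
    print(error)  # "must be int if > 1"
```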

TODO

Lines changed: 2 additions & 1 deletion
@@ -1,3 +1,4 @@
+- remove summarize
 - explode, collapse verbs
 - more research into `.assign` mutate(..., vectorized=True)?
 - datasets
@@ -8,4 +9,4 @@
 - hide/protect/private
 - 10 minutes tutorial
 - cheatsheet (pandas/dplyr/tidyr)
-- anaconda
+- anaconda?

redframes/core.py

Lines changed: 60 additions & 22 deletions
@@ -819,50 +819,88 @@ def filter(self, func: Func) -> DataFrame:
     def gather(
         self,
         columns: Columns | None = None,
+        beside: LazyColumns | None = None,
         into: tuple[Column, Column] = ("variable", "value"),
     ):
-        """Lengthen data, increase rows, decrease columns (opposite of `spread`)
+        """Lengthen data by increasing rows and decreasing columns (opposite of `spread`)
 
         pandas: `melt`
         tidyverse: `gather`, `pivot_longer`
 
         Examples:
 
         ```python
-        df = rf.DataFrame({"foo": [1, 2], "bar": [3, 4], "baz": [4, 5]})
+        df = rf.DataFrame({
+            "foo": [1, 2, 1, 2],
+            "bar": ["A", "B", "C", "D"],
+            "baz": ["!", "@", "#", "$"],
+            "jaz": range(4)
+        })
         ```
-        | foo | bar | baz |
-        |------:|------:|------:|
-        | 1 | 3 | 4 |
-        | 2 | 4 | 5 |
+        | foo | bar | baz | jaz |
+        |------:|:------|:------|------:|
+        | 1 | A | ! | 0 |
+        | 2 | B | @ | 1 |
+        | 1 | C | # | 2 |
+        | 2 | D | $ | 3 |
 
         All columns:
 
         ```python
         df.gather()
         ```
-        | variable | value |
-        |:-----------|--------:|
-        | foo | 1 |
-        | foo | 2 |
-        | bar | 3 |
-        | bar | 4 |
-        | baz | 4 |
-        | baz | 5 |
+        | variable | value |
+        |:-----------|:--------|
+        | foo | 1 |
+        | foo | 2 |
+        | foo | 1 |
+        | foo | 2 |
+        | bar | A |
+        | bar | B |
+        | bar | C |
+        | bar | D |
+        | baz | ! |
+        | baz | @ |
+        | baz | # |
+        | baz | $ |
+        | jaz | 0 |
+        | jaz | 1 |
+        | jaz | 2 |
+        | jaz | 3 |
 
         Multiple columns:
 
         ```python
         df.gather(["foo", "bar"], into=("var", "val"))
         ```
-        | baz | var | val |
-        |------:|:------|------:|
-        | 4 | foo | 1 |
-        | 5 | foo | 2 |
-        | 4 | bar | 3 |
-        | 5 | bar | 4 |
+        | baz | jaz | var | val |
+        |:------|------:|:------|:------|
+        | ! | 0 | foo | 1 |
+        | @ | 1 | foo | 2 |
+        | # | 2 | foo | 1 |
+        | $ | 3 | foo | 2 |
+        | ! | 0 | bar | A |
+        | @ | 1 | bar | B |
+        | # | 2 | bar | C |
+        | $ | 3 | bar | D |
+
+        All columns except:
+
+        ```python
+        df.gather(beside=["foo", "bar"])
+        ```
+        | foo | bar | variable | value |
+        |------:|:------|:-----------|:--------|
+        | 1 | A | baz | ! |
+        | 2 | B | baz | @ |
+        | 1 | C | baz | # |
+        | 2 | D | baz | $ |
+        | 1 | A | jaz | 0 |
+        | 2 | B | jaz | 1 |
+        | 1 | C | jaz | 2 |
+        | 2 | D | jaz | 3 |
         """
-        return _wrap(gather(self._data, columns, into))
+        return _wrap(gather(self._data, columns, beside, into))
 
     def group(self, by: LazyColumns) -> GroupedFrame:
         """Create a GroupedFrame overwhich split-apply-combine operations can be applied
@@ -1324,7 +1362,7 @@ def split(
         return _wrap(split(self._data, column, into, sep, drop))
 
     def spread(self, column: Column, using: Column) -> DataFrame:
-        """Widen data, increase columns, decreas rows (opposite of `gather`)
+        """Widen data by increasing columns and decreasing rows (opposite of `gather`)
 
         pandas: `pivot_table`
         tidyverse: `spread`, `pivot_wider`
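
Since the reworded docstrings describe `gather` and `spread` as opposites, a minimal round-trip sketch may help. The frame here is hypothetical, the values are kept numeric so the pivot underlying `spread` widens cleanly, and column order/dtypes are not guaranteed to survive the round trip exactly:

```python
import redframes as rf

df = rf.DataFrame({"id": ["x", "y"], "a": [1, 2], "b": [3, 4]})

# lengthen: keep "id" as the identifier, melt "a"/"b" into variable/value rows
long = df.gather(beside="id")

# widen back: one column per unique "variable", filled from "value"
wide = long.spread("variable", "value")
```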

redframes/verbs/gather.py

Lines changed: 20 additions & 10 deletions
@@ -3,31 +3,41 @@
 import pandas as pd
 
 from ..checks import _check_type
-from ..types import Column, Columns, PandasDataFrame
+from ..types import Column, Columns, LazyColumns, PandasDataFrame
 
 
 def gather(
     df: PandasDataFrame,
     columns: Columns | None = None,
+    beside: LazyColumns | None = None,
     into: tuple[Column, Column] = ("variable", "value"),
 ) -> PandasDataFrame:
     _check_type(columns, {list, None})
+    _check_type(beside, {str, list, None})
     _check_type(into, tuple)
-    if not (isinstance(into, tuple) and len(into) == 2):
+    if not (isinstance(into, tuple) and (len(into) == 2)):
         raise TypeError("must be tuple[str, str]")
     if into[0] == into[1]:
         raise TypeError("must be unique")
-    if into[0] in df.columns:
+    if (into[0] in df.columns) or (into[1] in df.columns):
         raise TypeError("must not be an existing column key")
-    if into[1] in df.columns:
-        raise TypeError("must not be an existing column key")
-    if columns == None:
-        columns = list(df.columns)
-    index = [col for col in df.columns if col not in columns]  # type: ignore
+    if (columns != None) and (beside != None):
+        raise ValueError("columns OR beside must be None")
+    if (columns == None) and (beside == None):
+        id_vars = []
+        value_vars = list(df.columns)
+    if isinstance(beside, str):
+        beside = [beside]
+    if isinstance(beside, list):
+        id_vars = beside
+        value_vars = [col for col in df.columns if col not in id_vars]
+    if isinstance(columns, list):
+        id_vars = [col for col in df.columns if col not in columns]
+        value_vars = columns
     df = pd.melt(
         df,
-        id_vars=index,
-        value_vars=columns,
+        id_vars=id_vars,
+        value_vars=value_vars,
         var_name=into[0],
         value_name=into[1],
     )
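
The new `beside` branch only changes how the columns are split into `id_vars`/`value_vars` before the single `pd.melt` call. A rough plain-pandas sketch of that equivalence (the frame, column names, and the `melt_with` helper are illustrative, not part of the library):

```python
import pandas as pd

df = pd.DataFrame({"id": [1, 2], "a": [3, 4], "b": [5, 6]})

def melt_with(columns=None, beside=None):
    # mirror of the branching above: exactly one of columns/beside is given
    if beside is not None:
        id_vars = beside
        value_vars = [c for c in df.columns if c not in beside]
    else:
        id_vars = [c for c in df.columns if c not in columns]
        value_vars = columns
    return pd.melt(
        df, id_vars=id_vars, value_vars=value_vars,
        var_name="variable", value_name="value",
    )

# columns=["a", "b"] and beside=["id"] resolve to the same melt call
assert melt_with(columns=["a", "b"]).equals(melt_with(beside=["id"]))
```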

redframes/verbs/sample.py

Lines changed: 2 additions & 2 deletions
@@ -10,11 +10,11 @@ def sample(
     _check_type(rows, {int, float})
     if rows >= 1:
         if isinstance(rows, float):
-            raise ValueError("rows (int) must be >= 1")
+            raise ValueError("must be int if > 1")
         df = df.sample(rows, random_state=seed)
     elif 0 < rows < 1:
         df = df.sample(frac=rows, random_state=seed)
     else:
-        raise ValueError("rows (float) must be (0, 1)")
+        raise ValueError("must be > 0")
     df = df.reset_index(drop=True)
     return df
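
The reworded errors split `rows` into three regimes: an int >= 1 samples that many rows, a float strictly between 0 and 1 samples a fraction, and anything else raises. A small usage sketch mirroring the new ladybug tests (the 100-row frame is illustrative):

```python
import redframes as rf

df = rf.DataFrame({"foo": range(100)})

ten_rows = df.sample(10)   # int >= 1: take exactly 10 rows
quarter = df.sample(0.25)  # float in (0, 1): take a ~25% fraction

# the invalid regimes now fail with the more explicit messages
for bad in (1.0, -1):
    try:
        df.sample(bad)
    except ValueError as error:
        print(bad, "->", error)  # "must be int if > 1", then "must be > 0"
```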

redframes/version.py

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-__version__ = "1.2"
+__version__ = "1.3b1"

tests/test_docstrings.py

Lines changed: 58 additions & 6 deletions
@@ -218,24 +218,76 @@ def test_filter(self):
         self.assertEqual(result3, expected3)
 
     def test_gather(self):
-        df = rf.DataFrame({"foo": [1, 2], "bar": [3, 4], "baz": [4, 5]})
+        df = rf.DataFrame(
+            {
+                "foo": [1, 2, 1, 2],
+                "bar": ["A", "B", "C", "D"],
+                "baz": ["!", "@", "#", "$"],
+                "jaz": range(4),
+            }
+        )
         result1 = df.gather()
         result2 = df.gather(["foo", "bar"], into=("var", "val"))
+        result3 = df.gather(beside=["foo", "bar"])
         expected1 = rf.DataFrame(
             {
-                "variable": ["foo", "foo", "bar", "bar", "baz", "baz"],
-                "value": [1, 2, 3, 4, 4, 5],
+                "variable": [
+                    "foo",
+                    "foo",
+                    "foo",
+                    "foo",
+                    "bar",
+                    "bar",
+                    "bar",
+                    "bar",
+                    "baz",
+                    "baz",
+                    "baz",
+                    "baz",
+                    "jaz",
+                    "jaz",
+                    "jaz",
+                    "jaz",
+                ],
+                "value": [
+                    1,
+                    2,
+                    1,
+                    2,
+                    "A",
+                    "B",
+                    "C",
+                    "D",
+                    "!",
+                    "@",
+                    "#",
+                    "$",
+                    0,
+                    1,
+                    2,
+                    3,
+                ],
             }
         )
         expected2 = rf.DataFrame(
             {
-                "baz": [4, 5, 4, 5],
-                "var": ["foo", "foo", "bar", "bar"],
-                "val": [1, 2, 3, 4],
+                "baz": ["!", "@", "#", "$", "!", "@", "#", "$"],
+                "jaz": [0, 1, 2, 3, 0, 1, 2, 3],
+                "var": ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"],
+                "val": [1, 2, 1, 2, "A", "B", "C", "D"],
+            }
+        )
+        expected3 = rf.DataFrame(
+            {
+                "foo": [1, 2, 1, 2, 1, 2, 1, 2],
+                "bar": ["A", "B", "C", "D", "A", "B", "C", "D"],
+                "variable": ["baz", "baz", "baz", "baz", "jaz", "jaz", "jaz", "jaz"],
+                "value": ["!", "@", "#", "$", 0, 1, 2, 3],
             }
         )
         self.assertEqual(result1, expected1)
         self.assertEqual(result2, expected2)
+        self.assertEqual(result3, expected3)
 
     def test_group(self):
         df = rf.DataFrame(

tests/test_ladybugs.py

Lines changed: 10 additions & 0 deletions
@@ -25,3 +25,13 @@ def test_comine_overwrite_and_drop_other(self):
         result = df.combine(["foo", "bar"], into="foo", sep="-", drop=True)
         expected = rf.DataFrame({"foo": ["1-1", "2-2", "3-3"]})
         self.assertEqual(result, expected)
+
+    def test_sample_float_1_point_0(self):
+        df = rf.DataFrame({"foo": range(100)})
+        with self.assertRaisesRegex(ValueError, "must be int if > 1"):
+            df.sample(1.0)
+
+    def test_sample_negative_1(self):
+        df = rf.DataFrame({"foo": range(100)})
+        with self.assertRaisesRegex(ValueError, "must be > 0"):
+            df.sample(-1)

tests/test_type_hints.py

Lines changed: 13 additions & 0 deletions
@@ -156,10 +156,23 @@ def test_gather_bad_columns(self):
         with self.assertRaisesRegex(TypeError, "must be list | None"):
             self.df.gather(1)
 
+    def test_gather_bad_beside(self):
+        with self.assertRaisesRegex(TypeError, "must be str | list | None"):
+            self.df.gather(beside=1)
+
     def test_gather_bad_into_column(self):
         with self.assertRaisesRegex(TypeError, "must be tuple"):
             self.df.gather(["foo", "bar"], into=1)
 
+    def test_gather_bad_into_tuple(self):
+        # with self.assertRaisesRegex(TypeError, f'must be tuple[str, str]'):
+        #     self.df.gather(into=("one", "two", "three"))
+        pass
+
+    def test_gather_bad_both_not_none(self):
+        with self.assertRaisesRegex(ValueError, "columns OR beside must be None"):
+            self.df.gather(columns=["foo", "bar"], beside=["baz"])
+
     def test_group_bad_by_columns(self):
         with self.assertRaisesRegex(TypeError, "must be list | str"):
             self.df.group(1)
