Skip to content

Commit e01d8b9

Browse files
committed
Fix bug in dummification of DataFrames
1 parent e25010c commit e01d8b9

File tree

4 files changed

+131
-49
lines changed

4 files changed

+131
-49
lines changed

danfojs-browser/src/core/get_dummies.js

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,11 @@ function dummyEncode(data, options) {
129129
const index = uniqueValues.indexOf(colData[j]);
130130
oneHotArr[j][index] = 1;
131131
const prefixToAdd = prefix ? prefix[i] : column;
132-
newColumnNames.push(`${prefixToAdd}${prefixSeparator[i]}${colData[j]}`);
132+
const newColName = `${prefixToAdd}${prefixSeparator[i]}${colData[j]}`;
133+
134+
if (!newColumnNames.includes(newColName)) {
135+
newColumnNames.push(newColName);
136+
}
133137
}
134138

135139
for (let k = 0; k < newData.length; k++) {

danfojs-browser/tests/core/get_dummies.js

Lines changed: 80 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -3,85 +3,85 @@
33
describe("DummyEncoder", function () {
44
it("get_dummies works on Series", function () {
55

6-
const data = [ "dog", "male", "female", "male", "female", "male", "dog" ];
6+
const data = ["dog", "male", "female", "male", "female", "male", "dog"];
77
const series = new dfd.Series(data);
88
const df = dfd.get_dummies(series, { prefix: "test", prefixSeparator: "/" });
99

1010
const dfValues = [
11-
[ 1, 0, 0 ],
12-
[ 0, 1, 0 ],
13-
[ 0, 0, 1 ],
14-
[ 0, 1, 0 ],
15-
[ 0, 0, 1 ],
16-
[ 0, 1, 0 ],
17-
[ 1, 0, 0 ]
11+
[1, 0, 0],
12+
[0, 1, 0],
13+
[0, 0, 1],
14+
[0, 1, 0],
15+
[0, 0, 1],
16+
[0, 1, 0],
17+
[1, 0, 0]
1818
];
19-
const dfColumns = [ 'test/dog', 'test/male', 'test/female' ];
19+
const dfColumns = ['test/dog', 'test/male', 'test/female'];
2020
assert.deepEqual(df.values, dfValues);
2121
assert.deepEqual(df.columns, dfColumns);
2222
});
2323
it("get_dummies works on Series with default prefix and prefixSeperator", function () {
2424

25-
const data = [ "dog", "male", "female", "male", "female", "male", "dog" ];
25+
const data = ["dog", "male", "female", "male", "female", "male", "dog"];
2626
const series = new dfd.Series(data);
2727
const df = dfd.get_dummies(series);
2828

2929
const dfValues = [
30-
[ 1, 0, 0 ],
31-
[ 0, 1, 0 ],
32-
[ 0, 0, 1 ],
33-
[ 0, 1, 0 ],
34-
[ 0, 0, 1 ],
35-
[ 0, 1, 0 ],
36-
[ 1, 0, 0 ]
30+
[1, 0, 0],
31+
[0, 1, 0],
32+
[0, 0, 1],
33+
[0, 1, 0],
34+
[0, 0, 1],
35+
[0, 1, 0],
36+
[1, 0, 0]
3737
];
38-
const dfColumns = [ '0_dog', '1_male', '2_female' ];
38+
const dfColumns = ['0_dog', '1_male', '2_female'];
3939
assert.deepEqual(df.values, dfValues);
4040
assert.deepEqual(df.columns, dfColumns);
4141
});
4242

4343
it("get_dummies works on DataFrame", function () {
4444

45-
const data = [ [ 1, "dog", 1.0, "fat" ], [ 3, "fog", 2.0, "good" ], [ 4, "gof", 3.0, "best" ] ];
46-
const columns = [ "A", "B", "C", "d" ];
45+
const data = [[1, "dog", 1.0, "fat"], [3, "fog", 2.0, "good"], [4, "gof", 3.0, "best"]];
46+
const columns = ["A", "B", "C", "d"];
4747
const df = new dfd.DataFrame(data, { columns: columns });
4848

49-
const df1 = dfd.get_dummies(df, { prefixSeparator: [ "_", "#" ], columns: [ "A", "d" ], prefix: "test" });
50-
const expectedColumns = [ 'B', 'C', 'test_1', 'test_3', 'test_4', 'test#fat', 'test#good', 'test#best' ];
51-
const expected = [ [ 'dog', 1.0, 1, 0, 0, 1, 0, 0 ],
52-
[ 'fog', 2.0, 0, 1, 0, 0, 1, 0 ],
53-
[ 'gof', 3.0, 0, 0, 1, 0, 0, 1 ] ];
49+
const df1 = dfd.get_dummies(df, { prefixSeparator: ["_", "#"], columns: ["A", "d"], prefix: "test" });
50+
const expectedColumns = ['B', 'C', 'test_1', 'test_3', 'test_4', 'test#fat', 'test#good', 'test#best'];
51+
const expected = [['dog', 1.0, 1, 0, 0, 1, 0, 0],
52+
['fog', 2.0, 0, 1, 0, 0, 1, 0],
53+
['gof', 3.0, 0, 0, 1, 0, 0, 1]];
5454
assert.deepEqual(df1.values, expected);
5555
assert.deepEqual(df1.columns, expectedColumns);
5656

5757
});
5858
it("Throw error if the prefix specified is not equal to the column specified", function () {
5959

60-
const data = [ [ 1, "dog", 1.0, "fat" ], [ 3, "fog", 2.0, "good" ], [ 4, "gof", 3.0, "best" ] ];
61-
const columns = [ "A", "B", "C", "d" ];
60+
const data = [[1, "dog", 1.0, "fat"], [3, "fog", 2.0, "good"], [4, "gof", 3.0, "best"]];
61+
const columns = ["A", "B", "C", "d"];
6262
const df = new dfd.DataFrame(data, { columns: columns });
6363

64-
assert.throws(function () { dfd.get_dummies(df, { prefix: [ "fg" ], prefixSeparator: "_", columns: [ "A", "d" ] }); }, Error,
64+
assert.throws(function () { dfd.get_dummies(df, { prefix: ["fg"], prefixSeparator: "_", columns: ["A", "d"] }); }, Error,
6565
`ParamError: prefix and data array must be of the same length. If you need to use the same prefix, then pass a string param instead. e.g {prefix: "fg"}`);
6666

6767
});
6868
it("replace column sepecified with prefix", function () {
6969

70-
const data = [ [ 1, "dog", 1.0, "fat" ], [ 3, "fog", 2.0, "good" ], [ 4, "gof", 3.0, "best" ] ];
71-
const columns = [ "A", "B", "C", "d" ];
70+
const data = [[1, "dog", 1.0, "fat"], [3, "fog", 2.0, "good"], [4, "gof", 3.0, "best"]];
71+
const columns = ["A", "B", "C", "d"];
7272
const df = new dfd.DataFrame(data, { columns: columns });
7373

74-
const df1 = dfd.get_dummies(df, { prefix: [ "F", "G" ], prefixSeparator: "_", columns: [ "A", "d" ] });
74+
const df1 = dfd.get_dummies(df, { prefix: ["F", "G"], prefixSeparator: "_", columns: ["A", "d"] });
7575
const expectedColumns = [
7676
'B', 'C',
7777
'F_1', 'F_3',
7878
'F_4', 'G_fat',
7979
'G_good', 'G_best'
8080
];
8181

82-
const expected = [ [ 'dog', 1.0, 1, 0, 0, 1, 0, 0 ],
83-
[ 'fog', 2.0, 0, 1, 0, 0, 1, 0 ],
84-
[ 'gof', 3.0, 0, 0, 1, 0, 0, 1 ] ];
82+
const expected = [['dog', 1.0, 1, 0, 0, 1, 0, 0],
83+
['fog', 2.0, 0, 1, 0, 0, 1, 0],
84+
['gof', 3.0, 0, 0, 1, 0, 0, 1]];
8585

8686
assert.deepEqual(df1.values, expected);
8787
assert.deepEqual(df1.columns, expectedColumns);
@@ -90,8 +90,8 @@ describe("DummyEncoder", function () {
9090

9191
it("get_dummies auto infers and encode columns with string dtype", function () {
9292

93-
const data = [ [ 1, "dog", 1.0, "fat" ], [ 3, "fog", 2.0, "good" ], [ 4, "gof", 3.0, "best" ] ];
94-
const columns = [ "A", "B", "C", "d" ];
93+
const data = [[1, "dog", 1.0, "fat"], [3, "fog", 2.0, "good"], [4, "gof", 3.0, "best"]];
94+
const columns = ["A", "B", "C", "d"];
9595
const df = new dfd.DataFrame(data, { columns: columns });
9696

9797
const df1 = dfd.get_dummies(df, { prefixSeparator: "_" });
@@ -122,16 +122,54 @@ describe("DummyEncoder", function () {
122122

123123
it("should one hot encode all other columns", function () {
124124

125-
const data = [ [ 1, "dog", 1.0, "fat" ], [ 3, "fog", 2.0, "good" ], [ 4, "gof", 3.0, "best" ] ];
126-
const columns = [ "A", "B", "C", "d" ];
125+
const data = [[1, "dog", 1.0, "fat"], [3, "fog", 2.0, "good"], [4, "gof", 3.0, "best"]];
126+
const columns = ["A", "B", "C", "d"];
127127
const df = new dfd.DataFrame(data, { columns: columns });
128128
const rslt = [
129-
[ 1, 'dog', 1, 1, 0, 0 ],
130-
[ 3, 'fog', 2, 0, 1, 0 ],
131-
[ 4, 'gof', 3, 0, 0, 1 ]
129+
[1, 'dog', 1, 1, 0, 0],
130+
[3, 'fog', 2, 0, 1, 0],
131+
[4, 'gof', 3, 0, 0, 1]
132132
];
133133

134-
assert.deepEqual(dfd.get_dummies(df, { columns: [ "d" ] }).values, rslt);
134+
assert.deepEqual(dfd.get_dummies(df, { columns: ["d"] }).values, rslt);
135+
136+
});
137+
138+
139+
it("Dummification works for object DF", function () {
140+
141+
let data = {
142+
fruits: ['pear', 'mango', "pawpaw", "mango", "bean"],
143+
Count: [20, 30, 89, 12, 30],
144+
Country: ["NG", "NG", "GH", "RU", "RU"]
145+
};
146+
147+
let df = new dfd.DataFrame(data);
148+
const expected = [
149+
[
150+
20, 1, 0, 0,
151+
0, 1, 0, 0
152+
],
153+
[
154+
30, 0, 1, 0,
155+
0, 1, 0, 0
156+
],
157+
[
158+
89, 0, 0, 1,
159+
0, 0, 1, 0
160+
],
161+
[
162+
12, 0, 1, 0,
163+
0, 0, 0, 1
164+
],
165+
[
166+
30, 0, 0, 0,
167+
1, 0, 0, 1
168+
]
169+
];
170+
171+
let dum_df = dfd.get_dummies(df, { prefixSeparator: "_" });
172+
assert.deepEqual(dum_df.values, expected);
135173

136174
});
137175
});

danfojs-node/src/core/get_dummies.js

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,6 @@ function dummyEncode(data, options) {
114114
return new DataFrame(oneHotArr, { columns: newColumnNames });
115115

116116
} else {
117-
118117
const dfWithSelectedColumnsDropped = data.drop({ columns });
119118
let newData = dfWithSelectedColumnsDropped.values;
120119
const newColumnNames = dfWithSelectedColumnsDropped.columns;
@@ -129,7 +128,10 @@ function dummyEncode(data, options) {
129128
const index = uniqueValues.indexOf(colData[j]);
130129
oneHotArr[j][index] = 1;
131130
const prefixToAdd = prefix ? prefix[i] : column;
132-
newColumnNames.push(`${prefixToAdd}${prefixSeparator[i]}${colData[j]}`);
131+
const newColName = `${prefixToAdd}${prefixSeparator[i]}${colData[j]}`;
132+
if (!newColumnNames.includes(newColName)) {
133+
newColumnNames.push(newColName);
134+
}
133135
}
134136

135137
for (let k = 0; k < newData.length; k++) {

danfojs-node/tests/core/get_dummies.js

Lines changed: 42 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,8 @@ describe("DummyEncoder", function () {
5050
const df1 = get_dummies(df, { prefixSeparator: ["_", "#"], columns: ["A", "d"], prefix: "test" });
5151
const expectedColumns = ['B', 'C', 'test_1', 'test_3', 'test_4', 'test#fat', 'test#good', 'test#best'];
5252
const expected = [['dog', 1.0, 1, 0, 0, 1, 0, 0],
53-
['fog', 2.0, 0, 1, 0, 0, 1, 0],
54-
['gof', 3.0, 0, 0, 1, 0, 0, 1]];
53+
['fog', 2.0, 0, 1, 0, 0, 1, 0],
54+
['gof', 3.0, 0, 0, 1, 0, 0, 1]];
5555
assert.deepEqual(df1.values, expected);
5656
assert.deepEqual(df1.columns, expectedColumns);
5757

@@ -81,8 +81,8 @@ describe("DummyEncoder", function () {
8181
];
8282

8383
const expected = [['dog', 1.0, 1, 0, 0, 1, 0, 0],
84-
['fog', 2.0, 0, 1, 0, 0, 1, 0],
85-
['gof', 3.0, 0, 0, 1, 0, 0, 1]];
84+
['fog', 2.0, 0, 1, 0, 0, 1, 0],
85+
['gof', 3.0, 0, 0, 1, 0, 0, 1]];
8686

8787
assert.deepEqual(df1.values, expected);
8888
assert.deepEqual(df1.columns, expectedColumns);
@@ -135,4 +135,42 @@ describe("DummyEncoder", function () {
135135
assert.deepEqual(get_dummies(df, { columns: ["d"] }).values, rslt);
136136

137137
});
138+
139+
it("Dummification works for object DF", function () {
140+
141+
let data = {
142+
fruits: ['pear', 'mango', "pawpaw", "mango", "bean"],
143+
Count: [20, 30, 89, 12, 30],
144+
Country: ["NG", "NG", "GH", "RU", "RU"]
145+
};
146+
147+
let df = new DataFrame(data);
148+
const expected = [
149+
[
150+
20, 1, 0, 0,
151+
0, 1, 0, 0
152+
],
153+
[
154+
30, 0, 1, 0,
155+
0, 1, 0, 0
156+
],
157+
[
158+
89, 0, 0, 1,
159+
0, 0, 1, 0
160+
],
161+
[
162+
12, 0, 1, 0,
163+
0, 0, 0, 1
164+
],
165+
[
166+
30, 0, 0, 0,
167+
1, 0, 0, 1
168+
]
169+
];
170+
171+
let dum_df = get_dummies(df, { prefixSeparator: "_" });
172+
dum_df.print();
173+
assert.deepEqual(dum_df.values, expected);
174+
175+
});
138176
});

0 commit comments

Comments
 (0)