Skip to content

Commit 93e352c

Browse files
authored
Merge pull request #153 from scikit-learn-contrib/pull_149
Output heterogeneous data types
2 parents 534f48d + 62bf316 commit 93e352c

File tree

3 files changed

+50
-11
lines changed

3 files changed

+50
-11
lines changed

README.rst

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -190,14 +190,14 @@ By default the output of the dataframe mapper is a numpy array. This is so becau
190190
... ], df_out=True)
191191
>>> np.round(mapper_df.fit_transform(data.copy()), 2)
192192
pet_cat pet_dog pet_fish children
193-
0 1.0 0.0 0.0 0.21
194-
1 0.0 1.0 0.0 1.88
195-
2 0.0 1.0 0.0 -0.63
196-
3 0.0 0.0 1.0 -0.63
197-
4 1.0 0.0 0.0 -1.46
198-
5 0.0 1.0 0.0 -0.63
199-
6 1.0 0.0 0.0 1.04
200-
7 0.0 0.0 1.0 0.21
193+
0 1 0 0 0.21
194+
1 0 1 0 1.88
195+
2 0 1 0 -0.63
196+
3 0 0 1 -0.63
197+
4 1 0 0 -1.46
198+
5 0 1 0 -0.63
199+
6 1 0 0 1.04
200+
7 0 0 1 0.21
201201

202202
The names for the columns are the same ones present in the ``transformed_names_``
203203
attribute.
@@ -413,6 +413,7 @@ Development
413413
******************
414414
* Add ``strategy`` and ``replacement`` parameters to ``CategoricalImputer`` to allow imputing
415415
with values other than the mode. (#144)
416+
* Preserve input data types when no transform is supplied (#138)
416417

417418

418419
1.6.0 (2017-10-28)
@@ -502,5 +503,6 @@ Other contributors:
502503
* Paul Butler (@paulgb)
503504
* Richard Miller (@rwjmiller)
504505
* Ritesh Agrawal (@ragrawal)
506+
* Timothy Sweetser (@hacktuarial)
505507
* Vitaley Zaretskey (@vzaretsk)
506508
* Zac Stewart (@zacstewart)

sklearn_pandas/dataframe_mapper.py

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,19 @@ def get_names(self, columns, transformer, x, alias=None):
260260
else:
261261
return [name]
262262

263+
def get_dtypes(self, extracted):
    """
    Collect the dtypes of all output columns, in output order.

    extracted   iterable of extracted/transformed feature blocks; each
                block is passed to ``self.get_dtype``, which returns a
                list of dtypes for that block.

    Returns one flat list made by concatenating the per-block lists.
    """
    flat_dtypes = []
    for block in extracted:
        # each block contributes one dtype per column it produces
        flat_dtypes.extend(self.get_dtype(block))
    return flat_dtypes
267+
268+
def get_dtype(self, ex):
    """
    Return the dtype of every column of one extracted feature block.

    ex      a numpy array, a scipy sparse matrix or a pandas DataFrame

    Returns a list with one dtype per column. Arrays and sparse matrices
    have a single dtype, so it is repeated once per column; a DataFrame
    yields its own per-column dtypes.

    Raises TypeError (carrying the offending type) for any other input.
    """
    if isinstance(ex, pd.DataFrame):
        return list(ex.dtypes)

    if isinstance(ex, np.ndarray) or sparse.issparse(ex):
        # An input without a second axis (e.g. a 1-d array) is treated as
        # a single column rather than failing with an IndexError on
        # ``shape[1]``.
        n_columns = ex.shape[1] if len(ex.shape) > 1 else 1
        return [ex.dtype] * n_columns

    raise TypeError(type(ex))
275+
263276
def transform(self, X):
264277
"""
265278
Transform the given data. Assumes that fit has already been called.
@@ -323,8 +336,15 @@ def transform(self, X):
323336
else:
324337
index = None
325338

326-
return pd.DataFrame(stacked,
327-
columns=self.transformed_names_,
328-
index=index)
339+
# output different data types, if appropriate
340+
dtypes = self.get_dtypes(extracted)
341+
df_out = pd.DataFrame(
342+
stacked,
343+
columns=self.transformed_names_,
344+
index=index)
345+
# preserve types
346+
for col, dtype in zip(self.transformed_names_, dtypes):
347+
df_out[col] = df_out[col].astype(dtype)
348+
return df_out
329349
else:
330350
return stacked

tests/test_dataframe_mapper.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -829,3 +829,20 @@ def test_direct_cross_validation(iris_dataframe):
829829
scores = sklearn_cv_score(pipeline, data, labels)
830830
assert scores.mean() > 0.96
831831
assert (scores.std() * 2) < 0.04
832+
833+
834+
def test_heterogeneous_output_types_input_df():
    """
    Scale feat2 and let feat1 pass through untouched (``default=None``);
    each output column must keep its own dtype.
    Per the original author, this does not hold if input_df == False.
    """
    raw_columns = {
        'feat1': [1, 2, 3, 4, 5, 6],
        'feat2': [1.0, 2.0, 3.0, 2.0, 3.0, 4.0]
    }
    frame = pd.DataFrame(raw_columns)

    # only feat2 gets a transformer; feat1 is handled by ``default=None``
    features = [(['feat2'], StandardScaler())]
    mapper = DataFrameMapper(
        features, input_df=True, df_out=True, default=None)

    transformed = mapper.fit_transform(frame)

    assert transformed['feat1'].dtype == np.dtype('int64')
    assert transformed['feat2'].dtype == np.dtype('float64')

0 commit comments

Comments
 (0)