Skip to content

Commit 8ddddbc

Browse files
yuzhenmao authored and brandonlockhart committed
fix(eda): change dtype 'string' to 'object'
1 parent df32e1d commit 8ddddbc

File tree

4 files changed

+27
-4
lines changed

4 files changed

+27
-4
lines changed

dataprep/eda/create_report/formatter.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,14 @@
1818
from ..distribution.compute.overview import calc_stats
1919
from ..distribution.compute.univariate import cont_comps, nom_comps
2020
from ..distribution.render import format_cat_stats, format_num_stats, format_ov_stats
21-
from ..dtypes import CATEGORICAL_DTYPES, Continuous, Nominal, detect_dtype, is_dtype
21+
from ..dtypes import (
22+
CATEGORICAL_DTYPES,
23+
Continuous,
24+
Nominal,
25+
detect_dtype,
26+
is_dtype,
27+
string_dtype_to_object,
28+
)
2229
from ..intermediate import Intermediate
2330
from ..missing import render_missing
2431
from ..missing.compute.nullivariate import compute_missing_nullivariate
@@ -51,6 +58,7 @@ def format_report(
5158
# pylint: disable=too-many-locals,too-many-statements
5259
with ProgressBar(minimum=1, disable=not progress):
5360
df = to_dask(df)
61+
df = string_dtype_to_object(df)
5462
if mode == "basic":
5563
comps = format_basic(df)
5664
# elif mode == "full":

dataprep/eda/distribution/compute/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import dask.dataframe as dd
66
import pandas as pd
77

8-
from ...dtypes import DTypeDef
8+
from ...dtypes import DTypeDef, string_dtype_to_object
99
from ...intermediate import Intermediate
1010
from ...utils import to_dask
1111
from .bivariate import compute_bivariate
@@ -93,9 +93,9 @@ def compute(
9393
dtype = {"a": Continuous(), "b": "nominal"}
9494
or dtype = Continuous() or dtype = "Continuous" or dtype = Continuous()
9595
""" # pylint: disable=too-many-locals
96-
9796
df = to_dask(df)
9897
df.columns = df.columns.astype(str)
98+
df = string_dtype_to_object(df)
9999

100100
if not any((x, y, z)):
101101
return compute_overview(df, bins, ngroups, largest, timeunit, dtype)

dataprep/eda/dtypes.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@
1515
CATEGORICAL_PANDAS_DTYPES = [pd.CategoricalDtype, pd.PeriodDtype]
1616
CATEGORICAL_DTYPES = CATEGORICAL_NUMPY_DTYPES + CATEGORICAL_PANDAS_DTYPES
1717

18+
STRING_PANDAS_DTYPES = [pd.StringDtype]
19+
STRING_DTYPES = STRING_PANDAS_DTYPES
20+
1821
NUMERICAL_NUMPY_DTYPES = [np.number]
1922
NUMERICAL_DTYPES = NUMERICAL_NUMPY_DTYPES
2023

@@ -256,6 +259,17 @@ def is_pandas_categorical(dtype: Any) -> bool:
256259
return any(isinstance(dtype, c) for c in CATEGORICAL_PANDAS_DTYPES)
257260

258261

262+
def string_dtype_to_object(df: dd.DataFrame) -> dd.DataFrame:
    """
    Convert columns with a pandas string extension dtype to object dtype.

    A column is converted when its dtype is an instance of one of the
    classes listed in ``STRING_DTYPES``; every other column is left as is.

    The input frame is never modified: the conversion is done with a single
    ``astype`` call, which returns a new frame. Assigning ``df[col] = ...``
    instead would change the caller's frame as a side effect.

    Parameters
    ----------
    df
        The dataframe to convert. Only ``dtypes`` and ``astype`` are used,
        so a pandas DataFrame should work as well as a dask one
        (NOTE(review): confirm against the callers that pass non-dask input).

    Returns
    -------
    The same ``df`` object when no column has a string dtype, otherwise a
    new dataframe whose string-dtype columns have been cast to ``object``.
    """
    string_types = tuple(STRING_DTYPES)
    conversions = {
        col: object
        for col, dtype in df.dtypes.items()
        if isinstance(dtype, string_types)
    }
    if not conversions:
        # Nothing to convert; avoid creating a needless copy / graph layer.
        return df

    return df.astype(conversions)
271+
272+
259273
def drop_null(
260274
var: Union[dd.Series, pd.DataFrame, dd.DataFrame]
261275
) -> Union[pd.Series, dd.Series, pd.DataFrame, dd.DataFrame]:

dataprep/eda/missing/compute/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from warnings import catch_warnings, filterwarnings
66

77
from ...data_array import DataArray, DataFrame
8-
from ...dtypes import DTypeDef
8+
from ...dtypes import DTypeDef, string_dtype_to_object
99
from ...intermediate import Intermediate
1010
from .bivariate import compute_missing_bivariate
1111
from .nullivariate import compute_missing_nullivariate
@@ -53,6 +53,7 @@ def compute_missing( # pylint: disable=too-many-arguments
5353
>>> plot_missing(df, "HDI_for_year")
5454
>>> plot_missing(df, "HDI_for_year", "population")
5555
"""
56+
df = string_dtype_to_object(df)
5657
df = DataArray(df)
5758

5859
# pylint: disable=no-else-raise

0 commit comments

Comments
 (0)