Skip to content

Commit f8dda7d

Browse files
committed
Fix tests
1 parent 9d44b3d commit f8dda7d

File tree

2 files changed, with 31 additions and 18 deletions.

deepnote_toolkit/ocelots/pandas/analyze.py

Lines changed: 4 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@
77

88
from deepnote_toolkit.ocelots.constants import DEEPNOTE_INDEX_COLUMN
99
from deepnote_toolkit.ocelots.pandas.utils import (
10+
is_numeric_or_temporal,
1011
is_type_datetime_or_timedelta,
11-
is_type_numeric,
1212
safe_convert_to_string,
1313
)
1414
from deepnote_toolkit.ocelots.types import ColumnsStatsRecord, ColumnStats
@@ -58,8 +58,6 @@ def _get_histogram(pd_series):
5858
try:
5959
if is_type_datetime_or_timedelta(pd_series):
6060
np_array = np.array(pd_series.dropna().astype(int))
61-
elif np.issubdtype(pd_series.dtype, np.complexfloating):
62-
return None
6361
else:
6462
# let's drop infinite values because they break histograms
6563
np_array = np.array(pd_series.replace([np.inf, -np.inf], np.nan).dropna())
@@ -90,14 +88,7 @@ def _calculate_min_max(column):
9088
"""
9189
Calculate min and max values for a given column.
9290
"""
93-
if not is_type_numeric(column.dtype):
94-
return None, None
95-
96-
# Complex numbers cannot be compared for min/max
97-
# Check for datetime/timedelta types before because np.issubdtype doesn't work reliably on them
98-
if not is_type_datetime_or_timedelta(column) and np.issubdtype(
99-
column.dtype, np.complexfloating
100-
):
91+
if not is_numeric_or_temporal(column.dtype):
10192
return None, None
10293

10394
try:
@@ -164,7 +155,7 @@ def analyze_columns(
164155
unique_count=_count_unique(column), nan_count=column.isnull().sum().item()
165156
)
166157

167-
if is_type_numeric(column.dtype):
158+
if is_numeric_or_temporal(column.dtype):
168159
min_value, max_value = _calculate_min_max(column)
169160
columns[i].stats.min = min_value
170161
columns[i].stats.max = max_value
@@ -184,7 +175,7 @@ def analyze_columns(
184175
for i in range(max_columns_to_analyze, len(df.columns)):
185176
# Ignore columns that are not numeric
186177
column = df.iloc[:, i]
187-
if not is_type_numeric(column.dtype):
178+
if not is_numeric_or_temporal(column.dtype):
188179
continue
189180

190181
column_name = columns[i].name

deepnote_toolkit/ocelots/pandas/utils.py

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ def to_string_truncated(elem):
9595
)
9696

9797
for column in df:
98-
if not is_type_numeric(df[column].dtype):
98+
if not is_pure_numeric(df[column].dtype):
9999
# if the dtype is not a number, we want to convert it to string and truncate
100100
df[column] = df[column].apply(to_string_truncated)
101101

@@ -111,17 +111,39 @@ def is_type_datetime_or_timedelta(series_or_dtype):
111111
) or pd.api.types.is_timedelta64_dtype(series_or_dtype)
112112

113113

114-
def is_type_numeric(dtype):
114+
def is_numeric_or_temporal(dtype):
115115
"""
116-
Returns True if dtype is numeric, False otherwise
116+
Returns True if dtype is numeric or temporal (datetime/timedelta), False otherwise.
117117
118-
Numeric means either a number (int, float, complex) or a datetime or timedelta.
118+
This includes numbers (int, float), datetime, and timedelta types.
119+
Use this to determine if values can be plotted on a histogram or have min/max calculated.
119120
"""
120121
if is_type_datetime_or_timedelta(dtype):
121122
return True
122123

123124
try:
124-
return np.issubdtype(dtype, np.number)
125+
return np.issubdtype(dtype, np.number) and not np.issubdtype(
126+
dtype, np.complexfloating
127+
)
128+
except TypeError:
129+
# np.issubdtype crashes on categorical column dtype, and also on others, e.g. geopandas types
130+
return False
131+
132+
133+
def is_pure_numeric(dtype):
134+
"""
135+
Returns True if dtype is a pure number (int, float), False otherwise.
136+
137+
Use this to determine if a value will be serialized as a JSON number.
138+
"""
139+
if is_type_datetime_or_timedelta(dtype):
140+
# np.issubdtype(dtype, np.number) returns True for timedelta, which we don't want
141+
return False
142+
143+
try:
144+
return np.issubdtype(dtype, np.number) and not np.issubdtype(
145+
dtype, np.complexfloating
146+
)
125147
except TypeError:
126148
# np.issubdtype crashes on categorical column dtype, and also on others, e.g. geopandas types
127149
return False

0 commit comments

Comments
 (0)