66import pandas as pd
77
88from deepnote_toolkit .ocelots .constants import DEEPNOTE_INDEX_COLUMN
9+ from deepnote_toolkit .ocelots .pandas .utils import (
10+ is_type_datetime_or_timedelta ,
11+ is_type_numeric ,
12+ safe_convert_to_string ,
13+ )
914from deepnote_toolkit .ocelots .types import ColumnsStatsRecord , ColumnStats
1015
1116
@@ -24,7 +29,10 @@ def _get_categories(np_array):
2429 # special treatment for empty values
2530 num_nans = pandas_series .isna ().sum ().item ()
2631
27- counter = Counter (pandas_series .dropna ().astype (str ))
32+ try :
33+ counter = Counter (pandas_series .dropna ().astype (str ))
34+ except (TypeError , UnicodeDecodeError , AttributeError ):
35+ counter = Counter (pandas_series .dropna ().apply (safe_convert_to_string ))
2836
2937 max_items = 3
3038 if num_nans > 0 :
@@ -46,34 +54,12 @@ def _get_categories(np_array):
4654 return [{"name" : name , "count" : count } for name , count in categories ]
4755
4856
49- def _is_type_numeric (dtype ):
50- """
51- Returns True if dtype is numeric, False otherwise
52-
53- Numeric means either a number (int, float, complex) or a datetime or timedelta.
54- It means e.g. that a range of these values can be plotted on a histogram.
55- """
56-
57- # datetime doesn't play nice with np.issubdtype, so we need to check explicitly
58- if pd .api .types .is_datetime64_any_dtype (dtype ) or pd .api .types .is_timedelta64_dtype (
59- dtype
60- ):
61- return True
62-
63- try :
64- return np .issubdtype (dtype , np .number )
65- except TypeError :
66- # np.issubdtype crashes on categorical column dtype, and also on others, e.g. geopandas types
67- return False
68-
69-
7057def _get_histogram (pd_series ):
7158 try :
72- if pd .api .types .is_datetime64_any_dtype (
73- pd_series
74- ) or pd .api .types .is_timedelta64_dtype (pd_series ):
75- # convert datetime or timedelta to an integer so that a histogram can be created
59+ if is_type_datetime_or_timedelta (pd_series ):
7660 np_array = np .array (pd_series .dropna ().astype (int ))
61+ elif np .issubdtype (pd_series .dtype , np .complexfloating ):
62+ return None
7763 else :
7864 # let's drop infinite values because they break histograms
7965 np_array = np .array (pd_series .replace ([np .inf , - np .inf ], np .nan ).dropna ())
@@ -104,11 +90,22 @@ def _calculate_min_max(column):
10490 """
10591 Calculate min and max values for a given column.
10692 """
107- if _is_type_numeric (column .dtype ):
93+ if not is_type_numeric (column .dtype ):
94+ return None , None
95+
96+ # Complex numbers cannot be compared for min/max
97+ # Check for datetime/timedelta types first, because np.issubdtype doesn't work reliably on them
98+ if not is_type_datetime_or_timedelta (column ) and np .issubdtype (
99+ column .dtype , np .complexfloating
100+ ):
101+ return None , None
102+
103+ try :
108104 min_value = str (min (column .dropna ())) if len (column .dropna ()) > 0 else None
109105 max_value = str (max (column .dropna ())) if len (column .dropna ()) > 0 else None
110106 return min_value , max_value
111- return None , None
107+ except (TypeError , ValueError ):
108+ return None , None
112109
113110
114111def analyze_columns (
@@ -167,7 +164,7 @@ def analyze_columns(
167164 unique_count = _count_unique (column ), nan_count = column .isnull ().sum ().item ()
168165 )
169166
170- if _is_type_numeric (column .dtype ):
167+ if is_type_numeric (column .dtype ):
171168 min_value , max_value = _calculate_min_max (column )
172169 columns [i ].stats .min = min_value
173170 columns [i ].stats .max = max_value
@@ -187,7 +184,7 @@ def analyze_columns(
187184 for i in range (max_columns_to_analyze , len (df .columns )):
188185 # Ignore columns that are not numeric
189186 column = df .iloc [:, i ]
190- if not _is_type_numeric (column .dtype ):
187+ if not is_type_numeric (column .dtype ):
191188 continue
192189
193190 column_name = columns [i ].name
0 commit comments