fix: Multiple fixes for handling different data types in pandas column analysis

OlegWock · OlegWock · commit 9d44b3d76a8f · 2025-11-06T13:36:38.000+01:00
diff --git a/.cursorrules b/.cursorrules
@@ -94,6 +94,12 @@ Additional for integration tests:
 # Run local tests
 ./bin/test-local
 
+# Run a specific test file
+./bin/test-local tests/unit/test_file.py
+
+# ... or a specific test from that file
+./bin/test-local tests/unit/test_file.py::TestClass::test_method
+
 # Run specific test type
 export TEST_TYPE="unit|integration"
 export TOOLKIT_VERSION="local-build"
diff --git a/deepnote_toolkit/ocelots/pandas/analyze.py b/deepnote_toolkit/ocelots/pandas/analyze.py
@@ -6,6 +6,11 @@
 import pandas as pd
 
 from deepnote_toolkit.ocelots.constants import DEEPNOTE_INDEX_COLUMN
+from deepnote_toolkit.ocelots.pandas.utils import (
+    is_type_datetime_or_timedelta,
+    is_type_numeric,
+    safe_convert_to_string,
+)
 from deepnote_toolkit.ocelots.types import ColumnsStatsRecord, ColumnStats
 
 
@@ -24,7 +29,10 @@ def _get_categories(np_array):
     # special treatment for empty values
     num_nans = pandas_series.isna().sum().item()
 
-    counter = Counter(pandas_series.dropna().astype(str))
+    try:
+        counter = Counter(pandas_series.dropna().astype(str))
+    except (TypeError, UnicodeDecodeError, AttributeError):
+        counter = Counter(pandas_series.dropna().apply(safe_convert_to_string))
 
     max_items = 3
     if num_nans > 0:
@@ -46,34 +54,12 @@ def _get_categories(np_array):
     return [{"name": name, "count": count} for name, count in categories]
 
 
-def _is_type_numeric(dtype):
-    """
-    Returns True if dtype is numeric, False otherwise
-
-    Numeric means either a number (int, float, complex) or a datetime or timedelta.
-    It means e.g. that a range of these values can be plotted on a histogram.
-    """
-
-    # datetime doesn't play nice with np.issubdtype, so we need to check explicitly
-    if pd.api.types.is_datetime64_any_dtype(dtype) or pd.api.types.is_timedelta64_dtype(
-        dtype
-    ):
-        return True
-
-    try:
-        return np.issubdtype(dtype, np.number)
-    except TypeError:
-        # np.issubdtype crashes on categorical column dtype, and also on others, e.g. geopandas types
-        return False
-
-
 def _get_histogram(pd_series):
     try:
-        if pd.api.types.is_datetime64_any_dtype(
-            pd_series
-        ) or pd.api.types.is_timedelta64_dtype(pd_series):
-            # convert datetime or timedelta to an integer so that a histogram can be created
+        if is_type_datetime_or_timedelta(pd_series):
             np_array = np.array(pd_series.dropna().astype(int))
+        elif np.issubdtype(pd_series.dtype, np.complexfloating):
+            return None
         else:
             # let's drop infinite values because they break histograms
             np_array = np.array(pd_series.replace([np.inf, -np.inf], np.nan).dropna())
@@ -104,11 +90,22 @@ def _calculate_min_max(column):
     """
     Calculate min and max values for a given column.
     """
-    if _is_type_numeric(column.dtype):
+    if not is_type_numeric(column.dtype):
+        return None, None
+
+    # Complex numbers cannot be compared for min/max
+    # Check for datetime/timedelta types first, because np.issubdtype doesn't work reliably on them
+    if not is_type_datetime_or_timedelta(column) and np.issubdtype(
+        column.dtype, np.complexfloating
+    ):
+        return None, None
+
+    try:
         min_value = str(min(column.dropna())) if len(column.dropna()) > 0 else None
         max_value = str(max(column.dropna())) if len(column.dropna()) > 0 else None
         return min_value, max_value
-    return None, None
+    except (TypeError, ValueError):
+        return None, None
 
 
 def analyze_columns(
@@ -167,7 +164,7 @@ def analyze_columns(
             unique_count=_count_unique(column), nan_count=column.isnull().sum().item()
         )
 
-        if _is_type_numeric(column.dtype):
+        if is_type_numeric(column.dtype):
             min_value, max_value = _calculate_min_max(column)
             columns[i].stats.min = min_value
             columns[i].stats.max = max_value
@@ -187,7 +184,7 @@ def analyze_columns(
     for i in range(max_columns_to_analyze, len(df.columns)):
         # Ignore columns that are not numeric
         column = df.iloc[:, i]
-        if not _is_type_numeric(column.dtype):
+        if not is_type_numeric(column.dtype):
             continue
 
         column_name = columns[i].name
diff --git a/deepnote_toolkit/ocelots/pandas/utils.py b/deepnote_toolkit/ocelots/pandas/utils.py
@@ -1,10 +1,21 @@
+import base64
+
 import numpy as np
 import pandas as pd
 from packaging.requirements import Requirement
 
 from deepnote_toolkit.ocelots.constants import MAX_STRING_CELL_LENGTH
 
 
+def safe_convert_to_string(value):
+    if isinstance(value, bytes):
+        return base64.b64encode(value).decode("ascii")
+    try:
+        return str(value)
+    except Exception:
+        return "<unconvertible>"
+
+
 # like fillna, but only fills NaT (not a time) values in datetime columns with the specified value
 def fill_nat(df, value):
     df_datetime_columns = df.select_dtypes(
@@ -76,33 +87,38 @@ def deduplicate_columns(df):
 # Cast dataframe contents to strings and trim them to avoid sending too much data
 def cast_objects_to_string(df):
     def to_string_truncated(elem):
-        elem_string = str(elem)
+        elem_string = safe_convert_to_string(elem)
         return (
             (elem_string[: MAX_STRING_CELL_LENGTH - 1] + "…")
             if len(elem_string) > MAX_STRING_CELL_LENGTH
             else elem_string
         )
 
     for column in df:
-        if not _is_type_number(df[column].dtype):
+        if not is_type_numeric(df[column].dtype):
             # if the dtype is not a number, we want to convert it to string and truncate
             df[column] = df[column].apply(to_string_truncated)
 
     return df
 
 
-def _is_type_number(dtype):
+def is_type_datetime_or_timedelta(series_or_dtype):
     """
-    Returns True if dtype is a number, False otherwise. Datetime and timedelta will return False.
+    Returns True if the series or dtype is datetime or timedelta, False otherwise.
+    """
+    return pd.api.types.is_datetime64_any_dtype(
+        series_or_dtype
+    ) or pd.api.types.is_timedelta64_dtype(series_or_dtype)
+
 
-    The primary intent of this is to recognize a value that will converted to a JSON number during serialization.
+def is_type_numeric(dtype):
     """
+    Returns True if dtype is numeric, False otherwise
 
-    if pd.api.types.is_datetime64_any_dtype(dtype) or pd.api.types.is_timedelta64_dtype(
-        dtype
-    ):
-        # np.issubdtype(dtype, np.number) returns True for timedelta, which we don't want
-        return False
+    Numeric means either a number (int, float, complex) or a datetime or timedelta.
+    """
+    if is_type_datetime_or_timedelta(dtype):
+        return True
 
     try:
         return np.issubdtype(dtype, np.number)
diff --git a/deepnote_toolkit/ocelots/pyspark/implementation.py b/deepnote_toolkit/ocelots/pyspark/implementation.py
@@ -243,7 +243,7 @@ def select_column(field: StructField) -> Column:
             # We slice binary field before encoding to avoid encoding potentially big blob. Round slicing to
             # 4 bytes to avoid breaking multi-byte sequences
             if isinstance(field.dataType, BinaryType):
-                sliced = F.substring(field, 1, keep_bytes)
+                sliced = F.substring(F.col(field.name), 1, keep_bytes)
                 return F.base64(sliced)
 
             # String just needs to be trimmed
diff --git a/tests/unit/helpers/testing_dataframes.py b/tests/unit/helpers/testing_dataframes.py
@@ -261,12 +261,14 @@ def create_dataframe_with_duplicate_column_names():
                     datetime.datetime(2023, 1, 1, 12, 0, 0),
                     datetime.datetime(2023, 1, 2, 12, 0, 0),
                 ],
+                "binary": [b"hello", b"world"],
             }
         ),
         "pyspark_schema": pst.StructType(
             [
                 pst.StructField("list", pst.ArrayType(pst.IntegerType()), True),
                 pst.StructField("datetime", pst.TimestampType(), True),
+                pst.StructField("binary", pst.BinaryType(), True),
             ]
         ),
     },
diff --git a/tests/unit/test_analyze_columns_pandas.py b/tests/unit/test_analyze_columns_pandas.py
diff --git a/tests/unit/test_ocelots.py b/tests/unit/test_ocelots.py

Original file line number	Diff line number	Diff line change
`@@ -261,12 +261,14 @@ def create_dataframe_with_duplicate_column_names():`
`261`	`261`	`datetime.datetime(2023, 1, 1, 12, 0, 0),`
`262`	`262`	`datetime.datetime(2023, 1, 2, 12, 0, 0),`
`263`	`263`	`],`
	`264`	`+ "binary": [b"hello", b"world"],`
`264`	`265`	`}`
`265`	`266`	`),`
`266`	`267`	`"pyspark_schema": pst.StructType(`
`267`	`268`	`[`
`268`	`269`	`pst.StructField("list", pst.ArrayType(pst.IntegerType()), True),`
`269`	`270`	`pst.StructField("datetime", pst.TimestampType(), True),`
	`271`	`+ pst.StructField("binary", pst.BinaryType(), True),`
`270`	`272`	`]`
`271`	`273`	`),`
`272`	`274`	`},`