diff --git a/Orange/statistics/util.py b/Orange/statistics/util.py index 87af6833f7a..6ef6509e395 100644 --- a/Orange/statistics/util.py +++ b/Orange/statistics/util.py @@ -9,6 +9,7 @@ import bottleneck as bn import numpy as np +import pandas import scipy.stats.stats from scipy import sparse as sp @@ -373,8 +374,7 @@ def weighted_mean(): X.shape[0] - non_zero, non_zero)) else: - X_str = X.astype(str) - nans = ((X_str == "nan") | (X_str == "")).sum(axis=0) \ + nans = (pandas.isnull(X).sum(axis=0) + (X == "").sum(axis=0)) \ if X.size else np.zeros(X.shape[1]) return np.column_stack(( np.tile(np.inf, X.shape[1]), diff --git a/Orange/tests/test_statistics.py b/Orange/tests/test_statistics.py index e3807573d46..81bfddb5466 100644 --- a/Orange/tests/test_statistics.py +++ b/Orange/tests/test_statistics.py @@ -1,4 +1,5 @@ # pylint: disable=no-self-use +import time import unittest import warnings from itertools import chain @@ -145,6 +146,28 @@ def test_stats_non_numeric(self): [np.inf, -np.inf, 0, 0, 2, 1], [np.inf, -np.inf, 0, 0, 0, 3]]) + def test_stats_long_string_mem_use(self): + X = np.full((1000, 1000), "a", dtype=object) + t = time.time() + stats(X) + t_a = time.time() - t # time for an array with constant-len strings + + # Add one very long string + X[0, 0] = "a"*2000 + + # The implementation of stats() in Orange 3.30.2 used .astype("str") + # internally. X.astype("str") would take ~1000x the memory as X, + # because its type would be "