Skip to content

Commit d0ae501

Browse files
committed
Fix memory use in stats() for dtype=object
Before, .astype("str") was applied to the input array. This created an output array of fixed-length strings, each sized to the longest string in the table. Converting numpy object arrays to string arrays should therefore be avoided.
1 parent 351ac65 commit d0ae501

File tree

2 files changed

+25
-2
lines changed

2 files changed

+25
-2
lines changed

Orange/statistics/util.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
import bottleneck as bn
1111
import numpy as np
12+
import pandas
1213
import scipy.stats.stats
1314
from scipy import sparse as sp
1415

@@ -373,8 +374,7 @@ def weighted_mean():
373374
X.shape[0] - non_zero,
374375
non_zero))
375376
else:
376-
X_str = X.astype(str)
377-
nans = ((X_str == "nan") | (X_str == "")).sum(axis=0) \
377+
nans = (pandas.isnull(X).sum(axis=0) + (X == "").sum(axis=0)) \
378378
if X.size else np.zeros(X.shape[1])
379379
return np.column_stack((
380380
np.tile(np.inf, X.shape[1]),

Orange/tests/test_statistics.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
# pylint: disable=no-self-use
2+
import time
23
import unittest
34
import warnings
45
from itertools import chain
@@ -145,6 +146,28 @@ def test_stats_non_numeric(self):
145146
[np.inf, -np.inf, 0, 0, 2, 1],
146147
[np.inf, -np.inf, 0, 0, 0, 3]])
147148

149+
def test_stats_long_string_mem_use(self):
150+
X = np.full((1000, 1000), "a", dtype=object)
151+
t = time.time()
152+
stats(X)
153+
t_a = time.time() - t # time for an array with constant-len strings
154+
155+
# Add one very long string
156+
X[0, 0] = "a"*2000
157+
158+
# The implementation of stats() in Orange 3.30.2 used .astype("str")
159+
# internally. X.astype("str") would take ~1000x the memory as X,
160+
# because its type would be "<U1000" (the length of the longest string).
161+
# That is about 7.5 GiB of memory on a 64-bit Linux system
162+
163+
# Because it is hard to measure CPU, we measure time here, as
164+
# memory allocation of such big tables takes time. On Marko's
165+
# Linux system .astype("str") took ~3 seconds.
166+
t = time.time()
167+
stats(X)
168+
t_b = time.time() - t
169+
self.assertLess(t_b, 2*t_a + 0.1) # some grace period
170+
148171
def test_nanmin_nanmax(self):
149172
warnings.filterwarnings("ignore", r".*All-NaN slice encountered.*")
150173
for X in self.data:

0 commit comments

Comments
 (0)