Skip to content

Commit d0ae501

Browse files
committed
Fix memory use in stats() for dtype=object
Before, .astype("str") was applied to the input array. This created an output array of fixed-length strings, each sized to the longest string in the table. Converting numpy object arrays to string arrays should therefore be avoided.
1 parent 351ac65 commit d0ae501

File tree

2 files changed

+25
-2
lines changed

2 files changed

+25
-2
lines changed

Orange/statistics/util.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
import bottleneck as bn
1111
import numpy as np
12+
import pandas
1213
import scipy.stats.stats
1314
from scipy import sparse as sp
1415

@@ -373,8 +374,7 @@ def weighted_mean():
373374
X.shape[0] - non_zero,
374375
non_zero))
375376
else:
376-
X_str = X.astype(str)
377-
nans = ((X_str == "nan") | (X_str == "")).sum(axis=0) \
377+
nans = (pandas.isnull(X).sum(axis=0) + (X == "").sum(axis=0)) \
378378
if X.size else np.zeros(X.shape[1])
379379
return np.column_stack((
380380
np.tile(np.inf, X.shape[1]),

Orange/tests/test_statistics.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
# pylint: disable=no-self-use
2+
import time
23
import unittest
34
import warnings
45
from itertools import chain
@@ -145,6 +146,28 @@ def test_stats_non_numeric(self):
145146
[np.inf, -np.inf, 0, 0, 2, 1],
146147
[np.inf, -np.inf, 0, 0, 0, 3]])
147148

149+
def test_stats_long_string_mem_use(self):
150+
X = np.full((1000, 1000), "a", dtype=object)
151+
t = time.time()
152+
stats(X)
153+
t_a = time.time() - t # time for an array with constant-len strings
154+
155+
# Add one very long string
156+
X[0, 0] = "a"*2000
157+
158+
# The implementation of stats() in Orange 3.30.2 used .astype("str")
159+
# internally. X.astype("str") would take ~1000x the memory as X,
160+
# because its type would be "<U1000" (the length of the longest string).
161+
# That is about 7.5 GiB of memory on a 64-bit Linux system
162+
163+
# Because it is hard to measure CPU, we measure time here, as
164+
# memory allocation of such big tables takes time. On Marko's
165+
# Linux system .astype("str") took ~3 seconds.
166+
t = time.time()
167+
stats(X)
168+
t_b = time.time() - t
169+
self.assertLess(t_b, 2*t_a + 0.1) # some grace period
170+
148171
def test_nanmin_nanmax(self):
149172
warnings.filterwarnings("ignore", r".*All-NaN slice encountered.*")
150173
for X in self.data:

0 commit comments

Comments
 (0)