Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Orange/statistics/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import bottleneck as bn
import numpy as np
import pandas
import scipy.stats.stats
from scipy import sparse as sp

Expand Down Expand Up @@ -373,8 +374,7 @@ def weighted_mean():
X.shape[0] - non_zero,
non_zero))
else:
X_str = X.astype(str)
nans = ((X_str == "nan") | (X_str == "")).sum(axis=0) \
nans = (pandas.isnull(X).sum(axis=0) + (X == "").sum(axis=0)) \
if X.size else np.zeros(X.shape[1])
return np.column_stack((
np.tile(np.inf, X.shape[1]),
Expand Down
23 changes: 23 additions & 0 deletions Orange/tests/test_statistics.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# pylint: disable=no-self-use
import time
import unittest
import warnings
from itertools import chain
Expand Down Expand Up @@ -145,6 +146,28 @@ def test_stats_non_numeric(self):
[np.inf, -np.inf, 0, 0, 2, 1],
[np.inf, -np.inf, 0, 0, 0, 3]])

def test_stats_long_string_mem_use(self):
    """Check that a single very long string does not make stats() costly.

    stats() is timed on a 1000x1000 object array of one-character
    strings, then again after one cell is replaced by a 2000-character
    string. The second run must not take much longer than the first.
    """
    X = np.full((1000, 1000), "a", dtype=object)
    # perf_counter() is monotonic and high-resolution, so the measured
    # interval cannot be distorted by wall-clock adjustments.
    start = time.perf_counter()
    stats(X)
    t_a = time.perf_counter() - start  # baseline: constant-len strings

    # Add one very long string
    X[0, 0] = "a" * 2000

    # The implementation of stats() in Orange 3.30.2 used .astype("str")
    # internally. X.astype("str") would take ~1000x the memory as X,
    # because its type would be "<U2000" (the length of the longest
    # string). That is about 7.5 GiB of memory on a 64-bit Linux system.

    # Because it is hard to measure memory use, we here measure time, as
    # memory allocation of such big tables takes time. On Marko's
    # Linux system .astype("str") took ~3 seconds.
    start = time.perf_counter()
    stats(X)
    t_b = time.perf_counter() - start
    self.assertLess(t_b, 2 * t_a + 0.1)  # some grace period

def test_nanmin_nanmax(self):
warnings.filterwarnings("ignore", r".*All-NaN slice encountered.*")
for X in self.data:
Expand Down