|
1 | 1 | # pylint: disable=no-self-use |
| 2 | +import time |
2 | 3 | import unittest |
3 | 4 | import warnings |
4 | 5 | from itertools import chain |
@@ -145,6 +146,28 @@ def test_stats_non_numeric(self): |
145 | 146 | [np.inf, -np.inf, 0, 0, 2, 1], |
146 | 147 | [np.inf, -np.inf, 0, 0, 0, 3]]) |
147 | 148 |
|
def test_stats_long_string_mem_use(self):
    """Check that one very long string does not make stats() much slower.

    stats() is timed on a 1000 x 1000 object array of one-character
    strings, then again after a single cell is replaced by a
    2000-character string. The second run must not take noticeably
    longer than the first.
    """
    X = np.full((1000, 1000), "a", dtype=object)

    # perf_counter() is monotonic and high-resolution, so the measured
    # intervals cannot be distorted by system clock adjustments.
    start = time.perf_counter()
    stats(X)
    t_a = time.perf_counter() - start  # baseline: constant-length strings

    # Add one very long string
    X[0, 0] = "a"*2000

    # The implementation of stats() in Orange 3.30.2 used .astype("str")
    # internally. X.astype("str") would take ~1000x the memory of X,
    # because its type would be "<U2000" (the length of the longest
    # string). That is about 7.5 GiB of memory on a 64-bit Linux system.

    # Because it is hard to measure memory use directly, we measure time
    # instead: allocating such big tables takes time. On Marko's
    # Linux system .astype("str") took ~3 seconds.
    start = time.perf_counter()
    stats(X)
    t_b = time.perf_counter() - start

    # Allow twice the baseline plus a fixed 0.1 s grace period, so that
    # ordinary timing jitter does not fail the test.
    self.assertLess(t_b, 2*t_a + 0.1)
| 170 | + |
148 | 171 | def test_nanmin_nanmax(self): |
149 | 172 | warnings.filterwarnings("ignore", r".*All-NaN slice encountered.*") |
150 | 173 | for X in self.data: |
|
0 commit comments