Skip to content

Commit 461e0d2

Browse files
authored
Merge pull request #6278 from markotoplak/fix-from-table-boolean
[FIX] Table.from_table works correctly with boolean indices
2 parents e0bef2e + d4c595e commit 461e0d2

File tree

2 files changed

+104
-39
lines changed

2 files changed

+104
-39
lines changed

Orange/data/table.py

Lines changed: 66 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -734,13 +734,10 @@ def from_table(cls, domain, source, row_indices=...):
734734
table = assure_domain_conversion_sparsity(table, source)
735735
return table
736736

737-
if row_indices is ...:
738-
n_rows = len(source)
739-
elif isinstance(row_indices, slice):
740-
row_indices_range = range(*row_indices.indices(source.X.shape[0]))
741-
n_rows = len(row_indices_range)
742-
else:
743-
n_rows = len(row_indices)
737+
# avoid boolean indices; also convert to slices if possible
738+
row_indices = _optimize_indices(row_indices, len(source))
739+
740+
n_rows = _selection_length(row_indices, len(source))
744741

745742
self = cls()
746743
self.domain = domain
@@ -783,13 +780,8 @@ def from_table(cls, domain, source, row_indices=...):
783780

784781
while i_done < n_rows:
785782
target_indices = slice(i_done, min(n_rows, i_done + PART))
786-
if row_indices is ...:
787-
source_indices = target_indices
788-
elif isinstance(row_indices, slice):
789-
r = row_indices_range[target_indices]
790-
source_indices = slice(r.start, r.stop, r.step)
791-
else:
792-
source_indices = row_indices[target_indices]
783+
source_indices = _select_from_selection(row_indices, target_indices,
784+
len(source))
793785
part_rows = min(n_rows, i_done+PART) - i_done
794786

795787
for array_conv in table_conversion.columnwise:
@@ -810,15 +802,9 @@ def from_table(cls, domain, source, row_indices=...):
810802
out = cparts if not array_conv.is_sparse else sp.vstack(cparts)
811803
setattr(self, array_conv.target, out)
812804

813-
if source.has_weights():
814-
self.W = source.W[row_indices]
815-
else:
816-
self.W = np.empty((n_rows, 0))
805+
self.W = source.W[row_indices]
817806
self.name = getattr(source, 'name', '')
818-
if hasattr(source, 'ids'):
819-
self.ids = source.ids[row_indices]
820-
else:
821-
cls._init_ids(self)
807+
self.ids = source.ids[row_indices]
822808
self.attributes = deepcopy(getattr(source, 'attributes', {}))
823809
_idcache_save(_thread_local.conversion_cache, (domain, source), self)
824810
return self
@@ -876,7 +862,7 @@ def from_table_rows(cls, source, row_indices):
876862
self.metas = self.metas.reshape(-1, len(self.domain.metas))
877863
self.W = source.W[row_indices]
878864
self.name = getattr(source, 'name', '')
879-
self.ids = np.array(source.ids[row_indices])
865+
self.ids = source.ids[row_indices]
880866
self.attributes = deepcopy(getattr(source, 'attributes', {}))
881867
return self
882868

@@ -2421,19 +2407,24 @@ def _subarray(arr, rows, cols):
24212407
# so they need to be reshaped to produce an open mesh
24222408
return arr[np.ix_(rows, cols)]
24232409

2424-
def _optimize_indices(indices, maxlen):
2410+
2411+
def _optimize_indices(indices, size):
24252412
"""
2426-
Convert integer indices to slice if possible. It only converts increasing
2427-
integer ranges with positive steps and valid starts and ends.
2428-
Only convert valid ends so that invalid ranges will still raise
2429-
an exception.
2413+
Convert boolean indices to integer indices and convert these to a slice
2414+
if possible.
2415+
2416+
A slice is created from only from indices with positive steps and
2417+
valid starts and ends (so that invalid ranges will still raise an
2418+
exception. An IndexError is raised if boolean indices do not conform
2419+
to input size.
24302420
24312421
Allows numpy to reuse the data array, because it defaults to copying
24322422
if given indices.
24332423
24342424
Parameters
24352425
----------
24362426
indices : 1D sequence, slice or Ellipsis
2427+
size : int
24372428
"""
24382429
if isinstance(indices, slice):
24392430
return indices
@@ -2450,19 +2441,58 @@ def _optimize_indices(indices, maxlen):
24502441

24512442
if len(indices) >= 1:
24522443
indices = np.asarray(indices)
2453-
if indices.dtype != bool:
2454-
begin = indices[0]
2455-
end = indices[-1]
2456-
steps = np.diff(indices) if len(indices) > 1 else np.array([1])
2457-
step = steps[0]
2444+
if indices.dtype == bool:
2445+
if len(indices) == size:
2446+
indices = np.nonzero(indices)[0]
2447+
else:
2448+
# raise an exception that numpy would if boolean indices were used
2449+
raise IndexError("boolean indices did not match dimension")
2450+
2451+
if len(indices) >= 1: # conversion from boolean indices could result in an empty array
2452+
begin = indices[0]
2453+
end = indices[-1]
2454+
steps = np.diff(indices) if len(indices) > 1 else np.array([1])
2455+
step = steps[0]
24582456

2459-
# continuous ranges with constant step and valid start and stop index can be slices
2460-
if np.all(steps == step) and step > 0 and begin >= 0 and end < maxlen:
2461-
return slice(begin, end + step, step)
2457+
# continuous ranges with constant step and valid start and stop index can be slices
2458+
if np.all(steps == step) and step > 0 and begin >= 0 and end < size:
2459+
return slice(begin, end + step, step)
24622460

24632461
return indices
24642462

24652463

2464+
def _selection_length(indices, maxlen):
2465+
""" Return the selection length.
2466+
Args:
2467+
indices: 1D sequence, slice or Ellipsis
2468+
maxlen: maximum length of the sequence
2469+
"""
2470+
if indices is ...:
2471+
return maxlen
2472+
elif isinstance(indices, slice):
2473+
return len(range(*indices.indices(maxlen)))
2474+
else:
2475+
return len(indices)
2476+
2477+
2478+
def _select_from_selection(source_indices, selection_indices, maxlen):
2479+
"""
2480+
Create efficient selection indices from a previous selection.
2481+
Try to keep slices as slices.
2482+
Args:
2483+
source_indices: 1D sequence, slice or Ellipsis
2484+
selection_indices: 1D sequence or slice
2485+
maxlen: maximum length of the sequence
2486+
"""
2487+
if source_indices is ...:
2488+
return selection_indices
2489+
elif isinstance(source_indices, slice):
2490+
r = range(*source_indices.indices(maxlen))[selection_indices]
2491+
return slice(r.start, r.stop, r.step)
2492+
else:
2493+
return source_indices[selection_indices]
2494+
2495+
24662496
def assure_domain_conversion_sparsity(target, source):
24672497
"""
24682498
Assure that the table obeys the domain conversion's suggestions about sparsity.

Orange/tests/test_table.py

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1819,6 +1819,36 @@ def test_creates_table_with_given_domain_and_row_filter(self):
18191819
self.assert_table_with_filter_matches(
18201820
new_table, self.table[:0], xcols=order[:a], ycols=order[a:a+c], mcols=order[a+c:])
18211821

1822+
def test_from_table_with_boolean_row_filter(self):
1823+
a, c, m = column_sizes(self.table)
1824+
domain = self.table.domain
1825+
1826+
sel = [False]*len(self.table)
1827+
sel[2] = True
1828+
1829+
with patch.object(Table, "from_table_rows", wraps=Table.from_table_rows) \
1830+
as from_table_rows:
1831+
new_table = Table.from_table(self.table.domain, self.table, row_indices=sel)
1832+
from_table_rows.assert_called()
1833+
self.assert_table_with_filter_matches(
1834+
new_table, self.table[2:3])
1835+
1836+
new_domain1 = Domain(domain.attributes[:1], domain.class_vars[:1], domain.metas[:1])
1837+
with patch.object(Table, "from_table_rows", wraps=Table.from_table_rows) \
1838+
as from_table_rows:
1839+
new_table = Table.from_table(new_domain1, self.table, row_indices=sel)
1840+
from_table_rows.assert_not_called()
1841+
self.assert_table_with_filter_matches(
1842+
new_table, self.table[2:3],
1843+
xcols=[0], ycols=[a], mcols=[a+c+m-1])
1844+
1845+
new_domain2 = Domain(domain.attributes[:1] + (ContinuousVariable("new"),),
1846+
domain.class_vars[:1], domain.metas[:1])
1847+
new_table = Table.from_table(new_domain2, self.table, row_indices=sel)
1848+
self.assert_table_with_filter_matches(
1849+
new_table.transform(new_domain1), self.table[2:3],
1850+
xcols=[0], ycols=[a], mcols=[a+c+m-1])
1851+
18221852
def test_from_table_sparse_move_some_to_empty_metas(self):
18231853
iris = data.Table("iris").to_sparse()
18241854
new_domain = data.domain.Domain(
@@ -2053,7 +2083,6 @@ def test_can_select_a_subset_of_rows_and_columns(self):
20532083
np.testing.assert_almost_equal(table.metas,
20542084
self.table.metas[r, metas])
20552085

2056-
20572086
def test_optimize_indices(self):
20582087
# ordinary conversion
20592088
self.assertEqual(_optimize_indices([1, 2, 3], 4), slice(1, 4, 1))
@@ -2064,8 +2093,14 @@ def test_optimize_indices(self):
20642093
np.testing.assert_equal(_optimize_indices([1, 2, 4], 5), [1, 2, 4])
20652094
np.testing.assert_equal(_optimize_indices((1, 2, 4), 5), [1, 2, 4])
20662095

2067-
# leave boolean arrays
2068-
np.testing.assert_equal(_optimize_indices([True, False, True], 3), [True, False, True])
2096+
# internally convert boolean arrays into indices
2097+
np.testing.assert_equal(_optimize_indices([False, False, False, False], 4), [])
2098+
np.testing.assert_equal(_optimize_indices([True, False, True, True], 4), [0, 2, 3])
2099+
np.testing.assert_equal(_optimize_indices([True, False, True], 3), slice(0, 4, 2))
2100+
with self.assertRaises(IndexError):
2101+
_optimize_indices([True, False, True], 2)
2102+
with self.assertRaises(IndexError):
2103+
_optimize_indices([True, False, True], 4)
20692104

20702105
# do not convert if step is negative
20712106
np.testing.assert_equal(_optimize_indices([4, 2, 0], 5), [4, 2, 0])

0 commit comments

Comments
 (0)