diff --git a/Orange/data/table.py b/Orange/data/table.py index 736369919f0..45f4c55e8ff 100644 --- a/Orange/data/table.py +++ b/Orange/data/table.py @@ -734,13 +734,10 @@ def from_table(cls, domain, source, row_indices=...): table = assure_domain_conversion_sparsity(table, source) return table - if row_indices is ...: - n_rows = len(source) - elif isinstance(row_indices, slice): - row_indices_range = range(*row_indices.indices(source.X.shape[0])) - n_rows = len(row_indices_range) - else: - n_rows = len(row_indices) + # avoid boolean indices; also convert to slices if possible + row_indices = _optimize_indices(row_indices, len(source)) + + n_rows = _selection_length(row_indices, len(source)) self = cls() self.domain = domain @@ -783,13 +780,8 @@ def from_table(cls, domain, source, row_indices=...): while i_done < n_rows: target_indices = slice(i_done, min(n_rows, i_done + PART)) - if row_indices is ...: - source_indices = target_indices - elif isinstance(row_indices, slice): - r = row_indices_range[target_indices] - source_indices = slice(r.start, r.stop, r.step) - else: - source_indices = row_indices[target_indices] + source_indices = _select_from_selection(row_indices, target_indices, + len(source)) part_rows = min(n_rows, i_done+PART) - i_done for array_conv in table_conversion.columnwise: @@ -810,15 +802,9 @@ def from_table(cls, domain, source, row_indices=...): out = cparts if not array_conv.is_sparse else sp.vstack(cparts) setattr(self, array_conv.target, out) - if source.has_weights(): - self.W = source.W[row_indices] - else: - self.W = np.empty((n_rows, 0)) + self.W = source.W[row_indices] self.name = getattr(source, 'name', '') - if hasattr(source, 'ids'): - self.ids = source.ids[row_indices] - else: - cls._init_ids(self) + self.ids = source.ids[row_indices] self.attributes = deepcopy(getattr(source, 'attributes', {})) _idcache_save(_thread_local.conversion_cache, (domain, source), self) return self @@ -876,7 +862,7 @@ def from_table_rows(cls, source, 
row_indices): self.metas = self.metas.reshape(-1, len(self.domain.metas)) self.W = source.W[row_indices] self.name = getattr(source, 'name', '') - self.ids = np.array(source.ids[row_indices]) + self.ids = source.ids[row_indices] self.attributes = deepcopy(getattr(source, 'attributes', {})) return self @@ -2421,12 +2407,16 @@ def _subarray(arr, rows, cols): # so they need to be reshaped to produce an open mesh return arr[np.ix_(rows, cols)] -def _optimize_indices(indices, maxlen): + +def _optimize_indices(indices, size): """ - Convert integer indices to slice if possible. It only converts increasing - integer ranges with positive steps and valid starts and ends. - Only convert valid ends so that invalid ranges will still raise - an exception. + Convert boolean indices to integer indices and convert these to a slice + if possible. + + A slice is created only from indices with positive steps and + valid starts and ends (so that invalid ranges will still raise an + exception). An IndexError is raised if boolean indices do not conform + to input size. Allows numpy to reuse the data array, because it defaults to copying if given indices. 
@@ -2434,6 +2424,7 @@ def _optimize_indices(indices, maxlen): Parameters ---------- indices : 1D sequence, slice or Ellipsis + size : int """ if isinstance(indices, slice): return indices @@ -2450,19 +2441,58 @@ def _optimize_indices(indices, maxlen): if len(indices) >= 1: indices = np.asarray(indices) - if indices.dtype != bool: - begin = indices[0] - end = indices[-1] - steps = np.diff(indices) if len(indices) > 1 else np.array([1]) - step = steps[0] + if indices.dtype == bool: + if len(indices) == size: + indices = np.nonzero(indices)[0] + else: + # raise an exception that numpy would if boolean indices were used + raise IndexError("boolean indices did not match dimension") + + if len(indices) >= 1: # conversion from boolean indices could result in an empty array + begin = indices[0] + end = indices[-1] + steps = np.diff(indices) if len(indices) > 1 else np.array([1]) + step = steps[0] - # continuous ranges with constant step and valid start and stop index can be slices - if np.all(steps == step) and step > 0 and begin >= 0 and end < maxlen: - return slice(begin, end + step, step) + # continuous ranges with constant step and valid start and stop index can be slices + if np.all(steps == step) and step > 0 and begin >= 0 and end < size: + return slice(begin, end + step, step) return indices +def _selection_length(indices, maxlen): + """ Return the selection length. + Args: + indices: 1D sequence, slice or Ellipsis + maxlen: maximum length of the sequence + """ + if indices is ...: + return maxlen + elif isinstance(indices, slice): + return len(range(*indices.indices(maxlen))) + else: + return len(indices) + + +def _select_from_selection(source_indices, selection_indices, maxlen): + """ + Create efficient selection indices from a previous selection. + Try to keep slices as slices. 
+ Args: + source_indices: 1D sequence, slice or Ellipsis + selection_indices: 1D sequence or slice + maxlen: maximum length of the sequence + """ + if source_indices is ...: + return selection_indices + elif isinstance(source_indices, slice): + r = range(*source_indices.indices(maxlen))[selection_indices] + return slice(r.start, r.stop, r.step) + else: + return source_indices[selection_indices] + + def assure_domain_conversion_sparsity(target, source): """ Assure that the table obeys the domain conversion's suggestions about sparsity. diff --git a/Orange/tests/test_table.py b/Orange/tests/test_table.py index da9e28a4d80..f352760f4fe 100644 --- a/Orange/tests/test_table.py +++ b/Orange/tests/test_table.py @@ -1819,6 +1819,36 @@ def test_creates_table_with_given_domain_and_row_filter(self): self.assert_table_with_filter_matches( new_table, self.table[:0], xcols=order[:a], ycols=order[a:a+c], mcols=order[a+c:]) + def test_from_table_with_boolean_row_filter(self): + a, c, m = column_sizes(self.table) + domain = self.table.domain + + sel = [False]*len(self.table) + sel[2] = True + + with patch.object(Table, "from_table_rows", wraps=Table.from_table_rows) \ + as from_table_rows: + new_table = Table.from_table(self.table.domain, self.table, row_indices=sel) + from_table_rows.assert_called() + self.assert_table_with_filter_matches( + new_table, self.table[2:3]) + + new_domain1 = Domain(domain.attributes[:1], domain.class_vars[:1], domain.metas[:1]) + with patch.object(Table, "from_table_rows", wraps=Table.from_table_rows) \ + as from_table_rows: + new_table = Table.from_table(new_domain1, self.table, row_indices=sel) + from_table_rows.assert_not_called() + self.assert_table_with_filter_matches( + new_table, self.table[2:3], + xcols=[0], ycols=[a], mcols=[a+c+m-1]) + + new_domain2 = Domain(domain.attributes[:1] + (ContinuousVariable("new"),), + domain.class_vars[:1], domain.metas[:1]) + new_table = Table.from_table(new_domain2, self.table, row_indices=sel) + 
self.assert_table_with_filter_matches( + new_table.transform(new_domain1), self.table[2:3], + xcols=[0], ycols=[a], mcols=[a+c+m-1]) + def test_from_table_sparse_move_some_to_empty_metas(self): iris = data.Table("iris").to_sparse() new_domain = data.domain.Domain( @@ -2053,7 +2083,6 @@ def test_can_select_a_subset_of_rows_and_columns(self): np.testing.assert_almost_equal(table.metas, self.table.metas[r, metas]) - def test_optimize_indices(self): # ordinary conversion self.assertEqual(_optimize_indices([1, 2, 3], 4), slice(1, 4, 1)) @@ -2064,8 +2093,14 @@ def test_optimize_indices(self): np.testing.assert_equal(_optimize_indices([1, 2, 4], 5), [1, 2, 4]) np.testing.assert_equal(_optimize_indices((1, 2, 4), 5), [1, 2, 4]) - # leave boolean arrays - np.testing.assert_equal(_optimize_indices([True, False, True], 3), [True, False, True]) + # internally convert boolean arrays into indices + np.testing.assert_equal(_optimize_indices([False, False, False, False], 4), []) + np.testing.assert_equal(_optimize_indices([True, False, True, True], 4), [0, 2, 3]) + np.testing.assert_equal(_optimize_indices([True, False, True], 3), slice(0, 4, 2)) + with self.assertRaises(IndexError): + _optimize_indices([True, False, True], 2) + with self.assertRaises(IndexError): + _optimize_indices([True, False, True], 4) # do not convert if step is negative np.testing.assert_equal(_optimize_indices([4, 2, 0], 5), [4, 2, 0])