Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 28 additions & 23 deletions Orange/preprocess/impute.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import numpy
from scipy.sparse import issparse
import numpy as np
import scipy.sparse as sp

import Orange.data
from Orange.statistics import distribution, basic_stats
Expand All @@ -26,11 +26,11 @@ def __init__(self, variable, value=0):
self.value = value

def transform(self, c):
if issparse(c):
c.data = numpy.where(numpy.isnan(c.data), self.value, c.data)
if sp.issparse(c):
c.data = np.where(np.isnan(c.data), self.value, c.data)
return c
else:
return numpy.where(numpy.isnan(c), self.value, c)
return np.where(np.isnan(c), self.value, c)


class BaseImputeMethod(Reprable):
Expand Down Expand Up @@ -83,7 +83,7 @@ class DropInstances(BaseImputeMethod):

def __call__(self, data, variable):
index = data.domain.index(variable)
return numpy.isnan(data[:, index]).reshape(-1)
return np.isnan(data[:, index]).reshape(-1)


class Average(BaseImputeMethod):
Expand Down Expand Up @@ -154,13 +154,13 @@ def __init__(self, variable, model):

def __call__(self, data):
if isinstance(data, Orange.data.Instance):
column = numpy.array([float(data[self.variable])])
column = np.array([float(data[self.variable])])
else:
column = numpy.array(data.get_column_view(self.variable)[0],
column = np.array(data.get_column_view(self.variable)[0],
copy=True)

mask = numpy.isnan(column)
if not numpy.any(mask):
mask = np.isnan(column)
if not np.any(mask):
return column

if isinstance(data, Orange.data.Instance):
Expand Down Expand Up @@ -224,7 +224,9 @@ def domain_with_class_var(domain, class_var):

class IsDefined(Transformation):
def transform(self, c):
return ~numpy.isnan(c)
if sp.issparse(c):
c = c.toarray()
return ~np.isnan(c)


class AsValue(BaseImputeMethod):
Expand All @@ -243,7 +245,7 @@ def __call__(self, data, variable):
base_value=variable.base_value,
compute_value=Lookup(
variable,
numpy.arange(len(variable.values), dtype=int),
np.arange(len(variable.values), dtype=int),
unknown=len(variable.values))
)
return var
Expand Down Expand Up @@ -281,29 +283,32 @@ def __init__(self, variable, distribution):
self.distribution = distribution

if variable.is_discrete:
counts = numpy.array(distribution)
counts = np.array(distribution)
elif variable.is_continuous:
counts = numpy.array(distribution)[1, :]
counts = np.array(distribution)[1, :]
else:
raise TypeError("Only discrete and continuous "
"variables are supported")
csum = numpy.sum(counts)
csum = np.sum(counts)
if csum > 0:
self.sample_prob = counts / csum
else:
self.sample_prob = numpy.ones_like(counts) / len(counts)
self.sample_prob = np.ones_like(counts) / len(counts)

def transform(self, c):
c = numpy.array(c, copy=True)
nanindices = numpy.flatnonzero(numpy.isnan(c))
if not sp.issparse(c):
c = np.array(c, copy=True)
else:
c = c.toarray().ravel()
nanindices = np.flatnonzero(np.isnan(c))

if self.variable.is_discrete:
sample = numpy.random.choice(
sample = np.random.choice(
len(self.variable.values), size=len(nanindices),
replace=True, p=self.sample_prob)
else:
sample = numpy.random.choice(
numpy.asarray(self.distribution)[0, :], size=len(nanindices),
sample = np.random.choice(
np.asarray(self.distribution)[0, :], size=len(nanindices),
replace=True, p=self.sample_prob)

c[nanindices] = sample
Expand All @@ -328,9 +333,9 @@ def __call__(self, data, variable):
raise ValueError("'{}' has an unknown distribution"
.format(variable))

if variable.is_discrete and numpy.sum(dist) == 0:
if variable.is_discrete and np.sum(dist) == 0:
dist += 1 / len(dist)
elif variable.is_continuous and numpy.sum(dist[1, :]) == 0:
elif variable.is_continuous and np.sum(dist[1, :]) == 0:
dist[1, :] += 1 / dist.shape[1]
return variable.copy(
compute_value=ReplaceUnknownsRandom(variable, dist))
27 changes: 17 additions & 10 deletions Orange/statistics/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,9 +216,9 @@ def weighted_mean():
non_zero = np.bincount(X.nonzero()[1], minlength=X.shape[1])
X = X.tocsc()
return np.column_stack((
X.min(axis=0).toarray().ravel(),
X.max(axis=0).toarray().ravel(),
np.asarray(X.mean(axis=0)).ravel() if not weighted else weighted_mean(),
nanmin(X, axis=0),
nanmax(X, axis=0),
nanmean(X, axis=0) if not weighted else weighted_mean(),
np.zeros(X.shape[1]), # variance not supported
X.shape[0] - non_zero,
non_zero))
Expand Down Expand Up @@ -280,15 +280,22 @@ def mean(x):
n_values = np.prod(x.shape)
return np.sum(x.data) / n_values


def nanmean(x):
def nanmean(x, axis=None):
""" Equivalent of np.nanmean that supports sparse or dense matrices. """
if not sp.issparse(x):
return np.nanmean(x)

n_values = np.prod(x.shape) - np.sum(np.isnan(x.data))
return np.nansum(x.data) / n_values
def nanmean_sparse(x):
n_values = np.prod(x.shape) - np.sum(np.isnan(x.data))
return np.nansum(x.data) / n_values

if not sp.issparse(x):
return np.nanmean(x, axis=axis)
if axis is None:
return nanmean_sparse(x)
if axis in [0, 1]:
arr = x if axis == 1 else x.T
arr = arr.tocsr()
return np.array([nanmean_sparse(row) for row in arr])
else:
raise NotImplementedError

def unique(x, return_counts=False):
""" Equivalent of np.unique that supports sparse or dense matrices. """
Expand Down
22 changes: 20 additions & 2 deletions Orange/tests/test_impute.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ def test_str(self):


class TestAsValue(unittest.TestCase):
def test_replacement(self):
def _create_table(self):
nan = np.nan
X = [
[1.0, nan, 0.0],
Expand All @@ -170,7 +170,11 @@ def test_replacement(self):
data.ContinuousVariable("B"),
data.ContinuousVariable("C"))
)
table = data.Table.from_numpy(domain, np.array(X))
return data.Table.from_numpy(domain, np.array(X))

def test_replacement(self):
table = self._create_table()
domain = table.domain

v1 = impute.AsValue()(table, domain[0])
self.assertTrue(np.all(np.isfinite(v1.compute_value(table))))
Expand Down Expand Up @@ -200,6 +204,20 @@ def test_replacement(self):
[3, 1.0, 0, 1.5, 0]]
)

def test_sparse(self):
"""
Impute: As a distinct value test. Sparse support.
GH-2357
"""
table = self._create_table()
domain = table.domain
table.X = sp.csr_matrix(table.X)

v1, v2 = impute.AsValue()(table, domain[1])
self.assertTrue(np.all(np.isfinite(v2.compute_value(table))))
self.assertEqual([v2.str_val(v) for v in v2.compute_value(table)],
["undef", "def", "undef"])


class TestModel(unittest.TestCase):
def test_replacement(self):
Expand Down
11 changes: 11 additions & 0 deletions Orange/tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@

from Orange.util import export_globals, flatten, deprecated, try_, deepgetattr, \
OrangeDeprecationWarning
from Orange.data import Table
from Orange.data.util import vstack, hstack
from Orange.statistics.util import stats

SOMETHING = 0xf00babe

Expand Down Expand Up @@ -106,3 +108,12 @@ def assertCorrectArrayType(self, array, shape, sparsity):
def test_raise_deprecations(self):
with self.assertRaises(OrangeDeprecationWarning):
warnings.warn('foo', OrangeDeprecationWarning)

def test_stats_sparse(self):
"""
Stats should not fail when trying to calculate mean on sparse data.
GH-2357
"""
data = Table("iris")
sparse_x = sp.csr_matrix(data.X)
self.assertTrue(stats(data.X).all() == stats(sparse_x).all())
5 changes: 5 additions & 0 deletions Orange/widgets/data/owimpute.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ class OWImpute(OWWidget):

class Error(OWWidget.Error):
imputation_failed = Msg("Imputation failed for '{}'")
model_based_imputer_sparse = Msg("Model based imputer does not work for sparse data")

DEFAULT_LEARNER = SimpleTreeLearner()
METHODS = [AsDefault(), impute.DoNotImpute(), impute.Average(),
Expand Down Expand Up @@ -258,9 +259,13 @@ def commit(self):

self.warning()
self.Error.imputation_failed.clear()
self.Error.model_based_imputer_sparse.clear()
with self.progressBar(len(self.varmodel)) as progress:
for i, var in enumerate(self.varmodel):
method = self.variable_methods.get(i, self.default_method)
if isinstance(method, impute.Model) and data.is_sparse():
self.Error.model_based_imputer_sparse()
continue

try:
if not method.supports_variable(var):
Expand Down