Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
561bcd2
distance: Implement Euclidean, Manhattan and Jaccard distances
janezd Oct 7, 2016
544f6e1
distances: Add Excel file with computation of distances for tests
janezd Jul 7, 2017
426c108
distance: Implement Cosine distance
janezd Jul 10, 2017
6ae811f
distances: Fix blunders
janezd Jul 13, 2017
a45ad22
distances: Remove inapplicable old tests
janezd Jul 13, 2017
ef9df3a
distances: Improve fallbacks to skl distances
janezd Jul 13, 2017
cada2ee
distances: Adapt OWDistances to new distance classes
janezd Jul 13, 2017
adbb988
distances: Refactoring of Pearson and Spearman
janezd Jul 13, 2017
4534c14
distances: Clip cosine distances to (0, 1)
janezd Jul 13, 2017
09aca05
distances: Refactor checks in OWDistances
janezd Jul 13, 2017
bee0112
util: fix nanmean and nanvar
janezd Jul 13, 2017
00dde91
manifold.py: update to use new distances
janezd Jul 13, 2017
2f0ffce
distances: Add nogil where possible
janezd Jul 13, 2017
2d18d70
distances: Fix tests for OWDistance
janezd Jul 13, 2017
e32fb0d
distances: Make DistanceModel.axis a read-only property
janezd Jul 14, 2017
4342931
distances: Move attribute to the base class
janezd Jul 14, 2017
868e733
OWSilhouettePlot: Remove code needed for old distances
janezd Jul 15, 2017
689b406
distances: Update documentation
janezd Jul 15, 2017
5c0d074
distances: Speed up Euclidean distance
janezd Jul 15, 2017
50d7757
distances: Speed up all distances
janezd Jul 20, 2017
8072f27
distances: Lint
janezd Jul 20, 2017
415be18
distances: Compatibility with numpy 1.13
janezd Jul 20, 2017
13b7b7a
distances: docstrings
janezd Jul 20, 2017
62a5072
distances: Refactor into separate files
janezd Jul 21, 2017
890c5e1
distances: Reformat old test_distances.py
janezd Jul 22, 2017
5969543
Distances: Add distances/tests/__init__.py
janezd Aug 18, 2017
13ede45
distances: Convert lower to symetric only once
janezd Aug 25, 2017
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
265 changes: 5 additions & 260 deletions Orange/distance/__init__.py
Original file line number Diff line number Diff line change
@@ -1,261 +1,6 @@
import numpy as np
from scipy import stats
import sklearn.metrics as skl_metrics
from .distance import (Distance, DistanceModel,
Euclidean, Manhattan, Cosine, Jaccard,
SpearmanR, SpearmanRAbsolute, PearsonR, PearsonRAbsolute,
Mahalanobis, MahalanobisDistance)

from Orange import data
from Orange.misc import DistMatrix
from Orange.preprocess import SklImpute

# Names exported by a star-import of this package.
__all__ = ['Euclidean', 'Manhattan', 'Cosine', 'Jaccard', 'SpearmanR',
'SpearmanRAbsolute', 'PearsonR', 'PearsonRAbsolute', 'Mahalanobis',
'MahalanobisDistance']


def _preprocess(table):
"""Remove categorical attributes and impute missing values."""
if not len(table):
return table
new_domain = data.Domain(
[a for a in table.domain.attributes if a.is_continuous],
table.domain.class_vars,
table.domain.metas)
new_data = table.transform(new_domain)
new_data = SklImpute()(new_data)
return new_data


def _orange_to_numpy(x):
    """Return the array behind an Orange object or a numpy array.

    * :class:`Orange.data.Table` -> its ``X`` attribute, unchanged;
    * :class:`Orange.data.Instance` -> its ``x`` attribute, made at least 2-D;
    * :class:`numpy.ndarray` -> the array itself, made at least 2-D;
    * anything else (e.g. ``None``) -> returned untouched.
    """
    if isinstance(x, data.Table):
        return x.X
    if isinstance(x, data.Instance):
        return np.atleast_2d(x.x)
    if isinstance(x, np.ndarray):
        return np.atleast_2d(x)
    # Unknown inputs, such as None for a missing second argument, pass through.
    return x


class Distance:
    """Abstract base of the distance callables defined in this module."""

    def __call__(self, e1, e2=None, axis=1, impute=False):
        """
        Compute a distance matrix; subclasses must override this method.

        :param e1: input data instances; distances are calculated between
            all pairs
        :type e1: :class:`Orange.data.Table` or
            :class:`Orange.data.RowInstance` or :class:`numpy.ndarray`
        :param e2: optional second set of data instances; if given,
            distances are calculated between each pair whose first item is
            from e1 and second item is from e2
        :type e2: :class:`Orange.data.Table` or
            :class:`Orange.data.RowInstance` or :class:`numpy.ndarray`
        :param axis: 1 to calculate distances between rows, 0 to calculate
            distances between columns
        :type axis: int
        :param impute: if True, all NaN values in the matrix are replaced
            with 0
        :type impute: bool
        :return: the matrix with distances between the given examples
        :rtype: :class:`Orange.misc.distmatrix.DistMatrix`
        :raises NotImplementedError: always, in this base class
        """
        message = (
            'Distance is an abstract class and should not be used directly.')
        raise NotImplementedError(message)


class SklDistance(Distance):
    """Generic scikit-learn distance."""

    def __init__(self, metric, name, supports_sparse):
        """
        Args:
            metric: The metric passed to scikit-learn's
                ``pairwise_distances`` for distance calculation.
            name (str): Name of the distance.
            supports_sparse (boolean): Whether this metric works on sparse
                data or not.
        """
        self.metric = metric
        self.name = name
        self.supports_sparse = supports_sparse

    def __call__(self, e1, e2=None, axis=1, impute=False):
        """
        Compute pairwise distances with the configured scikit-learn metric.

        Args:
            e1: Table, RowInstance or numpy array with the first set of
                items.
            e2: Optional second set of items; when None, only ``e1`` is
                passed on to scikit-learn.
            axis (int): If 0, the inputs are transposed before the
                computation, so distances are between columns; otherwise
                they are between rows.
            impute (bool): If True, NaN values in the resulting matrix are
                replaced with 0.

        Returns:
            DistMatrix: built with ``e1``, ``e2`` and ``axis`` when ``e1`` is
            a Table or a RowInstance, and from the bare matrix otherwise.
        """
        x1 = _orange_to_numpy(e1)
        x2 = _orange_to_numpy(e2)
        if axis == 0:
            x1 = x1.T
            if x2 is not None:
                x2 = x2.T
        dist = skl_metrics.pairwise.pairwise_distances(
            x1, x2, metric=self.metric)
        # Honour `impute` the same way the other distances in this module
        # do; previously the argument was accepted but silently ignored.
        if impute and np.isnan(dist).any():
            dist = np.nan_to_num(dist)
        if isinstance(e1, (data.Table, data.RowInstance)):
            return DistMatrix(dist, e1, e2, axis)
        return DistMatrix(dist)

# Ready-made scikit-learn based distances.  The arguments are the
# scikit-learn metric, the distance's name and the supports_sparse flag.
Euclidean = SklDistance('euclidean', 'Euclidean', True)
Manhattan = SklDistance('manhattan', 'Manhattan', True)
Cosine = SklDistance('cosine', 'Cosine', True)
Jaccard = SklDistance('jaccard', 'Jaccard', False)


class SpearmanDistance(Distance):
    """Distance based on Spearman's rank correlation coefficient."""

    def __init__(self, absolute, name):
        """
        Constructor for Spearman's and Absolute Spearman's distances.

        Args:
            absolute (boolean): Whether to use the absolute value of the
                correlation or not.
            name (str): Name of the distance
        """
        self.absolute = absolute
        self.name = name
        self.supports_sparse = False

    def __call__(self, e1, e2=None, axis=1, impute=False):
        """
        Compute distances as ``(1 - rho) / 2``, or ``(1 - |rho|) / 2`` when
        ``self.absolute`` is set, where ``rho`` is Spearman's correlation.

        Args:
            e1: Table, RowInstance or numpy array with the first set of
                items.
            e2: Optional second set of items; when None, ``e1`` is compared
                with itself.
            axis (int): 1 to compare rows, 0 to compare columns.
            impute (bool): If True, NaN correlations are replaced with 0
                before they are converted to distances.

        Returns:
            DistMatrix: built with ``e1``, ``e2`` and ``axis`` when ``e1`` is
            a Table or a RowInstance, and from the bare matrix otherwise.
        """
        x1 = _orange_to_numpy(e1)
        x2 = _orange_to_numpy(e2)
        if x2 is None:
            x2 = x1
        # Number of compared items that come from x1; used below to cut the
        # x1-versus-x2 part out of the correlation matrix.
        slc = len(x1) if axis == 1 else x1.shape[1]
        rho, _ = stats.spearmanr(x1, x2, axis=axis)
        if np.isnan(rho).any() and impute:
            rho = np.nan_to_num(rho)
        if self.absolute:
            dist = (1. - np.abs(rho)) / 2.
        else:
            dist = (1. - rho) / 2.
        if np.ndim(dist) == 0:
            # A single correlation comes back as a scalar; wrap it into a
            # 1x1 matrix.  (The former `np.float` check no longer works:
            # that alias has been removed from NumPy.)
            dist = np.array([[dist]])
        elif isinstance(dist, np.ndarray):
            # NOTE(review): relies on spearmanr returning the correlations
            # of the x1 items followed by the x2 items -- see scipy docs.
            dist = dist[:slc, slc:]
        if isinstance(e1, (data.Table, data.RowInstance)):
            return DistMatrix(dist, e1, e2, axis)
        return DistMatrix(dist)

# Ready-made Spearman distances: signed and absolute-value variants.
SpearmanR = SpearmanDistance(absolute=False, name='Spearman')
SpearmanRAbsolute = SpearmanDistance(absolute=True, name='Spearman absolute')


class PearsonDistance(Distance):
    """Distance based on Pearson's correlation coefficient."""

    def __init__(self, absolute, name):
        """
        Constructor for Pearson's and Absolute Pearson's distances.

        Args:
            absolute (boolean): Whether to use the absolute value of the
                correlation or not.
            name (str): Name of the distance
        """
        self.absolute = absolute
        self.name = name
        self.supports_sparse = False

    def __call__(self, e1, e2=None, axis=1, impute=False):
        """
        Compute distances as ``(1 - rho) / 2``, or ``(1 - |rho|) / 2`` when
        ``self.absolute`` is set, where ``rho`` is Pearson's correlation of
        each pair of items.

        Args:
            e1: Table, RowInstance or numpy array with the first set of
                items.
            e2: Optional second set of items; when None, ``e1`` is compared
                with itself.
            axis (int): 0 to compare columns (inputs are transposed);
                otherwise rows are compared.
            impute (bool): If True, NaN correlations are replaced with 0
                before they are converted to distances.

        Returns:
            DistMatrix: built with ``e1``, ``e2`` and ``axis`` when ``e1`` is
            a Table or a RowInstance, and from the bare matrix otherwise.
        """
        first = _orange_to_numpy(e1)
        second = _orange_to_numpy(e2)
        if second is None:
            second = first
        if axis == 0:
            first, second = first.T, second.T
        # One pearsonr call per (item of first, item of second) pair.
        rho = np.array(
            [[stats.pearsonr(left, right)[0] for right in second]
             for left in first])
        if np.isnan(rho).any() and impute:
            rho = np.nan_to_num(rho)
        strength = np.abs(rho) if self.absolute else rho
        dist = (1. - strength) / 2.
        if isinstance(e1, (data.Table, data.RowInstance)):
            return DistMatrix(dist, e1, e2, axis)
        return DistMatrix(dist)

# Ready-made Pearson distances: signed and absolute-value variants.
PearsonR = PearsonDistance(absolute=False, name='Pearson')
PearsonRAbsolute = PearsonDistance(absolute=True, name='Pearson absolute')


class MahalanobisDistance(Distance):
    """Mahalanobis distance."""

    def __init__(self, data=None, axis=1, name='Mahalanobis'):
        """
        Args:
            data: Optional dataset; when given, the instance is fitted
                immediately by calling ``fit(data, axis)``.
            axis: Passed to ``fit`` together with ``data``.
            name (str): Name of the distance.
        """
        self.name = name
        self.supports_sparse = False
        self.axis = None
        self.VI = None
        if data is not None:
            self.fit(data, axis)

    def fit(self, data, axis=1):
        """
        Compute the covariance matrix needed for calculating distances.

        Stores ``axis`` in ``self.axis`` and the inverse covariance matrix
        in ``self.VI``.

        Args:
            data: The dataset used for calculating covariances.
            axis: If axis=1 we calculate distances between rows, if axis=0 we
                calculate distances between columns.

        Raises:
            MemoryError: If computing the covariance matrix fails.
            ValueError: If inverting the covariance matrix fails.
        """
        x = _orange_to_numpy(data)
        if axis == 0:
            x = x.T
        self.axis = axis
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit propagate, and chain the original error for debugging.
        try:
            c = np.cov(x.T)
        except Exception as err:
            raise MemoryError("Covariance matrix is too large.") from err
        try:
            self.VI = np.linalg.inv(c)
        except Exception as err:
            raise ValueError(
                "Computation of inverse covariance matrix failed.") from err

    def __call__(self, e1, e2=None, axis=None, impute=False):
        """
        Compute Mahalanobis distances using the fitted inverse covariance.

        Args:
            e1: Table, RowInstance or numpy array with the first set of
                items.
            e2: Optional second set of items; when None, only ``e1`` is
                passed on to scikit-learn.
            axis: Optional; when given, it must equal the axis used in
                ``fit``.
            impute (bool): If True, NaN values in the resulting matrix are
                replaced with 0.

        Returns:
            DistMatrix: built with ``e1``, ``e2`` and ``self.axis`` when
            ``e1`` is a Table or a RowInstance, and from the bare matrix
            otherwise.

        Raises:
            AssertionError: If the instance was not fitted, or ``axis``
                differs from the fitted axis.
            ValueError: If the number of features does not match the fitted
                covariance matrix.
        """
        assert self.VI is not None, \
            "Mahalanobis distance must be initialized with the fit() method."

        x1 = _orange_to_numpy(e1)
        x2 = _orange_to_numpy(e2)

        if axis is not None:
            assert axis == self.axis, \
                "Axis must match its value at initialization."
        if self.axis == 0:
            x1 = x1.T
            if x2 is not None:
                x2 = x2.T
        n_features = self.VI.shape[0]
        if x1.shape[1] != n_features or \
                (x2 is not None and x2.shape[1] != n_features):
            raise ValueError('Incorrect number of features.')

        dist = skl_metrics.pairwise.pairwise_distances(
            x1, x2, metric='mahalanobis', VI=self.VI)
        if np.isnan(dist).any() and impute:
            dist = np.nan_to_num(dist)
        if isinstance(e1, (data.Table, data.RowInstance)):
            return DistMatrix(dist, e1, e2, self.axis)
        return DistMatrix(dist)


# Only retain this to raise errors on use. Remove in some future version.
class __MahalanobisDistanceError(MahalanobisDistance):
    """MahalanobisDistance variant whose ``fit`` and call always fail."""

    def _raise_error(self, *args, **kwargs):
        """Raise RuntimeError describing the instance-based usage."""
        message = (
            "Invalid use of MahalanobisDistance.\n"
            "Create a new MahalanobisDistance instance first, e.g.\n"
            ">>> metric = MahalanobisDistance(data)\n"
            ">>> dist = metric(data)"
        )
        raise RuntimeError(message)

    # Both public entry points are routed to the error above.
    fit = __call__ = _raise_error
# Instance exported as `Mahalanobis`; calling it or its fit() raises
# RuntimeError, because both are bound to _raise_error in its class.
Mahalanobis = __MahalanobisDistanceError()
from .base import _preprocess, remove_discrete_features, impute
Loading