Skip to content

Commit 99618cd

Browse files
victorprincipeagoscinski
authored and committed
Added Directional Convex Hull functionality.
* implements the class DirectionalConvexHull in the sample selection submodule * implements tests for the class DirectionalConvexHull.
1 parent 5ba4dcd commit 99618cd

File tree

3 files changed

+433
-1
lines changed

3 files changed

+433
-1
lines changed

skcosmo/sample_selection/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,10 @@
66
from ._base import (
77
CUR,
88
FPS,
9+
DirectionalConvexHull,
910
PCovCUR,
1011
PCovFPS,
1112
)
1213
from ._voronoi_fps import VoronoiFPS
1314

14-
__all__ = ["PCovFPS", "PCovCUR", "FPS", "CUR", "VoronoiFPS"]
15+
__all__ = ["PCovFPS", "PCovCUR", "FPS", "CUR", "DirectionalConvexHull", "VoronoiFPS"]

skcosmo/sample_selection/_base.py

Lines changed: 280 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,21 @@
22
Sequential sample selection
33
"""
44

5+
import warnings
6+
7+
import numpy as np
8+
from scipy.interpolate import (
9+
LinearNDInterpolator,
10+
interp1d,
11+
)
12+
from scipy.interpolate.interpnd import _ndim_coords_from_arrays
13+
from scipy.spatial import ConvexHull
14+
from sklearn.utils.validation import (
15+
check_array,
16+
check_is_fitted,
17+
check_X_y,
18+
)
19+
520
from .._selection import (
621
_CUR,
722
_FPS,
@@ -10,6 +25,42 @@
1025
)
1126

1227

28+
def _linear_interpolator(points, values):
29+
"""
30+
Returns linear interpolater for unstructured D-D data. Tessellate the input point set to N-D simplices, and interpolate linearly on each simplex. See `LinearNDInterpolator` for more details.
31+
32+
points : 2-D ndarray of floats with shape (n, D), or length D tuple of 1-D ndarrays with shape (n,).
33+
Data point coordinates.
34+
values : ndarray of float or complex, shape (n,)
35+
Data values.
36+
37+
38+
Reference:
39+
---------
40+
The code is an adapted excerpt from
41+
https://github.com/scipy/scipy/blob/dde50595862a4f9cede24b5d1c86935c30f1f88a/scipy/interpolate/_ndgriddata.py#L119-L273
42+
"""
43+
44+
points = _ndim_coords_from_arrays(points)
45+
46+
if points.ndim < 2:
47+
ndim = points.ndim
48+
else:
49+
ndim = points.shape[-1]
50+
51+
if ndim == 1:
52+
points = points.ravel()
53+
# Sort points/values together, necessary as input for interp1d
54+
idx = np.argsort(points)
55+
points = points[idx]
56+
values = values[idx]
57+
return interp1d(
58+
points, values, kind="linear", axis=0, bounds_error=False, fill_value=np.nan
59+
)
60+
else:
61+
return LinearNDInterpolator(points, values, fill_value=np.nan, rescale=False)
62+
63+
1364
class FPS(_FPS):
1465
"""
1566
Transformer that performs Greedy Sample Selection using Farthest Point Sampling.
@@ -346,3 +397,232 @@ def __init__(
346397
full=full,
347398
random_state=random_state,
348399
)
400+
401+
402+
class DirectionalConvexHull:
    """
    Performs Sample Selection by constructing a Directional Convex Hull and
    determining the distance to the hull as outlined in the reference.

    Parameters
    ----------
    low_dim_idx : list of ints, default None
        Indices of columns of X containing features to be used for the
        directional convex hull construction (also known as the low-
        dimensional (LD) hull). By default [0] is used. Negative indices
        count from the last column.

    Attributes
    ----------
    high_dim_idx_ : numpy.ndarray of ints
        Indices of columns in data containing high-dimensional
        features (i.e. those not used for the convex hull
        construction).

    selected_idx_ : numpy.ndarray
        Indices of datapoints that are vertices of the hull facets whose
        normal has a negative component along the target axis.

    interpolator_high_dim_ : scipy interpolator
        Interpolator, as returned by `_linear_interpolator`, for the
        features in the high-dimensional space.

    interpolator_y_ : scipy interpolator
        Interpolator, as returned by `_linear_interpolator`, for the
        targets y.

    n_features_in_ : int
        Number of columns of X seen during `fit`.

    References
    ----------
    .. [dch] A. Anelli, E. A. Engel, C. J. Pickard and M. Ceriotti,
             Physical Review Materials, 2018.
    """

    def __init__(self, low_dim_idx=None):
        self.low_dim_idx = [0] if low_dim_idx is None else low_dim_idx

    def _normalized_low_dim_idx(self, n_features):
        """
        Validate ``low_dim_idx`` against ``n_features`` columns.

        Parameters
        ----------
        n_features : int
            Number of columns of the feature matrix.

        Returns
        -------
        idx : numpy.ndarray
            The low-dimensional column indices with negative entries mapped
            to their non-negative equivalents.

        Raises
        ------
        ValueError
            If ``low_dim_idx`` is empty or any entry lies outside
            ``[-n_features, n_features)``.
        """
        idx = np.atleast_1d(np.asarray(self.low_dim_idx))
        if idx.size == 0:
            raise ValueError("low_dim_idx must contain at least one column index.")
        if np.any((idx < -n_features) | (idx >= n_features)):
            raise ValueError(
                "One or more columns indexed with low_dim_idx is"
                " out of bounds with the dimensions of X."
            )
        return np.where(idx < 0, idx + n_features, idx)

    @staticmethod
    def _warn_if_outside_hull(interpolated):
        """Emit a UserWarning when the interpolation produced any NaN."""
        if np.any(np.isnan(interpolated)):
            # stacklevel=2 attributes the warning to the scoring method that
            # called this helper rather than to the helper itself.
            warnings.warn(
                "There are samples in X with a low-dimensional part that is outside of the range of the convex surface. Distance will contain nans.",
                UserWarning,
                stacklevel=2,
            )

    def fit(self, X, y):
        """
        Learn the samples that form the convex hull.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Feature matrix of samples to use for constructing the convex
            hull.
        y : ndarray of shape (n_samples,)
            Target values (property on which the convex hull should be
            constructed, e.g. Gibbs free energy)

        Returns
        -------
        self : object
            Fitted scorer.

        Raises
        ------
        ValueError
            If ``low_dim_idx`` is empty or refers to a column that does not
            exist in X.
        """
        X, y = self._check_X_y(X, y)
        self.n_features_in_ = X.shape[1]

        if len(y.shape) == 1:
            y = y.reshape((len(y), 1))

        # Resolve negative indices first; otherwise setdiff1d would compare
        # raw values and keep a column selected as e.g. -1 in the
        # high-dimensional set as well.
        low_dim_cols = self._normalized_low_dim_idx(X.shape[1])
        self.high_dim_idx_ = np.setdiff1d(np.arange(X.shape[1]), low_dim_cols)

        # number of dimensions of the (lower dimensional) hull construction
        n_low_dim_idx = len(self.low_dim_idx)

        # stack the target (column 0) and the low-dimensional features
        # (remaining columns) into one matrix for the hull construction
        convex_hull_data = np.zeros((X.shape[0], n_low_dim_idx + 1))
        convex_hull_data[:, :1] = y
        convex_hull_data[:, 1:] = X[:, self.low_dim_idx]

        # features that do not take part in the hull construction
        high_dim_feats = X[:, self.high_dim_idx_]

        convex_hull = ConvexHull(convex_hull_data)
        # component of each facet normal along the target axis
        y_normal = convex_hull.equations[:, 0]

        # keep only the vertices of facets whose normal points towards
        # negative y
        self.selected_idx_ = np.unique(convex_hull.simplices[y_normal < 0].ravel())

        hull_low_dim = convex_hull_data[self.selected_idx_, 1:]

        # required for the score_feature_matrix function
        self.interpolator_high_dim_ = _linear_interpolator(
            points=hull_low_dim,
            values=high_dim_feats[self.selected_idx_],
        )

        # required to compute the distance of the low-dimensional feature to
        # the convex hull
        self.interpolator_y_ = _linear_interpolator(
            points=hull_low_dim,
            values=convex_hull_data[self.selected_idx_, 0],
        )

        return self

    def _check_X_y(self, X, y):
        """Validate X and y with sklearn, requiring at least two features."""
        return check_X_y(X, y, ensure_min_features=2, multi_output=False)

    def _check_is_fitted(self, X):
        """
        Check that `fit` has been called and that X has the fitted width.

        Raises
        ------
        ValueError
            If the number of columns of X differs from ``n_features_in_``.
        """
        check_is_fitted(
            self,
            [
                "high_dim_idx_",
                "interpolator_high_dim_",
                "interpolator_y_",
                "selected_idx_",
            ],
        )
        n_features = X.shape[1]
        if n_features != self.n_features_in_:
            raise ValueError(
                f"X has {n_features} features, but {self.__class__.__name__} "
                f"is expecting {self.n_features_in_} features as input."
            )
        return True

    def score_samples(self, X, y):
        """
        Calculate the distance of the samples to the convex hull in the target
        direction y. Samples with a distance > 0 lie above the convex surface.
        Samples with a distance of zero lie on the convex surface. Samples with
        a distance value < 0 lie below the convex surface.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Feature matrix of samples to use for determining distance
            to the convex hull. Please note that samples provided should
            have the same dimensions (features) as used during fitting
            of the convex hull. The same column indices will be used for
            the low- and high-dimensional features.

        y : ndarray of shape (n_samples,)
            Target values (property on which the convex hull should be
            constructed, e.g. Gibbs free energy)

        Returns
        -------
        dch_distance : numpy.array of shape (n_samples,)
            The difference between y and the hull surface interpolated at
            the low-dimensional features of each sample. Entries are NaN
            for samples that could not be interpolated.
        """
        X, y = self._check_X_y(X, y)
        self._check_is_fitted(X)

        # features used for the convex hull construction
        low_dim_feats = X[:, self.low_dim_idx]

        # the X points projected on the convex surface
        interpolated_y = self.interpolator_y_(low_dim_feats).reshape(y.shape)
        self._warn_if_outside_hull(interpolated_y)

        return y - interpolated_y

    def score_feature_matrix(self, X):
        """
        Calculate the distance (or more specifically, the residuals) of the
        samples to the convex hull in the high-dimensional space. Samples
        with a distance value of zero in all the higher dimensions lie on
        the convex hull.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Feature matrix of samples to use for determining distance
            to the convex hull. Please note that samples provided should
            have the same dimensions (features) as used during fitting
            of the convex hull. The same column indices will be used for
            the low- and high-dimensional features.

        Returns
        -------
        dch_distance : numpy.array of shape (n_samples, len(high_dim_idx_))
            The distance (residuals) of samples to the convex hull in
            the higher-dimensional space.
        """
        X = check_array(X)
        self._check_is_fitted(X)

        # features used for the convex hull construction
        low_dim_feats = X[:, self.low_dim_idx]
        # HD features not used for the convex hull
        high_dim_feats = X[:, self.high_dim_idx_]

        if len(self.low_dim_idx) == 1:
            # flatten the single coordinate column so the interpolated result
            # keeps one row per sample
            low_dim_feats = low_dim_feats.reshape(-1)

        interpolated_high_dim_feats = self.interpolator_high_dim_(low_dim_feats)
        self._warn_if_outside_hull(interpolated_high_dim_feats)

        # residual between the original and the interpolated high-dimensional
        # features
        return high_dim_feats - interpolated_high_dim_feats

0 commit comments

Comments
 (0)