|
2 | 2 | Sequential sample selection |
3 | 3 | """ |
4 | 4 |
|
| 5 | +import warnings |
| 6 | + |
| 7 | +import numpy as np |
| 8 | +from scipy.interpolate import ( |
| 9 | + LinearNDInterpolator, |
| 10 | + interp1d, |
| 11 | +) |
| 12 | +from scipy.interpolate.interpnd import _ndim_coords_from_arrays |
| 13 | +from scipy.spatial import ConvexHull |
| 14 | +from sklearn.utils.validation import ( |
| 15 | + check_array, |
| 16 | + check_is_fitted, |
| 17 | + check_X_y, |
| 18 | +) |
| 19 | + |
5 | 20 | from .._selection import ( |
6 | 21 | _CUR, |
7 | 22 | _FPS, |
|
10 | 25 | ) |
11 | 26 |
|
12 | 27 |
|
| 28 | +def _linear_interpolator(points, values): |
| 29 | + """ |
| 30 | + Returns linear interpolater for unstructured D-D data. Tessellate the input point set to N-D simplices, and interpolate linearly on each simplex. See `LinearNDInterpolator` for more details. |
| 31 | +
|
| 32 | + points : 2-D ndarray of floats with shape (n, D), or length D tuple of 1-D ndarrays with shape (n,). |
| 33 | + Data point coordinates. |
| 34 | + values : ndarray of float or complex, shape (n,) |
| 35 | + Data values. |
| 36 | +
|
| 37 | +
|
| 38 | + Reference: |
| 39 | + --------- |
| 40 | + The code is an adapted excerpt from |
| 41 | + https://github.com/scipy/scipy/blob/dde50595862a4f9cede24b5d1c86935c30f1f88a/scipy/interpolate/_ndgriddata.py#L119-L273 |
| 42 | + """ |
| 43 | + |
| 44 | + points = _ndim_coords_from_arrays(points) |
| 45 | + |
| 46 | + if points.ndim < 2: |
| 47 | + ndim = points.ndim |
| 48 | + else: |
| 49 | + ndim = points.shape[-1] |
| 50 | + |
| 51 | + if ndim == 1: |
| 52 | + points = points.ravel() |
| 53 | + # Sort points/values together, necessary as input for interp1d |
| 54 | + idx = np.argsort(points) |
| 55 | + points = points[idx] |
| 56 | + values = values[idx] |
| 57 | + return interp1d( |
| 58 | + points, values, kind="linear", axis=0, bounds_error=False, fill_value=np.nan |
| 59 | + ) |
| 60 | + else: |
| 61 | + return LinearNDInterpolator(points, values, fill_value=np.nan, rescale=False) |
| 62 | + |
| 63 | + |
13 | 64 | class FPS(_FPS): |
14 | 65 | """ |
15 | 66 | Transformer that performs Greedy Sample Selection using Farthest Point Sampling. |
@@ -346,3 +397,232 @@ def __init__( |
346 | 397 | full=full, |
347 | 398 | random_state=random_state, |
348 | 399 | ) |
| 400 | + |
| 401 | + |
class DirectionalConvexHull:
    """
    Performs Sample Selection by constructing a Directional Convex Hull and
    determining the distance to the hull as outlined in the reference.

    Parameters
    ----------

    low_dim_idx : list of ints, default None
        Indices of columns of X containing features to be used for the
        directional convex hull construction (also known as the low-
        dimensional (LD) hull). By default [0] is used. Negative indices
        count from the last column.

    Attributes
    ----------

    high_dim_idx_ : numpy.ndarray of ints
        Indices of columns in data containing high-dimensional
        features (i.e. those not used for the convex hull
        construction)

    selected_idx_ : numpy.ndarray
        Indices of datapoints that form the vertices of the
        convex hull

    interpolator_high_dim_ : scipy.interpolate.LinearNDInterpolator or
        scipy.interpolate.interp1d
        Interpolator for the features in the high-dimensional space
        (``interp1d`` when a single low-dimensional column is used)

    interpolator_y_ : scipy.interpolate.LinearNDInterpolator or
        scipy.interpolate.interp1d
        Interpolator for the targets y

    n_features_in_ : int
        Number of columns of X seen during ``fit``.

    References
    ----------
    .. [dch] A. Anelli, E. A. Engel, C. J. Pickard and M. Ceriotti,
             Physical Review Materials, 2018.
    """

    def __init__(self, low_dim_idx=None):
        self.low_dim_idx = [0] if low_dim_idx is None else low_dim_idx

    def _normalized_low_dim_idx(self, n_features):
        """Validate ``low_dim_idx`` against ``n_features``; return non-negative indices."""
        idx = np.asarray(self.low_dim_idx)
        if idx.size == 0:
            raise ValueError("low_dim_idx must contain at least one column index.")
        # Valid column indices are -n_features, ..., n_features - 1.
        if np.any((idx < -n_features) | (idx >= n_features)):
            raise ValueError(
                "One or more columns indexed with low_dim_idx is"
                " out of bounds with the dimensions of X."
            )
        # Map negative indices onto their non-negative equivalents.
        return idx % n_features

    def fit(self, X, y):
        """
        Learn the samples that form the convex hull.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Feature matrix of samples to use for constructing the convex
            hull.
        y : ndarray of shape (n_samples,)
            Target values (property on which the convex hull should be
            constructed, e.g. Gibbs free energy)

        Returns
        -------
        self : object
            Fitted scorer.

        Raises
        ------
        ValueError
            If ``low_dim_idx`` is empty or refers to a column that does
            not exist in X.
        """
        X, y = self._check_X_y(X, y)
        n_samples, n_features = X.shape
        self.n_features_in_ = n_features

        low_dim_cols = self._normalized_low_dim_idx(n_features)

        # Use the normalized indices so that columns selected with negative
        # indices are excluded from the high-dimensional set as well.
        self.high_dim_idx_ = np.setdiff1d(np.arange(n_features), low_dim_cols)

        # Target in column 0, low-dimensional features in the remaining
        # columns: this is the data the convex hull is built on.
        convex_hull_data = np.empty((n_samples, len(self.low_dim_idx) + 1))
        convex_hull_data[:, 0] = np.ravel(y)
        convex_hull_data[:, 1:] = X[:, self.low_dim_idx]

        # features that are not part of the hull construction
        high_dim_feats = X[:, self.high_dim_idx_]

        convex_hull = ConvexHull(convex_hull_data)
        # y-component (column 0) of the normal of every hull facet
        y_normal = convex_hull.equations[:, 0]

        # Keep only the vertices of facets whose normal has a negative
        # y-component; np.unique flattens and de-duplicates the indices.
        self.selected_idx_ = np.unique(convex_hull.simplices[y_normal < 0])

        hull_points = convex_hull_data[self.selected_idx_, 1:]

        # required for the score_feature_matrix function
        self.interpolator_high_dim_ = _linear_interpolator(
            points=hull_points,
            values=high_dim_feats[self.selected_idx_],
        )

        # required to compute the distance of the low-dimensional feature
        # to the convex hull
        self.interpolator_y_ = _linear_interpolator(
            points=hull_points,
            values=convex_hull_data[self.selected_idx_, 0],
        )

        return self

    def _check_X_y(self, X, y):
        """Validate X and y; at least two feature columns and a 1-D y are required."""
        return check_X_y(X, y, ensure_min_features=2, multi_output=False)

    def _check_is_fitted(self, X):
        """Ensure ``fit`` was called and X has the number of columns seen in ``fit``."""
        check_is_fitted(
            self,
            [
                "high_dim_idx_",
                "interpolator_high_dim_",
                "interpolator_y_",
                "selected_idx_",
            ],
        )
        n_features = X.shape[1]
        if n_features != self.n_features_in_:
            raise ValueError(
                f"X has {n_features} features, but {self.__class__.__name__} "
                f"is expecting {self.n_features_in_} features as input."
            )
        return True

    def score_samples(self, X, y):
        """
        Calculate the distance of the samples to the convex hull in the target
        direction y. Samples with a distance > 0 lie above the convex surface.
        Samples with a distance of zero lie on the convex surface. Samples with
        a distance value < 0 lie below the convex surface.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Feature matrix of samples to use for determining distance
            to the convex hull. Please note that samples provided should
            have the same dimensions (features) as used during fitting
            of the convex hull. The same column indices will be used for
            the low- and high-dimensional features.

        y : ndarray of shape (n_samples,)
            Target values (property on which the convex hull should be
            constructed, e.g. Gibbs free energy)

        Returns
        -------
        dch_distance : numpy.array of shape (n_samples,)
            The difference between y and the value of the convex surface
            at the low-dimensional features of each sample. Entries are
            nan for samples outside the range of the convex surface.
        """
        X, y = self._check_X_y(X, y)
        self._check_is_fitted(X)

        # features used for the convex hull construction
        low_dim_feats = X[:, self.low_dim_idx]

        # the X points projected on the convex surface
        interpolated_y = self.interpolator_y_(low_dim_feats).reshape(y.shape)

        if np.any(np.isnan(interpolated_y)):
            warnings.warn(
                "There are samples in X with a low-dimensional part that is outside of the range of the convex surface. Distance will contain nans.",
                UserWarning,
                stacklevel=2,
            )

        return y - interpolated_y

    def score_feature_matrix(self, X):
        """
        Calculate the distance (or more specifically, the residuals) of the
        samples to the convex hull in the high-dimensional space. Samples
        with a distance value of zero in all the higher dimensions lie on
        the convex hull.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Feature matrix of samples to use for determining distance
            to the convex hull. Please note that samples provided should
            have the same dimensions (features) as used during fitting
            of the convex hull. The same column indices will be used for
            the low- and high-dimensional features.

        Returns
        -------
        dch_distance : numpy.array of shape (n_samples, len(high_dim_idx_))
            The distance (residuals) of samples to the convex hull in
            the higher-dimensional space. Entries are nan for samples
            outside the range of the convex surface.
        """
        X = check_array(X)
        self._check_is_fitted(X)

        # features used for the convex hull construction
        low_dim_feats = X[:, self.low_dim_idx]
        # HD features not used for the convex hull
        high_dim_feats = X[:, self.high_dim_idx_]

        if len(self.low_dim_idx) == 1:
            # The 1-D interpolator expects a flat array of abscissae; a
            # column vector would add an extra axis to its output.
            low_dim_feats = low_dim_feats.reshape(-1)

        # interpolate features
        interpolated_high_dim_feats = self.interpolator_high_dim_(low_dim_feats)

        if np.any(np.isnan(interpolated_high_dim_feats)):
            warnings.warn(
                "There are samples in X with a low-dimensional part that is outside of the range of the convex surface. Distance will contain nans.",
                UserWarning,
                stacklevel=2,
            )

        # residual between the original high-dimensional data and the
        # interpolated high-dimensional data
        return high_dim_feats - interpolated_high_dim_feats
0 commit comments