Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 41 additions & 6 deletions skfp/fingerprints/electroshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,12 @@ class ElectroShapeFingerprint(BaseFingerprintTransformer):

It first computes atomic partial charges, and then uses both conformational
(spatial) structure, and this electric information, to compute reference
points (centroids). First three are like in USR, and last two
additionally use partial charge in distance calculation. See the original paper
[1]_ for details. For each centroid, the distribution of distances between atoms
and the centroid is aggregated using the first three moments (mean, standard
deviation, cubic root of skewness). This results in 15 features.
points (centroids). First three are similar to USR: centroid, atom farthest from
centroid, and atom farthest from that atom. The last two additionally use partial
charge in distance calculation. See the original paper [1]_ for details. For each
centroid, the distribution of distances between atoms and the centroid is aggregated
using the first three moments (mean, standard deviation, cubic root of skewness).
This results in 15 features.

This is a 3D fingerprint, and requires molecules with ``conf_id`` integer property
set. They can be generated with :class:`~skfp.preprocessing.ConformerGenerator`.
Expand Down Expand Up @@ -150,6 +151,40 @@ def __init__(
self.charge_errors = charge_errors
self.errors = errors

def get_feature_names_out(self, input_features=None) -> np.ndarray:  # noqa: ARG002
    """
    Get fingerprint output feature names. They correspond to aggregates
    of atomic distances to 5 centroid-based points.

    Names are built as ``<reference point>_<moment>``, ordered by reference
    point first and by moment second, which gives 5 * 3 = 15 names.

    Parameters
    ----------
    input_features : array-like of str or None, default=None
        Unused, kept for scikit-learn compatibility.

    Returns
    -------
    feature_names_out : ndarray of str objects
        ElectroShape feature names.
    """
    reference_points = [
        "centroid_dists",
        "farthest_atom_from_centroid",
        "farthest_atom_from_farthest_to_centroid",
        "centroid_highest_partial_charge",
        "centroid_lowest_partial_charge",
    ]
    # Generating the suffixes guarantees that every reference point gets the
    # same three moment names; the hand-written list had one entry ending in
    # "_cubic_root" instead of "_skewness_cubic_root".
    moments = ["mean", "std", "skewness_cubic_root"]

    feature_names = [
        f"{reference_point}_{moment}"
        for reference_point in reference_points
        for moment in moments
    ]
    return np.asarray(feature_names, dtype=object)

def transform(
self, X: Sequence[str | Mol], copy: bool = False
) -> np.ndarray | csr_array:
Expand Down Expand Up @@ -280,7 +315,7 @@ def _get_centroid_distances(
else:
vec_c = (norm(vec_a) / (2 * cross_ab_norm)) * cross_ab

# geometric mean centroid moved in the direction of smallest and largest charge
# geometric mean centroid moved in the direction of largest and smallest charge
# note that charges were already scaled before
c4 = np.append(c1[:3] + vec_c, np.max(charges))
c5 = np.append(c1[:3] + vec_c, np.min(charges))
Expand Down
35 changes: 33 additions & 2 deletions skfp/fingerprints/usr.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ class USRFingerprint(BaseFingerprintTransformer):
characterizes the shape of the molecule by encoding the relative positions of its
atoms [1]_ [2]_.

Four points are considered: molecular centroid (ctd), the closest atom to centroid (cst),
the farthest atom from centroid (fct), and atom the fartest from fct (ftf). Distances
Four points are considered: molecular centroid (ctd), the closest atom to centroid (catc),
the farthest atom from centroid (fact), and the atom farthest from fact (fatf). Distances
from all atoms to each of those four points are computed, and each of those distributions
is summarized by its first three moments: mean, variance, and skewness. Concretely, standard
deviation and cubic root of skewness are used to keep the same unit. This results in
Expand Down Expand Up @@ -125,6 +125,37 @@ def __init__(
)
self.errors = errors

def get_feature_names_out(self, input_features=None) -> np.ndarray:  # noqa: ARG002
    """
    Return the names of the USR output features.

    Each name describes one aggregate of atomic distances to one of the
    4 centroid-based reference points, three names per point.

    Parameters
    ----------
    input_features : array-like of str or None, default=None
        Unused, kept for scikit-learn compatibility.

    Returns
    -------
    feature_names_out : ndarray of str objects
        USR feature names.
    """
    # One tuple per reference point, in output order.
    # NOTE(review): the very last name omits the "skewness_" infix that the
    # other third-moment names use; it is reproduced verbatim here.
    names_by_point = (
        (
            "centroid_dists_mean",
            "centroid_dists_std",
            "centroid_dists_skewness_cubic_root",
        ),
        (
            "closest_atom_to_centroid_mean",
            "closest_atom_to_centroid_std",
            "closest_atom_to_centroid_skewness_cubic_root",
        ),
        (
            "farthest_atom_from_centroid_mean",
            "farthest_atom_from_centroid_std",
            "farthest_atom_from_centroid_skewness_cubic_root",
        ),
        (
            "farthest_atom_from_farthest_to_centroid_mean",
            "farthest_atom_from_farthest_to_centroid_std",
            "farthest_atom_from_farthest_to_centroid_cubic_root",
        ),
    )
    flat_names = [name for point_names in names_by_point for name in point_names]
    return np.asarray(flat_names, dtype=object)

def transform(
self, X: Sequence[str | Mol], copy: bool = False
) -> np.ndarray | csr_array:
Expand Down
45 changes: 45 additions & 0 deletions skfp/fingerprints/usrcat.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,51 @@ def __init__(
)
self.errors = errors

def get_feature_names_out(self, input_features=None) -> np.ndarray:  # noqa: ARG002
    """
    Return the names of the USRCAT output features.

    The 12 aggregates of atomic distances to 4 centroid-based points are
    repeated for each of 5 atom type subsets; every name is prefixed with
    the subset it belongs to. Names are ordered by atom type subset first.

    Parameters
    ----------
    input_features : array-like of str or None, default=None
        Unused, kept for scikit-learn compatibility.

    Returns
    -------
    feature_names_out : ndarray of str objects
        USRCAT feature names.
    """
    subsets = (
        "all",
        "hydrophobic",
        "aromatic",
        "hydrogen_bond_donor",
        "hydrogen_bond_acceptor",
    )
    # NOTE(review): the last aggregate omits the "skewness_" infix that the
    # other third-moment names use; it is reproduced verbatim here.
    aggregates = (
        "centroid_dists_mean",
        "centroid_dists_std",
        "centroid_dists_skewness_cubic_root",
        "closest_atom_to_centroid_mean",
        "closest_atom_to_centroid_std",
        "closest_atom_to_centroid_skewness_cubic_root",
        "farthest_atom_from_centroid_mean",
        "farthest_atom_from_centroid_std",
        "farthest_atom_from_centroid_skewness_cubic_root",
        "farthest_atom_from_farthest_to_centroid_mean",
        "farthest_atom_from_farthest_to_centroid_std",
        "farthest_atom_from_farthest_to_centroid_cubic_root",
    )

    collected = []
    for subset in subsets:
        # all aggregates of one subset are contiguous in the output
        collected.extend(subset + "_" + aggregate for aggregate in aggregates)

    return np.asarray(collected, dtype=object)

def transform(
self, X: Sequence[str | Mol], copy: bool = False
) -> np.ndarray | csr_array:
Expand Down
11 changes: 11 additions & 0 deletions tests/fingerprints/electroshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,3 +92,14 @@ def test_electroshape_transform_x_y(mols_conformers_3_plus_atoms):

assert len(X_skfp) == len(y_skfp)
assert np.all(y_skfp == 1)


def test_electroshape_feature_names():
    # Feature names must cover every output column, be unique, and start/end
    # with the expected first and last names.
    fp = ElectroShapeFingerprint()
    names = fp.get_feature_names_out()
    n_names = len(names)

    assert n_names == fp.n_features_out
    assert n_names == len(set(names))

    first_name, last_name = names[0], names[-1]
    assert first_name == "centroid_dists_mean"
    assert last_name == "centroid_lowest_partial_charge_skewness_cubic_root"
11 changes: 11 additions & 0 deletions tests/fingerprints/usr.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,3 +68,14 @@ def test_usr_copy(mols_conformers_3_plus_atoms):

assert np.array_equal(labels, labels_out)
assert labels is not labels_out


def test_usr_feature_names():
    # Feature names must cover every output column, be unique, and start/end
    # with the expected first and last names.
    fp = USRFingerprint()
    names = fp.get_feature_names_out()
    n_names = len(names)

    assert n_names == fp.n_features_out
    assert n_names == len(set(names))

    first_name, last_name = names[0], names[-1]
    assert first_name == "centroid_dists_mean"
    assert last_name == "farthest_atom_from_farthest_to_centroid_cubic_root"
32 changes: 32 additions & 0 deletions tests/fingerprints/usrcat.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,3 +68,35 @@ def test_usrcat_copy(mols_conformers_3_plus_atoms):

assert np.array_equal(labels, labels_out)
assert labels is not labels_out


def test_usrcat_feature_names():
    # Feature names must cover every output column, be unique, and each atom
    # type subset must start and end with the expected names.
    fp = USRCATFingerprint()
    names = fp.get_feature_names_out()
    n_names = len(names)

    assert n_names == fp.n_features_out
    assert n_names == len(set(names))

    # index of the first name of each atom type subset -> expected name
    expected_first = {
        0: "all_centroid_dists_mean",
        12: "hydrophobic_centroid_dists_mean",
        24: "aromatic_centroid_dists_mean",
        36: "hydrogen_bond_donor_centroid_dists_mean",
        48: "hydrogen_bond_acceptor_centroid_dists_mean",
    }
    for index, expected_name in expected_first.items():
        assert names[index] == expected_name

    # index of the last name of each atom type subset -> expected name
    expected_last = {
        11: "all_farthest_atom_from_farthest_to_centroid_cubic_root",
        23: "hydrophobic_farthest_atom_from_farthest_to_centroid_cubic_root",
        35: "aromatic_farthest_atom_from_farthest_to_centroid_cubic_root",
        47: "hydrogen_bond_donor_farthest_atom_from_farthest_to_centroid_cubic_root",
        59: "hydrogen_bond_acceptor_farthest_atom_from_farthest_to_centroid_cubic_root",
    }
    for index, expected_name in expected_last.items():
        assert names[index] == expected_name
Loading