Skip to content

Commit 1b76715

Browse files
authored
Add feature names to USR, USRCAT and ElectroShape (#477)
1 parent 4315719 commit 1b76715

File tree

6 files changed

+173
-8
lines changed

6 files changed

+173
-8
lines changed

skfp/fingerprints/electroshape.py

Lines changed: 41 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,12 @@ class ElectroShapeFingerprint(BaseFingerprintTransformer):
2323
2424
It first computes atomic partial charges, and then uses both conformational
2525
(spatial) structure, and this electric information, to compute reference
26-
points (centroids). First three are like in USR, and last two
27-
additionally use partial charge in distance calculation. See the original paper
28-
[1]_ for details. For each centroid, the distribution of distances between atoms
29-
and the centroid is aggregated using the first three moments (mean, standard
30-
deviation, cubic root of skewness). This results in 15 features.
26+
points (centroids). The first three are similar to USR: centroid, atom farthest from
27+
centroid, and atom farthest from that atom. The last two additionally use partial
28+
charge in distance calculation. See the original paper [1]_ for details. For each
29+
centroid, the distribution of distances between atoms and the centroid is aggregated
30+
using the first three moments (mean, standard deviation, cubic root of skewness).
31+
This results in 15 features.
3132
3233
This is a 3D fingerprint, and requires molecules with ``conf_id`` integer property
3334
set. They can be generated with :class:`~skfp.preprocessing.ConformerGenerator`.
@@ -150,6 +151,40 @@ def __init__(
150151
self.charge_errors = charge_errors
151152
self.errors = errors
152153

154+
def get_feature_names_out(self, input_features=None) -> np.ndarray: # noqa: ARG002
155+
"""
156+
Get fingerprint output feature names. They correspond to aggregates
157+
of atomic distances to 5 centroid-based points.
158+
159+
Parameters
160+
----------
161+
input_features : array-like of str or None, default=None
162+
Unused, kept for scikit-learn compatibility.
163+
164+
Returns
165+
-------
166+
feature_names_out : ndarray of str objects
167+
ElectroShape feature names.
168+
"""
169+
feature_names = [
170+
"centroid_dists_mean",
171+
"centroid_dists_std",
172+
"centroid_dists_skewness_cubic_root",
173+
"farthest_atom_from_centroid_mean",
174+
"farthest_atom_from_centroid_std",
175+
"farthest_atom_from_centroid_skewness_cubic_root",
176+
"farthest_atom_from_farthest_to_centroid_mean",
177+
"farthest_atom_from_farthest_to_centroid_std",
178+
"farthest_atom_from_farthest_to_centroid_cubic_root",
179+
"centroid_highest_partial_charge_mean",
180+
"centroid_highest_partial_charge_std",
181+
"centroid_highest_partial_charge_skewness_cubic_root",
182+
"centroid_lowest_partial_charge_mean",
183+
"centroid_lowest_partial_charge_std",
184+
"centroid_lowest_partial_charge_skewness_cubic_root",
185+
]
186+
return np.asarray(feature_names, dtype=object)
187+
153188
def transform(
154189
self, X: Sequence[str | Mol], copy: bool = False
155190
) -> np.ndarray | csr_array:
@@ -280,7 +315,7 @@ def _get_centroid_distances(
280315
else:
281316
vec_c = (norm(vec_a) / (2 * cross_ab_norm)) * cross_ab
282317

283-
# geometric mean centroid moved in the direction of smallest and largest charge
318+
# geometric mean centroid moved in the direction of largest and smallest charge
284319
# note that charges were already scaled before
285320
c4 = np.append(c1[:3] + vec_c, np.max(charges))
286321
c5 = np.append(c1[:3] + vec_c, np.min(charges))

skfp/fingerprints/usr.py

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@ class USRFingerprint(BaseFingerprintTransformer):
1818
characterizes the shape of the molecule by encoding the relative positions of its
1919
atoms [1]_ [2]_.
2020
21-
Four points are considered: molecular centroid (ctd), the closest atom to centroid (cst),
22-
the farthest atom from centroid (fct), and atom the fartest from fct (ftf). Distances
21+
Four points are considered: molecular centroid (ctd), the closest atom to centroid (catc),
22+
the farthest atom from centroid (fact), and the atom farthest from fact (fatf). Distances
2323
from all atoms to each of those four points are computed, and each of those distributions
2424
is summarized by its first three moments: mean, variance, and skewness. Concretely, standard
2525
deviation and cubic root of skewness are used to keep the same unit. This results in
@@ -125,6 +125,37 @@ def __init__(
125125
)
126126
self.errors = errors
127127

128+
def get_feature_names_out(self, input_features=None) -> np.ndarray: # noqa: ARG002
129+
"""
130+
Get fingerprint output feature names. They correspond to aggregates
131+
of atomic distances to 4 centroid-based points.
132+
133+
Parameters
134+
----------
135+
input_features : array-like of str or None, default=None
136+
Unused, kept for scikit-learn compatibility.
137+
138+
Returns
139+
-------
140+
feature_names_out : ndarray of str objects
141+
USR feature names.
142+
"""
143+
feature_names = [
144+
"centroid_dists_mean",
145+
"centroid_dists_std",
146+
"centroid_dists_skewness_cubic_root",
147+
"closest_atom_to_centroid_mean",
148+
"closest_atom_to_centroid_std",
149+
"closest_atom_to_centroid_skewness_cubic_root",
150+
"farthest_atom_from_centroid_mean",
151+
"farthest_atom_from_centroid_std",
152+
"farthest_atom_from_centroid_skewness_cubic_root",
153+
"farthest_atom_from_farthest_to_centroid_mean",
154+
"farthest_atom_from_farthest_to_centroid_std",
155+
"farthest_atom_from_farthest_to_centroid_cubic_root",
156+
]
157+
return np.asarray(feature_names, dtype=object)
158+
128159
def transform(
129160
self, X: Sequence[str | Mol], copy: bool = False
130161
) -> np.ndarray | csr_array:

skfp/fingerprints/usrcat.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,51 @@ def __init__(
121121
)
122122
self.errors = errors
123123

124+
def get_feature_names_out(self, input_features=None) -> np.ndarray: # noqa: ARG002
125+
"""
126+
Get fingerprint output feature names. They correspond to aggregates
127+
of atomic distances to 4 centroid-based points, for each of 5 atom
128+
type subsets.
129+
130+
Parameters
131+
----------
132+
input_features : array-like of str or None, default=None
133+
Unused, kept for scikit-learn compatibility.
134+
135+
Returns
136+
-------
137+
feature_names_out : ndarray of str objects
138+
USRCAT feature names.
139+
"""
140+
base_feature_names = [
141+
"centroid_dists_mean",
142+
"centroid_dists_std",
143+
"centroid_dists_skewness_cubic_root",
144+
"closest_atom_to_centroid_mean",
145+
"closest_atom_to_centroid_std",
146+
"closest_atom_to_centroid_skewness_cubic_root",
147+
"farthest_atom_from_centroid_mean",
148+
"farthest_atom_from_centroid_std",
149+
"farthest_atom_from_centroid_skewness_cubic_root",
150+
"farthest_atom_from_farthest_to_centroid_mean",
151+
"farthest_atom_from_farthest_to_centroid_std",
152+
"farthest_atom_from_farthest_to_centroid_cubic_root",
153+
]
154+
atom_types = [
155+
"all",
156+
"hydrophobic",
157+
"aromatic",
158+
"hydrogen_bond_donor",
159+
"hydrogen_bond_acceptor",
160+
]
161+
feature_names = [
162+
f"{atom_type}_{base_feature_name}"
163+
for atom_type in atom_types
164+
for base_feature_name in base_feature_names
165+
]
166+
167+
return np.asarray(feature_names, dtype=object)
168+
124169
def transform(
125170
self, X: Sequence[str | Mol], copy: bool = False
126171
) -> np.ndarray | csr_array:

tests/fingerprints/electroshape.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,3 +92,14 @@ def test_electroshape_transform_x_y(mols_conformers_3_plus_atoms):
9292

9393
assert len(X_skfp) == len(y_skfp)
9494
assert np.all(y_skfp == 1)
95+
96+
97+
def test_electroshape_feature_names():
98+
electroshape_fp = ElectroShapeFingerprint()
99+
feature_names = electroshape_fp.get_feature_names_out()
100+
101+
assert len(feature_names) == electroshape_fp.n_features_out
102+
assert len(feature_names) == len(set(feature_names))
103+
104+
assert feature_names[0] == "centroid_dists_mean"
105+
assert feature_names[-1] == "centroid_lowest_partial_charge_skewness_cubic_root"

tests/fingerprints/usr.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,3 +68,14 @@ def test_usr_copy(mols_conformers_3_plus_atoms):
6868

6969
assert np.array_equal(labels, labels_out)
7070
assert labels is not labels_out
71+
72+
73+
def test_usr_feature_names():
74+
usr_fp = USRFingerprint()
75+
feature_names = usr_fp.get_feature_names_out()
76+
77+
assert len(feature_names) == usr_fp.n_features_out
78+
assert len(feature_names) == len(set(feature_names))
79+
80+
assert feature_names[0] == "centroid_dists_mean"
81+
assert feature_names[-1] == "farthest_atom_from_farthest_to_centroid_cubic_root"

tests/fingerprints/usrcat.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,3 +68,35 @@ def test_usrcat_copy(mols_conformers_3_plus_atoms):
6868

6969
assert np.array_equal(labels, labels_out)
7070
assert labels is not labels_out
71+
72+
73+
def test_usrcat_feature_names():
74+
usrcat_fp = USRCATFingerprint()
75+
feature_names = usrcat_fp.get_feature_names_out()
76+
77+
assert len(feature_names) == usrcat_fp.n_features_out
78+
assert len(feature_names) == len(set(feature_names))
79+
80+
assert feature_names[0] == "all_centroid_dists_mean"
81+
assert feature_names[12] == "hydrophobic_centroid_dists_mean"
82+
assert feature_names[24] == "aromatic_centroid_dists_mean"
83+
assert feature_names[36] == "hydrogen_bond_donor_centroid_dists_mean"
84+
assert feature_names[48] == "hydrogen_bond_acceptor_centroid_dists_mean"
85+
86+
assert feature_names[11] == "all_farthest_atom_from_farthest_to_centroid_cubic_root"
87+
assert (
88+
feature_names[23]
89+
== "hydrophobic_farthest_atom_from_farthest_to_centroid_cubic_root"
90+
)
91+
assert (
92+
feature_names[35]
93+
== "aromatic_farthest_atom_from_farthest_to_centroid_cubic_root"
94+
)
95+
assert (
96+
feature_names[47]
97+
== "hydrogen_bond_donor_farthest_atom_from_farthest_to_centroid_cubic_root"
98+
)
99+
assert (
100+
feature_names[59]
101+
== "hydrogen_bond_acceptor_farthest_atom_from_farthest_to_centroid_cubic_root"
102+
)

0 commit comments

Comments
 (0)