MLCIL · j-adamczyk · Aug 3, 2025 · Aug 1, 2025
@@ -23,11 +23,12 @@ class ElectroShapeFingerprint(BaseFingerprintTransformer):
 
     It first computes atomic partial charges, and then uses both conformational
     (spatial) structure, and this electric information, to compute reference
-    points (centroids). First three are like in USR, and last two
-    additionally use partial charge in distance calculation. See the original paper
-    [1]_ for details. For each centroid, the distribution of distances between atoms
-    and the centroid is aggregated using the first three moments (mean, standard
-    deviation, cubic root of skewness). This results in 15 features.
+    points (centroids). First three are similar to USR: centroid, atom farthest from
+    centroid, and atom farthest from that atom. The last two additionally use partial
+    charge in distance calculation. See the original paper [1]_ for details. For each
+    centroid, the distribution of distances between atoms and the centroid is aggregated
+    using the first three moments (mean, standard deviation, cubic root of skewness).
+    This results in 15 features.
 
     This is a 3D fingerprint, and requires molecules with ``conf_id`` integer property
     set. They can be generated with :class:`~skfp.preprocessing.ConformerGenerator`.
@@ -150,6 +151,40 @@ def __init__(
         self.charge_errors = charge_errors
         self.errors = errors
 
+    def get_feature_names_out(self, input_features=None) -> np.ndarray:  # noqa: ARG002
+        """
+        Get fingerprint output feature names. They correspond to aggregates
+        of atomic distances to 5 centroid-based points.
+
+        Parameters
+        ----------
+        input_features : array-like of str or None, default=None
+            Unused, kept for scikit-learn compatibility.
+
+        Returns
+        -------
+        feature_names_out : ndarray of str objects
+            ElectroShape feature names.
+        """
+        feature_names = [
+            "centroid_dists_mean",
+            "centroid_dists_std",
+            "centroid_dists_skewness_cubic_root",
+            "farthest_atom_from_centroid_mean",
+            "farthest_atom_from_centroid_std",
+            "farthest_atom_from_centroid_skewness_cubic_root",
+            "farthest_atom_from_farthest_to_centroid_mean",
+            "farthest_atom_from_farthest_to_centroid_std",
+            "farthest_atom_from_farthest_to_centroid_skewness_cubic_root",
+            "centroid_highest_partial_charge_mean",
+            "centroid_highest_partial_charge_std",
+            "centroid_highest_partial_charge_skewness_cubic_root",
+            "centroid_lowest_partial_charge_mean",
+            "centroid_lowest_partial_charge_std",
+            "centroid_lowest_partial_charge_skewness_cubic_root",
+        ]
+        return np.asarray(feature_names, dtype=object)
+
     def transform(
         self, X: Sequence[str | Mol], copy: bool = False
     ) -> np.ndarray | csr_array:
@@ -280,7 +315,7 @@ def _get_centroid_distances(
         else:
             vec_c = (norm(vec_a) / (2 * cross_ab_norm)) * cross_ab
 
-        # geometric mean centroid moved in the direction of smallest and largest charge
+        # geometric mean centroid moved in the direction of largest and smallest charge
         # note that charges were already scaled before
         c4 = np.append(c1[:3] + vec_c, np.max(charges))
         c5 = np.append(c1[:3] + vec_c, np.min(charges))

@@ -18,8 +18,8 @@ class USRFingerprint(BaseFingerprintTransformer):
     characterizes the shape of the molecule by encoding the relative positions of its
     atoms [1]_ [2]_.
 
-    Four points are considered: molecular centroid (ctd), the closest atom to centroid (cst),
-    the farthest atom from centroid (fct), and atom the fartest from fct (ftf). Distances
+    Four points are considered: molecular centroid (ctd), the closest atom to centroid (catc),
+    the farthest atom from centroid (fact), and the atom farthest from fact (fatf). Distances
     from all atoms to each of those four points are computed, and each of those distributions
     is summarized its first three moments: mean, variance, and skewness. Concretely, standard
     deviation and cubic root of skewness are used to keep the same unit. This results in
@@ -125,6 +125,37 @@ def __init__(
         )
         self.errors = errors
 
+    def get_feature_names_out(self, input_features=None) -> np.ndarray:  # noqa: ARG002
+        """
+        Get fingerprint output feature names. They correspond to aggregates
+        of atomic distances to 4 centroid-based points.
+
+        Parameters
+        ----------
+        input_features : array-like of str or None, default=None
+            Unused, kept for scikit-learn compatibility.
+
+        Returns
+        -------
+        feature_names_out : ndarray of str objects
+            USR feature names.
+        """
+        feature_names = [
+            "centroid_dists_mean",
+            "centroid_dists_std",
+            "centroid_dists_skewness_cubic_root",
+            "closest_atom_to_centroid_mean",
+            "closest_atom_to_centroid_std",
+            "closest_atom_to_centroid_skewness_cubic_root",
+            "farthest_atom_from_centroid_mean",
+            "farthest_atom_from_centroid_std",
+            "farthest_atom_from_centroid_skewness_cubic_root",
+            "farthest_atom_from_farthest_to_centroid_mean",
+            "farthest_atom_from_farthest_to_centroid_std",
+            "farthest_atom_from_farthest_to_centroid_cubic_root",
+        ]
+        return np.asarray(feature_names, dtype=object)
+
     def transform(
         self, X: Sequence[str | Mol], copy: bool = False
     ) -> np.ndarray | csr_array:

@@ -121,6 +121,51 @@ def __init__(
         )
         self.errors = errors
 
+    def get_feature_names_out(self, input_features=None) -> np.ndarray:  # noqa: ARG002
+        """
+        Get fingerprint output feature names. They correspond to aggregates
+        of atomic distances to 4 centroid-based points, for each of 5 atom
+        type subsets.
+
+        Parameters
+        ----------
+        input_features : array-like of str or None, default=None
+            Unused, kept for scikit-learn compatibility.
+
+        Returns
+        -------
+        feature_names_out : ndarray of str objects
+            USRCAT feature names.
+        """
+        base_feature_names = [
+            "centroid_dists_mean",
+            "centroid_dists_std",
+            "centroid_dists_skewness_cubic_root",
+            "closest_atom_to_centroid_mean",
+            "closest_atom_to_centroid_std",
+            "closest_atom_to_centroid_skewness_cubic_root",
+            "farthest_atom_from_centroid_mean",
+            "farthest_atom_from_centroid_std",
+            "farthest_atom_from_centroid_skewness_cubic_root",
+            "farthest_atom_from_farthest_to_centroid_mean",
+            "farthest_atom_from_farthest_to_centroid_std",
+            "farthest_atom_from_farthest_to_centroid_cubic_root",
+        ]
+        atom_types = [
+            "all",
+            "hydrophobic",
+            "aromatic",
+            "hydrogen_bond_donor",
+            "hydrogen_bond_acceptor",
+        ]
+        feature_names = [
+            f"{atom_type}_{base_feature_name}"
+            for atom_type in atom_types
+            for base_feature_name in base_feature_names
+        ]
+
+        return np.asarray(feature_names, dtype=object)
+
     def transform(
         self, X: Sequence[str | Mol], copy: bool = False
     ) -> np.ndarray | csr_array:

@@ -92,3 +92,14 @@ def test_electroshape_transform_x_y(mols_conformers_3_plus_atoms):
 
     assert len(X_skfp) == len(y_skfp)
     assert np.all(y_skfp == 1)
+
+
+def test_electroshape_feature_names():
+    electroshape_fp = ElectroShapeFingerprint()
+    feature_names = electroshape_fp.get_feature_names_out()
+
+    assert len(feature_names) == electroshape_fp.n_features_out
+    assert len(feature_names) == len(set(feature_names))
+
+    assert feature_names[0] == "centroid_dists_mean"
+    assert feature_names[-1] == "centroid_lowest_partial_charge_skewness_cubic_root"
@@ -68,3 +68,14 @@ def test_usr_copy(mols_conformers_3_plus_atoms):
 
     assert np.array_equal(labels, labels_out)
     assert labels is not labels_out
+
+
+def test_usr_feature_names():
+    usr_fp = USRFingerprint()
+    feature_names = usr_fp.get_feature_names_out()
+
+    assert len(feature_names) == usr_fp.n_features_out
+    assert len(feature_names) == len(set(feature_names))
+
+    assert feature_names[0] == "centroid_dists_mean"
+    assert feature_names[-1] == "farthest_atom_from_farthest_to_centroid_cubic_root"
@@ -68,3 +68,35 @@ def test_usrcat_copy(mols_conformers_3_plus_atoms):
 
     assert np.array_equal(labels, labels_out)
     assert labels is not labels_out
+
+
+def test_usrcat_feature_names():
+    usrcat_fp = USRCATFingerprint()
+    feature_names = usrcat_fp.get_feature_names_out()
+
+    assert len(feature_names) == usrcat_fp.n_features_out
+    assert len(feature_names) == len(set(feature_names))
+
+    assert feature_names[0] == "all_centroid_dists_mean"
+    assert feature_names[12] == "hydrophobic_centroid_dists_mean"
+    assert feature_names[24] == "aromatic_centroid_dists_mean"
+    assert feature_names[36] == "hydrogen_bond_donor_centroid_dists_mean"
+    assert feature_names[48] == "hydrogen_bond_acceptor_centroid_dists_mean"
+
+    assert feature_names[11] == "all_farthest_atom_from_farthest_to_centroid_cubic_root"
+    assert (
+        feature_names[23]
+        == "hydrophobic_farthest_atom_from_farthest_to_centroid_cubic_root"
+    )
+    assert (
+        feature_names[35]
+        == "aromatic_farthest_atom_from_farthest_to_centroid_cubic_root"
+    )
+    assert (
+        feature_names[47]
+        == "hydrogen_bond_donor_farthest_atom_from_farthest_to_centroid_cubic_root"
+    )
+    assert (
+        feature_names[59]
+        == "hydrogen_bond_acceptor_farthest_atom_from_farthest_to_centroid_cubic_root"
+    )