ntampellini
diff --git a/‎examples/example_notebook.ipynb‎
Lines changed: 8 additions & 8 deletions b/‎examples/example_notebook.ipynb‎
Lines changed: 8 additions & 8 deletions
diff --git a/‎pixi.lock‎
Lines changed: 1 addition & 1 deletion b/‎pixi.lock‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎prism_pruner/algebra.py‎
Lines changed: 12 additions & 16 deletions b/‎prism_pruner/algebra.py‎
Lines changed: 12 additions & 16 deletions
diff --git a/‎prism_pruner/conformer_ensemble.py‎
Lines changed: 13 additions & 5 deletions b/‎prism_pruner/conformer_ensemble.py‎
Lines changed: 13 additions & 5 deletions
diff --git a/‎prism_pruner/pruner.py‎
Lines changed: 58 additions & 17 deletions b/‎prism_pruner/pruner.py‎
Lines changed: 58 additions & 17 deletions
diff --git a/‎prism_pruner/rmsd.py‎
Lines changed: 1 addition & 1 deletion b/‎prism_pruner/rmsd.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎prism_pruner/torsion_module.py‎
Lines changed: 6 additions & 6 deletions b/‎prism_pruner/torsion_module.py‎
Lines changed: 6 additions & 6 deletions
@@ -52,7 +52,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
    "id": "31198a3f",
    "metadata": {},
    "outputs": [
@@ -61,12 +61,12 @@
      "output_type": "stream",
      "text": [
       "DEBUG: MOIPrunerConfig - k=50, rejected 449 (keeping 637/1086), in 0.1 s\n",
-      "DEBUG: MOIPrunerConfig - k=20, rejected 109 (keeping 528/1086), in 0.1 s\n",
-      "DEBUG: MOIPrunerConfig - k=10, rejected 27 (keeping 501/1086), in 0.1 s\n",
-      "DEBUG: MOIPrunerConfig - k=5, rejected 28 (keeping 473/1086), in 0.4 s\n",
-      "DEBUG: MOIPrunerConfig - k=2, rejected 38 (keeping 435/1086), in 0.5 s\n",
-      "DEBUG: MOIPrunerConfig - k=1, rejected 10 (keeping 425/1086), in 0.6 s\n",
-      "DEBUG: MOIPrunerConfig - keeping 425/1086 (1.9 s)\n",
+      "DEBUG: MOIPrunerConfig - k=20, rejected 109 (keeping 528/1086), in 0.0 s\n",
+      "DEBUG: MOIPrunerConfig - k=10, rejected 27 (keeping 501/1086), in 0.0 s\n",
+      "DEBUG: MOIPrunerConfig - k=5, rejected 28 (keeping 473/1086), in 0.1 s\n",
+      "DEBUG: MOIPrunerConfig - k=2, rejected 38 (keeping 435/1086), in 0.2 s\n",
+      "DEBUG: MOIPrunerConfig - k=1, rejected 10 (keeping 425/1086), in 0.3 s\n",
+      "DEBUG: MOIPrunerConfig - keeping 425/1086 (0.8 s)\n",
       "DEBUG: MOIPrunerConfig - Used cached data 105595/211707 times, 49.88% of total calls\n"
      ]
     },
@@ -76,7 +76,7 @@
        "(425, 136, 3)"
       ]
      },
-     "execution_count": 3,
+     "execution_count": 4,
      "metadata": {},
      "output_type": "execute_result"
     }
 
@@ -118,27 +118,23 @@ def quaternion_to_rotation_matrix(quat: Array1D_float | Sequence[float]) -> Arra
 
 
 def get_inertia_moments(coords: Array3D_float, masses: Array1D_float) -> Array1D_float:
-    """
-    Find the moments of inertia of the three principal axes.
+    """Compute the principal moments of inertia of a molecule.
 
-    :return: diagonal of the diagonalized inertia tensor, that is
-    a shape (3,) array with the moments of inertia along the main axes.
-    (I_x, I_y and largest I_z last)
+    Returns a length-3 array [I_x, I_y, I_z], sorted ascending.
     """
-    # Center coordinates around the center of mass
-    coords = coords - np.sum(coords * masses[:, np.newaxis], axis=0)
-
-    # Compute r^2 for each atom
-    norms_squared = np.einsum("ni,ni->n", coords, coords)
+    # Shift to center of mass
+    com = np.sum(coords * masses[:, np.newaxis], axis=0) / np.sum(masses)
+    coords = coords - com
 
-    # Build inertia tensor using einsum
-    total = np.sum(masses * norms_squared)
-    inertia_moment_matrix = total * np.eye(3) - np.einsum("n,ni,nj->ij", masses, coords, coords)
+    # Compute inertia tensor
+    norms_sq = np.einsum("ni,ni->n", coords, coords)
+    total = np.sum(masses * norms_sq)
+    I_matrix = total * np.eye(3) - np.einsum("n,ni,nj->ij", masses, coords, coords)
 
-    # diagonalize the matrix and return the diagonal
-    inertia_moment_matrix = diagonalize(inertia_moment_matrix)
+    # Principal moments via symmetric eigendecomposition
+    moments, _ = np.linalg.eigh(I_matrix)
 
-    return np.diag(inertia_moment_matrix)
+    return np.sort(moments)
 
 
 def diagonalize(a: Array2D_float) -> Array2D_float:
 
@@ -1,12 +1,13 @@
 """ConformerEnsemble class."""
 
-from dataclasses import dataclass
+import re
+from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Self
 
 import numpy as np
 
-from prism_pruner.typing import Array1D_str, Array2D_float, Array3D_float
+from prism_pruner.typing import Array1D_float, Array1D_str, Array2D_float, Array3D_float
 
 
 @dataclass
@@ -15,15 +16,22 @@ class ConformerEnsemble:
 
     coords: Array3D_float
     atoms: Array1D_str
+    energies: Array1D_float = field(default_factory=lambda: np.array([]))
 
     @classmethod
-    def from_xyz(cls, file: Path | str) -> Self:
+    def from_xyz(cls, file: Path | str, read_energies: bool = False) -> Self:
         """Generate ensemble from a multiple conformer xyz file."""
         coords = []
         atoms = []
+        energies = []
         with Path(file).open() as f:
             for num in f:
-                _comment = next(f)
+                if read_energies:
+                    energy = next(re.finditer(r"-*\d+\.\d+", next(f))).group()
+                    energies.append(float(energy))
+                else:
+                    _comment = next(f)
+
                 conf_atoms = []
                 conf_coords = []
                 for _ in range(int(num)):
@@ -34,7 +42,7 @@ def from_xyz(cls, file: Path | str) -> Self:
                 atoms.append(conf_atoms)
                 coords.append(conf_coords)
 
-        return cls(coords=np.array(coords), atoms=np.array(atoms[0]))
+        return cls(coords=np.array(coords), atoms=np.array(atoms[0]), energies=np.array(energies))
 
     def to_xyz(self, file: Path | str) -> None:
         """Write ensemble to an xyz file."""
 
@@ -38,11 +38,11 @@ class PrunerConfig:
 
     # Optional parameters that get initialized
     energies: Array1D_float = field(default_factory=lambda: np.array([]))
-    ewin: float = field(default=0.0)
+    max_dE: float = field(default=0.0)
     debugfunction: Callable[[str], None] | None = field(default=None)
 
     # Computed fields
-    calls: int = field(default=0, init=False)
+    eval_calls: int = field(default=0, init=False)
     cache_calls: int = field(default=0, init=False)
     cache: set[tuple[int, int]] = field(default_factory=lambda: set(), init=False)
 
@@ -51,16 +51,21 @@ def __post_init__(self) -> None:
         self.mask = np.ones(shape=(self.structures.shape[0],), dtype=np.bool_)
 
         if len(self.energies) != 0:
-            assert self.ewin > 0.0, (
-                "If you provide energies, please also provide an appropriate energy window ewin."
+            assert self.max_dE > 0.0, (
+                "If you provide energies, please also provide an appropriate energy window max_dE."
             )
 
         # Set defaults for optional parameters
         if len(self.energies) == 0:
-            self.energies = np.zeros(self.structures.shape[0])
+            self.energies = np.zeros(self.structures.shape[0], dtype=float)
 
-        if self.ewin == 0.0:
-            self.ewin = 1.0
+        assert len(self.energies) == len(self.structures), (
+            "Please make sure that the energies "
+            + "provided have the same len as the input structures."
+        )
+
+        if self.max_dE == 0.0:
+            self.max_dE = 1.0
 
     def evaluate_sim(self, *args: Any, **kwargs: Any) -> bool:
         """Stub method - override in subclasses as needed."""
@@ -176,7 +181,7 @@ def _main_compute_subrow(
     structure in structures, returning at the first instance of a match.
     Ignores structures that are False (0) in in_mask and does not perform
     the comparison if the energy difference between the structures is not less
-    than self.ewin. Saves dissimilar structural pairs (i.e. that evaluate to
+    than self.max_dE. Saves dissimilar structural pairs (i.e. that evaluate to
     False (0)) by adding them to self.cache, avoiding redundant calculations.
     """
     i1 = first_abs_index
@@ -191,16 +196,18 @@ def _main_compute_subrow(
             i2 = first_abs_index + 1 + i
             hash_value = (i1, i2)
 
-            prunerconfig.calls += 1
             if hash_value in prunerconfig.cache:
                 prunerconfig.cache_calls += 1
                 continue
 
             # if we have not computed the value before, check if the two
             # structures have close enough energy before running the comparison
-            elif np.abs(prunerconfig.energies[i1] - prunerconfig.energies[i2]) < prunerconfig.ewin:
+            elif (
+                np.abs(prunerconfig.energies[i1] - prunerconfig.energies[i2]) < prunerconfig.max_dE
+            ):
                 # evaluate_sim returns True if the structures are similar:
                 # in that case stop iterating on this row and return
+                prunerconfig.eval_calls += 1
                 if prunerconfig.evaluate_sim(i1, i2):
                     return True
 
@@ -309,6 +316,14 @@ def prune(prunerconfig: PrunerConfig) -> tuple[Array2D_float, Array1D_bool]:
     out_mask = np.ones(shape=prunerconfig.structures.shape[0], dtype=np.bool_)
     prunerconfig.cache = set()
 
+    # sort structures by ascending energy: this will have the effect of
+    # having energetically similar structures end up in the same chunk
+    # and therefore being pruned early
+    if np.abs(prunerconfig.energies[-1]) > 0:
+        sorting_indices = np.argsort(prunerconfig.energies)
+        prunerconfig.structures = prunerconfig.structures[sorting_indices]
+        prunerconfig.energies = prunerconfig.energies[sorting_indices]
+
     # split the structure array in subgroups and prune them internally
     for k in (
         500_000,
@@ -365,11 +380,17 @@ def prune(prunerconfig: PrunerConfig) -> tuple[Array2D_float, Array1D_bool]:
             + f"({time_to_string(elapsed)})"
         )
 
-        fraction = 0 if prunerconfig.calls == 0 else prunerconfig.cache_calls / prunerconfig.calls
+        if prunerconfig.eval_calls == 0:
+            fraction = 0.0
+        else:
+            fraction = prunerconfig.cache_calls / (
+                prunerconfig.eval_calls + prunerconfig.cache_calls
+            )
+
         prunerconfig.debugfunction(
             f"DEBUG: {prunerconfig.__class__.__name__} - Used cached data "
-            + f"{prunerconfig.cache_calls}/{prunerconfig.calls} times, "
-            + f"{100 * fraction:.2f}% of total calls"
+            + f"{prunerconfig.cache_calls}/{prunerconfig.eval_calls + prunerconfig.cache_calls}"
+            + f" times, {100 * fraction:.2f}% of total calls"
         )
 
     return prunerconfig.structures[out_mask], out_mask
@@ -380,6 +401,8 @@ def prune_by_rmsd(
     atoms: Array1D_str,
     max_rmsd: float = 0.25,
     max_dev: float | None = None,
+    energies: Array1D_float | None = None,
+    max_dE: float = 0.0,
     debugfunction: Callable[[str], None] | None = None,
 ) -> tuple[Array3D_float, Array1D_bool]:
     """Remove duplicate structures using a heavy-atom RMSD metric.
@@ -391,6 +414,9 @@ def prune_by_rmsd(
     Similarity occurs for structures with both RMSD < max_rmsd and
     maximum deviation < max_dev. max_dev by default is 2 * max_rmsd.
     """
+    if energies is None:
+        energies = np.array([])
+
     # set default max_dev if not provided
     max_dev = max_dev or 2 * max_rmsd
 
@@ -400,6 +426,8 @@ def prune_by_rmsd(
         atoms=atoms,
         max_rmsd=max_rmsd,
         max_dev=max_dev,
+        energies=energies,
+        max_dE=max_dE,
         debugfunction=debugfunction,
     )
 
@@ -413,6 +441,8 @@ def prune_by_rmsd_rot_corr(
     graph: Graph,
     max_rmsd: float = 0.25,
     max_dev: float | None = None,
+    energies: Array1D_float | None = None,
+    max_dE: float = 0.0,
     logfunction: Callable[[str], None] | None = None,
     debugfunction: Callable[[str], None] | None = None,
 ) -> tuple[Array3D_float, Array1D_bool]:
@@ -535,10 +565,15 @@ def prune_by_rmsd_rot_corr(
             )
         logfunction("\n")
 
+    if energies is None:
+        energies = np.array([])
+
     # Initialize PrunerConfig
     prunerconfig = RMSDRotCorrPrunerConfig(
         structures=structures,
         atoms=atoms,
+        energies=energies,
+        max_dE=max_dE,
         graph=graph,
         torsions=torsions_ids,
         debugfunction=debugfunction,
@@ -561,19 +596,25 @@ def prune_by_moment_of_inertia(
     structures: Array3D_float,
     atoms: Array1D_str,
     max_deviation: float = 1e-2,
+    energies: Array1D_float | None = None,
+    max_dE: float = 0.0,
     debugfunction: Callable[[str], None] | None = None,
 ) -> tuple[Array3D_float, Array1D_bool]:
     """Remove duplicate structures using a moments of inertia-based metric.
 
     Remove duplicate structures (enantiomeric or rotameric) based on the
-    moments of inertia on the principal axes. If all three MOI
-    deviate less than max_deviation percent from another structure,
-    they are classified as rotamers or enantiomers and therefore only one
-    of them is kept (i.e. max_deviation = 0.1 is 10% relative deviation).
+    moments of inertia about the principal axes. If all three deviate by less
+    than max_deviation (relative) from those of another structure, it is removed
+    from the ensemble (i.e. max_deviation = 0.1 is 10% relative deviation).
     """
+    if energies is None:
+        energies = np.array([])
+
     # set up PrunerConfig dataclass
     prunerconfig = MOIPrunerConfig(
         structures=structures,
+        energies=energies,
+        max_dE=max_dE,
         debugfunction=debugfunction,
         max_dev=max_deviation,
         masses=np.array([elements.symbol(a).mass for a in atoms]),
 
@@ -9,7 +9,7 @@
 def rmsd_and_max(
     p: Array2D_float,
     q: Array2D_float,
-    center: bool = False,
+    center: bool = True,
 ) -> tuple[float, float]:
     """Return RMSD and max deviation.
 
 
@@ -418,6 +418,10 @@ def rotationally_corrected_rmsd_and_max(
 
     torsion_corrections = [0 for _ in torsions]
 
+    mask = (
+        np.array([a != "H" for a in atoms]) if heavy_atoms_only else np.ones(len(atoms), dtype=bool)
+    )
+
     # Now rotate every dummy torsion by the appropriate increment until we minimize local RMSD
     for i, torsion in enumerate(torsions):
         best_rmsd = 1e10
@@ -432,7 +436,7 @@ def rotationally_corrected_rmsd_and_max(
                 best_rmsd = locally_corrected_rmsd
                 torsion_corrections[i] = angle
 
-            # it is faster to undo the rotation rather than working with a copy of coords
+            # it is faster to undo the rotation rather than working with a copy of coords
             coord = rotate_dihedral(coord, torsion, -angle, indices_to_be_moved=[torsion[3]])
 
         # now rotate that angle to the desired orientation before going to the next angle
@@ -442,18 +446,14 @@ def rotationally_corrected_rmsd_and_max(
             )
 
         if debugfunction is not None:
-            heavy_mask = np.array([a != "H" for a in atoms])
-            global_rmsd = rmsd_and_max(ref[heavy_mask], coord[heavy_mask])[0]
+            global_rmsd = rmsd_and_max(ref[mask], coord[mask])[0]
             debugfunction(
                 f"    Torsion {i + 1} - {torsion}: best θ = {torsion_corrections[i]}°, "
                 + f"4-atom RMSD: {best_rmsd:.3f} Å, global RMSD: {global_rmsd:.3f} Å"
             )
 
     # we should have the optimal orientation on all torsions now:
     # calculate the RMSD
-    mask = (
-        np.array([a != "H" for a in atoms]) if heavy_atoms_only else np.ones(len(atoms), dtype=bool)
-    )
     rmsd, maxdev = rmsd_and_max(ref[mask], coord[mask])
 
     # since we could have segmented graphs, and therefore potentially only rotate