Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions yammbs/_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -704,6 +704,7 @@ def get_rmsd(
force_field: str,
molecule_ids: list[int] | None = None,
skip_check: bool = False,
include_hydrogens: bool = False,
) -> RMSDCollection:
if not molecule_ids:
molecule_ids = self.get_molecule_ids()
Expand Down Expand Up @@ -737,7 +738,12 @@ def get_rmsd(
rmsds.append(
RMSD(
qcarchive_id=id,
rmsd=get_rmsd(molecule, qm, mm),
rmsd=get_rmsd(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(not blocking) It's quite confusing that get_rmsd is the name of both a function and a method (with different purposes/signatures) - could be a good thing to clean up/disambiguate in the code

molecule,
qm,
mm,
include_hydrogens=include_hydrogens,
),
),
)

Expand Down Expand Up @@ -857,14 +863,18 @@ def get_outputs(

def get_metrics(
self,
include_hydrogens: bool = False,
) -> MetricCollection:
metrics = MetricCollection()
import pandas

# TODO: Optimize this for speed
for force_field in self.get_force_fields():
ddes = self.get_dde(force_field=force_field).to_dataframe()
rmsds = self.get_rmsd(force_field=force_field).to_dataframe()
rmsds = self.get_rmsd(
force_field=force_field,
include_hydrogens=include_hydrogens,
).to_dataframe()
tfds = self.get_tfd(force_field=force_field).to_dataframe()
icrmsds = self.get_internal_coordinate_rmsd(
force_field=force_field,
Expand Down
24 changes: 23 additions & 1 deletion yammbs/_tests/unit_tests/test_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,14 @@


class TestAnalysis:
def test_rmsd(self, allicin, conformers):
@pytest.mark.parametrize("include_hydrogens", [True, False])
def test_rmsd(self, allicin, conformers, include_hydrogens):
# Passing the same conformers should return 0.0
last_last = get_rmsd(
molecule=allicin,
reference=conformers[-1],
target=conformers[-1],
include_hydrogens=include_hydrogens,
)

assert last_last == 0.0
Expand All @@ -28,6 +30,7 @@ def test_rmsd(self, allicin, conformers):
molecule=allicin,
reference=conformers[0],
target=conformers[-1],
include_hydrogens=include_hydrogens,
)

assert isinstance(first_last, float)
Expand All @@ -36,6 +39,7 @@ def test_rmsd(self, allicin, conformers):
molecule=allicin,
reference=conformers[0],
target=conformers[1],
include_hydrogens=include_hydrogens,
)

assert first_second != first_last
Expand All @@ -44,10 +48,28 @@ def test_rmsd(self, allicin, conformers):
molecule=allicin,
reference=conformers[-1],
target=conformers[0],
include_hydrogens=include_hydrogens,
)

assert last_first == pytest.approx(first_last)

def test_rmsd_all_atom_rmsd_differs(self, allicin, conformers):
heavy_atom_rmsd = get_rmsd(
molecule=allicin,
reference=conformers[0],
target=conformers[1],
include_hydrogens=False,
)

all_atom_rmsd = get_rmsd(
molecule=allicin,
reference=conformers[0],
target=conformers[1],
include_hydrogens=True,
)

assert heavy_atom_rmsd != all_atom_rmsd

def test_tfd(self, allicin, conformers):
# Passing the same conformers should return 0.0
last_last = get_tfd(
Expand Down
4 changes: 2 additions & 2 deletions yammbs/_tests/unit_tests/torsion/test_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,8 @@ def test_minimize_basic(single_torsion_dataset, tmp_path):
database_name=tmp_path / "test.sqlite",
)

TORSION_ID = 119466834

store.optimize_mm(
force_field="openff-2.2.0",
n_processes=os.cpu_count(),
Expand All @@ -110,12 +112,10 @@ def test_minimize_basic(single_torsion_dataset, tmp_path):
assert len(metrics["metrics"]["openff-2.2.0"]) == 1

expected_metrics = {
"rmsd": 0.07475493617511018,
"rmse": 0.8193199571663233,
"mean_error": -0.35170719027937586,
"js_distance": (0.3168201337322116, 500.0),
}
TORSION_ID = 119466834

assert len(expected_metrics) == len(metrics["metrics"]["openff-2.2.0"][TORSION_ID])

Expand Down
7 changes: 4 additions & 3 deletions yammbs/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ def get_rmsd(
molecule: Molecule,
reference: Array,
target: Array,
include_hydrogens: bool = False,
) -> float:
"""Compute the RMSD between two sets of coordinates."""
from openeye import oechem
Expand All @@ -144,9 +145,9 @@ def get_rmsd(
return oechem.OERMSD(
molecule1.to_openeye(),
molecule2.to_openeye(),
True,
True,
True,
True, # automorph
not include_hydrogens, # heavyOnly
True, # overlay
)


Expand Down
30 changes: 24 additions & 6 deletions yammbs/torsion/_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
RMSDCollection,
RMSECollection,
_normalize,
get_rmsd,
)
from yammbs.torsion.inputs import QCArchiveTorsionDataset
from yammbs.torsion.models import (
Expand Down Expand Up @@ -385,13 +386,16 @@ def get_rmsd(
force_field: str,
torsion_ids: list[int] | None = None,
skip_check: bool = False,
include_hydrogens: bool = False,
restraint_k: float = 0.0,
) -> RMSDCollection:
"""Get the RMSD summed over the torsion profile."""
from openff.toolkit import Molecule

if not torsion_ids:
torsion_ids = self.get_torsion_ids()

if not skip_check:
if skip_check is None:
# TODO: Copy this into each get_* method?
LOGGER.info("Calling optimize_mm from inside of get_log_sse.")
self.optimize_mm(force_field=force_field, restraint_k=restraint_k)
Expand All @@ -407,7 +411,21 @@ def get_rmsd(
allow_undefined_stereo=True,
)

rmsds.append(RMSD.from_data(torsion_id, molecule, qm_points, mm_points))
rmsds.append(
RMSD(
id=torsion_id,
rmsd=sum(
get_rmsd(
molecule,
qm_points[key],
mm_points[key],
include_hydrogens=include_hydrogens,
)
for key in qm_points
)
/ len(qm_points),
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note this detail - without which there were numeric differences, i.e. 24x for a scan of 24 grid points

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just noting that RMSD.from_data computes the RMS RMSD as opposed to the mean. Computing the mean is probably a better way to do it, but maybe RMSD.from_data should also be changed to use the mean? Should RMSD.from_data be used (and modified to have an include_hydrogens kwarg) instead of duplicating (a tiny bit)? Thanks.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch! Would the same suggestion also apply to RMSE.from_data?

rmse=numpy.sqrt(((qm_energies - mm_energies) ** 2).mean()),

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since this is not strictly related to the intent of the PR I'm going to split it out into a separate issue: #186

),
)

return rmsds

Expand Down Expand Up @@ -541,6 +559,7 @@ def get_outputs(self) -> MinimizedTorsionDataset:

def get_metrics(
self,
include_hydrogens: bool = False,
force_fields: Iterable[str] | None = None,
js_temperature: float = 500.0,
restraint_k: float = 0.0,
Expand All @@ -550,6 +569,8 @@ def get_metrics(

Parameters
----------
include_hydrogens
Whether RMSDs should include hydrogens (all-atom) or not (heavy atom)
force_fields : Iterable[str] | None
Iterable of force fields to compute metrics for. If None, compute for all available.
js_temperature : float
Expand Down Expand Up @@ -583,21 +604,19 @@ def get_metrics(
# TODO: Optimize this for speed
for force_field in force_fields:
rmses = self.get_rmse(force_field=force_field, skip_check=True).to_dataframe()
rmsds = self.get_rmsd(force_field=force_field, skip_check=True).to_dataframe()
mean_errors = self.get_mean_error(force_field=force_field, skip_check=True).to_dataframe()
js_distances = self.get_js_distance(
force_field=force_field,
skip_check=True,
temperature=js_temperature,
).to_dataframe()

dataframe = rmses.join(rmsds).join(mean_errors).join(js_distances)
dataframe = rmses.join(mean_errors).join(js_distances)

dataframe = dataframe.replace({pandas.NA: numpy.nan})

metrics.metrics[force_field] = {
id: Metric( # type: ignore[misc]
rmsd=row["rmsd"],
rmse=row["rmse"],
mean_error=row["mean_error"],
js_distance=(row["js_distance"], row["js_temperature"]),
Expand Down Expand Up @@ -829,7 +848,6 @@ def get_summary_df(
rows = []
metrics = self.get_metrics().metrics
metrics_to_plot = {
"RMSD / A": lambda x: x.rmsd,
"RMSE / kcal mol-1": lambda x: x.rmse,
"Mean Error / kcal mol-1": lambda x: x.mean_error,
"JS Distance": lambda x: x.js_distance[0],
Expand Down
1 change: 0 additions & 1 deletion yammbs/torsion/outputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ class MinimizedTorsionDataset(ImmutableModel):
class Metric(ImmutableModel):
"""Summary metrics for a given force field and QM reference."""

rmsd: float
rmse: float
mean_error: float
js_distance: tuple[float, float]
Expand Down