Skip to content

Commit 1506c7d

Browse files
authored
chore: code changes for test-case updates (#62)
1 parent 4b92935 commit 1506c7d

21 files changed

+3919
-1154
lines changed

src/mlipaudit/benchmarks/folding_stability/folding_stability.py

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,13 @@
4444
"orexin_beta_1cq0_nmr",
4545
]
4646

47+
BOX_SIZES = {
48+
"chignolin_1uao_xray": [23.98, 22.45, 20.68],
49+
"trp_cage_2jof_xray": [29.33, 29.74, 23.59],
50+
"amyloid_beta_1ba6_nmr": [51.90, 33.74, 39.50],
51+
"orexin_beta_1cq0_nmr": [40.30, 29.56, 33.97],
52+
}
53+
4754
SIMULATION_CONFIG = {
4855
"num_steps": 100_000,
4956
"snapshot_interval": 100,
@@ -182,9 +189,13 @@ def run_model(self) -> None:
182189
simulation_states=[],
183190
)
184191

185-
structure_names = (
186-
STRUCTURE_NAMES[:1] if self.run_mode == RunMode.DEV else STRUCTURE_NAMES
187-
)
192+
if self.run_mode == RunMode.DEV:
193+
structure_names = STRUCTURE_NAMES[:1]
194+
elif self.run_mode == RunMode.FAST:
195+
structure_names = STRUCTURE_NAMES[:2]
196+
else:
197+
structure_names = STRUCTURE_NAMES
198+
188199
if self.run_mode == RunMode.DEV:
189200
md_kwargs = SIMULATION_CONFIG_FAST
190201
else:
@@ -195,7 +206,9 @@ def run_model(self) -> None:
195206
xyz_filename = structure_name + ".xyz"
196207
atoms = ase_read(self.data_input_dir / self.name / xyz_filename)
197208

198-
md_engine = get_simulation_engine(atoms, self.force_field, **md_kwargs)
209+
md_engine = get_simulation_engine(
210+
atoms, self.force_field, box=BOX_SIZES[structure_name], **md_kwargs
211+
)
199212
md_engine.run()
200213

201214
final_state = md_engine.state
@@ -239,11 +252,18 @@ def analyze(self) -> FoldingStabilityResult:
239252
topology_filename = structure_name + ".pdb"
240253
ref_filename = structure_name + "_ref.pdb"
241254

242-
mdtraj_traj = create_mdtraj_trajectory_from_simulation_state(
255+
mdtraj_traj_solv = create_mdtraj_trajectory_from_simulation_state(
243256
simulation_state,
244257
topology_path=self.data_input_dir / self.name / topology_filename,
245258
)
246-
ase_traj = create_ase_trajectory_from_simulation_state(simulation_state)
259+
ase_traj_solv = create_ase_trajectory_from_simulation_state(
260+
simulation_state
261+
)
262+
263+
non_solvent_idx = mdtraj_traj_solv.top.select("not resname HOH")
264+
265+
mdtraj_traj = mdtraj_traj_solv.atom_slice(non_solvent_idx)
266+
ase_traj = [atoms[non_solvent_idx] for atoms in ase_traj_solv]
247267

248268
# 1. Radius of gyration
249269
rg_values = [

src/mlipaudit/benchmarks/small_molecule_minimization/small_molecule_minimization.py

Lines changed: 2 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -37,13 +37,9 @@
3737

3838
logger = logging.getLogger("mlipaudit")
3939

40-
QM9_NEUTRAL_FILENAME = "qm9_n100_neutral.json"
41-
QM9_CHARGED_FILENAME = "qm9_n10_charged.json"
42-
OPENFF_NEUTRAL_FILENAME = "openff_n100_neutral.json"
43-
OPENFF_CHARGED_FILENAME = "openff_n10_charged.json"
40+
OPENFF_NEUTRAL_FILENAME = "openff_n200_neutral.json"
41+
OPENFF_CHARGED_FILENAME = "openff_n20_charged.json"
4442
DATASET_PREFIXES = [
45-
"qm9_neutral",
46-
"qm9_charged",
4743
"openff_neutral",
4844
"openff_charged",
4945
]
@@ -107,14 +103,10 @@ class SmallMoleculeMinimizationModelOutput(ModelOutput):
107103
"""ModelOutput object for small molecule conformer minimization benchmark.
108104
109105
Attributes:
110-
qm9_neutral: A list of simulation states for each molecule in the dataset.
111-
qm9_charged: A list of simulation states for each molecule in the dataset.
112106
openff_neutral: A list of simulation states for each molecule in the dataset.
113107
openff_charged: A list of simulation states for each molecule in the dataset.
114108
"""
115109

116-
qm9_neutral: list[MoleculeSimulationOutput]
117-
qm9_charged: list[MoleculeSimulationOutput]
118110
openff_neutral: list[MoleculeSimulationOutput]
119111
openff_charged: list[MoleculeSimulationOutput]
120112

@@ -141,17 +133,13 @@ class SmallMoleculeMinimizationResult(BenchmarkResult):
141133
"""Results object for small molecule minimization benchmark.
142134
143135
Attributes:
144-
qm9_neutral: The results for the qm9 neutral dataset.
145-
qm9_charged: The results for the qm9 charged dataset.
146136
openff_neutral: The results for the openff neutral dataset.
147137
openff_charged: The results for the openff charged dataset.
148138
avg_rmsd: The average rmsd across all datasets.
149139
score: The final score for the benchmark between
150140
0 and 1.
151141
"""
152142

153-
qm9_neutral: SmallMoleculeMinimizationDatasetResult
154-
qm9_charged: SmallMoleculeMinimizationDatasetResult
155143
openff_neutral: SmallMoleculeMinimizationDatasetResult
156144
openff_charged: SmallMoleculeMinimizationDatasetResult
157145
avg_rmsd: NonNegativeFloat | None = None
@@ -200,8 +188,6 @@ def run_model(self) -> None:
200188
md_kwargs = SIMULATION_CONFIG
201189

202190
self.model_output = SmallMoleculeMinimizationModelOutput(
203-
qm9_neutral=[],
204-
qm9_charged=[],
205191
openff_neutral=[],
206192
openff_charged=[],
207193
)
@@ -354,14 +340,6 @@ def _load_dataset_from_file(self, filename: str) -> dict[str, Molecule]:
354340

355341
return dataset
356342

357-
@functools.cached_property
358-
def _qm9_neutral_dataset(self) -> dict[str, Molecule]:
359-
return self._load_dataset_from_file(QM9_NEUTRAL_FILENAME)
360-
361-
@functools.cached_property
362-
def _qm9_charged_dataset(self) -> dict[str, Molecule]:
363-
return self._load_dataset_from_file(QM9_CHARGED_FILENAME)
364-
365343
@functools.cached_property
366344
def _openff_neutral_dataset(self) -> dict[str, Molecule]:
367345
return self._load_dataset_from_file(OPENFF_NEUTRAL_FILENAME)

src/mlipaudit/benchmarks/stability/stability.py

Lines changed: 54 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -58,50 +58,57 @@ class StructureMetadata(TypedDict):
5858

5959

6060
STRUCTURES: dict[str, StructureMetadata] = {
61-
# Small proteins/peptides
62-
"1JRS_Leupeptin": {
63-
"xyz": "71_1jrs_leupeptin.xyz",
64-
"pdb": "71_1jrs_leupeptin.pdb",
65-
"description": "Leupeptin inhibitor (71 atoms)",
61+
# Small molecules in vacuum
62+
"Small_molecule_HCNO": {
63+
"xyz": "small_molecule_HCNO.xyz",
64+
"pdb": "small_molecule_HCNO.pdb",
65+
"description": "Small molecule (HCNO)",
6666
},
67-
"Chignolin": {
68-
"xyz": "138_1uao_chignolin.xyz",
69-
"pdb": "138_1uao_chignolin.pdb",
70-
"description": "Chignolin peptide (138 atoms)",
67+
"Small_molecule_Sulfur": {
68+
"xyz": "small_molecule_S.xyz",
69+
"pdb": "small_molecule_S.pdb",
70+
"description": "Small molecule (contains Sulfur)",
7171
},
72-
"RNA fragment": {
73-
"xyz": "168_1p79_RNA.xyz",
74-
"pdb": "168_1p79_RNA.pdb",
75-
"description": "RNA fragment (168 atoms)",
72+
"Small_molecule_Halogen": {
73+
"xyz": "small_molecule_Hal.xyz",
74+
"pdb": "small_molecule_Hal.pdb",
75+
"description": "Small molecule (contains Halogens)",
7676
},
77-
# Medium proteins
78-
"5KGZ": {
79-
"xyz": "634_5kgz.xyz",
80-
"pdb": "634_5kgz.pdb",
81-
"description": "Protein structure (634 atoms)",
77+
# Peptides in vacuum
78+
"Peptide_HCNO": {
79+
"xyz": "peptide_HCNO.xyz",
80+
"pdb": "peptide_HCNO.pdb",
81+
"description": "Neurotensin in vacuum (PDB: 2LNF)",
8282
},
83-
"1AB7": {
84-
"xyz": "1432_1ab7.xyz",
85-
"pdb": "1432_1ab7.pdb",
86-
"description": "Protein structure (1,432 atoms)",
83+
"Peptide_cys": {
84+
"xyz": "peptide_cys.xyz",
85+
"pdb": "peptide_cys.pdb",
86+
"description": "Cyclic peptide with cysteines in vacuum (Oxytocin; PDB: 7OFG)",
8787
},
88-
"1BIP": {
89-
"xyz": "1818_1bip.xyz",
90-
"pdb": "1818_1bip.pdb",
91-
"description": "Protein structure (1,818 atoms)",
88+
# Medium protein in vacuum
89+
"Protein": {
90+
"xyz": "protein_1a7m.xyz",
91+
"pdb": "protein_1a7m.pdb",
92+
"description": "Protein structure in vacuum (PDB: 1A7M)",
9293
},
93-
"1A5E": {
94-
"xyz": "2301_1a5e.xyz",
95-
"pdb": "2301_1a5e.pdb",
96-
"description": "Protein structure (2,301 atoms)",
94+
# Solvated systems
95+
"Peptide_solvated": {
96+
"xyz": "peptide_solv.xyz",
97+
"pdb": "peptide_solv.pdb",
98+
"description": "Solvated Oxytocin (PDB: 7OFG)",
9799
},
98-
"1A7M": {
99-
"xyz": "2803_1a7m.xyz",
100-
"pdb": "2803_1a7m.pdb",
101-
"description": "Protein structure (2,803 atoms)",
100+
"Peptide_solvated_ions": {
101+
"xyz": "peptide_solv_ion.xyz",
102+
"pdb": "peptide_solv_ion.pdb",
103+
"description": "Solvated Neurotensin with counter-ions (PDB: 2LNF)",
102104
},
103105
}
104106

107+
BOX_SIZES = {
108+
"Peptide_solvated": [23.43, 28.96, 20.90],
109+
"Peptide_solvated_ions": [25.62, 27.89, 37.36],
110+
}
111+
105112
STRUCTURE_NAMES = list(STRUCTURES.keys())
106113

107114

@@ -376,7 +383,7 @@ class StabilityBenchmark(Benchmark):
376383
result_class = StabilityResult
377384
model_output_class = StabilityModelOutput
378385

379-
required_elements = {"N", "H", "O", "S", "P", "C"}
386+
required_elements = {"N", "H", "O", "S", "P", "C", "Cl", "F"}
380387

381388
def run_model(self) -> None:
382389
"""Run MD for each structure.
@@ -392,16 +399,24 @@ def run_model(self) -> None:
392399
if self.run_mode == RunMode.DEV:
393400
structure_names = STRUCTURE_NAMES[:2]
394401
elif self.run_mode == RunMode.FAST:
395-
structure_names = STRUCTURE_NAMES[:4]
402+
structure_names = STRUCTURE_NAMES[:5]
396403

397404
for structure_name in structure_names:
398405
logger.info("Running MD for %s", structure_name)
399406
xyz_filename = STRUCTURES[structure_name]["xyz"]
400407
atoms = ase_read(self.data_input_dir / self.name / xyz_filename)
401408

402-
md_engine = get_simulation_engine(
403-
atoms, self.force_field, **self._md_kwargs
404-
)
409+
if structure_name in BOX_SIZES:
410+
md_engine = get_simulation_engine(
411+
atoms,
412+
self.force_field,
413+
box=BOX_SIZES[structure_name],
414+
**self._md_kwargs,
415+
)
416+
else:
417+
md_engine = get_simulation_engine(
418+
atoms, self.force_field, **self._md_kwargs
419+
)
405420
md_engine.run()
406421

407422
final_state = md_engine.state

src/mlipaudit/ui/small_molecule_minimization.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ def small_molecule_minimization_page(
8484
"Small molecule energy minimization benchmark. We run energy"
8585
" minimizations with "
8686
"our MLIP starting from reference structures extracted from the"
87-
" QM9 dataset and "
87+
" OpenFF dataset and "
8888
"calculate after the minimization, how much the atomic positions"
8989
" of the "
9090
"heavy atoms deviate from the reference structure. The key metric"
@@ -96,12 +96,12 @@ def small_molecule_minimization_page(
9696

9797
st.markdown(
9898
"Here, we test this ability on a dataset of organic small molecules: "
99-
"the QM9 dataset and the OpenFF dataset. To be able to verfify the MLIP's"
99+
"the OpenFF dataset. To be able to verify the MLIP's "
100100
"ability to represent charged systems, we split the dataset into neutral "
101101
" and charged subsets. "
102102
"To ensure that the benchmark can be run within an acceptable time, we "
103-
"reduce the number of test structures to 100 for the neutral datasets"
104-
" and 10 for "
103+
"reduce the number of test structures to 200 for the neutral datasets"
104+
" and 20 for "
105105
"the charged datasets. The subsets are constructed so that the chemical "
106106
"diversity, "
107107
"as represented by Morgan fingerprints, is maximized. For each of these"

0 commit comments

Comments
 (0)