Skip to content
This repository was archived by the owner on Mar 10, 2026. It is now read-only.

Commit 7a1426f

Browse files
authored
Merge pull request #81 from MDverse/update-molecule-model
feat: Add sequence and external_identifiers fields in the Molecule model
2 parents 476158b + e3630bc commit 7a1426f

File tree

4 files changed

+158
-11
lines changed

4 files changed

+158
-11
lines changed

src/mdverse_scrapers/models/enums.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,10 @@ class DatasetSourceName(StrEnum):
2020
ATLAS = "atlas"
2121
GPCRMD = "gpcrmd"
2222
NMRLIPIDS = "nmrlipids"
23+
24+
25+
class ExternalDatabaseName(StrEnum):
26+
"""External database names."""
27+
28+
PDB = "pdb"
29+
UNIPROT = "uniprot"

src/mdverse_scrapers/models/simulation.py

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,39 @@
55

66
from pydantic import BaseModel, ConfigDict, Field, StringConstraints, field_validator
77

8+
from .enums import ExternalDatabaseName
9+
810
DOI = Annotated[
911
str,
1012
StringConstraints(pattern=r"^10\.\d{4,9}/[\w\-.]+$"),
1113
]
1214

1315

16+
class ExternalIdentifier(BaseModel):
17+
"""External database identifier."""
18+
19+
# Ensure scraped metadata matches the expected schema exactly
20+
# and numbers are coerced to strings when needed.
21+
model_config = ConfigDict(extra="forbid", coerce_numbers_to_str=True)
22+
23+
database_name: ExternalDatabaseName = Field(
24+
...,
25+
description=(
26+
"Name of the external database. "
27+
"Allowed values are defined in ExternalDatabaseName enum. "
28+
"Examples: PDB, UNIPROT..."
29+
),
30+
)
31+
identifier: str = Field(
32+
...,
33+
min_length=1,
34+
description="Identifier in the external database.",
35+
)
36+
url: str | None = Field(
37+
None, min_length=1, description="Direct URL to the identifier into the database"
38+
)
39+
40+
1441
class Molecule(BaseModel):
1542
"""Molecule in a simulation."""
1643

@@ -19,15 +46,20 @@ class Molecule(BaseModel):
1946

2047
name: str = Field(..., description="Name of the molecule.")
2148
number_of_atoms: int | None = Field(
22-
None, ge=0, description="Number of atoms in the molecule, if known."
49+
None, ge=0, description="Number of atoms in the molecule."
2350
)
24-
formula: str | None = Field(
25-
None, description="Chemical formula of the molecule, if known."
51+
formula: str | None = Field(None, description="Chemical formula of the molecule.")
52+
sequence: str | None = Field(
53+
None, description="Sequence of the molecule for protein and nucleic acid."
2654
)
2755
number_of_molecules: int | None = Field(
2856
None,
2957
ge=0,
30-
description="Number of molecules of this type in the simulation, if known.",
58+
description="Number of molecules of this type in the simulation.",
59+
)
60+
external_identifiers: list[ExternalIdentifier] | None = Field(
61+
None,
62+
description=("List of external database identifiers for this molecule."),
3163
)
3264

3365

tests/models/test_simulation.py

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@
33
import pytest
44
from pydantic import ValidationError
55

6+
from mdverse_scrapers.models.enums import ExternalDatabaseName
67
from mdverse_scrapers.models.simulation import (
8+
ExternalIdentifier,
79
ForceFieldModel,
810
Molecule,
911
SimulationMetadata,
@@ -61,7 +63,16 @@ def test_structured_fields_creation():
6163
software=[Software(name="GROMACS", version="2023.1")],
6264
molecules=[
6365
Molecule(
64-
name="H2O", number_of_atoms=3, formula="H2O", number_of_molecules=100
66+
name="H2O",
67+
number_of_atoms=3,
68+
formula="H2O",
69+
number_of_molecules=100,
70+
sequence="PEPTIDE",
71+
external_identifiers=[
72+
ExternalIdentifier(
73+
database_name=ExternalDatabaseName.PDB, identifier="1ABC"
74+
)
75+
],
6576
)
6677
],
6778
forcefields_models=[ForceFieldModel(name="AMBER", version="ff14SB")],
@@ -70,17 +81,21 @@ def test_structured_fields_creation():
7081
assert metadata.molecules[0].number_of_atoms == 3
7182
assert metadata.molecules[0].number_of_molecules == 100
7283
assert metadata.forcefields_models[0].version == "ff14SB"
84+
assert metadata.molecules[0].sequence == "PEPTIDE"
85+
assert (
86+
metadata.molecules[0].external_identifiers[0].database_name
87+
== ExternalDatabaseName.PDB
88+
)
89+
assert metadata.molecules[0].external_identifiers[0].identifier == "1ABC"
7390

7491

7592
# -------------------------------------------------------------------
76-
# Test invalid values in structured fields
93+
# Test invalid fields
7794
# -------------------------------------------------------------------
78-
def test_invalid_molecule_number_of_atoms():
79-
"""Test that molecule number_of_atoms cannot be negative."""
95+
def test_invalid_fields():
96+
"""Test with a non-existing field."""
8097
with pytest.raises(ValidationError):
81-
SimulationMetadata(
82-
molecules=[Molecule(name="H2O", number_of_atoms=-1, formula="H2O")]
83-
)
98+
SimulationMetadata(total_number_of_something=1000)
8499

85100

86101
# -------------------------
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
"""Tests for the Pydantic molecule model."""
2+
3+
import pytest
4+
from pydantic import ValidationError
5+
6+
from mdverse_scrapers.models.enums import ExternalDatabaseName
7+
from mdverse_scrapers.models.simulation import (
8+
ExternalIdentifier,
9+
Molecule,
10+
)
11+
12+
13+
# -------------------------------------------------------------------
14+
# Test invalid fields
15+
# -------------------------------------------------------------------
16+
def test_invalid_fields():
17+
"""Test with a non-existing field."""
18+
with pytest.raises(ValidationError):
19+
Molecule(
20+
name="water",
21+
dummy_number=1000, # type: ignore
22+
)
23+
with pytest.raises(ValidationError):
24+
Molecule(
25+
name="water",
26+
dummy_str="dummy_value", # type: ignore
27+
)
28+
29+
30+
# -------------------------------------------------------------------
31+
# Test invalid values
32+
# -------------------------------------------------------------------
33+
def test_invalid_number_of_atoms():
34+
"""Test that number_of_atoms cannot be negative."""
35+
with pytest.raises(ValidationError):
36+
Molecule(name="H2O", number_of_atoms=-10, formula="H2O")
37+
38+
39+
def test_invalid_number_of_molecules():
40+
"""Test that number_of_molecules cannot be negative."""
41+
with pytest.raises(ValidationError):
42+
Molecule(name="H2O", number_of_molecules=-10, formula="H2O")
43+
44+
45+
# -------------------------------------------------------------------
46+
# Test ExternalIdentifier
47+
# -------------------------------------------------------------------
48+
@pytest.mark.parametrize(
49+
("database_name", "identifier", "expected_identifier", "url"),
50+
[
51+
(
52+
ExternalDatabaseName.PDB,
53+
"1K79",
54+
"1K79",
55+
"https://www.rcsb.org/structure/1K79",
56+
),
57+
(ExternalDatabaseName.PDB, 1234, "1234", None),
58+
(
59+
ExternalDatabaseName.UNIPROT,
60+
"P06213",
61+
"P06213",
62+
"https://www.uniprot.org/uniprotkb/P06213/entry",
63+
),
64+
(ExternalDatabaseName.UNIPROT, 123456, "123456", None),
65+
],
66+
)
67+
def test_external_identifier_creation(
68+
database_name, identifier, expected_identifier, url
69+
):
70+
"""Test creation of ExternalIdentifier instances."""
71+
external_identifier = ExternalIdentifier(
72+
database_name=database_name,
73+
identifier=identifier,
74+
url=url,
75+
)
76+
assert external_identifier.database_name == database_name
77+
assert external_identifier.identifier == expected_identifier
78+
assert external_identifier.url == url
79+
80+
81+
def test_invalid_database_name_in_external_identifiers():
82+
"""Test invalid database names."""
83+
# Invalid database name
84+
with pytest.raises(ValidationError):
85+
ExternalIdentifier(
86+
database_name="INVALID_DB", # type: ignore
87+
identifier="1ABC",
88+
)
89+
with pytest.raises(AttributeError):
90+
ExternalIdentifier(
91+
database_name=ExternalDatabaseName.DUMMY, # type: ignore
92+
identifier="1ABC",
93+
)

0 commit comments

Comments
 (0)