Skip to content

Commit 138a29b

Browse files
authored
Merge pull request #92 from MDverse/feat/add-sequence-for-atlas
Feat/add sequence for atlas
2 parents 3cd7ed8 + 6bbfedb commit 138a29b

File tree

2 files changed

+64
-5
lines changed

2 files changed

+64
-5
lines changed

docs/atlas.md

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
# ATLAS
22

3-
ATLAS (Atlas of proTein moLecular dynAmicS) is an open-access data repository that gathers standardized molecular dynamics simulations of protein structures, accompanied by their analysis in the form of interactive diagrams and trajectory visualisation. All raw trajectories as well as the results of analysis are available for download.
3+
ATLAS (Atlas of proTein moLecular dynAmicS) is an open-access data repository that gathers standardized molecular dynamics simulations of protein structures,
4+
accompanied by their analysis in the form of interactive diagrams and trajectory visualisation.
5+
All raw trajectories as well as the results of analysis are available for download.
46

57
- website: <https://www.dsimb.inserm.fr/ATLAS/>
68
- publication: [ATLAS: protein flexibility description from atomistic molecular dynamics simulations](https://academic.oup.com/nar/article/52/D1/D384/7438909), Nucleic Acids Research, 2024.
@@ -38,7 +40,8 @@ Example with dataset id `1k5n_A`:
3840
Remarks:
3941

4042
- The title of the dataset is the protein name.
41-
- No comment or description is provided. We used the organism as description.
43+
- No comment or description is provided. We used the organism name as the description.
44+
- Parameters of molecular dynamics simulations are provided through the API endpoint <https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters>.
4245

4346
### Metadata for files
4447

src/mdverse_scrapers/scrapers/atlas.py

Lines changed: 59 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,9 @@
1919
)
2020
from ..core.toolbox import print_statistics
2121
from ..models.dataset import DatasetMetadata
22-
from ..models.enums import DatasetSourceName
22+
from ..models.enums import DatasetSourceName, ExternalDatabaseName, MoleculeType
2323
from ..models.scraper import ScraperContext
24+
from ..models.simulation import ExternalIdentifier, ForceFieldModel, Molecule, Software
2425
from ..models.utils import (
2526
export_list_of_models_to_parquet,
2627
normalize_datasets_metadata,
@@ -40,6 +41,14 @@
4041
],
4142
"doi": "10.1093/nar/gkad1084", # https://academic.oup.com/nar/article/52/D1/D384/7438909
4243
"external_link": ["https://www.dsimb.inserm.fr/ATLAS/"],
44+
"software_name": "GROMACS", # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters
45+
"software_version": "v2019.6", # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters
46+
"forcefield_name": "CHARMM36m", # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters
47+
"forcefield_version": "July 2020", # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters
48+
"water_model": "TIP3P", # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters
49+
"simulation_temperature": 300, # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters
50+
"simulation_time": "100 ns", # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters
51+
"simulation_timestep": 2, # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters
4352
}
4453

4554

@@ -115,7 +124,7 @@ def extract_file_sizes_from_html(
115124
return files_metadata
116125

117126

118-
def scrape_metadata_for_a_dataset(
127+
def scrape_metadata_for_one_dataset(
119128
client: httpx.Client,
120129
chain_id: str,
121130
logger: "loguru.Logger" = loguru.logger,
@@ -165,6 +174,53 @@ def scrape_metadata_for_a_dataset(
165174
"doi": ATLAS_METADATA["doi"],
166175
"external_links": ATLAS_METADATA["external_link"],
167176
}
177+
# Add molecules.
178+
external_identifiers = []
179+
if meta_json.get("PDB"):
180+
external_identifiers.append(
181+
ExternalIdentifier(
182+
database_name=ExternalDatabaseName.PDB,
183+
identifier=meta_json["PDB"].split("_", maxsplit=1)[0],
184+
)
185+
)
186+
if meta_json.get("UniProt"):
187+
external_identifiers.append(
188+
ExternalIdentifier(
189+
database_name=ExternalDatabaseName.UNIPROT,
190+
identifier=meta_json["UniProt"],
191+
)
192+
)
193+
metadata["molecules"] = [
194+
Molecule(
195+
name=meta_json.get("protein_name"),
196+
sequence=meta_json.get("sequence"),
197+
external_identifiers=external_identifiers,
198+
type=MoleculeType.PROTEIN,
199+
)
200+
]
201+
# Add software.
202+
metadata["software"] = [
203+
Software(
204+
name=ATLAS_METADATA["software_name"],
205+
version=ATLAS_METADATA["software_version"],
206+
)
207+
]
208+
# Add forcefields and models.
209+
metadata["forcefields_models"] = [
210+
ForceFieldModel(
211+
name=ATLAS_METADATA["forcefield_name"],
212+
version=ATLAS_METADATA["forcefield_version"],
213+
),
214+
ForceFieldModel(name=ATLAS_METADATA["water_model"]),
215+
]
216+
# Add simulation temperature.
217+
metadata["simulation_temperatures_in_kelvin"] = [
218+
ATLAS_METADATA["simulation_temperature"]
219+
]
220+
# Add simulation time.
221+
metadata["simulation_times"] = [ATLAS_METADATA["simulation_time"]]
222+
# Add simulation time step.
223+
metadata["simulation_timesteps_in_fs"] = [ATLAS_METADATA["simulation_timestep"]]
168224
logger.info("Done.")
169225
return metadata
170226

@@ -223,7 +279,7 @@ def scrape_all_datasets(
223279
datasets_meta = []
224280
logger.info("Starting scraping of all datasets...")
225281
for pdb_counter, pdb_chain in enumerate(pdb_chains, start=1):
226-
metadata = scrape_metadata_for_a_dataset(client, pdb_chain, logger=logger)
282+
metadata = scrape_metadata_for_one_dataset(client, pdb_chain, logger=logger)
227283
if metadata:
228284
datasets_meta.append(metadata)
229285
logger.info(

0 commit comments

Comments
 (0)