|
19 | 19 | ) |
20 | 20 | from ..core.toolbox import print_statistics |
21 | 21 | from ..models.dataset import DatasetMetadata |
22 | | -from ..models.enums import DatasetSourceName |
| 22 | +from ..models.enums import DatasetSourceName, ExternalDatabaseName, MoleculeType |
23 | 23 | from ..models.scraper import ScraperContext |
| 24 | +from ..models.simulation import ExternalIdentifier, ForceFieldModel, Molecule, Software |
24 | 25 | from ..models.utils import ( |
25 | 26 | export_list_of_models_to_parquet, |
26 | 27 | normalize_datasets_metadata, |
|
40 | 41 | ], |
41 | 42 | "doi": "10.1093/nar/gkad1084", # https://academic.oup.com/nar/article/52/D1/D384/7438909 |
42 | 43 | "external_link": ["https://www.dsimb.inserm.fr/ATLAS/"], |
| 44 | + "software_name": "GROMACS", # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters |
| 45 | + "software_version": "v2019.6", # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters |
| 46 | + "forcefield_name": "CHARMM36m", # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters |
| 47 | + "forcefield_version": "July 2020", # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters |
| 48 | + "water_model": "TIP3P", # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters |
| 49 | + "simulation_temperature": 300, # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters |
| 50 | + "simulation_time": "100 ns", # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters |
| 51 | + "simulation_timestep": 2, # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters |
43 | 52 | } |
44 | 53 |
|
45 | 54 |
|
@@ -115,7 +124,7 @@ def extract_file_sizes_from_html( |
115 | 124 | return files_metadata |
116 | 125 |
|
117 | 126 |
|
118 | | -def scrape_metadata_for_a_dataset( |
| 127 | +def scrape_metadata_for_one_dataset( |
119 | 128 | client: httpx.Client, |
120 | 129 | chain_id: str, |
121 | 130 | logger: "loguru.Logger" = loguru.logger, |
@@ -165,6 +174,53 @@ def scrape_metadata_for_a_dataset( |
165 | 174 | "doi": ATLAS_METADATA["doi"], |
166 | 175 | "external_links": ATLAS_METADATA["external_link"], |
167 | 176 | } |
| 177 | + # Add molecules. |
| 178 | + external_identifiers = [] |
| 179 | + if meta_json.get("PDB"): |
| 180 | + external_identifiers.append( |
| 181 | + ExternalIdentifier( |
| 182 | + database_name=ExternalDatabaseName.PDB, |
| 183 | + identifier=meta_json["PDB"].split("_", maxsplit=1)[0], |
| 184 | + ) |
| 185 | + ) |
| 186 | + if meta_json.get("UniProt"): |
| 187 | + external_identifiers.append( |
| 188 | + ExternalIdentifier( |
| 189 | + database_name=ExternalDatabaseName.UNIPROT, |
| 190 | + identifier=meta_json["UniProt"], |
| 191 | + ) |
| 192 | + ) |
| 193 | + metadata["molecules"] = [ |
| 194 | + Molecule( |
| 195 | + name=meta_json.get("protein_name"), |
| 196 | + sequence=meta_json.get("sequence"), |
| 197 | + external_identifiers=external_identifiers, |
| 198 | + type=MoleculeType.PROTEIN, |
| 199 | + ) |
| 200 | + ] |
| 201 | + # Add software. |
| 202 | + metadata["software"] = [ |
| 203 | + Software( |
| 204 | + name=ATLAS_METADATA["software_name"], |
| 205 | + version=ATLAS_METADATA["software_version"], |
| 206 | + ) |
| 207 | + ] |
| 208 | + # Add forcefields and models. |
| 209 | + metadata["forcefields_models"] = [ |
| 210 | + ForceFieldModel( |
| 211 | + name=ATLAS_METADATA["forcefield_name"], |
| 212 | + version=ATLAS_METADATA["forcefield_version"], |
| 213 | + ), |
| 214 | + ForceFieldModel(name=ATLAS_METADATA["water_model"]), |
| 215 | + ] |
| 216 | + # Add simulation temperature. |
| 217 | + metadata["simulation_temperatures_in_kelvin"] = [ |
| 218 | + ATLAS_METADATA["simulation_temperature"] |
| 219 | + ] |
| 220 | + # Add simulation time. |
| 221 | + metadata["simulation_times"] = [ATLAS_METADATA["simulation_time"]] |
| 222 | + # Add simulation time step. |
| 223 | + metadata["simulation_timesteps_in_fs"] = [ATLAS_METADATA["simulation_timestep"]] |
168 | 224 | logger.info("Done.") |
169 | 225 | return metadata |
170 | 226 |
|
@@ -223,7 +279,7 @@ def scrape_all_datasets( |
223 | 279 | datasets_meta = [] |
224 | 280 | logger.info("Starting scraping of all datasets...") |
225 | 281 | for pdb_counter, pdb_chain in enumerate(pdb_chains, start=1): |
226 | | - metadata = scrape_metadata_for_a_dataset(client, pdb_chain, logger=logger) |
| 282 | + metadata = scrape_metadata_for_one_dataset(client, pdb_chain, logger=logger) |
227 | 283 | if metadata: |
228 | 284 | datasets_meta.append(metadata) |
229 | 285 | logger.info( |
|
0 commit comments