update conversion script and yaml definitions

Jhsmit · Jhsmit · commit 07ad2d0ad52d · 2025-07-31T11:25:11.000+02:00
diff --git a/tests/datasets/1744801204_SecA_cluster_Krishnamurthy/convert/convert.py b/tests/datasets/1744801204_SecA_cluster_Krishnamurthy/convert/convert.py
@@ -1,7 +1,8 @@
 # %%
 """convert v 0.2.x datasets to v 0.3.x dataset"""
 
-from hdxms_datasets.v2.models import (
+from hdxms_datasets.database import submit_dataset
+from hdxms_datasets.models import (
     Author,
     HDXDataSet,
     HDXState,
@@ -14,7 +15,9 @@
 
 from pathlib import Path
 import yaml
-from hdxms_datasets.v2.migration.v020 import get_peptides
+from hdxms_datasets.migration.v020 import get_peptides
+from hdxms_datasets.verification import verify_dataset
+from hdxms_datasets.view import StructureView
 # %%
 
 root = Path(__file__).parent
@@ -44,50 +47,60 @@
 )
 
 structure = Structure(
-    data_file=Path("data/SecA_monomer.pdb"),
+    data_file=root / "SecA_monomer.pdb",
     format="pdb",
     description="NMR structure with ligand removed in sillico",
     pdb_id="2VDA",
 )
 
 # %%
-states = list(hdx_spec["peptides"].keys())
+StructureView(structure)
 
 # %%
+orig_states = list(hdx_spec["peptides"].keys())
+# states = list(hdx_spec["peptides"].keys())
+orig_states
 
+# %%
 
 states_mapping = {
-    "WT ADP": "SecA-WT_ADP",
-    "Monomer ADP": "SecA-monomer_ADP",
-    "1-834 ADP": "SecA-1-834_ADP",
     "WT apo": "SecA-WT_apo",
     "Monomer apo": "SecA-monomer_apo",
     "1-834 apo": "SecA-1-834_apo",
+    "WT ADP": "SecA-WT_ADP",
+    "Monomer ADP": "SecA-monomer_ADP",
+    "1-834 ADP": "SecA-1-834_ADP",
 }
 
-
 # %%
 
 # load the seca state data as reference for protein data
 seca_state_dir = root.parent.parent / "1665149400_SecA_Krishnamurthy"
 json_str = (seca_state_dir / "dataset.json").read_text()
 
 # %%
-ds_state = HDXDataSet.model_validate_json(json_str)
-wt_state = ds_state.get_state("SecA-WT_ADP")
+ref_dataset = HDXDataSet.model_validate_json(json_str)
+wt_state = ref_dataset.get_state("SecA-WT_apo")
 wt_state.protein_state
 # %%
 
 hdx_states = []
-for state in states:
-    peptide_spec = hdx_spec["peptides"][state]
-    peptides = get_peptides(peptide_spec, hdx_spec["data_files"])
+for state in states_mapping:
     new_state_name = states_mapping[state]
 
-    ref_state = ds_state.get_state(new_state_name)
+    peptide_spec = hdx_spec["peptides"][state]
+    peptides = get_peptides(peptide_spec, hdx_spec["data_files"], root_dir=root.parent, chain=["B"])
+    # keep the fully deuterated (FD) control only for the WT apo state; the other states are expected to reuse that control
+    if new_state_name != "SecA-WT_apo":
+        peptides = [p for p in peptides if p.deuteration_type != "fully_deuterated"]
+    print(state)
+    for peptide in peptides:
+        print("state filter:", peptide.filters["State"])
+
+    ref_state = ref_dataset.get_state(new_state_name)
 
     hdx_state = HDXState(
-        name=state,
+        name=new_state_name,
         protein_state=ref_state.protein_state,
         peptides=peptides,
     )
@@ -96,15 +109,29 @@
 
 # %%
 
+
+# %%
 dataset = HDXDataSet(
-    description="SecA quiescent states dataset",
+    description="SecA quiescent states dataset (cluster data)",
     states=hdx_states,
     structure=structure,
     protein_identifiers=protein_info,
     metadata=metadata,
 )
 
-s = dataset.model_dump_json(indent=2, exclude_none=True)
-Path(root.parent / "dataset.json").write_text(s)
+# %%
+
+offending_state = dataset.get_state("SecA-1-834_apo")
+offending_state.peptides
+
+# %%
+verify_dataset(dataset)
+database_dir = root.parent
+# submit the dataset to our database
+success, msg_or_id = submit_dataset(dataset, database_dir)
+if success:
+    print(f"Dataset submitted successfully with ID: {msg_or_id}")
+else:
+    print(f"Failed to submit dataset: {msg_or_id}")
 
 # %%
diff --git a/tests/datasets/1744801204_SecA_cluster_Krishnamurthy/convert/hdx_spec.yaml b/tests/datasets/1744801204_SecA_cluster_Krishnamurthy/convert/hdx_spec.yaml
@@ -97,12 +97,12 @@ peptides:
     non_deuterated:
       data_file: SecA_cluster
       filters:
-        State: SecA1-834 apo
+        State: SecA1-901 wt apo
         Exposure: 0
     partially_deuterated:
       data_file: SecA_cluster
       filters:
-        State: SecA1-834 apo
+        State: SecA1-901 wt apo
         Exposure:
         - 0.167
         - 0.5