|
1 | 1 | # HDXMS Datasets |
2 | 2 |
|
| 3 | +Welcome to the HDXMS datasets repository. |
3 | 4 |
|
4 | | -* Free software: MIT license |
| 5 | +The `hdxms-datasets` package provides tools for handling HDX-MS datasets. |
5 | 6 |
|
6 | | -## Installation |
| 7 | +The package offers the following features: |
7 | 8 |
|
8 | | -```bash |
9 | | -$ pip install hdxms-datasets |
10 | | -``` |
| 9 | + - Defining datasets and their experimental metadata |
| 10 | + - Verification of datasets and metadata |
| 11 | + - Loading datasets from a local or remote (WIP) database |
| 12 | + - Conversion of datasets from various formats (e.g., DynamX, HDExaminer) to a standardized format |
| 13 | + - Propagation of standard deviations from replicates to fractional relative uptake values |
11 | 14 |
|
12 | | -## HDX-MS database |
13 | 15 |
|
14 | | -Currently a beta test database is set up at: |
15 | | -https://github.com/Jhsmit/HDX-MS-datasets |
| 16 | +## Example Usage |
16 | 17 |
|
17 | | -## Using HDX-MS datasets |
| 18 | +```python {title="Loading a dataset"} |
18 | 19 |
|
19 | | -### Example code |
| 20 | +from hdxms_datasets import DataBase |
20 | 21 |
|
| 22 | +db = DataBase('path/to/local_db') |
| 23 | +dataset = db.get_dataset('HDX_D9096080') |
21 | 24 |
|
22 | | -```python |
23 | | -from pathlib import Path |
24 | | -from hdxms_datasets import DataVault |
| 25 | +# Protein identifier information |
| 26 | +print(dataset.protein_identifiers.uniprot_entry_name) |
| 27 | +#> 'SECB_ECOLI' |
25 | 28 |
|
26 | | -# local path the download datasets to |
27 | | -cache_dir = Path('.cache') |
| 29 | +# Access HDX states |
| 30 | +print([state.name for state in dataset.states]) |
| 31 | +#> ['Tetramer', 'Dimer'] |
28 | 32 |
|
29 | | -# create a vault with local cache dir, set `remote_url` to connect to a different database |
30 | | -vault = DataVault(cache_dir=cache_dir) |
| 33 | +# Get the sequence of the first state |
| 34 | +state = dataset.states[0] |
| 35 | +print(state.protein_state.sequence) |
| 36 | +#> 'MSEQNNTEMTFQIQRIYT...' |
31 | 37 |
|
32 | | -# Download a specific HDX dataset |
33 | | -vault.fetch_dataset("20221007_1530_SecA_Krishnamurthy") |
| 38 | +# Load peptides |
| 39 | +peptides = state.peptides[0] |
34 | 40 |
|
35 | | -# Load the dataset |
36 | | -ds = vault.load_dataset("20221007_1530_SecA_Krishnamurthy") |
| 41 | +# Access peptide information |
| 42 | +print(peptides.deuteration_type, peptides.pH, peptides.temperature) |
| 43 | +#> DeuterationType.partially_deuterated 8.0 303.15 |
37 | 44 |
|
38 | | -# Load the FD control of the first 'state' in the dataset. |
39 | | -fd_control = ds.load_peptides(0, "FD_control") |
| 45 | +# Load the peptide table as a standardized narwhals DataFrame |
| 46 | +df = peptides.load( |
| 47 | + convert=True, # convert column header names to open hdx standard |
| 48 | + aggregate=True, # aggregate centroids / uptake values across replicates |
| 49 | +) |
40 | 50 |
|
41 | | -# Load the corresponding experimental peptides. |
42 | | -peptides = ds.load_peptides(0, "experiment") |
| 51 | +print(df.columns) |
| 52 | +#> ['start', 'end', 'sequence', 'state', 'exposure', 'centroid_mz', 'rt', 'rt_sd', 'uptake', ... |
43 | 53 |
|
44 | 54 | ``` |
45 | 55 |
|
46 | | -## Web infterface |
| 56 | +```python {title="Define a set of peptides for a state"} |
| 57 | +from hdxms_datasets import ProteinState, Peptides, verify_sequence, merge_peptides, compute_uptake_metrics |
| 58 | + |
| 59 | +# Define the protein state |
| 60 | +protein_state = ProteinState( |
| 61 | + sequence="MSEQNNTEMTFQIQRIYTKDISFEAPNAPHVFQKDWQPEVKLDLDTASSQLADDVYEVVLRVTVTASLGEETAFLCEVQQGGIFSIAGIEGTQMAHCLGAYCPNILFPYARECITSMVSRGTFPQLNLAPVNFDALFMNYLQQQAGEGTEEHQDA", |
| 62 | + n_term=1, |
| 63 | + c_term=155, |
| 64 | + oligomeric_state=4, |
| 65 | +) |
| 66 | + |
| 67 | +# Define the partially deuterated peptides for the SecB state |
| 68 | +pd_peptides = Peptides( |
| 69 | + data_file=data_dir / "ecSecB_apo.csv", |
| 70 | + data_format=PeptideFormat.DynamX_v3_state, |
| 71 | + deuteration_type=DeuterationType.partially_deuterated, |
| 72 | + filters={ |
| 73 | + "State": "SecB WT apo", |
| 74 | + "Exposure": [0.167, 0.5, 1.0, 10.0, 100.000008], |
| 75 | + }, |
| 76 | + pH=8.0, |
| 77 | + temperature=303.15, |
| 78 | + d_percentage=90.0, |
| 79 | +) |
| 80 | + |
| 81 | +# check for differences between the protein state sequence and the peptide sequences |
| 82 | +mismatches = verify_sequence(pd_peptides.load(), protein_state.sequence, n_term=protein_state.n_term) |
| 83 | +print(mismatches) |
| 84 | +#> [] # sequences match |
| 85 | + |
| 86 | +# Define the fully deuterated peptides for the SecB state |
| 87 | +fd_peptides = Peptides( |
| 88 | + data_file=data_dir / "ecSecB_apo.csv", |
| 89 | + data_format=PeptideFormat.DynamX_v3_state, |
| 90 | + deuteration_type=DeuterationType.fully_deuterated, |
| 91 | + filters={ |
| 92 | + "State": "Full deuteration control", |
| 93 | + "Exposure": 0.167, |
| 94 | + }, |
| 95 | +) |
| 96 | + |
| 97 | +# merge both peptides together in a single dataframe |
| 98 | +merged = merge_peptides([pd_peptides, fd_peptides]) |
| 99 | +print(merged.columns) |
| 100 | +#> ['start', 'end', 'sequence', ... 'uptake', 'uptake_sd', 'fd_uptake', 'fd_uptake_sd'] |
| 101 | + |
| 102 | +# compute uptake metrics for the merged peptides |
| 103 | +# this function computes uptake from centroid mass if not present |
| 104 | +# as well as fractional uptake |
| 105 | +processed = compute_uptake_metrics(merged) |
| 106 | +print(processed.columns) |
| 107 | +#> ['start', 'end', 'sequence', ... 'uptake', 'uptake_sd', 'fd_uptake', 'fd_uptake_sd', 'fractional_uptake', 'fractional_uptake_sd'] |
47 | 108 |
|
48 | | -To run the web interface: |
49 | | -(requires a local clone of the code) |
| 109 | +``` |
| 110 | + |
| 111 | +## Installation |
50 | 112 |
|
51 | 113 | ```bash |
52 | | -solara run hdxms_datasets/web/upload_form.py --production |
53 | | -``` |
| 114 | +$ pip install hdxms-datasets |
| 115 | +``` |
0 commit comments