Skip to content

Commit 0ae4398

Browse files
committed
feat: converting dynamx cluster data to state data
- Introduced new test fixtures for cluster and state data using read_dynamx. - Added a test for converting dynamx cluster data to state data, validating the output against expected metrics.
1 parent dac2f68 commit 0ae4398

File tree

6 files changed

+37079
-16
lines changed

6 files changed

+37079
-16
lines changed

examples/read_csv_data.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,28 @@
11
from pathlib import Path
2+
from hdxms_datasets.process import dynamx_cluster_to_state
23
from hdxms_datasets.reader import read_dynamx
4+
import narwhals as nw
35

46

57
# %%
68
ROOT = Path(__file__).parent.parent
79
TEST_PTH = ROOT / "tests"
810
DATA_ID = "1665149400_SecA_Krishnamurthy"
11+
12+
# read a state data file
913
csv_file = TEST_PTH / "datasets" / DATA_ID / "data" / "SecA.csv"
1014
csv_dynamx = read_dynamx(csv_file).to_native()
15+
csv_dynamx
16+
# %%
1117

18+
# read a cluster data file, select a single state and convert to state data
19+
csv_file = TEST_PTH / "test_data" / "quiescent state cluster data.csv"
20+
cluster_data = read_dynamx(csv_file)
21+
22+
state = "SecA1-901 wt apo"
23+
cluster_data = cluster_data.filter(nw.col("state") == state)
24+
25+
converted_state_data = dynamx_cluster_to_state(cluster_data)
26+
converted_state_data.to_native()
1227

13-
csv_dynamx
1428
# %%

hdxms_datasets/process.py

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,14 @@
11
from __future__ import annotations
22

3+
from collections import defaultdict
34
from pathlib import Path
45
from typing import Literal, Optional, Union, TYPE_CHECKING
56

67
import narwhals as nw
8+
from statsmodels.stats.weightstats import DescrStatsW
9+
from uncertainties import ufloat, Variable
10+
11+
from hdxms_datasets.backend import BACKEND
712

813

914
if TYPE_CHECKING:
@@ -12,6 +17,128 @@
1217

1318
TIME_FACTORS = {"s": 1, "m": 60.0, "min": 60.0, "h": 3600, "d": 86400}
1419
TEMPERATURE_OFFSETS = {"c": 273.15, "celsius": 273.15, "k": 0.0, "kelvin": 0.0}
20+
PROTON_MASS = 1.0072764665789
21+
22+
23+
STATE_DATA_COLUMN_ORDER = [
24+
"protein",
25+
"start",
26+
"end",
27+
"stop",
28+
"sequence",
29+
"modification",
30+
"fragment",
31+
"maxuptake",
32+
"mhp",
33+
"state",
34+
"exposure",
35+
"center",
36+
"center_sd",
37+
"uptake",
38+
"uptake_sd",
39+
"rt",
40+
"rt_sd",
41+
]
42+
43+
44+
def ufloat_stats(array, weights) -> Variable:
45+
"""Calculate the weighted mean and standard deviation."""
46+
weighted_stats = DescrStatsW(array, weights=weights, ddof=0)
47+
return ufloat(weighted_stats.mean, weighted_stats.std)
48+
49+
50+
def records_to_dict(records: list[dict]) -> dict[str, list]:
51+
"""
52+
Convert a list of records to a dictionary of lists.
53+
54+
Args:
55+
records: List of dictionaries.
56+
57+
Returns:
58+
Dictionary with keys as column names and values as lists.
59+
"""
60+
61+
df_dict = defaultdict(list)
62+
for record in records:
63+
for key, value in record.items():
64+
df_dict[key].append(value)
65+
66+
return dict(df_dict)
67+
68+
69+
def dynamx_cluster_to_state(cluster_data: nw.DataFrame, nd_exposure: float = 0.0) -> nw.DataFrame:
70+
"""
71+
convert dynamx cluster data to state data
72+
must contain only a single state
73+
"""
74+
75+
assert len(cluster_data["state"].unique()) == 1, "Multiple states found in data"
76+
77+
# determine undeuterated masses per peptide
78+
nd_data = cluster_data.filter(nw.col("exposure") == nd_exposure)
79+
nd_peptides: list[tuple[int, int]] = sorted(
80+
{(start, end) for start, end in zip(nd_data["start"], nd_data["end"])}
81+
)
82+
83+
peptides_nd_mass = {}
84+
for p in nd_peptides:
85+
start, end = p
86+
df_nd_peptide = nd_data.filter((nw.col("start") == start) & (nw.col("end") == end))
87+
88+
masses = df_nd_peptide["z"] * (df_nd_peptide["center"] - PROTON_MASS)
89+
nd_mass = ufloat_stats(masses, df_nd_peptide["inten"])
90+
91+
peptides_nd_mass[p] = nd_mass
92+
93+
groups = cluster_data.group_by(["start", "end", "exposure"])
94+
unique_columns = [
95+
"end",
96+
"exposure",
97+
"fragment",
98+
"maxuptake",
99+
"mhp",
100+
"modification",
101+
"protein",
102+
"sequence",
103+
"start",
104+
"state",
105+
"stop",
106+
]
107+
records = []
108+
for (start, end, exposure), df_group in groups:
109+
record = {col: df_group[col][0] for col in unique_columns}
110+
111+
rt = ufloat_stats(df_group["rt"], df_group["inten"])
112+
record["rt"] = rt.nominal_value
113+
record["rt_sd"] = rt.std_dev
114+
115+
# state data 'center' is mass as if |charge| would be 1
116+
center = ufloat_stats(
117+
df_group["z"] * (df_group["center"] - PROTON_MASS) + PROTON_MASS, df_group["inten"]
118+
)
119+
record["center"] = center.nominal_value
120+
record["center_sd"] = center.std_dev
121+
122+
masses = df_group["z"] * (df_group["center"] - PROTON_MASS)
123+
exp_mass = ufloat_stats(masses, df_group["inten"])
124+
125+
if (start, end) in peptides_nd_mass:
126+
uptake = exp_mass - peptides_nd_mass[(start, end)]
127+
record["uptake"] = uptake.nominal_value
128+
record["uptake_sd"] = uptake.std_dev
129+
else:
130+
record["uptake"] = None
131+
record["uptake_sd"] = None
132+
133+
records.append(record)
134+
135+
d = records_to_dict(records)
136+
df = nw.from_dict(d, backend=BACKEND)
137+
138+
if set(df.columns) == set(STATE_DATA_COLUMN_ORDER):
139+
df = df[STATE_DATA_COLUMN_ORDER]
140+
141+
return df
15142

16143

17144
# overload typing to get correct return type

pyproject.toml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,13 @@ classifiers = [
1818
"Intended Audience :: Science/Research",
1919
]
2020

21-
dependencies = ["narwhals", "PyYAML", "requests"]
21+
dependencies = [
22+
"narwhals",
23+
"PyYAML",
24+
"requests",
25+
'statsmodels',
26+
'uncertainties',
27+
]
2228

2329
dynamic = ["version"]
2430

0 commit comments

Comments
 (0)