Skip to content

Commit 2e6dd36

Browse files
Merge pull request #13 from CompOmics/add_retention_time_margins
Add retention time margins & directLFQ quant
2 parents 85505c9 + af27b44 commit 2e6dd36

File tree

6 files changed

+940
-16
lines changed

6 files changed

+940
-16
lines changed

mumdia.py

Lines changed: 264 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,8 @@ def __getattr__(self, name):
6363
from prediction_wrappers.wrapper_ms2pip import (
6464
get_predictions_fragment_intensity_main_loop,
6565
)
66+
from quantification.lfq import quantify_fragments
67+
from utilities.plotting import plot_XIC_with_margins, plot_rt_margin_histogram
6668
from utilities.logger import log_info
6769

6870
# Re-export for backward compatibility
@@ -283,7 +285,7 @@ def run_mokapot(output_dir="results/") -> None:
283285
f"mokapot is not installed or failed to import ({e}). Skipping mokapot run."
284286
)
285287
return None
286-
psms = mokapot.read_pin(f"{output_dir}outfile.pin")
288+
psms = mokapot.read_pin(f"{output_dir}/outfile.pin")
287289

288290
model = KerasClassifier(
289291
build_fn=create_model, epochs=100, batch_size=1000, verbose=10
@@ -911,6 +913,233 @@ def extract_intensities(scannr, charge, calcmass):
911913
return df_psms
912914

913915

916+
def calculate_rt_margins_intensity_based(df_fragments: pl.DataFrame, intensity_threshold: float, output_dir='xics') -> pl.DataFrame:
917+
"""
918+
Calculate retention time margins based on a relative intensity threshold of the apex intensity fragment.
919+
The margins are determined by finding the retention times where the fragment intensity
920+
drops below the specified fraction of the apex intensity on both sides of the apex.
921+
If the intensity never drops below the threshold on one side, the margin is set to the
922+
first/last retention time where the most intense fragment was detected.
923+
The function also generates and saves a plot of the XIC with the calculated margins.
924+
925+
Parameters
926+
----------
927+
df_fragments : pl.DataFrame
928+
DataFrame containing fragment ion information for a single peptidoform.
929+
intensity_threshold : float
930+
Intensity threshold (as a fraction of apex intensity) to define retention time margins.
931+
output_dir : str
932+
Directory to save the XIC plots with margins.
933+
Returns
934+
-------
935+
left_bound : float
936+
Left retention time margin.
937+
right_bound : float
938+
Right retention time margin.
939+
apex_rt : float
940+
Retention time at apex intensity.
941+
"""
942+
943+
# Sort by rt
944+
df_sorted = df_fragments.sort("rt")
945+
# Find apex
946+
apex_idx = df_sorted["fragment_intensity"].arg_max()
947+
apex_rt = df_sorted["rt"][apex_idx]
948+
apex_intensity = df_sorted["fragment_intensity"][apex_idx]
949+
# Threshold value
950+
cutoff = intensity_threshold * apex_intensity
951+
apex_fragment_name = df_sorted["fragment_name"][apex_idx]
952+
953+
# Left of apex
954+
left_df = df_sorted.filter(pl.col("fragment_name") == apex_fragment_name) # only consider the apex fragment
955+
apex_idx_left = left_df["fragment_intensity"].arg_max()
956+
left_df = left_df[:apex_idx_left][::-1] # reverse to go from apex down
957+
left_bound = apex_rt
958+
959+
for rt, intensity in zip(left_df["rt"], left_df["fragment_intensity"]):
960+
if intensity < cutoff:
961+
left_bound = rt
962+
break
963+
964+
# if the left bound is still the apex rt, set it to the first rt where fragment was detected
965+
if left_bound == apex_rt and len(left_df) > 0:
966+
left_bound = left_df["rt"][-1]
967+
968+
# Right of apex
969+
right_df = df_sorted.filter(pl.col("fragment_name") == apex_fragment_name) # only consider the apex fragment
970+
apex_idx_right = right_df["fragment_intensity"].arg_max()
971+
right_df = right_df[apex_idx_right+1:]
972+
right_bound = apex_rt
973+
for rt, intensity in zip(right_df["rt"], right_df["fragment_intensity"]):
974+
if intensity < cutoff:
975+
right_bound = rt
976+
break
977+
978+
# if the right bound is still the apex rt, set it to the last rt where fragment was detected
979+
if right_bound == apex_rt and len(right_df) > 0:
980+
right_bound = right_df["rt"][-1]
981+
982+
# plot XIC with the margins
983+
# plot_XIC_with_margins(df_sorted, output_dir=output_dir, adapted_interval=(left_bound, right_bound), apex_rt=apex_rt, cutoff=cutoff)
984+
985+
return left_bound, right_bound, apex_rt
986+
987+
988+
def calculate_min_max_margins(df_psms: pl.DataFrame, df_fragments: pl.DataFrame, top_n: int = 100, intensity_threshold: float = 0.01) -> dict:
989+
"""
990+
Calculate the retention time distribution of the top N peptidoforms (with at least 6 PSMs, and then ranked by spectrum peptide q value)
991+
Min and max margins are defined as the 5th and 95th percentiles of the distribution of retention time margins
992+
across the top N peptidoforms.
993+
Returns a tuple with (min_diff, max_diff).
994+
995+
Parameters
996+
----------
997+
df_psms : pl.DataFrame
998+
DataFrame containing PSM information
999+
df_fragments : pl.DataFrame
1000+
DataFrame containing fragment ion information
1001+
top_n : int, optional
1002+
Number of top peptidoforms to consider based on the lowest 'peptide_q' value (default is 100).
1003+
intensity_threshold : float, optional
1004+
Intensity threshold (as a fraction of apex intensity) to define retention time margins (default is 0.01).
1005+
"""
1006+
1007+
# Step 1: Identify the 100 best scoring peptidoforms based on sage qvalue
1008+
# group by peptide and charge to get unique peptidoforms, aggregate number of PSMs, keep min peptide_q
1009+
1010+
df_top_peptidoforms = (
1011+
df_psms.group_by(["peptide", "charge"])
1012+
.agg(
1013+
[pl.count().alias("num_psms"), pl.min("peptide_q").alias("min_peptide_q")]
1014+
)
1015+
.sort("min_peptide_q")
1016+
)
1017+
1018+
# filter for peptidoforms with at least 6 PSMs
1019+
df_top_peptidoforms = df_top_peptidoforms.filter(pl.col("num_psms") >= 6)
1020+
1021+
# get the top N peptidoforms
1022+
df_top_peptidoforms = df_top_peptidoforms.head(top_n)
1023+
1024+
# Step 2: Extract the retention times of the entire XICs from df_fragments of these peptidoforms
1025+
df_fragments_top100 = df_fragments.filter(pl.col("peptide").is_in(df_top_peptidoforms["peptide"]) & pl.col("charge").is_in(df_top_peptidoforms["charge"]))
1026+
diffs = []
1027+
1028+
for (peptidoform, charge), df_fragments_top100_sub in tqdm(
1029+
df_fragments_top100.group_by(["peptide", "charge"])
1030+
):
1031+
left_bound, right_bound, apex_rt = calculate_rt_margins_intensity_based(df_fragments_top100_sub, intensity_threshold, output_dir='debug/calibration_xics')
1032+
left_diff = apex_rt - left_bound
1033+
right_diff = right_bound - apex_rt
1034+
diffs.append(left_diff)
1035+
diffs.append(right_diff)
1036+
1037+
# remove 0 diffs (if the apex is at the start or end of the XIC)
1038+
diffs = [d for d in diffs if d > 0]
1039+
1040+
# Step 3: Calculate the min and max retention times across all these XICs
1041+
if len(diffs) == 0:
1042+
log_info("Could not calibrate retention time margins, using default values.")
1043+
min_diff = 0.02
1044+
max_diff = 0.2
1045+
else:
1046+
# get 5th and 95th percentiles
1047+
min_diff = np.percentile(diffs, 5)
1048+
max_diff = np.percentile(diffs, 95)
1049+
log_info(f"Using min and max retention time margins: {min_diff}, {max_diff}")
1050+
1051+
# plot histogram of diffs
1052+
plot_rt_margin_histogram(diffs, output_dir='debug/calibration_xics', min_diff=min_diff, max_diff=max_diff)
1053+
1054+
return min_diff, max_diff
1055+
1056+
1057+
def add_retention_time_margins(df_psms: pl.DataFrame, df_fragment: pl.DataFrame, min_diff: float, max_diff: float, intensity_threshold: float) -> pl.DataFrame:
1058+
"""
1059+
Add retention time margin features to the PSM DataFrame.
1060+
"""
1061+
1062+
pept2lowermargins = {}
1063+
pept2highermargins = {}
1064+
1065+
log_info("Calculating adapted retention time margins based on intensity for all peptides")
1066+
1067+
for (peptidoform, charge), df_fragments_sub in tqdm(
1068+
df_fragment.group_by(["peptide", "charge"])
1069+
):
1070+
1071+
# speed up: skip peptidoforms with only 1 PSM
1072+
if df_fragments_sub['psm_id'].n_unique() < 2:
1073+
pept2lowermargins[(peptidoform, charge)] = np.nan
1074+
pept2highermargins[(peptidoform, charge)] = np.nan
1075+
continue
1076+
1077+
intensity_based_margins = calculate_rt_margins_intensity_based(df_fragments_sub, intensity_threshold, output_dir='xics')
1078+
left_bound, right_bound, apex_rt = intensity_based_margins
1079+
1080+
# check if the intensity based margins are higher than max or lower than min
1081+
left_diff = apex_rt - left_bound
1082+
right_diff = right_bound - apex_rt
1083+
1084+
if left_diff < min_diff:
1085+
left_bound = apex_rt - min_diff
1086+
if right_diff < min_diff:
1087+
right_bound = apex_rt + min_diff
1088+
if left_diff > max_diff:
1089+
left_bound = apex_rt - max_diff
1090+
if right_diff > max_diff:
1091+
right_bound = apex_rt + max_diff
1092+
1093+
pept2lowermargins[(peptidoform, charge)] = left_bound
1094+
pept2highermargins[(peptidoform, charge)] = right_bound
1095+
1096+
log_info("Adding retention time margin features to PSM DataFrame...")
1097+
1098+
# add rt_lower_margin and rt_higher_margin to df_psms
1099+
df_psms = df_psms.with_columns(
1100+
[
1101+
pl.struct(["peptide", "charge"])
1102+
.map_elements(lambda row: pept2lowermargins.get((row["peptide"], row["charge"]), np.nan))
1103+
.alias("rt_lower_margin"),
1104+
pl.struct(["peptide", "charge"])
1105+
.map_elements(lambda row: pept2highermargins.get((row["peptide"], row["charge"]), np.nan))
1106+
.alias("rt_higher_margin")
1107+
]
1108+
)
1109+
1110+
log_info("Adding retention time margin features to Fragment DataFrame...")
1111+
1112+
# add rt_lower_margin and rt_higher_margin to df_fragment
1113+
df_fragment = df_fragment.with_columns(
1114+
[
1115+
pl.struct(["peptide", "charge"])
1116+
.map_elements(lambda row: pept2lowermargins.get((row["peptide"], row["charge"]), np.nan))
1117+
.alias("rt_lower_margin"),
1118+
pl.struct(["peptide", "charge"])
1119+
.map_elements(lambda row: pept2highermargins.get((row["peptide"], row["charge"]), np.nan))
1120+
.alias("rt_higher_margin")
1121+
]
1122+
)
1123+
1124+
return df_psms, df_fragment
1125+
1126+
1127+
def add_retention_time_margins_loop(df_psms: pl.DataFrame, df_fragment: pl.DataFrame, top_n: int = 10, intensity_threshold: float = 0.05) -> pl.DataFrame:
1128+
"""
1129+
Add retention time margin features to the PSM DataFrame.
1130+
"""
1131+
log_info("Calculating min max retention time margins based on intensity...")
1132+
# Step 1: Calculate min and max retention time window based on top 100 peptidoforms
1133+
min_diff, max_diff = calculate_min_max_margins(df_psms, df_fragment, top_n, intensity_threshold)
1134+
1135+
# Step 2: Calculate adapted margins for each PSM based on the intensity of the most intense fragment
1136+
# and use the retention time distribution as min and max
1137+
log_info("Adding retention time margin features to PSM DataFrame...")
1138+
df_psms, df_fragment = add_retention_time_margins(df_psms, df_fragment, min_diff, max_diff, intensity_threshold)
1139+
1140+
return df_psms, df_fragment
1141+
1142+
9141143
def calculate_features(
9151144
df_psms: pl.DataFrame,
9161145
df_fragment: pl.DataFrame,
@@ -993,6 +1222,23 @@ def calculate_features(
9931222
.unique(subset=["peptide", "charge"], keep="first")
9941223
)
9951224

1225+
log_info("Regenerated df_fragment_max_peptide:")
1226+
log_info(" Shape: {}".format(df_fragment_max_peptide.shape))
1227+
log_info(" Sample entries:")
1228+
#for row in df_fragment_max_peptide.head(3).iter_rows(named=True):
1229+
# log_info(
1230+
# " Peptide: {}, Charge: {}, PSM ID: {}, RT: {}, Fragment Intensity: {}".format(
1231+
# row["peptide"],
1232+
# row["charge"],
1233+
# row["psm_id"],
1234+
# row["rt"],
1235+
# row["fragment_intensity"],
1236+
# )
1237+
# )
1238+
1239+
log_info(
1240+
"Counting individual peptides per MS2 and filtering by minimum occurrences"
1241+
)
9961242
df_psms = add_count_and_filter_peptides(df_psms, min_occurrences)
9971243

9981244
# Filter df_fragment to only include PSMs that passed all filtering
@@ -1147,6 +1393,7 @@ def calculate_features(
11471393
f"{config['mumdia']['result_dir']}/outfile.pin", separator="\t"
11481394
)
11491395

1396+
return df_fragment, df_psms
11501397

11511398
def main(
11521399
df_fragment: Optional[pl.DataFrame] = None,
@@ -1181,7 +1428,7 @@ def main(
11811428
df_psms = df_psms.filter(~df_psms["peptide"].str.contains("U"))
11821429
df_psms = df_psms.sort("rt")
11831430

1184-
calculate_features(
1431+
df_fragment, df_psms = calculate_features(
11851432
df_psms,
11861433
df_fragment,
11871434
df_fragment_max,
@@ -1193,8 +1440,21 @@ def main(
11931440
)
11941441

11951442
log_info("Done running MuMDIA...")
1196-
# run_mokapot(output_dir=config["mumdia"]["result_dir"])
1197-
1443+
mokapot_results = run_mokapot(output_dir=config["mumdia"]["result_dir"])
1444+
1445+
df_fragment.write_csv("debug/df_fragment_before_quant.tsv", separator="\t")
1446+
df_psms.write_csv("debug/df_psms_before_quant.tsv", separator="\t")
1447+
1448+
# this file will later be used for quantification of proteins with directLFQ (combined with all runs)
1449+
if mokapot_results is not None and isinstance(mokapot_results, (list, tuple)) and len(mokapot_results) > 1:
1450+
df_quant_fragment = quantify_fragments(
1451+
df_fragment,
1452+
mokapot_results[1],
1453+
config=config,
1454+
output_dir=config["mumdia"]["result_dir"]
1455+
)
1456+
else:
1457+
logging.warning("mokapot_results is None or does not have enough elements; skipping quantification step.")
11981458

11991459
if __name__ == "__main__":
12001460
# In practice, load your input DataFrames (e.g., from parquet files) and then call main().

prediction_wrappers/wrapper_ms2pip.py

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -177,14 +177,4 @@ def get_predictions_fragment_intensity_main_loop(
177177

178178
log_info("Df_fragment shape after filtering: {}".format(df_fragment.shape))
179179

180-
df_fragment = df_fragment.with_columns(
181-
pl.Series(
182-
"fragment_name",
183-
df_fragment["fragment_type"]
184-
+ df_fragment["fragment_ordinals"]
185-
+ "/"
186-
+ df_fragment["fragment_charge"],
187-
)
188-
)
189-
190180
return df_fragment, ms2pip_predictions

0 commit comments

Comments
 (0)