@@ -63,6 +63,8 @@ def __getattr__(self, name):
6363from prediction_wrappers .wrapper_ms2pip import (
6464 get_predictions_fragment_intensity_main_loop ,
6565)
66+ from quantification .lfq import quantify_fragments
67+ from utilities .plotting import plot_XIC_with_margins , plot_rt_margin_histogram
6668from utilities .logger import log_info
6769
6870# Re-export for backward compatibility
@@ -283,7 +285,7 @@ def run_mokapot(output_dir="results/") -> None:
283285 f"mokapot is not installed or failed to import ({ e } ). Skipping mokapot run."
284286 )
285287 return None
286- psms = mokapot .read_pin (f"{ output_dir } outfile.pin" )
288+ psms = mokapot .read_pin (f"{ output_dir } / outfile.pin" )
287289
288290 model = KerasClassifier (
289291 build_fn = create_model , epochs = 100 , batch_size = 1000 , verbose = 10
@@ -911,6 +913,233 @@ def extract_intensities(scannr, charge, calcmass):
911913 return df_psms
912914
913915
916+ def calculate_rt_margins_intensity_based (df_fragments : pl .DataFrame , intensity_threshold : float , output_dir = 'xics' ) -> pl .DataFrame :
917+ """
918+ Calculate retention time margins based on a relative intensity threshold of the apex intensity fragment.
919+ The margins are determined by finding the retention times where the fragment intensity
920+ drops below the specified fraction of the apex intensity on both sides of the apex.
921+ If the intensity never drops below the threshold on one side, the margin is set to the
922+ first/last retention time where the most intense fragment was detected.
923+ The function also generates and saves a plot of the XIC with the calculated margins.
924+
925+ Parameters
926+ ----------
927+ df_fragments : pl.DataFrame
928+ DataFrame containing fragment ion information for a single peptidoform.
929+ intensity_threshold : float
930+ Intensity threshold (as a fraction of apex intensity) to define retention time margins.
931+ output_dir : str
932+ Directory to save the XIC plots with margins.
933+ Returns
934+ -------
935+ left_bound : float
936+ Left retention time margin.
937+ right_bound : float
938+ Right retention time margin.
939+ apex_rt : float
940+ Retention time at apex intensity.
941+ """
942+
943+ # Sort by rt
944+ df_sorted = df_fragments .sort ("rt" )
945+ # Find apex
946+ apex_idx = df_sorted ["fragment_intensity" ].arg_max ()
947+ apex_rt = df_sorted ["rt" ][apex_idx ]
948+ apex_intensity = df_sorted ["fragment_intensity" ][apex_idx ]
949+ # Threshold value
950+ cutoff = intensity_threshold * apex_intensity
951+ apex_fragment_name = df_sorted ["fragment_name" ][apex_idx ]
952+
953+ # Left of apex
954+ left_df = df_sorted .filter (pl .col ("fragment_name" ) == apex_fragment_name ) # only consider the apex fragment
955+ apex_idx_left = left_df ["fragment_intensity" ].arg_max ()
956+ left_df = left_df [:apex_idx_left ][::- 1 ] # reverse to go from apex down
957+ left_bound = apex_rt
958+
959+ for rt , intensity in zip (left_df ["rt" ], left_df ["fragment_intensity" ]):
960+ if intensity < cutoff :
961+ left_bound = rt
962+ break
963+
964+ # if the left bound is still the apex rt, set it to the first rt where fragment was detected
965+ if left_bound == apex_rt and len (left_df ) > 0 :
966+ left_bound = left_df ["rt" ][- 1 ]
967+
968+ # Right of apex
969+ right_df = df_sorted .filter (pl .col ("fragment_name" ) == apex_fragment_name ) # only consider the apex fragment
970+ apex_idx_right = right_df ["fragment_intensity" ].arg_max ()
971+ right_df = right_df [apex_idx_right + 1 :]
972+ right_bound = apex_rt
973+ for rt , intensity in zip (right_df ["rt" ], right_df ["fragment_intensity" ]):
974+ if intensity < cutoff :
975+ right_bound = rt
976+ break
977+
978+ # if the right bound is still the apex rt, set it to the last rt where fragment was detected
979+ if right_bound == apex_rt and len (right_df ) > 0 :
980+ right_bound = right_df ["rt" ][- 1 ]
981+
982+ # plot XIC with the margins
983+ # plot_XIC_with_margins(df_sorted, output_dir=output_dir, adapted_interval=(left_bound, right_bound), apex_rt=apex_rt, cutoff=cutoff)
984+
985+ return left_bound , right_bound , apex_rt
986+
987+
988+ def calculate_min_max_margins (df_psms : pl .DataFrame , df_fragments : pl .DataFrame , top_n : int = 100 , intensity_threshold : float = 0.01 ) -> dict :
989+ """
990+ Calculate the retention time distribution of the top N peptidoforms (with at least 6 PSMs, and then ranked by spectrum peptide q value)
991+ Min and max margins are defined as the 5th and 95th percentiles of the distribution of retention time margins
992+ across the top N peptidoforms.
993+ Returns a tuple with (min_diff, max_diff).
994+
995+ Parameters
996+ ----------
997+ df_psms : pl.DataFrame
998+ DataFrame containing PSM information
999+ df_fragments : pl.DataFrame
1000+ DataFrame containing fragment ion information
1001+ top_n : int, optional
1002+ Number of top peptidoforms to consider based on the lowest 'peptide_q' value (default is 100).
1003+ intensity_threshold : float, optional
1004+ Intensity threshold (as a fraction of apex intensity) to define retention time margins (default is 0.01).
1005+ """
1006+
1007+ # Step 1: Identify the 100 best scoring peptidoforms based on sage qvalue
1008+ # group by peptide and charge to get unique peptidoforms, aggregate number of PSMs, keep min peptide_q
1009+
1010+ df_top_peptidoforms = (
1011+ df_psms .group_by (["peptide" , "charge" ])
1012+ .agg (
1013+ [pl .count ().alias ("num_psms" ), pl .min ("peptide_q" ).alias ("min_peptide_q" )]
1014+ )
1015+ .sort ("min_peptide_q" )
1016+ )
1017+
1018+ # filter for peptidoforms with at least 6 PSMs
1019+ df_top_peptidoforms = df_top_peptidoforms .filter (pl .col ("num_psms" ) >= 6 )
1020+
1021+ # get the top N peptidoforms
1022+ df_top_peptidoforms = df_top_peptidoforms .head (top_n )
1023+
1024+ # Step 2: Extract the retention times of the entire XICs from df_fragments of these peptidoforms
1025+ df_fragments_top100 = df_fragments .filter (pl .col ("peptide" ).is_in (df_top_peptidoforms ["peptide" ]) & pl .col ("charge" ).is_in (df_top_peptidoforms ["charge" ]))
1026+ diffs = []
1027+
1028+ for (peptidoform , charge ), df_fragments_top100_sub in tqdm (
1029+ df_fragments_top100 .group_by (["peptide" , "charge" ])
1030+ ):
1031+ left_bound , right_bound , apex_rt = calculate_rt_margins_intensity_based (df_fragments_top100_sub , intensity_threshold , output_dir = 'debug/calibration_xics' )
1032+ left_diff = apex_rt - left_bound
1033+ right_diff = right_bound - apex_rt
1034+ diffs .append (left_diff )
1035+ diffs .append (right_diff )
1036+
1037+ # remove 0 diffs (if the apex is at the start or end of the XIC)
1038+ diffs = [d for d in diffs if d > 0 ]
1039+
1040+ # Step 3: Calculate the min and max retention times across all these XICs
1041+ if len (diffs ) == 0 :
1042+ log_info ("Could not calibrate retention time margins, using default values." )
1043+ min_diff = 0.02
1044+ max_diff = 0.2
1045+ else :
1046+ # get 5th and 95th percentiles
1047+ min_diff = np .percentile (diffs , 5 )
1048+ max_diff = np .percentile (diffs , 95 )
1049+ log_info (f"Using min and max retention time margins: { min_diff } , { max_diff } " )
1050+
1051+ # plot histogram of diffs
1052+ plot_rt_margin_histogram (diffs , output_dir = 'debug/calibration_xics' , min_diff = min_diff , max_diff = max_diff )
1053+
1054+ return min_diff , max_diff
1055+
1056+
1057+ def add_retention_time_margins (df_psms : pl .DataFrame , df_fragment : pl .DataFrame , min_diff : float , max_diff : float , intensity_threshold : float ) -> pl .DataFrame :
1058+ """
1059+ Add retention time margin features to the PSM DataFrame.
1060+ """
1061+
1062+ pept2lowermargins = {}
1063+ pept2highermargins = {}
1064+
1065+ log_info ("Calculating adapted retention time margins based on intensity for all peptides" )
1066+
1067+ for (peptidoform , charge ), df_fragments_sub in tqdm (
1068+ df_fragment .group_by (["peptide" , "charge" ])
1069+ ):
1070+
1071+ # speed up: skip peptidoforms with only 1 PSM
1072+ if df_fragments_sub ['psm_id' ].n_unique () < 2 :
1073+ pept2lowermargins [(peptidoform , charge )] = np .nan
1074+ pept2highermargins [(peptidoform , charge )] = np .nan
1075+ continue
1076+
1077+ intensity_based_margins = calculate_rt_margins_intensity_based (df_fragments_sub , intensity_threshold , output_dir = 'xics' )
1078+ left_bound , right_bound , apex_rt = intensity_based_margins
1079+
1080+ # check if the intensity based margins are higher than max or lower than min
1081+ left_diff = apex_rt - left_bound
1082+ right_diff = right_bound - apex_rt
1083+
1084+ if left_diff < min_diff :
1085+ left_bound = apex_rt - min_diff
1086+ if right_diff < min_diff :
1087+ right_bound = apex_rt + min_diff
1088+ if left_diff > max_diff :
1089+ left_bound = apex_rt - max_diff
1090+ if right_diff > max_diff :
1091+ right_bound = apex_rt + max_diff
1092+
1093+ pept2lowermargins [(peptidoform , charge )] = left_bound
1094+ pept2highermargins [(peptidoform , charge )] = right_bound
1095+
1096+ log_info ("Adding retention time margin features to PSM DataFrame..." )
1097+
1098+ # add rt_lower_margin and rt_higher_margin to df_psms
1099+ df_psms = df_psms .with_columns (
1100+ [
1101+ pl .struct (["peptide" , "charge" ])
1102+ .map_elements (lambda row : pept2lowermargins .get ((row ["peptide" ], row ["charge" ]), np .nan ))
1103+ .alias ("rt_lower_margin" ),
1104+ pl .struct (["peptide" , "charge" ])
1105+ .map_elements (lambda row : pept2highermargins .get ((row ["peptide" ], row ["charge" ]), np .nan ))
1106+ .alias ("rt_higher_margin" )
1107+ ]
1108+ )
1109+
1110+ log_info ("Adding retention time margin features to Fragment DataFrame..." )
1111+
1112+ # add rt_lower_margin and rt_higher_margin to df_fragment
1113+ df_fragment = df_fragment .with_columns (
1114+ [
1115+ pl .struct (["peptide" , "charge" ])
1116+ .map_elements (lambda row : pept2lowermargins .get ((row ["peptide" ], row ["charge" ]), np .nan ))
1117+ .alias ("rt_lower_margin" ),
1118+ pl .struct (["peptide" , "charge" ])
1119+ .map_elements (lambda row : pept2highermargins .get ((row ["peptide" ], row ["charge" ]), np .nan ))
1120+ .alias ("rt_higher_margin" )
1121+ ]
1122+ )
1123+
1124+ return df_psms , df_fragment
1125+
1126+
1127+ def add_retention_time_margins_loop (df_psms : pl .DataFrame , df_fragment : pl .DataFrame , top_n : int = 10 , intensity_threshold : float = 0.05 ) -> pl .DataFrame :
1128+ """
1129+ Add retention time margin features to the PSM DataFrame.
1130+ """
1131+ log_info ("Calculating min max retention time margins based on intensity..." )
1132+ # Step 1: Calculate min and max retention time window based on top 100 peptidoforms
1133+ min_diff , max_diff = calculate_min_max_margins (df_psms , df_fragment , top_n , intensity_threshold )
1134+
1135+ # Step 2: Calculate adapted margins for each PSM based on the intensity of the most intense fragment
1136+ # and use the retention time distribution as min and max
1137+ log_info ("Adding retention time margin features to PSM DataFrame..." )
1138+ df_psms , df_fragment = add_retention_time_margins (df_psms , df_fragment , min_diff , max_diff , intensity_threshold )
1139+
1140+ return df_psms , df_fragment
1141+
1142+
9141143def calculate_features (
9151144 df_psms : pl .DataFrame ,
9161145 df_fragment : pl .DataFrame ,
@@ -993,6 +1222,23 @@ def calculate_features(
9931222 .unique (subset = ["peptide" , "charge" ], keep = "first" )
9941223 )
9951224
1225+ log_info ("Regenerated df_fragment_max_peptide:" )
1226+ log_info (" Shape: {}" .format (df_fragment_max_peptide .shape ))
1227+ log_info (" Sample entries:" )
1228+ #for row in df_fragment_max_peptide.head(3).iter_rows(named=True):
1229+ # log_info(
1230+ # " Peptide: {}, Charge: {}, PSM ID: {}, RT: {}, Fragment Intensity: {}".format(
1231+ # row["peptide"],
1232+ # row["charge"],
1233+ # row["psm_id"],
1234+ # row["rt"],
1235+ # row["fragment_intensity"],
1236+ # )
1237+ # )
1238+
1239+ log_info (
1240+ "Counting individual peptides per MS2 and filtering by minimum occurrences"
1241+ )
9961242 df_psms = add_count_and_filter_peptides (df_psms , min_occurrences )
9971243
9981244 # Filter df_fragment to only include PSMs that passed all filtering
@@ -1147,6 +1393,7 @@ def calculate_features(
11471393 f"{ config ['mumdia' ]['result_dir' ]} /outfile.pin" , separator = "\t "
11481394 )
11491395
1396+ return df_fragment , df_psms
11501397
11511398def main (
11521399 df_fragment : Optional [pl .DataFrame ] = None ,
@@ -1181,7 +1428,7 @@ def main(
11811428 df_psms = df_psms .filter (~ df_psms ["peptide" ].str .contains ("U" ))
11821429 df_psms = df_psms .sort ("rt" )
11831430
1184- calculate_features (
1431+ df_fragment , df_psms = calculate_features (
11851432 df_psms ,
11861433 df_fragment ,
11871434 df_fragment_max ,
@@ -1193,8 +1440,21 @@ def main(
11931440 )
11941441
11951442 log_info ("Done running MuMDIA..." )
1196- # run_mokapot(output_dir=config["mumdia"]["result_dir"])
1197-
1443+ mokapot_results = run_mokapot (output_dir = config ["mumdia" ]["result_dir" ])
1444+
1445+ df_fragment .write_csv ("debug/df_fragment_before_quant.tsv" , separator = "\t " )
1446+ df_psms .write_csv ("debug/df_psms_before_quant.tsv" , separator = "\t " )
1447+
1448+ # this file will later be used for quantification of proteins with directLFQ (combined with all runs)
1449+ if mokapot_results is not None and isinstance (mokapot_results , (list , tuple )) and len (mokapot_results ) > 1 :
1450+ df_quant_fragment = quantify_fragments (
1451+ df_fragment ,
1452+ mokapot_results [1 ],
1453+ config = config ,
1454+ output_dir = config ["mumdia" ]["result_dir" ]
1455+ )
1456+ else :
1457+ logging .warning ("mokapot_results is None or does not have enough elements; skipping quantification step." )
11981458
11991459if __name__ == "__main__" :
12001460 # In practice, load your input DataFrames (e.g., from parquet files) and then call main().
0 commit comments