@@ -60,10 +60,14 @@ def parse_diann_report(
6060 # Process statistics and modifications
6161 total_protein_quantified , total_peptide_count , pep_plot = _process_diann_statistics (report_data )
6262 peptide_search_score = _process_peptide_search_scores (report_data )
63- _process_modifications (report_data )
63+ modifications_ok = _process_modifications (report_data )
6464
65- # Process run-specific data
66- cal_num_table_data = _process_run_data (report_data , ms_with_psm , quantms_modified , file_df )
65+ # Process run-specific data (requires Modifications column from _process_modifications)
66+ if modifications_ok :
67+ cal_num_table_data = _process_run_data (report_data , ms_with_psm , quantms_modified , file_df )
68+ else :
69+ log .warning ("Skipping run data processing due to missing modifications data" )
70+ cal_num_table_data = {"sdrf_samples" : {}, "ms_runs" : {}}
6771
6872 # Handle files without PSM
6973 ms_without_psm = _handle_files_without_psm (ms_paths , ms_with_psm , cal_num_table_data )
@@ -155,6 +159,10 @@ def _draw_heatmap(sub_section, report_data, heatmap_color_list):
155159
156160def _process_diann_statistics (report_data ):
157161 """Process DIA-NN statistics and create peptide plot."""
162+ required_cols = ["Protein.Group" , "Modified.Sequence" ]
163+ if not all (col in report_data .columns for col in required_cols ):
164+ log .warning (f"Missing required columns for statistics: { [c for c in required_cols if c not in report_data .columns ]} " )
165+ return 0 , 0 , None
158166
159167 total_protein_quantified = len (set (report_data ["Protein.Group" ]))
160168 total_peptide_count = len (set (report_data ["Modified.Sequence" ]))
@@ -180,6 +188,11 @@ def _process_diann_statistics(report_data):
180188
181189def _process_peptide_search_scores (report_data ):
182190 """Process peptide search scores."""
191+ required_cols = ["Modified.Sequence" , "Q.Value" ]
192+ if not all (col in report_data .columns for col in required_cols ):
193+ log .warning (f"Missing required columns for peptide search scores: { [c for c in required_cols if c not in report_data .columns ]} " )
194+ return {}
195+
183196 log .info ("Processing DIA peptide_search_score." )
184197 peptide_search_score = dict ()
185198 pattern = re .compile (r"\((.*?)\)" )
@@ -200,6 +213,10 @@ def _process_peptide_search_scores(report_data):
200213
201214def _process_modifications (report_data ):
202215 """Process modifications in the report data."""
216+ if "Modified.Sequence" not in report_data .columns :
217+ log .warning ("Missing Modified.Sequence column for modifications processing" )
218+ return False
219+
203220 log .info ("Processing DIA Modifications." )
204221 mod_pattern = re .compile (r"\((.*?)\)" )
205222 unimod_data = UnimodDatabase ()
@@ -217,16 +234,21 @@ def find_diann_modified(peptide):
217234 return None
218235
219236 report_data ["Modifications" ] = report_data ["Modified.Sequence" ].apply (find_diann_modified )
237+ return True
220238
221239
222240def _process_run_data (df , ms_with_psm , quantms_modified , sdrf_file_df ):
223241 """
224242 Process run-specific data including modifications and statistics.
225243 """
244+ required_cols = ["Run" , "Modified.Sequence" , "Modifications" , "Protein.Group" ]
245+ missing_cols = [col for col in required_cols if col not in df .columns ]
246+ if missing_cols :
247+ log .warning (f"Missing required columns for run data processing: { missing_cols } " )
248+ return {"sdrf_samples" : {}, "ms_runs" : {}}
226249
227250 log .info ("Processing DIA mod_plot_dict." )
228251
229- required_cols = ["Run" , "Modified.Sequence" , "Modifications" , "Protein.Group" ]
230252 report_data = df [required_cols ].copy ()
231253 if "Proteotypic" in df .columns :
232254 report_data ["Proteotypic" ] = df ["Proteotypic" ]
@@ -425,6 +447,11 @@ def draw_dia_rt_qc(sub_section, report_df):
425447
426448# DIA-NN: IDs over RT
427449def draw_dia_ids_rt (sub_section , report_df ):
450+ required_cols = ["Run" , "RT" ]
451+ if not all (col in report_df .columns for col in required_cols ):
452+ log .warning (f"Missing required columns for IDs over RT plot: { [c for c in required_cols if c not in report_df .columns ]} " )
453+ return
454+
428455 rt_df = report_df [["Run" , "RT" ]].copy ()
429456 rt_df .rename (columns = {"Run" : "raw file" , "RT" : "retention time" }, inplace = True )
430457 ids_over_rt = evidence_rt_count (rt_df )
@@ -437,13 +464,19 @@ def draw_diann_quant_table(sub_section, diann_report, sample_df, file_df):
437464 peptides_table , peptides_headers = create_peptides_table (
438465 diann_report , sample_df , file_df
439466 )
440- draw_peptides_table (sub_section , peptides_table , peptides_headers , "DIA-NN" )
467+ if peptides_table is not None and peptides_headers is not None :
468+ draw_peptides_table (sub_section , peptides_table , peptides_headers , "DIA-NN" )
469+ else :
470+ log .warning ("Skipping peptides quantification table due to missing data" )
441471
442472 # Protein Quantification Table
443473 protein_table , protein_headers = create_protein_table (
444474 diann_report , sample_df , file_df
445475 )
446- draw_protein_table (sub_section , protein_table , protein_headers , "DIA-NN" )
476+ if protein_table is not None and protein_headers is not None :
477+ draw_protein_table (sub_section , protein_table , protein_headers , "DIA-NN" )
478+ else :
479+ log .warning ("Skipping protein quantification table due to missing data" )
447480
448481
449482# Draw: Peptides Quantification Table
@@ -727,9 +760,29 @@ def _prepare_quant_table_data(report_df):
727760 Common preprocessing for quantification table creation.
728761
729762 Returns:
730- pd.DataFrame: Preprocessed report data with positive Precursor.Normalised values.
763+ pd.DataFrame: Preprocessed report data with positive intensity values,
764+ or None if required columns are missing.
731765 """
732- report_data = report_df [report_df ["Precursor.Normalised" ] > 0 ].copy ()
766+ # Check for required columns
767+ required_cols = ["Protein.Names" , "Stripped.Sequence" ]
768+ missing_cols = [col for col in required_cols if col not in report_df .columns ]
769+ if missing_cols :
770+ log .warning (f"Missing required columns for quantification table: { missing_cols } " )
771+ return None
772+
773+ # Use Precursor.Normalised if available, otherwise fall back to Precursor.Quantity
774+ if "Precursor.Normalised" in report_df .columns :
775+ intensity_col = "Precursor.Normalised"
776+ elif "Precursor.Quantity" in report_df .columns :
777+ intensity_col = "Precursor.Quantity"
778+ log .info ("Using Precursor.Quantity as fallback (Precursor.Normalised not available)" )
779+ else :
780+ log .warning ("Neither Precursor.Normalised nor Precursor.Quantity found. Skipping quantification table." )
781+ return None
782+
783+ report_data = report_df [report_df [intensity_col ] > 0 ].copy ()
784+ # Store which intensity column is being used for downstream functions
785+ report_data .attrs ["intensity_col" ] = intensity_col
733786 return drop_empty_row (report_data , ["Protein.Names" , "Stripped.Sequence" ])
734787
735788
@@ -743,6 +796,18 @@ def _merge_condition_data(report_data, sample_df, file_df):
743796 if sample_df .empty or file_df .empty :
744797 return None , []
745798
799+ # Get the intensity column used (stored by _prepare_quant_table_data)
800+ intensity_col = report_data .attrs .get ("intensity_col" , "Precursor.Normalised" )
801+ if intensity_col not in report_data .columns :
802+ # Fallback check
803+ if "Precursor.Normalised" in report_data .columns :
804+ intensity_col = "Precursor.Normalised"
805+ elif "Precursor.Quantity" in report_data .columns :
806+ intensity_col = "Precursor.Quantity"
807+ else :
808+ log .warning ("No intensity column found for condition data merge" )
809+ return None , []
810+
746811 sample_cond_df = pd .merge (
747812 sample_df [["Sample" , "MSstats_Condition" ]],
748813 file_df [["Sample" , "Spectra_Filepath" ]],
@@ -752,10 +817,12 @@ def _merge_condition_data(report_data, sample_df, file_df):
752817 sample_cond_df ["Run" ] = sample_cond_df ["Spectra_Filepath" ].str .rsplit ("." , n = 1 ).str [0 ]
753818
754819 cond_report_data = pd .merge (
755- report_data [["Stripped.Sequence" , "Protein.Names" , "Precursor.Normalised" , "Run" ]],
820+ report_data [["Stripped.Sequence" , "Protein.Names" , intensity_col , "Run" ]],
756821 sample_cond_df [["Run" , "MSstats_Condition" ]].drop_duplicates (),
757822 on = "Run" ,
758823 )
824+ # Store intensity column for downstream use
825+ cond_report_data .attrs ["intensity_col" ] = intensity_col
759826
760827 unique_conditions = sample_df ["MSstats_Condition" ].drop_duplicates ().tolist ()
761828 return cond_report_data , unique_conditions
@@ -773,18 +840,34 @@ def _add_condition_headers(headers, conditions):
773840
774841# DIA-NN: Peptides Quantification Table
775842def create_peptides_table (report_df , sample_df , file_df ):
776- """Create peptides quantification table from DIA-NN report."""
843+ """Create peptides quantification table from DIA-NN report.
844+
845+ Returns:
846+ tuple: (table_dict, headers) or (None, None) if required columns are missing.
847+ """
777848 report_data = _prepare_quant_table_data (report_df )
778- report_data ["BestSearchScore" ] = 1 - report_data ["Q.Value" ]
849+ if report_data is None or report_data .empty :
850+ log .warning ("Cannot create peptides table: missing required data" )
851+ return None , None
852+
853+ # Get the intensity column being used
854+ intensity_col = report_data .attrs .get ("intensity_col" , "Precursor.Normalised" )
855+
856+ # Check for Q.Value column for search score
857+ has_qvalue = "Q.Value" in report_data .columns
858+ if has_qvalue :
859+ report_data ["BestSearchScore" ] = 1 - report_data ["Q.Value" ]
779860
780861 table_dict = {}
781862 for sequence_protein , group in report_data .groupby (["Stripped.Sequence" , "Protein.Names" ]):
782- table_dict [ sequence_protein ] = {
863+ entry = {
783864 "ProteinName" : sequence_protein [1 ],
784865 "PeptideSequence" : sequence_protein [0 ],
785- "BestSearchScore" : group ["BestSearchScore" ].min (),
786- "Average Intensity" : np .log10 (group ["Precursor.Normalised" ].mean ()),
866+ "Average Intensity" : np .log10 (group [intensity_col ].mean ()),
787867 }
868+ if has_qvalue :
869+ entry ["BestSearchScore" ] = group ["BestSearchScore" ].min ()
870+ table_dict [sequence_protein ] = entry
788871
789872 headers = {
790873 "ProteinName" : {
@@ -793,24 +876,27 @@ def create_peptides_table(report_df, sample_df, file_df):
793876 "minrange" : "200" ,
794877 },
795878 "PeptideSequence" : {"title" : "Peptide Sequence" },
796- "BestSearchScore" : {"title" : "Best Search Score" , "format" : "{:,.4f}" },
797879 "Average Intensity" : {
798880 "title" : "Average Intensity" ,
799881 "description" : "Average intensity across all conditions" ,
800882 "format" : "{:,.4f}" ,
801883 },
802884 }
885+ if has_qvalue :
886+ headers ["BestSearchScore" ] = {"title" : "Best Search Score" , "format" : "{:,.4f}" }
803887
804888 cond_report_data , unique_conditions = _merge_condition_data (report_data , sample_df , file_df )
805- if cond_report_data is not None :
889+ if cond_report_data is not None and not cond_report_data .empty :
890+ cond_intensity_col = cond_report_data .attrs .get ("intensity_col" , intensity_col )
806891 for sequence_protein , group in cond_report_data .groupby (
807892 ["Stripped.Sequence" , "Protein.Names" ]
808893 ):
809894 condition_data = {
810- str (cond ): np .log10 (sub_group ["Precursor.Normalised" ].mean ())
895+ str (cond ): np .log10 (sub_group [cond_intensity_col ].mean ())
811896 for cond , sub_group in group .groupby ("MSstats_Condition" )
812897 }
813- table_dict [sequence_protein ].update (condition_data )
898+ if sequence_protein in table_dict :
899+ table_dict [sequence_protein ].update (condition_data )
814900
815901 _add_condition_headers (headers , unique_conditions )
816902
@@ -820,15 +906,25 @@ def create_peptides_table(report_df, sample_df, file_df):
820906
821907# DIA-NN: Protein Quantification Table
822908def create_protein_table (report_df , sample_df , file_df ):
823- """Create protein quantification table from DIA-NN report."""
909+ """Create protein quantification table from DIA-NN report.
910+
911+ Returns:
912+ tuple: (table_dict, headers) or (None, None) if required columns are missing.
913+ """
824914 report_data = _prepare_quant_table_data (report_df )
915+ if report_data is None or report_data .empty :
916+ log .warning ("Cannot create protein table: missing required data" )
917+ return None , None
918+
919+ # Get the intensity column being used
920+ intensity_col = report_data .attrs .get ("intensity_col" , "Precursor.Normalised" )
825921
826922 table_dict = {}
827923 for protein_name , group in report_data .groupby ("Protein.Names" ):
828924 table_dict [protein_name ] = {
829925 "ProteinName" : protein_name ,
830926 "Peptides_Number" : group ["Stripped.Sequence" ].nunique (),
831- "Average Intensity" : np .log10 (group ["Precursor.Normalised" ].mean ()),
927+ "Average Intensity" : np .log10 (group [intensity_col ].mean ()),
832928 }
833929
834930 headers = {
@@ -849,13 +945,15 @@ def create_protein_table(report_df, sample_df, file_df):
849945 }
850946
851947 cond_report_data , unique_conditions = _merge_condition_data (report_data , sample_df , file_df )
852- if cond_report_data is not None :
948+ if cond_report_data is not None and not cond_report_data .empty :
949+ cond_intensity_col = cond_report_data .attrs .get ("intensity_col" , intensity_col )
853950 for protein_name , group in cond_report_data .groupby ("Protein.Names" ):
854951 condition_data = {
855- str (cond ): np .log10 (sub_group ["Precursor.Normalised" ].mean ())
952+ str (cond ): np .log10 (sub_group [cond_intensity_col ].mean ())
856953 for cond , sub_group in group .groupby ("MSstats_Condition" )
857954 }
858- table_dict [protein_name ].update (condition_data )
955+ if protein_name in table_dict :
956+ table_dict [protein_name ].update (condition_data )
859957
860958 _add_condition_headers (headers , unique_conditions )
861959
0 commit comments