@@ -705,15 +705,62 @@ def cal_rt_irt_loess(report_df, frac=0.3, data_bins: int = DEFAULT_BINS):
705705 return plot_dict
706706
707707
708- # DIA-NN: Peptides Quantification Table
709- def create_peptides_table (report_df , sample_df , file_df ):
710- # Validation: remove rows with 0 or NA Precursor.Normalised values
def _prepare_quant_table_data(report_df):
    """
    Shared first step for building the quantification tables.

    Keeps only the rows whose ``Precursor.Normalised`` value is strictly
    positive (the ``> 0`` comparison also excludes missing values), works on
    a copy so the caller's frame is not modified by this function, and then
    hands the result to ``drop_empty_row`` for the ``Protein.Names`` and
    ``Stripped.Sequence`` columns.

    Args:
        report_df: Report table; must contain a ``Precursor.Normalised`` column.

    Returns:
        Whatever ``drop_empty_row`` returns for the filtered copy.
        NOTE(review): presumably a DataFrame without rows that are empty in
        the two listed columns -- confirm against ``drop_empty_row``.
    """
    has_signal = report_df["Precursor.Normalised"] > 0
    quantified = report_df.loc[has_signal].copy()
    required_columns = ["Protein.Names", "Stripped.Sequence"]
    return drop_empty_row(quantified, required_columns)
717+
718+
719+ def _merge_condition_data (report_data , sample_df , file_df ):
720+ """
721+ Merge report data with condition information from sample/file DataFrames.
722+
723+ Returns:
724+ tuple: (merged DataFrame with condition info, list of unique conditions) or (None, [])
725+ """
726+ if sample_df .empty or file_df .empty :
727+ return None , []
728+
729+ sample_cond_df = pd .merge (
730+ sample_df [["Sample" , "MSstats_Condition" ]],
731+ file_df [["Sample" , "Spectra_Filepath" ]],
732+ on = "Sample" ,
733+ )
734+ # Vectorized path splitting (more efficient than apply with lambda)
735+ sample_cond_df ["Run" ] = sample_cond_df ["Spectra_Filepath" ].str .rsplit ("." , n = 1 ).str [0 ]
713736
737+ cond_report_data = pd .merge (
738+ report_data [["Stripped.Sequence" , "Protein.Names" , "Precursor.Normalised" , "Run" ]],
739+ sample_cond_df [["Run" , "MSstats_Condition" ]].drop_duplicates (),
740+ on = "Run" ,
741+ )
742+
743+ unique_conditions = sample_df ["MSstats_Condition" ].drop_duplicates ().tolist ()
744+ return cond_report_data , unique_conditions
745+
746+
747+ def _add_condition_headers (headers , conditions ):
748+ """Add condition-based headers to the headers dictionary."""
749+ for exp_condition in conditions :
750+ headers [str (exp_condition )] = {
751+ "title" : str (exp_condition ),
752+ "description" : "MSstats Condition" ,
753+ "format" : "{:,.4f}" ,
754+ }
755+
756+
757+ # DIA-NN: Peptides Quantification Table
758+ def create_peptides_table (report_df , sample_df , file_df ):
759+ """Create peptides quantification table from DIA-NN report."""
760+ report_data = _prepare_quant_table_data (report_df )
714761 report_data ["BestSearchScore" ] = 1 - report_data ["Q.Value" ]
715762
716- table_dict = dict ()
763+ table_dict = {}
717764 for sequence_protein , group in report_data .groupby (["Stripped.Sequence" , "Protein.Names" ]):
718765 table_dict [sequence_protein ] = {
719766 "ProteinName" : sequence_protein [1 ],
@@ -737,52 +784,29 @@ def create_peptides_table(report_df, sample_df, file_df):
737784 },
738785 }
739786
740- if not sample_df .empty and not file_df .empty :
741-
742- sample_cond_df = pd .merge (
743- sample_df [["Sample" , "MSstats_Condition" ]],
744- file_df [["Sample" , "Spectra_Filepath" ]],
745- on = "Sample" ,
746- )
747- sample_cond_df ["Run" ] = sample_cond_df ["Spectra_Filepath" ].apply (
748- lambda x : os .path .splitext (x )[0 ]
749- )
750-
751- cond_report_data = pd .merge (
752- report_data [["Stripped.Sequence" , "Protein.Names" , "Precursor.Normalised" , "Run" ]],
753- sample_cond_df [["Run" , "MSstats_Condition" ]].drop_duplicates (),
754- on = "Run" ,
755- )
756-
787+ cond_report_data , unique_conditions = _merge_condition_data (report_data , sample_df , file_df )
788+ if cond_report_data is not None :
757789 for sequence_protein , group in cond_report_data .groupby (
758790 ["Stripped.Sequence" , "Protein.Names" ]
759791 ):
760-
761- condition_data = dict ()
762- for condition , sub_group in group .groupby ("MSstats_Condition" ):
763- condition_data [str (condition )] = np .log10 (sub_group ["Precursor.Normalised" ].mean ())
764-
792+ condition_data = {
793+ str (cond ): np .log10 (sub_group ["Precursor.Normalised" ].mean ())
794+ for cond , sub_group in group .groupby ("MSstats_Condition" )
795+ }
765796 table_dict [sequence_protein ].update (condition_data )
766797
767- for exp_condition in sample_df ["MSstats_Condition" ].drop_duplicates ():
768- headers [str (exp_condition )] = {
769- "title" : str (exp_condition ),
770- "description" : "MSstats Condition" ,
771- "format" : "{:,.4f}" ,
772- }
798+ _add_condition_headers (headers , unique_conditions )
773799
774800 result_dict = {i : v for i , (_ , v ) in enumerate (table_dict .items (), start = 1 )}
775-
776801 return result_dict , headers
777802
778803
779804# DIA-NN: Protein Quantification Table
780805def create_protein_table (report_df , sample_df , file_df ):
781- # Validation: remove rows with 0 or NA Precursor.Normalised values
782- report_data = report_df [report_df ["Precursor.Normalised" ] > 0 ].copy ()
783- report_data = drop_empty_row (report_data , ["Protein.Names" , "Stripped.Sequence" ])
806+ """Create protein quantification table from DIA-NN report."""
807+ report_data = _prepare_quant_table_data (report_df )
784808
785- table_dict = dict ()
809+ table_dict = {}
786810 for protein_name , group in report_data .groupby ("Protein.Names" ):
787811 table_dict [protein_name ] = {
788812 "ProteinName" : protein_name ,
@@ -807,40 +831,18 @@ def create_protein_table(report_df, sample_df, file_df):
807831 },
808832 }
809833
810- if not sample_df .empty and not file_df .empty :
811-
812- sample_cond_df = pd .merge (
813- sample_df [["Sample" , "MSstats_Condition" ]],
814- file_df [["Sample" , "Spectra_Filepath" ]],
815- on = "Sample" ,
816- )
817- sample_cond_df ["Run" ] = sample_cond_df ["Spectra_Filepath" ].apply (
818- lambda x : os .path .splitext (x )[0 ]
819- )
820-
821- cond_report_data = pd .merge (
822- report_data [["Stripped.Sequence" , "Protein.Names" , "Precursor.Normalised" , "Run" ]],
823- sample_cond_df [["Run" , "MSstats_Condition" ]].drop_duplicates (),
824- on = "Run" ,
825- )
826-
834+ cond_report_data , unique_conditions = _merge_condition_data (report_data , sample_df , file_df )
835+ if cond_report_data is not None :
827836 for protein_name , group in cond_report_data .groupby ("Protein.Names" ):
828-
829- condition_data = dict ()
830- for condition , sub_group in group .groupby ("MSstats_Condition" ):
831- condition_data [str (condition )] = np .log10 (sub_group ["Precursor.Normalised" ].mean ())
832-
837+ condition_data = {
838+ str (cond ): np .log10 (sub_group ["Precursor.Normalised" ].mean ())
839+ for cond , sub_group in group .groupby ("MSstats_Condition" )
840+ }
833841 table_dict [protein_name ].update (condition_data )
834842
835- for exp_condition in sample_df ["MSstats_Condition" ].drop_duplicates ():
836- headers [str (exp_condition )] = {
837- "title" : str (exp_condition ),
838- "description" : "MSstats Condition" ,
839- "format" : "{:,.4f}" ,
840- }
843+ _add_condition_headers (headers , unique_conditions )
841844
842845 result_dict = {i : v for i , (_ , v ) in enumerate (table_dict .items (), start = 1 )}
843-
844846 return result_dict , headers
845847
846848
0 commit comments