1111from sklearn .preprocessing import StandardScaler
1212
1313from pmultiqc .modules .common .file_utils import get_filename
14+ from ..common .calc_utils import qualUniform
1415from ...logging import get_logger , Timer
1516
1617# Initialize logger for this module
@@ -442,6 +443,112 @@ def get_evidence(file_path):
442443 return result
443444
444445
446+ # HeatMap
447+ def calculate_heatmap (evidence_df , oversampling , msms_missed_cleavages ):
448+
449+ if any (x is None for x in (evidence_df , oversampling , msms_missed_cleavages )):
450+ return None
451+
452+ if any (
453+ column not in evidence_df .columns
454+ for column in ["Potential contaminant" , "Intensity" , "Raw file" , "Retention time" , "Charge" ]
455+ ):
456+ return None
457+
458+ if evidence_df [evidence_df ["Potential contaminant" ] == "+" ].empty :
459+ logger .info ("The evidence.txt file does not contain any contaminants" )
460+
461+ evidence_data = evidence_df .copy ()
462+
463+ # 8. Pep Missing Values
464+ global_peps = evidence_df ["Modified sequence" ].unique ()
465+ global_peps_count = len (global_peps )
466+
467+ heatmap_dict = dict ()
468+ for raw_file , group in evidence_data [
469+ [
470+ "Potential contaminant" , "Intensity" , "Retention time" ,
471+ "Raw file" , "Modified sequence"
472+ ]
473+ ].groupby ("Raw file" ):
474+
475+ # 1. Contaminants
476+ contaminant = 1 - (group [group ["Potential contaminant" ] == "+" ]["Intensity" ].sum ()
477+ / group ["Intensity" ].sum ())
478+
479+ # 2. Peptide Intensity
480+ peptide_intensity = np .minimum (1.0 , np .nanmedian (group ["Intensity" ]) / (2 ** 23 ))
481+
482+ # 8. Pep Missing Values
483+ pep_missing_values = np .minimum (
484+ 1.0 ,
485+ len (set (global_peps ) & set (group ["Modified sequence" ].unique ())) / global_peps_count
486+ )
487+
488+ heatmap_dict [raw_file ] = {
489+ "Contaminants" : contaminant ,
490+ "Peptide Intensity" : peptide_intensity ,
491+ "ID rate over RT" : qualUniform (group ["Retention time" ]), # 6. ID rate over RT
492+ "Pep Missing Values" : pep_missing_values ,
493+ }
494+
495+ # 4. Missed Cleavages
496+ missed_cleavages = {key : value ["0" ] / 100 for key , value in msms_missed_cleavages .items ()}
497+
498+ # 5. Missed Cleavages Var
499+ mc_median = np .median (list (missed_cleavages .values ()))
500+ missed_cleavages_var = dict (
501+ zip (
502+ missed_cleavages .keys (),
503+ list (map (lambda v : 1 - np .abs (v - mc_median ), missed_cleavages .values ())),
504+ )
505+ )
506+ for raw_file in missed_cleavages .keys ():
507+ heatmap_dict [raw_file ]["Missed Cleavages" ] = missed_cleavages [raw_file ]
508+ heatmap_dict [raw_file ]["Missed Cleavages Var" ] = missed_cleavages_var [raw_file ]
509+
510+ # 3. Charge
511+ charge = dict ()
512+ for raw_file , group in evidence_data .loc [
513+ ~ evidence_data ["is_transferred" ], ["Charge" , "Raw file" ]
514+ ].groupby ("Raw file" ):
515+ charge [raw_file ] = group ["Charge" ].value_counts ()[2 ] / len (group )
516+ charge_median = np .median (list (charge .values ()))
517+ heatmap_charge = dict (
518+ zip (
519+ charge .keys (),
520+ list (map (lambda v : 1 - np .abs (v - charge_median ), charge .values ())),
521+ )
522+ )
523+ for raw_file in heatmap_charge .keys ():
524+ heatmap_dict [raw_file ]["Charge" ] = heatmap_charge [raw_file ]
525+
526+ # 7. MS2 OverSampling
527+ for raw_file , value in oversampling .items ():
528+ heatmap_dict [raw_file ]["MS2 OverSampling" ] = np .minimum (1.0 , (value ["1" ] / 100 ))
529+
530+ # Sort the xnames
531+ heatmap_xname_order = [
532+ "Contaminants" ,
533+ "Peptide Intensity" ,
534+ "Charge" ,
535+ "Missed Cleavages" ,
536+ "Missed Cleavages Var" ,
537+ "ID rate over RT" ,
538+ "MS2 OverSampling" ,
539+ "Pep Missing Values" ,
540+ ]
541+
542+ for raw_file in heatmap_charge .keys ():
543+ heatmap_dict [raw_file ] = {
544+ key : heatmap_dict [raw_file ][key ]
545+ for key in heatmap_xname_order
546+ if key in heatmap_dict [raw_file ].keys ()
547+ }
548+
549+ return heatmap_dict
550+
551+
445552# 3-1. evidence.txt: Top Contaminants per Raw file
446553def evidence_top_contaminants (evidence_df , top_n ):
447554 if any (
@@ -450,6 +557,10 @@ def evidence_top_contaminants(evidence_df, top_n):
450557 ):
451558 return None
452559
560+ if evidence_df [evidence_df ["Potential contaminant" ] == "+" ].empty :
561+ logger .info ("The evidence.txt file does not contain any contaminants" )
562+ return None
563+
453564 evidence_data = evidence_df .copy ()
454565
455566 if "Protein Names" in evidence_data .columns :
0 commit comments