Skip to content

Commit 1a5bc9d

Browse files
authored
Merge pull request #600 from bigbio/dev
Bug fixed for DIANN
2 parents bd6f9b3 + bd8a441 commit 1a5bc9d

File tree

6 files changed

+132
-30
lines changed

6 files changed

+132
-30
lines changed

environment.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ dependencies:
99
- pandas>=1.5
1010
- pyteomics
1111
- pyopenms<=3.4.0
12-
- sdrf-pipelines>=0.0.32
12+
- sdrf-pipelines==0.0.33
1313
- lxml
1414
- numpy>=1.23
1515
- pyarrow
@@ -19,4 +19,4 @@ dependencies:
1919
- requests
2020
- redis
2121
- statsmodels
22-
- urllib3>=2.6.1
22+
- urllib3>=2.6.1

pmultiqc/modules/common/dia_utils.py

Lines changed: 121 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -60,10 +60,14 @@ def parse_diann_report(
6060
# Process statistics and modifications
6161
total_protein_quantified, total_peptide_count, pep_plot = _process_diann_statistics(report_data)
6262
peptide_search_score = _process_peptide_search_scores(report_data)
63-
_process_modifications(report_data)
63+
modifications_ok = _process_modifications(report_data)
6464

65-
# Process run-specific data
66-
cal_num_table_data = _process_run_data(report_data, ms_with_psm, quantms_modified, file_df)
65+
# Process run-specific data (requires Modifications column from _process_modifications)
66+
if modifications_ok:
67+
cal_num_table_data = _process_run_data(report_data, ms_with_psm, quantms_modified, file_df)
68+
else:
69+
log.warning("Skipping run data processing due to missing modifications data")
70+
cal_num_table_data = {"sdrf_samples": {}, "ms_runs": {}}
6771

6872
# Handle files without PSM
6973
ms_without_psm = _handle_files_without_psm(ms_paths, ms_with_psm, cal_num_table_data)
@@ -155,6 +159,10 @@ def _draw_heatmap(sub_section, report_data, heatmap_color_list):
155159

156160
def _process_diann_statistics(report_data):
157161
"""Process DIA-NN statistics and create peptide plot."""
162+
required_cols = ["Protein.Group", "Modified.Sequence"]
163+
if not all(col in report_data.columns for col in required_cols):
164+
log.warning(f"Missing required columns for statistics: {[c for c in required_cols if c not in report_data.columns]}")
165+
return 0, 0, None
158166

159167
total_protein_quantified = len(set(report_data["Protein.Group"]))
160168
total_peptide_count = len(set(report_data["Modified.Sequence"]))
@@ -180,6 +188,11 @@ def _process_diann_statistics(report_data):
180188

181189
def _process_peptide_search_scores(report_data):
182190
"""Process peptide search scores."""
191+
required_cols = ["Modified.Sequence", "Q.Value"]
192+
if not all(col in report_data.columns for col in required_cols):
193+
log.warning(f"Missing required columns for peptide search scores: {[c for c in required_cols if c not in report_data.columns]}")
194+
return {}
195+
183196
log.info("Processing DIA peptide_search_score.")
184197
peptide_search_score = dict()
185198
pattern = re.compile(r"\((.*?)\)")
@@ -200,6 +213,10 @@ def _process_peptide_search_scores(report_data):
200213

201214
def _process_modifications(report_data):
202215
"""Process modifications in the report data."""
216+
if "Modified.Sequence" not in report_data.columns:
217+
log.warning("Missing Modified.Sequence column for modifications processing")
218+
return False
219+
203220
log.info("Processing DIA Modifications.")
204221
mod_pattern = re.compile(r"\((.*?)\)")
205222
unimod_data = UnimodDatabase()
@@ -217,16 +234,21 @@ def find_diann_modified(peptide):
217234
return None
218235

219236
report_data["Modifications"] = report_data["Modified.Sequence"].apply(find_diann_modified)
237+
return True
220238

221239

222240
def _process_run_data(df, ms_with_psm, quantms_modified, sdrf_file_df):
223241
"""
224242
Process run-specific data including modifications and statistics.
225243
"""
244+
required_cols = ["Run", "Modified.Sequence", "Modifications", "Protein.Group"]
245+
missing_cols = [col for col in required_cols if col not in df.columns]
246+
if missing_cols:
247+
log.warning(f"Missing required columns for run data processing: {missing_cols}")
248+
return {"sdrf_samples": {}, "ms_runs": {}}
226249

227250
log.info("Processing DIA mod_plot_dict.")
228251

229-
required_cols = ["Run", "Modified.Sequence", "Modifications", "Protein.Group"]
230252
report_data = df[required_cols].copy()
231253
if "Proteotypic" in df.columns:
232254
report_data["Proteotypic"] = df["Proteotypic"]
@@ -425,6 +447,11 @@ def draw_dia_rt_qc(sub_section, report_df):
425447

426448
# DIA-NN: IDs over RT
427449
def draw_dia_ids_rt(sub_section, report_df):
450+
required_cols = ["Run", "RT"]
451+
if not all(col in report_df.columns for col in required_cols):
452+
log.warning(f"Missing required columns for IDs over RT plot: {[c for c in required_cols if c not in report_df.columns]}")
453+
return
454+
428455
rt_df = report_df[["Run", "RT"]].copy()
429456
rt_df.rename(columns={"Run": "raw file", "RT": "retention time"}, inplace=True)
430457
ids_over_rt = evidence_rt_count(rt_df)
@@ -437,13 +464,19 @@ def draw_diann_quant_table(sub_section, diann_report, sample_df, file_df):
437464
peptides_table, peptides_headers = create_peptides_table(
438465
diann_report, sample_df, file_df
439466
)
440-
draw_peptides_table(sub_section, peptides_table, peptides_headers, "DIA-NN")
467+
if peptides_table is not None and peptides_headers is not None:
468+
draw_peptides_table(sub_section, peptides_table, peptides_headers, "DIA-NN")
469+
else:
470+
log.warning("Skipping peptides quantification table due to missing data")
441471

442472
# Protein Quantification Table
443473
protein_table, protein_headers = create_protein_table(
444474
diann_report, sample_df, file_df
445475
)
446-
draw_protein_table(sub_section, protein_table, protein_headers, "DIA-NN")
476+
if protein_table is not None and protein_headers is not None:
477+
draw_protein_table(sub_section, protein_table, protein_headers, "DIA-NN")
478+
else:
479+
log.warning("Skipping protein quantification table due to missing data")
447480

448481

449482
# Draw: Peptides Quantification Table
@@ -727,9 +760,29 @@ def _prepare_quant_table_data(report_df):
727760
Common preprocessing for quantification table creation.
728761
729762
Returns:
730-
pd.DataFrame: Preprocessed report data with positive Precursor.Normalised values.
763+
pd.DataFrame: Preprocessed report data with positive intensity values,
764+
or None if required columns are missing.
731765
"""
732-
report_data = report_df[report_df["Precursor.Normalised"] > 0].copy()
766+
# Check for required columns
767+
required_cols = ["Protein.Names", "Stripped.Sequence"]
768+
missing_cols = [col for col in required_cols if col not in report_df.columns]
769+
if missing_cols:
770+
log.warning(f"Missing required columns for quantification table: {missing_cols}")
771+
return None
772+
773+
# Use Precursor.Normalised if available, otherwise fall back to Precursor.Quantity
774+
if "Precursor.Normalised" in report_df.columns:
775+
intensity_col = "Precursor.Normalised"
776+
elif "Precursor.Quantity" in report_df.columns:
777+
intensity_col = "Precursor.Quantity"
778+
log.info("Using Precursor.Quantity as fallback (Precursor.Normalised not available)")
779+
else:
780+
log.warning("Neither Precursor.Normalised nor Precursor.Quantity found. Skipping quantification table.")
781+
return None
782+
783+
report_data = report_df[report_df[intensity_col] > 0].copy()
784+
# Store which intensity column is being used for downstream functions
785+
report_data.attrs["intensity_col"] = intensity_col
733786
return drop_empty_row(report_data, ["Protein.Names", "Stripped.Sequence"])
734787

735788

@@ -743,6 +796,18 @@ def _merge_condition_data(report_data, sample_df, file_df):
743796
if sample_df.empty or file_df.empty:
744797
return None, []
745798

799+
# Get the intensity column used (stored by _prepare_quant_table_data)
800+
intensity_col = report_data.attrs.get("intensity_col", "Precursor.Normalised")
801+
if intensity_col not in report_data.columns:
802+
# Fallback check
803+
if "Precursor.Normalised" in report_data.columns:
804+
intensity_col = "Precursor.Normalised"
805+
elif "Precursor.Quantity" in report_data.columns:
806+
intensity_col = "Precursor.Quantity"
807+
else:
808+
log.warning("No intensity column found for condition data merge")
809+
return None, []
810+
746811
sample_cond_df = pd.merge(
747812
sample_df[["Sample", "MSstats_Condition"]],
748813
file_df[["Sample", "Spectra_Filepath"]],
@@ -752,10 +817,12 @@ def _merge_condition_data(report_data, sample_df, file_df):
752817
sample_cond_df["Run"] = sample_cond_df["Spectra_Filepath"].str.rsplit(".", n=1).str[0]
753818

754819
cond_report_data = pd.merge(
755-
report_data[["Stripped.Sequence", "Protein.Names", "Precursor.Normalised", "Run"]],
820+
report_data[["Stripped.Sequence", "Protein.Names", intensity_col, "Run"]],
756821
sample_cond_df[["Run", "MSstats_Condition"]].drop_duplicates(),
757822
on="Run",
758823
)
824+
# Store intensity column for downstream use
825+
cond_report_data.attrs["intensity_col"] = intensity_col
759826

760827
unique_conditions = sample_df["MSstats_Condition"].drop_duplicates().tolist()
761828
return cond_report_data, unique_conditions
@@ -773,18 +840,34 @@ def _add_condition_headers(headers, conditions):
773840

774841
# DIA-NN: Peptides Quantification Table
775842
def create_peptides_table(report_df, sample_df, file_df):
776-
"""Create peptides quantification table from DIA-NN report."""
843+
"""Create peptides quantification table from DIA-NN report.
844+
845+
Returns:
846+
tuple: (table_dict, headers) or (None, None) if required columns are missing.
847+
"""
777848
report_data = _prepare_quant_table_data(report_df)
778-
report_data["BestSearchScore"] = 1 - report_data["Q.Value"]
849+
if report_data is None or report_data.empty:
850+
log.warning("Cannot create peptides table: missing required data")
851+
return None, None
852+
853+
# Get the intensity column being used
854+
intensity_col = report_data.attrs.get("intensity_col", "Precursor.Normalised")
855+
856+
# Check for Q.Value column for search score
857+
has_qvalue = "Q.Value" in report_data.columns
858+
if has_qvalue:
859+
report_data["BestSearchScore"] = 1 - report_data["Q.Value"]
779860

780861
table_dict = {}
781862
for sequence_protein, group in report_data.groupby(["Stripped.Sequence", "Protein.Names"]):
782-
table_dict[sequence_protein] = {
863+
entry = {
783864
"ProteinName": sequence_protein[1],
784865
"PeptideSequence": sequence_protein[0],
785-
"BestSearchScore": group["BestSearchScore"].min(),
786-
"Average Intensity": np.log10(group["Precursor.Normalised"].mean()),
866+
"Average Intensity": np.log10(group[intensity_col].mean()),
787867
}
868+
if has_qvalue:
869+
entry["BestSearchScore"] = group["BestSearchScore"].min()
870+
table_dict[sequence_protein] = entry
788871

789872
headers = {
790873
"ProteinName": {
@@ -793,24 +876,27 @@ def create_peptides_table(report_df, sample_df, file_df):
793876
"minrange": "200",
794877
},
795878
"PeptideSequence": {"title": "Peptide Sequence"},
796-
"BestSearchScore": {"title": "Best Search Score", "format": "{:,.4f}"},
797879
"Average Intensity": {
798880
"title": "Average Intensity",
799881
"description": "Average intensity across all conditions",
800882
"format": "{:,.4f}",
801883
},
802884
}
885+
if has_qvalue:
886+
headers["BestSearchScore"] = {"title": "Best Search Score", "format": "{:,.4f}"}
803887

804888
cond_report_data, unique_conditions = _merge_condition_data(report_data, sample_df, file_df)
805-
if cond_report_data is not None:
889+
if cond_report_data is not None and not cond_report_data.empty:
890+
cond_intensity_col = cond_report_data.attrs.get("intensity_col", intensity_col)
806891
for sequence_protein, group in cond_report_data.groupby(
807892
["Stripped.Sequence", "Protein.Names"]
808893
):
809894
condition_data = {
810-
str(cond): np.log10(sub_group["Precursor.Normalised"].mean())
895+
str(cond): np.log10(sub_group[cond_intensity_col].mean())
811896
for cond, sub_group in group.groupby("MSstats_Condition")
812897
}
813-
table_dict[sequence_protein].update(condition_data)
898+
if sequence_protein in table_dict:
899+
table_dict[sequence_protein].update(condition_data)
814900

815901
_add_condition_headers(headers, unique_conditions)
816902

@@ -820,15 +906,25 @@ def create_peptides_table(report_df, sample_df, file_df):
820906

821907
# DIA-NN: Protein Quantification Table
822908
def create_protein_table(report_df, sample_df, file_df):
823-
"""Create protein quantification table from DIA-NN report."""
909+
"""Create protein quantification table from DIA-NN report.
910+
911+
Returns:
912+
tuple: (table_dict, headers) or (None, None) if required columns are missing.
913+
"""
824914
report_data = _prepare_quant_table_data(report_df)
915+
if report_data is None or report_data.empty:
916+
log.warning("Cannot create protein table: missing required data")
917+
return None, None
918+
919+
# Get the intensity column being used
920+
intensity_col = report_data.attrs.get("intensity_col", "Precursor.Normalised")
825921

826922
table_dict = {}
827923
for protein_name, group in report_data.groupby("Protein.Names"):
828924
table_dict[protein_name] = {
829925
"ProteinName": protein_name,
830926
"Peptides_Number": group["Stripped.Sequence"].nunique(),
831-
"Average Intensity": np.log10(group["Precursor.Normalised"].mean()),
927+
"Average Intensity": np.log10(group[intensity_col].mean()),
832928
}
833929

834930
headers = {
@@ -849,13 +945,15 @@ def create_protein_table(report_df, sample_df, file_df):
849945
}
850946

851947
cond_report_data, unique_conditions = _merge_condition_data(report_data, sample_df, file_df)
852-
if cond_report_data is not None:
948+
if cond_report_data is not None and not cond_report_data.empty:
949+
cond_intensity_col = cond_report_data.attrs.get("intensity_col", intensity_col)
853950
for protein_name, group in cond_report_data.groupby("Protein.Names"):
854951
condition_data = {
855-
str(cond): np.log10(sub_group["Precursor.Normalised"].mean())
952+
str(cond): np.log10(sub_group[cond_intensity_col].mean())
856953
for cond, sub_group in group.groupby("MSstats_Condition")
857954
}
858-
table_dict[protein_name].update(condition_data)
955+
if protein_name in table_dict:
956+
table_dict[protein_name].update(condition_data)
859957

860958
_add_condition_headers(headers, unique_conditions)
861959

pmultiqc/modules/common/plots/dia.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,7 @@ def draw_dia_whole_exp_charge(sub_section, df):
194194
draw_config = {
195195
"id": "distribution_of_precursor_charges",
196196
"cpswitch": True,
197+
"cpswitch_c_active": False,
197198
"title": "Distribution of Precursor Charges",
198199
"tt_decimals": 0,
199200
"ylab": "Count",

pmultiqc/modules/common/plots/ms.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,8 @@ def draw_peak_intensity_distribution(
131131
pconfig = {
132132
"id": "peak_intensity_distribution",
133133
"title": "Peak Intensity Distribution",
134-
"cpswitch": False,
134+
"cpswitch": True,
135+
"cpswitch_c_active": False,
135136
"stacking": "group",
136137
"logswitch": True,
137138
"logswitch_active": True,
@@ -164,6 +165,7 @@ def draw_precursor_charge_distribution(sub_sections, charge_plot=None, ms_info=N
164165
"id": "distribution_of_precursor_charges",
165166
"title": "Distribution of Precursor Charges",
166167
"cpswitch": True,
168+
"cpswitch_c_active": False,
167169
"tt_decimals": 0,
168170
"ylab": "Count",
169171
"save_data_file": False,
@@ -183,7 +185,8 @@ def draw_precursor_charge_distribution(sub_sections, charge_plot=None, ms_info=N
183185
def draw_peaks_per_ms2(sub_sections, peaks_ms2_plot, ms_info):
184186
pconfig = {
185187
"id": "peaks_per_ms2",
186-
"cpswitch": False,
188+
"cpswitch": True,
189+
"cpswitch_c_active": False,
187190
"title": "Number of Peaks per MS/MS spectrum",
188191
"stacking": "group",
189192
"logswitch": True,

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ multiqc = ">=1.29, <=1.33"
4545
pandas = ">=1.5"
4646
pyteomics = "*"
4747
pyopenms = "<=3.4.0"
48-
sdrf-pipelines = ">=0.0.32"
48+
sdrf-pipelines = "0.0.33"
4949
lxml = "*"
5050
numpy = ">=1.23"
5151
pyarrow = "*"

requirements.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,10 @@ multiqc>=1.29, <=1.33
22
pandas>=1.5
33
pyteomics
44
pyopenms<=3.4.0
5-
sdrf-pipelines>=0.0.32
5+
sdrf-pipelines==0.0.33
66
lxml
77
numpy>=1.23
88
pyarrow
99
scikit-learn>=1.2
1010
statsmodels
11-
urllib3>=2.6.1
11+
urllib3>=2.6.1

0 commit comments

Comments
 (0)