Skip to content

Commit dfb0d98

Browse files
authored
Merge pull request #524 from bigbio/dev
Dev
2 parents fd4d20a + 02d435b commit dfb0d98

File tree

7 files changed

+149
-96
lines changed

7 files changed

+149
-96
lines changed

pmultiqc/modules/common/common_utils.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -329,16 +329,16 @@ def hist_compute(rt_list, rt_range):
329329

330330
def evidence_calibrated_mass_error(
331331
evidence_data,
332-
recommpute=False,
332+
recompute=False,
333333
filter_outliers_ppm: bool = False
334334
):
335335
# filter_outliers_ppm (if True): Remove rows with mass error [ppm] greater than 1000 (Default: False)
336336

337337
if "potential contaminant" in evidence_data.columns:
338338
evidence_data = evidence_data[evidence_data["potential contaminant"] != "+"].copy()
339339

340-
if recommpute:
341-
evd_df = recommpute_mass_error(evidence_data)
340+
if recompute:
341+
evd_df = recompute_mass_error(evidence_data)
342342
else:
343343
evd_df = evidence_data.copy()
344344

@@ -395,7 +395,7 @@ def evidence_calibrated_mass_error(
395395
return result_dict
396396

397397
# re-compute mass error
398-
def recommpute_mass_error(evidence_df):
398+
def recompute_mass_error(evidence_df):
399399
required_cols = [
400400
"mass error [ppm]",
401401
"uncalibrated mass error [ppm]",
@@ -407,7 +407,7 @@ def recommpute_mass_error(evidence_df):
407407
]
408408

409409
if not all(col in evidence_df.columns for col in required_cols):
410-
log.info("Evidence is missing one or more required columns in recommpute_mass_error.")
410+
log.info("Evidence is missing one or more required columns in recompute_mass_error.")
411411
return None
412412

413413
df = evidence_df[required_cols].copy()

pmultiqc/modules/common/dia_utils.py

Lines changed: 69 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -705,15 +705,62 @@ def cal_rt_irt_loess(report_df, frac=0.3, data_bins: int = DEFAULT_BINS):
705705
return plot_dict
706706

707707

708-
# DIA-NN: Peptides Quantification Table
709-
def create_peptides_table(report_df, sample_df, file_df):
710-
# Validation: remove rows with 0 or NA Precursor.Normalised values
708+
def _prepare_quant_table_data(report_df):
    """
    Shared preprocessing step for the quantification table builders.

    Parameters:
    -----------
    report_df : pd.DataFrame
        Report table; must contain a "Precursor.Normalised" column.

    Returns:
    --------
    pd.DataFrame: Copy of the rows whose "Precursor.Normalised" is strictly
        positive, passed through drop_empty_row for "Protein.Names" and
        "Stripped.Sequence".
    """
    # The comparison is False for 0 and for NA, so both are filtered out here.
    has_positive_quant = report_df["Precursor.Normalised"] > 0
    positive_rows = report_df[has_positive_quant].copy()

    # NOTE(review): drop_empty_row is defined elsewhere; presumably it removes
    # rows with empty values in the listed columns - confirm against the helper.
    return drop_empty_row(positive_rows, ["Protein.Names", "Stripped.Sequence"])
717+
718+
719+
def _merge_condition_data(report_data, sample_df, file_df):
720+
"""
721+
Merge report data with condition information from sample/file DataFrames.
722+
723+
Returns:
724+
tuple: (merged DataFrame with condition info, list of unique conditions) or (None, [])
725+
"""
726+
if sample_df.empty or file_df.empty:
727+
return None, []
728+
729+
sample_cond_df = pd.merge(
730+
sample_df[["Sample", "MSstats_Condition"]],
731+
file_df[["Sample", "Spectra_Filepath"]],
732+
on="Sample",
733+
)
734+
# Vectorized path splitting (more efficient than apply with lambda)
735+
sample_cond_df["Run"] = sample_cond_df["Spectra_Filepath"].str.rsplit(".", n=1).str[0]
713736

737+
cond_report_data = pd.merge(
738+
report_data[["Stripped.Sequence", "Protein.Names", "Precursor.Normalised", "Run"]],
739+
sample_cond_df[["Run", "MSstats_Condition"]].drop_duplicates(),
740+
on="Run",
741+
)
742+
743+
unique_conditions = sample_df["MSstats_Condition"].drop_duplicates().tolist()
744+
return cond_report_data, unique_conditions
745+
746+
747+
def _add_condition_headers(headers, conditions):
748+
"""Add condition-based headers to the headers dictionary."""
749+
for exp_condition in conditions:
750+
headers[str(exp_condition)] = {
751+
"title": str(exp_condition),
752+
"description": "MSstats Condition",
753+
"format": "{:,.4f}",
754+
}
755+
756+
757+
# DIA-NN: Peptides Quantification Table
758+
def create_peptides_table(report_df, sample_df, file_df):
759+
"""Create peptides quantification table from DIA-NN report."""
760+
report_data = _prepare_quant_table_data(report_df)
714761
report_data["BestSearchScore"] = 1 - report_data["Q.Value"]
715762

716-
table_dict = dict()
763+
table_dict = {}
717764
for sequence_protein, group in report_data.groupby(["Stripped.Sequence", "Protein.Names"]):
718765
table_dict[sequence_protein] = {
719766
"ProteinName": sequence_protein[1],
@@ -737,52 +784,29 @@ def create_peptides_table(report_df, sample_df, file_df):
737784
},
738785
}
739786

740-
if not sample_df.empty and not file_df.empty:
741-
742-
sample_cond_df = pd.merge(
743-
sample_df[["Sample", "MSstats_Condition"]],
744-
file_df[["Sample", "Spectra_Filepath"]],
745-
on="Sample",
746-
)
747-
sample_cond_df["Run"] = sample_cond_df["Spectra_Filepath"].apply(
748-
lambda x: os.path.splitext(x)[0]
749-
)
750-
751-
cond_report_data = pd.merge(
752-
report_data[["Stripped.Sequence", "Protein.Names", "Precursor.Normalised", "Run"]],
753-
sample_cond_df[["Run", "MSstats_Condition"]].drop_duplicates(),
754-
on="Run",
755-
)
756-
787+
cond_report_data, unique_conditions = _merge_condition_data(report_data, sample_df, file_df)
788+
if cond_report_data is not None:
757789
for sequence_protein, group in cond_report_data.groupby(
758790
["Stripped.Sequence", "Protein.Names"]
759791
):
760-
761-
condition_data = dict()
762-
for condition, sub_group in group.groupby("MSstats_Condition"):
763-
condition_data[str(condition)] = np.log10(sub_group["Precursor.Normalised"].mean())
764-
792+
condition_data = {
793+
str(cond): np.log10(sub_group["Precursor.Normalised"].mean())
794+
for cond, sub_group in group.groupby("MSstats_Condition")
795+
}
765796
table_dict[sequence_protein].update(condition_data)
766797

767-
for exp_condition in sample_df["MSstats_Condition"].drop_duplicates():
768-
headers[str(exp_condition)] = {
769-
"title": str(exp_condition),
770-
"description": "MSstats Condition",
771-
"format": "{:,.4f}",
772-
}
798+
_add_condition_headers(headers, unique_conditions)
773799

774800
result_dict = {i: v for i, (_, v) in enumerate(table_dict.items(), start=1)}
775-
776801
return result_dict, headers
777802

778803

779804
# DIA-NN: Protein Quantification Table
780805
def create_protein_table(report_df, sample_df, file_df):
781-
# Validation: remove rows with 0 or NA Precursor.Normalised values
782-
report_data = report_df[report_df["Precursor.Normalised"] > 0].copy()
783-
report_data = drop_empty_row(report_data, ["Protein.Names", "Stripped.Sequence"])
806+
"""Create protein quantification table from DIA-NN report."""
807+
report_data = _prepare_quant_table_data(report_df)
784808

785-
table_dict = dict()
809+
table_dict = {}
786810
for protein_name, group in report_data.groupby("Protein.Names"):
787811
table_dict[protein_name] = {
788812
"ProteinName": protein_name,
@@ -807,40 +831,18 @@ def create_protein_table(report_df, sample_df, file_df):
807831
},
808832
}
809833

810-
if not sample_df.empty and not file_df.empty:
811-
812-
sample_cond_df = pd.merge(
813-
sample_df[["Sample", "MSstats_Condition"]],
814-
file_df[["Sample", "Spectra_Filepath"]],
815-
on="Sample",
816-
)
817-
sample_cond_df["Run"] = sample_cond_df["Spectra_Filepath"].apply(
818-
lambda x: os.path.splitext(x)[0]
819-
)
820-
821-
cond_report_data = pd.merge(
822-
report_data[["Stripped.Sequence", "Protein.Names", "Precursor.Normalised", "Run"]],
823-
sample_cond_df[["Run", "MSstats_Condition"]].drop_duplicates(),
824-
on="Run",
825-
)
826-
834+
cond_report_data, unique_conditions = _merge_condition_data(report_data, sample_df, file_df)
835+
if cond_report_data is not None:
827836
for protein_name, group in cond_report_data.groupby("Protein.Names"):
828-
829-
condition_data = dict()
830-
for condition, sub_group in group.groupby("MSstats_Condition"):
831-
condition_data[str(condition)] = np.log10(sub_group["Precursor.Normalised"].mean())
832-
837+
condition_data = {
838+
str(cond): np.log10(sub_group["Precursor.Normalised"].mean())
839+
for cond, sub_group in group.groupby("MSstats_Condition")
840+
}
833841
table_dict[protein_name].update(condition_data)
834842

835-
for exp_condition in sample_df["MSstats_Condition"].drop_duplicates():
836-
headers[str(exp_condition)] = {
837-
"title": str(exp_condition),
838-
"description": "MSstats Condition",
839-
"format": "{:,.4f}",
840-
}
843+
_add_condition_headers(headers, unique_conditions)
841844

842845
result_dict = {i: v for i, (_, v) in enumerate(table_dict.items(), start=1)}
843-
844846
return result_dict, headers
845847

846848

pmultiqc/modules/common/ms/msinfo.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -139,11 +139,11 @@ def parse(self, **_kwargs) -> None:
139139
self.enable_dia,
140140
)
141141

142-
for m in mzml_table.keys():
143-
if mzml_table[m]["MS2_Num"] > 0:
144-
heatmap_charge[m] = mzml_table[m]["Charge_2"] / mzml_table[m]["MS2_Num"]
145-
else:
146-
heatmap_charge[m] = 0
142+
# Calculate heatmap_charge only for current file (not all files in each iteration)
143+
if mzml_table[m_name]["MS2_Num"] > 0:
144+
heatmap_charge[m_name] = mzml_table[m_name]["Charge_2"] / mzml_table[m_name]["MS2_Num"]
145+
else:
146+
heatmap_charge[m_name] = 0
147147

148148
self.log.info(
149149
"{}: Done aggregating ms_statistics dataframe {}...".format(

pmultiqc/modules/common/plots/dia.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -574,7 +574,21 @@ def draw_loess_rt_irt(sub_section, plot_data):
574574
)
575575

576576
def calculate_dia_intensity_std(df, sdrf_file_df):
577-
577+
"""
578+
Calculate standard deviation of intensity for DIA data.
579+
580+
Parameters:
581+
-----------
582+
df : pd.DataFrame
583+
DataFrame with Run, Modified.Sequence, Protein.Group, and log_intensity columns.
584+
sdrf_file_df : pd.DataFrame
585+
SDRF file DataFrame (can be empty).
586+
587+
Returns:
588+
--------
589+
dict or None: Dictionary mapping sample/condition to list of log intensity std values,
590+
or None if calculation cannot be performed.
591+
"""
578592
df_sub = df[["Run", "Modified.Sequence", "Protein.Group", "log_intensity"]].copy()
579593

580594
if not sdrf_file_df.empty:
@@ -617,8 +631,9 @@ def calculate_dia_intensity_std(df, sdrf_file_df):
617631
}
618632

619633
return plot_data
620-
else:
621-
log.warning("No SDRF available; failed to parse experimental groups; SD Intensity not generated.")
634+
635+
log.warning("No SDRF available; failed to parse experimental groups; SD Intensity not generated.")
636+
return None
622637

623638
def extract_condition_and_replicate(run_name):
624639

pmultiqc/modules/common/plots/general.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,9 @@ def draw_exp_design(sub_sections, exp_design):
197197
"description": "",
198198
"scale": False,
199199
}}
200-
for k, _ in condition_split(sample_df_slice["MSstats_Condition"].iloc[0]).items():
200+
# Use first row of sample_df for condition keys (safer than relying on loop variable)
201+
first_condition = sample_df["MSstats_Condition"].iloc[0] if not sample_df.empty else ""
202+
for k, _ in condition_split(first_condition).items():
201203
headers["MSstats_Condition_" + str(k)] = {
202204
"title": "MSstats Condition: " + str(k),
203205
"description": "",

pmultiqc/modules/common/stats.py

Lines changed: 47 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,25 @@ def nanmedian(values: np.ndarray, all_nan_fallback: np.float64) -> np.float64:
2525

2626
def qual_uniform(group_df_rt):
2727
"""
28+
Calculate quality score based on uniformity of retention time distribution.
29+
2830
Parameters:
2931
-----------
3032
group_df_rt: group["Retention time"] or group["retention_time"]
3133
34+
Returns:
35+
--------
36+
float: Quality score between 0 and 1, where 1 indicates perfect uniformity.
3237
"""
33-
x = group_df_rt / np.nansum(group_df_rt)
3438
n = group_df_rt.notna().sum()
39+
if n == 0:
40+
return 0.0
41+
42+
total_sum = np.nansum(group_df_rt)
43+
if total_sum == 0:
44+
return 0.0
45+
46+
x = group_df_rt / total_sum
3547
y = np.nansum(x) / n
3648
worst = ((1 - y) ** 0.5) * 1 / n + (y**0.5) * (n - 1) / n
3749
sc = np.sum(np.abs(x - y) ** 0.5) / n
@@ -40,25 +52,47 @@ def qual_uniform(group_df_rt):
4052
return result
4153

4254

43-
def cal_delta_mass_dict(df, col, num_bins: int = 1000):
    """
    Calculate delta mass distribution as counts and frequencies.

    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame containing the mass delta column.
    col : str
        Name of the column containing mass delta values.
    num_bins : int, optional
        Number of equal-width bins for the histogram (default: 1000).

    Returns:
    --------
    dict: Dictionary with 'count' and 'frequency' keys. Each maps the bin
        midpoint (float) to the number of values in that bin (int) or to
        that number divided by the total number of binned values (float).
        Both mappings are empty when the column has no non-missing values.
    """
    values = df[col]

    # Binning an empty or all-NaN column raises ValueError inside pandas, so a
    # check made after value_counts() can never run; guard up front instead.
    if values.notna().sum() == 0:
        return {"count": {}, "frequency": {}}

    # Compute the binned counts once; frequencies are derived from them.
    count_bin = values.value_counts(sort=False, bins=num_bins)

    count_bin_data = {
        float(interval.mid): int(count)
        for interval, count in count_bin.items()
    }

    # Non-zero here: at least one non-missing value was binned.
    total_count = sum(count_bin_data.values())
    frequency_bin_data = {
        mid: float(count / total_count)
        for mid, count in count_bin_data.items()
    }

    return {
        "count": count_bin_data,
        "frequency": frequency_bin_data,
    }
61-
6296

6397
def cal_hm_charge(df: pd.DataFrame, run_col: str, charge_col: str):
6498

pmultiqc/modules/maxquant/maxquant_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
from pmultiqc.modules.common.common_utils import (
2020
mods_statistics,
2121
evidence_rt_count,
22-
recommpute_mass_error,
22+
recompute_mass_error,
2323
evidence_calibrated_mass_error
2424
)
2525

@@ -910,7 +910,7 @@ def evidence_uncalibrated_mass_error(evidence_data):
910910
if "potential contaminant" in evidence_data.columns:
911911
evidence_data = evidence_data[evidence_data["potential contaminant"] != "+"].copy()
912912

913-
evd_df = recommpute_mass_error(evidence_data)
913+
evd_df = recompute_mass_error(evidence_data)
914914

915915
if evd_df is None:
916916
if any(

0 commit comments

Comments
 (0)