Merged
Binary file removed MultiQC_logo.png
5 changes: 3 additions & 2 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ channels:
- defaults
dependencies:
- python>=3.10
- multiqc>=1.29, <=1.32
- multiqc>=1.29, <=1.33
- pandas>=1.5
- pyteomics
- pyopenms<=3.4.0
Expand All @@ -18,4 +18,5 @@ dependencies:
- uvicorn
- requests
- redis
- statsmodels
- statsmodels
- urllib3>=2.6.1
Binary file added pmultiqc/images/pmultiqc_logo_dark.png
9 changes: 9 additions & 0 deletions pmultiqc/main.py
Expand Up @@ -37,11 +37,20 @@ def pmultiqc_plugin_execution_start():
# logo
current_dir = Path(__file__).parent.resolve()
pmultiqc_logo = current_dir / "images" / "pmultiqc_logo_report.png"
pmultiqc_logo_dark = current_dir / "images" / "pmultiqc_logo_dark.png"

if pmultiqc_logo.exists():
config.custom_logo = str(pmultiqc_logo)
config.custom_logo_url = "https://github.com/bigbio/pmultiqc"
config.custom_logo_title = "pmultiqc"

# Supported starting from MultiQC v1.33
if hasattr(config, "custom_logo_dark"):
config.custom_logo_dark = str(pmultiqc_logo_dark)
Comment on lines +48 to +49

⚠️ Potential issue | 🟡 Minor

Verify dark logo file existence before use.

The code checks whether pmultiqc_logo exists (line 42) but does not verify pmultiqc_logo_dark exists before setting config.custom_logo_dark. If the dark logo file is missing, this could cause issues at render time.

🔎 Proposed fix to add existence check
     # Supported starting from MultiQC v1.33
-    if hasattr(config, "custom_logo_dark"):
+    if hasattr(config, "custom_logo_dark") and pmultiqc_logo_dark.exists():
         config.custom_logo_dark = str(pmultiqc_logo_dark)
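The suggested guard can be exercised in isolation. `Config` below is a stand-in for MultiQC's `config` module and the path is illustrative; the point is that the attribute check and the file check must both pass before the assignment:

```python
from pathlib import Path

class Config:
    """Stand-in for multiqc's config module (illustrative)."""
    custom_logo_dark = None

config = Config()
pmultiqc_logo_dark = Path("images") / "pmultiqc_logo_dark.png"  # illustrative path

# Set the dark logo only when the config attribute is supported
# (MultiQC >= 1.33) AND the file actually exists on disk.
if hasattr(config, "custom_logo_dark") and pmultiqc_logo_dark.exists():
    config.custom_logo_dark = str(pmultiqc_logo_dark)

print(config.custom_logo_dark)  # None when the file is absent
```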


if hasattr(config, "custom_logo_width"):
config.custom_logo_width = 118
Comment on lines 47 to 52

🛠️ Refactor suggestion | 🟠 Major

Combine duplicate conditional blocks.

Lines 48 and 51 both check for the same condition if hasattr(config, "custom_logo_width"). These should be merged into a single conditional block.

🔎 Proposed refactor
     # Supported starting from MultiQC v1.33
     if hasattr(config, "custom_logo_width"):
         config.custom_logo_dark = str(pmultiqc_logo_dark)
-
-    if hasattr(config, "custom_logo_width"):
         config.custom_logo_width = 118
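The merged form can be sketched as follows. `Config` is again a stand-in for MultiQC's `config` module; gating `custom_logo_dark` on the `custom_logo_width` attribute assumes both attributes appeared in the same MultiQC release:

```python
class Config:
    """Stand-in for multiqc's config module (illustrative)."""
    custom_logo_width = None  # attribute present from MultiQC v1.33 on

config = Config()
pmultiqc_logo_dark = "images/pmultiqc_logo_dark.png"  # illustrative path

# One hasattr check guards both v1.33+ settings.
if hasattr(config, "custom_logo_width"):
    config.custom_logo_dark = pmultiqc_logo_dark
    config.custom_logo_width = 118

print(config.custom_logo_width)  # 118
```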


log.info(f"pmultiqc: injected custom logo from local path: {pmultiqc_logo}")
else:
log.warning(f"pmultiqc logo file not found at: {pmultiqc_logo}")
Expand Down
135 changes: 129 additions & 6 deletions pmultiqc/modules/common/common_utils.py
Expand Up @@ -269,12 +269,17 @@ def parse_mzml(
)


def mod_group_percentage(group):
if "Modifications" in group.columns:
group.rename(columns={"Modifications": "modifications"}, inplace=True)
def mod_group_percentage(df):

counts = group["modifications"].str.split(",").explode().value_counts()
percentage_df = (counts / len(group["modifications"]) * 100).reset_index()
df_copy = df.copy()

if "Modifications" in df_copy.columns and "modifications" not in df_copy.columns:
df_copy = df_copy.rename(columns={"Modifications": "modifications"})
else:
log.warning('Detected both "Modifications" and "modifications" columns.')

Comment on lines +276 to +280

⚠️ Potential issue | 🟡 Minor

Incomplete column handling when both columns exist.

The code only renames when "Modifications" is present and "modifications" is absent; every other case falls into the else branch and logs the "both columns" warning. That warning is misleading when only "modifications" exists, and when neither column exists the later access to df_copy["modifications"] on line 281 raises a KeyError.

🔎 Suggested fix
-    if "Modifications" in df_copy.columns and "modifications" not in df_copy.columns:
-        df_copy = df_copy.rename(columns={"Modifications": "modifications"})
-    else:
-        log.warning('Detected both "Modifications" and "modifications" columns.')
+    if "Modifications" in df_copy.columns and "modifications" in df_copy.columns:
+        log.warning('Detected both "Modifications" and "modifications" columns. Using "modifications".')
+    elif "Modifications" in df_copy.columns:
+        df_copy = df_copy.rename(columns={"Modifications": "modifications"})
+    elif "modifications" not in df_copy.columns:
+        raise ValueError("Neither 'Modifications' nor 'modifications' column found in DataFrame.")
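The three-way handling in the suggested fix can be sketched on a toy frame (column contents are made up; the helper name is illustrative):

```python
import pandas as pd

def normalize_mod_column(df):
    """Ensure a single lowercase 'modifications' column (illustrative helper)."""
    has_upper = "Modifications" in df.columns
    has_lower = "modifications" in df.columns
    if has_upper and has_lower:
        # Both present: prefer non-null lowercase values, then drop the duplicate.
        df = df.copy()
        df["modifications"] = df["modifications"].fillna(df["Modifications"])
        df = df.drop(columns=["Modifications"])
    elif has_upper:
        df = df.rename(columns={"Modifications": "modifications"})
    elif not has_lower:
        raise ValueError("Neither 'Modifications' nor 'modifications' column found.")
    return df

df = pd.DataFrame({
    "Modifications": ["Oxidation", "Unmodified"],
    "modifications": [None, "Carbamidomethyl"],
})
out = normalize_mod_column(df)
print(out["modifications"].tolist())  # ['Oxidation', 'Carbamidomethyl']
```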

counts = df_copy["modifications"].str.split(",").explode().value_counts()
percentage_df = (counts / len(df_copy["modifications"]) * 100).reset_index()
percentage_df.columns = ["modifications", "percentage"]

# Modified (Total)
Expand Down Expand Up @@ -440,4 +445,122 @@ def parse_sdrf(
True,
False,
condition_config, # config.kwargs["condition"],
)
)


def cal_num_table_at_sample(file_df, data_per_run):

if file_df.empty:
return dict()

sample_file_df = file_df.copy()
sample_file_df["Sample"] = sample_file_df["Sample"].astype(int)

Comment on lines +457 to +458

⚠️ Potential issue | 🟡 Minor

Potential ValueError if Sample column contains non-numeric values.

astype(int) will raise a ValueError if the Sample column contains non-integer-convertible values. Consider adding error handling or validation.

🔎 Suggested fix
     sample_file_df = file_df.copy()
-    sample_file_df["Sample"] = sample_file_df["Sample"].astype(int)
+    try:
+        sample_file_df["Sample"] = pd.to_numeric(sample_file_df["Sample"], errors="raise").astype(int)
+    except (ValueError, TypeError) as e:
+        log.warning(f"Failed to convert Sample column to int: {e}. Returning empty dict.")
+        return dict()
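A minimal sketch of the pd.to_numeric route, showing how offending rows can be surfaced before casting (the data is made up):

```python
import pandas as pd

samples = pd.Series(["1", " 2 ", "n/a", "3"])

# errors="coerce" turns unparseable entries into NaN instead of raising.
converted = pd.to_numeric(samples.str.strip(), errors="coerce")
bad_rows = samples[converted.isna()]

if not bad_rows.empty:
    # Report (or drop) the failing rows instead of letting astype(int) raise.
    print(f"non-numeric Sample values at rows {list(bad_rows.index)}: {bad_rows.tolist()}")

clean = converted.dropna().astype(int)
print(clean.tolist())  # [1, 2, 3]
```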

cal_num_table_sample = dict()
for sample, group in sample_file_df.groupby("Sample", sort=True):
proteins_set = set()
peptides_set = set()
unique_peptides_set = set()
modified_pep_set = set()

for run in group["Run"].unique():

run_data = data_per_run.get(run)
if not run_data:
continue

proteins_set.update(run_data.get("proteins", []))
peptides_set.update(run_data.get("peptides", []))
unique_peptides_set.update(run_data.get("unique_peptides", []))
modified_pep_set.update(run_data.get("modified_peps", []))

cal_num_table_sample[str(sample)] = {
"protein_num": len(proteins_set),
"peptide_num": len(peptides_set),
"unique_peptide_num": len(unique_peptides_set),
"modified_peptide_num": len(modified_pep_set),
}

return cal_num_table_sample


def cal_msms_identified_rate(ms2_num_data, identified_data):

identified_rate = dict()
for m, msms_info in identified_data.items():
identified_ms2 = msms_info.get("Identified", 0)
all_ms2 = ms2_num_data.get(m, {}).get("MS2_Num", 0)
if all_ms2:
identified_rate[m] = {
"Identified Rate": identified_ms2 / all_ms2 * 100
}

return identified_rate


def aggregate_msms_identified_rate(
mzml_table,
identified_msms_spectra,
sdrf_file_df=None
):
identified_rate_by_run = cal_msms_identified_rate(
ms2_num_data=mzml_table,
identified_data=identified_msms_spectra
)

if sdrf_file_df is None:
return identified_rate_by_run

else:
identified_by_sample = dict()
ms2_num_by_sample = dict()

sdrf_file_df["Sample"] = sdrf_file_df["Sample"].astype(int)

for sample, group in sdrf_file_df.groupby("Sample", sort=True):
runs = group["Run"]

sample_identified_ms2 = sum(
identified_msms_spectra.get(run, {}).get("Identified", 0)
for run in runs
)
sample_all_ms2 = sum(
mzml_table.get(run, {}).get("MS2_Num", 0)
for run in runs
)

sample_key = f"Sample {str(sample)}"

identified_by_sample[sample_key] = {"Identified": sample_identified_ms2}
ms2_num_by_sample[sample_key] = {"MS2_Num": sample_all_ms2}

identified_rate_by_sample = cal_msms_identified_rate(
ms2_num_data=ms2_num_by_sample,
identified_data=identified_by_sample
)

return [identified_rate_by_run, identified_rate_by_sample]

def summarize_modifications(df):

mod_group_processed = mod_group_percentage(df)
mod_plot_dict = dict(
zip(mod_group_processed["modifications"], mod_group_processed["percentage"])
)
modified_cat = mod_group_processed["modifications"]

return mod_plot_dict, modified_cat


def group_charge(df, group_col, charge_col):

table = df.groupby([group_col, charge_col], sort=True).size().unstack(fill_value=0)
table.columns = table.columns.astype(str)

if group_col == "Sample":
table.index = [
f"Sample {str(i)}" for i in table.index
]

return table
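The new group_charge helper can be sanity-checked on a toy frame (column names mirror the diff; the data is made up):

```python
import pandas as pd

def group_charge(df, group_col, charge_col):
    # Count rows per (group, charge) and pivot charge states into columns.
    table = df.groupby([group_col, charge_col], sort=True).size().unstack(fill_value=0)
    table.columns = table.columns.astype(str)
    if group_col == "Sample":
        table.index = [f"Sample {i}" for i in table.index]
    return table

df = pd.DataFrame({"Sample": [1, 1, 2], "Charge": [2, 3, 2]})
table = group_charge(df, "Sample", "Charge")
print(table.loc["Sample 1", "2"], table.loc["Sample 2", "3"])  # 1 0
```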

111 changes: 83 additions & 28 deletions pmultiqc/modules/common/dia_utils.py
Expand Up @@ -13,7 +13,8 @@
from pmultiqc.modules.common.file_utils import file_prefix
from pmultiqc.modules.common.common_utils import (
evidence_rt_count,
mod_group_percentage
cal_num_table_at_sample,
summarize_modifications
)
from pmultiqc.modules.core.section_groups import add_sub_section
from pmultiqc.modules.common.plots.id import draw_ids_rt_count
Expand All @@ -33,7 +34,6 @@ def parse_diann_report(
sample_df,
file_df,
ms_with_psm,
cal_num_table_data,
quantms_modified,
ms_paths,
msstats_input_valid=False
Expand All @@ -53,7 +53,7 @@ def parse_diann_report(
_process_modifications(report_data)

# Process run-specific data
_process_run_data(report_data, ms_with_psm, cal_num_table_data, quantms_modified)
cal_num_table_data = _process_run_data(report_data, ms_with_psm, quantms_modified, file_df)

# Handle files without PSM
ms_without_psm = _handle_files_without_psm(ms_paths, ms_with_psm, cal_num_table_data)
Expand Down Expand Up @@ -105,15 +105,15 @@ def _draw_diann_plots(sub_sections, report_data, heatmap_color_list, sample_df,
"""Draw all DIA-NN plots."""
# Draw intensity plots and heatmap
if "Precursor.Quantity" in report_data.columns:
draw_dia_intensitys(sub_sections["quantification"], report_data)
draw_dia_intensitys(sub_sections["quantification"], report_data, file_df)
_draw_heatmap(sub_sections["summary"], report_data, heatmap_color_list)

# Draw other plots
log.info("Draw the DIA MS1 subsection.")
draw_dia_ms1(sub_sections["ms1"], report_data)

log.info("Draw the DIA MS2 subsection.")
draw_dia_ms2s(sub_sections["ms2"], report_data)
draw_dia_ms2s(sub_sections["ms2"], report_data, file_df)

log.info("Draw the DIA mass_error subsection.")
draw_dia_mass_error(sub_sections["mass_error"], report_data)
Expand Down Expand Up @@ -209,37 +209,58 @@ def find_diann_modified(peptide):
report_data["Modifications"] = report_data["Modified.Sequence"].apply(find_diann_modified)


def _process_run_data(report_data, ms_with_psm, cal_num_table_data, quantms_modified):
"""Process run-specific data including modifications and statistics."""
def _process_run_data(df, ms_with_psm, quantms_modified, sdrf_file_df):
"""
Process run-specific data including modifications and statistics.
"""

log.info("Processing DIA mod_plot_dict.")
mod_plot_dict = dict()

report_data = df[
["Run", "Modified.Sequence", "Modifications", "Protein.Group", "sequence"]
].copy()

mod_plot_by_run = dict()
modified_cats = list()

statistics_at_run = dict()
data_per_run = dict()

for run_file, group in report_data.groupby("Run"):
run_file = str(run_file)
ms_with_psm.append(run_file)

# Process modifications for this run
mod_group_processed = mod_group_percentage(group.drop_duplicates())
mod_plot_dict[run_file] = dict(
zip(mod_group_processed["modifications"], mod_group_processed["percentage"])
)
modified_cats.extend(mod_group_processed["modifications"])
mod_plot_dict, modified_cat = summarize_modifications(group.drop_duplicates())
mod_plot_by_run[run_file] = mod_plot_dict
modified_cats.extend(modified_cat)

# Calculate statistics for this run
_calculate_run_statistics(group, run_file, cal_num_table_data)
statistics_at_run[run_file], data_per_run[run_file] = _calculate_run_statistics(group)

num_table_at_sample = cal_num_table_at_sample(sdrf_file_df, data_per_run)

cal_num_table_data = {
"sdrf_samples": num_table_at_sample,
"ms_runs": statistics_at_run
}

mod_plot_by_sample = dia_sample_level_modifications(
df=report_data,
sdrf_file_df=sdrf_file_df
)

# Update quantms_modified with processed data
quantms_modified["plot_data"] = mod_plot_dict
quantms_modified["plot_data"] = [mod_plot_by_run, mod_plot_by_sample]
quantms_modified["cats"] = list(
sorted(modified_cats, key=lambda x: (x == "Modified (Total)", x))
Comment on lines +248 to 256

⚠️ Potential issue | 🟡 Minor

dia_sample_level_modifications called unconditionally may fail with empty SDRF.

The function dia_sample_level_modifications performs a merge with sdrf_file_df without checking if it's empty. If sdrf_file_df is empty, the merge will produce an empty DataFrame, and subsequent operations may not behave as expected.

🔎 Suggested fix
-    mod_plot_by_sample = dia_sample_level_modifications(
-        df=report_data,
-        sdrf_file_df=sdrf_file_df
-    )
+    if sdrf_file_df is not None and not sdrf_file_df.empty:
+        mod_plot_by_sample = dia_sample_level_modifications(
+            df=report_data,
+            sdrf_file_df=sdrf_file_df
+        )
+    else:
+        mod_plot_by_sample = {}
 
     # Update quantms_modified with processed data
-    quantms_modified["plot_data"] = [mod_plot_by_run, mod_plot_by_sample]
+    if mod_plot_by_sample:
+        quantms_modified["plot_data"] = [mod_plot_by_run, mod_plot_by_sample]
+    else:
+        quantms_modified["plot_data"] = mod_plot_by_run

)

return cal_num_table_data

def _calculate_run_statistics(group, run_file, cal_num_table_data):

def _calculate_run_statistics(group):
"""Calculate statistics for a specific run."""
cal_num_table_data[run_file] = {"protein_num": len(set(group["Protein.Group"]))}
cal_num_table_data[run_file]["peptide_num"] = len(set(group["sequence"]))

peptides = set(group["Modified.Sequence"])
modified_pep = list(
Expand All @@ -251,8 +272,21 @@ def _calculate_run_statistics(group, run_file, cal_num_table_data):
pep for pep, prots in group_peptides.items() if len(set(prots)) == 1
]

cal_num_table_data[run_file]["unique_peptide_num"] = len(unique_peptides)
cal_num_table_data[run_file]["modified_peptide_num"] = len(modified_pep)
stat_run = {
"protein_num": len(set(group["Protein.Group"])),
"peptide_num": len(set(group["sequence"])),
"unique_peptide_num": len(unique_peptides),
"modified_peptide_num": len(modified_pep)
}

data_per_run = {
"proteins": set(group["Protein.Group"]),
"peptides": set(group["sequence"]),
"unique_peptides": unique_peptides,
"modified_peps": modified_pep
}

return stat_run, data_per_run


def _handle_files_without_psm(ms_paths, ms_with_psm, cal_num_table_data):
Expand All @@ -261,7 +295,7 @@ def _handle_files_without_psm(ms_paths, ms_with_psm, cal_num_table_data):

for i in ms_without_psm:
log.warning("No PSM found in '{}'!".format(i))
cal_num_table_data[i] = {
cal_num_table_data["ms_runs"][i] = {
"protein_num": 0,
"peptide_num": 0,
"unique_peptide_num": 0,
Expand All @@ -274,14 +308,13 @@ def _handle_files_without_psm(ms_paths, ms_with_psm, cal_num_table_data):
## Removed draw_dia_heatmap wrapper; call cal_dia_heatmap and dia_plots.draw_heatmap directly.


def draw_dia_intensitys(sub_section, report_df):
def draw_dia_intensitys(sub_section, report_df, sdrf_file_df):
df_sub = report_df[report_df["Precursor.Quantity"] > 0].copy()
df_sub["log_intensity"] = np.log2(df_sub["Precursor.Quantity"])

dia_plots.draw_dia_intensity_dis(sub_section, df_sub)
dia_plots.draw_dia_intensity_dis(sub_section, df_sub, sdrf_file_df)

if dia_plots.can_groupby_for_std(report_df, "Run"):
dia_plots.draw_dia_intensity_std(sub_section, df_sub)
dia_plots.draw_dia_intensity_std(sub_section, df_sub, sdrf_file_df)


def draw_dia_ms1(sub_section, df):
Expand All @@ -293,13 +326,13 @@ def draw_dia_ms1(sub_section, df):
dia_plots.draw_dia_ms1_area(sub_section, df_sub)


def draw_dia_ms2s(sub_section, df):
def draw_dia_ms2s(sub_section, df, sdrf_file_df):
# Distribution of Precursor Charges
if "Precursor.Charge" in df.columns:
dia_plots.draw_dia_whole_exp_charge(sub_section, df)

# Charge-state of Per File
dia_plots.draw_dia_ms2_charge(sub_section, df)
dia_plots.draw_dia_ms2_charge(sub_section, df, sdrf_file_df)


def draw_dia_mass_error(sub_section, df):
Expand Down Expand Up @@ -798,4 +831,26 @@ def create_protein_table(report_df, sample_df, file_df):

result_dict = {i: v for i, (_, v) in enumerate(table_dict.items(), start=1)}

return result_dict, headers
return result_dict, headers


def dia_sample_level_modifications(df, sdrf_file_df):

report_data = df.copy()

report_data = report_data.merge(
right=sdrf_file_df[["Sample", "Run"]].drop_duplicates(),
on="Run"
)

report_data["Sample"] = report_data["Sample"].astype(int)

⚠️ Potential issue | 🟡 Minor

Potential ValueError on astype(int) conversion.

Similar to other files, if the Sample column contains non-numeric values, this will raise a ValueError. Consider using pd.to_numeric with error handling for robustness.

🔎 Suggested fix
-    report_data["Sample"] = report_data["Sample"].astype(int)
+    try:
+        report_data["Sample"] = pd.to_numeric(report_data["Sample"], errors="raise").astype(int)
+    except (ValueError, TypeError):
+        log.warning("Failed to convert Sample column to int in dia_sample_level_modifications")
+        return {}


mod_plot = dict()
for sample, group in report_data.groupby("Sample", sort=True):

mod_plot_dict, _ = summarize_modifications(
group.drop_duplicates()
)
mod_plot[f"Sample {str(sample)}"] = mod_plot_dict

return mod_plot
Comment on lines +837 to +856

⚠️ Potential issue | 🟡 Minor

Missing empty DataFrame guard in dia_sample_level_modifications.

The function assumes sdrf_file_df has data. If called with an empty DataFrame, the merge at line 841-844 will produce an empty result, and the subsequent groupby will iterate over nothing, returning an empty mod_plot. Consider adding an early return.

🔎 Suggested fix
 def dia_sample_level_modifications(df, sdrf_file_df):
 
+    if sdrf_file_df is None or sdrf_file_df.empty:
+        return {}
+
     report_data = df.copy()
 
     report_data = report_data.merge(
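The early-return guard can be demonstrated directly. The frames below are toy data, and the helper is an illustrative reduction of dia_sample_level_modifications to its merge/groupby core:

```python
import pandas as pd

report = pd.DataFrame({"Run": ["run1", "run1"],
                       "modifications": ["Oxidation", "Unmodified"]})
sdrf_empty = pd.DataFrame(columns=["Sample", "Run"])

def sample_level_groups(df, sdrf_file_df):
    """Illustrative: map runs onto samples, guarding against an empty SDRF."""
    if sdrf_file_df is None or sdrf_file_df.empty:
        return {}  # nothing to map runs onto; skip the merge entirely
    merged = df.merge(sdrf_file_df[["Sample", "Run"]].drop_duplicates(), on="Run")
    return {f"Sample {s}": len(g) for s, g in merged.groupby("Sample", sort=True)}

print(sample_level_groups(report, sdrf_empty))  # {}

sdrf = pd.DataFrame({"Sample": [1], "Run": ["run1"]})
print(sample_level_groups(report, sdrf))  # {'Sample 1': 2}
```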
