diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 309f9298..7aebf679 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -305,4 +305,27 @@ jobs: name: Upload results with: name: results_big_dia - path: ./results_big_dia \ No newline at end of file + path: ./results_big_dia + + test_single_cell_diann: + needs: setup + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Python 3.10 + uses: actions/setup-python@v4 + with: + python-version: "3.10" + - name: Install package + run: pip install . + - name: Test single cell dataset + run: | + wget -nv https://ftp.pride.ebi.ac.uk/pride/data/archive/2024/08/PXD053464/03_SingleCell_Searches.zip + unzip -d ./PXD053464 03_SingleCell_Searches.zip + multiqc --diann-plugin ./PXD053464 -o ./results_single_cell_diann + - uses: actions/upload-artifact@v4 + if: always() + name: Upload results + with: + name: results_single_cell_diann + path: ./results_single_cell_diann \ No newline at end of file diff --git a/docs/README.md b/docs/README.md index 6af7f847..50d896e3 100644 --- a/docs/README.md +++ b/docs/README.md @@ -242,6 +242,7 @@ You can find example reports on the [docs page](https://bigbio.github.io/pmultiq | TMT | Tandem mass tag | [TMT Example](https://pmultiqc.quantms.org/TMT_PXD007683/multiqc_report.html) ([disable_hoverinfo](https://pmultiqc.quantms.org/TMT_PXD007683_disable_hoverinfo/multiqc_report.html)) | [TMT_PXD007683.zip](https://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/proteomes/pmultiqc/example-projects/TMT_PXD007683.zip) | | quantms DIA | Data-independent acquisition | [quantms DIA Example](https://pmultiqc.quantms.org/dia/multiqc_report.html) ([disable_hoverinfo](https://pmultiqc.quantms.org/dia_disable_hoverinfo/multiqc_report.html)) | [dia.zip](https://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/proteomes/pmultiqc/dia/dia.zip) | | DIA-NN | Data-independent acquisition | [DIA-NN Example](https://pmultiqc.quantms.org/DIANN/multiqc_report.html) ([disable_hoverinfo](https://pmultiqc.quantms.org/DIANN_disable_hoverinfo/multiqc_report.html)) | [PXD063291.zip](https://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/proteomes/pmultiqc/example-projects/PXD063291.zip) | +| Single cell (DIA-NN) | Single cell dataset | [Single cell Example](https://pmultiqc.quantms.org/PXD053464/multiqc_report.html) ([disable_hoverinfo](https://pmultiqc.quantms.org/PXD053464_disable_hoverinfo/multiqc_report.html)) | [PXD053464 folder](https://ftp.pride.ebi.ac.uk/pride/data/archive/2024/08/PXD053464/) | | MaxQuant | MaxQuant results | [MaxQuant Example](https://pmultiqc.quantms.org/PXD003133/multiqc_report.html) ([disable_hoverinfo](https://pmultiqc.quantms.org/PXD003133_disable_hoverinfo/multiqc_report.html)) | [txt_20min.zip](https://ftp.pride.ebi.ac.uk/pride/data/archive/2015/11/PXD003133/txt_20min.zip) | | MaxQuant DIA | MaxQuant DIA results | [MaxQuant DIA Example](https://pmultiqc.quantms.org/MaxDIA/multiqc_report.html) ([disable_hoverinfo](https://pmultiqc.quantms.org/MaxDIA_disable_hoverinfo/multiqc_report.html)) | [MaxDIA_txt.zip](https://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/proteomes/pmultiqc/maxquant/MaxDIA_txt.zip) | | ProteoBench | ProteoBench results | [ProteoBench Example](https://pmultiqc.quantms.org/ProteoBench/multiqc_report.html) ([disable_hoverinfo](https://pmultiqc.quantms.org/ProteoBench_disable_hoverinfo/multiqc_report.html)) | [ProteoBench data](https://proteobench.cubimed.rub.de/datasets/d01e87b997b84c985868204b1ed26749902fd7f9/d01e87b997b84c985868204b1ed26749902fd7f9_data.zip) | diff --git a/docs/config.json b/docs/config.json index 8ba8750b..989f493b 100644 --- a/docs/config.json +++ b/docs/config.json @@ -195,6 +195,22 @@ ], "path": "docs/PXD062383_disable_hoverinfo", "file_type": ["dia", "disable_hoverinfo"] + }, + { + "accession": "PXD053464", + "urls": [ + "https://ftp.pride.ebi.ac.uk/pride/data/archive/2024/08/PXD053464/03_SingleCell_Searches.zip" + ], + "path": "docs/PXD053464", + "file_type": ["diann", ""] + }, + { + "accession": "PXD053464_disable_hoverinfo", + "urls": [ + "https://ftp.pride.ebi.ac.uk/pride/data/archive/2024/08/PXD053464/03_SingleCell_Searches.zip" + ], + "path": "docs/PXD053464_disable_hoverinfo", + "file_type": ["diann", "disable_hoverinfo"] } ] } \ No newline at end of file diff --git a/pmultiqc/modules/common/dia_utils.py b/pmultiqc/modules/common/dia_utils.py index 600449fd..ce271a19 100644 --- a/pmultiqc/modules/common/dia_utils.py +++ b/pmultiqc/modules/common/dia_utils.py @@ -155,17 +155,13 @@ def _draw_heatmap(sub_section, report_data, heatmap_color_list): def _process_diann_statistics(report_data): """Process DIA-NN statistics and create peptide plot.""" - # Extract sequence information - report_data["sequence"] = report_data[ - "Modified.Sequence" - ].astype("string").str.replace(r"\(.*?\)", "", regex=True) total_protein_quantified = len(set(report_data["Protein.Group"])) - total_peptide_count = len(set(report_data["sequence"])) + total_peptide_count = len(set(report_data["Modified.Sequence"])) # Create peptide plot log.info("Processing DIA pep_plot.") - protein_pep_map = report_data.groupby("Protein.Group")["sequence"].agg(list).to_dict() + protein_pep_map = report_data.groupby("Protein.Group")["Modified.Sequence"].agg(list).to_dict() pep_plot = Histogram("number of peptides per proteins", plot_category="frequency") for _, peps in protein_pep_map.items(): number = len(set(peps)) @@ -230,9 +226,13 @@ def _process_run_data(df, ms_with_psm, quantms_modified, sdrf_file_df): log.info("Processing DIA mod_plot_dict.") - report_data = df[ - ["Run", "Modified.Sequence", "Modifications", "Protein.Group", "sequence"] - ].copy() + required_cols = ["Run", "Modified.Sequence", "Modifications", "Protein.Group"] + report_data = df[required_cols].copy() + if "Proteotypic" in df.columns: + report_data["Proteotypic"] = df["Proteotypic"] + else: + log.warning("Missing Proteotypic column; treating all peptides as proteotypic.") + report_data["Proteotypic"] = 1 mod_plot_by_run = dict() modified_cats = list() @@ -277,25 +277,23 @@ def _calculate_run_statistics(group): """Calculate statistics for a specific run.""" peptides = set(group["Modified.Sequence"]) + unique_peptides = set( + group.loc[group["Proteotypic"] == 1, "Modified.Sequence"] + ) modified_pep = list( filter(lambda x: re.match(r".*?\(.*?\).*?", x) is not None, peptides) ) - group_peptides = group.groupby("sequence")["Protein.Group"].apply(list).to_dict() - unique_peptides = [ - pep for pep, prots in group_peptides.items() if len(set(prots)) == 1 - ] - stat_run = { "protein_num": len(set(group["Protein.Group"])), - "peptide_num": len(set(group["sequence"])), + "peptide_num": len(peptides), "unique_peptide_num": len(unique_peptides), "modified_peptide_num": len(modified_pep) } data_per_run = { "proteins": set(group["Protein.Group"]), - "peptides": set(group["sequence"]), + "peptides": peptides, "unique_peptides": unique_peptides, "modified_peps": modified_pep }