|
| 1 | +#!/usr/bin/env python |
| 2 | + |
| 3 | +""" |
| 4 | +Mutation density computation script. |
| 5 | +Mutation density is a metric that quantifies the number of mutations per megabase (Mb) of sequenced DNA. |
| 6 | +This script computes mutation density for a given sample, all possible genes and a list of consequence type groups. |
| 7 | +It calculates mutation density per Mb, both adjusted and non-adjusted by the number of sites available for each consequence type. |
| 8 | +The results are saved to a TSV file. |
| 9 | +""" |
| 10 | + |
| 11 | + |
| 12 | +import click |
| 13 | +import pandas as pd |
| 14 | +from read_utils import custom_na_values |
| 15 | + |
| 16 | +# TODO: bump pandas to 2.2.3 |
| 17 | + |
| 18 | +# -- Auxiliary functions -- # |
| 19 | + |
| 20 | +MUTDENSITY_IMPACT_GROUPS = [False, ["SNV"] , ["INSERTION", "DELETION"], ["SNV", "INSERTION", "DELETION"]] |
| 21 | + |
| 22 | +def mutdensity_sample(maf_df, depths_df, depths_adj_df, sample_name): |
| 23 | + """ |
| 24 | + Computes a sample's global mutation density. Returns the mutation density |
| 25 | + per Mb, non-adjusted and adjusted by panel |
| 26 | + composition. |
| 27 | + """ |
| 28 | + |
| 29 | + impact_group_results = list() |
| 30 | + |
| 31 | + # mutation density depth information |
| 32 | + sample_features_depth = {"DEPTH" : depths_df.drop_duplicates(subset = ["CHROM", "POS"])[f"{sample_name}"].sum(), |
| 33 | + "DEPTH_ADJUSTED": depths_adj_df[f"{sample_name}"].sum() |
| 34 | + } |
| 35 | + |
| 36 | + for type_list in MUTDENSITY_IMPACT_GROUPS: |
| 37 | + if not type_list: |
| 38 | + unique_maf = maf_df[["SAMPLE_ID", "MUT_ID", "ALT_DEPTH"]].drop_duplicates() |
| 39 | + types_included = 'all_types' |
| 40 | + else: |
| 41 | + unique_maf = maf_df[maf_df['TYPE'].isin(type_list)][["SAMPLE_ID", "MUT_ID", "ALT_DEPTH"]].copy().drop_duplicates() |
| 42 | + types_included = '-'.join(sorted(type_list)) |
| 43 | + |
| 44 | + # count number of mutations and mutated reads in the sample |
| 45 | + ## make sure to count each mutation only once (avoid annotation issues) |
| 46 | + n_muts = unique_maf.shape[0] |
| 47 | + n_muts_per_sample = unique_maf.groupby(by = ["SAMPLE_ID", "MUT_ID"] ).agg({"ALT_DEPTH" : "sum" }).reset_index() |
| 48 | + n_mutated_reads = n_muts_per_sample["ALT_DEPTH"].sum() |
| 49 | + print(n_muts, n_mutated_reads) |
| 50 | + |
| 51 | + # mutation density metrics |
| 52 | + sample_features = dict() |
| 53 | + sample_features.update(sample_features_depth) |
| 54 | + sample_features["N_MUTS"] = n_muts |
| 55 | + sample_features["N_MUTATED"] = n_mutated_reads |
| 56 | + |
| 57 | + sample_features["MUTDENSITY_MB"] = ( sample_features["N_MUTS"] / sample_features["DEPTH"] * 1000000 ).astype(float) |
| 58 | + sample_features["MUTDENSITY_MB_ADJUSTED"] = ( sample_features["N_MUTS"] / sample_features["DEPTH_ADJUSTED"] * 1000000 ).astype(float) |
| 59 | + sample_features["MUTREADSRATE_MB"] = ( sample_features["N_MUTATED"] / sample_features["DEPTH"] * 1000000 ).astype(float) |
| 60 | + sample_features["MUTREADSRATE_MB_ADJUSTED"] = ( sample_features["N_MUTATED"] / sample_features["DEPTH_ADJUSTED"] * 1000000 ).astype(float) |
| 61 | + |
| 62 | + sample_features["GENE"] = "ALL_GENES" |
| 63 | + sample_features["MUTTYPES"] = types_included |
| 64 | + |
| 65 | + impact_group_results.append(pd.DataFrame([sample_features])) |
| 66 | + |
| 67 | + # concatenate results for all impact groups |
| 68 | + mutdensity_sample = pd.concat(impact_group_results) |
| 69 | + |
| 70 | + return mutdensity_sample |
| 71 | + |
| 72 | + |
| 73 | +def mutdensity_gene(maf_df, depths_df, depths_adj_df, sample_name): |
| 74 | + """ |
| 75 | + Computes each gene mutation density. Returns the mutation density |
| 76 | + both per Mb and Kb sequenced, both non-adjusted and adjusted by panel |
| 77 | + composition. |
| 78 | + """ |
| 79 | + |
| 80 | + impact_group_results = list() |
| 81 | + |
| 82 | + for type_list in MUTDENSITY_IMPACT_GROUPS: |
| 83 | + # filter by mutation type according to type_list |
| 84 | + if not type_list: |
| 85 | + unique_maf = maf_df[["SAMPLE_ID", "GENE", "MUT_ID", "ALT_DEPTH"]].drop_duplicates() |
| 86 | + types_included = 'all_types' |
| 87 | + else: |
| 88 | + unique_maf = maf_df[maf_df['TYPE'].isin(type_list)][["SAMPLE_ID", "GENE", "MUT_ID", "ALT_DEPTH"]].copy().drop_duplicates() |
| 89 | + types_included = '-'.join(sorted(type_list)) |
| 90 | + |
| 91 | + # count number of mutations and mutated reads per gene |
| 92 | + # make sure to count each mutation only once (avoid annotation issues) |
| 93 | + n_muts_gene = unique_maf.groupby(by = ["GENE"] ).agg({"ALT_DEPTH" : "count" }) |
| 94 | + n_muts_gene.columns = ["N_MUTS"] |
| 95 | + |
| 96 | + # make sure to count each mutation only once (avoid annotation issues) |
| 97 | + n_mutated_reads = unique_maf.groupby(by = ["GENE"] ).agg({"ALT_DEPTH" : "sum" }) |
| 98 | + n_mutated_reads.columns = ["N_MUTATED"] |
| 99 | + |
| 100 | + depths_gene_df = depths_df.groupby("GENE").agg({f"{sample_name}" : "sum" }) |
| 101 | + depths_gene_df.columns = ["DEPTH"] |
| 102 | + depths_adj_gene_df = depths_adj_df.groupby("GENE").agg({f"{sample_name}" : "sum" }) |
| 103 | + depths_adj_gene_df.columns = ["DEPTH_ADJUSTED"] |
| 104 | + |
| 105 | + mut_rate_mut_reads_df = n_muts_gene.merge(n_mutated_reads, on = "GENE") |
| 106 | + depths_depthsadj_gene_df = depths_gene_df.merge(depths_adj_gene_df, on = "GENE") |
| 107 | + ## merge so that mutation density is computed although the number of mutations is NA (meaning, zero) |
| 108 | + mut_depths_df = depths_depthsadj_gene_df.merge(mut_rate_mut_reads_df, on = "GENE", how = 'left') |
| 109 | + mut_depths_df = mut_depths_df.fillna(0) # I think this is not needed |
| 110 | + |
| 111 | + # mutation density metrics |
| 112 | + mut_depths_df["MUTDENSITY_MB"] = (mut_depths_df["N_MUTS"] / mut_depths_df["DEPTH"] * 1000000).astype(float) |
| 113 | + mut_depths_df["MUTDENSITY_MB_ADJUSTED"] = (mut_depths_df["N_MUTS"] / mut_depths_df["DEPTH_ADJUSTED"] * 1000000).astype(float) |
| 114 | + |
| 115 | + mut_depths_df["MUTREADSRATE_MB"] = (mut_depths_df["N_MUTATED"] / mut_depths_df["DEPTH"] * 1000000).astype(float) |
| 116 | + mut_depths_df["MUTREADSRATE_MB_ADJUSTED"] = (mut_depths_df["N_MUTATED"] / mut_depths_df["DEPTH_ADJUSTED"] * 1000000).astype(float) |
| 117 | + |
| 118 | + mut_depths_df["MUTTYPES"] = types_included |
| 119 | + impact_group_results.append(mut_depths_df.reset_index()) |
| 120 | + |
| 121 | + # concatenate results for all impact groups |
| 122 | + mutdensity_per_gene = pd.concat(impact_group_results) |
| 123 | + |
| 124 | + return mutdensity_per_gene |
| 125 | + |
| 126 | + |
| 127 | +def load_n_process_inputs(maf_path, depths_path, annot_panel_path, sample_name): |
| 128 | + # File loading |
| 129 | + maf_df = pd.read_csv(maf_path, sep = "\t", na_values = custom_na_values) |
| 130 | + depths_df = pd.read_csv(depths_path, sep = "\t") |
| 131 | + depths_df = depths_df.drop("CONTEXT", axis = 1) |
| 132 | + annot_panel_df = pd.read_csv(annot_panel_path, sep = "\t", na_values = custom_na_values) |
| 133 | + |
| 134 | + # Subset depths with panel |
| 135 | + ## mode 1: each position counts one (once per gene, be careful that it might be duplicated in different genes) |
| 136 | + depths_subset_df = depths_df.merge(annot_panel_df[["CHROM", "POS", "GENE"]].drop_duplicates(), |
| 137 | + on = ["CHROM", "POS"], how = "inner") |
| 138 | + ## mode 2 (adjusted): each position counts as many times it contributes to the panel |
| 139 | + depths_df[sample_name] = depths_df[sample_name] / 3 # the depth per position can contribute to three different mutations |
| 140 | + depths_subset_adj_df = depths_df.merge(annot_panel_df[["CHROM", "POS", "GENE"]], on = ["CHROM", "POS"], how = "inner") |
| 141 | + |
| 142 | + ## mode 3 (adjusted): each position counts as many times it contributes to the panel, but ONLY ONCE PER SAMPLE |
| 143 | + depths_subset_adj_sample_df = depths_df.merge(annot_panel_df.drop_duplicates(subset = ["CHROM", "POS", "REF", "ALT"])[["CHROM", "POS"]], |
| 144 | + on = ["CHROM", "POS"], how = "inner") |
| 145 | + |
| 146 | + return maf_df, depths_subset_df, depths_subset_adj_df, depths_subset_adj_sample_df |
| 147 | + |
| 148 | + |
| 149 | + |
| 150 | +# -- Main function -- # |
| 151 | +def compute_mutdensity(maf_path, depths_path, annot_panel_path, sample_name, panel_v): |
| 152 | + """ |
| 153 | + Computes mutation density for a given sample based on MAF, depths, and annotation panel files. |
| 154 | + The function calculates mutation density per Mb and Kb, both adjusted and non-adjusted by |
| 155 | + the panel composition. It saves the results to a TSV file. |
| 156 | + """ |
| 157 | + |
| 158 | + maf_df, depths_subset_df, depths_subset_adj_df, depths_subset_adj_sample_df = load_n_process_inputs(maf_path, depths_path, annot_panel_path, sample_name) |
| 159 | + |
| 160 | + # Compute mutation densities |
| 161 | + ## sample mutation density |
| 162 | + mutdensity_sample_df = mutdensity_sample(maf_df, depths_subset_df, depths_subset_adj_sample_df, sample_name) |
| 163 | + |
| 164 | + ## per gene mutation density |
| 165 | + mutdensity_genes_df = mutdensity_gene(maf_df, depths_subset_df, depths_subset_adj_df, sample_name) |
| 166 | + |
| 167 | + mutdensity_df = pd.concat([mutdensity_sample_df, mutdensity_genes_df]) |
| 168 | + |
| 169 | + mutdensity_df["SAMPLE_ID"] = sample_name |
| 170 | + mutdensity_df["REGIONS"] = panel_v |
| 171 | + |
| 172 | + # Save |
| 173 | + mutdensity_df[["SAMPLE_ID", "GENE", "REGIONS", "MUTTYPES", |
| 174 | + "DEPTH", |
| 175 | + "N_MUTS", "N_MUTATED", |
| 176 | + "MUTDENSITY_MB", "MUTDENSITY_MB_ADJUSTED", |
| 177 | + "MUTREADSRATE_MB", "MUTREADSRATE_MB_ADJUSTED", |
| 178 | + ]].to_csv(f"{sample_name}.{panel_v}.mutdensities.tsv", |
| 179 | + sep = "\t", |
| 180 | + header = True, |
| 181 | + index = False |
| 182 | + ) |
| 183 | + |
| 184 | + |
| 185 | +@click.command() |
| 186 | +@click.option('--maf_path', type=click.Path(exists=True), required=True, help='Path to the MAF file.') |
| 187 | +@click.option('--depths_path', type=click.Path(exists=True), required=True, help='Path to the depths file.') |
| 188 | +@click.option('--annot_panel_path', type=click.Path(exists=True), required=True, help='Path to the annotation panel file.') |
| 189 | +@click.option('--sample_name', type=str, required=True, help='Sample name.') |
| 190 | +@click.option('--panel_version', type=str, required=True, help='Panel version.') |
| 191 | +def main(maf_path, depths_path, annot_panel_path, sample_name, panel_version): |
| 192 | + """ |
| 193 | + CLI entry point for computing mutation densities. |
| 194 | + """ |
| 195 | + compute_mutdensity(maf_path, depths_path, annot_panel_path, sample_name, panel_version) |
| 196 | + |
| 197 | + |
| 198 | +if __name__ == '__main__': |
| 199 | + |
| 200 | + main() |
0 commit comments