|
| 1 | +import sys |
| 2 | +import argparse |
| 3 | +import datetime |
| 4 | +import random |
| 5 | +from bit.cli.common import CustomRichHelpFormatter, add_help |
| 6 | +from bit.modules.general import check_files_are_found |
| 7 | +from Bio import SeqIO |
| 8 | +from bit.modules.seqs import mutate_seq |
| 9 | + |
| 10 | + |
| 11 | +def build_parser(): |
| 12 | + |
| 13 | + desc = """ |
| 14 | + This script will mutate all sequences of a nucleotide or amino-acid multifasta with |
| 15 | + the specified mutation rate. It does not take into consideration transition/transversion |
| 16 | + rates. By default it only swaps bases, but it can optionally introduce indels also. |
| 17 | + For version info, run `bit-version`. |
| 18 | + """ |
| 19 | + |
| 20 | + parser = argparse.ArgumentParser( |
| 21 | + description=desc, |
| 22 | + epilog="Ex. usage: `bit-mutate-seqs -i input.fasta`", |
| 23 | + formatter_class=CustomRichHelpFormatter, |
| 24 | + add_help=False |
| 25 | + ) |
| 26 | + |
| 27 | + required = parser.add_argument_group("REQUIRED PARAMETERS") |
| 28 | + optional = parser.add_argument_group("OPTIONAL PARAMETERS") |
| 29 | + |
| 30 | + required.add_argument( |
| 31 | + "-i", |
| 32 | + "--input-fasta", |
| 33 | + metavar = "<FILE>", |
| 34 | + help = "Starting fasta file", |
| 35 | + action = "store", |
| 36 | + required = True |
| 37 | + ) |
| 38 | + |
| 39 | + optional.add_argument( |
| 40 | + "-m", |
| 41 | + "--mutation-rate", |
| 42 | + metavar = "<FLOAT>", |
| 43 | + help = 'Wanted mutation rate, must be a float between 0 and 1 (default: "0.01")', |
| 44 | + action = "store", |
| 45 | + type = float, |
| 46 | + default = 0.01 |
| 47 | + ) |
| 48 | + |
| 49 | + optional.add_argument( |
| 50 | + "-t", |
| 51 | + "--molecule-type", |
| 52 | + help = "'NT' for nucleotides or 'AA' for amino acids (default = 'NT')", |
| 53 | + action = "store", |
| 54 | + choices = ["NT", "AA"], |
| 55 | + default = 'NT' |
| 56 | + ) |
| 57 | + |
| 58 | + optional.add_argument( |
| 59 | + "-o", |
| 60 | + "--output-fasta", |
| 61 | + metavar = "<FILE>", |
| 62 | + help = 'Output mutated fasta file (default: "mutated.fasta").', |
| 63 | + action = "store", |
| 64 | + default = "mutated.fasta" |
| 65 | + ) |
| 66 | + |
| 67 | + optional.add_argument( |
| 68 | + "-l", |
| 69 | + "--output-log", |
| 70 | + metavar = "<FILE>", |
| 71 | + help = 'Output file reporting the seed used and how many characters were changed in each input sequence (default: "mutated.log")', |
| 72 | + action = "store", |
| 73 | + default = "mutated.log" |
| 74 | + ) |
| 75 | + |
| 76 | + optional.add_argument( |
| 77 | + "-I", |
| 78 | + "--indel-rate", |
| 79 | + metavar = "<FLOAT>", |
| 80 | + help = """ |
| 81 | + Specify the frequency of indels wanted, must be a float between 0 and 1. This |
| 82 | + is a subset of the overall mutation rate provided via the '-m' argument. |
| 83 | + E.g., providing '-f 0.2' means 1 out of every 5 generated mutations will be an indel. |
| 84 | + By default, no indels are incorporated (default = 0.0). The likelihood |
| 85 | + of adding an insertion/deletion is 50/50, with all characters equally likely |
| 86 | + to be inserted. |
| 87 | + """, |
| 88 | + action = "store", |
| 89 | + type = float, |
| 90 | + default = 0.0 |
| 91 | + ) |
| 92 | + |
| 93 | + optional.add_argument( |
| 94 | + "-s", |
| 95 | + "--seed", |
| 96 | + metavar = "<INT>", |
| 97 | + help = "Optionally set the randomization seed", |
| 98 | + action = "store", |
| 99 | + type = int |
| 100 | + ) |
| 101 | + |
| 102 | + add_help(optional) |
| 103 | + |
| 104 | + return parser |
| 105 | + |
| 106 | + |
| 107 | +def main(): |
| 108 | + |
| 109 | + parser = build_parser() |
| 110 | + |
| 111 | + if len(sys.argv) == 1: # pragma: no cover |
| 112 | + parser.print_help(sys.stderr) |
| 113 | + sys.exit(0) |
| 114 | + |
| 115 | + args = parser.parse_args() |
| 116 | + |
| 117 | + available_substitutions, seed = preflight_checks_and_setup(args) |
| 118 | + |
| 119 | + # doing mutations per sequence |
| 120 | + with (open (args.input_fasta, "r") as in_fasta, |
| 121 | + open(args.output_fasta, "w") as out_fasta, |
| 122 | + open(args.output_log, "w") as log): |
| 123 | + |
| 124 | + # starting log file |
| 125 | + log.write(f"# seed used: {seed}\n") |
| 126 | + log.write(f"# indel frequency utilized: {args.indel_rate}\n") |
| 127 | + log.write("# the rest below is a tab-delimited table summarizing the changes introduced to each input sequence\n") |
| 128 | + log.write("seq_id\tseq_length\tnum_total_changes_made\tnum_substitutions\tnum_total_indels\tnum_insertions\tnum_deletions\n") |
| 129 | + |
| 130 | + for seq_record in SeqIO.parse(in_fasta, "fasta"): |
| 131 | + |
| 132 | + (seq, total_num_mutations, num_substitutions, num_indels, |
| 133 | + num_insertions, num_deletions) = mutate_seq(seq_record.seq, available_substitutions, |
| 134 | + args.mutation_rate, args.indel_rate) |
| 135 | + |
| 136 | + out_fasta.write(f">{seq_record.id}\n") |
| 137 | + out_fasta.write(f"{seq}\n") |
| 138 | + |
| 139 | + # writing out log file |
| 140 | + log.write(f"{seq_record.id}\t{len(seq_record)}\t{total_num_mutations}\t{num_substitutions}\t{num_indels}\t{num_insertions}\t{num_deletions}\n") |
| 141 | + |
| 142 | + |
| 143 | +def preflight_checks_and_setup(args): |
| 144 | + |
| 145 | + check_files_are_found([args.input_fasta]) |
| 146 | + |
| 147 | + # checking mutation rate is within 0 and 1 |
| 148 | + validate_mutation_rate(args.mutation_rate) |
| 149 | + |
| 150 | + # checking indel frequency rate is within 0 and 1 |
| 151 | + validate_indel_rate(args.indel_rate) |
| 152 | + |
| 153 | + # setting substitution type |
| 154 | + available_substitutions = get_available_substitutions(args.molecule_type) |
| 155 | + |
| 156 | + # setting seed if provided, and recording it for the output log either way |
| 157 | + seed = set_seed(args.seed) |
| 158 | + |
| 159 | + return available_substitutions, seed |
| 160 | + |
| 161 | + |
| 162 | +def validate_mutation_rate(mutation_rate): |
| 163 | + |
| 164 | + if not 0 <= mutation_rate <= 1: |
| 165 | + print("\n The '--mutation-rate' argument needs to be between 0 and and 1.\n") |
| 166 | + print(" Exiting for now.\n") |
| 167 | + sys.exit(1) |
| 168 | + |
| 169 | + |
| 170 | +def validate_indel_rate(indel_rate): |
| 171 | + |
| 172 | + if not 0 <= indel_rate <= 1: |
| 173 | + print("\n The '--indel-rate' argument needs to be between 0 and and 1.\n") |
| 174 | + print(" Exiting for now.\n") |
| 175 | + sys.exit(1) |
| 176 | + |
| 177 | + |
| 178 | +def get_available_substitutions(molecule_type): |
| 179 | + |
| 180 | + if molecule_type == 'NT': |
| 181 | + return ['A', 'T', 'C', 'G'] |
| 182 | + |
| 183 | + else: |
| 184 | + return ['A', 'R', 'N', 'D', |
| 185 | + 'C', 'E', 'Q', 'G', |
| 186 | + 'H', 'I', 'L', 'K', |
| 187 | + 'M', 'F', 'P', 'S', |
| 188 | + 'T', 'W', 'Y', 'V'] |
| 189 | + |
| 190 | + |
| 191 | +def set_seed(input_seed): |
| 192 | + |
| 193 | + if input_seed: |
| 194 | + |
| 195 | + seed = input_seed |
| 196 | + |
| 197 | + else: |
| 198 | + # setting seed manually (pseudo-randomly) BECAUSE it seems if you don't set it, you can't |
| 199 | + # pull it out otherwise (and i want to log it whether it was specified or not by the user) |
| 200 | + time = datetime.datetime.now() |
| 201 | + seed = time.hour * 10000 + time.minute * 100 + time.second |
| 202 | + |
| 203 | + random.seed(seed) |
| 204 | + |
| 205 | + return seed |
0 commit comments