diff --git a/SINGER/SINGER/Rate_map.cpp b/SINGER/SINGER/Rate_map.cpp index 11e843b..c4e601f 100644 --- a/SINGER/SINGER/Rate_map.cpp +++ b/SINGER/SINGER/Rate_map.cpp @@ -7,7 +7,12 @@ #include "Rate_map.hpp" -Rate_map::Rate_map() {} +// Even if there are regions of truly no recombination, we have to pretend there is +// a really low rate to make the math work out currently. +static constexpr double MIN_RECOMB_RATE = 1e-16; + +Rate_map::Rate_map(double poffset) + : offset(poffset) {} void Rate_map::load_map(string mut_map_file) { ifstream fin(mut_map_file); @@ -21,8 +26,12 @@ void Rate_map::load_map(string mut_map_file) { double rate; double mut_dist; while (fin >> left >> right >> rate) { + if (rate < MIN_RECOMB_RATE) { + rate = MIN_RECOMB_RATE; + } coordinates.push_back(left); mut_dist = rate_distances.back() + rate*(right - left); + assert(mut_dist != rate_distances.back()); rate_distances.push_back(mut_dist); } sequence_length = right; @@ -38,16 +47,19 @@ int Rate_map::find_index(double x) { } double Rate_map::cumulative_distance(double x) { - int index = find_index(x); + const double actualCoordinate = x + offset; + int index = find_index(actualCoordinate); double prev_dist = rate_distances[index]; double next_dist = rate_distances[index+1]; - double p = (x - coordinates[index])/(coordinates[index+1] - coordinates[index]); + double p = (actualCoordinate - coordinates[index])/(coordinates[index+1] - coordinates[index]); double dist = (1-p)*prev_dist + p*next_dist; return dist; } double Rate_map::segment_distance(double x, double y) { - return cumulative_distance(y) - cumulative_distance(x); + double cd = cumulative_distance(y) - cumulative_distance(x); + assert(cd > 0.0); + return cd; } double Rate_map::mean_rate() { diff --git a/SINGER/SINGER/Rate_map.hpp b/SINGER/SINGER/Rate_map.hpp index c575d36..cdc6128 100644 --- a/SINGER/SINGER/Rate_map.hpp +++ b/SINGER/SINGER/Rate_map.hpp @@ -9,28 +9,32 @@ #define Rate_map_hpp #include +#include #include "Node.hpp" class Rate_map { public: - double sequence_length = INT_MAX; + // The rate map is for the entire chromosome, typically, but you may be running SINGER on only + // a subset of that chromosome. This offset is where SINGER's concept of 0 starts. + double offset = std::numeric_limits::max(); + double sequence_length = std::numeric_limits::max(); vector coordinates = {}; vector rate_distances = {}; - Rate_map(); + Rate_map(double poffset = 0.0); void load_map(string mut_map_file); - int find_index(double x); - double cumulative_distance(double x); double segment_distance(double x, double y); double mean_rate(); +private: + int find_index(double x); }; #endif /* Rate_map_hpp */ diff --git a/SINGER/SINGER/Sampler.cpp b/SINGER/SINGER/Sampler.cpp index a5000a3..dcfa618 100644 --- a/SINGER/SINGER/Sampler.cpp +++ b/SINGER/SINGER/Sampler.cpp @@ -21,6 +21,7 @@ Sampler::Sampler(double pop_size, Rate_map &rm, Rate_map &mm) { recomb_rate = rm.mean_rate()*Ne; recomb_map = rm; mut_map = mm; + has_map = true; } void Sampler::set_precision(double c, double q) { @@ -306,7 +307,7 @@ void Sampler::build_singleton_arg() { arg = ARG(Ne, sequence_length); arg.discretize(bin_size); arg.build_singleton_arg(n); - if (mut_rate > 0 and recomb_rate > 0) { + if (!has_map) { arg.compute_rhos_thetas(recomb_rate, mut_rate); } else { arg.compute_rhos_thetas(recomb_map, mut_map); @@ -787,7 +788,7 @@ void Sampler::load_resume_arg() { coord_file = output_prefix + "_coordinates.txt"; arg.read(node_file, branch_file, recomb_file, mut_file); arg.read_coordinates(coord_file); - if (mut_rate > 0 and recomb_rate > 0) { + if (!has_map) { arg.compute_rhos_thetas(recomb_rate, mut_rate); } else { arg.compute_rhos_thetas(recomb_map, mut_map); diff --git a/SINGER/SINGER/Sampler.hpp b/SINGER/SINGER/Sampler.hpp index a957be0..7804ea0 100644 --- a/SINGER/SINGER/Sampler.hpp +++ b/SINGER/SINGER/Sampler.hpp @@ -48,6 +48,7 @@ class Sampler { vector ordered_sample_nodes = {}; unordered_map> carriers = {}; unordered_map> mutation_sets = {}; + bool has_map = false; Sampler(); diff --git a/SINGER/SINGER/main.cpp b/SINGER/SINGER/main.cpp index 51ce4f1..eea1366 100644 --- a/SINGER/SINGER/main.cpp +++ b/SINGER/SINGER/main.cpp @@ -224,16 +224,11 @@ int main(int argc, const char * argv[]) { exit(1); } } - if (r < 0) { - cerr << "-r flag missing or invalid value. " << endl; - exit(1); - } - if (m < 0) { - cerr << "-m flag missing or invalid value. " << endl; - exit(1); - } - if (Ne < 0) { - cerr << "-Ne flag missing or invalid value. " << endl; + bool bothRates = (r > 0 && m > 0); + bool bothMaps = !(recomb_map_filename.empty() || mut_map_filename.empty()); + if (!(bothRates || bothMaps)) { + cerr << "Must provide either valid values for -r and -m, OR provide a valid " + << "mutation map and recombination map. " << endl; exit(1); } if (input_filename.size() == 0) { @@ -253,12 +248,12 @@ int main(int argc, const char * argv[]) { exit(1); } Sampler sampler; - if (r > 0 and m > 0) { + if (bothRates) { sampler = Sampler(Ne, r, m); } else { - Rate_map recomb_map = Rate_map(); + Rate_map recomb_map = Rate_map(start_pos); recomb_map.load_map(recomb_map_filename); - Rate_map mut_map = Rate_map(); + Rate_map mut_map = Rate_map(start_pos); mut_map.load_map(mut_map_filename); sampler = Sampler(Ne, recomb_map, mut_map); } diff --git a/SINGER/SINGER/parallel_singer b/SINGER/SINGER/parallel_singer index 4ba697a..8fa4909 100755 --- a/SINGER/SINGER/parallel_singer +++ b/SINGER/SINGER/parallel_singer @@ -14,7 +14,7 @@ def index_vcf(vcf_prefix, block_length): subprocess.run(["python", indexer, vcf_prefix, str(block_length)]) -def run_singer_in_parallel(vcf_prefix, output_prefix, mutation_rate, ratio, block_length, num_iters, thinning_interval, Ne, polar, num_cores): +def run_singer_in_parallel(vcf_prefix, output_prefix, mutation_rate, ratio, block_length, num_iters, thinning_interval, Ne, polar, num_cores, recomb_map, mut_map): """Run singer in parallel using the specified parameters.""" # Read the breakpoints from the index file @@ -30,8 +30,18 @@ def run_singer_in_parallel(vcf_prefix, output_prefix, mutation_rate, ratio, bloc start = breakpoints[i] # Base command - cmd = f"{singer_master_executable} -Ne {Ne} -m {mutation_rate} -ratio {ratio} -vcf {vcf_prefix} -output {output_prefix}_{i}_{i+1} -start {start} -end {start + block_length} -n {num_iters} -thin {thinning_interval} -polar {polar}" - + cmd = f"{singer_master_executable} -vcf {vcf_prefix} -output {output_prefix}_{i}_{i+1} -start {start} -end {start + block_length} -n {num_iters} -thin {thinning_interval} -polar {polar}" + if Ne is not None: + cmd += f" -Ne {Ne}" + if recomb_map: + cmd += f" -recomb_map \"{recomb_map}\"" + else: + cmd += f" -ratio {ratio}" + if mut_map is not None: + cmd += f" -mut_map {mut_map}" + else: + cmd += f" -m {mutation_rate}" + cmd_list.append(cmd) # Execute the commands in parallel @@ -47,9 +57,11 @@ def convert_long_ARG(vcf_prefix, output_prefix, num_iters, freq): def main(): parser = argparse.ArgumentParser(description="Parallelize singer runs by cutting the chromosome") - parser.add_argument('-Ne', type=float, required=True, help='Effective population size. Default: 1e4.') - parser.add_argument("-m", type=float, required=True, help="Mutation rate.") + parser.add_argument('-Ne', type=float, required=False, help='Effective population size. Default: 1e4.') + parser.add_argument("-m", type=float, required=False, help="Mutation rate.") + parser.add_argument("-mut_map", type=str, help="Filename for the mutation rate map.") parser.add_argument("-ratio", type=float, default=1, help="Recombination to mutation ratio. Default: 1.") + parser.add_argument("-recomb_map", default=None, required=False, help="Filename for the recombination rate map.") parser.add_argument("-L", type=int, default=int(1e6), help="Block length. Default: 1e6.") parser.add_argument("-vcf", type=str, required=True, help="VCF file prefix (without .vcf or .vcf.gz extension).") parser.add_argument("-output", type=str, required=True, help="Output file prefix.") @@ -57,8 +69,8 @@ def main(): parser.add_argument("-thin", type=int, required=True, help="Thinning interval length.") parser.add_argument("-polar", type=float, default=0.5, required=False, help="Site flip probability. Default: 0.5.") parser.add_argument("-freq", type=float, default=1, required=False, help="Convert to tskit every {freq} samples. Default: 1.") - parser.add_argument("-num_cores", type=int, default=20, required=False, help="Number of cores. Default: 20.") - + parser.add_argument("-num_cores", type=int, default=20, required=False, help="Number of cores. Default: 20.") + if len(sys.argv) == 1: parser.print_help(sys.stderr) sys.exit(1) @@ -66,9 +78,11 @@ def main(): args = parser.parse_args() print("Parameters:") - print(f"Effective population size: {args.Ne}") + print(f"Effective population size: {args.Ne if args.Ne is not None else 'estimated'}") print(f"Mutation rate: {args.m}") + print(f"Mutation map: {args.mut_map}") print(f"Recombination to mutation ratio: {args.ratio}") + print(f"Recombination map: {args.recomb_map}") print(f"Block length: {args.L}") print(f"VCF file prefix: {args.vcf}") print(f"Output file prefix: {args.output}") @@ -79,7 +93,7 @@ def main(): print(f"Number of cores: {args.num_cores}") index_vcf(args.vcf, args.L) - run_singer_in_parallel(args.vcf, args.output, args.m, args.ratio, args.L, args.n, args.thin, args.Ne, args.polar, args.num_cores) + run_singer_in_parallel(args.vcf, args.output, args.m, args.ratio, args.L, args.n, args.thin, args.Ne, args.polar, args.num_cores, args.recomb_map, args.mut_map) convert_long_ARG(args.vcf, args.output, args.n, args.freq) if __name__ == "__main__": diff --git a/SINGER/SINGER/singer_master b/SINGER/SINGER/singer_master index 21087cc..73cbcff 100755 --- a/SINGER/SINGER/singer_master +++ b/SINGER/SINGER/singer_master @@ -293,16 +293,15 @@ def main(): if args.m: + if args.recomb_map: + parser.error("-recomb_map is not compatible with -m; you must use -mut_map") if args.Ne < 0: args.Ne = compute_Ne(args.vcf, args.start, args.end, args.m) if (args.resume): resume_rate_singer(args.Ne, args.m, args.m*args.ratio, args.start, args.end, args.vcf, args.output, args.n, args.thin, args.polar, args.seed) else: run_rate_singer(args.Ne, args.m, args.m*args.ratio, args.start, args.end, args.vcf, args.output, args.n, args.thin, args.polar, args.seed) - return - - if args.mut_map and not args.m: - args.m = compute_average_rate(args.mut_map, args.start, args.end) + elif args.mut_map: if not args.recomb_map: parser.error("You must provide -recomb_map when using the -mut_map flag")