From 31a0f2a75ff39d4cc3058220b295ee63f9dacc4d Mon Sep 17 00:00:00 2001 From: Drew DeHaas Date: Mon, 23 Sep 2024 08:31:38 -0400 Subject: [PATCH 1/3] Add -recomb_map option for parallel_singer singer_master already supports it. --- SINGER/SINGER/Sampler.cpp | 5 +++-- SINGER/SINGER/Sampler.hpp | 1 + SINGER/SINGER/main.cpp | 17 ++++++----------- SINGER/SINGER/parallel_singer | 32 +++++++++++++++++++++++--------- SINGER/SINGER/singer_master | 7 +++---- 5 files changed, 36 insertions(+), 26 deletions(-) diff --git a/SINGER/SINGER/Sampler.cpp b/SINGER/SINGER/Sampler.cpp index a5000a3..dcfa618 100644 --- a/SINGER/SINGER/Sampler.cpp +++ b/SINGER/SINGER/Sampler.cpp @@ -21,6 +21,7 @@ Sampler::Sampler(double pop_size, Rate_map &rm, Rate_map &mm) { recomb_rate = rm.mean_rate()*Ne; recomb_map = rm; mut_map = mm; + has_map = true; } void Sampler::set_precision(double c, double q) { @@ -306,7 +307,7 @@ void Sampler::build_singleton_arg() { arg = ARG(Ne, sequence_length); arg.discretize(bin_size); arg.build_singleton_arg(n); - if (mut_rate > 0 and recomb_rate > 0) { + if (!has_map) { arg.compute_rhos_thetas(recomb_rate, mut_rate); } else { arg.compute_rhos_thetas(recomb_map, mut_map); @@ -787,7 +788,7 @@ void Sampler::load_resume_arg() { coord_file = output_prefix + "_coordinates.txt"; arg.read(node_file, branch_file, recomb_file, mut_file); arg.read_coordinates(coord_file); - if (mut_rate > 0 and recomb_rate > 0) { + if (!has_map) { arg.compute_rhos_thetas(recomb_rate, mut_rate); } else { arg.compute_rhos_thetas(recomb_map, mut_map); diff --git a/SINGER/SINGER/Sampler.hpp b/SINGER/SINGER/Sampler.hpp index a957be0..7804ea0 100644 --- a/SINGER/SINGER/Sampler.hpp +++ b/SINGER/SINGER/Sampler.hpp @@ -48,6 +48,7 @@ class Sampler { vector ordered_sample_nodes = {}; unordered_map> carriers = {}; unordered_map> mutation_sets = {}; + bool has_map = false; Sampler(); diff --git a/SINGER/SINGER/main.cpp b/SINGER/SINGER/main.cpp index 51ce4f1..b541aae 100644 --- a/SINGER/SINGER/main.cpp +++ b/SINGER/SINGER/main.cpp @@ -224,16 +224,11 @@ int main(int argc, const char * argv[]) { exit(1); } } - if (r < 0) { - cerr << "-r flag missing or invalid value. " << endl; - exit(1); - } - if (m < 0) { - cerr << "-m flag missing or invalid value. " << endl; - exit(1); - } - if (Ne < 0) { - cerr << "-Ne flag missing or invalid value. " << endl; + bool bothRates = (r > 0 && m > 0); + bool bothMaps = !(recomb_map_filename.empty() || mut_map_filename.empty()); + if (!(bothRates || bothMaps)) { + cerr << "Must provide either valid values for -r and -m, OR provide a valid " + << "mutation map and recombination map. " << endl; exit(1); } if (input_filename.size() == 0) { @@ -253,7 +248,7 @@ int main(int argc, const char * argv[]) { exit(1); } Sampler sampler; - if (r > 0 and m > 0) { + if (bothRates) { sampler = Sampler(Ne, r, m); } else { Rate_map recomb_map = Rate_map(); diff --git a/SINGER/SINGER/parallel_singer b/SINGER/SINGER/parallel_singer index 4ba697a..8fa4909 100755 --- a/SINGER/SINGER/parallel_singer +++ b/SINGER/SINGER/parallel_singer @@ -14,7 +14,7 @@ def index_vcf(vcf_prefix, block_length): subprocess.run(["python", indexer, vcf_prefix, str(block_length)]) -def run_singer_in_parallel(vcf_prefix, output_prefix, mutation_rate, ratio, block_length, num_iters, thinning_interval, Ne, polar, num_cores): +def run_singer_in_parallel(vcf_prefix, output_prefix, mutation_rate, ratio, block_length, num_iters, thinning_interval, Ne, polar, num_cores, recomb_map, mut_map): """Run singer in parallel using the specified parameters.""" # Read the breakpoints from the index file @@ -30,8 +30,18 @@ def run_singer_in_parallel(vcf_prefix, output_prefix, mutation_rate, ratio, bloc start = breakpoints[i] # Base command - cmd = f"{singer_master_executable} -Ne {Ne} -m {mutation_rate} -ratio {ratio} -vcf {vcf_prefix} -output {output_prefix}_{i}_{i+1} -start {start} -end {start + block_length} -n {num_iters} -thin {thinning_interval} -polar {polar}" - + cmd = f"{singer_master_executable} -vcf {vcf_prefix} -output {output_prefix}_{i}_{i+1} -start {start} -end {start + block_length} -n {num_iters} -thin {thinning_interval} -polar {polar}" + if Ne is not None: + cmd += f" -Ne {Ne}" + if recomb_map: + cmd += f" -recomb_map \"{recomb_map}\"" + else: + cmd += f" -ratio {ratio}" + if mut_map is not None: + cmd += f" -mut_map {mut_map}" + else: + cmd += f" -m {mutation_rate}" + cmd_list.append(cmd) # Execute the commands in parallel @@ -47,9 +57,11 @@ def convert_long_ARG(vcf_prefix, output_prefix, num_iters, freq): def main(): parser = argparse.ArgumentParser(description="Parallelize singer runs by cutting the chromosome") - parser.add_argument('-Ne', type=float, required=True, help='Effective population size. Default: 1e4.') - parser.add_argument("-m", type=float, required=True, help="Mutation rate.") + parser.add_argument('-Ne', type=float, required=False, help='Effective population size. Default: 1e4.') + parser.add_argument("-m", type=float, required=False, help="Mutation rate.") + parser.add_argument("-mut_map", type=str, help="Filename for the mutation rate map.") parser.add_argument("-ratio", type=float, default=1, help="Recombination to mutation ratio. Default: 1.") + parser.add_argument("-recomb_map", default=None, required=False, help="Filename for the recombination rate map.") parser.add_argument("-L", type=int, default=int(1e6), help="Block length. Default: 1e6.") parser.add_argument("-vcf", type=str, required=True, help="VCF file prefix (without .vcf or .vcf.gz extension).") parser.add_argument("-output", type=str, required=True, help="Output file prefix.") @@ -57,8 +69,8 @@ def main(): parser.add_argument("-thin", type=int, required=True, help="Thinning interval length.") parser.add_argument("-polar", type=float, default=0.5, required=False, help="Site flip probability. Default: 0.5.") parser.add_argument("-freq", type=float, default=1, required=False, help="Convert to tskit every {freq} samples. Default: 1.") - parser.add_argument("-num_cores", type=int, default=20, required=False, help="Number of cores. Default: 20.") - + parser.add_argument("-num_cores", type=int, default=20, required=False, help="Number of cores. Default: 20.") + if len(sys.argv) == 1: parser.print_help(sys.stderr) sys.exit(1) @@ -66,9 +78,11 @@ def main(): args = parser.parse_args() print("Parameters:") - print(f"Effective population size: {args.Ne}") + print(f"Effective population size: {args.Ne if args.Ne is not None else 'estimated'}") print(f"Mutation rate: {args.m}") + print(f"Mutation map: {args.mut_map}") print(f"Recombination to mutation ratio: {args.ratio}") + print(f"Recombination map: {args.recomb_map}") print(f"Block length: {args.L}") print(f"VCF file prefix: {args.vcf}") print(f"Output file prefix: {args.output}") @@ -79,7 +93,7 @@ def main(): print(f"Number of cores: {args.num_cores}") index_vcf(args.vcf, args.L) - run_singer_in_parallel(args.vcf, args.output, args.m, args.ratio, args.L, args.n, args.thin, args.Ne, args.polar, args.num_cores) + run_singer_in_parallel(args.vcf, args.output, args.m, args.ratio, args.L, args.n, args.thin, args.Ne, args.polar, args.num_cores, args.recomb_map, args.mut_map) convert_long_ARG(args.vcf, args.output, args.n, args.freq) if __name__ == "__main__": diff --git a/SINGER/SINGER/singer_master b/SINGER/SINGER/singer_master index 21087cc..73cbcff 100755 --- a/SINGER/SINGER/singer_master +++ b/SINGER/SINGER/singer_master @@ -293,16 +293,15 @@ def main(): if args.m: + if args.recomb_map: + parser.error("-recomb_map is not compatible with -m; you must use -mut_map") if args.Ne < 0: args.Ne = compute_Ne(args.vcf, args.start, args.end, args.m) if (args.resume): resume_rate_singer(args.Ne, args.m, args.m*args.ratio, args.start, args.end, args.vcf, args.output, args.n, args.thin, args.polar, args.seed) else: run_rate_singer(args.Ne, args.m, args.m*args.ratio, args.start, args.end, args.vcf, args.output, args.n, args.thin, args.polar, args.seed) - return - - if args.mut_map and not args.m: - args.m = compute_average_rate(args.mut_map, args.start, args.end) + elif args.mut_map: if not args.recomb_map: parser.error("You must provide -recomb_map when using the -mut_map flag") From 832f759b9eeb3e5aa0a1d2026573fe077319af92 Mon Sep 17 00:00:00 2001 From: Drew DeHaas Date: Thu, 7 Nov 2024 15:28:53 -0500 Subject: [PATCH 2/3] Rate_map has to take the -start option into account SINGER takes -start which allows you to run on a segment of the genome other than starting at basepair=0. Previously the Rate_map ignored this, meaning it was emitting recombination/mutation rates for the wrong region. It could also end up producing queries for coordinates that were prior to the start of the rate map, due to mutations being offset incorrectly. --- SINGER/SINGER/Rate_map.cpp | 8 +++++--- SINGER/SINGER/Rate_map.hpp | 12 ++++++++---- SINGER/SINGER/main.cpp | 4 ++-- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/SINGER/SINGER/Rate_map.cpp b/SINGER/SINGER/Rate_map.cpp index 11e843b..9220d54 100644 --- a/SINGER/SINGER/Rate_map.cpp +++ b/SINGER/SINGER/Rate_map.cpp @@ -7,7 +7,8 @@ #include "Rate_map.hpp" -Rate_map::Rate_map() {} +Rate_map::Rate_map(double poffset) + : offset(poffset) {} void Rate_map::load_map(string mut_map_file) { ifstream fin(mut_map_file); @@ -38,10 +39,11 @@ int Rate_map::find_index(double x) { } double Rate_map::cumulative_distance(double x) { - int index = find_index(x); + const double actualCoordinate = x + offset; + int index = find_index(actualCoordinate); double prev_dist = rate_distances[index]; double next_dist = rate_distances[index+1]; - double p = (x - coordinates[index])/(coordinates[index+1] - coordinates[index]); + double p = (actualCoordinate - coordinates[index])/(coordinates[index+1] - coordinates[index]); double dist = (1-p)*prev_dist + p*next_dist; return dist; } diff --git a/SINGER/SINGER/Rate_map.hpp b/SINGER/SINGER/Rate_map.hpp index c575d36..cdc6128 100644 --- a/SINGER/SINGER/Rate_map.hpp +++ b/SINGER/SINGER/Rate_map.hpp @@ -9,28 +9,32 @@ #define Rate_map_hpp #include +#include #include "Node.hpp" class Rate_map { public: - double sequence_length = INT_MAX; + // The rate map is for the entire chromosome, typically, but you may be running SINGER on only + // a subset of that chromosome. This offset is where SINGER's concept of 0 starts. + double offset = std::numeric_limits::max(); + double sequence_length = std::numeric_limits::max(); vector coordinates = {}; vector rate_distances = {}; - Rate_map(); + Rate_map(double poffset = 0.0); void load_map(string mut_map_file); - int find_index(double x); - double cumulative_distance(double x); double segment_distance(double x, double y); double mean_rate(); +private: + int find_index(double x); }; #endif /* Rate_map_hpp */ diff --git a/SINGER/SINGER/main.cpp b/SINGER/SINGER/main.cpp index b541aae..eea1366 100644 --- a/SINGER/SINGER/main.cpp +++ b/SINGER/SINGER/main.cpp @@ -251,9 +251,9 @@ int main(int argc, const char * argv[]) { if (bothRates) { sampler = Sampler(Ne, r, m); } else { - Rate_map recomb_map = Rate_map(); + Rate_map recomb_map = Rate_map(start_pos); recomb_map.load_map(recomb_map_filename); - Rate_map mut_map = Rate_map(); + Rate_map mut_map = Rate_map(start_pos); mut_map.load_map(mut_map_filename); sampler = Sampler(Ne, recomb_map, mut_map); } From 8f55d022da4200a93733e04ba7d78a52654ebc8d Mon Sep 17 00:00:00 2001 From: Drew DeHaas Date: Mon, 11 Nov 2024 12:26:02 -0500 Subject: [PATCH 3/3] "Support" regions of 0 recombination; Hg38 has a few of these Having recombination rate of 0 causes NaNs in downstream probabilities, and forbidding SINGER from allowing recombination is likely difficult to do. Instead, just impose a very small recombination rate in any region that has too low of a rate. --- SINGER/SINGER/Rate_map.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/SINGER/SINGER/Rate_map.cpp b/SINGER/SINGER/Rate_map.cpp index 9220d54..c4e601f 100644 --- a/SINGER/SINGER/Rate_map.cpp +++ b/SINGER/SINGER/Rate_map.cpp @@ -7,6 +7,10 @@ #include "Rate_map.hpp" +// Even if there are regions of truly no recombination, we have to pretend there is +// a really low rate to make the math work out currently. +static constexpr double MIN_RECOMB_RATE = 1e-16; + Rate_map::Rate_map(double poffset) : offset(poffset) {} @@ -22,8 +26,12 @@ void Rate_map::load_map(string mut_map_file) { double rate; double mut_dist; while (fin >> left >> right >> rate) { + if (rate < MIN_RECOMB_RATE) { + rate = MIN_RECOMB_RATE; + } coordinates.push_back(left); mut_dist = rate_distances.back() + rate*(right - left); + assert(mut_dist != rate_distances.back()); rate_distances.push_back(mut_dist); } sequence_length = right; @@ -49,7 +57,9 @@ double Rate_map::cumulative_distance(double x) { } double Rate_map::segment_distance(double x, double y) { - return cumulative_distance(y) - cumulative_distance(x); + double cd = cumulative_distance(y) - cumulative_distance(x); + assert(cd > 0.0); + return cd; } double Rate_map::mean_rate() {