@@ -5,21 +5,26 @@ import os
55import subprocess
66import argparse
77import random
8- import uuid
8+ # import uuid
9+ import numpy as np
10+ # import pysam
911
1012def random_string ():
1113 unique_id = uuid .uuid4 ().hex
1214 return unique_id
1315
16+ '''
1417def write_rate_map(output_prefix, rate, x, y):
1518 rate_table = np.array([[0, y, rate]])
1619 rand_suffix = random_string()
1720 map_filename = f"{output_prefix}_{rand_suffix}.txt"
21+ print(rate_table)
1822 np.savetxt(map_filename, rate_table, delimiter=' ')
1923 return map_filename
24+ '''
2025
2126def compute_average_rate (map_file , x , y ):
22- data = np .loadtxt (map_file )
27+ data = np .loadtxt (map_file , ndmin = 2 )
2328 mask = (data [:, 1 ] > x ) & (data [:, 0 ] < y )
2429 selected_data = data [mask ]
2530
@@ -38,28 +43,21 @@ def compute_average_rate(map_file, x, y):
3843 average_rate = weighted_sum / total_length
3944 return average_rate
4045
41- def tabix_vcf (vcf_file ):
42- index_file = vcf_file + '.tbi'
43- if not os .path .exists (index_file ):
44- print (f"Indexing VCF file: { vcf_file } " )
45- subprocess .run (['tabix' , '-p' , 'vcf' , vcf_file ], check = True )
46- else :
47- print (f"Index file already exists: { index_file } " )
48-
46+ '''
4947def calculate_diversity_with_filter(vcf_file, start, end):
50- tabix_vcf ( vcf_file )
48+ # Open the VCF file using pysam
5149 vcf = pysam.VariantFile(vcf_file)
52- try :
53- first_variant = next (vcf .fetch ())
54- chrom = first_variant .chrom
55- except StopIteration :
56- raise ValueError ("The VCF file is empty or does not contain any variants." )
57-
50+
5851 num_variants = 0
5952 total_diversity = 0.0
6053 snp_positions = []
61-
62- for record in vcf .fetch (chrom , start , end ):
54+
55+ # Iterate through all records and filter based on position
56+ for record in vcf:
57+ if record.pos < start:
58+ continue
59+ if record.pos > end:
60+ break
6361 snp_positions.append(record.pos)
6462 allele_counts = [0] * len(record.alleles)
6563 num_samples = 0
@@ -80,34 +78,97 @@ def calculate_diversity_with_filter(vcf_file, start, end):
8078 total_diversity += variant_diversity
8179 num_variants += 1
8280
81+ # Calculate SNP distances
8382 snp_distances = np.diff(snp_positions)
8483
8584 if len(snp_distances) == 0:
8685 return 0.0
87- snp_distances = np .append (snp_positions [0 ] - start , snp_distances , )
86+
87+ # Append the distances to the start and end positions
88+ snp_distances = np.append(snp_positions[0] - start, snp_distances)
8889 snp_distances = np.append(snp_distances, end - snp_positions[-1])
90+
91+ # Sort the distances and filter out the longest 5 distances
8992 sorted_snp_distances = np.sort(snp_distances)
90- filtered_length = np .sum (sorted_snp_distances [0 :- 5 ])
93+ filtered_length = np.sum(sorted_snp_distances[:-5])
94+
95+ # Calculate nucleotide diversity
9196 nucleotide_diversity = total_diversity / filtered_length
97+
98+ return nucleotide_diversity
99+ '''
100+
101+ def calculate_diversity (vcf_file , start , end ):
102+ total_diversity = 0.0
103+ segment_length = end - start
104+ num_variants = 0
105+
106+ # Open the VCF file and read it line by line
107+ with open (vcf_file , 'r' ) as vcf :
108+ for line in vcf :
109+ # Skip header lines
110+ if line .startswith ("#" ):
111+ continue
112+
113+ # Split the VCF record into fields
114+ fields = line .strip ().split ('\t ' )
115+ pos = int (fields [1 ]) # Get the position of the variant
116+
117+ # Check if the variant is within the target region
118+ if pos < start :
119+ continue
120+ if pos > end :
121+ break
122+
123+ # Extract genotype information for each sample (after the 9th column)
124+ alleles = [fields [3 ]] + fields [4 ].split ("," ) # REF and ALT alleles
125+ allele_counts = [0 ] * len (alleles )
126+ num_samples = 0
127+
128+ # Iterate over the genotype columns to count allele occurrences
129+ for sample_field in fields [9 :]:
130+ gt = sample_field .split (':' )[0 ] # Extract the genotype (e.g., "0/1")
131+ if '.' in gt : # Missing data
132+ continue
133+
134+ # Genotype fields are usually formatted like "0/1", "1/1", etc.
135+ gt_alleles = gt .replace ('|' , '/' ).split ('/' ) # Handle phased/unphased
136+ num_samples += 1
137+ for allele in gt_alleles :
138+ allele_counts [int (allele )] += 1
139+
140+ # Calculate allele frequencies
141+ if num_samples == 0 :
142+ continue # Skip if there are no valid samples
143+ allele_freqs = [ac / (2 * num_samples ) for ac in allele_counts ]
144+
145+ # Calculate diversity for the current variant
146+ variant_diversity = sum (f * (1 - f ) for f in allele_freqs if f > 0 and f < 1 )
147+
148+ # Add to total diversity
149+ total_diversity += variant_diversity
150+ num_variants += 1
151+
152+ nucleotide_diversity = total_diversity / segment_length
92153 return nucleotide_diversity
93154
94155def compute_Ne (vcf_prefix , start , end , m ):
95- vcf_filename = vcf + ".vcf"
96- effective_diversity = calculate_diversity_with_filter (vcf_filename , start , end )
97- Ne = effective_diversity / 4 / m
156+ vcf_filename = vcf_prefix + ".vcf"
157+ diversity = calculate_diversity (vcf_filename , start , end )
158+ Ne = diversity / 4 / m
98159 return Ne
99160
100- def run_singer (Ne , mut_map_filename , recomb_map_filename , start , end , vcf_prefix , output_prefix , num_iters , thin , polar , seed ):
161+ def run_rate_singer (Ne , m , r , start , end , vcf_prefix , output_prefix , num_iters , thin , polar , seed ):
101162 attempts = 0
102163 max_attempts = 100
103164 random .seed (seed )
104- random_seeds = [random .randint (0 , 2 ** 30 - 1 ) for _ in range (max_attempts )]
105-
165+ random_seeds = [random .randint (0 , 2 ** 30 - 1 ) for _ in range (max_attempts )]
166+
106167 script_dir = os .path .dirname (os .path .realpath (__file__ ))
107- singer_executable = os .path .join (script_dir , "singer" )
108-
109- start_cmd = f"{ singer_executable } -Ne { Ne } -mut_map { mut_map_filename } -recomb_map { recomb_map_filename } -input { vcf_prefix } -output { output_prefix } -start { start } -end { end } -polar { polar } -n { num_iters } -thin { thin } "
110- debug_cmd = f"{ singer_executable } -Ne { Ne } -m { mutation_rate } -r { mutation_rate * recombination_to_mutation_ratio } -input { vcf_prefix } -output { output_prefix } -start { start } -end { end } -polar { polar } -n { num_iters } -thin { thin } -debug"
168+ singer_executable = os .path .join (script_dir , "singer" )
169+
170+ start_cmd = f"{ singer_executable } -Ne { Ne } -m { m } -r { r } -input { vcf_prefix } -output { output_prefix } -start { start } -end { end } -polar { polar } -n { num_iters } -thin { thin } "
171+ debug_cmd = f"{ singer_executable } -Ne { Ne } -m { m } -r { r } -input { vcf_prefix } -output { output_prefix } -start { start } -end { end } -polar { polar } -n { num_iters } -thin { thin } -debug"
111172
112173 seeded_start_cmd = start_cmd + f" -seed { seed } "
113174 print (seeded_start_cmd )
@@ -123,7 +184,7 @@ def run_singer(Ne, mut_map_filename, recomb_map_filename, start, end, vcf_prefix
123184 print (
"Auto-debug failed. Contact the author for help: [email protected] " )
124185 sys .exit (1 )
125186
126- def run_haps_singer (Ne , mutation_rate , recombination_to_mutation_ratio , start , end , haps_prefix , output_prefix , num_iters , thin , polar , seed ):
187+ def resume_rate_singer (Ne , m , r , start , end , vcf_prefix , output_prefix , num_iters , thin , polar , seed ):
127188 attempts = 0
128189 max_attempts = 100
129190 random .seed (seed )
@@ -132,12 +193,12 @@ def run_haps_singer(Ne, mutation_rate, recombination_to_mutation_ratio, start, e
132193 script_dir = os .path .dirname (os .path .realpath (__file__ ))
133194 singer_executable = os .path .join (script_dir , "singer" )
134195
135- start_cmd = f"{ singer_executable } -Ne { Ne } -m { mutation_rate } -r { mutation_rate * recombination_to_mutation_ratio } -haps { haps_prefix } -output { output_prefix } -start { start } -end { end } -polar { polar } -n { num_iters } -thin { thin } "
136- debug_cmd = f"{ singer_executable } -Ne { Ne } -m { mutation_rate } -r { mutation_rate * recombination_to_mutation_ratio } -haps { haps_prefix } -output { output_prefix } -start { start } -end { end } -polar { polar } -n { num_iters } -thin { thin } -debug"
196+ resume_cmd = f"{ singer_executable } -Ne { Ne } -m { m } -r { r } -input { vcf_prefix } -output { output_prefix } -start { start } -end { end } -polar { polar } -n { num_iters } -thin { thin } -resume "
197+ debug_cmd = f"{ singer_executable } -Ne { Ne } -m { m } -r { r } -input { vcf_prefix } -output { output_prefix } -start { start } -end { end } -polar { polar } -n { num_iters } -thin { thin } -debug"
137198
138- seeded_start_cmd = start_cmd + f" -seed { seed } "
139- print (seeded_start_cmd )
140- process = subprocess .run (seeded_start_cmd .split (), check = False )
199+ seeded_resume_cmd = resume_cmd + f" -seed { seed } "
200+ print (seeded_resume_cmd )
201+ process = subprocess .run (seeded_resume_cmd .split (), check = False )
141202 while process .returncode != 0 and attempts < max_attempts :
142203 print (f"Auto-debug iteration: { attempts } " )
143204 seeded_debug_cmd = debug_cmd + f" -seed { random_seeds [attempts ]} "
@@ -149,21 +210,21 @@ def run_haps_singer(Ne, mutation_rate, recombination_to_mutation_ratio, start, e
149210 print (
"Auto-debug failed. Contact the author for help: [email protected] " )
150211 sys .exit (1 )
151212
152- def resume_singer (Ne , mutation_rate , recombination_to_mutation_ratio , start , end , vcf_prefix , output_prefix , num_iters , thin , polar , seed ):
213+ def run_map_singer (Ne , mut_map_filename , recomb_map_filename , start , end , vcf_prefix , output_prefix , num_iters , thin , polar , seed ):
153214 attempts = 0
154215 max_attempts = 100
155216 random .seed (seed )
156- random_seeds = [random .randint (0 , 2 ** 30 - 1 ) for _ in range (max_attempts )]
157-
217+ random_seeds = [random .randint (0 , 2 ** 30 - 1 ) for _ in range (max_attempts )]
218+
158219 script_dir = os .path .dirname (os .path .realpath (__file__ ))
159- singer_executable = os .path .join (script_dir , "singer" )
220+ singer_executable = os .path .join (script_dir , "singer" )
221+
222+ start_cmd = f"{ singer_executable } -Ne { Ne } -mut_map { mut_map_filename } -recomb_map { recomb_map_filename } -input { vcf_prefix } -output { output_prefix } -start { start } -end { end } -polar { polar } -n { num_iters } -thin { thin } "
223+ debug_cmd = f"{ singer_executable } -Ne { Ne } -mut_map { mut_map_filename } -recomb_map { recomb_map_filename } -input { vcf_prefix } -output { output_prefix } -start { start } -end { end } -polar { polar } -n { num_iters } -thin { thin } -debug"
160224
161- resume_cmd = f"{ singer_executable } -Ne { Ne } -m { mutation_rate } -r { mutation_rate * recombination_to_mutation_ratio } -input { vcf_prefix } -output { output_prefix } -start { start } -end { end } -polar { polar } -n { num_iters } -thin { thin } -resume"
162- debug_cmd = f"{ singer_executable } -Ne { Ne } -m { mutation_rate } -r { mutation_rate * recombination_to_mutation_ratio } -input { vcf_prefix } -output { output_prefix } -start { start } -end { end } -polar { polar } -n { num_iters } -thin { thin } -debug"
163-
164- seeded_resume_cmd = resume_cmd + f" -seed { seed } "
165- print (seeded_resume_cmd )
166- process = subprocess .run (seeded_resume_cmd .split (), check = False )
225+ seeded_start_cmd = start_cmd + f" -seed { seed } "
226+ print (seeded_start_cmd )
227+ process = subprocess .run (seeded_start_cmd .split (), check = False )
167228 while process .returncode != 0 and attempts < max_attempts :
168229 print (f"Auto-debug iteration: { attempts } " )
169230 seeded_debug_cmd = debug_cmd + f" -seed { random_seeds [attempts ]} "
@@ -175,8 +236,8 @@ def resume_singer(Ne, mutation_rate, recombination_to_mutation_ratio, start, end
175236 print (
"Auto-debug failed. Contact the author for help: [email protected] " )
176237 sys .exit (1 )
177238
178-
179- def resume_fast_singer (Ne , mutation_rate , recombination_to_mutation_ratio , start , end , vcf_prefix , output_prefix , num_iters , thin , polar , seed ):
239+
240+ def resume_map_singer (Ne , mut_map_filename , recomb_map_filename , start , end , vcf_prefix , output_prefix , num_iters , thin , polar , seed ):
180241 attempts = 0
181242 max_attempts = 100
182243 random .seed (seed )
@@ -185,9 +246,9 @@ def resume_fast_singer(Ne, mutation_rate, recombination_to_mutation_ratio, start
185246 script_dir = os .path .dirname (os .path .realpath (__file__ ))
186247 singer_executable = os .path .join (script_dir , "singer" )
187248
188- resume_cmd = f"{ singer_executable } -fast - Ne { Ne } -m { mutation_rate } -r { mutation_rate * recombination_to_mutation_ratio } -input { vcf_prefix } -output { output_prefix } -start { start } -end { end } -polar { polar } -n { num_iters } -thin { thin } -resume"
189- debug_cmd = f"{ singer_executable } -fast - Ne { Ne } -m { mutation_rate } -r { mutation_rate * recombination_to_mutation_ratio } -input { vcf_prefix } -output { output_prefix } -start { start } -end { end } -polar { polar } -n { num_iters } -thin { thin } -debug"
190-
249+ resume_cmd = f"{ singer_executable } -Ne { Ne } -mut_map { mut_map_filename } -recomb_map { recomb_map_filename } -input { vcf_prefix } -output { output_prefix } -start { start } -end { end } -polar { polar } -n { num_iters } -thin { thin } -resume"
250+ debug_cmd = f"{ singer_executable } -Ne { Ne } -mut_map { mut_map_filename } -recomb_filename { recomb_map_filename } -input { vcf_prefix } -output { output_prefix } -start { start } -end { end } -polar { polar } -n { num_iters } -thin { thin } -debug"
251+
191252 seeded_resume_cmd = resume_cmd + f" -seed { seed } "
192253 print (seeded_resume_cmd )
193254 process = subprocess .run (seeded_resume_cmd .split (), check = False )
@@ -202,19 +263,19 @@ def resume_fast_singer(Ne, mutation_rate, recombination_to_mutation_ratio, start
202263 print (
"Auto-debug failed. Contact the author for help: [email protected] " )
203264 sys .exit (1 )
204265
205-
266+
206267def main ():
207268 parser = argparse .ArgumentParser (description = 'Sample and infer ARG from genetic variation data' )
208269
209270 parser .add_argument ('-Ne' , type = float , default = - 1 , help = 'Effective population size.' )
210- parser .add_argument ('-m' , type = float , required = True , help = 'Mutation rate.' )
271+ parser .add_argument ('-m' , type = float , help = 'Mutation rate.' )
211272 parser .add_argument ('-ratio' , type = float , default = 1 , help = 'Recombination to mutation ratio. Default: 1.' )
212273 parser .add_argument ('-recomb_map' , type = str , help = 'Filename for the recombination rate map.' )
213274 parser .add_argument ('-mut_map' , type = str , help = 'Filename for the mutation rate map.' )
214275 parser .add_argument ('-vcf' , type = str , help = 'VCF file prefix (without .vcf extension).' )
215276 parser .add_argument ('-output' , type = str , required = True , help = 'Output file prefix.' )
216- parser .add_argument ('-start' , type = str , required = True , help = 'Start position.' )
217- parser .add_argument ('-end' , type = str , required = True , help = 'End position.' )
277+ parser .add_argument ('-start' , type = float , required = True , help = 'Start position.' )
278+ parser .add_argument ('-end' , type = float , required = True , help = 'End position.' )
218279 parser .add_argument ('-n' , type = int , default = 100 , help = 'Number of MCMC samples. Default: 100.' )
219280 parser .add_argument ('-thin' , type = int , default = 20 , help = 'Thinning interval length. Default: 20.' )
220281 parser .add_argument ('-polar' , type = float , default = 0.5 , help = 'Site flip probability. Default: 0.5.' )
@@ -230,28 +291,31 @@ def main():
230291 if not args .m and not args .mut_map :
231292 parser .error ("You must provide either -m or -mut_map" )
232293
233- if not args .ratio and not args .recomb_map :
234- parser .error ("You must provide either -ratio or -recomb_map" )
235-
236294
237295 if args .m :
238- args .mut_map = write_rate_map (args .m , args .output , args .start , args .end )
239-
240- if args .ratio :
241- if not args .m :
242- parser .error ("You must provide -m when using the -ratio flag" )
296+ if args .Ne < 0 :
297+ args .Ne = compute_Ne (args .vcf , args .start , args .end , args .m )
298+ if (args .resume ):
299+ resume_rate_singer (args .Ne , args .m , args .m * args .ratio , args .start , args .end , args .vcf , args .output , args .n , args .thin , args .polar , args .seed )
243300 else :
244- args .recomb_map = write_rate_map (args .ratio * args .m , args .vcf , args .start , args .end , args .output )
245-
246- if not args .Ne :
247- args .m = compute_average_rate (args .mut_map , args .start , args .end )
248- args .Ne = compute_Ne (args .vcf , args .start , args .end , args .m )
249-
250- if (args .resume ):
251- resume_singer (args .Ne , args .mut_map , args .recomb_map , args .start , args .end , args .vcf , args .output , args .n , args .thin , args .polar , args .seed )
252- else :
253- run_singer (args .Ne , args .mut_map , args .recomb_map , args .start , args .end , args .vcf , args .output , args .n , args .thin , args .polar , args .seed )
301+ run_rate_singer (args .Ne , args .m , args .m * args .ratio , args .start , args .end , args .vcf , args .output , args .n , args .thin , args .polar , args .seed )
302+ return
303+
304+ else if args .mut_map and not args .m :
305+ args .m = compute_average_rate (args .mut_map , args .start , args .end )
306+ if not args .recomb_map :
307+ parser .error ("You must provide -recomb_map when using the -mut_map flag" )
254308
309+ if args .Ne < 0 :
310+ args .m = compute_average_rate (args .mut_map , args .start , args .end )
311+ args .Ne = compute_Ne (args .vcf , args .start , args .end , args .m )
312+
313+ if (args .resume ):
314+ resume_map_singer (args .Ne , args .mut_map , args .recomb_map , args .start , args .end , args .vcf , args .output , args .n , args .thin , args .polar , args .seed )
315+ else :
316+ run_map_singer (args .Ne , args .mut_map , args .recomb_map , args .start , args .end , args .vcf , args .output , args .n , args .thin , args .polar , args .seed )
317+ return
318+
255319
256320if __name__ == "__main__" :
257321 main ()
0 commit comments