Skip to content

Commit 0ad5649

Browse files
author
Yun Deng
committed
minor fixes to 0.1.8
1 parent e8d0eb3 commit 0ad5649

File tree

4 files changed

+138
-74
lines changed

4 files changed

+138
-74
lines changed
15.5 KB
Binary file not shown.
42.8 KB
Binary file not shown.
4.97 KB
Binary file not shown.

releases/singer-0.1.8-beta-linux-x86_64/singer_master

Lines changed: 138 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -5,21 +5,26 @@ import os
55
import subprocess
66
import argparse
77
import random
8-
import uuid
8+
# import uuid
9+
import numpy as np
10+
# import pysam
911

1012
def random_string():
1113
unique_id = uuid.uuid4().hex
1214
return unique_id
1315

16+
'''
1417
def write_rate_map(output_prefix, rate, x, y):
1518
rate_table = np.array([[0, y, rate]])
1619
rand_suffix = random_string()
1720
map_filename = f"{output_prefix}_{rand_suffix}.txt"
21+
print(rate_table)
1822
np.savetxt(map_filename, rate_table, delimiter=' ')
1923
return map_filename
24+
'''
2025

2126
def compute_average_rate(map_file, x, y):
22-
data = np.loadtxt(map_file)
27+
data = np.loadtxt(map_file, ndmin=2)
2328
mask = (data[:, 1] > x) & (data[:, 0] < y)
2429
selected_data = data[mask]
2530

@@ -38,28 +43,21 @@ def compute_average_rate(map_file, x, y):
3843
average_rate = weighted_sum / total_length
3944
return average_rate
4045

41-
def tabix_vcf(vcf_file):
42-
index_file = vcf_file + '.tbi'
43-
if not os.path.exists(index_file):
44-
print(f"Indexing VCF file: {vcf_file}")
45-
subprocess.run(['tabix', '-p', 'vcf', vcf_file], check=True)
46-
else:
47-
print(f"Index file already exists: {index_file}")
48-
46+
'''
4947
def calculate_diversity_with_filter(vcf_file, start, end):
50-
tabix_vcf(vcf_file)
48+
# Open the VCF file using pysam
5149
vcf = pysam.VariantFile(vcf_file)
52-
try:
53-
first_variant = next(vcf.fetch())
54-
chrom = first_variant.chrom
55-
except StopIteration:
56-
raise ValueError("The VCF file is empty or does not contain any variants.")
57-
50+
5851
num_variants = 0
5952
total_diversity = 0.0
6053
snp_positions = []
61-
62-
for record in vcf.fetch(chrom, start, end):
54+
55+
# Iterate through all records and filter based on position
56+
for record in vcf:
57+
if record.pos < start:
58+
continue
59+
if record.pos > end:
60+
break
6361
snp_positions.append(record.pos)
6462
allele_counts = [0] * len(record.alleles)
6563
num_samples = 0
@@ -80,34 +78,97 @@ def calculate_diversity_with_filter(vcf_file, start, end):
8078
total_diversity += variant_diversity
8179
num_variants += 1
8280
81+
# Calculate SNP distances
8382
snp_distances = np.diff(snp_positions)
8483
8584
if len(snp_distances) == 0:
8685
return 0.0
87-
snp_distances = np.append(snp_positions[0] - start, snp_distances, )
86+
87+
# Append the distances to the start and end positions
88+
snp_distances = np.append(snp_positions[0] - start, snp_distances)
8889
snp_distances = np.append(snp_distances, end - snp_positions[-1])
90+
91+
# Sort the distances and filter out the longest 5 distances
8992
sorted_snp_distances = np.sort(snp_distances)
90-
filtered_length = np.sum(sorted_snp_distances[0:-5])
93+
filtered_length = np.sum(sorted_snp_distances[:-5])
94+
95+
# Calculate nucleotide diversity
9196
nucleotide_diversity = total_diversity / filtered_length
97+
98+
return nucleotide_diversity
99+
'''
100+
101+
def calculate_diversity(vcf_file, start, end):
102+
total_diversity = 0.0
103+
segment_length = end - start
104+
num_variants = 0
105+
106+
# Open the VCF file and read it line by line
107+
with open(vcf_file, 'r') as vcf:
108+
for line in vcf:
109+
# Skip header lines
110+
if line.startswith("#"):
111+
continue
112+
113+
# Split the VCF record into fields
114+
fields = line.strip().split('\t')
115+
pos = int(fields[1]) # Get the position of the variant
116+
117+
# Check if the variant is within the target region
118+
if pos < start:
119+
continue
120+
if pos > end:
121+
break
122+
123+
# Extract genotype information for each sample (after the 9th column)
124+
alleles = [fields[3]] + fields[4].split(",") # REF and ALT alleles
125+
allele_counts = [0] * len(alleles)
126+
num_samples = 0
127+
128+
# Iterate over the genotype columns to count allele occurrences
129+
for sample_field in fields[9:]:
130+
gt = sample_field.split(':')[0] # Extract the genotype (e.g., "0/1")
131+
if '.' in gt: # Missing data
132+
continue
133+
134+
# Genotype fields are usually formatted like "0/1", "1/1", etc.
135+
gt_alleles = gt.replace('|', '/').split('/') # Handle phased/unphased
136+
num_samples += 1
137+
for allele in gt_alleles:
138+
allele_counts[int(allele)] += 1
139+
140+
# Calculate allele frequencies
141+
if num_samples == 0:
142+
continue # Skip if there are no valid samples
143+
allele_freqs = [ac / (2 * num_samples) for ac in allele_counts]
144+
145+
# Calculate diversity for the current variant
146+
variant_diversity = sum(f * (1 - f) for f in allele_freqs if f > 0 and f < 1)
147+
148+
# Add to total diversity
149+
total_diversity += variant_diversity
150+
num_variants += 1
151+
152+
nucleotide_diversity = total_diversity / segment_length
92153
return nucleotide_diversity
93154

94155
def compute_Ne(vcf_prefix, start, end, m):
95-
vcf_filename = vcf + ".vcf"
96-
effective_diversity = calculate_diversity_with_filter(vcf_filename, start, end)
97-
Ne = effective_diversity/4/m
156+
vcf_filename = vcf_prefix + ".vcf"
157+
diversity = calculate_diversity(vcf_filename, start, end)
158+
Ne = diversity/4/m
98159
return Ne
99160

100-
def run_singer(Ne, mut_map_filename, recomb_map_filename, start, end, vcf_prefix, output_prefix, num_iters, thin, polar, seed):
161+
def run_rate_singer(Ne, m, r, start, end, vcf_prefix, output_prefix, num_iters, thin, polar, seed):
101162
attempts = 0
102163
max_attempts = 100
103164
random.seed(seed)
104-
random_seeds = [random.randint(0, 2**30 - 1) for _ in range(max_attempts)]
105-
165+
random_seeds = [random.randint(0, 2**30 - 1) for _ in range(max_attempts)]
166+
106167
script_dir = os.path.dirname(os.path.realpath(__file__))
107-
singer_executable = os.path.join(script_dir, "singer")
108-
109-
start_cmd = f"{singer_executable} -Ne {Ne} -mut_map {mut_map_filename} -recomb_map {recomb_map_filename} -input {vcf_prefix} -output {output_prefix} -start {start} -end {end} -polar {polar} -n {num_iters} -thin {thin}"
110-
debug_cmd = f"{singer_executable} -Ne {Ne} -m {mutation_rate} -r {mutation_rate * recombination_to_mutation_ratio} -input {vcf_prefix} -output {output_prefix} -start {start} -end {end} -polar {polar} -n {num_iters} -thin {thin} -debug"
168+
singer_executable = os.path.join(script_dir, "singer")
169+
170+
start_cmd = f"{singer_executable} -Ne {Ne} -m {m} -r {r} -input {vcf_prefix} -output {output_prefix} -start {start} -end {end} -polar {polar} -n {num_iters} -thin {thin}"
171+
debug_cmd = f"{singer_executable} -Ne {Ne} -m {m} -r {r} -input {vcf_prefix} -output {output_prefix} -start {start} -end {end} -polar {polar} -n {num_iters} -thin {thin} -debug"
111172

112173
seeded_start_cmd = start_cmd + f" -seed {seed}"
113174
print(seeded_start_cmd)
@@ -123,7 +184,7 @@ def run_singer(Ne, mut_map_filename, recomb_map_filename, start, end, vcf_prefix
123184
print("Auto-debug failed. Contact the author for help: [email protected]")
124185
sys.exit(1)
125186

126-
def run_haps_singer(Ne, mutation_rate, recombination_to_mutation_ratio, start, end, haps_prefix, output_prefix, num_iters, thin, polar, seed):
187+
def resume_rate_singer(Ne, m, r, start, end, vcf_prefix, output_prefix, num_iters, thin, polar, seed):
127188
attempts = 0
128189
max_attempts = 100
129190
random.seed(seed)
@@ -132,12 +193,12 @@ def run_haps_singer(Ne, mutation_rate, recombination_to_mutation_ratio, start, e
132193
script_dir = os.path.dirname(os.path.realpath(__file__))
133194
singer_executable = os.path.join(script_dir, "singer")
134195

135-
start_cmd = f"{singer_executable} -Ne {Ne} -m {mutation_rate} -r {mutation_rate * recombination_to_mutation_ratio} -haps {haps_prefix} -output {output_prefix} -start {start} -end {end} -polar {polar} -n {num_iters} -thin {thin}"
136-
debug_cmd = f"{singer_executable} -Ne {Ne} -m {mutation_rate} -r {mutation_rate * recombination_to_mutation_ratio} -haps {haps_prefix} -output {output_prefix} -start {start} -end {end} -polar {polar} -n {num_iters} -thin {thin} -debug"
196+
resume_cmd = f"{singer_executable} -Ne {Ne} -m {m} -r {r} -input {vcf_prefix} -output {output_prefix} -start {start} -end {end} -polar {polar} -n {num_iters} -thin {thin} -resume"
197+
debug_cmd = f"{singer_executable} -Ne {Ne} -m {m} -r {r} -input {vcf_prefix} -output {output_prefix} -start {start} -end {end} -polar {polar} -n {num_iters} -thin {thin} -debug"
137198

138-
seeded_start_cmd = start_cmd + f" -seed {seed}"
139-
print(seeded_start_cmd)
140-
process = subprocess.run(seeded_start_cmd.split(), check=False)
199+
seeded_resume_cmd = resume_cmd + f" -seed {seed}"
200+
print(seeded_resume_cmd)
201+
process = subprocess.run(seeded_resume_cmd.split(), check=False)
141202
while process.returncode != 0 and attempts < max_attempts:
142203
print(f"Auto-debug iteration: {attempts}")
143204
seeded_debug_cmd = debug_cmd + f" -seed {random_seeds[attempts]}"
@@ -149,21 +210,21 @@ def run_haps_singer(Ne, mutation_rate, recombination_to_mutation_ratio, start, e
149210
print("Auto-debug failed. Contact the author for help: [email protected]")
150211
sys.exit(1)
151212

152-
def resume_singer(Ne, mutation_rate, recombination_to_mutation_ratio, start, end, vcf_prefix, output_prefix, num_iters, thin, polar, seed):
213+
def run_map_singer(Ne, mut_map_filename, recomb_map_filename, start, end, vcf_prefix, output_prefix, num_iters, thin, polar, seed):
153214
attempts = 0
154215
max_attempts = 100
155216
random.seed(seed)
156-
random_seeds = [random.randint(0, 2**30 - 1) for _ in range(max_attempts)]
157-
217+
random_seeds = [random.randint(0, 2**30 - 1) for _ in range(max_attempts)]
218+
158219
script_dir = os.path.dirname(os.path.realpath(__file__))
159-
singer_executable = os.path.join(script_dir, "singer")
220+
singer_executable = os.path.join(script_dir, "singer")
221+
222+
start_cmd = f"{singer_executable} -Ne {Ne} -mut_map {mut_map_filename} -recomb_map {recomb_map_filename} -input {vcf_prefix} -output {output_prefix} -start {start} -end {end} -polar {polar} -n {num_iters} -thin {thin}"
223+
debug_cmd = f"{singer_executable} -Ne {Ne} -mut_map {mut_map_filename} -recomb_map {recomb_map_filename} -input {vcf_prefix} -output {output_prefix} -start {start} -end {end} -polar {polar} -n {num_iters} -thin {thin} -debug"
160224

161-
resume_cmd = f"{singer_executable} -Ne {Ne} -m {mutation_rate} -r {mutation_rate * recombination_to_mutation_ratio} -input {vcf_prefix} -output {output_prefix} -start {start} -end {end} -polar {polar} -n {num_iters} -thin {thin} -resume"
162-
debug_cmd = f"{singer_executable} -Ne {Ne} -m {mutation_rate} -r {mutation_rate * recombination_to_mutation_ratio} -input {vcf_prefix} -output {output_prefix} -start {start} -end {end} -polar {polar} -n {num_iters} -thin {thin} -debug"
163-
164-
seeded_resume_cmd = resume_cmd + f" -seed {seed}"
165-
print(seeded_resume_cmd)
166-
process = subprocess.run(seeded_resume_cmd.split(), check=False)
225+
seeded_start_cmd = start_cmd + f" -seed {seed}"
226+
print(seeded_start_cmd)
227+
process = subprocess.run(seeded_start_cmd.split(), check=False)
167228
while process.returncode != 0 and attempts < max_attempts:
168229
print(f"Auto-debug iteration: {attempts}")
169230
seeded_debug_cmd = debug_cmd + f" -seed {random_seeds[attempts]}"
@@ -175,8 +236,8 @@ def resume_singer(Ne, mutation_rate, recombination_to_mutation_ratio, start, end
175236
print("Auto-debug failed. Contact the author for help: [email protected]")
176237
sys.exit(1)
177238

178-
179-
def resume_fast_singer(Ne, mutation_rate, recombination_to_mutation_ratio, start, end, vcf_prefix, output_prefix, num_iters, thin, polar, seed):
239+
240+
def resume_map_singer(Ne, mut_map_filename, recomb_map_filename, start, end, vcf_prefix, output_prefix, num_iters, thin, polar, seed):
180241
attempts = 0
181242
max_attempts = 100
182243
random.seed(seed)
@@ -185,9 +246,9 @@ def resume_fast_singer(Ne, mutation_rate, recombination_to_mutation_ratio, start
185246
script_dir = os.path.dirname(os.path.realpath(__file__))
186247
singer_executable = os.path.join(script_dir, "singer")
187248

188-
resume_cmd = f"{singer_executable} -fast -Ne {Ne} -m {mutation_rate} -r {mutation_rate * recombination_to_mutation_ratio} -input {vcf_prefix} -output {output_prefix} -start {start} -end {end} -polar {polar} -n {num_iters} -thin {thin} -resume"
189-
debug_cmd = f"{singer_executable} -fast -Ne {Ne} -m {mutation_rate} -r {mutation_rate * recombination_to_mutation_ratio} -input {vcf_prefix} -output {output_prefix} -start {start} -end {end} -polar {polar} -n {num_iters} -thin {thin} -debug"
190-
249+
resume_cmd = f"{singer_executable} -Ne {Ne} -mut_map {mut_map_filename} -recomb_map {recomb_map_filename} -input {vcf_prefix} -output {output_prefix} -start {start} -end {end} -polar {polar} -n {num_iters} -thin {thin} -resume"
250+
debug_cmd = f"{singer_executable} -Ne {Ne} -mut_map {mut_map_filename} -recomb_filename {recomb_map_filename} -input {vcf_prefix} -output {output_prefix} -start {start} -end {end} -polar {polar} -n {num_iters} -thin {thin} -debug"
251+
191252
seeded_resume_cmd = resume_cmd + f" -seed {seed}"
192253
print(seeded_resume_cmd)
193254
process = subprocess.run(seeded_resume_cmd.split(), check=False)
@@ -202,19 +263,19 @@ def resume_fast_singer(Ne, mutation_rate, recombination_to_mutation_ratio, start
202263
print("Auto-debug failed. Contact the author for help: [email protected]")
203264
sys.exit(1)
204265

205-
266+
206267
def main():
207268
parser = argparse.ArgumentParser(description='Sample and infer ARG from genetic variation data')
208269

209270
parser.add_argument('-Ne', type=float, default=-1, help='Effective population size.')
210-
parser.add_argument('-m', type=float, required=True, help='Mutation rate.')
271+
parser.add_argument('-m', type=float, help='Mutation rate.')
211272
parser.add_argument('-ratio', type=float, default=1, help='Recombination to mutation ratio. Default: 1.')
212273
parser.add_argument('-recomb_map', type=str, help='Filename for the recombination rate map.')
213274
parser.add_argument('-mut_map', type=str, help='Filename for the mutation rate map.')
214275
parser.add_argument('-vcf', type=str, help='VCF file prefix (without .vcf extension).')
215276
parser.add_argument('-output', type=str, required=True, help='Output file prefix.')
216-
parser.add_argument('-start', type=str, required=True, help='Start position.')
217-
parser.add_argument('-end', type=str, required=True, help='End position.')
277+
parser.add_argument('-start', type=float, required=True, help='Start position.')
278+
parser.add_argument('-end', type=float, required=True, help='End position.')
218279
parser.add_argument('-n', type=int, default=100, help='Number of MCMC samples. Default: 100.')
219280
parser.add_argument('-thin', type=int, default=20, help='Thinning interval length. Default: 20.')
220281
parser.add_argument('-polar', type=float, default=0.5, help='Site flip probability. Default: 0.5.')
@@ -230,28 +291,31 @@ def main():
230291
if not args.m and not args.mut_map:
231292
parser.error("You must provide either -m or -mut_map")
232293

233-
if not args.ratio and not args.recomb_map:
234-
parser.error("You must provide either -ratio or -recomb_map")
235-
236294

237295
if args.m:
238-
args.mut_map = write_rate_map(args.m, args.output, args.start, args.end)
239-
240-
if args.ratio:
241-
if not args.m:
242-
parser.error("You must provide -m when using the -ratio flag")
296+
if args.Ne < 0:
297+
args.Ne = compute_Ne(args.vcf, args.start, args.end, args.m)
298+
if (args.resume):
299+
resume_rate_singer(args.Ne, args.m, args.m*args.ratio, args.start, args.end, args.vcf, args.output, args.n, args.thin, args.polar, args.seed)
243300
else:
244-
args.recomb_map = write_rate_map(args.ratio*args.m, args.vcf, args.start, args.end, args.output)
245-
246-
if not args.Ne:
247-
args.m = compute_average_rate(args.mut_map, args.start, args.end)
248-
args.Ne = compute_Ne(args.vcf, args.start, args.end, args.m)
249-
250-
if (args.resume):
251-
resume_singer(args.Ne, args.mut_map, args.recomb_map, args.start, args.end, args.vcf, args.output, args.n, args.thin, args.polar, args.seed)
252-
else:
253-
run_singer(args.Ne, args.mut_map, args.recomb_map, args.start, args.end, args.vcf, args.output, args.n, args.thin, args.polar, args.seed)
301+
run_rate_singer(args.Ne, args.m, args.m*args.ratio, args.start, args.end, args.vcf, args.output, args.n, args.thin, args.polar, args.seed)
302+
return
303+
304+
else if args.mut_map and not args.m:
305+
args.m = compute_average_rate(args.mut_map, args.start, args.end)
306+
if not args.recomb_map:
307+
parser.error("You must provide -recomb_map when using the -mut_map flag")
254308

309+
if args.Ne < 0:
310+
args.m = compute_average_rate(args.mut_map, args.start, args.end)
311+
args.Ne = compute_Ne(args.vcf, args.start, args.end, args.m)
312+
313+
if (args.resume):
314+
resume_map_singer(args.Ne, args.mut_map, args.recomb_map, args.start, args.end, args.vcf, args.output, args.n, args.thin, args.polar, args.seed)
315+
else:
316+
run_map_singer(args.Ne, args.mut_map, args.recomb_map, args.start, args.end, args.vcf, args.output, args.n, args.thin, args.polar, args.seed)
317+
return
318+
255319

256320
if __name__ == "__main__":
257321
main()

0 commit comments

Comments
 (0)