Skip to content

Commit 03b15d1

Browse files
committed
added cht_data module, forgot to add to repo earlier, but required for fit_bnb_coefficients
1 parent 295a18b commit 03b15d1

File tree

1 file changed

+225
-0
lines changed

1 file changed

+225
-0
lines changed

CHT/cht_data.py

Lines changed: 225 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,225 @@
1+
import sys
2+
import gzip
3+
4+
import numpy as np
5+
6+
class TestSNP:
7+
def __init__(self, name, geno_hap1, geno_hap2, AS_target_ref, AS_target_alt,
8+
hetps, totals, counts):
9+
self.name = name
10+
self.geno_hap1 = geno_hap1
11+
self.geno_hap2 = geno_hap2
12+
self.AS_target_ref = AS_target_ref
13+
self.AS_target_alt = AS_target_alt
14+
self.hetps = hetps
15+
self.totals = totals
16+
self.counts = counts
17+
18+
19+
def is_het(self):
20+
"""returns True if the test SNP is heterozygous"""
21+
return self.geno_hap1 != self.geno_hap2
22+
23+
def is_homo_ref(self):
24+
"""Returns True if test SNP is homozygous for reference allele"""
25+
return self.geno_hap1 == 0 and self.geno_hap2 == 0
26+
27+
def is_homo_alt(self):
28+
"""Returns True if test SNP is homozygous for non-reference allele"""
29+
return self.geno_hap1 == 1 and self.geno_hap2 == 1
30+
31+
32+
dup_snp_warn = True
33+
34+
35+
def parse_test_snp(snpinfo, shuffle=False):
36+
global dup_snp_warn
37+
snp_id = snpinfo[2]
38+
39+
tot = 0 if snpinfo[16] == "NA" else float(snpinfo[16])
40+
41+
if snpinfo[6] == "NA":
42+
geno_hap1 = 0
43+
geno_hap2 = 0
44+
else:
45+
geno_hap1 = int(snpinfo[6].strip().split("|")[0])
46+
geno_hap2 = int(snpinfo[6].strip().split("|")[1])
47+
48+
count = 0 if snpinfo[15] == "NA" else int(snpinfo[15])
49+
50+
if snpinfo[9].strip() == "NA" or geno_hap1 == geno_hap2:
51+
# SNP is homozygous, so there is no AS info
52+
return TestSNP(snp_id, geno_hap1, geno_hap2, [], [], [], tot, count)
53+
else:
54+
# positions of target SNPs
55+
snp_locs = np.array([int(y.strip()) for y in snpinfo[9].split(';')])
56+
57+
# counts of reads that match reference overlapping linked 'target' SNPs
58+
snp_as_ref = np.array([int(y) for y in snpinfo[12].split(';')])
59+
60+
# counts of reads that match alternate allele
61+
snp_as_alt = np.array([int(y) for y in snpinfo[13].split(';')])
62+
63+
# heterozygote probabilities
64+
snp_hetps = np.array([np.float64(y.strip())
65+
for y in snpinfo[10].split(';')])
66+
67+
# linkage probabilities, not currently used
68+
snp_linkageps = np.array([np.float64(y.strip())
69+
for y in snpinfo[11].split(';')])
70+
71+
72+
# same SNP should not be provided multiple times, this
73+
# can create problems with combined test. Warn and filter
74+
# duplicate SNPs
75+
uniq_loc, uniq_idx = np.unique(snp_locs, return_index=True)
76+
77+
if dup_snp_warn and uniq_loc.shape[0] != snp_locs.shape[0]:
78+
sys.stderr.write("WARNING: discarding SNPs that are repeated "
79+
"multiple times in same line\n")
80+
# only warn once
81+
dup_snp_warn = False
82+
83+
snp_as_ref = snp_as_ref[uniq_idx]
84+
snp_as_alt = snp_as_alt[uniq_idx]
85+
snp_hetps = snp_hetps[uniq_idx]
86+
87+
# linkage probabilities currently not used
88+
snp_linkageps = snp_linkageps[uniq_idx]
89+
90+
if shuffle:
91+
# permute allele-specific read counts by flipping them randomly at
92+
# each SNP
93+
for y in range(len(snp_as_ref)):
94+
if random.randint(0, 1) == 1:
95+
temp = snp_as_ref[y]
96+
snp_as_ref[y] = snp_as_alt[y]
97+
snp_as_alt[y] = temp
98+
99+
return TestSNP(snp_id, geno_hap1, geno_hap2, snp_as_ref,
100+
snp_as_alt, snp_hetps, tot, count)
101+
102+
103+
104+
105+
106+
def open_input_files(in_filename):
107+
if not os.path.exists(in_filename) or not os.path.isfile(in_filename):
108+
raise IOError("input file %s does not exist or is not a "
109+
"regular file\n" % in_filename)
110+
111+
# read file that contains list of input files
112+
in_file = open(in_filename)
113+
114+
infiles = []
115+
for line in in_file:
116+
# open each input file and read first line
117+
filename = line.rstrip()
118+
sys.stderr.write(" " + filename + "\n")
119+
if (not filename) or (not os.path.exists(filename)) or \
120+
(not os.path.isfile(filename)):
121+
sys.stderr.write("input file '%s' does not exist or is not a "
122+
"regular file\n" % in_file)
123+
exit(2)
124+
if filename.endswith(".gz"):
125+
f = gzip.open(filename, "rt")
126+
else:
127+
f = open(filename)
128+
129+
# skip header
130+
f.readline()
131+
132+
infiles.append(f)
133+
in_file.close()
134+
135+
if len(infiles) == 0:
136+
sys.stderr.write("no input files specified in file '%s'\n" % in_filename)
137+
exit(2)
138+
139+
return infiles
140+
141+
142+
143+
144+
145+
146+
def read_count_matrices(input_filename, shuffle=False, skip=0, min_counts=0, min_as_counts=0):
147+
"""Given an input file that contains paths to input files for all individuals, and returns
148+
matrix of observed read counts, and matrix of expected read counts
149+
"""
150+
infiles = open_input_files(input_filename)
151+
152+
is_finished = False
153+
count_matrix = []
154+
expected_matrix = []
155+
line_num = 0
156+
skip_num = 0
157+
158+
while not is_finished:
159+
is_comment = False
160+
line_num += 1
161+
count_line = []
162+
expected_line = []
163+
num_as = 0
164+
165+
for i in range(len(infiles)):
166+
# read next row from this input file
167+
line = infiles[i].readline().strip()
168+
169+
if line.startswith("#") or line.startswith("CHROM"):
170+
# skip comment lines and header line
171+
is_comment = True
172+
elif line:
173+
if is_finished:
174+
raise IOError("All input files should have same number of lines. "
175+
"LINE %d is present in file %s, but not in all input files\n"
176+
% (line_num, infiles[i].name))
177+
if is_comment:
178+
raise IOError("Comment and header lines should be consistent accross "
179+
"all input files. LINE %d is comment or header line in some input files "
180+
"but not in file %s" % (line_num, infiles[i].name))
181+
182+
# parse test SNP and associated info from input file row
183+
new_snp = parse_test_snp(line.split(), shuffle=shuffle)
184+
if new_snp.is_het():
185+
num_as += np.sum(new_snp.AS_target_ref) + \
186+
np.sum(new_snp.AS_target_alt)
187+
188+
count_line.append(new_snp.counts)
189+
expected_line.append(new_snp.totals)
190+
191+
else:
192+
# out of lines from at least one file, assume we are finished
193+
is_finished = True
194+
195+
if not is_finished and not is_comment:
196+
if skip_num < skip:
197+
# skip this row
198+
skip_num += 1
199+
else:
200+
if(sum(count_line) >= min_counts and num_as >= min_as_counts):
201+
# this line exceeded minimum number of read counts and AS counts
202+
count_matrix.append(count_line)
203+
expected_matrix.append(expected_line)
204+
skip_num = 0
205+
206+
count_matrix = np.array(count_matrix, dtype=int)
207+
expected_matrix = np.array(expected_matrix, dtype=np.float64)
208+
209+
sys.stderr.write("count_matrix dimension: %s\n" % str(count_matrix.shape))
210+
sys.stderr.write("expect_matrix dimension: %s\n" % str(expected_matrix.shape))
211+
212+
nrow = count_matrix.shape[0]
213+
if (options.sample > 0) and (options.sample < count_matrix.shape):
214+
# randomly sample subset of rows without replacement
215+
sys.stderr.write("randomly sampling %d target regions\n" % options.sample)
216+
samp_index = np.arange(nrow)
217+
np.random.shuffle(samp_index)
218+
samp_index = samp_index[:options.sample]
219+
count_matrix = count_matrix[samp_index,]
220+
expected_matrix = expected_matrix[samp_index,]
221+
222+
sys.stderr.write("new count_matrix dimension: %s\n" % str(count_matrix.shape))
223+
sys.stderr.write("new expect_matrix dimension: %s\n" % str(expected_matrix.shape))
224+
225+
return count_matrix, expected_matrix

0 commit comments

Comments
 (0)