|
import gzip
import os
import random
import sys

import numpy as np
class TestSNP:
    """Holds one test SNP: its phased genotype plus the read-count and
    allele-specific information associated with its target region."""

    def __init__(self, name, geno_hap1, geno_hap2, AS_target_ref, AS_target_alt,
                 hetps, totals, counts):
        # identifier of the test SNP
        self.name = name
        # phased genotype: allele on haplotype 1 and haplotype 2 (0=ref, 1=alt)
        self.geno_hap1 = geno_hap1
        self.geno_hap2 = geno_hap2
        # per-target-SNP allele-specific read counts (ref / alt alleles)
        self.AS_target_ref = AS_target_ref
        self.AS_target_alt = AS_target_alt
        # per-target-SNP heterozygote probabilities
        self.hetps = hetps
        # expected and observed read counts for the region
        self.totals = totals
        self.counts = counts

    def is_het(self):
        """Return True if the test SNP is heterozygous."""
        hap1, hap2 = self.geno_hap1, self.geno_hap2
        return hap1 != hap2

    def is_homo_ref(self):
        """Return True if the test SNP is homozygous for the reference allele."""
        return (self.geno_hap1, self.geno_hap2) == (0, 0)

    def is_homo_alt(self):
        """Return True if the test SNP is homozygous for the alternate allele."""
        return (self.geno_hap1, self.geno_hap2) == (1, 1)
| 30 | + |
| 31 | + |
| 32 | +dup_snp_warn = True |
| 33 | + |
| 34 | + |
def parse_test_snp(snpinfo, shuffle=False):
    """Parse one whitespace-split input row into a TestSNP.

    Parameters
    ----------
    snpinfo : list of str
        Fields from one input line. Indices used here: 2=SNP id,
        6=phased genotype "a|b", 9=target SNP positions, 10=het
        probabilities, 11=linkage probabilities, 12/13=ref/alt
        allele-specific counts, 15=region read count, 16=expected total.
        "NA" marks missing values.
    shuffle : bool
        If True, randomly flip ref/alt allele-specific counts at each
        linked target SNP (used to build a permuted null).
        BUG FIX: requires the `random` module, which was previously never
        imported (NameError when shuffle=True).

    Returns
    -------
    TestSNP
    """
    global dup_snp_warn
    snp_id = snpinfo[2]

    # expected total read count for the region ("NA" -> 0)
    tot = 0 if snpinfo[16] == "NA" else float(snpinfo[16])

    # phased genotype of the test SNP, e.g. "0|1"; missing -> treat as 0|0
    if snpinfo[6] == "NA":
        geno_hap1 = 0
        geno_hap2 = 0
    else:
        haps = snpinfo[6].strip().split("|")
        geno_hap1 = int(haps[0])
        geno_hap2 = int(haps[1])

    # observed read count for the region ("NA" -> 0)
    count = 0 if snpinfo[15] == "NA" else int(snpinfo[15])

    if snpinfo[9].strip() == "NA" or geno_hap1 == geno_hap2:
        # SNP is homozygous, so there is no allele-specific info
        return TestSNP(snp_id, geno_hap1, geno_hap2, [], [], [], tot, count)

    # positions of linked 'target' SNPs
    snp_locs = np.array([int(y.strip()) for y in snpinfo[9].split(';')])

    # counts of reads matching the reference / alternate allele at each
    # target SNP
    snp_as_ref = np.array([int(y) for y in snpinfo[12].split(';')])
    snp_as_alt = np.array([int(y) for y in snpinfo[13].split(';')])

    # heterozygote probabilities
    snp_hetps = np.array([np.float64(y.strip())
                          for y in snpinfo[10].split(';')])

    # linkage probabilities, not currently used
    snp_linkageps = np.array([np.float64(y.strip())
                              for y in snpinfo[11].split(';')])

    # The same SNP should not be provided multiple times on one line; that
    # can create problems with the combined test. Warn once and keep only
    # the first occurrence of each position.
    uniq_loc, uniq_idx = np.unique(snp_locs, return_index=True)

    if dup_snp_warn and uniq_loc.shape[0] != snp_locs.shape[0]:
        sys.stderr.write("WARNING: discarding SNPs that are repeated "
                         "multiple times in same line\n")
        # only warn once
        dup_snp_warn = False

    snp_as_ref = snp_as_ref[uniq_idx]
    snp_as_alt = snp_as_alt[uniq_idx]
    snp_hetps = snp_hetps[uniq_idx]

    # linkage probabilities currently not used
    snp_linkageps = snp_linkageps[uniq_idx]

    if shuffle:
        # permute allele-specific read counts by flipping them randomly
        # at each target SNP
        for y in range(len(snp_as_ref)):
            if random.randint(0, 1) == 1:
                snp_as_ref[y], snp_as_alt[y] = snp_as_alt[y], snp_as_ref[y]

    return TestSNP(snp_id, geno_hap1, geno_hap2, snp_as_ref,
                   snp_as_alt, snp_hetps, tot, count)
| 101 | + |
| 102 | + |
| 103 | + |
| 104 | + |
| 105 | + |
def open_input_files(in_filename):
    """Open every input file listed (one path per line) in in_filename.

    The first (header) line of each listed file is consumed so callers
    start at the first data row. Files ending in ".gz" are opened with
    gzip in text mode.

    Returns a list of open file objects.

    Raises IOError if in_filename is missing; exits with status 2 if any
    listed file is missing or the list is empty.
    """
    if not os.path.exists(in_filename) or not os.path.isfile(in_filename):
        raise IOError("input file %s does not exist or is not a "
                      "regular file\n" % in_filename)

    infiles = []

    # read file that contains list of input files; close it when done
    with open(in_filename) as in_file:
        for line in in_file:
            # open each input file and read its first line
            filename = line.rstrip()
            sys.stderr.write(" " + filename + "\n")
            if (not filename) or (not os.path.exists(filename)) or \
               (not os.path.isfile(filename)):
                # BUG FIX: report the offending filename, not the
                # list-file object (previously formatted `in_file`)
                sys.stderr.write("input file '%s' does not exist or is not a "
                                 "regular file\n" % filename)
                exit(2)
            if filename.endswith(".gz"):
                f = gzip.open(filename, "rt")
            else:
                f = open(filename)

            # skip header
            f.readline()

            infiles.append(f)

    if len(infiles) == 0:
        sys.stderr.write("no input files specified in file '%s'\n" % in_filename)
        exit(2)

    return infiles
| 140 | + |
| 141 | + |
| 142 | + |
| 143 | + |
| 144 | + |
| 145 | + |
def read_count_matrices(input_filename, shuffle=False, skip=0, min_counts=0,
                        min_as_counts=0, sample=0):
    """Given an input file that contains paths to input files for all
    individuals, return a matrix of observed read counts and a matrix of
    expected read counts (rows = target regions, columns = individuals).

    Parameters
    ----------
    input_filename : str
        Path to a file listing one per-individual input file per line.
    shuffle : bool
        Randomly flip ref/alt allele-specific counts (permutation null).
    skip : int
        Keep only every (skip+1)-th usable row.
    min_counts : int
        Minimum summed read count across individuals to keep a row.
    min_as_counts : int
        Minimum total allele-specific count to keep a row.
    sample : int
        If > 0 and smaller than the number of rows, randomly subsample
        this many rows without replacement.
        BUG FIX: previously read from an undefined global `options`
        (NameError), and compared the sample size against the shape
        tuple instead of the row count (TypeError).

    Returns
    -------
    (count_matrix, expected_matrix) : (np.ndarray, np.ndarray)
    """
    infiles = open_input_files(input_filename)

    is_finished = False
    count_matrix = []
    expected_matrix = []
    line_num = 0
    skip_num = 0

    while not is_finished:
        is_comment = False
        line_num += 1
        count_line = []
        expected_line = []
        num_as = 0

        for i in range(len(infiles)):
            # read next row from this input file
            line = infiles[i].readline().strip()

            if line.startswith("#") or line.startswith("CHROM"):
                # skip comment lines and header line
                is_comment = True
            elif line:
                if is_finished:
                    raise IOError("All input files should have same number of lines. "
                                  "LINE %d is present in file %s, but not in all input files\n"
                                  % (line_num, infiles[i].name))
                if is_comment:
                    raise IOError("Comment and header lines should be consistent accross "
                                  "all input files. LINE %d is comment or header line in some input files "
                                  "but not in file %s" % (line_num, infiles[i].name))

                # parse test SNP and associated info from input file row
                new_snp = parse_test_snp(line.split(), shuffle=shuffle)
                if new_snp.is_het():
                    num_as += np.sum(new_snp.AS_target_ref) + \
                        np.sum(new_snp.AS_target_alt)

                count_line.append(new_snp.counts)
                expected_line.append(new_snp.totals)

            else:
                # out of lines from at least one file, assume we are finished
                is_finished = True

        if not is_finished and not is_comment:
            if skip_num < skip:
                # skip this row
                skip_num += 1
            else:
                if sum(count_line) >= min_counts and num_as >= min_as_counts:
                    # this line exceeded minimum number of read counts
                    # and AS counts
                    count_matrix.append(count_line)
                    expected_matrix.append(expected_line)
                skip_num = 0

    # close input files now that all rows are consumed (previously leaked)
    for f in infiles:
        f.close()

    count_matrix = np.array(count_matrix, dtype=int)
    expected_matrix = np.array(expected_matrix, dtype=np.float64)

    sys.stderr.write("count_matrix dimension: %s\n" % str(count_matrix.shape))
    sys.stderr.write("expect_matrix dimension: %s\n" % str(expected_matrix.shape))

    nrow = count_matrix.shape[0]
    if 0 < sample < nrow:
        # randomly sample subset of rows without replacement
        sys.stderr.write("randomly sampling %d target regions\n" % sample)
        samp_index = np.arange(nrow)
        np.random.shuffle(samp_index)
        samp_index = samp_index[:sample]
        count_matrix = count_matrix[samp_index, ]
        expected_matrix = expected_matrix[samp_index, ]

        sys.stderr.write("new count_matrix dimension: %s\n" % str(count_matrix.shape))
        sys.stderr.write("new expect_matrix dimension: %s\n" % str(expected_matrix.shape))

    return count_matrix, expected_matrix
0 commit comments