@@ -1,15 +1,6 @@
 import dataclasses
-import json
-import pathlib
-import collections.abc
-import csv
 
 import tskit
-import numba
-import pyfaidx
-import numpy as np
-
-from . import jit
 
 __version__ = "undefined"
 try:
@@ -26,6 +17,13 @@
 REFERENCE_GENBANK = "MN908947"
 REFERENCE_SEQUENCE_LENGTH = 29904
 
+# We omit N here as it's mapped to -1. Make "-" the 5th allele,
+# as this is a valid allele for us.
+# NOTE!! This string is also used in the jit module, where it's
+# hard-coded into a numba function, so if this ever changes it
+# also needs to be updated there.
+IUPAC_ALLELES = "ACGT-RYSWKMBDHV."
+
 NODE_IS_MUTATION_OVERLAP = 1 << 21
 NODE_IS_REVERSION_PUSH = 1 << 22
 NODE_IS_RECOMBINANT = 1 << 23
@@ -97,163 +95,3 @@ def decode_flags(f):
 
 def flags_summary(f):
     return "".join([v.short if (v.value & f) > 0 else "_" for v in flag_values])
-
-
-class FastaReader(collections.abc.Mapping):
-    def __init__(self, path, add_zero_base=True):
-        self.reader = pyfaidx.Fasta(str(path))
-        self._keys = list(self.reader.keys())
-        self.add_zero_base = add_zero_base
-
-    def __getitem__(self, key):
-        x = self.reader[key]
-        h = np.array(x).astype(str)
-        h = np.char.upper(h)
-        if self.add_zero_base:
-            return np.append(["X"], h)
-        return h
-
-    def __iter__(self):
-        return iter(self._keys)
-
-    def __len__(self):
-        return len(self._keys)
-
-
-data_path = pathlib.Path(__file__).parent / "data"
-
-
-def get_problematic_regions():
-    """
-    These regions have been reported to have highly recurrent or unusual
-    patterns of deletions.
-
-    https://github.com/jeromekelleher/sc2ts/issues/231#issuecomment-2401405355
-
-    Region: NTD domain
-    Coords: [21602-22472)
-    Multiple highly recurrent deleted regions in NTD domain in Spike
-    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7971772/
-
-    Region: ORF8
-    https://virological.org/t/repeated-loss-of-orf8-expression-in-circulating-sars-cov-2-lineages/931/1
-
-    The 1-based (half-open) coordinates were taken from the UCSC Genome Browser.
-    """
-    orf8 = get_gene_coordinates()["ORF8"]
-    return np.concatenate(
-        [
-            np.arange(21602, 22472, dtype=np.int64),  # NTD domain in S
-            np.arange(*orf8, dtype=np.int64),
-        ]
-    )
-
-
-def get_flank_coordinates():
-    """
-    Return the coordinates at either end of the genome for masking out.
-    """
-    genes = get_gene_coordinates()
-    start = genes["ORF1ab"][0]
-    end = genes["ORF10"][1]
-    return np.concatenate(
-        (np.arange(1, start), np.arange(end, REFERENCE_SEQUENCE_LENGTH))
-    )
-
-
-def get_masked_sites(ts):
-    """
-    Return the set of sites not used in the sequence.
-    """
-    unused = np.ones(int(ts.sequence_length), dtype=bool)
-    unused[ts.sites_position.astype(int)] = False
-    unused[0] = False
-    return np.where(unused)[0]
-
-
-@dataclasses.dataclass
-class CovLineage:
-    name: str
-    earliest_date: str
-    latest_date: str
-    description: str
-
-
-def get_cov_lineages_data():
-    with open(data_path / "lineages.json") as f:
-        data = json.load(f)
-    ret = {}
-    for record in data:
-        lineage = CovLineage(
-            record["Lineage"],
-            record["Earliest date"],
-            record["Latest date"],
-            record["Description"],
-        )
-        assert lineage.name not in ret
-        ret[lineage.name] = lineage
-    return ret
-
-
-__cached_reference = None
-
-
-def get_reference_sequence(as_array=False):
-    global __cached_reference
-    if __cached_reference is None:
-        reader = pyfaidx.Fasta(str(data_path / "reference.fasta"))
-        __cached_reference = reader[REFERENCE_GENBANK]
-    if as_array:
-        h = np.array(__cached_reference).astype(str)
-        return np.append(["X"], h)
-    else:
-        return "X" + str(__cached_reference)
-
-
-__cached_genes = None
-
-
-def get_gene_coordinates():
-    """
-    Returns a map of gene name to interval, (start, stop). These are
-    half-open, left-inclusive, right-exclusive.
-    """
-    global __cached_genes
-    if __cached_genes is None:
-        d = {}
-        with open(data_path / "annotation.csv") as f:
-            reader = csv.DictReader(f, delimiter=",")
-            for row in reader:
-                d[row["gene"]] = (int(row["start"]), int(row["end"]))
-        __cached_genes = d
-    return __cached_genes
-
-
-# We omit N here as it's mapped to -1. Make "-" the 5th allele
-# as this is a valid allele for us.
-IUPAC_ALLELES = "ACGT-RYSWKMBDHV."
-
-
-# FIXME make cache optional
-@numba.njit(cache=True)
-def encode_alignment(h):
-    # Just so numba knows this is a constant string
-    alleles = "ACGT-RYSWKMBDHV."
-    n = h.shape[0]
-    a = np.full(n, -1, dtype=np.int8)
-    for j in range(n):
-        if h[j] == "N":
-            a[j] = -1
-        else:
-            for k, c in enumerate(alleles):
-                if c == h[j]:
-                    break
-            else:
-                raise ValueError(f"Allele {h[j]} not recognised")
-            a[j] = k
-    return a
-
-
-def decode_alignment(a):
-    alleles = np.array(tuple(IUPAC_ALLELES + "N"), dtype=str)
-    return alleles[a]
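
For reference, the allele-index convention behind IUPAC_ALLELES works as follows: each character indexes into "ACGT-RYSWKMBDHV." (so "-" is allele index 4, the 5th allele), "N" is encoded as -1, and decoding appends "N" so that index -1 round-trips back to "N". The sketch below is a minimal pure-Python illustration assuming only numpy; it is not the numba-compiled version that the note above says lives in the jit module, and the *_sketch function names are illustrative only.

import numpy as np

IUPAC_ALLELES = "ACGT-RYSWKMBDHV."

def encode_alignment_sketch(h):
    # h is a 1D numpy array of single-character strings.
    a = np.full(h.shape[0], -1, dtype=np.int8)
    for j, c in enumerate(h):
        if c == "N":
            continue  # N stays encoded as -1
        a[j] = IUPAC_ALLELES.index(c)  # raises ValueError for unknown alleles
    return a

def decode_alignment_sketch(a):
    # Appending "N" means index -1 selects the last element, i.e. "N".
    alleles = np.array(tuple(IUPAC_ALLELES + "N"), dtype=str)
    return alleles[a]

# Example: "-" encodes to 4, and N round-trips via -1.
h = np.array(list("ACGTN-"))
a = encode_alignment_sketch(h)  # [0, 1, 2, 3, -1, 4]
assert "".join(decode_alignment_sketch(a)) == "ACGTN-"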