Skip to content

Commit 6ee68a1

Browse files
Merge pull request #466 from jeromekelleher/ingest-raw-mafft
Support MAFFT input alignments
2 parents bb65900 + b3075a7 commit 6ee68a1

File tree

5 files changed

+49
-7
lines changed

5 files changed

+49
-7
lines changed

sc2ts/core.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ def __init__(self, path, add_zero_base=True):
107107
def __getitem__(self, key):
108108
x = self.reader[key]
109109
h = np.array(x).astype(str)
110+
h = np.char.upper(h)
110111
if self.add_zero_base:
111112
return np.append(["X"], h)
112113
return h

sc2ts/dataset.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,7 @@ class Variant:
178178

179179
class Dataset(collections.abc.Mapping):
180180

181-
def __init__(self, path, chunk_cache_size=1, date_field="date"):
181+
def __init__(self, path, chunk_cache_size=1, date_field="date", skip_metadata=False):
182182
logger.info(f"Loading dateset @{path} using {date_field} as date field")
183183
self.date_field = date_field
184184
self.path = pathlib.Path(path)
@@ -196,9 +196,13 @@ def __init__(self, path, chunk_cache_size=1, date_field="date"):
196196
self.haplotypes = CachedHaplotypeMapping(
197197
self.root, self.sample_id_map, chunk_cache_size
198198
)
199-
self.metadata = CachedMetadataMapping(
200-
self.root, self.sample_id_map, date_field, chunk_cache_size=chunk_cache_size
201-
)
199+
if not skip_metadata:
200+
self.metadata = CachedMetadataMapping(
201+
self.root,
202+
self.sample_id_map,
203+
date_field,
204+
chunk_cache_size=chunk_cache_size,
205+
)
202206

203207
def __getitem__(self, key):
204208
return self.root[key]

tests/conftest.py

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,17 @@ def fx_alignments_fasta(fx_data_cache):
2929
return cache_path
3030

3131

32+
@pytest.fixture
33+
def fx_alignments_mafft_fasta(fx_data_cache):
34+
# This is bgzipped so we can access directly
35+
cache_path = fx_data_cache / "alignments-mafft.fasta"
36+
if not cache_path.exists():
37+
with gzip.open("tests/data/alignments-mafft.fasta.gz") as src:
38+
with open(cache_path, "wb") as dest:
39+
shutil.copyfileobj(src, dest)
40+
return cache_path
41+
42+
3243
def encoded_alignments(path):
3344
fr = sc2ts.FastaReader(path)
3445
alignments = {}
@@ -37,6 +48,11 @@ def encoded_alignments(path):
3748
return alignments
3849

3950

51+
@pytest.fixture
52+
def fx_encoded_alignments_mafft(fx_alignments_mafft_fasta):
53+
return encoded_alignments(fx_alignments_mafft_fasta)
54+
55+
4056
@pytest.fixture
4157
def fx_encoded_alignments(fx_alignments_fasta):
4258
return encoded_alignments(fx_alignments_fasta)
@@ -258,7 +274,9 @@ def recombinant_example_2(tmp_path, fx_ts_map, fx_dataset, ds_path):
258274
date = "2020-03-02"
259275
left = start + 3 + 1
260276
right = end - 3 + 1
261-
ds = sc2ts.tmp_dataset(tmp_path / "tmp.zarr", {f"recombinant_{left}:{right}": a}, date=date)
277+
ds = sc2ts.tmp_dataset(
278+
tmp_path / "tmp.zarr", {f"recombinant_{left}:{right}": a}, date=date
279+
)
262280
rts = sc2ts.extend(
263281
dataset=ds.path,
264282
base_ts=ts_path,
@@ -267,6 +285,7 @@ def recombinant_example_2(tmp_path, fx_ts_map, fx_dataset, ds_path):
267285
)
268286
return rts
269287

288+
270289
def recombinant_example_3(tmp_path, fx_ts_map, fx_dataset, ds_path):
271290
# Pick a distinct strain to be the root of our three new haplotypes added
272291
# on the first day.
@@ -286,7 +305,7 @@ def recombinant_example_3(tmp_path, fx_ts_map, fx_dataset, ds_path):
286305
mid_a = a.copy()
287306
mid_start = 15_000
288307
mid_end = 15_009
289-
mid_a[mid_start: mid_end] = 1 # "C"
308+
mid_a[mid_start:mid_end] = 1 # "C"
290309

291310
a = mid_a.copy()
292311
a[start : start + 3] = left_a[start : start + 3]
@@ -315,7 +334,7 @@ def recombinant_example_3(tmp_path, fx_ts_map, fx_dataset, ds_path):
315334
mut = ts.mutation(mut_id)
316335
assert mut.derived_state == "G"
317336
assert ts.sites_position[mut.site] == start + j + 1
318-
337+
319338
for j, mut_id in enumerate(np.where(ts.mutations_node == mid_node)[0]):
320339
mut = ts.mutation(mut_id)
321340
assert mut.derived_state == "C"
@@ -345,6 +364,7 @@ def recombinant_example_3(tmp_path, fx_ts_map, fx_dataset, ds_path):
345364
assert rts.num_samples == ts.num_samples + 1
346365
return rts
347366

367+
348368
@pytest.fixture
349369
def fx_recombinant_example_1(tmp_path, fx_data_cache, fx_ts_map, fx_dataset):
350370
cache_path = fx_data_cache / "recombinant_ex1.ts"
@@ -366,6 +386,7 @@ def fx_recombinant_example_2(tmp_path, fx_data_cache, fx_ts_map, fx_dataset):
366386
ts.dump(cache_path)
367387
return tskit.load(cache_path)
368388

389+
369390
@pytest.fixture
370391
def fx_recombinant_example_3(tmp_path, fx_data_cache, fx_ts_map, fx_dataset):
371392
cache_path = fx_data_cache / "recombinant_ex3.ts"
100 KB
Binary file not shown.

tests/test_dataset.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,22 @@ def test_date_field(self, fx_dataset):
277277
assert len(diffs) == 6
278278

279279

280+
class TestMafftAlignments:
281+
282+
def test_import(self, tmp_path, fx_encoded_alignments_mafft):
283+
path = tmp_path / "dataset.vcz"
284+
sc2ts.Dataset.new(path)
285+
sc2ts.Dataset.append_alignments(path, fx_encoded_alignments_mafft)
286+
ds = sc2ts.Dataset(path, skip_metadata=True)
287+
assert len(ds.haplotypes) == 19
288+
for k, v in fx_encoded_alignments_mafft.items():
289+
h = ds.haplotypes[k]
290+
nt.assert_array_equal(v, h)
291+
# The flanks are marked as deletions
292+
assert h[0] == 4
293+
assert h[-1] == 4
294+
295+
280296
class TestDatasetAlignments:
281297

282298
def test_fetch_known(self, fx_dataset):

0 commit comments

Comments
 (0)