
Commit f011ea4

Switch haplotypes to alignment and document
1 parent 7c5adbe · commit f011ea4

6 files changed, +100 −27 lines changed

docs/alignments_analysis.md

Lines changed: 60 additions & 1 deletion
````diff
@@ -53,9 +53,68 @@ and {attr}`Dataset.num_variants` attributes.
 
 To get information on the metadata fields that are present, we can use
 
-
 ```{code-cell}
 ds.metadata.field_descriptors()
 ```
+:::{warning}
+The ``description`` column is currently empty because of a bug in the
+data ingest pipeline for the Viridian data. Later versions will include
+this information so that the dataset is self-describing.
+See [GitHub issue](https://github.com/tskit-dev/sc2ts/issues/579).
+:::
+
+
+
+## Accessing per-sample information
+
+The easiest way to get information about a single sample is through the
+``.metadata`` and ``.alignment`` interfaces. First, let's get
+the sample IDs for the first 10 samples:
+
+```{code-cell}
+ds.sample_id[:10]
+```
+Then, we can get the metadata for a given sample as a dictionary using
+the {attr}`Dataset.metadata` interface:
+
+```{code-cell}
+ds.metadata["SRR11597146"]
+```
+
+Similarly, we can get the integer encoded alignment for a sample using
+the {attr}`Dataset.alignment` interface:
+
+```{code-cell}
+ds.alignment["SRR11597146"]
+```
+
+:::{seealso}
+See the section {ref}`sec_alignments_analysis_data_encoding` for
+details on the integer encoding for alignment data used here.
+:::
+
+Both the ``.metadata`` and ``.alignment`` interfaces are **cached**
+(avoiding repeated decompression of the same underlying Zarr chunks)
+and support iteration, so they provide an efficient way of accessing
+data in bulk. For example, here we compute the mean number of
+gap ("-") characters per sample:
+
+```{code-cell}
+import numpy as np
+
+GAP = sc2ts.IUPAC_ALLELES.index("-")
+
+gap_count = np.zeros(ds.num_samples)
+for j, a in enumerate(ds.alignment.values()):
+    gap_count[j] = np.sum(a == GAP)
+np.mean(gap_count)
+```
+
+
+(sec_alignments_analysis_data_encoding)=
+
+## Alignment data encoding
+
+Stuff
 
 
````
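The new ``Alignment data encoding`` section above is still a stub ("Stuff"), but the surrounding examples already hint at how the encoding works: non-negative codes appear to index into ``sc2ts.IUPAC_ALLELES`` and ``-1`` marks missing data (the tests further down check exactly this). A minimal sketch along those lines, with a hypothetical store path:

```python
# Hedged sketch, not part of this commit: how the integer encoding appears
# to work, based on the names used in the docs and tests in this commit.
import sc2ts

ds = sc2ts.Dataset("dataset.vcz")        # hypothetical path

a = ds.alignment["SRR11597146"]          # integer encoded alignment
print(sc2ts.IUPAC_ALLELES)               # allele characters; code i appears to mean IUPAC_ALLELES[i]
print(sc2ts.IUPAC_ALLELES.index("-"))    # the code used for gap characters
print(sc2ts.decode_alignment(a)[:10])    # map integer codes back to characters
```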

sc2ts/dataset.py

Lines changed: 19 additions & 5 deletions
```diff
@@ -246,7 +246,7 @@ def __init__(self, path, chunk_cache_size=1, date_field=None):
 
         :param str path: Path to a directory or ``.zip`` Zarr store.
         :param int chunk_cache_size: Maximum number of chunks to cache for
-            haplotypes and metadata. Defaults to 1.
+            alignments and metadata. Defaults to 1.
         :param str date_field: Name of the metadata field to use as the
             sample date, or ``None`` to disable date handling. Defaults
             to ``None``.
@@ -265,10 +265,10 @@ def __init__(self, path, chunk_cache_size=1, date_field=None):
         self.sample_id_map = {
             sample_id: k for k, sample_id in enumerate(self._sample_id)
         }
-        self.haplotypes = CachedHaplotypeMapping(
+        self._alignment = CachedHaplotypeMapping(
             self.root, self.sample_id_map, chunk_cache_size
         )
-        self.metadata = CachedMetadataMapping(
+        self._metadata = CachedMetadataMapping(
             self.root,
             self.sample_id_map,
             date_field,
@@ -284,6 +284,20 @@ def __iter__(self):
     def __len__(self):
         return len(self.root)
 
+    @property
+    def alignment(self):
+        """
+        Efficient mapping of sample ID strings to integer encoded alignment data.
+        """
+        return self._alignment
+
+    @property
+    def metadata(self):
+        """
+        Efficient mapping of sample ID strings to metadata dictionaries.
+        """
+        return self._metadata
+
     @property
     def sample_id(self):
         """
@@ -387,7 +401,7 @@ def write_fasta(self, out, sample_id=None):
             sample_id = self.sample_id
 
         for sid in sample_id:
-            h = self.haplotypes[sid]
+            h = self.alignment[sid]
             a = decode_alignment(h)
             print(f">{sid}", file=out)
             # FIXME this is probably a terrible way to write a large numpy string to
@@ -416,7 +430,7 @@ def copy(
         alignments = {}
         bar = tqdm.tqdm(sample_id, desc="Samples", disable=not show_progress)
         for s in bar:
-            alignments[s] = self.haplotypes[s]
+            alignments[s] = self.alignment[s]
             if len(alignments) == samples_chunk_size:
                 Dataset.append_alignments(path, alignments)
                 alignments = {}
```
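The net effect of these changes is that the dict-like views are now exposed as read-only ``alignment`` and ``metadata`` properties backed by the private cached mappings. A minimal usage sketch (the store path is hypothetical; ``chunk_cache_size``, the property names, and ``write_fasta`` come from the hunks above):

```python
import sys
import sc2ts

# Hypothetical store path; chunk_cache_size is the constructor argument whose
# docstring this commit updates (chunks cached for alignments and metadata).
ds = sc2ts.Dataset("dataset.vcz.zip", chunk_cache_size=4)

a = ds.alignment["SRR11597146"]    # integer encoded alignment for one sample
md = ds.metadata["SRR11597146"]    # metadata dictionary for the same sample

# write_fasta() now reads through the renamed alignment mapping internally.
ds.write_fasta(sys.stdout, sample_id=["SRR11597146"])
```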

sc2ts/inference.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -489,7 +489,7 @@ def preprocess(
     samples = []
     bar = get_progress(strains, progress_title, "preprocess", show_progress)
     for strain in bar:
-        alignment = dataset.haplotypes[strain]
+        alignment = dataset.alignment[strain]
         alignment = _dataset.mask_flanking_deletions(alignment)
         sample = Sample(strain)
         # No padding zero site in the alignment
```
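The same access pattern, restated as a standalone sketch: fetch each strain's alignment through the renamed mapping and mask the flanking deletions before building a sample. This assumes an open ``Dataset`` and that ``mask_flanking_deletions`` lives in the ``sc2ts.dataset`` module (it is called via the ``_dataset`` alias in the hunk above):

```python
# Hedged sketch of the per-strain loop in preprocess(); assumes
# mask_flanking_deletions is importable from sc2ts.dataset.
from sc2ts import dataset as _dataset

def encoded_alignments(ds, strains):
    for strain in strains:
        a = ds.alignment[strain]                    # renamed mapping access
        a = _dataset.mask_flanking_deletions(a)     # mask the flanking deletions
        yield strain, a
```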

tests/sc2ts_fixtures.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -205,8 +205,8 @@ def recombinant_alignments(dataset):
     Generate some recombinant alignments from existing haplotypes
     """
     strains = ["SRR11597188", "SRR11597163"]
-    left_a = dataset.haplotypes[strains[0]]
-    right_a = dataset.haplotypes[strains[1]]
+    left_a = dataset.alignment[strains[0]]
+    right_a = dataset.alignment[strains[1]]
     # Recombine in the middle
     bp = 9_999
     h = left_a.copy()
@@ -243,7 +243,7 @@ def recombinant_example_2(tmp_path, fx_ts_map, fx_dataset, ds_path):
     # Pick a distinct strain to be the root of our two new haplotypes added
     # on the first day.
     root_strain = "SRR11597116"
-    a = fx_dataset.haplotypes[root_strain]
+    a = fx_dataset.alignment[root_strain]
     base_ts = fx_ts_map["2020-02-13"]
     # This sequence has a bunch of Ns at the start, so we have to go inwards
     # from them to make sure we're not masking them out.
@@ -310,7 +310,7 @@ def recombinant_example_3(tmp_path, fx_ts_map, fx_dataset, ds_path):
     # Pick a distinct strain to be the root of our three new haplotypes added
     # on the first day.
     root_strain = "SRR11597116"
-    a = fx_dataset.haplotypes[root_strain]
+    a = fx_dataset.alignment[root_strain]
     base_ts = fx_ts_map["2020-02-13"]
     # This sequence has a bunch of Ns at the start, so we have to go inwards
     # from them to make sure we're not masking them out.
```
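The first fixture builds a synthetic recombinant from two real alignments, but only the setup lines appear in the hunk. The sketch below restates the idea and fills in the combination step, which is an assumption about what follows ``h = left_a.copy()``; it assumes an open ``Dataset`` bound to ``dataset``:

```python
# Hedged sketch: splice two sample alignments at a breakpoint, mirroring the
# fixture setup above. The final assignment is an assumption; the hunk only
# shows the lines up to h = left_a.copy().
left_a = dataset.alignment["SRR11597188"]
right_a = dataset.alignment["SRR11597163"]

bp = 9_999                  # breakpoint used by the fixture
h = left_a.copy()
h[bp:] = right_a[bp:]       # assumed: take the segment right of the breakpoint from the other sample
```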

tests/test_dataset.py

Lines changed: 13 additions & 13 deletions
```diff
@@ -167,8 +167,8 @@ def test_create_zip(self, tmp_path, fx_encoded_alignments, fx_metadata_df):
 
         ds1 = sc2ts.Dataset(path)
         ds2 = sc2ts.Dataset(zip_path)
-        alignments1 = dict(ds1.haplotypes)
-        alignments2 = dict(ds2.haplotypes)
+        alignments1 = dict(ds1.alignment)
+        alignments2 = dict(ds2.alignment)
         assert alignments1.keys() == alignments2.keys()
         for k in alignments1.keys():
             nt.assert_array_equal(alignments1[k], alignments2[k])
@@ -388,9 +388,9 @@ def test_import(self, tmp_path, fx_encoded_alignments_mafft):
         sc2ts.Dataset.new(path)
         sc2ts.Dataset.append_alignments(path, fx_encoded_alignments_mafft)
         ds = sc2ts.Dataset(path)
-        assert len(ds.haplotypes) == 19
+        assert len(ds.alignment) == 19
         for k, v in fx_encoded_alignments_mafft.items():
-            h = ds.haplotypes[k]
+            h = ds.alignment[k]
             nt.assert_array_equal(v, h)
             # The flanks are marked as deletions
             assert h[0] == 4
@@ -400,29 +400,29 @@
 class TestDatasetAlignments:
 
     def test_fetch_known(self, fx_dataset):
-        a = fx_dataset.haplotypes["SRR11772659"]
+        a = fx_dataset.alignment["SRR11772659"]
         assert a.shape == (sc2ts.REFERENCE_SEQUENCE_LENGTH - 1,)
         assert a[0] == -1
         assert a[-1] == -1
 
     def test_compare_fasta(self, fx_dataset, fx_alignments_fasta):
         fr = data_import.FastaReader(fx_alignments_fasta)
         for k, a1 in fr.items():
-            h = fx_dataset.haplotypes[k]
+            h = fx_dataset.alignment[k]
             a2 = sc2ts.decode_alignment(h)
             nt.assert_array_equal(a1[1:], a2)
 
     def test_len(self, fx_dataset):
-        assert len(fx_dataset.haplotypes) == 55
+        assert len(fx_dataset.alignment) == 55
 
     def test_keys(self, fx_dataset):
-        keys = list(fx_dataset.haplotypes.keys())
-        assert len(keys) == len(fx_dataset.haplotypes)
+        keys = list(fx_dataset.alignment.keys())
+        assert len(keys) == len(fx_dataset.alignment)
         assert "SRR11772659" in keys
 
     def test_in(self, fx_dataset):
-        assert "SRR11772659" in fx_dataset.haplotypes
-        assert "NOT_IN_STORE" not in fx_dataset.haplotypes
+        assert "SRR11772659" in fx_dataset.alignment
+        assert "NOT_IN_STORE" not in fx_dataset.alignment
 
     @pytest.mark.parametrize(
         ["chunk_size", "cache_size"],
@@ -445,7 +445,7 @@ def test_chunk_size_cache_size(
         sc2ts.Dataset.add_metadata(path, fx_metadata_df)
         ds = sc2ts.Dataset(path, chunk_cache_size=cache_size)
         for k, v in fx_encoded_alignments.items():
-            nt.assert_array_equal(v, ds.haplotypes[k])
+            nt.assert_array_equal(v, ds.alignment[k])
 
 
 class TestDatasetMetadata:
@@ -454,7 +454,7 @@ def test_len(self, fx_dataset):
         assert len(fx_dataset.metadata) == 55
 
     def test_keys(self, fx_dataset):
-        assert fx_dataset.metadata.keys() == fx_dataset.haplotypes.keys()
+        assert fx_dataset.metadata.keys() == fx_dataset.alignment.keys()
 
     def test_known(self, fx_dataset):
         d = fx_dataset.metadata["SRR11772659"]
```
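Taken together, these tests pin down the mapping protocol that both properties expose: membership by sample ID, ``len``, ``keys()``, and ``dict()`` conversion, with ``metadata`` and ``alignment`` sharing the same key set. A compact restatement, assuming an open dataset with a hypothetical path:

```python
# Hedged sketch of the dict-like behaviour exercised in the tests above.
import sc2ts

ds = sc2ts.Dataset("dataset.vcz")                   # hypothetical path

assert "SRR11772659" in ds.alignment                # membership by sample ID
assert len(ds.alignment) == len(ds.metadata)        # one entry per sample
assert ds.metadata.keys() == ds.alignment.keys()    # identical key sets
alignments = dict(ds.alignment)                     # materialise all alignments at once
```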

tests/test_inference.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -597,7 +597,7 @@ def test_2020_02_02_missing_sample(
         strain,
         num_missing,
     ):
-        a = fx_dataset.haplotypes[strain]
+        a = fx_dataset.alignment[strain]
         a = sc2ts.mask_ambiguous(a)
 
         missing_positions = np.where(a == -1)[0] + 1
@@ -986,7 +986,7 @@ def test_exact_match(self, tmp_path, fx_ts_map, fx_dataset):
         strains = ["SRR11597218", "ERR4204459"]
         fake_strains = ["fake" + s for s in strains]
         alignments = {
-            name: fx_dataset.haplotypes[s] for name, s in zip(fake_strains, strains)
+            name: fx_dataset.alignment[s] for name, s in zip(fake_strains, strains)
         }
         date = "2020-03-01"
         ds = sc2ts.dataset.tmp_dataset(tmp_path / "tmp.zarr", alignments, date=date)
@@ -1110,7 +1110,7 @@ def test_recombinant_example_2(self, fx_ts_map, fx_recombinant_example_2):
     def test_all_As(self, tmp_path, fx_ts_map, fx_dataset):
         # Same as the recombinant_example_1() function above
         # Just to get something that looks like an alignment easily
-        a = fx_dataset.haplotypes["SRR11597188"]
+        a = fx_dataset.alignment["SRR11597188"]
         a[1:] = 0
         alignments = {"crazytype": a}
         date = "2020-03-01"
```
