From 8095f40198216ebcedbdc82fe9effdbf21612193 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Thu, 9 Jan 2025 14:17:25 +0000 Subject: [PATCH 01/12] Basic initial local-alleles schema implementation --- bio2zarr/cli.py | 4 ++- bio2zarr/vcf2zarr/vcz.py | 60 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 61 insertions(+), 3 deletions(-) diff --git a/bio2zarr/cli.py b/bio2zarr/cli.py index e37bd766..6f3ff3a6 100644 --- a/bio2zarr/cli.py +++ b/bio2zarr/cli.py @@ -340,7 +340,8 @@ def inspect(path, verbose): @icf_path @variants_chunk_size @samples_chunk_size -def mkschema(icf_path, variants_chunk_size, samples_chunk_size): +@local_alleles +def mkschema(icf_path, variants_chunk_size, samples_chunk_size, local_alleles): """ Generate a schema for zarr encoding """ @@ -350,6 +351,7 @@ def mkschema(icf_path, variants_chunk_size, samples_chunk_size): stream, variants_chunk_size=variants_chunk_size, samples_chunk_size=samples_chunk_size, + local_alleles=local_alleles, ) diff --git a/bio2zarr/vcf2zarr/vcz.py b/bio2zarr/vcf2zarr/vcz.py index 1096bb4a..1293b579 100644 --- a/bio2zarr/vcf2zarr/vcz.py +++ b/bio2zarr/vcf2zarr/vcz.py @@ -182,6 +182,47 @@ def variant_chunk_nbytes(self): ZARR_SCHEMA_FORMAT_VERSION = "0.4" +def convert_local_allele_field_types(fields): + """ + Update the specified list of fields to include the LAA field, and to convert + any supported localisable fields to the L* counterpart. + + Note that we currently support only two ALT alleles per sample, and so the + dimensions of these fields are fixed by that requirement. Later versions may + use summry data storted in the ICF to make different choices, if information + about subsequent alleles (not in the actual genotype calls) should also be + stored. + """ + fields_by_name = {field.name: field for field in fields} + gt = fields_by_name["call_genotype"] + if gt.shape[-1] != 2: + raise ValueError("Local alleles only supported on diploid data") + shape = gt.shape[:-1] + chunks = gt.chunks[:-1] + + laa = ZarrArraySpec.new( + vcf_field=None, + name="call_LAA", + dtype="i1", + shape=gt.shape, + chunks=gt.chunks, + dimensions=gt.dimensions, # FIXME + description=( + "1-based indices into ALT, indicating which alleles" + " are relevant (local) for the current sample" + ), + ) + pl = fields_by_name.get("call_PL", None) + if pl is not None: + pl.name = "call_LPL" + pl.vcf_field = None + pl.shape = (*shape, 3) + pl.chunks = (*chunks, 3) + pl.description += " (local-alleles)" + # TODO fix dimensions + return [*fields, laa] + + @dataclasses.dataclass class VcfZarrSchema(core.JsonDataclass): format_version: str @@ -232,13 +273,17 @@ def fromjson(s): return VcfZarrSchema.fromdict(json.loads(s)) @staticmethod - def generate(icf, variants_chunk_size=None, samples_chunk_size=None): + def generate( + icf, variants_chunk_size=None, samples_chunk_size=None, local_alleles=None + ): m = icf.num_records n = icf.num_samples if samples_chunk_size is None: samples_chunk_size = 10_000 if variants_chunk_size is None: variants_chunk_size = 1000 + if local_alleles is None: + local_alleles = False logger.info( f"Generating schema with chunks={variants_chunk_size, samples_chunk_size}" ) @@ -365,6 +410,9 @@ def fixed_field_spec( ) ) + if local_alleles: + array_specs = convert_local_allele_field_types(array_specs) + return VcfZarrSchema( format_version=ZARR_SCHEMA_FORMAT_VERSION, samples_chunk_size=samples_chunk_size, @@ -1035,12 +1083,20 @@ def encode_all_partitions( pwm.submit(self.encode_partition, partition_index) -def mkschema(if_path, out, *, variants_chunk_size=None, samples_chunk_size=None): +def mkschema( + if_path, + out, + *, + variants_chunk_size=None, + samples_chunk_size=None, + local_alleles=None, +): store = icf.IntermediateColumnarFormat(if_path) spec = VcfZarrSchema.generate( store, variants_chunk_size=variants_chunk_size, samples_chunk_size=samples_chunk_size, + local_alleles=local_alleles, ) out.write(spec.asjson()) From d347e3762e78a88901c0d0925218f078372f659e Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Thu, 9 Jan 2025 15:41:55 +0000 Subject: [PATCH 02/12] Initial pass at computing LAA values --- bio2zarr/vcf2zarr/vcz.py | 75 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/bio2zarr/vcf2zarr/vcz.py b/bio2zarr/vcf2zarr/vcz.py index 1293b579..240da820 100644 --- a/bio2zarr/vcf2zarr/vcz.py +++ b/bio2zarr/vcf2zarr/vcz.py @@ -197,6 +197,8 @@ def convert_local_allele_field_types(fields): gt = fields_by_name["call_genotype"] if gt.shape[-1] != 2: raise ValueError("Local alleles only supported on diploid data") + # TODO check if LAA is already in here + shape = gt.shape[:-1] chunks = gt.chunks[:-1] @@ -214,6 +216,7 @@ def convert_local_allele_field_types(fields): ) pl = fields_by_name.get("call_PL", None) if pl is not None: + # TODO check if call_LPL is in the list already pl.name = "call_LPL" pl.vcf_field = None pl.shape = (*shape, 3) @@ -510,6 +513,46 @@ def fromdict(d): return ret +def compute_laa_field(genotypes, alleles) -> np.ndarray: + """ + Computes the value of the LAA field for each sample given the genotypes + for a variant. + + The LAA field is a list of one-based indices into the ALT alleles + that indicates which alternate alleles are observed in the sample. + """ + alt_allele_count = len(alleles) - 1 + allele_counts = np.zeros((genotypes.shape[0], len(alleles)), dtype=int) + + genotypes = genotypes.clip(0, None) + genotype_allele_counts = np.apply_along_axis( + np.bincount, axis=1, arr=genotypes, minlength=len(alleles) + ) + allele_counts += genotype_allele_counts + + allele_counts[:, 0] = 0 # We don't count the reference allele + max_row_length = 1 + + def nonzero_pad(arr: np.ndarray, *, length: int): + nonlocal max_row_length + alleles = arr.nonzero()[0] + max_row_length = max(max_row_length, len(alleles)) + pad_length = length - len(alleles) + return np.pad( + alleles, + (0, pad_length), + mode="constant", + constant_values=constants.INT_FILL, + ) + + alleles = np.apply_along_axis( + nonzero_pad, axis=1, arr=allele_counts, length=max(1, alt_allele_count) + ) + alleles = alleles[:, :max_row_length] + + return alleles + + @dataclasses.dataclass class VcfZarrWriteSummary(core.JsonDataclass): num_partitions: int @@ -542,6 +585,12 @@ def has_genotypes(self): return True return False + def has_local_alleles(self): + for field in self.schema.fields: + if field.name == "call_LAA" and field.vcf_field is None: + return True + return False + ####################### # init ####################### @@ -734,6 +783,8 @@ def encode_partition(self, partition_index): self.encode_array_partition(array_spec, partition_index) if self.has_genotypes(): self.encode_genotypes_partition(partition_index) + if self.has_local_alleles(): + self.encode_local_alleles_partition(partition_index) final_path = self.partition_path(partition_index) logger.info(f"Finalising {partition_index} at {final_path}") @@ -805,6 +856,30 @@ def encode_genotypes_partition(self, partition_index): self.finalise_partition_array(partition_index, "call_genotype_mask") self.finalise_partition_array(partition_index, "call_genotype_phased") + def encode_local_alleles_partition(self, partition_index): + call_LAA_array = self.init_partition_array(partition_index, "call_LAA") + partition = self.metadata.partitions[partition_index] + call_LAA = core.BufferedArray(call_LAA_array, partition.start) + + gt_array = zarr.open_array( + store=self.wip_partition_array_path(partition_index, "call_genotype"), + mode="r", + ) + alleles_array = zarr.open_array( + store=self.wip_partition_array_path(partition_index, "variant_allele"), + mode="r", + ) + for chunk_index in range(gt_array.cdata_shape[0]): + A = alleles_array.blocks[chunk_index] + G = gt_array.blocks[chunk_index] + for alleles, var in zip(A, G): + j = call_LAA.next_buffer_row() + # TODO we should probably compute LAAs by chunk for efficiency + call_LAA.buff[j] = compute_laa_field(var, alleles) + + call_LAA.flush() + self.finalise_partition_array(partition_index, "call_LAA") + def encode_alleles_partition(self, partition_index): array_name = "variant_allele" alleles_array = self.init_partition_array(partition_index, array_name) From 2b508adc4114bbbf8db0155f1da3433485331ca3 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Fri, 10 Jan 2025 12:03:23 +0000 Subject: [PATCH 03/12] Simple implementation of compute_laa and tests Fixup Close LAA loophole --- bio2zarr/vcf2zarr/vcz.py | 59 ++++++++++++------------------------- tests/test_local_alleles.py | 31 +++++++++++++++++++ 2 files changed, 50 insertions(+), 40 deletions(-) create mode 100644 tests/test_local_alleles.py diff --git a/bio2zarr/vcf2zarr/vcz.py b/bio2zarr/vcf2zarr/vcz.py index 240da820..32d79314 100644 --- a/bio2zarr/vcf2zarr/vcz.py +++ b/bio2zarr/vcf2zarr/vcz.py @@ -513,7 +513,7 @@ def fromdict(d): return ret -def compute_laa_field(genotypes, alleles) -> np.ndarray: +def compute_laa_field(genotypes) -> np.ndarray: """ Computes the value of the LAA field for each sample given the genotypes for a variant. @@ -521,36 +521,22 @@ def compute_laa_field(genotypes, alleles) -> np.ndarray: The LAA field is a list of one-based indices into the ALT alleles that indicates which alternate alleles are observed in the sample. """ - alt_allele_count = len(alleles) - 1 - allele_counts = np.zeros((genotypes.shape[0], len(alleles)), dtype=int) - - genotypes = genotypes.clip(0, None) - genotype_allele_counts = np.apply_along_axis( - np.bincount, axis=1, arr=genotypes, minlength=len(alleles) - ) - allele_counts += genotype_allele_counts - - allele_counts[:, 0] = 0 # We don't count the reference allele - max_row_length = 1 - - def nonzero_pad(arr: np.ndarray, *, length: int): - nonlocal max_row_length - alleles = arr.nonzero()[0] - max_row_length = max(max_row_length, len(alleles)) - pad_length = length - len(alleles) - return np.pad( - alleles, - (0, pad_length), - mode="constant", - constant_values=constants.INT_FILL, - ) - - alleles = np.apply_along_axis( - nonzero_pad, axis=1, arr=allele_counts, length=max(1, alt_allele_count) - ) - alleles = alleles[:, :max_row_length] - - return alleles + v = 2**31 - 1 + if np.any(genotypes >= v): + raise ValueError("Extreme allele value not supported") + G = genotypes.astype(np.int32) + if len(G) > 0: + # Anything <=0 gets mapped to -2 (pad) in the output, which comes last. + # So, to get this sorting correctly, we remap to the largest value for + # sorting, then map back. We promote the genotypes up to 32 bit for convenience + # here, assuming that we'll never have a allele of 2**31 - 1. + assert np.all(G != v) + G[G <= 0] = v + G.sort(axis=1) + # Equal non-zero values result in padding also + G[G[:, 0] == G[:, 1], 1] = -2 + G[G == v] = -2 + return G.astype(genotypes.dtype) @dataclasses.dataclass @@ -865,17 +851,10 @@ def encode_local_alleles_partition(self, partition_index): store=self.wip_partition_array_path(partition_index, "call_genotype"), mode="r", ) - alleles_array = zarr.open_array( - store=self.wip_partition_array_path(partition_index, "variant_allele"), - mode="r", - ) for chunk_index in range(gt_array.cdata_shape[0]): - A = alleles_array.blocks[chunk_index] - G = gt_array.blocks[chunk_index] - for alleles, var in zip(A, G): + for genotypes in gt_array.blocks[chunk_index]: j = call_LAA.next_buffer_row() - # TODO we should probably compute LAAs by chunk for efficiency - call_LAA.buff[j] = compute_laa_field(var, alleles) + call_LAA.buff[j] = compute_laa_field(genotypes) call_LAA.flush() self.finalise_partition_array(partition_index, "call_LAA") diff --git a/tests/test_local_alleles.py b/tests/test_local_alleles.py new file mode 100644 index 00000000..1c0bae4a --- /dev/null +++ b/tests/test_local_alleles.py @@ -0,0 +1,31 @@ +import numpy as np +import numpy.testing as nt +import pytest + +from bio2zarr.vcf2zarr.vcz import compute_laa_field + + +class TestComputeLAA: + @pytest.mark.parametrize( + ("genotypes", "expected"), + [ + ([], []), + ([[0, 0]], [[-2, -2]]), + ([[0, 0], [0, 0]], [[-2, -2], [-2, -2]]), + ([[1, 1], [0, 0]], [[1, -2], [-2, -2]]), + ([[0, 1], [3, 2], [3, 0]], [[1, -2], [2, 3], [3, -2]]), + ([[0, 0], [2, 3]], [[-2, -2], [2, 3]]), + ([[2, 3], [0, 0]], [[2, 3], [-2, -2]]), + ([[128, 0], [6, 5]], [[128, -2], [5, 6]]), + ([[0, -1], [-1, 5]], [[-2, -2], [5, -2]]), + ], + ) + def test_simple_examples(self, genotypes, expected): + G = np.array(genotypes) + result = compute_laa_field(G) + nt.assert_array_equal(result, expected) + + def test_extreme_value(self): + G = np.array([[0, 2**32 - 1]]) + with pytest.raises(ValueError, match="Extreme"): + compute_laa_field(G) From 35b713bbe1f410d8df4c4cfeb1914841d61382bc Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Wed, 15 Jan 2025 15:38:50 +0000 Subject: [PATCH 04/12] Disable PL for now to get tests working --- bio2zarr/vcf2zarr/vcz.py | 28 ++++++++++++++++------------ tests/test_vcf_examples.py | 8 ++++---- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/bio2zarr/vcf2zarr/vcz.py b/bio2zarr/vcf2zarr/vcz.py index 32d79314..8e2e6f3a 100644 --- a/bio2zarr/vcf2zarr/vcz.py +++ b/bio2zarr/vcf2zarr/vcz.py @@ -199,8 +199,8 @@ def convert_local_allele_field_types(fields): raise ValueError("Local alleles only supported on diploid data") # TODO check if LAA is already in here - shape = gt.shape[:-1] - chunks = gt.chunks[:-1] + # shape = gt.shape[:-1] + # chunks = gt.chunks[:-1] laa = ZarrArraySpec.new( vcf_field=None, @@ -214,15 +214,15 @@ def convert_local_allele_field_types(fields): " are relevant (local) for the current sample" ), ) - pl = fields_by_name.get("call_PL", None) - if pl is not None: - # TODO check if call_LPL is in the list already - pl.name = "call_LPL" - pl.vcf_field = None - pl.shape = (*shape, 3) - pl.chunks = (*chunks, 3) - pl.description += " (local-alleles)" - # TODO fix dimensions + # pl = fields_by_name.get("call_PL", None) + # if pl is not None: + # # TODO check if call_LPL is in the list already + # pl.name = "call_LPL" + # pl.vcf_field = None + # pl.shape = (*shape, 3) + # pl.chunks = (*chunks, 3) + # pl.description += " (local-alleles)" + # # TODO fix dimensions return [*fields, laa] @@ -1164,6 +1164,7 @@ def encode( max_variant_chunks=None, dimension_separator=None, max_memory=None, + local_alleles=None, worker_processes=1, show_progress=False, ): @@ -1176,6 +1177,7 @@ def encode( schema_path=schema_path, variants_chunk_size=variants_chunk_size, samples_chunk_size=samples_chunk_size, + local_alleles=local_alleles, max_variant_chunks=max_variant_chunks, dimension_separator=dimension_separator, ) @@ -1197,6 +1199,7 @@ def encode_init( schema_path=None, variants_chunk_size=None, samples_chunk_size=None, + local_alleles=None, max_variant_chunks=None, dimension_separator=None, max_memory=None, @@ -1209,6 +1212,7 @@ def encode_init( icf_store, variants_chunk_size=variants_chunk_size, samples_chunk_size=samples_chunk_size, + local_alleles=local_alleles, ) else: logger.info(f"Reading schema from {schema_path}") @@ -1261,7 +1265,6 @@ def convert( vcfs, worker_processes=worker_processes, show_progress=show_progress, - local_alleles=local_alleles, ) encode( icf_path, @@ -1270,6 +1273,7 @@ def convert( samples_chunk_size=samples_chunk_size, worker_processes=worker_processes, show_progress=show_progress, + local_alleles=local_alleles, ) diff --git a/tests/test_vcf_examples.py b/tests/test_vcf_examples.py index 1f1d0216..a2db845b 100644 --- a/tests/test_vcf_examples.py +++ b/tests/test_vcf_examples.py @@ -647,12 +647,12 @@ def test_call_AD(self, ds): def test_call_LAA(self, ds): # All the genotypes are 0/0 - call_LAA = np.full((23, 3, 1), -2) + call_LAA = np.full((23, 3, 2), -2) nt.assert_array_equal(ds.call_LAA.values, call_LAA) - def test_call_LPL(self, ds): - call_LPL = np.tile([0, -2, -2], (23, 3, 1)) - nt.assert_array_equal(ds.call_LPL.values, call_LPL) + # def test_call_LPL(self, ds): + # call_LPL = np.tile([0, -2, -2], (23, 3, 1)) + # nt.assert_array_equal(ds.call_LPL.values, call_LPL) def test_call_PID(self, ds): call_PGT = ds["call_PGT"].values From 9ace3955186267b1bc4aadc47bf591ad35012782 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Wed, 15 Jan 2025 16:24:11 +0000 Subject: [PATCH 05/12] First pass at LAD --- bio2zarr/vcf2zarr/vcz.py | 47 +++++++++++++++++++--- tests/test_local_alleles.py | 35 +++++++++++++++- tests/test_vcf_examples.py | 80 +++++++++++++++++++++++++------------ 3 files changed, 130 insertions(+), 32 deletions(-) diff --git a/bio2zarr/vcf2zarr/vcz.py b/bio2zarr/vcf2zarr/vcz.py index 8e2e6f3a..929af9c9 100644 --- a/bio2zarr/vcf2zarr/vcz.py +++ b/bio2zarr/vcf2zarr/vcz.py @@ -199,8 +199,8 @@ def convert_local_allele_field_types(fields): raise ValueError("Local alleles only supported on diploid data") # TODO check if LAA is already in here - # shape = gt.shape[:-1] - # chunks = gt.chunks[:-1] + shape = gt.shape[:-1] + chunks = gt.chunks[:-1] laa = ZarrArraySpec.new( vcf_field=None, @@ -214,6 +214,16 @@ def convert_local_allele_field_types(fields): " are relevant (local) for the current sample" ), ) + ad = fields_by_name.get("call_AD", None) + if ad is not None: + # TODO check if call_LAD is in the list already + ad.name = "call_LAD" + ad.vcf_field = None + ad.shape = (*shape, 2) + ad.chunks = (*chunks, 2) + ad.description += " (local-alleles)" + # TODO fix dimensions + # pl = fields_by_name.get("call_PL", None) # if pl is not None: # # TODO check if call_LPL is in the list already @@ -513,7 +523,7 @@ def fromdict(d): return ret -def compute_laa_field(genotypes) -> np.ndarray: +def compute_laa_field(genotypes): """ Computes the value of the LAA field for each sample given the genotypes for a variant. @@ -539,6 +549,19 @@ def compute_laa_field(genotypes) -> np.ndarray: return G.astype(genotypes.dtype) +def compute_lad_field(ad, laa): + lad = np.full((ad.shape[0], 2), -2, dtype=ad.dtype) + ref_ref = np.where((laa[:, 0] == -2) & (laa[:, 1] == -2))[0] + lad[ref_ref, 0] = ad[ref_ref, 0] + ref_alt = np.where((laa[:, 0] != -2) & (laa[:, 1] == -2))[0] + lad[ref_alt, 0] = ad[ref_alt, 0] + lad[ref_alt, 1] = ad[ref_alt, laa[ref_alt, 0]] + alt_alt = np.where((laa[:, 0] != -2) & (laa[:, 1] != -2))[0] + lad[alt_alt, 0] = ad[alt_alt, laa[alt_alt, 0]] + lad[alt_alt, 1] = ad[alt_alt, laa[alt_alt, 1]] + return lad + + @dataclasses.dataclass class VcfZarrWriteSummary(core.JsonDataclass): num_partitions: int @@ -843,21 +866,35 @@ def encode_genotypes_partition(self, partition_index): self.finalise_partition_array(partition_index, "call_genotype_phased") def encode_local_alleles_partition(self, partition_index): - call_LAA_array = self.init_partition_array(partition_index, "call_LAA") partition = self.metadata.partitions[partition_index] + call_LAA_array = self.init_partition_array(partition_index, "call_LAA") call_LAA = core.BufferedArray(call_LAA_array, partition.start) + call_LAD_array = self.init_partition_array(partition_index, "call_LAD") + call_LAD = core.BufferedArray(call_LAD_array, partition.start) + call_AD_source = self.icf.fields["FORMAT/AD"].iter_values( + partition.start, partition.stop + ) + gt_array = zarr.open_array( store=self.wip_partition_array_path(partition_index, "call_genotype"), mode="r", ) for chunk_index in range(gt_array.cdata_shape[0]): for genotypes in gt_array.blocks[chunk_index]: + laa = compute_laa_field(genotypes) j = call_LAA.next_buffer_row() - call_LAA.buff[j] = compute_laa_field(genotypes) + call_LAA.buff[j] = laa + + ad = next(call_AD_source) + j = call_LAD.next_buffer_row() + lad = compute_lad_field(ad, laa) + call_LAD.buff[j] = lad call_LAA.flush() self.finalise_partition_array(partition_index, "call_LAA") + call_LAD.flush() + self.finalise_partition_array(partition_index, "call_LAD") def encode_alleles_partition(self, partition_index): array_name = "variant_allele" diff --git a/tests/test_local_alleles.py b/tests/test_local_alleles.py index 1c0bae4a..5124f3f4 100644 --- a/tests/test_local_alleles.py +++ b/tests/test_local_alleles.py @@ -2,7 +2,7 @@ import numpy.testing as nt import pytest -from bio2zarr.vcf2zarr.vcz import compute_laa_field +from bio2zarr.vcf2zarr.vcz import compute_laa_field, compute_lad_field class TestComputeLAA: @@ -29,3 +29,36 @@ def test_extreme_value(self): G = np.array([[0, 2**32 - 1]]) with pytest.raises(ValueError, match="Extreme"): compute_laa_field(G) + + +class TestComputeLAD: + @pytest.mark.parametrize( + ("ad", "laa", "expected"), + [ + # 0/0 calls + ([[10, 0]], [[-2, -2]], [[10, -2]]), + ([[10, 0, 0], [11, 0, 0]], [[-2, -2], [-2, -2]], [[10, -2], [11, -2]]), + # 0/1 calls + ([[10, 11]], [[1, -2]], [[10, 11]]), + ([[10, 11], [12, 0]], [[1, -2], [-2, -2]], [[10, 11], [12, -2]]), + # 0/2 calls + ([[10, 0, 11]], [[2, -2]], [[10, 11]]), + ([[10, 0, 11], [10, 11, 0]], [[2, -2], [1, -2]], [[10, 11], [10, 11]]), + ( + [[10, 0, 11], [10, 11, 0], [12, 0, 0]], + [[2, -2], [1, -2], [-2, -2]], + [[10, 11], [10, 11], [12, -2]], + ), + # 1/2 calls + ([[0, 10, 11]], [[1, 2]], [[10, 11]]), + ([[0, 10, 11], [12, 0, 13]], [[1, 2], [2, -2]], [[10, 11], [12, 13]]), + ( + [[0, 10, 11], [12, 0, 13], [14, 0, 0]], + [[1, 2], [2, -2], [-2, -2]], + [[10, 11], [12, 13], [14, -2]], + ), + ], + ) + def test_simple_examples(self, ad, laa, expected): + result = compute_lad_field(np.array(ad), np.array(laa)) + nt.assert_array_equal(result, expected) diff --git a/tests/test_vcf_examples.py b/tests/test_vcf_examples.py index a2db845b..3f61d8de 100644 --- a/tests/test_vcf_examples.py +++ b/tests/test_vcf_examples.py @@ -617,33 +617,61 @@ def test_variant_MLEAC(self, ds): ) nt.assert_array_equal(ds.variant_MLEAC.values, MLEAC) - def test_call_AD(self, ds): - call_AD = [ - [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], - [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], - [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], - [[446, 0, 0, -2, -2], [393, 0, 0, -2, -2], [486, 0, 0, -2, -2]], - [[446, 0, 0, 0, -2], [393, 0, 0, 0, -2], [486, 0, 0, 0, -2]], - [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], - [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], - [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], - [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], - [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], - [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], - [[446, 0, 0, 0, 0], [393, 0, 0, 0, 0], [486, 0, 0, 0, 0]], - [[446, 0, 0, 0, 0], [393, 0, 0, 0, 0], [486, 0, 0, 0, 0]], - [[446, 0, 0, 0, 0], [393, 0, 0, 0, 0], [486, 0, 0, 0, 0]], - [[446, 0, 0, 0, -2], [393, 0, 0, 0, -2], [486, 0, 0, 0, -2]], - [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], - [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], - [[446, 0, 0, -2, -2], [393, 0, 0, -2, -2], [486, 0, 0, -2, -2]], - [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], - [[446, 0, 0, -2, -2], [393, 0, 0, -2, -2], [486, 0, 0, -2, -2]], - [[446, 0, 0, 0, -2], [393, 0, 0, 0, -2], [486, 0, 0, 0, -2]], - [[446, 0, 0, 0, 0], [393, 0, 0, 0, 0], [486, 0, 0, 0, 0]], - [[446, 0, 0, 0, -2], [393, 0, 0, 0, -2], [486, 0, 0, 0, -2]], + # def test_call_AD(self, ds): + # call_AD = [ + # [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], + # [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], + # [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], + # [[446, 0, 0, -2, -2], [393, 0, 0, -2, -2], [486, 0, 0, -2, -2]], + # [[446, 0, 0, 0, -2], [393, 0, 0, 0, -2], [486, 0, 0, 0, -2]], + # [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], + # [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], + # [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], + # [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], + # [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], + # [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], + # [[446, 0, 0, 0, 0], [393, 0, 0, 0, 0], [486, 0, 0, 0, 0]], + # [[446, 0, 0, 0, 0], [393, 0, 0, 0, 0], [486, 0, 0, 0, 0]], + # [[446, 0, 0, 0, 0], [393, 0, 0, 0, 0], [486, 0, 0, 0, 0]], + # [[446, 0, 0, 0, -2], [393, 0, 0, 0, -2], [486, 0, 0, 0, -2]], + # [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], + # [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], + # [[446, 0, 0, -2, -2], [393, 0, 0, -2, -2], [486, 0, 0, -2, -2]], + # [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], + # [[446, 0, 0, -2, -2], [393, 0, 0, -2, -2], [486, 0, 0, -2, -2]], + # [[446, 0, 0, 0, -2], [393, 0, 0, 0, -2], [486, 0, 0, 0, -2]], + # [[446, 0, 0, 0, 0], [393, 0, 0, 0, 0], [486, 0, 0, 0, 0]], + # [[446, 0, 0, 0, -2], [393, 0, 0, 0, -2], [486, 0, 0, 0, -2]], + # ] + # nt.assert_array_equal(ds.call_AD.values, call_AD) + + def test_call_LAD(self, ds): + call_LAD = [ + [[446, -2], [393, -2], [486, -2]], + [[446, -2], [393, -2], [486, -2]], + [[446, -2], [393, -2], [486, -2]], + [[446, -2], [393, -2], [486, -2]], + [[446, -2], [393, -2], [486, -2]], + [[446, -2], [393, -2], [486, -2]], + [[446, -2], [393, -2], [486, -2]], + [[446, -2], [393, -2], [486, -2]], + [[446, -2], [393, -2], [486, -2]], + [[446, -2], [393, -2], [486, -2]], + [[446, -2], [393, -2], [486, -2]], + [[446, -2], [393, -2], [486, -2]], + [[446, -2], [393, -2], [486, -2]], + [[446, -2], [393, -2], [486, -2]], + [[446, -2], [393, -2], [486, -2]], + [[446, -2], [393, -2], [486, -2]], + [[446, -2], [393, -2], [486, -2]], + [[446, -2], [393, -2], [486, -2]], + [[446, -2], [393, -2], [486, -2]], + [[446, -2], [393, -2], [486, -2]], + [[446, -2], [393, -2], [486, -2]], + [[446, -2], [393, -2], [486, -2]], + [[446, -2], [393, -2], [486, -2]], ] - nt.assert_array_equal(ds.call_AD.values, call_AD) + nt.assert_array_equal(ds.call_LAD.values, call_LAD) def test_call_LAA(self, ds): # All the genotypes are 0/0 From f80e1c2ae517708aa6897b4785acf91fa231f961 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Thu, 16 Jan 2025 12:25:52 +0000 Subject: [PATCH 06/12] Fixup genotypes iterator and call_LAA/call_LAD --- bio2zarr/core.py | 21 ++++++++++++++++++ bio2zarr/vcf2zarr/vcz.py | 46 +++++++++++++++++++++++----------------- tests/test_core.py | 35 ++++++++++++++++++++++++++++++ 3 files changed, 82 insertions(+), 20 deletions(-) diff --git a/bio2zarr/core.py b/bio2zarr/core.py index ac4fc293..af61d6d1 100644 --- a/bio2zarr/core.py +++ b/bio2zarr/core.py @@ -63,6 +63,27 @@ def chunk_aligned_slices(z, n, max_chunks=None): return slices +def first_dim_slice_iter(z, start, stop): + """ + Efficiently iterate over the specified slice of the first dimension of the zarr + array z. + """ + chunk_size = z.chunks[0] + first_chunk = start // chunk_size + last_chunk = (stop // chunk_size) + (stop % chunk_size != 0) + for chunk in range(first_chunk, last_chunk): + Z = z.blocks[chunk] + chunk_start = chunk * chunk_size + chunk_stop = chunk_start + chunk_size + slice_start = None + if start > chunk_start: + slice_start = start - chunk_start + slice_stop = None + if stop < chunk_stop: + slice_stop = stop - chunk_start + yield from Z[slice_start:slice_stop] + + def du(path): """ Return the total bytes stored at this path. diff --git a/bio2zarr/vcf2zarr/vcz.py b/bio2zarr/vcf2zarr/vcz.py index 929af9c9..06ba5297 100644 --- a/bio2zarr/vcf2zarr/vcz.py +++ b/bio2zarr/vcf2zarr/vcz.py @@ -550,15 +550,20 @@ def compute_laa_field(genotypes): def compute_lad_field(ad, laa): - lad = np.full((ad.shape[0], 2), -2, dtype=ad.dtype) - ref_ref = np.where((laa[:, 0] == -2) & (laa[:, 1] == -2))[0] - lad[ref_ref, 0] = ad[ref_ref, 0] - ref_alt = np.where((laa[:, 0] != -2) & (laa[:, 1] == -2))[0] - lad[ref_alt, 0] = ad[ref_alt, 0] - lad[ref_alt, 1] = ad[ref_alt, laa[ref_alt, 0]] - alt_alt = np.where((laa[:, 0] != -2) & (laa[:, 1] != -2))[0] - lad[alt_alt, 0] = ad[alt_alt, laa[alt_alt, 0]] - lad[alt_alt, 1] = ad[alt_alt, laa[alt_alt, 1]] + try: + lad = np.full((ad.shape[0], 2), -2, dtype=ad.dtype) + ref_ref = np.where((laa[:, 0] == -2) & (laa[:, 1] == -2))[0] + lad[ref_ref, 0] = ad[ref_ref, 0] + ref_alt = np.where((laa[:, 0] != -2) & (laa[:, 1] == -2))[0] + lad[ref_alt, 0] = ad[ref_alt, 0] + lad[ref_alt, 1] = ad[ref_alt, laa[ref_alt, 0]] + alt_alt = np.where((laa[:, 0] != -2) & (laa[:, 1] != -2))[0] + lad[alt_alt, 0] = ad[alt_alt, laa[alt_alt, 0]] + lad[alt_alt, 1] = ad[alt_alt, laa[alt_alt, 1]] + except Exception as e: + print("ad = ", ad) + print("laa = ", laa) + raise e return lad @@ -875,21 +880,22 @@ def encode_local_alleles_partition(self, partition_index): call_AD_source = self.icf.fields["FORMAT/AD"].iter_values( partition.start, partition.stop ) - gt_array = zarr.open_array( store=self.wip_partition_array_path(partition_index, "call_genotype"), mode="r", ) - for chunk_index in range(gt_array.cdata_shape[0]): - for genotypes in gt_array.blocks[chunk_index]: - laa = compute_laa_field(genotypes) - j = call_LAA.next_buffer_row() - call_LAA.buff[j] = laa - - ad = next(call_AD_source) - j = call_LAD.next_buffer_row() - lad = compute_lad_field(ad, laa) - call_LAD.buff[j] = lad + for genotypes in core.first_dim_slice_iter( + gt_array, partition.start, partition.stop + ): + laa = compute_laa_field(genotypes) + j = call_LAA.next_buffer_row() + call_LAA.buff[j] = laa + + ad = next(call_AD_source) + k = call_LAD.next_buffer_row() + assert j == k + lad = compute_lad_field(ad, laa) + call_LAD.buff[j] = lad call_LAA.flush() self.finalise_partition_array(partition_index, "call_LAA") diff --git a/tests/test_core.py b/tests/test_core.py index 16e2c0c0..25429ed9 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -194,6 +194,41 @@ def test_5_chunk_1(self, n, expected): assert result == expected +class TestFirstDimSliceIter: + @pytest.mark.parametrize("chunk_size", [1, 3, 4, 5]) + @pytest.mark.parametrize( + ("size", "start", "stop"), + [ + (10, 0, 4), + (10, 0, 8), + (10, 0, 10), + (10, 4, 4), + (10, 4, 8), + (10, 4, 10), + (10, 0, 5), + (10, 0, 3), + (10, 0, 9), + (10, 1, 5), + (10, 1, 1), + (10, 1, 2), + (10, 1, 3), + (10, 1, 4), + (10, 1, 10), + (10, 5, 5), + (10, 5, 6), + (10, 5, 7), + (5, 0, 5), + (5, 1, 1), + (5, 1, 3), + ], + ) + def test_examples(self, chunk_size, size, start, stop): + a = np.arange(size, dtype=int) + z = zarr.empty(size, chunks=chunk_size, dtype=int) + z[:] = a + assert list(core.first_dim_slice_iter(z, start, stop)) == list(a[start:stop]) + + @pytest.mark.skipif(sys.platform != "linux", reason="Only valid on Linux") @pytest.mark.parametrize( ("path", "expected"), From 6472ae0554d6926e68db23af8a10dc1adfa729b9 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Thu, 16 Jan 2025 12:40:07 +0000 Subject: [PATCH 07/12] Test local/non local path --- tests/test_vcf_examples.py | 113 ++++++++++++++++++++++++++----------- 1 file changed, 80 insertions(+), 33 deletions(-) diff --git a/tests/test_vcf_examples.py b/tests/test_vcf_examples.py index 3f61d8de..a23db500 100644 --- a/tests/test_vcf_examples.py +++ b/tests/test_vcf_examples.py @@ -546,7 +546,7 @@ class Test1000G2020Example: @pytest.fixture(scope="class") def ds(self, tmp_path_factory): out = tmp_path_factory.mktemp("data") / "example.vcf.zarr" - vcf2zarr.convert([self.data_path], out, worker_processes=0, local_alleles=True) + vcf2zarr.convert([self.data_path], out, worker_processes=0) return sg.load_dataset(out) def test_position(self, ds): @@ -617,33 +617,85 @@ def test_variant_MLEAC(self, ds): ) nt.assert_array_equal(ds.variant_MLEAC.values, MLEAC) - # def test_call_AD(self, ds): - # call_AD = [ - # [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], - # [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], - # [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], - # [[446, 0, 0, -2, -2], [393, 0, 0, -2, -2], [486, 0, 0, -2, -2]], - # [[446, 0, 0, 0, -2], [393, 0, 0, 0, -2], [486, 0, 0, 0, -2]], - # [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], - # [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], - # [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], - # [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], - # [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], - # [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], - # [[446, 0, 0, 0, 0], [393, 0, 0, 0, 0], [486, 0, 0, 0, 0]], - # [[446, 0, 0, 0, 0], [393, 0, 0, 0, 0], [486, 0, 0, 0, 0]], - # [[446, 0, 0, 0, 0], [393, 0, 0, 0, 0], [486, 0, 0, 0, 0]], - # [[446, 0, 0, 0, -2], [393, 0, 0, 0, -2], [486, 0, 0, 0, -2]], - # [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], - # [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], - # [[446, 0, 0, -2, -2], [393, 0, 0, -2, -2], [486, 0, 0, -2, -2]], - # [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], - # [[446, 0, 0, -2, -2], [393, 0, 0, -2, -2], [486, 0, 0, -2, -2]], - # [[446, 0, 0, 0, -2], [393, 0, 0, 0, -2], [486, 0, 0, 0, -2]], - # [[446, 0, 0, 0, 0], [393, 0, 0, 0, 0], [486, 0, 0, 0, 0]], - # [[446, 0, 0, 0, -2], [393, 0, 0, 0, -2], [486, 0, 0, 0, -2]], - # ] - # nt.assert_array_equal(ds.call_AD.values, call_AD) + def test_call_AD(self, ds): + call_AD = [ + [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], + [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], + [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], + [[446, 0, 0, -2, -2], [393, 0, 0, -2, -2], [486, 0, 0, -2, -2]], + [[446, 0, 0, 0, -2], [393, 0, 0, 0, -2], [486, 0, 0, 0, -2]], + [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], + [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], + [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], + [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], + [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], + [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], + [[446, 0, 0, 0, 0], [393, 0, 0, 0, 0], [486, 0, 0, 0, 0]], + [[446, 0, 0, 0, 0], [393, 0, 0, 0, 0], [486, 0, 0, 0, 0]], + [[446, 0, 0, 0, 0], [393, 0, 0, 0, 0], [486, 0, 0, 0, 0]], + [[446, 0, 0, 0, -2], [393, 0, 0, 0, -2], [486, 0, 0, 0, -2]], + [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], + [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], + [[446, 0, 0, -2, -2], [393, 0, 0, -2, -2], [486, 0, 0, -2, -2]], + [[446, 0, -2, -2, -2], [393, 0, -2, -2, -2], [486, 0, -2, -2, -2]], + [[446, 0, 0, -2, -2], [393, 0, 0, -2, -2], [486, 0, 0, -2, -2]], + [[446, 0, 0, 0, -2], [393, 0, 0, 0, -2], [486, 0, 0, 0, -2]], + [[446, 0, 0, 0, 0], [393, 0, 0, 0, 0], [486, 0, 0, 0, 0]], + [[446, 0, 0, 0, -2], [393, 0, 0, 0, -2], [486, 0, 0, 0, -2]], + ] + nt.assert_array_equal(ds.call_AD.values, call_AD) + + def test_call_PID(self, ds): + call_PGT = ds["call_PGT"].values + assert np.all(call_PGT == ".") + assert call_PGT.shape == (23, 3) + + +class Test1000G2020ExampleLocalAlleles: + data_path = "tests/data/vcf/1kg_2020_chrM.vcf.gz" + + @pytest.fixture(scope="class") + def ds(self, tmp_path_factory): + out = tmp_path_factory.mktemp("data") / "example.vcf.zarr" + vcf2zarr.convert([self.data_path], out, worker_processes=0, local_alleles=True) + return sg.load_dataset(out) + + def test_position(self, ds): + # fmt: off + pos = [ + 26, 35, 40, 41, 42, 46, 47, 51, 52, 53, 54, 55, 56, + 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, + ] + # fmt: on + nt.assert_array_equal(ds.variant_position.values, pos) + + def test_alleles(self, ds): + alleles = [ + ["C", "T", "", "", ""], + ["G", "A", "", "", ""], + ["T", "C", "", "", ""], + ["C", "T", "CT", "", ""], + ["T", "TC", "C", "TG", ""], + ["T", "C", "", "", ""], + ["G", "A", "", "", ""], + ["T", "C", "", "", ""], + ["T", "C", "", "", ""], + ["G", "A", "", "", ""], + ["G", "A", "", "", ""], + ["TA", "TAA", "T", "CA", "AA"], + ["ATT", "*", "ATTT", "ACTT", "A"], + ["T", "C", "G", "*", "TC"], + ["T", "A", "C", "*", ""], + ["T", "A", "", "", ""], + ["T", "A", "", "", ""], + ["C", "A", "T", "", ""], + ["G", "A", "", "", ""], + ["T", "C", "A", "", ""], + ["C", "T", "CT", "A", ""], + ["TG", "T", "CG", "TGG", "TCGG"], + ["G", "T", "*", "A", ""], + ] + nt.assert_array_equal(ds.variant_allele.values, alleles) def test_call_LAD(self, ds): call_LAD = [ @@ -682,11 +734,6 @@ def test_call_LAA(self, ds): # call_LPL = np.tile([0, -2, -2], (23, 3, 1)) # nt.assert_array_equal(ds.call_LPL.values, call_LPL) - def test_call_PID(self, ds): - call_PGT = ds["call_PGT"].values - assert np.all(call_PGT == ".") - assert call_PGT.shape == (23, 3) - class Test1000G2020AnnotationsExample: data_path = "tests/data/vcf/1kg_2020_chr20_annotations.bcf" From 7b72ffd7b417521ac0e799963d91dd5a67af8952 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Thu, 16 Jan 2025 16:30:18 +0000 Subject: [PATCH 08/12] Fixups for PL --- bio2zarr/vcf2zarr/vcz.py | 127 ++++++++++++++++++++++-------------- tests/test_local_alleles.py | 92 +++++++++++++++++++------- tests/test_vcf_examples.py | 13 ++-- 3 files changed, 154 insertions(+), 78 deletions(-) diff --git a/bio2zarr/vcf2zarr/vcz.py b/bio2zarr/vcf2zarr/vcz.py index 06ba5297..3bddbd89 100644 --- a/bio2zarr/vcf2zarr/vcz.py +++ b/bio2zarr/vcf2zarr/vcz.py @@ -197,20 +197,21 @@ def convert_local_allele_field_types(fields): gt = fields_by_name["call_genotype"] if gt.shape[-1] != 2: raise ValueError("Local alleles only supported on diploid data") - # TODO check if LAA is already in here + + # TODO check if LA is already in here shape = gt.shape[:-1] chunks = gt.chunks[:-1] - laa = ZarrArraySpec.new( + la = ZarrArraySpec.new( vcf_field=None, - name="call_LAA", + name="call_LA", dtype="i1", shape=gt.shape, chunks=gt.chunks, dimensions=gt.dimensions, # FIXME description=( - "1-based indices into ALT, indicating which alleles" + "0-based indices into REF+ALT, indicating which alleles" " are relevant (local) for the current sample" ), ) @@ -224,16 +225,16 @@ def convert_local_allele_field_types(fields): ad.description += " (local-alleles)" # TODO fix dimensions - # pl = fields_by_name.get("call_PL", None) - # if pl is not None: - # # TODO check if call_LPL is in the list already - # pl.name = "call_LPL" - # pl.vcf_field = None - # pl.shape = (*shape, 3) - # pl.chunks = (*chunks, 3) - # pl.description += " (local-alleles)" - # # TODO fix dimensions - return [*fields, laa] + pl = fields_by_name.get("call_PL", None) + if pl is not None: + # TODO check if call_LPL is in the list already + pl.name = "call_LPL" + pl.vcf_field = None + pl.shape = (*shape, 3) + pl.chunks = (*chunks, 3) + pl.description += " (local-alleles)" + # TODO fix dimensions + return [*fields, la] @dataclasses.dataclass @@ -523,50 +524,66 @@ def fromdict(d): return ret -def compute_laa_field(genotypes): +def compute_la_field(genotypes): """ - Computes the value of the LAA field for each sample given the genotypes - for a variant. - - The LAA field is a list of one-based indices into the ALT alleles - that indicates which alternate alleles are observed in the sample. + Computes the value of the LA field for each sample given the genotypes + for a variant. The LA field lists the unique alleles observed for + each sample, including the REF. """ v = 2**31 - 1 if np.any(genotypes >= v): raise ValueError("Extreme allele value not supported") G = genotypes.astype(np.int32) if len(G) > 0: - # Anything <=0 gets mapped to -2 (pad) in the output, which comes last. + # Anything < 0 gets mapped to -2 (pad) in the output, which comes last. # So, to get this sorting correctly, we remap to the largest value for # sorting, then map back. We promote the genotypes up to 32 bit for convenience # here, assuming that we'll never have a allele of 2**31 - 1. assert np.all(G != v) - G[G <= 0] = v + G[G < 0] = v G.sort(axis=1) - # Equal non-zero values result in padding also G[G[:, 0] == G[:, 1], 1] = -2 + # Equal values result in padding also G[G == v] = -2 return G.astype(genotypes.dtype) -def compute_lad_field(ad, laa): - try: - lad = np.full((ad.shape[0], 2), -2, dtype=ad.dtype) - ref_ref = np.where((laa[:, 0] == -2) & (laa[:, 1] == -2))[0] - lad[ref_ref, 0] = ad[ref_ref, 0] - ref_alt = np.where((laa[:, 0] != -2) & (laa[:, 1] == -2))[0] - lad[ref_alt, 0] = ad[ref_alt, 0] - lad[ref_alt, 1] = ad[ref_alt, laa[ref_alt, 0]] - alt_alt = np.where((laa[:, 0] != -2) & (laa[:, 1] != -2))[0] - lad[alt_alt, 0] = ad[alt_alt, laa[alt_alt, 0]] - lad[alt_alt, 1] = ad[alt_alt, laa[alt_alt, 1]] - except Exception as e: - print("ad = ", ad) - print("laa = ", laa) - raise e +def compute_lad_field(ad, la): + assert ad.shape[0] == la.shape[0] + assert la.shape[1] == 2 + lad = np.full((ad.shape[0], 2), -2, dtype=ad.dtype) + homs = np.where((la[:, 0] != -2) & (la[:, 1] == -2)) + lad[homs, 0] = ad[homs, la[homs, 0]] + hets = np.where(la[:, 1] != -2) + lad[hets, 0] = ad[hets, la[hets, 0]] + lad[hets, 1] = ad[hets, la[hets, 1]] return lad +def pl_index(a, b): + """ + Returns the PL index for alleles a and b. + """ + return b * (b + 1) // 2 + a + + +def compute_lpl_field(pl, la): + lpl = np.full((pl.shape[0], 3), -2, dtype=pl.dtype) + + homs = np.where((la[:, 0] != -2) & (la[:, 1] == -2)) + a = la[homs, 0] + lpl[homs, 0] = pl[homs, pl_index(a, a)] + + hets = np.where(la[:, 1] != -2)[0] + a = la[hets, 0] + b = la[hets, 1] + lpl[hets, 0] = pl[hets, pl_index(a, a)] + lpl[hets, 1] = pl[hets, pl_index(a, b)] + lpl[hets, 2] = pl[hets, pl_index(b, b)] + + return lpl + + @dataclasses.dataclass class VcfZarrWriteSummary(core.JsonDataclass): num_partitions: int @@ -601,7 +618,7 @@ def has_genotypes(self): def has_local_alleles(self): for field in self.schema.fields: - if field.name == "call_LAA" and field.vcf_field is None: + if field.name == "call_LA" and field.vcf_field is None: return True return False @@ -872,14 +889,20 @@ def encode_genotypes_partition(self, partition_index): def encode_local_alleles_partition(self, partition_index): partition = self.metadata.partitions[partition_index] - call_LAA_array = self.init_partition_array(partition_index, "call_LAA") - call_LAA = core.BufferedArray(call_LAA_array, partition.start) + call_LA_array = self.init_partition_array(partition_index, "call_LA") + call_LA = core.BufferedArray(call_LA_array, partition.start) call_LAD_array = self.init_partition_array(partition_index, "call_LAD") call_LAD = core.BufferedArray(call_LAD_array, partition.start) call_AD_source = self.icf.fields["FORMAT/AD"].iter_values( partition.start, partition.stop ) + call_LPL_array = self.init_partition_array(partition_index, "call_LPL") + call_LPL = core.BufferedArray(call_LPL_array, partition.start) + call_PL_source = self.icf.fields["FORMAT/PL"].iter_values( + partition.start, partition.stop + ) + gt_array = zarr.open_array( store=self.wip_partition_array_path(partition_index, "call_genotype"), mode="r", @@ -887,20 +910,28 @@ def encode_local_alleles_partition(self, partition_index): for genotypes in core.first_dim_slice_iter( gt_array, partition.start, partition.stop ): - laa = compute_laa_field(genotypes) - j = call_LAA.next_buffer_row() - call_LAA.buff[j] = laa + la = compute_la_field(genotypes) + j = call_LA.next_buffer_row() + call_LA.buff[j] = la ad = next(call_AD_source) + ad = icf.sanitise_int_array(ad, 2, ad.dtype) k = call_LAD.next_buffer_row() assert j == k - lad = compute_lad_field(ad, laa) - call_LAD.buff[j] = lad + call_LAD.buff[j] = compute_lad_field(ad, la) + + pl = next(call_PL_source) + pl = icf.sanitise_int_array(pl, 2, pl.dtype) + k = call_LPL.next_buffer_row() + assert j == k + call_LPL.buff[j] = compute_lpl_field(pl, la) - call_LAA.flush() - self.finalise_partition_array(partition_index, "call_LAA") + call_LA.flush() + self.finalise_partition_array(partition_index, "call_LA") call_LAD.flush() self.finalise_partition_array(partition_index, "call_LAD") + call_LPL.flush() + self.finalise_partition_array(partition_index, "call_LPL") def encode_alleles_partition(self, partition_index): array_name = "variant_allele" diff --git a/tests/test_local_alleles.py b/tests/test_local_alleles.py index 5124f3f4..91c183ac 100644 --- a/tests/test_local_alleles.py +++ b/tests/test_local_alleles.py @@ -2,63 +2,107 @@ import numpy.testing as nt import pytest -from bio2zarr.vcf2zarr.vcz import compute_laa_field, compute_lad_field +from bio2zarr.vcf2zarr import vcz -class TestComputeLAA: +class TestComputeLA: @pytest.mark.parametrize( ("genotypes", "expected"), [ ([], []), - ([[0, 0]], [[-2, -2]]), - ([[0, 0], [0, 0]], [[-2, -2], [-2, -2]]), - ([[1, 1], [0, 0]], [[1, -2], [-2, -2]]), - ([[0, 1], [3, 2], [3, 0]], [[1, -2], [2, 3], [3, -2]]), - ([[0, 0], [2, 3]], [[-2, -2], [2, 3]]), - ([[2, 3], [0, 0]], [[2, 3], [-2, -2]]), - ([[128, 0], [6, 5]], [[128, -2], [5, 6]]), - ([[0, -1], [-1, 5]], [[-2, -2], [5, -2]]), + ([[0, 0]], [[0, -2]]), + ([[0, 0], [0, 0]], [[0, -2], [0, -2]]), + ([[1, 1], [0, 0]], [[1, -2], [0, -2]]), + ([[0, 1], [3, 2], [3, 0]], [[0, 1], [2, 3], [0, 3]]), + ([[0, 0], [2, 3]], [[0, -2], [2, 3]]), + ([[2, 3], [0, 0]], [[2, 3], [0, -2]]), + ([[128, 0], [6, 5]], [[0, 128], [5, 6]]), + ([[0, -1], [-1, 5]], [[0, -2], [5, -2]]), + ([[-1, -1], [-1, 5]], [[-2, -2], [5, -2]]), ], ) def test_simple_examples(self, genotypes, expected): G = np.array(genotypes) - result = compute_laa_field(G) + result = vcz.compute_la_field(G) nt.assert_array_equal(result, expected) def test_extreme_value(self): G = np.array([[0, 2**32 - 1]]) with pytest.raises(ValueError, match="Extreme"): - compute_laa_field(G) + vcz.compute_la_field(G) class TestComputeLAD: @pytest.mark.parametrize( - ("ad", "laa", "expected"), + ("ad", "la", "expected"), [ + # Missing data + ([[0, 0]], [[-2, -2]], [[-2, -2]]), # 0/0 calls - ([[10, 0]], [[-2, -2]], [[10, -2]]), - ([[10, 0, 0], [11, 0, 0]], [[-2, -2], [-2, -2]], [[10, -2], [11, -2]]), + ([[10, 0]], [[0, -2]], [[10, -2]]), + ([[10, 0, 0]], [[0, -2]], [[10, -2]]), + ([[10, 0, 0], [11, 0, 0]], [[0, -2], [0, -2]], [[10, -2], [11, -2]]), # 0/1 calls - ([[10, 11]], [[1, -2]], [[10, 11]]), - ([[10, 11], [12, 0]], [[1, -2], [-2, -2]], [[10, 11], [12, -2]]), + ([[10, 11]], [[0, 1]], [[10, 11]]), + ([[10, 11], [12, 0]], [[0, 1], [0, -2]], [[10, 11], [12, -2]]), # 0/2 calls - ([[10, 0, 11]], [[2, -2]], [[10, 11]]), - ([[10, 0, 11], [10, 11, 0]], [[2, -2], [1, -2]], [[10, 11], [10, 11]]), + ([[10, 0, 11]], [[0, 2]], [[10, 11]]), + ([[10, 0, 11], [10, 11, 0]], [[0, 2], [0, 1]], [[10, 11], [10, 11]]), ( [[10, 0, 11], [10, 11, 0], [12, 0, 0]], - [[2, -2], [1, -2], [-2, -2]], + [[0, 2], [0, 1], [0, -2]], [[10, 11], [10, 11], [12, -2]], ), # 1/2 calls ([[0, 10, 11]], [[1, 2]], [[10, 11]]), - ([[0, 10, 11], [12, 0, 13]], [[1, 2], [2, -2]], [[10, 11], [12, 13]]), + ([[0, 10, 11], [12, 0, 13]], [[1, 2], [0, 2]], [[10, 11], [12, 13]]), ( [[0, 10, 11], [12, 0, 13], [14, 0, 0]], - [[1, 2], [2, -2], [-2, -2]], + [[1, 2], [0, 2], [0, -2]], [[10, 11], [12, 13], [14, -2]], ), ], ) - def test_simple_examples(self, ad, laa, expected): - result = compute_lad_field(np.array(ad), np.array(laa)) + def test_simple_examples(self, ad, la, expected): + result = vcz.compute_lad_field(np.array(ad), np.array(la)) + nt.assert_array_equal(result, expected) + + +# PL translation indexes: +# a b i +# 0 0 0 +# 0 1 1 +# 0 2 3 +# 0 3 6 +# 1 1 2 +# 1 2 4 +# 1 3 7 +# 2 2 5 +# 2 3 8 +# 3 3 9 + + +class TestComputeLPL: + @pytest.mark.parametrize( + ("pl", "la", "expected"), + [ + # Missing + ([range(3)], [[-2, -2]], [[-2, -2, -2]]), + # 0/0 calls + ([range(3)], [[0, -2]], [[0, -2, -2]]), + # 0/0 calls + ([[-1, -1, -1]], [[0, -2]], [[-1, -2, -2]]), + # 1/1 calls + ([range(3)], [[1, -2]], [[2, -2, -2]]), + ([range(3), range(3)], [[0, -2], [1, -2]], [[0, -2, -2], [2, -2, -2]]), + # 2/2 calls + ([range(6)], [[2, -2]], [[5, -2, -2]]), + # 0/1 calls + ([range(3)], [[0, 1]], [[0, 1, 2]]), + # 0/2 calls + ([range(6)], [[0, 2]], [[0, 3, 5]]), + ], + ) + def test_simple_examples(self, pl, la, expected): + result = vcz.compute_lpl_field(np.array(pl), np.array(la)) nt.assert_array_equal(result, expected) diff --git a/tests/test_vcf_examples.py b/tests/test_vcf_examples.py index a23db500..a596b432 100644 --- a/tests/test_vcf_examples.py +++ b/tests/test_vcf_examples.py @@ -725,14 +725,15 @@ def test_call_LAD(self, ds): ] nt.assert_array_equal(ds.call_LAD.values, call_LAD) - def test_call_LAA(self, ds): + def test_call_LA(self, ds): # All the genotypes are 0/0 - call_LAA = np.full((23, 3, 2), -2) - nt.assert_array_equal(ds.call_LAA.values, call_LAA) + call_LA = np.full((23, 3, 2), -2) + call_LA[:, :, 0] = 0 + nt.assert_array_equal(ds.call_LA.values, call_LA) - # def test_call_LPL(self, ds): - # call_LPL = np.tile([0, -2, -2], (23, 3, 1)) - # nt.assert_array_equal(ds.call_LPL.values, call_LPL) + def test_call_LPL(self, ds): + call_LPL = np.tile([0, -2, -2], (23, 3, 1)) + nt.assert_array_equal(ds.call_LPL.values, call_LPL) class Test1000G2020AnnotationsExample: From cf1947f606a62469bdd577bc79e208c4dfc000ca Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Fri, 17 Jan 2025 11:09:53 +0000 Subject: [PATCH 09/12] Improve local alleles implementation to reduce memory --- bio2zarr/vcf2zarr/vcz.py | 75 ++++++++++++++++++++++++-------------- tests/test_vcf_examples.py | 52 ++++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 27 deletions(-) diff --git a/bio2zarr/vcf2zarr/vcz.py b/bio2zarr/vcf2zarr/vcz.py index 3bddbd89..a23888ab 100644 --- a/bio2zarr/vcf2zarr/vcz.py +++ b/bio2zarr/vcf2zarr/vcz.py @@ -584,6 +584,24 @@ def compute_lpl_field(pl, la): return lpl +@dataclasses.dataclass +class LocalisableFieldDescriptor: + array_name: str + vcf_field: str + sanitise: callable + convert: callable + + +localisable_fields = [ + LocalisableFieldDescriptor( + "call_LAD", "FORMAT/AD", icf.sanitise_int_array, compute_lad_field + ), + LocalisableFieldDescriptor( + "call_LPL", "FORMAT/PL", icf.sanitise_int_array, compute_lpl_field + ), +] + + @dataclasses.dataclass class VcfZarrWriteSummary(core.JsonDataclass): num_partitions: int @@ -816,6 +834,7 @@ def encode_partition(self, partition_index): self.encode_genotypes_partition(partition_index) if self.has_local_alleles(): self.encode_local_alleles_partition(partition_index) + self.encode_local_allele_fields_partition(partition_index) final_path = self.partition_path(partition_index) logger.info(f"Finalising {partition_index} at {final_path}") @@ -892,17 +911,6 @@ def encode_local_alleles_partition(self, partition_index): call_LA_array = self.init_partition_array(partition_index, "call_LA") call_LA = core.BufferedArray(call_LA_array, partition.start) - call_LAD_array = self.init_partition_array(partition_index, "call_LAD") - call_LAD = core.BufferedArray(call_LAD_array, partition.start) - call_AD_source = self.icf.fields["FORMAT/AD"].iter_values( - partition.start, partition.stop - ) - call_LPL_array = self.init_partition_array(partition_index, "call_LPL") - call_LPL = core.BufferedArray(call_LPL_array, partition.start) - call_PL_source = self.icf.fields["FORMAT/PL"].iter_values( - partition.start, partition.stop - ) - gt_array = zarr.open_array( store=self.wip_partition_array_path(partition_index, "call_genotype"), mode="r", @@ -914,24 +922,37 @@ def encode_local_alleles_partition(self, partition_index): j = call_LA.next_buffer_row() call_LA.buff[j] = la - ad = next(call_AD_source) - ad = icf.sanitise_int_array(ad, 2, ad.dtype) - k = call_LAD.next_buffer_row() - assert j == k - call_LAD.buff[j] = compute_lad_field(ad, la) - - pl = next(call_PL_source) - pl = icf.sanitise_int_array(pl, 2, pl.dtype) - k = call_LPL.next_buffer_row() - assert j == k - call_LPL.buff[j] = compute_lpl_field(pl, la) - call_LA.flush() self.finalise_partition_array(partition_index, "call_LA") - call_LAD.flush() - self.finalise_partition_array(partition_index, "call_LAD") - call_LPL.flush() - self.finalise_partition_array(partition_index, "call_LPL") + + def encode_local_allele_fields_partition(self, partition_index): + partition = self.metadata.partitions[partition_index] + la_array = zarr.open_array( + store=self.wip_partition_array_path(partition_index, "call_LA"), + mode="r", + ) + field_map = self.schema.field_map() + # We got through the localisable fields one-by-one so that we don't need to + # keep several large arrays in memory at once for each partition. + for descriptor in localisable_fields: + if descriptor.array_name not in field_map: + continue + assert field_map[descriptor.array_name].vcf_field is None + + array = self.init_partition_array(partition_index, descriptor.array_name) + buff = core.BufferedArray(array, partition.start) + source = self.icf.fields[descriptor.vcf_field].iter_values( + partition.start, partition.stop + ) + for la in core.first_dim_slice_iter( + la_array, partition.start, partition.stop + ): + raw_value = next(source) + value = descriptor.sanitise(raw_value, 2, raw_value.dtype) + j = buff.next_buffer_row() + buff.buff[j] = descriptor.convert(value, la) + buff.flush() + self.finalise_partition_array(partition_index, "array_name") def encode_alleles_partition(self, partition_index): array_name = "variant_allele" diff --git a/tests/test_vcf_examples.py b/tests/test_vcf_examples.py index a596b432..176f26b5 100644 --- a/tests/test_vcf_examples.py +++ b/tests/test_vcf_examples.py @@ -444,7 +444,59 @@ def test_vcf_field_description(self, ds, field, description): assert ds[field].attrs["description"] == description +class TestSmallExampleLocalAlleles: + data_path = "tests/data/vcf/sample.vcf.gz" + + @pytest.fixture(scope="class") + def ds(self, tmp_path_factory): + out = tmp_path_factory.mktemp("data") / "example.vcf.zarr" + vcf2zarr.convert([self.data_path], out, local_alleles=True) + return sg.load_dataset(out) + + def test_call_LA(self, ds): + call_genotype = np.array( + [ + [[0, 0], [0, 0], [0, 1]], + [[0, 0], [0, 0], [0, 1]], + [[0, 0], [1, 0], [1, 1]], + [[0, 0], [0, 1], [0, 0]], + [[1, 2], [2, 1], [2, 2]], + [[0, 0], [0, 0], [0, 0]], + [[0, 1], [0, 2], [-1, -1]], + [[0, 0], [0, 0], [-1, -1]], + # FIXME this depends on "mixed ploidy" interpretation. + [[0, -2], [0, 1], [0, 2]], + ], + dtype="i1", + ) + nt.assert_array_equal(ds["call_genotype"], call_genotype) + nt.assert_array_equal(ds["call_genotype_mask"], call_genotype < 0) + + call_LA = np.array( + [ + [[0, -2], [0, -2], [0, 1]], + [[0, -2], [0, -2], [0, 1]], + [[0, -2], [0, 1], [1, -2]], + [[0, -2], [0, 1], [0, -2]], + [[1, 2], [1, 2], [2, -2]], + [[0, -2], [0, -2], [0, -2]], + [[0, 1], [0, 2], [-2, -2]], + [[0, -2], [0, -2], [-2, -2]], + [[0, -2], [0, 1], [0, 2]], + ], + ) + nt.assert_array_equal(ds.call_LA.values, call_LA) + + @pytest.mark.parametrize("field", ["call_LPL", "call_LAD"]) + def test_no_localised_fields(self, ds, field): + assert field not in ds + + class TestLocalAllelesExample: + # Note this example has a mixture of local and non local fields, and uses + # the VCF standard LA field. It's not actually testing our implementation + # of localisation. + data_path = "tests/data/vcf/local_alleles.vcf.gz" @pytest.fixture(scope="class") From 4f94fef199b74352a2d9aca015dd679fb652e471 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Fri, 17 Jan 2025 11:33:35 +0000 Subject: [PATCH 10/12] Remove old localise code and example The tests depended on complex semantics of mixed local and non local fields, which doesn't seem worth the effort. Fixup du test --- bio2zarr/cli.py | 6 - bio2zarr/vcf2zarr/icf.py | 175 +----------------------- tests/data/vcf/local_alleles.vcf.gz | Bin 944 -> 0 bytes tests/data/vcf/local_alleles.vcf.gz.csi | Bin 118 -> 0 bytes tests/test_core.py | 4 +- tests/test_vcf_examples.py | 94 +------------ 6 files changed, 10 insertions(+), 269 deletions(-) delete mode 100644 tests/data/vcf/local_alleles.vcf.gz delete mode 100644 tests/data/vcf/local_alleles.vcf.gz.csi diff --git a/bio2zarr/cli.py b/bio2zarr/cli.py index 6f3ff3a6..6e21576c 100644 --- a/bio2zarr/cli.py +++ b/bio2zarr/cli.py @@ -221,7 +221,6 @@ def show_work_summary(work_summary, json): @compressor @progress @worker_processes -@local_alleles def explode( vcfs, icf_path, @@ -231,7 +230,6 @@ def explode( compressor, progress, worker_processes, - local_alleles, ): """ Convert VCF(s) to intermediate columnar format @@ -245,7 +243,6 @@ def explode( column_chunk_size=column_chunk_size, compressor=get_compressor(compressor), show_progress=progress, - local_alleles=local_alleles, ) @@ -260,7 +257,6 @@ def explode( @verbose @progress @worker_processes -@local_alleles def dexplode_init( vcfs, icf_path, @@ -272,7 +268,6 @@ def dexplode_init( verbose, progress, worker_processes, - local_alleles, ): """ Initial step for distributed conversion of VCF(s) to intermediate columnar format @@ -289,7 +284,6 @@ def dexplode_init( worker_processes=worker_processes, compressor=get_compressor(compressor), show_progress=progress, - local_alleles=local_alleles, ) show_work_summary(work_summary, json) diff --git a/bio2zarr/vcf2zarr/icf.py b/bio2zarr/vcf2zarr/icf.py index 8e313f8c..dd0bbc91 100644 --- a/bio2zarr/vcf2zarr/icf.py +++ b/bio2zarr/vcf2zarr/icf.py @@ -217,7 +217,7 @@ def make_field_def(name, vcf_type, vcf_number): return fields -def scan_vcf(path, target_num_partitions, *, local_alleles): +def scan_vcf(path, target_num_partitions): with vcf_utils.IndexedVcf(path) as indexed_vcf: vcf = indexed_vcf.vcf filters = [] @@ -237,10 +237,6 @@ def scan_vcf(path, target_num_partitions, *, local_alleles): pass_filter = filters.pop(pass_index) filters.insert(0, pass_filter) - # Indicates whether vcf2zarr can introduce local alleles - can_localize = False - should_add_laa_field = True - should_add_lpl_field = True fields = fixed_vcf_field_definitions() for h in vcf.header_iter(): if h["HeaderType"] in ["INFO", "FORMAT"]: @@ -249,36 +245,6 @@ def scan_vcf(path, target_num_partitions, *, local_alleles): field.vcf_type = "Integer" field.vcf_number = "." fields.append(field) - if field.category == "FORMAT": - if field.name == "PL": - can_localize = True - if field.name == "LAA": - should_add_laa_field = False - if field.name == "LPL": - should_add_lpl_field = False - - if local_alleles and can_localize: - if should_add_laa_field: - laa_field = VcfField( - category="FORMAT", - name="LAA", - vcf_type="Integer", - vcf_number=".", - description="1-based indices into ALT, indicating which alleles" - " are relevant (local) for the current sample", - summary=VcfFieldSummary(), - ) - fields.append(laa_field) - if should_add_lpl_field: - lpl_field = VcfField( - category="FORMAT", - name="LPL", - vcf_type="Integer", - vcf_number="LG", - description="Local-allele representation of PL", - summary=VcfFieldSummary(), - ) - fields.append(lpl_field) try: contig_lengths = vcf.seqlens @@ -315,14 +281,7 @@ def scan_vcf(path, target_num_partitions, *, local_alleles): return metadata, vcf.raw_header -def scan_vcfs( - paths, - show_progress, - target_num_partitions, - worker_processes=1, - *, - local_alleles, -): +def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1): logger.info( f"Scanning {len(paths)} VCFs attempting to split into {target_num_partitions}" f" partitions." @@ -346,7 +305,6 @@ def scan_vcfs( scan_vcf, path, max(1, target_num_partitions // len(paths)), - local_alleles=local_alleles, ) results = list(pwm.results_as_completed()) @@ -505,104 +463,6 @@ def sanitise_value_int_2d(buff, j, value): buff[j, :, : value.shape[1]] = value -def compute_laa_field(variant) -> np.ndarray: - """ - Computes the value of the LAA field for each sample given a variant. - - The LAA field is a list of one-based indices into the ALT alleles - that indicates which alternate alleles are observed in the sample. - - This method infers which alleles are observed from the GT field. - """ - sample_count = variant.num_called + variant.num_unknown - alt_allele_count = len(variant.ALT) - allele_count = alt_allele_count + 1 - allele_counts = np.zeros((sample_count, allele_count), dtype=int) - - if "GT" in variant.FORMAT: - # The last element of each sample's genotype indicates the phasing - # and is not an allele. - genotypes = variant.genotype.array()[:, :-1] - genotypes.clip(0, None, out=genotypes) - genotype_allele_counts = np.apply_along_axis( - np.bincount, axis=1, arr=genotypes, minlength=allele_count - ) - allele_counts += genotype_allele_counts - - allele_counts[:, 0] = 0 # We don't count the reference allele - max_row_length = 1 - - def nonzero_pad(arr: np.ndarray, *, length: int): - nonlocal max_row_length - alleles = arr.nonzero()[0] - max_row_length = max(max_row_length, len(alleles)) - pad_length = length - len(alleles) - return np.pad( - alleles, - (0, pad_length), - mode="constant", - constant_values=constants.INT_FILL, - ) - - alleles = np.apply_along_axis( - nonzero_pad, axis=1, arr=allele_counts, length=max(1, alt_allele_count) - ) - alleles = alleles[:, :max_row_length] - - return alleles - - -def compute_lpl_field(variant, laa_val: np.ndarray) -> np.ndarray: - assert laa_val is not None - - la_val = np.zeros((laa_val.shape[0], laa_val.shape[1] + 1), dtype=laa_val.dtype) - la_val[:, 1:] = laa_val - ploidy = variant.ploidy - - if "PL" not in variant.FORMAT: - sample_count = variant.num_called + variant.num_unknown - local_allele_count = la_val.shape[1] - - if ploidy == 1: - local_genotype_count = local_allele_count - elif ploidy == 2: - local_genotype_count = local_allele_count * (local_allele_count + 1) // 2 - else: - raise ValueError(f"Cannot handle ploidy = {ploidy}") - - return np.full((sample_count, local_genotype_count), constants.INT_MISSING) - - # Compute a and b - if ploidy == 1: - a = la_val - b = np.zeros_like(la_val) - elif ploidy == 2: - repeats = np.arange(1, la_val.shape[1] + 1) - b = np.repeat(la_val, repeats, axis=1) - arange_tile = np.tile(np.arange(la_val.shape[1]), (la_val.shape[1], 1)) - tril_indices = np.tril_indices_from(arange_tile) - a_index = np.tile(arange_tile[tril_indices], (b.shape[0], 1)) - row_index = np.arange(la_val.shape[0]).reshape(-1, 1) - a = la_val[row_index, a_index] - else: - raise ValueError(f"Cannot handle ploidy = {ploidy}") - - # Compute n, the local indices of the PL field - n = (b * (b + 1) / 2 + a).astype(int) - - pl_val = variant.format("PL") - pl_val[pl_val == constants.VCF_INT_MISSING] = constants.INT_MISSING - # When the PL value is missing in all samples, pl_val has shape (sample_count, 1). - # In that case, we need to broadcast the PL value. - if pl_val.shape[1] < n.shape[1]: - pl_val = np.broadcast_to(pl_val, n.shape) - row_index = np.arange(pl_val.shape[0]).reshape(-1, 1) - lpl_val = pl_val[row_index, n] - lpl_val[b == constants.INT_FILL] = constants.INT_FILL - - return lpl_val - - missing_value_map = { "Integer": constants.INT_MISSING, "Float": constants.FLOAT32_MISSING, @@ -1107,14 +967,11 @@ def init( target_num_partitions=None, show_progress=False, compressor=None, - local_alleles=None, ): if self.path.exists(): raise ValueError(f"ICF path already exists: {self.path}") if compressor is None: compressor = ICF_DEFAULT_COMPRESSOR - if local_alleles is None: - local_alleles = False vcfs = [pathlib.Path(vcf) for vcf in vcfs] target_num_partitions = max(target_num_partitions, len(vcfs)) @@ -1124,7 +981,6 @@ def init( worker_processes=worker_processes, show_progress=show_progress, target_num_partitions=target_num_partitions, - local_alleles=local_alleles, ) check_field_clobbering(icf_metadata) self.metadata = icf_metadata @@ -1207,17 +1063,6 @@ def process_partition(self, partition_index): else: format_fields.append(field) - format_field_names = [format_field.name for format_field in format_fields] - if "LAA" in format_field_names and "LPL" in format_field_names: - laa_index = format_field_names.index("LAA") - lpl_index = format_field_names.index("LPL") - # LAA needs to come before LPL - if lpl_index < laa_index: - format_fields[laa_index], format_fields[lpl_index] = ( - format_fields[lpl_index], - format_fields[laa_index], - ) - last_position = None with IcfPartitionWriter( self.metadata, @@ -1245,18 +1090,8 @@ def process_partition(self, partition_index): else: val = variant.genotype.array() tcw.append("FORMAT/GT", val) - laa_val = None for field in format_fields: - if field.name == "LAA": - if "LAA" not in variant.FORMAT: - laa_val = compute_laa_field(variant) - else: - laa_val = variant.format("LAA") - val = laa_val - elif field.name == "LPL" and "LPL" not in variant.FORMAT: - val = compute_lpl_field(variant, laa_val) - else: - val = variant.format(field.name) + val = variant.format(field.name) tcw.append(field.full_name, val) # Note: an issue with updating the progress per variant here like @@ -1352,7 +1187,6 @@ def explode( worker_processes=1, show_progress=False, compressor=None, - local_alleles=None, ): writer = IntermediateColumnarFormatWriter(icf_path) writer.init( @@ -1363,7 +1197,6 @@ def explode( show_progress=show_progress, column_chunk_size=column_chunk_size, compressor=compressor, - local_alleles=local_alleles, ) writer.explode(worker_processes=worker_processes, show_progress=show_progress) writer.finalise() @@ -1379,7 +1212,6 @@ def explode_init( worker_processes=1, show_progress=False, compressor=None, - local_alleles=None, ): writer = IntermediateColumnarFormatWriter(icf_path) return writer.init( @@ -1389,7 +1221,6 @@ def explode_init( show_progress=show_progress, column_chunk_size=column_chunk_size, compressor=compressor, - local_alleles=local_alleles, ) diff --git a/tests/data/vcf/local_alleles.vcf.gz b/tests/data/vcf/local_alleles.vcf.gz deleted file mode 100644 index 5ee4c11be0c8e5a610eded6dcde927b35356483a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 944 zcmV;h15f-PiwFb&00000{{{d;LjnM70*#bUZ{jcz#n0)d7!~a;tpF{Dtu`xV2?3-4 zfza;RBm=S5*sg6@@$J`12!GOwa)a3K=jWL>Znf5w;aaFIG2UB0+<$-WJ+xZkI9d)C z-p_I1#eR~w0qRsyX{g}dqt6&vl|iLJ60I?N{AD#VCz|q&Cy6qYF#BaZ4QC}|nz+;5 zc7@98xXXh?Z_EvDP+fDYEg;sAkga5>;SV)A1jLZ?fD_K2-h*2TLm2QAlrV+`lyXRm z-x+L>3uE=Vf$!J&=Novx#6c={+!V|c%mrIcK$7O*N6X`}Zn4GW2%2mWgvzWZqJdFu zj^#TYAH?_HEwXW9D+QN66~TT_D~&4a9s$7&Ql2Art?g)2;-w#l&Bhg5mOxIu4JqX^Nvbs@2i( zMh+rj!~9f*3=~R*8o43GVsWcD`T#z+_jbJPd~;&(G&f?UQSI&6oRcis2jBOlcY!3k z2=8Ul=J)99s==`bO^P4Xl*QL0tO_&PXJ*blxYdFg8tf#ttb`k8~t!?R*wm*Hl7WIWZ)NkFMC`wl4#Wr@a z6sCe3x+%43t~&Npf3%oQoOqTvsZ^1K^Z1rba^}6WvtK(IdIpGf}p$>y~(Ao|Gfly30 z(d7Y9T|!{Q(`{#plhWlA6P3t=iR$P?X4WgSNxhP^zzl^cXrwV+U_GXKFo}vdP>gF- z1`&uiK6d8LO6=~vtKs7lVjWkqpIr-%_3=CnP*zNODz3V@h(<;AJ`$FiW&!vT+yB!%6cd zhcu+ Date: Fri, 17 Jan 2025 11:56:57 +0000 Subject: [PATCH 11/12] Fixup CLI tests --- tests/test_cli.py | 56 +++++++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 26 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 2ede6ee2..c6be17ee 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -14,7 +14,6 @@ compressor=None, worker_processes=1, show_progress=True, - local_alleles=False, ) DEFAULT_DEXPLODE_PARTITION_ARGS = dict() @@ -24,7 +23,6 @@ column_chunk_size=64, compressor=None, show_progress=True, - local_alleles=False, ) DEFAULT_ENCODE_ARGS = dict( @@ -49,6 +47,12 @@ DEFAULT_DENCODE_FINALISE_ARGS = dict(show_progress=True) +DEFAULT_MKSHCHEMA_ARGS = dict( + variants_chunk_size=None, + samples_chunk_size=None, + local_alleles=False, +) + DEFAULT_CONVERT_ARGS = dict( variants_chunk_size=None, samples_chunk_size=None, @@ -297,29 +301,6 @@ def test_vcf_explode_missing_and_existing_vcf(self, mocked, tmp_path): assert "'no_such_file' does not exist" in result.stderr mocked.assert_not_called() - @pytest.mark.parametrize("local_alleles", [False, True]) - @mock.patch("bio2zarr.vcf2zarr.explode") - def test_vcf_explode_local_alleles(self, mocked, tmp_path, local_alleles): - icf_path = tmp_path / "icf" - runner = ct.CliRunner(mix_stderr=False) - - if local_alleles: - local_alleles_flag = "--local-alleles" - else: - local_alleles_flag = "--no-local-alleles" - - result = runner.invoke( - cli.vcf2zarr_main, - f"explode {self.vcf_path} {icf_path} {local_alleles_flag}", - catch_exceptions=False, - ) - assert result.exit_code == 0 - assert len(result.stdout) == 0 - assert len(result.stderr) == 0 - args = dict(DEFAULT_EXPLODE_ARGS) - args["local_alleles"] = local_alleles - mocked.assert_called_once_with(str(icf_path), (self.vcf_path,), **args) - @pytest.mark.parametrize(("progress", "flag"), [(True, "-P"), (False, "-Q")]) @mock.patch("bio2zarr.vcf2zarr.explode_init", return_value=FakeWorkSummary(5)) def test_vcf_dexplode_init(self, mocked, tmp_path, progress, flag): @@ -462,7 +443,7 @@ def test_mkschema(self, mocked, tmp_path): runner = ct.CliRunner(mix_stderr=False) result = runner.invoke( cli.vcf2zarr_main, - f"mkschema {tmp_path} --variants-chunk-size=3 " "--samples-chunk-size=4", + f"mkschema {tmp_path} --variants-chunk-size=3 --samples-chunk-size=4", catch_exceptions=False, ) assert result.exit_code == 0 @@ -726,6 +707,29 @@ def test_mkschema(self, tmp_path): assert d["samples_chunk_size"] == 2 assert d["variants_chunk_size"] == 3 + @pytest.mark.parametrize("local_alleles", [False, True]) + def test_mkschema_local_alleles(self, tmp_path, local_alleles): + icf_path = tmp_path / "icf" + local_alleles_flag = {True: "--local-alleles", False: "--no-local-alleles"}[ + local_alleles + ] + runner = ct.CliRunner(mix_stderr=False) + result = runner.invoke( + cli.vcf2zarr_main, + f"explode {self.vcf_path} {icf_path}", + catch_exceptions=False, + ) + assert result.exit_code == 0 + result = runner.invoke( + cli.vcf2zarr_main, + f"mkschema {icf_path} {local_alleles_flag}", + catch_exceptions=False, + ) + assert result.exit_code == 0 + d = json.loads(result.stdout) + call_LA_exists = "call_LA" in [f["name"] for f in d["fields"]] + assert call_LA_exists == local_alleles + def test_encode(self, tmp_path): icf_path = tmp_path / "icf" zarr_path = tmp_path / "zarr" From 45db77b56b7340b27b1551258ee309e8dff227ae Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Fri, 17 Jan 2025 11:58:22 +0000 Subject: [PATCH 12/12] Fixup ICF tests --- tests/test_icf.py | 42 +----------------------------------------- 1 file changed, 1 insertion(+), 41 deletions(-) diff --git a/tests/test_icf.py b/tests/test_icf.py index 954a5102..b59ae1dd 100644 --- a/tests/test_icf.py +++ b/tests/test_icf.py @@ -25,7 +25,7 @@ class TestSmallExample: @pytest.fixture(scope="class") def icf(self, tmp_path_factory): out = tmp_path_factory.mktemp("data") / "example.exploded" - return vcf2zarr.explode(out, [self.data_path], local_alleles=False) + return vcf2zarr.explode(out, [self.data_path]) def test_format_version(self, icf): assert icf.metadata.format_version == icf_mod.ICF_METADATA_FORMAT_VERSION @@ -91,46 +91,6 @@ def test_INFO_NS(self, icf): assert icf["INFO/NS"].values == [None, None, 3, 3, 2, 3, 3, None, None] -class TestLocalAllelesExample: - data_path = "tests/data/vcf/local_alleles.vcf.gz" - - fields = ( - "ALT", - "CHROM", - "FILTERS", - "FORMAT/AD", - "FORMAT/DP", - "FORMAT/GQ", - "FORMAT/GT", - "FORMAT/LAA", - "FORMAT/LPL", - "FORMAT/PL", - "ID", - "INFO/AA", - "INFO/AC", - "INFO/AF", - "INFO/AN", - "INFO/DB", - "INFO/DP", - "INFO/H2", - "INFO/NS", - "POS", - "QUAL", - "REF", - "rlen", - ) - - @pytest.fixture(scope="class") - def icf(self, tmp_path_factory): - out = tmp_path_factory.mktemp("data") / "example.exploded" - return vcf2zarr.explode(out, [self.data_path]) - - def test_summary_table(self, icf): - data = icf.summary_table() - fields = [d["name"] for d in data] - assert tuple(sorted(fields)) == self.fields - - class TestIcfWriterExample: data_path = "tests/data/vcf/sample.vcf.gz"