diff --git a/bio2zarr/icf.py b/bio2zarr/icf.py index 8dc25c59..887b8533 100644 --- a/bio2zarr/icf.py +++ b/bio2zarr/icf.py @@ -844,7 +844,6 @@ def convert_local_allele_field_types(fields): dimensions = gt.dimensions[:-1] la = vcz.ZarrArraySpec.new( - vcf_field=None, name="call_LA", dtype="i1", shape=gt.shape, @@ -859,7 +858,7 @@ def convert_local_allele_field_types(fields): if ad is not None: # TODO check if call_LAD is in the list already ad.name = "call_LAD" - ad.vcf_field = None + ad.source = None ad.shape = (*shape, 2) ad.chunks = (*chunks, 2) ad.dimensions = (*dimensions, "local_alleles") @@ -869,7 +868,7 @@ def convert_local_allele_field_types(fields): if pl is not None: # TODO check if call_LPL is in the list already pl.name = "call_LPL" - pl.vcf_field = None + pl.source = None pl.shape = (*shape, 3) pl.chunks = (*chunks, 3) pl.description += " (local-alleles)" @@ -1060,13 +1059,13 @@ def spec_from_field(field, array_name=None): def fixed_field_spec( name, dtype, - vcf_field=None, + source=None, shape=(m,), dimensions=("variants",), chunks=None, ): return vcz.ZarrArraySpec.new( - vcf_field=vcf_field, + source=source, name=name, dtype=dtype, shape=shape, @@ -1137,7 +1136,6 @@ def fixed_field_spec( dimensions = ["variants", "samples"] array_specs.append( vcz.ZarrArraySpec.new( - vcf_field=None, name="call_genotype_phased", dtype="bool", shape=list(shape), @@ -1151,7 +1149,6 @@ def fixed_field_spec( dimensions += ["ploidy"] array_specs.append( vcz.ZarrArraySpec.new( - vcf_field=None, name="call_genotype", dtype=gt_field.smallest_dtype(), shape=list(shape), @@ -1162,7 +1159,6 @@ def fixed_field_spec( ) array_specs.append( vcz.ZarrArraySpec.new( - vcf_field=None, name="call_genotype_mask", dtype="bool", shape=list(shape), diff --git a/bio2zarr/plink.py b/bio2zarr/plink.py index 17b78513..800aa820 100644 --- a/bio2zarr/plink.py +++ b/bio2zarr/plink.py @@ -83,7 +83,7 @@ def generate_schema( array_specs = [ vcz.ZarrArraySpec.new( - vcf_field="position", + source="position", name="variant_position", dtype="i4", shape=[m], @@ -92,7 +92,6 @@ def generate_schema( description=None, ), vcz.ZarrArraySpec.new( - vcf_field=None, name="variant_allele", dtype="O", shape=[m, 2], @@ -101,7 +100,6 @@ def generate_schema( description=None, ), vcz.ZarrArraySpec.new( - vcf_field=None, name="call_genotype_phased", dtype="bool", shape=[m, n], @@ -113,7 +111,6 @@ def generate_schema( description=None, ), vcz.ZarrArraySpec.new( - vcf_field=None, name="call_genotype", dtype="i1", shape=[m, n, 2], @@ -126,7 +123,6 @@ def generate_schema( description=None, ), vcz.ZarrArraySpec.new( - vcf_field=None, name="call_genotype_mask", dtype="bool", shape=[m, n, 2], diff --git a/bio2zarr/vcz.py b/bio2zarr/vcz.py index 12dbd2ff..5465cd86 100644 --- a/bio2zarr/vcz.py +++ b/bio2zarr/vcz.py @@ -93,9 +93,9 @@ class ZarrArraySpec: chunks: tuple dimensions: tuple description: str - vcf_field: str compressor: dict filters: list + source: str = None def __post_init__(self): if self.name in _fixed_field_descriptions: @@ -151,7 +151,7 @@ def from_field( else: dimensions.append(f"{vcf_field.category}_{vcf_field.name}_dim") return ZarrArraySpec.new( - vcf_field=vcf_field.full_name, + source=vcf_field.full_name, name=array_name, dtype=vcf_field.smallest_dtype(), shape=shape, @@ -465,7 +465,7 @@ def has_genotypes(self): def has_local_alleles(self): for field in self.schema.fields: - if field.name == "call_LA" and field.vcf_field is None: + if field.name == "call_LA" and field.source is None: return True return False @@ -667,7 +667,7 @@ def encode_partition(self, partition_index): self.encode_contig_partition(partition_index) self.encode_alleles_partition(partition_index) for array_spec in self.schema.fields: - if array_spec.vcf_field is not None: + if array_spec.source is not None: self.encode_array_partition(array_spec, partition_index) if self.has_genotypes(): self.encode_genotypes_partition(partition_index) @@ -711,7 +711,7 @@ def encode_array_partition(self, array_spec, partition_index): partition = self.metadata.partitions[partition_index] ba = self.init_partition_array(partition_index, array_spec.name) for value in self.source.iter_field( - array_spec.vcf_field, + array_spec.source, ba.buff.shape[1:], partition.start, partition.stop, @@ -783,7 +783,7 @@ def encode_local_allele_fields_partition(self, partition_index): for descriptor in localisable_fields: if descriptor.array_name not in field_map: continue - assert field_map[descriptor.array_name].vcf_field is None + assert field_map[descriptor.array_name].source is None buff = self.init_partition_array(partition_index, descriptor.array_name) source = self.source.fields[descriptor.vcf_field].iter_values( diff --git a/tests/test_vcz.py b/tests/test_vcz.py index 49f863a7..6d6f9e0b 100644 --- a/tests/test_vcz.py +++ b/tests/test_vcz.py @@ -340,7 +340,7 @@ def test_variant_contig(self, schema): "dimensions": ("variants",), "description": "An identifier from the reference genome or an " "angle-bracketed ID string pointing to a contig in the assembly file", - "vcf_field": None, + "source": None, "compressor": { "id": "blosc", "cname": "zstd", @@ -359,7 +359,7 @@ def test_call_genotype(self, schema): "chunks": (1000, 10000, 2), "dimensions": ("variants", "samples", "ploidy"), "description": "", - "vcf_field": None, + "source": None, "compressor": { "id": "blosc", "cname": "zstd", @@ -378,7 +378,7 @@ def test_call_genotype_mask(self, schema): "chunks": (1000, 10000, 2), "dimensions": ("variants", "samples", "ploidy"), "description": "", - "vcf_field": None, + "source": None, "compressor": { "id": "blosc", "cname": "zstd", @@ -397,7 +397,7 @@ def test_call_genotype_phased(self, schema): "chunks": (1000, 10000, 2), "dimensions": ("variants", "samples", "ploidy"), "description": "", - "vcf_field": None, + "source": None, "compressor": { "id": "blosc", "cname": "zstd", @@ -416,7 +416,7 @@ def test_call_GQ(self, schema): "chunks": (1000, 10000), "dimensions": ("variants", "samples"), "description": "Genotype Quality", - "vcf_field": "FORMAT/GQ", + "source": "FORMAT/GQ", "compressor": { "id": "blosc", "cname": "zstd", @@ -437,6 +437,7 @@ def test_differences(self, schema, local_alleles_schema): def test_call_LA(self, local_alleles_schema): d = get_field_dict(local_alleles_schema, "call_LA") assert d == { + "source": None, "name": "call_LA", "dtype": "i1", "shape": (9, 3, 2), @@ -446,7 +447,6 @@ def test_call_LA(self, local_alleles_schema): "0-based indices into REF+ALT, indicating which alleles" " are relevant (local) for the current sample" ), - "vcf_field": None, "compressor": { "id": "blosc", "cname": "zstd",