Remove redundant dictionary in Schema format

jeromekelleher · jeromekelleher · commit b5ea5249b1bd · 2024-05-09T10:12:25.000+01:00
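Breaking change for ongoing encode operations: the Zarr schema format version changes from 0.3 to 0.4, and the schema's `fields` entry is now a list of field specs rather than a dict keyed by field name.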
diff --git a/bio2zarr/vcf.py b/bio2zarr/vcf.py
@@ -1311,6 +1311,7 @@ def __post_init__(self):
         self.shape = tuple(self.shape)
         self.chunks = tuple(self.chunks)
         self.dimensions = tuple(self.dimensions)
+        self.filters = tuple(self.filters)
 
     @staticmethod
     def new(**kwargs):
@@ -1404,7 +1405,7 @@ def variant_chunk_nbytes(self):
         return chunk_items * dt.itemsize
 
 
-ZARR_SCHEMA_FORMAT_VERSION = "0.3"
+ZARR_SCHEMA_FORMAT_VERSION = "0.4"
 
 
 @dataclasses.dataclass
@@ -1416,7 +1417,10 @@ class VcfZarrSchema:
     samples: list
     contigs: list
     filters: list
-    fields: dict
+    fields: list
+
+    def field_map(self):
+        return {field.name: field for field in self.fields}
 
     def asdict(self):
         return dataclasses.asdict(self)
@@ -1435,9 +1439,7 @@ def fromdict(d):
         ret.samples = [Sample(**sd) for sd in d["samples"]]
         ret.contigs = [Contig(**sd) for sd in d["contigs"]]
         ret.filters = [Filter(**sd) for sd in d["filters"]]
-        ret.fields = {
-            key: ZarrColumnSpec(**value) for key, value in d["fields"].items()
-        }
+        ret.fields = [ZarrColumnSpec(**sd) for sd in d["fields"]]
         return ret
 
     @staticmethod
@@ -1572,7 +1574,7 @@ def fixed_field_spec(
             format_version=ZARR_SCHEMA_FORMAT_VERSION,
             samples_chunk_size=samples_chunk_size,
             variants_chunk_size=variants_chunk_size,
-            fields={col.name: col for col in colspecs},
+            fields=colspecs,
             dimensions=["variants", "samples", "ploidy", "alleles", "filters"],
             samples=icf.metadata.samples,
             contigs=icf.metadata.contigs,
@@ -1701,6 +1703,12 @@ def schema(self):
     def num_partitions(self):
         return len(self.metadata.partitions)
 
+    def has_genotypes(self):
+        for field in self.schema.fields:
+            if field.name == "call_genotype":
+                return True
+        return False
+
     #######################
     # init
     #######################
@@ -1760,7 +1768,7 @@ def init(
         root = zarr.group(store=store)
 
         total_chunks = 0
-        for field in self.schema.fields.values():
+        for field in self.schema.fields:
             a = self.init_array(root, field, partitions[-1].stop)
             total_chunks += a.nchunks
 
@@ -1880,10 +1888,10 @@ def encode_partition(self, partition_index):
         self.encode_filters_partition(partition_index)
         self.encode_contig_partition(partition_index)
         self.encode_alleles_partition(partition_index)
-        for col in self.schema.fields.values():
+        for col in self.schema.fields:
             if col.vcf_field is not None:
                 self.encode_array_partition(col, partition_index)
-        if "call_genotype" in self.schema.fields:
+        if self.has_genotypes():
             self.encode_genotypes_partition(partition_index)
 
         final_path = self.partition_path(partition_index)
@@ -2100,8 +2108,8 @@ def finalise(self, show_progress=False):
         # for multiple workers, or making a standard wrapper for tqdm
         # that allows us to have a consistent look and feel.
         with core.ParallelWorkManager(0, progress_config) as pwm:
-            for name in self.schema.fields:
-                pwm.submit(self.finalise_array, name)
+            for field in self.schema.fields:
+                pwm.submit(self.finalise_array, field.name)
         logger.debug(f"Removing {self.wip_path}")
         shutil.rmtree(self.wip_path)
         logger.info("Consolidating Zarr metadata")
@@ -2116,17 +2124,14 @@ def get_max_encoding_memory(self):
         Return the approximate maximum memory used to encode a variant chunk.
         """
         max_encoding_mem = 0
-        for col in self.schema.fields.values():
+        for col in self.schema.fields:
             max_encoding_mem = max(max_encoding_mem, col.variant_chunk_nbytes)
         gt_mem = 0
-        if "call_genotype" in self.schema.fields:
-            encoded_together = [
-                "call_genotype",
-                "call_genotype_phased",
-                "call_genotype_mask",
-            ]
+        if self.has_genotypes():
             gt_mem = sum(
-                self.schema.fields[col].variant_chunk_nbytes for col in encoded_together
+                field.variant_chunk_nbytes
+                for field in self.schema.fields
+                if field.name.startswith("call_genotype")
             )
         return max(max_encoding_mem, gt_mem)
 
@@ -2158,7 +2163,7 @@ def encode_all_partitions(
         num_workers = min(max_num_workers, worker_processes)
 
         total_bytes = 0
-        for col in self.schema.fields.values():
+        for col in self.schema.fields:
             # Open the array definition to get the total size
             total_bytes += zarr.open(self.arrays_path / col.name).nbytes
 
diff --git a/tests/test_icf.py b/tests/test_icf.py
@@ -228,7 +228,7 @@ def schema(self, icf):
         ],
     )
     def test_info_schemas(self, schema, name, dtype, shape, dimensions):
-        v = schema.fields[name]
+        v = schema.field_map()[name]
         assert v.dtype == dtype
         assert tuple(v.shape) == shape
         assert v.dimensions == dimensions
diff --git a/tests/test_vcf.py b/tests/test_vcf.py
@@ -32,7 +32,7 @@ def schema_path(icf_path, tmp_path_factory):
 @pytest.fixture(scope="module")
 def schema(schema_path):
     with open(schema_path) as f:
-        return json.load(f)
+        return vcf.VcfZarrSchema.fromjson(f.read())
 
 
 @pytest.fixture(scope="module")
@@ -83,7 +83,7 @@ def test_not_enough_memory_for_two(
 class TestJsonVersions:
     @pytest.mark.parametrize("version", ["0.1", "1.0", "xxxxx", 0.2])
     def test_zarr_schema_mismatch(self, schema, version):
-        d = dict(schema)
+        d = schema.asdict()
         d["format_version"] = version
         with pytest.raises(ValueError, match="Zarr schema format version mismatch"):
             vcf.VcfZarrSchema.fromdict(d)
@@ -156,13 +156,13 @@ def test_generated_no_samples(self, icf_path):
     def test_generated_change_dtype(self, icf_path):
         icf = vcf.IntermediateColumnarFormat(icf_path)
         schema = vcf.VcfZarrSchema.generate(icf)
-        schema.fields["variant_position"].dtype = "i8"
+        schema.field_map()["variant_position"].dtype = "i8"
         self.assert_json_round_trip(schema)
 
     def test_generated_change_compressor(self, icf_path):
         icf = vcf.IntermediateColumnarFormat(icf_path)
         schema = vcf.VcfZarrSchema.generate(icf)
-        schema.fields["variant_position"].compressor = {"cname": "FAKE"}
+        schema.field_map()["variant_position"].compressor = {"cname": "FAKE"}
         self.assert_json_round_trip(schema)
 
 
@@ -174,7 +174,7 @@ def test_codec(self, tmp_path, icf_path, cname, clevel, shuffle):
         zarr_path = tmp_path / "zarr"
         icf = vcf.IntermediateColumnarFormat(icf_path)
         schema = vcf.VcfZarrSchema.generate(icf)
-        for var in schema.fields.values():
+        for var in schema.fields:
             var.compressor["cname"] = cname
             var.compressor["clevel"] = clevel
             var.compressor["shuffle"] = shuffle
@@ -183,7 +183,7 @@ def test_codec(self, tmp_path, icf_path, cname, clevel, shuffle):
             f.write(schema.asjson())
         vcf.encode(icf_path, zarr_path, schema_path=schema_path)
         root = zarr.open(zarr_path)
-        for var in schema.fields.values():
+        for var in schema.fields:
             a = root[var.name]
             assert a.compressor.cname == cname
             assert a.compressor.clevel == clevel
@@ -194,7 +194,7 @@ def test_genotype_dtype(self, tmp_path, icf_path, dtype):
         zarr_path = tmp_path / "zarr"
         icf = vcf.IntermediateColumnarFormat(icf_path)
         schema = vcf.VcfZarrSchema.generate(icf)
-        schema.fields["call_genotype"].dtype = dtype
+        schema.field_map()["call_genotype"].dtype = dtype
         schema_path = tmp_path / "schema"
         with open(schema_path, "w") as f:
             f.write(schema.asjson())
@@ -203,16 +203,23 @@ def test_genotype_dtype(self, tmp_path, icf_path, dtype):
         assert root["call_genotype"].dtype == dtype
 
 
+def get_field_dict(a_schema, name):
+    d = a_schema.asdict()
+    for field in d["fields"]:
+        if field["name"] == name:
+            return field
+
+
 class TestDefaultSchema:
     def test_format_version(self, schema):
-        assert schema["format_version"] == vcf.ZARR_SCHEMA_FORMAT_VERSION
+        assert schema.format_version == vcf.ZARR_SCHEMA_FORMAT_VERSION
 
     def test_chunk_size(self, schema):
-        assert schema["samples_chunk_size"] == 1000
-        assert schema["variants_chunk_size"] == 10000
+        assert schema.samples_chunk_size == 1000
+        assert schema.variants_chunk_size == 10000
 
     def test_dimensions(self, schema):
-        assert schema["dimensions"] == [
+        assert schema.dimensions == [
             "variants",
             "samples",
             "ploidy",
@@ -221,29 +228,29 @@ def test_dimensions(self, schema):
         ]
 
     def test_samples(self, schema):
-        assert schema["samples"] == [
+        assert schema.asdict()["samples"] == [
             {"id": s} for s in ["NA00001", "NA00002", "NA00003"]
         ]
 
     def test_contigs(self, schema):
-        assert schema["contigs"] == [
+        assert schema.asdict()["contigs"] == [
             {"id": s, "length": None} for s in ["19", "20", "X"]
         ]
 
     def test_filters(self, schema):
-        assert schema["filters"] == [
+        assert schema.asdict()["filters"] == [
             {"id": "PASS", "description": "All filters passed"},
             {"id": "s50", "description": "Less than 50% of samples have data"},
             {"id": "q10", "description": "Quality below 10"},
         ]
 
     def test_variant_contig(self, schema):
-        assert schema["fields"]["variant_contig"] == {
+        assert get_field_dict(schema, "variant_contig") == {
             "name": "variant_contig",
             "dtype": "i1",
-            "shape": [9],
-            "chunks": [10000],
-            "dimensions": ["variants"],
+            "shape": (9,),
+            "chunks": (10000,),
+            "dimensions": ("variants",),
             "description": "",
             "vcf_field": None,
             "compressor": {
@@ -253,16 +260,16 @@ def test_variant_contig(self, schema):
                 "shuffle": 0,
                 "blocksize": 0,
             },
-            "filters": [],
+            "filters": tuple(),
         }
 
     def test_call_genotype(self, schema):
-        assert schema["fields"]["call_genotype"] == {
+        assert get_field_dict(schema, "call_genotype") == {
             "name": "call_genotype",
             "dtype": "i1",
-            "shape": [9, 3, 2],
-            "chunks": [10000, 1000],
-            "dimensions": ["variants", "samples", "ploidy"],
+            "shape": (9, 3, 2),
+            "chunks": (10000, 1000),
+            "dimensions": ("variants", "samples", "ploidy"),
             "description": "",
             "vcf_field": None,
             "compressor": {
@@ -272,16 +279,16 @@ def test_call_genotype(self, schema):
                 "shuffle": 2,
                 "blocksize": 0,
             },
-            "filters": [],
+            "filters": tuple(),
         }
 
     def test_call_genotype_mask(self, schema):
-        assert schema["fields"]["call_genotype_mask"] == {
+        assert get_field_dict(schema, "call_genotype_mask") == {
             "name": "call_genotype_mask",
             "dtype": "bool",
-            "shape": [9, 3, 2],
-            "chunks": [10000, 1000],
-            "dimensions": ["variants", "samples", "ploidy"],
+            "shape": (9, 3, 2),
+            "chunks": (10000, 1000),
+            "dimensions": ("variants", "samples", "ploidy"),
             "description": "",
             "vcf_field": None,
             "compressor": {
@@ -291,16 +298,16 @@ def test_call_genotype_mask(self, schema):
                 "shuffle": 2,
                 "blocksize": 0,
             },
-            "filters": [],
+            "filters": tuple(),
         }
 
     def test_call_genotype_phased(self, schema):
-        assert schema["fields"]["call_genotype_mask"] == {
+        assert get_field_dict(schema, "call_genotype_mask") == {
             "name": "call_genotype_mask",
             "dtype": "bool",
-            "shape": [9, 3, 2],
-            "chunks": [10000, 1000],
-            "dimensions": ["variants", "samples", "ploidy"],
+            "shape": (9, 3, 2),
+            "chunks": (10000, 1000),
+            "dimensions": ("variants", "samples", "ploidy"),
             "description": "",
             "vcf_field": None,
             "compressor": {
@@ -310,16 +317,16 @@ def test_call_genotype_phased(self, schema):
                 "shuffle": 2,
                 "blocksize": 0,
             },
-            "filters": [],
+            "filters": tuple(),
         }
 
     def test_call_GQ(self, schema):
-        assert schema["fields"]["call_GQ"] == {
+        assert get_field_dict(schema, "call_GQ") == {
             "name": "call_GQ",
             "dtype": "i1",
-            "shape": [9, 3],
-            "chunks": [10000, 1000],
-            "dimensions": ["variants", "samples"],
+            "shape": (9, 3),
+            "chunks": (10000, 1000),
+            "dimensions": ("variants", "samples"),
             "description": "Genotype Quality",
             "vcf_field": "FORMAT/GQ",
             "compressor": {
@@ -329,7 +336,7 @@ def test_call_GQ(self, schema):
                 "shuffle": 0,
                 "blocksize": 0,
             },
-            "filters": [],
+            "filters": tuple(),
         }
 
 
@@ -379,7 +386,7 @@ class TestVcfDescriptions:
         ],
     )
     def test_fields(self, schema, field, description):
-        assert schema["fields"][field]["description"] == description
+        assert schema.field_map()[field].description == description
 
     # This information is not in the schema yet,
     # https://github.com/sgkit-dev/bio2zarr/issues/123

Original file line number	Diff line number	Diff line change
`@@ -228,7 +228,7 @@ def schema(self, icf):`
`228`	`228`	`],`
`229`	`229`	`)`
`230`	`230`	`def test_info_schemas(self, schema, name, dtype, shape, dimensions):`
`231`		`- v = schema.fields[name]`
	`231`	`+ v = schema.field_map()[name]`
`232`	`232`	`assert v.dtype == dtype`
`233`	`233`	`assert tuple(v.shape) == shape`
`234`	`234`	`assert v.dimensions == dimensions`