@@ -1057,40 +1057,41 @@ def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
1057
1057
def generate_schema (
1058
1058
self , variants_chunk_size = None , samples_chunk_size = None , local_alleles = None
1059
1059
):
1060
- m = self .num_records
1061
- n = self .num_samples
1062
1060
if local_alleles is None :
1063
1061
local_alleles = False
1064
1062
1065
1063
max_alleles = max (self .fields ["ALT" ].vcf_field .summary .max_number + 1 , 2 )
1066
- dimensions = {
1067
- "variants" : vcz .VcfZarrDimension (
1068
- size = m , chunk_size = variants_chunk_size or vcz .DEFAULT_VARIANT_CHUNK_SIZE
1069
- ),
1070
- "samples" : vcz .VcfZarrDimension (
1071
- size = n , chunk_size = samples_chunk_size or vcz .DEFAULT_SAMPLE_CHUNK_SIZE
1072
- ),
1073
- # ploidy and genotypes added conditionally below
1074
- "alleles" : vcz .VcfZarrDimension (size = max_alleles ),
1075
- "alt_alleles" : vcz .VcfZarrDimension (size = max_alleles - 1 ),
1076
- "filters" : vcz .VcfZarrDimension (size = self .metadata .num_filters ),
1077
- }
1078
1064
1079
1065
# Add ploidy and genotypes dimensions only when needed
1080
1066
max_genotypes = 0
1081
1067
for field in self .metadata .format_fields :
1082
1068
if field .vcf_number == "G" :
1083
1069
max_genotypes = max (max_genotypes , field .summary .max_number )
1070
+
1071
+ ploidy = None
1072
+ genotypes_size = None
1084
1073
if self .gt_field is not None :
1085
1074
ploidy = max (self .gt_field .summary .max_number - 1 , 1 )
1086
- dimensions ["ploidy" ] = vcz .VcfZarrDimension (size = ploidy )
1087
- max_genotypes = math .comb (max_alleles + ploidy - 1 , ploidy )
1088
- dimensions ["genotypes" ] = vcz .VcfZarrDimension (size = max_genotypes )
1075
+ # NOTE: it is unclear why the genotypes size is computed here, given that
1076
+ # at least one Number=G field must be present for it to be needed at all.
1077
+ genotypes_size = math .comb (max_alleles + ploidy - 1 , ploidy )
1078
+ # assert max_genotypes == genotypes_size
1089
1079
else :
1090
1080
if max_genotypes > 0 :
1091
1081
# there is no GT field, but there is at least one Number=G field,
1092
1082
# so we need to define the genotypes dimension
1093
- dimensions ["genotypes" ] = vcz .VcfZarrDimension (size = max_genotypes )
1083
+ genotypes_size = max_genotypes
1084
+
1085
+ dimensions = vcz .standard_dimensions (
1086
+ variants_size = self .num_records ,
1087
+ variants_chunk_size = variants_chunk_size ,
1088
+ samples_size = self .num_samples ,
1089
+ samples_chunk_size = samples_chunk_size ,
1090
+ alleles_size = max_alleles ,
1091
+ filters_size = self .metadata .num_filters ,
1092
+ ploidy_size = ploidy ,
1093
+ genotypes_size = genotypes_size ,
1094
+ )
1094
1095
1095
1096
schema_instance = vcz .VcfZarrSchema (
1096
1097
format_version = vcz .ZARR_SCHEMA_FORMAT_VERSION ,
@@ -1173,7 +1174,7 @@ def fixed_field_spec(name, dtype, source=None, dimensions=("variants",)):
1173
1174
continue
1174
1175
array_specs .append (spec_from_field (field ))
1175
1176
1176
- if self .gt_field is not None and n > 0 :
1177
+ if self .gt_field is not None and self . num_samples > 0 :
1177
1178
array_specs .append (
1178
1179
vcz .ZarrArraySpec (
1179
1180
name = "call_genotype_phased" ,
0 commit comments