15
15
import numcodecs
16
16
import numpy as np
17
17
18
- from bio2zarr import schema
19
-
20
- from .. import constants , core , provenance , vcf_utils , writer
18
+ from .. import constants , core , provenance , schema , vcf_utils , writer
21
19
22
20
logger = logging .getLogger (__name__ )
23
21
@@ -1029,28 +1027,33 @@ def iter_genotypes(self, shape, start, stop):
1029
1027
def generate_schema (
1030
1028
self , variants_chunk_size = None , samples_chunk_size = None , local_alleles = None
1031
1029
):
1032
- # Import schema here to avoid circular import
1033
- from bio2zarr import schema
1034
-
1035
1030
m = self .num_records
1036
1031
n = self .num_samples
1037
- if samples_chunk_size is None :
1038
- samples_chunk_size = 10_000
1039
- if variants_chunk_size is None :
1040
- variants_chunk_size = 1000
1041
1032
if local_alleles is None :
1042
1033
local_alleles = False
1034
+
1035
+ schema_instance = schema .VcfZarrSchema (
1036
+ format_version = schema .ZARR_SCHEMA_FORMAT_VERSION ,
1037
+ samples_chunk_size = samples_chunk_size ,
1038
+ variants_chunk_size = variants_chunk_size ,
1039
+ fields = [],
1040
+ samples = self .metadata .samples ,
1041
+ contigs = self .metadata .contigs ,
1042
+ filters = self .metadata .filters ,
1043
+ )
1044
+
1043
1045
logger .info (
1044
- f"Generating schema with chunks={ variants_chunk_size , samples_chunk_size } "
1046
+ "Generating schema with chunks="
1047
+ f"{ schema_instance .variants_chunk_size , schema_instance .samples_chunk_size } "
1045
1048
)
1046
1049
1047
1050
def spec_from_field (field , array_name = None ):
1048
1051
return schema .ZarrArraySpec .from_field (
1049
1052
field ,
1050
1053
num_samples = n ,
1051
1054
num_variants = m ,
1052
- samples_chunk_size = samples_chunk_size ,
1053
- variants_chunk_size = variants_chunk_size ,
1055
+ samples_chunk_size = schema_instance . samples_chunk_size ,
1056
+ variants_chunk_size = schema_instance . variants_chunk_size ,
1054
1057
array_name = array_name ,
1055
1058
)
1056
1059
@@ -1069,7 +1072,7 @@ def fixed_field_spec(
1069
1072
shape = shape ,
1070
1073
description = "" ,
1071
1074
dimensions = dimensions ,
1072
- chunks = chunks or [variants_chunk_size ],
1075
+ chunks = chunks or [schema_instance . variants_chunk_size ],
1073
1076
)
1074
1077
1075
1078
alt_field = self .fields ["ALT" ]
@@ -1085,14 +1088,14 @@ def fixed_field_spec(
1085
1088
dtype = "bool" ,
1086
1089
shape = (m , self .metadata .num_filters ),
1087
1090
dimensions = ["variants" , "filters" ],
1088
- chunks = (variants_chunk_size , self .metadata .num_filters ),
1091
+ chunks = (schema_instance . variants_chunk_size , self .metadata .num_filters ),
1089
1092
),
1090
1093
fixed_field_spec (
1091
1094
name = "variant_allele" ,
1092
1095
dtype = "O" ,
1093
1096
shape = (m , max_alleles ),
1094
1097
dimensions = ["variants" , "alleles" ],
1095
- chunks = (variants_chunk_size , max_alleles ),
1098
+ chunks = (schema_instance . variants_chunk_size , max_alleles ),
1096
1099
),
1097
1100
fixed_field_spec (
1098
1101
name = "variant_id" ,
@@ -1127,7 +1130,10 @@ def fixed_field_spec(
1127
1130
if gt_field is not None and n > 0 :
1128
1131
ploidy = max (gt_field .summary .max_number - 1 , 1 )
1129
1132
shape = [m , n ]
1130
- chunks = [variants_chunk_size , samples_chunk_size ]
1133
+ chunks = [
1134
+ schema_instance .variants_chunk_size ,
1135
+ schema_instance .samples_chunk_size ,
1136
+ ]
1131
1137
dimensions = ["variants" , "samples" ]
1132
1138
array_specs .append (
1133
1139
schema .ZarrArraySpec .new (
@@ -1169,15 +1175,8 @@ def fixed_field_spec(
1169
1175
if local_alleles :
1170
1176
array_specs = convert_local_allele_field_types (array_specs )
1171
1177
1172
- return schema .VcfZarrSchema (
1173
- format_version = schema .ZARR_SCHEMA_FORMAT_VERSION ,
1174
- samples_chunk_size = samples_chunk_size ,
1175
- variants_chunk_size = variants_chunk_size ,
1176
- fields = array_specs ,
1177
- samples = self .metadata .samples ,
1178
- contigs = self .metadata .contigs ,
1179
- filters = self .metadata .filters ,
1180
- )
1178
+ schema_instance .fields = array_specs
1179
+ return schema_instance
1181
1180
1182
1181
1183
1182
@dataclasses .dataclass
0 commit comments