@@ -111,9 +111,6 @@ def full_name(self):
111111 return self .name
112112 return f"{ self .category } /{ self .name } "
113113
114- # TODO add method here to choose a good set compressor and
115- # filters default here for this field.
116-
117114 def smallest_dtype (self ):
118115 """
119116 Returns the smallest dtype suitable for this field based
@@ -123,13 +120,13 @@ def smallest_dtype(self):
123120 if self .vcf_type == "Float" :
124121 ret = "f4"
125122 elif self .vcf_type == "Integer" :
126- dtype = "i4"
127- for a_dtype in [ "i1" , "i2" ]:
128- info = np . iinfo ( a_dtype )
129- if info . min <= s . min_value and s . max_value <= info . max :
130- dtype = a_dtype
131- break
132- ret = dtype
123+ if not math . isfinite ( s . max_value ):
124+ # All missing values; use i1. Note we should have some API to
125+ # check more explicitly for missingness:
126+ # https://github.com/sgkit-dev/bio2zarr/issues/131
127+ ret = "i1"
128+ else :
129+ ret = core . min_int_dtype ( s . min_value , s . max_value )
133130 elif self .vcf_type == "Flag" :
134131 ret = "bool"
135132 elif self .vcf_type == "Character" :
@@ -152,6 +149,10 @@ class VcfPartition:
152149 cname = "zstd" , clevel = 7 , shuffle = numcodecs .Blosc .NOSHUFFLE
153150)
154151
152+ # TODO refactor this to have embedded Contig dataclass, Filters
153+ # and Samples dataclasses to allow for more information to be
154+ # retained and forward compatibility.
155+
155156
156157@dataclasses .dataclass
157158class IcfMetadata :
@@ -183,6 +184,14 @@ def format_fields(self):
183184 fields .append (field )
184185 return fields
185186
187+ @property
188+ def num_contigs (self ):
189+ return len (self .contig_names )
190+
191+ @property
192+ def num_filters (self ):
193+ return len (self .filters )
194+
186195 @property
187196 def num_records (self ):
188197 return sum (self .contig_record_counts .values ())
@@ -1242,6 +1251,50 @@ def new(**kwargs):
12421251 spec ._choose_compressor_settings ()
12431252 return spec
12441253
1254+ @staticmethod
1255+ def from_field (
1256+ vcf_field ,
1257+ * ,
1258+ num_variants ,
1259+ num_samples ,
1260+ variants_chunk_size ,
1261+ samples_chunk_size ,
1262+ variable_name = None ,
1263+ ):
1264+ shape = [num_variants ]
1265+ prefix = "variant_"
1266+ dimensions = ["variants" ]
1267+ chunks = [variants_chunk_size ]
1268+ if vcf_field .category == "FORMAT" :
1269+ prefix = "call_"
1270+ shape .append (num_samples )
1271+ chunks .append (samples_chunk_size )
1272+ dimensions .append ("samples" )
1273+ if variable_name is None :
1274+ variable_name = prefix + vcf_field .name
1275+ # TODO make an option to add in the empty extra dimension
1276+ if vcf_field .summary .max_number > 1 :
1277+ shape .append (vcf_field .summary .max_number )
1278+ # TODO we should really be checking this to see if the named dimensions
1279+ # are actually correct.
1280+ if vcf_field .vcf_number == "R" :
1281+ dimensions .append ("alleles" )
1282+ elif vcf_field .vcf_number == "A" :
1283+ dimensions .append ("alt_alleles" )
1284+ elif vcf_field .vcf_number == "G" :
1285+ dimensions .append ("genotypes" )
1286+ else :
1287+ dimensions .append (f"{ vcf_field .category } _{ vcf_field .name } _dim" )
1288+ return ZarrColumnSpec .new (
1289+ vcf_field = vcf_field .full_name ,
1290+ name = variable_name ,
1291+ dtype = vcf_field .smallest_dtype (),
1292+ shape = shape ,
1293+ chunks = chunks ,
1294+ dimensions = dimensions ,
1295+ description = vcf_field .description ,
1296+ )
1297+
12451298 def _choose_compressor_settings (self ):
12461299 """
12471300 Choose compressor and filter settings based on the size and
@@ -1250,15 +1303,19 @@ def _choose_compressor_settings(self):
12501303
12511304 See https://github.com/pystatgen/bio2zarr/discussions/74
12521305 """
1253- dt = np .dtype (self .dtype )
12541306 # Default is to not shuffle, because autoshuffle isn't recognised
12551307 # by many Zarr implementations, and shuffling can lead to worse
12561308 # performance in some cases anyway. Turning on shuffle should be a
12571309 # deliberate choice.
12581310 shuffle = numcodecs .Blosc .NOSHUFFLE
1259- if dt .itemsize == 1 :
1260- # Any 1 byte field gets BITSHUFFLE by default
1311+ if self .name == "call_genotype" and self .dtype == "i1" :
1312+ # call_genotype gets BITSHUFFLE by default as it gets
1313+ # significantly better compression (at a cost of slower
1314+ # decoding)
1315+ shuffle = numcodecs .Blosc .BITSHUFFLE
1316+ elif self .dtype == "bool" :
12611317 shuffle = numcodecs .Blosc .BITSHUFFLE
1318+
12621319 self .compressor ["shuffle" ] = shuffle
12631320
12641321
@@ -1313,6 +1370,16 @@ def generate(icf, variants_chunk_size=None, samples_chunk_size=None):
13131370 f"Generating schema with chunks={ variants_chunk_size , samples_chunk_size } "
13141371 )
13151372
1373+ def spec_from_field (field , variable_name = None ):
1374+ return ZarrColumnSpec .from_field (
1375+ field ,
1376+ num_samples = n ,
1377+ num_variants = m ,
1378+ samples_chunk_size = samples_chunk_size ,
1379+ variants_chunk_size = variants_chunk_size ,
1380+ variable_name = variable_name ,
1381+ )
1382+
13161383 def fixed_field_spec (
13171384 name , dtype , vcf_field = None , shape = (m ,), dimensions = ("variants" ,)
13181385 ):
@@ -1328,95 +1395,56 @@ def fixed_field_spec(
13281395
13291396 alt_col = icf .columns ["ALT" ]
13301397 max_alleles = alt_col .vcf_field .summary .max_number + 1
1331- num_filters = len (icf .metadata .filters )
13321398
1333- # # FIXME get dtype from lookup table
13341399 colspecs = [
13351400 fixed_field_spec (
13361401 name = "variant_contig" ,
1337- dtype = "i2" , # FIXME
1402+ dtype = core . min_int_dtype ( 0 , icf . metadata . num_contigs ),
13381403 ),
13391404 fixed_field_spec (
13401405 name = "variant_filter" ,
13411406 dtype = "bool" ,
1342- shape = (m , num_filters ),
1407+ shape = (m , icf . metadata . num_filters ),
13431408 dimensions = ["variants" , "filters" ],
13441409 ),
13451410 fixed_field_spec (
13461411 name = "variant_allele" ,
13471412 dtype = "str" ,
1348- shape = [ m , max_alleles ] ,
1413+ shape = ( m , max_alleles ) ,
13491414 dimensions = ["variants" , "alleles" ],
13501415 ),
13511416 fixed_field_spec (
1352- vcf_field = "POS" ,
1353- name = "variant_position" ,
1354- dtype = "i4" ,
1355- ),
1356- fixed_field_spec (
1357- vcf_field = None ,
13581417 name = "variant_id" ,
13591418 dtype = "str" ,
13601419 ),
13611420 fixed_field_spec (
1362- vcf_field = None ,
13631421 name = "variant_id_mask" ,
13641422 dtype = "bool" ,
13651423 ),
1366- fixed_field_spec (
1367- vcf_field = "QUAL" ,
1368- name = "variant_quality" ,
1369- dtype = "f4" ,
1370- ),
13711424 ]
1425+ name_map = {field .full_name : field for field in icf .metadata .fields }
1426+
1427+ # Only two of the fixed fields have a direct one-to-one mapping.
1428+ colspecs .extend (
1429+ [
1430+ spec_from_field (name_map ["QUAL" ], variable_name = "variant_quality" ),
1431+ spec_from_field (name_map ["POS" ], variable_name = "variant_position" ),
1432+ ]
1433+ )
1434+ colspecs .extend ([spec_from_field (field ) for field in icf .metadata .info_fields ])
13721435
13731436 gt_field = None
1374- for field in icf .metadata .fields :
1375- if field .category == "fixed" :
1376- continue
1437+ for field in icf .metadata .format_fields :
13771438 if field .name == "GT" :
13781439 gt_field = field
13791440 continue
1380- shape = [m ]
1381- prefix = "variant_"
1382- dimensions = ["variants" ]
1383- chunks = [variants_chunk_size ]
1384- if field .category == "FORMAT" :
1385- prefix = "call_"
1386- shape .append (n )
1387- chunks .append (samples_chunk_size )
1388- dimensions .append ("samples" )
1389- # TODO make an option to add in the empty extra dimension
1390- if field .summary .max_number > 1 :
1391- shape .append (field .summary .max_number )
1392- # TODO we should really be checking this to see if the named dimensions
1393- # are actually correct.
1394- if field .vcf_number == "R" :
1395- dimensions .append ("alleles" )
1396- elif field .vcf_number == "A" :
1397- dimensions .append ("alt_alleles" )
1398- elif field .vcf_number == "G" :
1399- dimensions .append ("genotypes" )
1400- else :
1401- dimensions .append (f"{ field .category } _{ field .name } _dim" )
1402- variable_name = prefix + field .name
1403- colspec = ZarrColumnSpec .new (
1404- vcf_field = field .full_name ,
1405- name = variable_name ,
1406- dtype = field .smallest_dtype (),
1407- shape = shape ,
1408- chunks = chunks ,
1409- dimensions = dimensions ,
1410- description = field .description ,
1411- )
1412- colspecs .append (colspec )
1441+ colspecs .append (spec_from_field (field ))
14131442
14141443 if gt_field is not None :
14151444 ploidy = gt_field .summary .max_number - 1
14161445 shape = [m , n ]
14171446 chunks = [variants_chunk_size , samples_chunk_size ]
14181447 dimensions = ["variants" , "samples" ]
1419-
14201448 colspecs .append (
14211449 ZarrColumnSpec .new (
14221450 vcf_field = None ,
0 commit comments