@@ -887,7 +887,6 @@ def inspect(if_path):
887
887
888
888
@dataclasses .dataclass
889
889
class ZarrColumnSpec :
890
- # TODO change to "variable_name"
891
890
name : str
892
891
dtype : str
893
892
shape : tuple
@@ -898,6 +897,11 @@ class ZarrColumnSpec:
898
897
compressor : dict
899
898
# TODO add filters
900
899
900
def __post_init__(self):
    """Normalise list-valued fields to tuples after dataclass init.

    JSON round-tripping turns tuples into lists; coercing back keeps
    instances comparable regardless of how they were constructed.
    """
    for attr in ("shape", "chunks", "dimensions"):
        setattr(self, attr, tuple(getattr(self, attr)))
901
905
902
906
@dataclasses .dataclass
903
907
class ZarrConversionSpec :
@@ -908,17 +912,24 @@ class ZarrConversionSpec:
908
912
contig_id : list
909
913
contig_length : list
910
914
filter_id : list
911
- variables : list
915
+ columns : dict
912
916
913
917
def asdict(self):
    """Return this spec as a plain (nested) dict via dataclasses.asdict."""
    return dataclasses.asdict(self)
915
919
920
def asjson(self):
    """Serialise this spec to a human-readable (4-space indented) JSON string."""
    return json.dumps(self.asdict(), indent=4)
916
923
@staticmethod
def fromdict(d):
    """Reconstruct a ZarrConversionSpec from a plain dict (inverse of asdict).

    The top-level dataclass is built directly from ``d``; the ``columns``
    entry is then rebuilt so its values are ZarrColumnSpec instances
    rather than the raw per-column dicts.
    """
    spec = ZarrConversionSpec(**d)
    spec.columns = {
        name: ZarrColumnSpec(**col) for name, col in d["columns"].items()
    }
    return spec
921
928
929
@staticmethod
def fromjson(s):
    """Parse a JSON string (as produced by asjson) back into a spec."""
    return ZarrConversionSpec.fromdict(json.loads(s))
922
933
@staticmethod
923
934
def generate (pcvcf , chunk_length = None , chunk_width = None ):
924
935
m = pcvcf .num_records
@@ -1070,7 +1081,7 @@ def fixed_field_spec(
1070
1081
return ZarrConversionSpec (
1071
1082
chunk_width = chunk_width ,
1072
1083
chunk_length = chunk_length ,
1073
- variables = colspecs ,
1084
+ columns = { col . name : col for col in colspecs } ,
1074
1085
dimensions = ["variants" , "samples" , "ploidy" , "alleles" , "filters" ],
1075
1086
sample_id = pcvcf .metadata .samples ,
1076
1087
contig_id = pcvcf .metadata .contig_names ,
@@ -1261,8 +1272,8 @@ def convert(
1261
1272
logger .info (f"Create zarr at { write_path } " )
1262
1273
sgvcf = SgvcfZarr (write_path )
1263
1274
sgvcf .root = zarr .group (store = store , overwrite = True )
1264
- for variable in conversion_spec .variables [:] :
1265
- sgvcf .create_array (variable )
1275
+ for column in conversion_spec .columns . values () :
1276
+ sgvcf .create_array (column )
1266
1277
1267
1278
progress_config = core .ProgressConfig (
1268
1279
total = pcvcf .total_uncompressed_bytes ,
@@ -1287,7 +1298,7 @@ def convert(
1287
1298
)
1288
1299
pwm .submit (sgvcf .encode_filters , pcvcf , conversion_spec .filter_id )
1289
1300
has_gt = False
1290
- for variable in conversion_spec .variables [:] :
1301
+ for variable in conversion_spec .columns . values () :
1291
1302
if variable .vcf_field is not None :
1292
1303
# print("Encode", variable.name)
1293
1304
# TODO for large columns it's probably worth splitting up
@@ -1309,10 +1320,10 @@ def convert(
1309
1320
os .rename (write_path , path )
1310
1321
1311
1322
1312
def mkschema(if_path, out):
    """Generate a Zarr conversion schema for the intermediate file at
    ``if_path`` and write it to ``out`` as indented JSON.

    ``out`` is any writable text stream (e.g. an open file or sys.stdout).
    """
    pcvcf = PickleChunkedVcf.load(if_path)
    out.write(ZarrConversionSpec.generate(pcvcf).asjson())
1316
1327
1317
1328
1318
1329
def to_zarr (
@@ -1323,8 +1334,7 @@ def to_zarr(
1323
1334
spec = ZarrConversionSpec .generate (pcvcf )
1324
1335
else :
1325
1336
with open (conversion_spec , "r" ) as f :
1326
- d = json .load (f )
1327
- spec = ZarrConversionSpec .fromdict (d )
1337
+ spec = ZarrConversionSpec .fromjson (f .read ())
1328
1338
SgvcfZarr .convert (
1329
1339
pcvcf ,
1330
1340
zarr_path ,
0 commit comments