12
12
import numcodecs
13
13
import numpy as np
14
14
import zarr
15
+ from packaging .version import Version
15
16
16
17
from .. import constants , core , provenance
17
18
from . import icf
@@ -470,6 +471,17 @@ class VcfZarrWriteSummary(core.JsonDataclass):
470
471
max_encoding_memory : str
471
472
472
473
474
def _zarr_v3() -> bool:
    """Return True if the installed zarr-python has major version 3 or greater."""
    installed = Version(zarr.__version__)
    return installed.major >= 3
476
+
477
+
478
# Keyword arguments forwarded to the zarr open/create calls in this module.
# Use zarr format v2 even when running with zarr-python v3; on older
# zarr-python versions no zarr_format argument is passed at all.
ZARR_FORMAT_KWARGS = {"zarr_format": 2} if _zarr_v3() else {}
483
+
484
+
473
485
class VcfZarrWriter :
474
486
def __init__ (self , path ):
475
487
self .path = pathlib .Path (path )
@@ -532,7 +544,7 @@ def init(
532
544
)
533
545
534
546
self .path .mkdir ()
535
- root = zarr .open (store = self .path , mode = "a" )
547
+ root = zarr .open (store = self .path , mode = "a" , ** ZARR_FORMAT_KWARGS )
536
548
root .attrs .update (
537
549
{
538
550
"vcf_zarr_version" : "0.2" ,
@@ -548,7 +560,7 @@ def init(
548
560
self .wip_path .mkdir ()
549
561
self .arrays_path .mkdir ()
550
562
self .partitions_path .mkdir ()
551
- root = zarr .open (store = self .arrays_path , mode = "a" )
563
+ root = zarr .open (store = self .arrays_path , mode = "a" , ** ZARR_FORMAT_KWARGS )
552
564
553
565
total_chunks = 0
554
566
for field in self .schema .fields :
@@ -572,7 +584,8 @@ def encode_samples(self, root):
572
584
raise ValueError ("Subsetting or reordering samples not supported currently" )
573
585
array = root .array (
574
586
"sample_id" ,
575
- [sample .id for sample in self .schema .samples ],
587
+ data = [sample .id for sample in self .schema .samples ],
588
+ shape = len (self .schema .samples ),
576
589
dtype = "str" ,
577
590
compressor = DEFAULT_ZARR_COMPRESSOR ,
578
591
chunks = (self .schema .samples_chunk_size ,),
@@ -583,15 +596,17 @@ def encode_samples(self, root):
583
596
def encode_contig_id (self , root ):
584
597
array = root .array (
585
598
"contig_id" ,
586
- [contig .id for contig in self .schema .contigs ],
599
+ data = [contig .id for contig in self .schema .contigs ],
600
+ shape = len (self .schema .contigs ),
587
601
dtype = "str" ,
588
602
compressor = DEFAULT_ZARR_COMPRESSOR ,
589
603
)
590
604
array .attrs ["_ARRAY_DIMENSIONS" ] = ["contigs" ]
591
605
if all (contig .length is not None for contig in self .schema .contigs ):
592
606
array = root .array (
593
607
"contig_length" ,
594
- [contig .length for contig in self .schema .contigs ],
608
+ data = [contig .length for contig in self .schema .contigs ],
609
+ shape = len (self .schema .contigs ),
595
610
dtype = np .int64 ,
596
611
compressor = DEFAULT_ZARR_COMPRESSOR ,
597
612
)
@@ -602,7 +617,8 @@ def encode_filter_id(self, root):
602
617
# https://github.com/sgkit-dev/vcf-zarr-spec/issues/19
603
618
array = root .array (
604
619
"filter_id" ,
605
- [filt .id for filt in self .schema .filters ],
620
+ data = [filt .id for filt in self .schema .filters ],
621
+ shape = len (self .schema .filters ),
606
622
dtype = "str" ,
607
623
compressor = DEFAULT_ZARR_COMPRESSOR ,
608
624
)
@@ -616,14 +632,15 @@ def init_array(self, root, array_spec, variants_dim_size):
616
632
# Truncate the variants dimension if max_variant_chunks was specified
617
633
shape [0 ] = variants_dim_size
618
634
a = root .empty (
619
- array_spec .name ,
635
+ name = array_spec .name ,
620
636
shape = shape ,
621
637
chunks = array_spec .chunks ,
622
638
dtype = array_spec .dtype ,
623
639
compressor = numcodecs .get_codec (array_spec .compressor ),
624
640
filters = [numcodecs .get_codec (filt ) for filt in array_spec .filters ],
625
641
object_codec = object_codec ,
626
642
dimension_separator = self .metadata .dimension_separator ,
643
+ ** ZARR_FORMAT_KWARGS ,
627
644
)
628
645
a .attrs .update (
629
646
{
@@ -688,7 +705,11 @@ def init_partition_array(self, partition_index, name):
688
705
# Overwrite any existing WIP files
689
706
wip_path = self .wip_partition_array_path (partition_index , name )
690
707
shutil .copytree (src , wip_path , dirs_exist_ok = True )
691
- wip_root = zarr .open (store = self .wip_partition_path (partition_index ), mode = "a" )
708
+ wip_root = zarr .open (
709
+ store = self .wip_partition_path (partition_index ),
710
+ mode = "a" ,
711
+ ** ZARR_FORMAT_KWARGS ,
712
+ )
692
713
array = wip_root [name ]
693
714
logger .debug (f"Opened empty array { array .name } <{ array .dtype } > @ { wip_path } " )
694
715
return array
0 commit comments