@@ -844,23 +844,36 @@ def encode_partition(self, partition_index):
844
844
os .rename (partition_path , final_path )
845
845
846
846
def init_partition_array(self, partition_index, name):
    """Prepare the work-in-progress array for one field of one partition.

    The array definition stored under ``self.arrays_path`` is copied to
    the partition's WIP location, opened with zarr in mode ``"a"`` and
    wrapped in a ``core.BufferedArray`` positioned at the partition's
    ``start``.

    ``name`` is looked up in ``self.schema.field_map()`` first, so a name
    missing from the schema fails before anything is copied (presumably
    with a ``KeyError`` -- confirm against the type ``field_map``
    returns).

    Returns the ``core.BufferedArray`` wrapping the opened array.
    """
    spec = self.schema.field_map()[name]
    # Create an empty array like the definition
    definition_path = self.arrays_path / spec.name
    # Overwrite any existing WIP files
    wip_path = self.wip_partition_array_path(partition_index, spec.name)
    shutil.copytree(definition_path, wip_path, dirs_exist_ok=True)
    array = zarr.open_array(store=wip_path, mode="a")
    start = self.metadata.partitions[partition_index].start
    buffered = core.BufferedArray(array, start, name)
    logger.info(
        f"Start partition {partition_index} array {name} <{array.dtype}> "
        f"{array.shape} @ {wip_path}"
    )
    return buffered
862
+
863
def finalise_partition_array(self, partition_index, buffered_array):
    """Flush a partition's buffered array and log its completion.

    ``buffered_array`` is the object returned by
    ``init_partition_array``; its ``flush()`` method is called and its
    ``name`` attribute is used in the log message. ``partition_index``
    is only used for logging.
    """
    buffered_array.flush()
    logger.info(
        f"Completed partition {partition_index} array {buffered_array.name}"
    )
858
873
859
874
def encode_array_partition (self , array_spec , partition_index ):
860
- array = self .init_partition_array (partition_index , array_spec .name )
861
-
862
875
partition = self .metadata .partitions [partition_index ]
863
- ba = core . BufferedArray ( array , partition . start )
876
+ ba = self . init_partition_array ( partition_index , array_spec . name )
864
877
source_field = self .icf .fields [array_spec .vcf_field ]
865
878
sanitiser = source_field .sanitiser_factory (ba .buff .shape )
866
879
@@ -869,20 +882,16 @@ def encode_array_partition(self, array_spec, partition_index):
869
882
# to make it easier to reason about dimension padding
870
883
j = ba .next_buffer_row ()
871
884
sanitiser (ba .buff , j , value )
872
- ba .flush ()
873
- self .finalise_partition_array (partition_index , array_spec .name )
885
+ self .finalise_partition_array (partition_index , ba )
874
886
875
887
def encode_genotypes_partition (self , partition_index ):
876
- gt_array = self . init_partition_array ( partition_index , "call_genotype" )
877
- gt_mask_array = self . init_partition_array ( partition_index , "call_genotype_mask" )
878
- gt_phased_array = self .init_partition_array (
879
- partition_index , "call_genotype_phased"
880
- )
888
+ # FIXME we should be doing these one at a time, reading back in the genotypes
889
+ # like we do for local alleles
890
+ gt = self .init_partition_array (partition_index , "call_genotype" )
891
+ gt_mask = self . init_partition_array ( partition_index , "call_genotype_mask" )
892
+ gt_phased = self . init_partition_array ( partition_index , "call_genotype_phased" )
881
893
882
894
partition = self .metadata .partitions [partition_index ]
883
- gt = core .BufferedArray (gt_array , partition .start )
884
- gt_mask = core .BufferedArray (gt_mask_array , partition .start )
885
- gt_phased = core .BufferedArray (gt_phased_array , partition .start )
886
895
887
896
source_field = self .icf .fields ["FORMAT/GT" ]
888
897
for value in source_field .iter_values (partition .start , partition .stop ):
@@ -898,18 +907,14 @@ def encode_genotypes_partition(self, partition_index):
898
907
# with mixed ploidies?
899
908
j = gt_mask .next_buffer_row ()
900
909
gt_mask .buff [j ] = gt .buff [j ] < 0
901
- gt .flush ()
902
- gt_phased .flush ()
903
- gt_mask .flush ()
904
910
905
- self .finalise_partition_array (partition_index , "call_genotype" )
906
- self .finalise_partition_array (partition_index , "call_genotype_mask" )
907
- self .finalise_partition_array (partition_index , "call_genotype_phased" )
911
+ self .finalise_partition_array (partition_index , gt )
912
+ self .finalise_partition_array (partition_index , gt_phased )
913
+ self .finalise_partition_array (partition_index , gt_mask )
908
914
909
915
def encode_local_alleles_partition (self , partition_index ):
910
916
partition = self .metadata .partitions [partition_index ]
911
- call_LA_array = self .init_partition_array (partition_index , "call_LA" )
912
- call_LA = core .BufferedArray (call_LA_array , partition .start )
917
+ call_LA = self .init_partition_array (partition_index , "call_LA" )
913
918
914
919
gt_array = zarr .open_array (
915
920
store = self .wip_partition_array_path (partition_index , "call_genotype" ),
@@ -921,26 +926,23 @@ def encode_local_alleles_partition(self, partition_index):
921
926
la = compute_la_field (genotypes )
922
927
j = call_LA .next_buffer_row ()
923
928
call_LA .buff [j ] = la
924
-
925
- call_LA .flush ()
926
- self .finalise_partition_array (partition_index , "call_LA" )
929
+ self .finalise_partition_array (partition_index , call_LA )
927
930
928
931
def encode_local_allele_fields_partition (self , partition_index ):
929
932
partition = self .metadata .partitions [partition_index ]
930
933
la_array = zarr .open_array (
931
934
store = self .wip_partition_array_path (partition_index , "call_LA" ),
932
935
mode = "r" ,
933
936
)
934
- field_map = self .schema .field_map ()
935
937
# We go through the localisable fields one-by-one so that we don't need to
936
938
# keep several large arrays in memory at once for each partition.
939
+ field_map = self .schema .field_map ()
937
940
for descriptor in localisable_fields :
938
941
if descriptor .array_name not in field_map :
939
942
continue
940
943
assert field_map [descriptor .array_name ].vcf_field is None
941
944
942
- array = self .init_partition_array (partition_index , descriptor .array_name )
943
- buff = core .BufferedArray (array , partition .start )
945
+ buff = self .init_partition_array (partition_index , descriptor .array_name )
944
946
source = self .icf .fields [descriptor .vcf_field ].iter_values (
945
947
partition .start , partition .stop
946
948
)
@@ -951,14 +953,11 @@ def encode_local_allele_fields_partition(self, partition_index):
951
953
value = descriptor .sanitise (raw_value , 2 , raw_value .dtype )
952
954
j = buff .next_buffer_row ()
953
955
buff .buff [j ] = descriptor .convert (value , la )
954
- buff .flush ()
955
- self .finalise_partition_array (partition_index , "array_name" )
956
+ self .finalise_partition_array (partition_index , buff )
956
957
957
958
def encode_alleles_partition (self , partition_index ):
958
- array_name = "variant_allele"
959
- alleles_array = self .init_partition_array (partition_index , array_name )
959
+ alleles = self .init_partition_array (partition_index , "variant_allele" )
960
960
partition = self .metadata .partitions [partition_index ]
961
- alleles = core .BufferedArray (alleles_array , partition .start )
962
961
ref_field = self .icf .fields ["REF" ]
963
962
alt_field = self .icf .fields ["ALT" ]
964
963
@@ -970,16 +969,12 @@ def encode_alleles_partition(self, partition_index):
970
969
alleles .buff [j , :] = constants .STR_FILL
971
970
alleles .buff [j , 0 ] = ref [0 ]
972
971
alleles .buff [j , 1 : 1 + len (alt )] = alt
973
- alleles .flush ()
974
-
975
- self .finalise_partition_array (partition_index , array_name )
972
+ self .finalise_partition_array (partition_index , alleles )
976
973
977
974
def encode_id_partition (self , partition_index ):
978
- vid_array = self .init_partition_array (partition_index , "variant_id" )
979
- vid_mask_array = self .init_partition_array (partition_index , "variant_id_mask" )
975
+ vid = self .init_partition_array (partition_index , "variant_id" )
976
+ vid_mask = self .init_partition_array (partition_index , "variant_id_mask" )
980
977
partition = self .metadata .partitions [partition_index ]
981
- vid = core .BufferedArray (vid_array , partition .start )
982
- vid_mask = core .BufferedArray (vid_mask_array , partition .start )
983
978
field = self .icf .fields ["ID" ]
984
979
985
980
for value in field .iter_values (partition .start , partition .stop ):
@@ -992,18 +987,14 @@ def encode_id_partition(self, partition_index):
992
987
else :
993
988
vid .buff [j ] = constants .STR_MISSING
994
989
vid_mask .buff [j ] = True
995
- vid .flush ()
996
- vid_mask .flush ()
997
990
998
- self .finalise_partition_array (partition_index , "variant_id" )
999
- self .finalise_partition_array (partition_index , "variant_id_mask" )
991
+ self .finalise_partition_array (partition_index , vid )
992
+ self .finalise_partition_array (partition_index , vid_mask )
1000
993
1001
994
def encode_filters_partition (self , partition_index ):
1002
995
lookup = {filt .id : index for index , filt in enumerate (self .schema .filters )}
1003
- array_name = "variant_filter"
1004
- array = self .init_partition_array (partition_index , array_name )
996
+ var_filter = self .init_partition_array (partition_index , "variant_filter" )
1005
997
partition = self .metadata .partitions [partition_index ]
1006
- var_filter = core .BufferedArray (array , partition .start )
1007
998
1008
999
field = self .icf .fields ["FILTERS" ]
1009
1000
for value in field .iter_values (partition .start , partition .stop ):
@@ -1016,16 +1007,13 @@ def encode_filters_partition(self, partition_index):
1016
1007
raise ValueError (
1017
1008
f"Filter '{ f } ' was not defined in the header."
1018
1009
) from None
1019
- var_filter .flush ()
1020
1010
1021
- self .finalise_partition_array (partition_index , array_name )
1011
+ self .finalise_partition_array (partition_index , var_filter )
1022
1012
1023
1013
def encode_contig_partition (self , partition_index ):
1024
1014
lookup = {contig .id : index for index , contig in enumerate (self .schema .contigs )}
1025
- array_name = "variant_contig"
1026
- array = self .init_partition_array (partition_index , array_name )
1015
+ contig = self .init_partition_array (partition_index , "variant_contig" )
1027
1016
partition = self .metadata .partitions [partition_index ]
1028
- contig = core .BufferedArray (array , partition .start )
1029
1017
field = self .icf .fields ["CHROM" ]
1030
1018
1031
1019
for value in field .iter_values (partition .start , partition .stop ):
@@ -1035,9 +1023,8 @@ def encode_contig_partition(self, partition_index):
1035
1023
# will always succeed. However, if anyone ever does hit a KeyError
1036
1024
# here, please do open an issue with a reproducible example!
1037
1025
contig .buff [j ] = lookup [value [0 ]]
1038
- contig .flush ()
1039
1026
1040
- self .finalise_partition_array (partition_index , array_name )
1027
+ self .finalise_partition_array (partition_index , contig )
1041
1028
1042
1029
#######################
1043
1030
# finalise
0 commit comments