@@ -902,6 +902,63 @@ def finalise(self, show_progress=False):
902902 logger .info ("Consolidating Zarr metadata" )
903903 zarr .consolidate_metadata (self .path )
904904
905+ #######################
906+ # index
907+ #######################
908+
909+ def create_index (self ):
910+ """Create an index to support efficient region queries."""
911+
912+ store = zarr .DirectoryStore (self .path )
913+ root = zarr .open_group (store = store , mode = "r+" )
914+
915+ contig = root ["variant_contig" ]
916+ pos = root ["variant_position" ]
917+ length = root ["variant_length" ]
918+
919+ assert contig .cdata_shape == pos .cdata_shape
920+
921+ index = []
922+
923+ logger .info ("Creating region index" )
924+ for v_chunk in range (pos .cdata_shape [0 ]):
925+ c = contig .blocks [v_chunk ]
926+ p = pos .blocks [v_chunk ]
927+ e = p + length .blocks [v_chunk ] - 1
928+
929+ # create a row for each contig in the chunk
930+ d = np .diff (c , append = - 1 )
931+ c_start_idx = 0
932+ for c_end_idx in np .nonzero (d )[0 ]:
933+ assert c [c_start_idx ] == c [c_end_idx ]
934+ index .append (
935+ (
936+ v_chunk , # chunk index
937+ c [c_start_idx ], # contig ID
938+ p [c_start_idx ], # start
939+ p [c_end_idx ], # end
940+ np .max (e [c_start_idx : c_end_idx + 1 ]), # max end
941+ c_end_idx - c_start_idx + 1 , # num records
942+ )
943+ )
944+ c_start_idx = c_end_idx + 1
945+
946+ index = np .array (index , dtype = np .int32 )
947+ array = root .array (
948+ "region_index" ,
949+ data = index ,
950+ shape = index .shape ,
951+ dtype = index .dtype ,
952+ compressor = numcodecs .Blosc ("zstd" , clevel = 9 , shuffle = 0 ),
953+ )
954+ array .attrs ["_ARRAY_DIMENSIONS" ] = [
955+ "region_index_values" ,
956+ "region_index_fields" ,
957+ ]
958+
959+ logger .info ("Consolidating Zarr metadata" )
960+ zarr .consolidate_metadata (self .path )
961+
905962 ######################
906963 # encode_all_partitions
907964 ######################
@@ -1004,6 +1061,7 @@ def encode(
10041061 max_memory = max_memory ,
10051062 )
10061063 vzw .finalise (show_progress )
1064+ vzw .create_index ()
10071065
10081066
10091067def encode_init (
0 commit comments