Skip to content

Commit b4e16e0

Browse files
tomwhitejeromekelleher
authored andcommitted
Create an index to support efficient region queries
1 parent ab4c51b commit b4e16e0

File tree

1 file changed

+58
-0
lines changed

1 file changed

+58
-0
lines changed

bio2zarr/vcf2zarr/vcz.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -902,6 +902,63 @@ def finalise(self, show_progress=False):
902902
logger.info("Consolidating Zarr metadata")
903903
zarr.consolidate_metadata(self.path)
904904

905+
#######################
906+
# index
907+
#######################
908+
909+
def create_index(self):
910+
"""Create an index to support efficient region queries."""
911+
912+
store = zarr.DirectoryStore(self.path)
913+
root = zarr.open_group(store=store, mode="r+")
914+
915+
contig = root["variant_contig"]
916+
pos = root["variant_position"]
917+
length = root["variant_length"]
918+
919+
assert contig.cdata_shape == pos.cdata_shape
920+
921+
index = []
922+
923+
logger.info("Creating region index")
924+
for v_chunk in range(pos.cdata_shape[0]):
925+
c = contig.blocks[v_chunk]
926+
p = pos.blocks[v_chunk]
927+
e = p + length.blocks[v_chunk] - 1
928+
929+
# create a row for each contig in the chunk
930+
d = np.diff(c, append=-1)
931+
c_start_idx = 0
932+
for c_end_idx in np.nonzero(d)[0]:
933+
assert c[c_start_idx] == c[c_end_idx]
934+
index.append(
935+
(
936+
v_chunk, # chunk index
937+
c[c_start_idx], # contig ID
938+
p[c_start_idx], # start
939+
p[c_end_idx], # end
940+
np.max(e[c_start_idx : c_end_idx + 1]), # max end
941+
c_end_idx - c_start_idx + 1, # num records
942+
)
943+
)
944+
c_start_idx = c_end_idx + 1
945+
946+
index = np.array(index, dtype=np.int32)
947+
array = root.array(
948+
"region_index",
949+
data=index,
950+
shape=index.shape,
951+
dtype=index.dtype,
952+
compressor=numcodecs.Blosc("zstd", clevel=9, shuffle=0),
953+
)
954+
array.attrs["_ARRAY_DIMENSIONS"] = [
955+
"region_index_values",
956+
"region_index_fields",
957+
]
958+
959+
logger.info("Consolidating Zarr metadata")
960+
zarr.consolidate_metadata(self.path)
961+
905962
######################
906963
# encode_all_partitions
907964
######################
@@ -1004,6 +1061,7 @@ def encode(
10041061
max_memory=max_memory,
10051062
)
10061063
vzw.finalise(show_progress)
1064+
vzw.create_index()
10071065

10081066

10091067
def encode_init(

0 commit comments

Comments
 (0)