|
1 | 1 | import re
|
2 | 2 | from typing import Any
|
3 | 3 |
|
4 |
| -import numcodecs |
5 | 4 | import numpy as np
|
6 | 5 | import pandas as pd
|
7 |
| -import zarr |
8 | 6 | from pyranges import PyRanges
|
9 | 7 |
|
10 | 8 |
|
11 |
| -def create_index(vcz) -> None: |
12 |
| - """Create an index to support efficient region queries.""" |
13 |
| - |
14 |
| - root = zarr.open(vcz, mode="r+") |
15 |
| - |
16 |
| - contig = root["variant_contig"] |
17 |
| - pos = root["variant_position"] |
18 |
| - length = root["variant_length"] |
19 |
| - |
20 |
| - assert contig.cdata_shape == pos.cdata_shape |
21 |
| - |
22 |
| - index = [] |
23 |
| - |
24 |
| - for v_chunk in range(pos.cdata_shape[0]): |
25 |
| - c = contig.blocks[v_chunk] |
26 |
| - p = pos.blocks[v_chunk] |
27 |
| - e = p + length.blocks[v_chunk] - 1 |
28 |
| - |
29 |
| - # create a row for each contig in the chunk |
30 |
| - d = np.diff(c, append=-1) |
31 |
| - c_start_idx = 0 |
32 |
| - for c_end_idx in np.nonzero(d)[0]: |
33 |
| - assert c[c_start_idx] == c[c_end_idx] |
34 |
| - index.append( |
35 |
| - ( |
36 |
| - v_chunk, # chunk index |
37 |
| - c[c_start_idx], # contig ID |
38 |
| - p[c_start_idx], # start |
39 |
| - p[c_end_idx], # end |
40 |
| - np.max(e[c_start_idx : c_end_idx + 1]), # max end |
41 |
| - c_end_idx - c_start_idx + 1, # num records |
42 |
| - ) |
43 |
| - ) |
44 |
| - c_start_idx = c_end_idx + 1 |
45 |
| - |
46 |
| - index = np.array(index, dtype=np.int32) |
47 |
| - root.array( |
48 |
| - "region_index", |
49 |
| - data=index, |
50 |
| - compressor=numcodecs.Blosc("zstd", clevel=9, shuffle=0), |
51 |
| - overwrite=True, |
52 |
| - ) |
53 |
| - |
54 |
| - |
55 | 9 | def parse_region_string(region: str) -> tuple[str, int | None, int | None]:
|
56 | 10 | """Return the contig, start position and end position from a region string."""
|
57 | 11 | if re.search(r":\d+-\d*$", region):
|
|
0 commit comments