
Commit 1fea152

Remove read_scikit_allel_vcfzarr
1 parent 2a5b14e commit 1fea152

File tree

4 files changed: +1, -714 lines


sgkit/__init__.py

Lines changed: 0 additions & 2 deletions
@@ -3,7 +3,6 @@
 from .display import display_genotypes, display_pedigree
 from .distance.api import pairwise_distance
 from .io.dataset import load_dataset, save_dataset
-from .io.vcfzarr_reader import read_scikit_allel_vcfzarr
 from .model import (
     DIM_ALLELE,
     DIM_PLOIDY,
@@ -94,7 +93,6 @@
     "genee",
     "genomic_relationship",
     "gwas_linear_regression",
-    "read_scikit_allel_vcfzarr",
     "regenie",
     "regenie_loco_regression",
     "hardy_weinberg_test",

sgkit/io/utils.py

Lines changed: 1 addition & 109 deletions
@@ -1,9 +1,7 @@
-from typing import Any, Dict, Mapping, Optional, Sequence, Tuple
+from typing import Mapping, Optional, Tuple
 
-import dask.array as da
 import dask.dataframe as dd
 import numpy as np
-import zarr
 
 from ..typing import ArrayLike, DType
 from ..utils import encode_array, max_str_len
@@ -52,109 +50,3 @@ def encode_contigs(contig: ArrayLike) -> Tuple[ArrayLike, ArrayLike]:
     else:
         ids, names = encode_array(np.asarray(contig, dtype=str))
     return ids, names
-
-
-def concatenate_and_rechunk(
-    zarrs: Sequence[zarr.Array],
-    chunks: Optional[Tuple[int, ...]] = None,
-    dtype: DType = None,
-) -> da.Array:
-    """Perform a concatenate and rechunk operation on a collection of Zarr arrays
-    to produce an array with a uniform chunking, suitable for saving as
-    a single Zarr array.
-
-    In contrast to Dask's ``rechunk`` method, the Dask computation graph
-    is embarrassingly parallel and will make efficient use of memory,
-    since no Zarr chunks are cached by the Dask scheduler.
-
-    The Zarr arrays must have matching shapes except in the first
-    dimension.
-
-    Parameters
-    ----------
-    zarrs
-        Collection of Zarr arrays to concatenate.
-    chunks : Optional[Tuple[int, ...]], optional
-        The chunks to apply to the concatenated arrays. If not specified
-        the chunks for the first array will be applied to the concatenated
-        array.
-    dtype
-        The dtype of the concatenated array, by default the same as the
-        first array.
-
-    Returns
-    -------
-    A Dask array, suitable for saving as a single Zarr array.
-
-    Raises
-    ------
-    ValueError
-        If the Zarr arrays do not have matching shapes (except in the first
-        dimension).
-    """
-
-    if len(set([z.shape[1:] for z in zarrs])) > 1:
-        shapes = [z.shape for z in zarrs]
-        raise ValueError(
-            f"Zarr arrays must have matching shapes (except in the first dimension): {shapes}"
-        )
-
-    lengths = np.array([z.shape[0] for z in zarrs])
-    lengths0 = np.insert(lengths, 0, 0, axis=0)  # type: ignore[no-untyped-call]
-    offsets = np.cumsum(lengths0)
-    total_length = offsets[-1]
-
-    shape = (total_length, *zarrs[0].shape[1:])
-    chunks = chunks or zarrs[0].chunks
-    dtype = dtype or zarrs[0].dtype
-
-    ar = da.empty(shape, chunks=chunks)
-
-    def load_chunk(
-        x: ArrayLike,
-        zarrs: Sequence[zarr.Array],
-        offsets: ArrayLike,
-        block_info: Dict[Any, Any],
-    ) -> ArrayLike:
-        return _slice_zarrs(zarrs, offsets, block_info[0]["array-location"])
-
-    return ar.map_blocks(load_chunk, zarrs=zarrs, offsets=offsets, dtype=dtype)
-
-
-def _zarr_index(offsets: ArrayLike, pos: int) -> int:
-    """Return the index of the zarr file that pos falls in"""
-    index: int = np.searchsorted(offsets, pos, side="right") - 1  # type: ignore[assignment]
-    return index
-
-
-def _slice_zarrs(
-    zarrs: Sequence[zarr.Array], offsets: ArrayLike, locs: Sequence[Tuple[int, ...]]
-) -> ArrayLike:
-    """Slice concatenated zarrs by locs"""
-    # convert array locations to slices
-    locs = [slice(*loc) for loc in locs]  # type: ignore[misc]
-    # determine which zarr files are needed
-    start, stop = locs[0].start, locs[0].stop  # type: ignore[attr-defined]  # stack on first axis
-    i0 = _zarr_index(offsets, start)
-    i1 = _zarr_index(offsets, stop)
-    if i0 == i1:  # within a single zarr file
-        sel = slice(start - offsets[i0], stop - offsets[i0])
-        return zarrs[i0][(sel, *locs[1:])]
-    else:  # more than one zarr file
-        slices = []
-        slices.append((i0, slice(start - offsets[i0], None)))
-        for i in range(i0 + 1, i1):  # entire zarr
-            slices.append((i, slice(None)))
-        if stop > offsets[i1]:
-            slices.append((i1, slice(0, stop - offsets[i1])))
-        parts = [zarrs[i][(sel, *locs[1:])] for (i, sel) in slices]
-        return np.concatenate(parts)  # type: ignore[no-untyped-call]
-
-
-def str_is_int(x: str) -> bool:
-    """Test if a string can be parsed as an int"""
-    try:
-        int(x)
-        return True
-    except ValueError:
-        return False
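
For context, the removed concatenate_and_rechunk combined a sequence of Zarr arrays (matching in every dimension except the first) into one uniformly chunked Dask array. A minimal usage sketch, assuming the zarr-python v2 API; the in-memory arrays and chunk sizes are illustrative, not taken from sgkit's tests:

import numpy as np
import zarr

# Two Zarr arrays that match in every dimension except the first
z1 = zarr.array(np.arange(12).reshape(6, 2), chunks=(4, 2))
z2 = zarr.array(np.arange(8).reshape(4, 2), chunks=(4, 2))

# Before this commit one could write:
# from sgkit.io.utils import concatenate_and_rechunk
# arr = concatenate_and_rechunk([z1, z2])  # Dask array, shape (10, 2), chunks (4, 2)
# arr.compute()                            # or save it as a single Zarr array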

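The offset arithmetic behind the removed _zarr_index helper is easy to see in isolation: it finds the rightmost offset less than or equal to a global position. A self-contained sketch of the same logic (the array lengths here are made up):

import numpy as np

# First-dimension lengths of three hypothetical source arrays
lengths = np.array([6, 4, 5])
offsets = np.cumsum(np.insert(lengths, 0, 0))  # array([ 0,  6, 10, 15])

def zarr_index(offsets: np.ndarray, pos: int) -> int:
    # Same logic as the removed _zarr_index helper
    return int(np.searchsorted(offsets, pos, side="right") - 1)

print(zarr_index(offsets, 0))   # 0 -> first array
print(zarr_index(offsets, 7))   # 1 -> second array (positions 6..9)
print(zarr_index(offsets, 10))  # 2 -> third array (positions 10..14)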