Skip to content

Commit 8d69bfa

Browse files
committed
Tests pass again
1 parent 737a2ae commit 8d69bfa

File tree

4 files changed

+142
-148
lines changed

4 files changed

+142
-148
lines changed

bio2zarr/plink.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -35,14 +35,18 @@ def iter_field(self, field_name, shape, start, stop):
3535
data = {
3636
"position": self.bed.bp_position,
3737
}[field_name]
38-
for value in data[start:stop]:
39-
yield value
40-
41-
def iter_genotypes(self, start, stop):
42-
gt_calls = self.bed.gts.values[start:stop]
43-
phased = np.zeros_like(gt_calls, dtype=bool)
44-
for idx in range(len(gt_calls)):
45-
yield gt_calls[idx], phased[idx]
38+
yield from data[start:stop]
39+
40+
def iter_genotypes(self, shape, start, stop):
    """Yield one ``(genotypes, phased)`` pair per variant in ``[start, stop)``.

    Args:
        shape: Shape of the genotype array produced for each variant. The
            two-element assignments below require the last dimension to
            be 2 (presumably ``(num_samples, 2)`` -- confirm with caller).
        start: Index of the first variant to read.
        stop: Index one past the last variant to read.

    Yields:
        A tuple ``(gt, phased)`` where ``gt`` is an ``int8`` array of the
        given ``shape`` and ``phased`` is an all-False boolean array of
        shape ``shape[:-1]``.
    """
    # The transposed read result is iterated row by row; each row is treated
    # as the per-sample codes for one variant.
    bed_chunk = self.bed.read(slice(start, stop), dtype=np.int8).T
    for values in bed_chunk:
        # Allocate fresh arrays for every variant: yielding a single shared
        # buffer would make every previously yielded pair alias the most
        # recent variant, and rows whose code matched none of the cases
        # below would silently keep the previous variant's calls.
        gt = np.zeros(shape, dtype=np.int8)
        phased = np.zeros(shape[:-1], dtype=bool)
        gt[values == -127] = -1  # Missing values
        gt[values == 0] = [1, 1]  # Homozygous ALT (2 in PLINK)
        gt[values == 1] = [1, 0]  # Heterozygous (1 in PLINK)
        gt[values == 2] = [0, 0]  # Homozygous REF (0 in PLINK)
        yield gt, phased
4650

4751

4852
# Import here to avoid circular import

bio2zarr/vcf2zarr/icf.py

Lines changed: 122 additions & 129 deletions
Original file line numberDiff line numberDiff line change
@@ -8,135 +8,18 @@
88
import pickle
99
import shutil
1010
import sys
11+
from functools import partial
1112
from typing import Any
1213

1314
import numcodecs
1415
import numpy as np
1516

16-
from bio2zarr import schema, zarr_utils
17+
from bio2zarr import schema
1718

1819
from .. import constants, core, provenance, vcf_utils
19-
from functools import partial
2020

2121
logger = logging.getLogger(__name__)
2222

23-
def sanitise_value_bool(shape, value):
24-
x = True
25-
if value is None:
26-
x = False
27-
return x
28-
29-
30-
def sanitise_value_float_scalar(shape, value):
31-
x = value
32-
if value is None:
33-
x = [constants.FLOAT32_MISSING]
34-
return x[0]
35-
36-
37-
def sanitise_value_int_scalar(shape, value):
38-
x = value
39-
if value is None:
40-
x = [constants.INT_MISSING]
41-
else:
42-
x = sanitise_int_array(value, ndmin=1, dtype=np.int32)
43-
return x[0]
44-
45-
46-
def sanitise_value_string_scalar(shape, value):
47-
if value is None:
48-
return "."
49-
else:
50-
return value[0]
51-
52-
53-
def sanitise_value_string_1d(shape, value):
54-
if value is None:
55-
return np.full(shape, ".", dtype='O')
56-
else:
57-
value = drop_empty_second_dim(value)
58-
result = np.full(shape, "", dtype=value.dtype)
59-
result[:value.shape[0]] = value
60-
return result
61-
62-
63-
def sanitise_value_string_2d(shape, value):
64-
if value is None:
65-
return np.full(shape, ".", dtype='O')
66-
else:
67-
result = np.full(shape, "", dtype='O')
68-
if value.ndim == 2:
69-
result[:value.shape[0], :value.shape[1]] = value
70-
else:
71-
# Convert 1D array into 2D with appropriate shape
72-
for k, val in enumerate(value):
73-
result[k, :len(val)] = val
74-
return result
75-
76-
77-
def drop_empty_second_dim(value):
78-
assert len(value.shape) == 1 or value.shape[1] == 1
79-
if len(value.shape) == 2 and value.shape[1] == 1:
80-
value = value[..., 0]
81-
return value
82-
83-
def sanitise_value_float_1d(shape, value):
84-
if value is None:
85-
return np.full(shape, constants.FLOAT32_MISSING)
86-
else:
87-
value = np.array(value, ndmin=1, dtype=np.float32, copy=True)
88-
# numpy will map None values to Nan, but we need a
89-
# specific NaN
90-
value[np.isnan(value)] = constants.FLOAT32_MISSING
91-
value = drop_empty_second_dim(value)
92-
result = np.full(shape, constants.FLOAT32_FILL, dtype=np.float32)
93-
result[:value.shape[0]] = value
94-
print(result)
95-
return result
96-
97-
def sanitise_value_float_2d(shape, value):
98-
if value is None:
99-
return np.full(shape, constants.FLOAT32_MISSING)
100-
else:
101-
value = np.array(value, ndmin=2, dtype=np.float32, copy=True)
102-
result = np.full(shape, constants.FLOAT32_FILL, dtype=np.float32)
103-
result[:, :value.shape[1]] = value
104-
print(result)
105-
return result
106-
107-
108-
def sanitise_int_array(value, ndmin, dtype):
109-
if isinstance(value, tuple):
110-
value = [
111-
constants.VCF_INT_MISSING if x is None else x for x in value
112-
] # NEEDS TEST
113-
value = np.array(value, ndmin=ndmin, copy=True)
114-
value[value == constants.VCF_INT_MISSING] = -1
115-
value[value == constants.VCF_INT_FILL] = -2
116-
# TODO watch out for clipping here!
117-
return value.astype(dtype)
118-
119-
120-
def sanitise_value_int_1d(shape, value):
121-
if value is None:
122-
return np.full(shape, -1)
123-
else:
124-
value = sanitise_int_array(value, 1, np.int32)
125-
value = drop_empty_second_dim(value)
126-
result = np.full(shape, -2, dtype=np.int32)
127-
result[:value.shape[0]] = value
128-
return result
129-
130-
131-
def sanitise_value_int_2d(shape, value):
132-
if value is None:
133-
return np.full(shape, -1)
134-
else:
135-
value = sanitise_int_array(value, 2, np.int32)
136-
result = np.full(shape, -2, dtype=np.int32)
137-
result[:, :value.shape[1]] = value
138-
return result
139-
14023

14124
@dataclasses.dataclass
14225
class VcfFieldSummary(core.JsonDataclass):
@@ -469,6 +352,126 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
469352
return icf_metadata, header
470353

471354

355+
def sanitise_value_bool(shape, value):
    """Convert a possibly-absent value into a boolean.

    ``shape`` is accepted for signature parity with the other sanitisers
    and is not used. Returns ``False`` when ``value`` is ``None`` and
    ``True`` for any other value.
    """
    return value is not None
360+
361+
362+
def sanitise_value_float_scalar(shape, value):
    """Return the first element of ``value``, or the float missing sentinel.

    ``shape`` is unused. When ``value`` is ``None`` the result is
    ``constants.FLOAT32_MISSING``; otherwise it is ``value[0]``.
    """
    if value is None:
        return constants.FLOAT32_MISSING
    return value[0]
367+
368+
369+
def sanitise_value_int_scalar(shape, value):
    """Return a single sanitised integer, or the integer missing sentinel.

    ``shape`` is unused. ``None`` maps to ``constants.INT_MISSING``; any
    other value is passed through ``sanitise_int_array`` (as an ``int32``
    array with at least one dimension) and its first element is returned.
    """
    if value is None:
        return constants.INT_MISSING
    cleaned = sanitise_int_array(value, ndmin=1, dtype=np.int32)
    return cleaned[0]
376+
377+
378+
def sanitise_value_string_scalar(shape, value):
    """Return the first element of ``value``, or ``"."`` when it is ``None``.

    ``shape`` is unused.
    """
    return "." if value is None else value[0]
383+
384+
385+
def sanitise_value_string_1d(shape, value):
    """Fit a string array into a one-dimensional buffer of the given shape.

    A ``None`` value produces an object array of ``shape`` filled with
    ``"."``. Otherwise a trailing singleton dimension is dropped, and the
    values are copied into the front of an array of ``shape`` pre-filled
    with empty strings that uses the input's dtype.
    """
    if value is None:
        return np.full(shape, ".", dtype="O")
    flat = drop_empty_second_dim(value)
    padded = np.full(shape, "", dtype=flat.dtype)
    count = flat.shape[0]
    padded[:count] = flat
    return padded
393+
394+
395+
def sanitise_value_string_2d(shape, value):
    """Fit string data into a two-dimensional object buffer of the given shape.

    A ``None`` value produces an array of ``shape`` filled with ``"."``.
    Otherwise the result starts out filled with empty strings; a 2D input
    is copied into the top-left corner, while any other input is treated
    as a sequence of rows, each copied into the start of its own row.
    """
    if value is None:
        return np.full(shape, ".", dtype="O")

    padded = np.full(shape, "", dtype="O")
    if value.ndim != 2:
        # Convert 1D array into 2D with appropriate shape
        for row, entries in enumerate(value):
            padded[row, : len(entries)] = entries
        return padded

    n_rows, n_cols = value.shape
    padded[:n_rows, :n_cols] = value
    return padded
407+
408+
409+
def drop_empty_second_dim(value):
    """Collapse an ``(n, 1)`` array to ``(n,)``.

    Asserts that ``value`` is one-dimensional or has a second dimension of
    length 1. Two-dimensional input with a singleton second dimension is
    returned with that dimension removed; anything else that passes the
    assertion is returned unchanged.
    """
    dims = value.shape
    assert len(dims) == 1 or dims[1] == 1
    if len(dims) == 2 and dims[1] == 1:
        return value[..., 0]
    return value
414+
415+
416+
def sanitise_value_float_1d(shape, value):
    """Fit float data into a one-dimensional ``float32`` buffer.

    A ``None`` value produces an array of ``shape`` filled with
    ``constants.FLOAT32_MISSING``. Otherwise the input is copied to a
    ``float32`` array, NaNs are replaced with the missing sentinel, a
    trailing singleton dimension is dropped, and the values are written to
    the front of an array of ``shape`` pre-filled with
    ``constants.FLOAT32_FILL``.
    """
    if value is None:
        return np.full(shape, constants.FLOAT32_MISSING)

    value = np.array(value, ndmin=1, dtype=np.float32, copy=True)
    # numpy will map None values to NaN, but we need a specific NaN
    value[np.isnan(value)] = constants.FLOAT32_MISSING
    value = drop_empty_second_dim(value)
    result = np.full(shape, constants.FLOAT32_FILL, dtype=np.float32)
    result[: value.shape[0]] = value
    # The leftover debugging print of ``result`` is removed: it wrote to
    # stdout on every sanitised value.
    return result
429+
430+
431+
def sanitise_value_float_2d(shape, value):
    """Fit float data into a two-dimensional ``float32`` buffer.

    A ``None`` value produces an array of ``shape`` filled with
    ``constants.FLOAT32_MISSING``. Otherwise the input is copied to a
    ``float32`` array with at least two dimensions and written into the
    leading columns of an array of ``shape`` pre-filled with
    ``constants.FLOAT32_FILL``.
    """
    if value is None:
        return np.full(shape, constants.FLOAT32_MISSING)

    value = np.array(value, ndmin=2, dtype=np.float32, copy=True)
    result = np.full(shape, constants.FLOAT32_FILL, dtype=np.float32)
    result[:, : value.shape[1]] = value
    # The leftover debugging print of ``result`` is removed: it wrote to
    # stdout on every sanitised value.
    return result
440+
441+
442+
def sanitise_int_array(value, ndmin, dtype):
    """Copy integer data to an array and remap the VCF sentinel values.

    Tuples are first converted to a list with ``None`` entries replaced by
    ``constants.VCF_INT_MISSING``. The data is then copied into an array
    with at least ``ndmin`` dimensions, entries equal to
    ``constants.VCF_INT_MISSING`` become -1, entries equal to
    ``constants.VCF_INT_FILL`` become -2, and the result is cast to
    ``dtype``.
    """
    if isinstance(value, tuple):
        # NEEDS TEST
        value = [
            constants.VCF_INT_MISSING if item is None else item for item in value
        ]
    arr = np.array(value, ndmin=ndmin, copy=True)
    missing_mask = arr == constants.VCF_INT_MISSING
    arr[missing_mask] = -1
    fill_mask = arr == constants.VCF_INT_FILL
    arr[fill_mask] = -2
    # TODO watch out for clipping here!
    return arr.astype(dtype)
452+
453+
454+
def sanitise_value_int_1d(shape, value):
    """Fit integer data into a one-dimensional ``int32`` buffer.

    A ``None`` value produces an array of ``shape`` filled with -1.
    Otherwise the input is sanitised via ``sanitise_int_array``, a trailing
    singleton dimension is dropped, and the values are written to the
    front of an array of ``shape`` pre-filled with -2.
    """
    if value is None:
        return np.full(shape, -1)

    cleaned = drop_empty_second_dim(sanitise_int_array(value, 1, np.int32))
    padded = np.full(shape, -2, dtype=np.int32)
    count = cleaned.shape[0]
    padded[:count] = cleaned
    return padded
463+
464+
465+
def sanitise_value_int_2d(shape, value):
    """Fit integer data into a two-dimensional ``int32`` buffer.

    A ``None`` value produces an array of ``shape`` filled with -1.
    Otherwise the input is sanitised via ``sanitise_int_array`` with at
    least two dimensions and written into the leading columns of an array
    of ``shape`` pre-filled with -2.
    """
    if value is None:
        return np.full(shape, -1)

    cleaned = sanitise_int_array(value, 2, np.int32)
    padded = np.full(shape, -2, dtype=np.int32)
    n_cols = cleaned.shape[1]
    padded[:, :n_cols] = cleaned
    return padded
473+
474+
472475
missing_value_map = {
473476
"Integer": constants.INT_MISSING,
474477
"Float": constants.FLOAT32_MISSING,
@@ -689,16 +692,6 @@ def values(self):
689692
return ret
690693

691694
def sanitiser_factory(self, shape):
692-
"""
693-
Return a function that sanitises values from this column
694-
and returns a properly formatted array with the specified shape.
695-
696-
Args:
697-
shape: The shape of the target buffer, used to determine how to format the output
698-
699-
Returns:
700-
A function that takes a value and returns a sanitised version
701-
"""
702695
assert len(shape) <= 2
703696
if self.vcf_field.vcf_type == "Flag":
704697
assert len(shape) == 0

bio2zarr/writer.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -427,25 +427,23 @@ def encode_array_partition(self, array_spec, partition_index):
427427
):
428428
j = ba.next_buffer_row()
429429
ba.buff[j] = value
430-
430+
431431
self.finalise_partition_array(partition_index, ba)
432432

433433
def encode_genotypes_partition(self, partition_index):
434434
partition = self.metadata.partitions[partition_index]
435435
gt = self.init_partition_array(partition_index, "call_genotype")
436436
gt_phased = self.init_partition_array(partition_index, "call_genotype_phased")
437-
437+
438438
for genotype, phased in self.source.iter_genotypes(
439-
gt.buff.shape[1:],
440-
partition.start,
441-
partition.stop
439+
gt.buff.shape[1:], partition.start, partition.stop
442440
):
443441
j = gt.next_buffer_row()
444442
gt.buff[j] = genotype
445-
443+
446444
j_phased = gt_phased.next_buffer_row()
447445
gt_phased.buff[j_phased] = phased
448-
446+
449447
self.finalise_partition_array(partition_index, gt)
450448
self.finalise_partition_array(partition_index, gt_phased)
451449

@@ -513,7 +511,9 @@ def encode_alleles_partition(self, partition_index):
513511
alleles = self.init_partition_array(partition_index, "variant_allele")
514512
partition = self.metadata.partitions[partition_index]
515513

516-
for value in self.source.iter_alleles(partition.start, partition.stop, alleles.array.shape[1]):
514+
for value in self.source.iter_alleles(
515+
partition.start, partition.stop, alleles.array.shape[1]
516+
):
517517
j = alleles.next_buffer_row()
518518
alleles.buff[j] = value
519519

bio2zarr/zarr_utils.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,5 @@
1-
import numpy as np
21
import zarr
32

4-
from bio2zarr import constants
5-
63

74
def zarr_v3() -> bool:
    """Return True when the installed ``zarr`` has major version 3 or later.

    The major component of ``zarr.__version__`` is compared as an integer.
    Comparing the raw version strings orders them lexicographically, which
    would place a version such as "10.0.0" before "3".
    """
    major = zarr.__version__.split(".", 1)[0]
    return int(major) >= 3

0 commit comments

Comments
 (0)