|
8 | 8 | import pickle
|
9 | 9 | import shutil
|
10 | 10 | import sys
|
| 11 | +from functools import partial |
11 | 12 | from typing import Any
|
12 | 13 |
|
13 | 14 | import numcodecs
|
14 | 15 | import numpy as np
|
15 | 16 |
|
16 |
| -from bio2zarr import schema, zarr_utils |
| 17 | +from bio2zarr import schema |
17 | 18 |
|
18 | 19 | from .. import constants, core, provenance, vcf_utils
|
19 |
| -from functools import partial |
20 | 20 |
|
21 | 21 | logger = logging.getLogger(__name__)
|
22 | 22 |
|
23 |
| -def sanitise_value_bool(shape, value): |
24 |
| - x = True |
25 |
| - if value is None: |
26 |
| - x = False |
27 |
| - return x |
28 |
| - |
29 |
| - |
30 |
| -def sanitise_value_float_scalar(shape, value): |
31 |
| - x = value |
32 |
| - if value is None: |
33 |
| - x = [constants.FLOAT32_MISSING] |
34 |
| - return x[0] |
35 |
| - |
36 |
| - |
37 |
| -def sanitise_value_int_scalar(shape, value): |
38 |
| - x = value |
39 |
| - if value is None: |
40 |
| - x = [constants.INT_MISSING] |
41 |
| - else: |
42 |
| - x = sanitise_int_array(value, ndmin=1, dtype=np.int32) |
43 |
| - return x[0] |
44 |
| - |
45 |
| - |
46 |
| -def sanitise_value_string_scalar(shape, value): |
47 |
| - if value is None: |
48 |
| - return "." |
49 |
| - else: |
50 |
| - return value[0] |
51 |
| - |
52 |
| - |
53 |
| -def sanitise_value_string_1d(shape, value): |
54 |
| - if value is None: |
55 |
| - return np.full(shape, ".", dtype='O') |
56 |
| - else: |
57 |
| - value = drop_empty_second_dim(value) |
58 |
| - result = np.full(shape, "", dtype=value.dtype) |
59 |
| - result[:value.shape[0]] = value |
60 |
| - return result |
61 |
| - |
62 |
| - |
63 |
| -def sanitise_value_string_2d(shape, value): |
64 |
| - if value is None: |
65 |
| - return np.full(shape, ".", dtype='O') |
66 |
| - else: |
67 |
| - result = np.full(shape, "", dtype='O') |
68 |
| - if value.ndim == 2: |
69 |
| - result[:value.shape[0], :value.shape[1]] = value |
70 |
| - else: |
71 |
| - # Convert 1D array into 2D with appropriate shape |
72 |
| - for k, val in enumerate(value): |
73 |
| - result[k, :len(val)] = val |
74 |
| - return result |
75 |
| - |
76 |
| - |
77 |
| -def drop_empty_second_dim(value): |
78 |
| - assert len(value.shape) == 1 or value.shape[1] == 1 |
79 |
| - if len(value.shape) == 2 and value.shape[1] == 1: |
80 |
| - value = value[..., 0] |
81 |
| - return value |
82 |
| - |
83 |
| -def sanitise_value_float_1d(shape, value): |
84 |
| - if value is None: |
85 |
| - return np.full(shape, constants.FLOAT32_MISSING) |
86 |
| - else: |
87 |
| - value = np.array(value, ndmin=1, dtype=np.float32, copy=True) |
88 |
| - # numpy will map None values to Nan, but we need a |
89 |
| - # specific NaN |
90 |
| - value[np.isnan(value)] = constants.FLOAT32_MISSING |
91 |
| - value = drop_empty_second_dim(value) |
92 |
| - result = np.full(shape, constants.FLOAT32_FILL, dtype=np.float32) |
93 |
| - result[:value.shape[0]] = value |
94 |
| - print(result) |
95 |
| - return result |
96 |
| - |
97 |
| -def sanitise_value_float_2d(shape, value): |
98 |
| - if value is None: |
99 |
| - return np.full(shape, constants.FLOAT32_MISSING) |
100 |
| - else: |
101 |
| - value = np.array(value, ndmin=2, dtype=np.float32, copy=True) |
102 |
| - result = np.full(shape, constants.FLOAT32_FILL, dtype=np.float32) |
103 |
| - result[:, :value.shape[1]] = value |
104 |
| - print(result) |
105 |
| - return result |
106 |
| - |
107 |
| - |
108 |
| -def sanitise_int_array(value, ndmin, dtype): |
109 |
| - if isinstance(value, tuple): |
110 |
| - value = [ |
111 |
| - constants.VCF_INT_MISSING if x is None else x for x in value |
112 |
| - ] # NEEDS TEST |
113 |
| - value = np.array(value, ndmin=ndmin, copy=True) |
114 |
| - value[value == constants.VCF_INT_MISSING] = -1 |
115 |
| - value[value == constants.VCF_INT_FILL] = -2 |
116 |
| - # TODO watch out for clipping here! |
117 |
| - return value.astype(dtype) |
118 |
| - |
119 |
| - |
120 |
| -def sanitise_value_int_1d(shape, value): |
121 |
| - if value is None: |
122 |
| - return np.full(shape, -1) |
123 |
| - else: |
124 |
| - value = sanitise_int_array(value, 1, np.int32) |
125 |
| - value = drop_empty_second_dim(value) |
126 |
| - result = np.full(shape, -2, dtype=np.int32) |
127 |
| - result[:value.shape[0]] = value |
128 |
| - return result |
129 |
| - |
130 |
| - |
131 |
| -def sanitise_value_int_2d(shape, value): |
132 |
| - if value is None: |
133 |
| - return np.full(shape, -1) |
134 |
| - else: |
135 |
| - value = sanitise_int_array(value, 2, np.int32) |
136 |
| - result = np.full(shape, -2, dtype=np.int32) |
137 |
| - result[:, :value.shape[1]] = value |
138 |
| - return result |
139 |
| - |
140 | 23 |
|
141 | 24 | @dataclasses.dataclass
|
142 | 25 | class VcfFieldSummary(core.JsonDataclass):
|
@@ -469,6 +352,126 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
|
469 | 352 | return icf_metadata, header
|
470 | 353 |
|
471 | 354 |
|
def sanitise_value_bool(shape, value):
    """Return False when ``value`` is None and True for anything else.

    ``shape`` is accepted so the signature matches the other sanitisers,
    but it is not used.
    """
    return value is not None
| 360 | + |
| 361 | + |
def sanitise_value_float_scalar(shape, value):
    """Return the first element of ``value``, or the float missing marker.

    When ``value`` is None the result is ``constants.FLOAT32_MISSING``;
    otherwise it is ``value[0]``. ``shape`` is not used.
    """
    if value is None:
        return constants.FLOAT32_MISSING
    return value[0]
| 367 | + |
| 368 | + |
def sanitise_value_int_scalar(shape, value):
    """Return a single sanitised integer for ``value``.

    None maps to ``constants.INT_MISSING``. Any other value is passed
    through ``sanitise_int_array`` (as an int32 array with at least one
    dimension) and its first element is returned. ``shape`` is not used.
    """
    if value is None:
        return constants.INT_MISSING
    return sanitise_int_array(value, ndmin=1, dtype=np.int32)[0]
| 376 | + |
| 377 | + |
def sanitise_value_string_scalar(shape, value):
    """Return ``value[0]``, or ``"."`` when ``value`` is None.

    ``shape`` is not used.
    """
    return "." if value is None else value[0]
| 383 | + |
| 384 | + |
def sanitise_value_string_1d(shape, value):
    """Return a string array of ``shape`` holding ``value`` at the front.

    None yields an object array filled with ``"."``. Otherwise ``value``
    is passed through ``drop_empty_second_dim``, copied into the leading
    entries of an array of ``value``'s dtype, and the remaining entries
    are left as the empty string.
    """
    if value is None:
        return np.full(shape, ".", dtype="O")
    flat = drop_empty_second_dim(value)
    padded = np.full(shape, "", dtype=flat.dtype)
    count = flat.shape[0]
    padded[:count] = flat
    return padded
| 393 | + |
| 394 | + |
def sanitise_value_string_2d(shape, value):
    """Return a 2D object array of ``shape`` holding ``value`` top-left.

    None yields an array filled with ``"."``. A 2D ``value`` is copied
    into the top-left corner. Any other ``value`` is iterated and its
    k-th item is written to the start of row k. Cells not written stay
    as the empty string.
    """
    if value is None:
        return np.full(shape, ".", dtype="O")

    padded = np.full(shape, "", dtype="O")
    if value.ndim != 2:
        # Convert 1D array into 2D with appropriate shape
        for row, items in enumerate(value):
            padded[row, : len(items)] = items
        return padded

    num_rows, num_cols = value.shape
    padded[:num_rows, :num_cols] = value
    return padded
| 407 | + |
| 408 | + |
def drop_empty_second_dim(value):
    """Collapse an ``(n, 1)`` array to ``(n,)``; return other input as is.

    Asserts that ``value`` is either 1D or has a second dimension of
    length 1.
    """
    ndim = len(value.shape)
    assert ndim == 1 or value.shape[1] == 1
    if ndim == 2 and value.shape[1] == 1:
        return value[..., 0]
    return value
| 414 | + |
| 415 | + |
def sanitise_value_float_1d(shape, value):
    """Return a 1D float array of ``shape`` holding ``value`` at the front.

    None yields an array filled with ``constants.FLOAT32_MISSING``.
    Otherwise ``value`` is converted to a float32 array, NaNs are replaced
    by ``constants.FLOAT32_MISSING``, a trailing length-1 second dimension
    is dropped, and the values are copied into the leading entries of a
    float32 array whose remaining entries are ``constants.FLOAT32_FILL``.
    """
    if value is None:
        return np.full(shape, constants.FLOAT32_MISSING)
    else:
        value = np.array(value, ndmin=1, dtype=np.float32, copy=True)
        # numpy will map None values to Nan, but we need a
        # specific NaN
        value[np.isnan(value)] = constants.FLOAT32_MISSING
        value = drop_empty_second_dim(value)
        result = np.full(shape, constants.FLOAT32_FILL, dtype=np.float32)
        result[: value.shape[0]] = value
        return result
| 429 | + |
| 430 | + |
def sanitise_value_float_2d(shape, value):
    """Return a 2D float array of ``shape`` holding ``value`` on the left.

    None yields an array filled with ``constants.FLOAT32_MISSING``.
    Otherwise ``value`` is converted to a float32 array with at least two
    dimensions and written into the first ``value.shape[1]`` columns of a
    float32 array whose remaining columns are ``constants.FLOAT32_FILL``.
    """
    if value is None:
        return np.full(shape, constants.FLOAT32_MISSING)
    else:
        value = np.array(value, ndmin=2, dtype=np.float32, copy=True)
        result = np.full(shape, constants.FLOAT32_FILL, dtype=np.float32)
        result[:, : value.shape[1]] = value
        return result
| 440 | + |
| 441 | + |
def sanitise_int_array(value, ndmin, dtype):
    """Convert ``value`` to an integer array with remapped sentinels.

    A tuple input first has its None items replaced by
    ``constants.VCF_INT_MISSING``. The data is then copied into an array
    with at least ``ndmin`` dimensions, entries equal to
    ``constants.VCF_INT_MISSING`` become -1, entries equal to
    ``constants.VCF_INT_FILL`` become -2, and the result is cast to
    ``dtype``.
    """
    vcf_missing = constants.VCF_INT_MISSING
    if isinstance(value, tuple):
        # NEEDS TEST
        value = [vcf_missing if item is None else item for item in value]
    arr = np.array(value, ndmin=ndmin, copy=True)
    arr[arr == vcf_missing] = -1
    arr[arr == constants.VCF_INT_FILL] = -2
    # TODO watch out for clipping here!
    return arr.astype(dtype)
| 452 | + |
| 453 | + |
def sanitise_value_int_1d(shape, value):
    """Return a 1D int32 array of ``shape`` holding ``value`` at the front.

    None yields an array filled with -1. Otherwise ``value`` is passed
    through ``sanitise_int_array`` and ``drop_empty_second_dim``, then
    copied into the leading entries of an array whose remaining entries
    are -2.
    """
    if value is None:
        # Explicit dtype so both branches return int32 on every platform,
        # rather than numpy's default integer type here.
        return np.full(shape, -1, dtype=np.int32)
    else:
        value = sanitise_int_array(value, 1, np.int32)
        value = drop_empty_second_dim(value)
        result = np.full(shape, -2, dtype=np.int32)
        result[: value.shape[0]] = value
        return result
| 463 | + |
| 464 | + |
def sanitise_value_int_2d(shape, value):
    """Return a 2D int32 array of ``shape`` holding ``value`` on the left.

    None yields an array filled with -1. Otherwise ``value`` is passed
    through ``sanitise_int_array`` (at least two dimensions) and written
    into the first ``value.shape[1]`` columns of an array whose remaining
    columns are -2.
    """
    if value is None:
        # Explicit dtype so both branches return int32 on every platform,
        # rather than numpy's default integer type here.
        return np.full(shape, -1, dtype=np.int32)
    else:
        value = sanitise_int_array(value, 2, np.int32)
        result = np.full(shape, -2, dtype=np.int32)
        result[:, : value.shape[1]] = value
        return result
| 473 | + |
| 474 | + |
472 | 475 | missing_value_map = {
|
473 | 476 | "Integer": constants.INT_MISSING,
|
474 | 477 | "Float": constants.FLOAT32_MISSING,
|
@@ -689,16 +692,6 @@ def values(self):
|
689 | 692 | return ret
|
690 | 693 |
|
691 | 694 | def sanitiser_factory(self, shape):
|
692 |
| - """ |
693 |
| - Return a function that sanitises values from this column |
694 |
| - and returns a properly formatted array with the specified shape. |
695 |
| - |
696 |
| - Args: |
697 |
| - shape: The shape of the target buffer, used to determine how to format the output |
698 |
| - |
699 |
| - Returns: |
700 |
| - A function that takes a value and returns a sanitised version |
701 |
| - """ |
702 | 695 | assert len(shape) <= 2
|
703 | 696 | if self.vcf_field.vcf_type == "Flag":
|
704 | 697 | assert len(shape) == 0
|
|
0 commit comments