Skip to content

Commit f23fda1

Browse files
Fix some bugs in String handling
1 parent a33c822 commit f23fda1

File tree

2 files changed

+37
-86
lines changed

2 files changed

+37
-86
lines changed

bio2zarr/vcf.py

Lines changed: 30 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -115,9 +115,11 @@ def smallest_dtype(self):
115115
ret = dtype
116116
elif self.vcf_type == "Flag":
117117
ret = "bool"
118+
elif self.vcf_type == "Character":
119+
ret = "S1"
118120
else:
119-
assert self.vcf_type in ("String", "Character")
120-
ret = "str"
121+
assert self.vcf_type == "String"
122+
ret = "O"
121123
return ret
122124

123125

@@ -266,22 +268,29 @@ def sanitise_value_string_1d(buff, j, value):
266268
if value is None:
267269
buff[j] = "."
268270
else:
269-
value = np.array(value, ndmin=1, dtype=buff.dtype, copy=False)
271+
# value = np.array(value, ndmin=1, dtype=buff.dtype, copy=False)
272+
# FIXME failure isn't coming from here, it seems to be from an
273+
# incorrectly detected dimension in the zarr array
274+
# The dimensions look all wrong, and the dtype should be Object
275+
# not str
270276
value = drop_empty_second_dim(value)
271277
buff[j] = ""
272-
# TODO check for missing?
273278
buff[j, : value.shape[0]] = value
274279

275280

276281
def sanitise_value_string_2d(buff, j, value):
277282
if value is None:
278283
buff[j] = "."
279284
else:
280-
value = np.array(value, ndmin=1, dtype=buff.dtype, copy=False)
281-
value = drop_empty_second_dim(value)
285+
# print(buff.shape, value.dtype, value)
286+
# assert value.ndim == 2
282287
buff[j] = ""
283-
# TODO check for missing?
284-
buff[j, : value.shape[0]] = value
288+
if value.ndim == 2:
289+
buff[j, :, : value.shape[1]] = value
290+
else:
291+
# TODO check if this is still necessary
292+
for k, val in enumerate(value):
293+
buff[j, k, : len(val)] = val
285294

286295

287296
def drop_empty_second_dim(value):
@@ -343,72 +352,10 @@ def sanitise_value_int_2d(buff, j, value):
343352
buff[j, :, : value.shape[1]] = value
344353

345354

346-
def update_bounds_float(summary, value, number_dim):
347-
value = np.array(value, dtype=np.float32, copy=False)
348-
# Map back to python types to avoid JSON issues later. Could
349-
# be done more efficiently at the end.
350-
if value.size > 0:
351-
summary.min_value = float(min(summary.min_value, np.min(value)))
352-
summary.max_value = float(max(summary.max_value, np.max(value)))
353-
number = 0
354-
assert len(value.shape) <= number_dim + 1
355-
if len(value.shape) == number_dim + 1:
356-
number = value.shape[number_dim]
357-
summary.max_number = max(summary.max_number, number)
358-
return value
359-
360-
361355
MIN_INT_VALUE = np.iinfo(np.int32).min + 2
362356
VCF_INT_MISSING = np.iinfo(np.int32).min
363357
VCF_INT_FILL = np.iinfo(np.int32).min + 1
364358

365-
366-
def update_bounds_integer(summary, value, number_dim):
367-
# NOTE we don't convert to local MISSING and FILL values here
368-
# to allow users to detect and deal with it later, in case they
369-
# need -1 and -2 values in their data.
370-
if value is None:
371-
return VCF_INT_MISSING
372-
# print("update bounds int", summary, value)
373-
if isinstance(value, tuple):
374-
value = [VCF_INT_MISSING if x is None else x for x in value]
375-
value = np.array(value, dtype=np.int32, copy=False)
376-
377-
# Mask out missing and fill values
378-
a = value[value >= MIN_INT_VALUE]
379-
if a.size > 0:
380-
summary.max_value = int(max(summary.max_value, np.max(a)))
381-
summary.min_value = int(min(summary.min_value, np.min(a)))
382-
number = 0
383-
assert len(value.shape) <= number_dim + 1
384-
if len(value.shape) == number_dim + 1:
385-
number = value.shape[number_dim]
386-
summary.max_number = max(summary.max_number, number)
387-
return value
388-
389-
390-
def update_bounds_string(summary, value, number_dim):
391-
# if isinstance(value, str):
392-
# number = 0
393-
# else:
394-
# number = len(value)
395-
# summary.max_number = max(summary.max_number, number)
396-
return value
397-
398-
399-
def update_bounds_flag(summary, value, number_dim):
400-
return value
401-
402-
403-
def update_bounds_char(summary, value, number_dim):
404-
# if isinstance(value, str):
405-
# number = 0
406-
# else:
407-
# number = len(value)
408-
# summary.max_number = max(summary.max_number, number)
409-
return value
410-
411-
412359
missing_value_map = {
413360
"Integer": -1,
414361
"Float": FLOAT32_MISSING,
@@ -428,12 +375,9 @@ def __init__(self, field, num_samples):
428375
self.field = field
429376
self.num_samples = num_samples
430377
self.dimension = 1
431-
self.missing = missing_value_map[field.vcf_type]
432378
if field.category == "FORMAT":
433379
self.dimension = 2
434-
self.missing_value = np.full((self.num_samples, 1), self.missing)
435-
else:
436-
self.missing_value = np.array([self.missing])
380+
self.missing = missing_value_map[field.vcf_type]
437381

438382
@staticmethod
439383
def factory(field, num_samples):
@@ -446,8 +390,6 @@ def factory(field, num_samples):
446390
return StringValueTransformer(field, num_samples)
447391

448392
def transform(self, vcf_value):
449-
if vcf_value is None:
450-
return self.missing_value
451393
if isinstance(vcf_value, tuple):
452394
vcf_value = [self.missing if v is None else v for v in vcf_value]
453395
value = np.array(vcf_value, ndmin=self.dimension, copy=False)
@@ -498,11 +440,15 @@ def update_bounds(self, value):
498440
summary.max_number = max(summary.max_number, number)
499441

500442
def transform(self, vcf_value):
443+
# print("transform", vcf_value)
501444
if self.dimension == 1:
502445
value = np.array(list(vcf_value.split(",")))
503446
else:
504447
# TODO can we make this faster??
505-
value = np.array([list(v.split(",")) for v in vcf_value], dtype="O")
448+
value = np.array([v.split(",") for v in vcf_value], dtype="O")
449+
# print("HERE", vcf_value, value)
450+
# for v in vcf_value:
451+
# print("\t", type(v), len(v), v.split(","))
506452
# print("S: ", self.dimension, ":", value.shape, value)
507453
return value
508454

@@ -678,6 +624,7 @@ def field_summaries(self):
678624

679625
def append(self, name, value):
680626
buff = self.buffers[name]
627+
# print("Append", name, value)
681628
value = buff.transformer.transform_and_update_bounds(value)
682629
assert value is None or isinstance(value, np.ndarray)
683630
buff.append(value)
@@ -1052,11 +999,8 @@ def fixed_field_spec(
1052999
shape.append(n)
10531000
chunks.append(chunk_width),
10541001
dimensions.append("samples")
1055-
if field.category == "FORMAT" and field.vcf_type == "String":
1056-
# FIXME not handling format string values very well right now
1057-
# as the max_number value is just the number of samples
1058-
pass
1059-
elif field.summary.max_number > 1:
1002+
# TODO make an option to add in the empty extra dimension
1003+
if field.summary.max_number > 1:
10601004
shape.append(field.summary.max_number)
10611005
dimensions.append(field.name)
10621006
variable_name = prefix + field.name
@@ -1136,12 +1080,16 @@ def __init__(self, path):
11361080

11371081
def create_array(self, variable):
11381082
# print("CREATE", variable)
1083+
object_codec = None
1084+
if variable.dtype == "O":
1085+
object_codec = numcodecs.VLenUTF8()
11391086
a = self.root.empty(
11401087
variable.name,
11411088
shape=variable.shape,
11421089
chunks=variable.chunks,
11431090
dtype=variable.dtype,
11441091
compressor=numcodecs.get_codec(variable.compressor),
1092+
object_codec=object_codec,
11451093
)
11461094
a.attrs["_ARRAY_DIMENSIONS"] = variable.dimensions
11471095

tests/test_vcf_examples.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -653,11 +653,14 @@ def ds(self, tmp_path_factory):
653653
vcf.convert_vcf([self.data_path], out)
654654
return sg.load_dataset(out)
655655

656-
# def test_info_string1(self, ds):
657-
# print(repr(ds["variant_IS1"].values))
656+
def test_info_string1(self, ds):
657+
print(repr(ds["variant_IS1"].values))
658658

659-
# def test_info_string2(self, ds):
660-
# print(repr(ds["variant_IS2"].values))
659+
def test_info_string2(self, ds):
660+
print(repr(ds["variant_IS2"].values))
661+
662+
def test_format_string2(self, ds):
663+
print(repr(ds["call_FS2"].values))
661664

662665

663666
@pytest.mark.parametrize(

0 commit comments

Comments
 (0)