@@ -115,9 +115,11 @@ def smallest_dtype(self):
115
115
ret = dtype
116
116
elif self .vcf_type == "Flag" :
117
117
ret = "bool"
118
+ elif self .vcf_type == "Character" :
119
+ ret = "S1"
118
120
else :
119
- assert self .vcf_type in ( "String" , "Character" )
120
- ret = "str "
121
+ assert self .vcf_type == "String"
122
+ ret = "O "
121
123
return ret
122
124
123
125
@@ -266,22 +268,29 @@ def sanitise_value_string_1d(buff, j, value):
266
268
if value is None :
267
269
buff [j ] = "."
268
270
else :
269
- value = np .array (value , ndmin = 1 , dtype = buff .dtype , copy = False )
271
+ # value = np.array(value, ndmin=1, dtype=buff.dtype, copy=False)
272
+ # FIXME failure isn't coming from here, it seems to be from an
273
+ # incorrectly detected dimension in the zarr array
274
+ # The dimensions look all wrong, and the dtype should be Object
275
+ # not str
270
276
value = drop_empty_second_dim (value )
271
277
buff [j ] = ""
272
- # TODO check for missing?
273
278
buff [j , : value .shape [0 ]] = value
274
279
275
280
276
281
def sanitise_value_string_2d (buff , j , value ):
277
282
if value is None :
278
283
buff [j ] = "."
279
284
else :
280
- value = np . array ( value , ndmin = 1 , dtype = buff .dtype , copy = False )
281
- value = drop_empty_second_dim ( value )
285
+ # print(buff.shape, value .dtype, value )
286
+ # assert value.ndim == 2
282
287
buff [j ] = ""
283
- # TODO check for missing?
284
- buff [j , : value .shape [0 ]] = value
288
+ if value .ndim == 2 :
289
+ buff [j , :, : value .shape [1 ]] = value
290
+ else :
291
+ # TODO check if this is still necessary
292
+ for k , val in enumerate (value ):
293
+ buff [j , k , : len (val )] = val
285
294
286
295
287
296
def drop_empty_second_dim (value ):
@@ -343,72 +352,10 @@ def sanitise_value_int_2d(buff, j, value):
343
352
buff [j , :, : value .shape [1 ]] = value
344
353
345
354
346
- def update_bounds_float (summary , value , number_dim ):
347
- value = np .array (value , dtype = np .float32 , copy = False )
348
- # Map back to python types to avoid JSON issues later. Could
349
- # be done more efficiently at the end.
350
- if value .size > 0 :
351
- summary .min_value = float (min (summary .min_value , np .min (value )))
352
- summary .max_value = float (max (summary .max_value , np .max (value )))
353
- number = 0
354
- assert len (value .shape ) <= number_dim + 1
355
- if len (value .shape ) == number_dim + 1 :
356
- number = value .shape [number_dim ]
357
- summary .max_number = max (summary .max_number , number )
358
- return value
359
-
360
-
361
355
MIN_INT_VALUE = np .iinfo (np .int32 ).min + 2
362
356
VCF_INT_MISSING = np .iinfo (np .int32 ).min
363
357
VCF_INT_FILL = np .iinfo (np .int32 ).min + 1
364
358
365
-
366
- def update_bounds_integer (summary , value , number_dim ):
367
- # NOTE we don't convert to local MISSING and FILL values here
368
- # to allow users to detect and deal with it later, in case they
369
- # need -1 and -2 values in their data.
370
- if value is None :
371
- return VCF_INT_MISSING
372
- # print("update bounds int", summary, value)
373
- if isinstance (value , tuple ):
374
- value = [VCF_INT_MISSING if x is None else x for x in value ]
375
- value = np .array (value , dtype = np .int32 , copy = False )
376
-
377
- # Mask out missing and fill values
378
- a = value [value >= MIN_INT_VALUE ]
379
- if a .size > 0 :
380
- summary .max_value = int (max (summary .max_value , np .max (a )))
381
- summary .min_value = int (min (summary .min_value , np .min (a )))
382
- number = 0
383
- assert len (value .shape ) <= number_dim + 1
384
- if len (value .shape ) == number_dim + 1 :
385
- number = value .shape [number_dim ]
386
- summary .max_number = max (summary .max_number , number )
387
- return value
388
-
389
-
390
- def update_bounds_string (summary , value , number_dim ):
391
- # if isinstance(value, str):
392
- # number = 0
393
- # else:
394
- # number = len(value)
395
- # summary.max_number = max(summary.max_number, number)
396
- return value
397
-
398
-
399
- def update_bounds_flag (summary , value , number_dim ):
400
- return value
401
-
402
-
403
- def update_bounds_char (summary , value , number_dim ):
404
- # if isinstance(value, str):
405
- # number = 0
406
- # else:
407
- # number = len(value)
408
- # summary.max_number = max(summary.max_number, number)
409
- return value
410
-
411
-
412
359
missing_value_map = {
413
360
"Integer" : - 1 ,
414
361
"Float" : FLOAT32_MISSING ,
@@ -428,12 +375,9 @@ def __init__(self, field, num_samples):
428
375
self .field = field
429
376
self .num_samples = num_samples
430
377
self .dimension = 1
431
- self .missing = missing_value_map [field .vcf_type ]
432
378
if field .category == "FORMAT" :
433
379
self .dimension = 2
434
- self .missing_value = np .full ((self .num_samples , 1 ), self .missing )
435
- else :
436
- self .missing_value = np .array ([self .missing ])
380
+ self .missing = missing_value_map [field .vcf_type ]
437
381
438
382
@staticmethod
439
383
def factory (field , num_samples ):
@@ -446,8 +390,6 @@ def factory(field, num_samples):
446
390
return StringValueTransformer (field , num_samples )
447
391
448
392
def transform (self , vcf_value ):
449
- if vcf_value is None :
450
- return self .missing_value
451
393
if isinstance (vcf_value , tuple ):
452
394
vcf_value = [self .missing if v is None else v for v in vcf_value ]
453
395
value = np .array (vcf_value , ndmin = self .dimension , copy = False )
@@ -498,11 +440,15 @@ def update_bounds(self, value):
498
440
summary .max_number = max (summary .max_number , number )
499
441
500
442
def transform (self , vcf_value ):
443
+ # print("transform", vcf_value)
501
444
if self .dimension == 1 :
502
445
value = np .array (list (vcf_value .split ("," )))
503
446
else :
504
447
# TODO can we make this faster??
505
- value = np .array ([list (v .split ("," )) for v in vcf_value ], dtype = "O" )
448
+ value = np .array ([v .split ("," ) for v in vcf_value ], dtype = "O" )
449
+ # print("HERE", vcf_value, value)
450
+ # for v in vcf_value:
451
+ # print("\t", type(v), len(v), v.split(","))
506
452
# print("S: ", self.dimension, ":", value.shape, value)
507
453
return value
508
454
@@ -678,6 +624,7 @@ def field_summaries(self):
678
624
679
625
def append (self , name , value ):
680
626
buff = self .buffers [name ]
627
+ # print("Append", name, value)
681
628
value = buff .transformer .transform_and_update_bounds (value )
682
629
assert value is None or isinstance (value , np .ndarray )
683
630
buff .append (value )
@@ -1052,11 +999,8 @@ def fixed_field_spec(
1052
999
shape .append (n )
1053
1000
chunks .append (chunk_width ),
1054
1001
dimensions .append ("samples" )
1055
- if field .category == "FORMAT" and field .vcf_type == "String" :
1056
- # FIXME not handling format string values very well right now
1057
- # as the max_number value is just the number of samples
1058
- pass
1059
- elif field .summary .max_number > 1 :
1002
+ # TODO make an option to add in the empty extra dimension
1003
+ if field .summary .max_number > 1 :
1060
1004
shape .append (field .summary .max_number )
1061
1005
dimensions .append (field .name )
1062
1006
variable_name = prefix + field .name
@@ -1136,12 +1080,16 @@ def __init__(self, path):
1136
1080
1137
1081
def create_array (self , variable ):
1138
1082
# print("CREATE", variable)
1083
+ object_codec = None
1084
+ if variable .dtype == "O" :
1085
+ object_codec = numcodecs .VLenUTF8 ()
1139
1086
a = self .root .empty (
1140
1087
variable .name ,
1141
1088
shape = variable .shape ,
1142
1089
chunks = variable .chunks ,
1143
1090
dtype = variable .dtype ,
1144
1091
compressor = numcodecs .get_codec (variable .compressor ),
1092
+ object_codec = object_codec ,
1145
1093
)
1146
1094
a .attrs ["_ARRAY_DIMENSIONS" ] = variable .dimensions
1147
1095
0 commit comments