@@ -102,14 +102,14 @@ class Contig:
102
102
103
103
104
104
@dataclasses .dataclass
105
- class Feature :
105
+ class Name :
106
106
id : str
107
107
108
108
109
109
@dataclasses .dataclass
110
110
class BedMetadata (core .JsonDataclass ):
111
111
contigs : list
112
- features : list
112
+ names : list
113
113
fields : list
114
114
bed_type : BedType
115
115
num_records : int = - 1
@@ -120,16 +120,16 @@ def num_contigs(self):
120
120
return len (self .contigs )
121
121
122
122
@property
123
- def num_features (self ):
124
- return len (self .features )
123
+ def num_names (self ):
124
+ return len (self .names )
125
125
126
126
127
127
@dataclasses .dataclass
128
128
class BedZarrSchema (core .JsonDataclass ):
129
129
format_version : str
130
130
fields : list
131
131
contigs : list
132
- features : list
132
+ names : list
133
133
bed_type : str
134
134
records_chunk_size : int
135
135
@@ -147,52 +147,54 @@ def fromdict(d):
147
147
)
148
148
ret = BedZarrSchema (** d )
149
149
ret .fields = [ZarrArraySpec (** sd ) for sd in d ["fields" ]]
150
- ret .contigs = [Contig (** sd ) for sd in d ["contigs" ]]
151
- ret .features = [Feature (** sd ) for sd in d ["features " ]]
150
+ ret .contig_ids = [Contig (** sd ) for sd in d ["contigs" ]]
151
+ ret .name_ids = [Name (** sd ) for sd in d ["names " ]]
152
152
return ret
153
153
154
154
@staticmethod
155
155
def fromjson (s ):
156
156
return BedZarrSchema .fromdict (json .loads (s ))
157
157
158
158
@staticmethod
159
- def generate (num_records , bed_type , fields , contigs , features , records_chunk_size = None ):
159
+ def generate (metadata , records_chunk_size = None ):
160
160
if records_chunk_size is None :
161
161
records_chunk_size = 1000
162
162
logger .info ("Generating schema with chunks=%d" , records_chunk_size )
163
+ fields = metadata .fields
163
164
164
165
def spec_from_field (field ):
165
166
return ZarrArraySpec .from_field (
166
167
field ,
167
- num_records = num_records ,
168
+ num_records = metadata . num_records ,
168
169
records_chunk_size = records_chunk_size ,
169
170
)
170
171
171
172
specs = [spec_from_field (f ) for f in fields ]
172
173
173
- contig_spec = ZarrArraySpec .new (
174
+ # Contig_id and name_id specs unnecessary?
175
+ contig_id_spec = ZarrArraySpec .new (
174
176
name = "contig_id" ,
175
177
dtype = "O" ,
176
- shape = [len ( contigs ) ],
177
- chunks = [len ( contigs ) ],
178
+ shape = [metadata . num_contigs ],
179
+ chunks = [metadata . num_contigs ],
178
180
dimensions = ["contigs" ],
179
181
description = "Contig ID" ,
180
182
)
181
- feature_spec = ZarrArraySpec .new (
182
- name = "feature_id " ,
183
+ name_id_spec = ZarrArraySpec .new (
184
+ name = "name_id " ,
183
185
dtype = "O" ,
184
- shape = [len ( features ) ],
185
- chunks = [len ( features ) ],
186
- dimensions = ["features " ],
187
- description = "Feature ID" ,
186
+ shape = [metadata . num_names ],
187
+ chunks = [metadata . num_names ],
188
+ dimensions = ["names " ],
189
+ description = "Name ID" ,
188
190
)
189
- fields .extend ([contig_spec , feature_spec ])
191
+ specs .extend ([contig_id_spec , name_id_spec ])
190
192
return BedZarrSchema (
191
193
format_version = ZARR_SCHEMA_FORMAT_VERSION ,
192
194
fields = specs ,
193
- contigs = contigs ,
194
- features = features ,
195
- bed_type = bed_type .name ,
195
+ contigs = metadata . contigs ,
196
+ names = metadata . names ,
197
+ bed_type = metadata . bed_type .name ,
196
198
records_chunk_size = records_chunk_size ,
197
199
)
198
200
@@ -206,17 +208,17 @@ class BedFieldSummary(core.JsonDataclass):
206
208
min_value : Any = math .inf
207
209
208
210
def update (self , other ):
209
- self .num_chunks = array . nchunks
210
- self .compressed_size = array . nbytes
211
- self .uncompressed_size = array . nbytes
211
+ self .num_chunks = other . num_chunks
212
+ self .compressed_size = other . compressed_size
213
+ self .uncompressed_size = other . uncompressed_size
212
214
self .min_value = min (self .min_value , other .min_value )
213
215
self .max_value = max (self .max_value , other .max_value )
214
216
215
217
@staticmethod
216
218
def fromdict (d ):
217
219
return BedFieldSummary (** d )
218
220
219
-
221
+
220
222
@dataclasses .dataclass
221
223
class BedField :
222
224
category : str
@@ -227,7 +229,8 @@ class BedField:
227
229
summary : BedFieldSummary
228
230
229
231
def smallest_dtype (self ):
230
- """Return the smallest dtype suitable for this field based on type and values"""
232
+ """Return the smallest dtype suitable for this field based on
233
+ type and values"""
231
234
s = self .summary
232
235
if self .bed_dtype == "Integer" :
233
236
if not math .isfinite (s .max_value ):
@@ -255,6 +258,8 @@ def guess_bed_file_type(path):
255
258
raise ValueError (f"Expected at most 12 columns in BED file, got { num_cols } " )
256
259
if num_cols in (10 , 11 ):
257
260
raise ValueError (f"BED10 and BED11 are prohibited, got { num_cols } columns" )
261
+ if num_cols in (9 , 12 ):
262
+ raise ValueError ("BED9 and BED12 are valid but currently unsupported formats" )
258
263
return BedType (num_cols )
259
264
260
265
@@ -266,7 +271,7 @@ class BedZarrWriterMetadata(core.JsonDataclass):
266
271
format_version : str
267
272
fields : list
268
273
contigs : list
269
- features : list
274
+ names : list
270
275
bed_type : str
271
276
schema : BedZarrSchema
272
277
@@ -319,7 +324,7 @@ def init(
319
324
bed_type = self .bed_metadata .bed_type .name ,
320
325
fields = self .fields ,
321
326
contigs = self .bed_metadata .contigs ,
322
- features = self .bed_metadata .features ,
327
+ names = self .bed_metadata .names ,
323
328
schema = schema ,
324
329
)
325
330
self .path .mkdir ()
@@ -337,14 +342,14 @@ def write(self):
337
342
"source" : f"bio2zarr-{ provenance .__version__ } " ,
338
343
}
339
344
)
340
- d = {i : f .name for i , f in enumerate (self .schema .fields )}
345
+ datafields = self .schema .fields [0 : BedType [self .schema .bed_type ].value ]
346
+ d = {i : f .name for i , f in enumerate (datafields )}
341
347
self .data .rename (columns = d , inplace = True )
342
- self .encode_fields (root )
348
+ self .encode_fields (root , datafields )
343
349
self .encode_contig_id (root )
344
- self .encode_feature_id (root )
350
+ self .encode_name_id (root )
345
351
346
352
def encode_contig_id (self , root ):
347
- print (self .schema .contigs )
348
353
array = root .array (
349
354
"contig_id" ,
350
355
[contig .id for contig in self .schema .contigs ],
@@ -353,28 +358,29 @@ def encode_contig_id(self, root):
353
358
)
354
359
array .attrs ["_ARRAY_DIMENSIONS" ] = ["contigs" ]
355
360
356
-
357
- def encode_feature_id (self , root ):
361
+ def encode_name_id (self , root ):
358
362
array = root .array (
359
- "feature_id " ,
360
- [feature .id for feature in self .schema .features ],
363
+ "name_id " ,
364
+ [name .id for name in self .schema .names ],
361
365
dtype = "str" ,
362
366
compressor = DEFAULT_ZARR_COMPRESSOR ,
363
367
)
364
- array .attrs ["_ARRAY_DIMENSIONS" ] = ["features " ]
368
+ array .attrs ["_ARRAY_DIMENSIONS" ] = ["names " ]
365
369
366
-
367
370
def finalise (self ):
371
+ self .load_metadata ()
368
372
logger .debug ("Removing %s" , self .wip_path )
369
373
shutil .rmtree (self .wip_path )
374
+ logger .info ("Consolidating Zarr metadata" )
375
+ zarr .consolidate_metadata (self .path )
370
376
371
377
def load_metadata (self ):
372
378
if self .metadata is None :
373
379
with open (self .wip_path / "metadata.json" ) as f :
374
380
self .metadata = BedZarrWriterMetadata .fromdict (json .load (f ))
375
381
376
- def encode_fields (self , root ):
377
- for field in self . schema . fields :
382
+ def encode_fields (self , root , datafields ):
383
+ for field in datafields :
378
384
object_codec = None
379
385
if field .dtype == "O" :
380
386
object_codec = numcodecs .VLenUTF8 ()
@@ -405,8 +411,8 @@ def make_field_def(name, alias, bed_dtype, description=""):
405
411
406
412
fields = [
407
413
make_field_def ("contig" , "chrom" , "Category" , "Chromosome name" ),
408
- make_field_def ("start" , "chromStart" , "Integer" , "Feature start position" ),
409
- make_field_def ("end" , "chromEnd" , "Integer" , "Feature end position" ),
414
+ make_field_def ("start" , "chromStart" , "Integer" , "Name start position" ),
415
+ make_field_def ("end" , "chromEnd" , "Integer" , "Name end position" ),
410
416
]
411
417
return fields
412
418
@@ -423,12 +429,10 @@ def make_field_def(name, alias, bed_dtype, description=""):
423
429
)
424
430
425
431
fields = [
426
- make_field_def ("name" , "name" , "Category" , "Feature description" ),
432
+ make_field_def ("name" , "name" , "Category" , "Name description" ),
427
433
make_field_def ("score" , "score" , "Integer" , "A numerical value" ),
428
- make_field_def ("strand" , "strand" , "Character" , "Feature strand" ),
429
- make_field_def (
430
- "thickStart" , "thickStart" , "Integer" , "Thick start position"
431
- ),
434
+ make_field_def ("strand" , "strand" , "Character" , "Name strand" ),
435
+ make_field_def ("thickStart" , "thickStart" , "Integer" , "Thick start position" ),
432
436
make_field_def ("thickEnd" , "thickEnd" , "Integer" , "Thick end position" ),
433
437
make_field_def ("itemRgb" , "itemRgb" , "Integer" , "Display" ),
434
438
make_field_def ("blockCount" , "blockCount" , "Integer" , "Number of blocks" ),
@@ -447,11 +451,41 @@ def mkfields(bed_type):
447
451
return mandatory + optional
448
452
449
453
450
- def read_bed (bed_path , bed_type ):
451
- """Read BED file."""
452
- fields = mkfields (bed_type )
453
- data = pd .read_table (bed_path , header = None ,
454
- names = [f .name for f in fields ])
454
+ def parse_bed (bed_path , records_chunk_size = None ):
455
+ """Read and parse BED file and return data frame and metadata."""
456
+ bed_type = guess_bed_file_type (bed_path )
457
+ fields = mkfields (bed_type )
458
+ data = pd .read_table (bed_path , header = None , names = [f .name for f in fields ])
459
+ data , contig_id , name_id = encode_categoricals (data , bed_type )
460
+ data = parse_csv_fields (data , bed_type )
461
+ fields = update_field_bounds (bed_type , data )
462
+ metadata = BedMetadata (
463
+ contigs = [Contig (c ) for c in contig_id ],
464
+ names = [Name (f ) for f in name_id ],
465
+ fields = fields ,
466
+ bed_type = bed_type ,
467
+ num_records = data .shape [0 ],
468
+ records_chunk_size = records_chunk_size ,
469
+ )
470
+ return data , metadata
471
+
472
+
473
+ def parse_csv_fields (data , bed_type ):
474
+ if bed_type .value < BedType .BED8 .value :
475
+ return data
476
+
477
+ def _convert_csv_data (values ):
478
+ if values .dtype == "O" :
479
+ ret = values .str .split ("," ).apply (lambda x : np .array ([int (y ) for y in x ]))
480
+ else :
481
+ ret = values
482
+ return ret
483
+
484
+ if bed_type .value >= BedType .BED9 .value :
485
+ data ["itemRgb" ] = _convert_csv_data (data ["itemRgb" ])
486
+ if bed_type .value == BedType .BED12 .value :
487
+ data ["blockSizes" ] = _convert_csv_data (data ["blockSizes" ])
488
+ data ["blockStarts" ] = _convert_csv_data (data ["blockStarts" ])
455
489
return data
456
490
457
491
@@ -463,14 +497,14 @@ def encode_categoricals(data, bed_type):
463
497
contig_id = contig .categories .values
464
498
data ["contig" ] = contig .codes
465
499
if bed_type .value >= BedType .BED4 .value :
466
- feature = pd .Categorical (
500
+ name = pd .Categorical (
467
501
data ["name" ], categories = data ["name" ].unique (), ordered = True
468
502
)
469
- feature_id = feature .categories .values
470
- data ["name" ] = feature .codes
503
+ name_id = name .categories .values
504
+ data ["name" ] = name .codes
471
505
else :
472
- feature_id = None
473
- return data , contig_id , feature_id
506
+ name_id = []
507
+ return data , contig_id , name_id
474
508
475
509
476
510
def update_field_bounds (bed_type , data ):
@@ -481,10 +515,7 @@ def update_field_bounds(bed_type, data):
481
515
if f .bed_dtype == "Integer" :
482
516
if f .name in ("itemRgb" , "blockSizes" , "blockStarts" ):
483
517
if data [f .name ].dtype == "O" :
484
- values = np .concatenate (
485
- data [f .name ].str .split ("," ).apply (
486
- lambda x : np .array ([int (y ) for y in x ])
487
- ))
518
+ values = np .concatenate (data [f .name ])
488
519
else :
489
520
values = data [f .name ]
490
521
f .summary .min_value = min (values )
@@ -499,16 +530,19 @@ def update_field_bounds(bed_type, data):
499
530
return ret
500
531
501
532
502
- def mkschema (schema_path , metadata ):
503
- """Read schema or make schema from fields."""
533
+ def mkschema (bed_path , out ):
534
+ """Write schema to file"""
535
+ _ , metadata = parse_bed (bed_path )
536
+ spec = BedZarrSchema .generate (
537
+ metadata , records_chunk_size = metadata .records_chunk_size
538
+ )
539
+ out .write (spec .asjson ())
540
+
541
+
542
+ def init_schema (schema_path , metadata ):
504
543
if schema_path is None :
505
544
schema = BedZarrSchema .generate (
506
- num_records = metadata .num_records ,
507
- bed_type = metadata .bed_type ,
508
- fields = metadata .fields ,
509
- contigs = metadata .contigs ,
510
- features = metadata .features ,
511
- records_chunk_size = metadata .records_chunk_size ,
545
+ metadata , records_chunk_size = metadata .records_chunk_size
512
546
)
513
547
else :
514
548
logger .info ("Reading schema from %s" , schema_path )
@@ -520,28 +554,15 @@ def mkschema(schema_path, metadata):
520
554
schema = BedZarrSchema .fromjson (f .read ())
521
555
return schema
522
556
557
+
523
558
def bed2zarr (
524
559
bed_path ,
525
560
zarr_path ,
526
561
schema_path = None ,
527
562
records_chunk_size = None ,
528
563
):
529
- bed_type = guess_bed_file_type (bed_path )
530
- data = read_bed (bed_path , bed_type )
531
- data , contig_id , feature_id = encode_categoricals (data , bed_type )
532
- fields = update_field_bounds (bed_type , data )
533
- metadata = BedMetadata (
534
- contigs = [Contig (c ) for c in contig_id ],
535
- features = [Feature (f ) for f in feature_id ],
536
- fields = fields ,
537
- bed_type = bed_type ,
538
- num_records = data .shape [0 ],
539
- records_chunk_size = records_chunk_size ,
540
- )
541
- schema = mkschema (
542
- schema_path ,
543
- metadata ,
544
- )
564
+ data , metadata = parse_bed (bed_path , records_chunk_size )
565
+ schema = init_schema (schema_path , metadata )
545
566
bedzw = BedZarrWriter (zarr_path )
546
567
bedzw .init (
547
568
data = data ,
0 commit comments