Skip to content

Commit eda5654

Browse files
committed
Add tests and withdraw support for BED9, BED12
1 parent 731cfa6 commit eda5654

File tree

3 files changed

+160
-134
lines changed

3 files changed

+160
-134
lines changed

bio2zarr/bed2zarr.py

Lines changed: 107 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -102,14 +102,14 @@ class Contig:
102102

103103

104104
@dataclasses.dataclass
105-
class Feature:
105+
class Name:
106106
id: str
107107

108108

109109
@dataclasses.dataclass
110110
class BedMetadata(core.JsonDataclass):
111111
contigs: list
112-
features: list
112+
names: list
113113
fields: list
114114
bed_type: BedType
115115
num_records: int = -1
@@ -120,16 +120,16 @@ def num_contigs(self):
120120
return len(self.contigs)
121121

122122
@property
123-
def num_features(self):
124-
return len(self.features)
123+
def num_names(self):
124+
return len(self.names)
125125

126126

127127
@dataclasses.dataclass
128128
class BedZarrSchema(core.JsonDataclass):
129129
format_version: str
130130
fields: list
131131
contigs: list
132-
features: list
132+
names: list
133133
bed_type: str
134134
records_chunk_size: int
135135

@@ -147,52 +147,54 @@ def fromdict(d):
147147
)
148148
ret = BedZarrSchema(**d)
149149
ret.fields = [ZarrArraySpec(**sd) for sd in d["fields"]]
150-
ret.contigs = [Contig(**sd) for sd in d["contigs"]]
151-
ret.features = [Feature(**sd) for sd in d["features"]]
150+
ret.contig_ids = [Contig(**sd) for sd in d["contigs"]]
151+
ret.name_ids = [Name(**sd) for sd in d["names"]]
152152
return ret
153153

154154
@staticmethod
155155
def fromjson(s):
156156
return BedZarrSchema.fromdict(json.loads(s))
157157

158158
@staticmethod
159-
def generate(num_records, bed_type, fields, contigs, features, records_chunk_size=None):
159+
def generate(metadata, records_chunk_size=None):
160160
if records_chunk_size is None:
161161
records_chunk_size = 1000
162162
logger.info("Generating schema with chunks=%d", records_chunk_size)
163+
fields = metadata.fields
163164

164165
def spec_from_field(field):
165166
return ZarrArraySpec.from_field(
166167
field,
167-
num_records=num_records,
168+
num_records=metadata.num_records,
168169
records_chunk_size=records_chunk_size,
169170
)
170171

171172
specs = [spec_from_field(f) for f in fields]
172173

173-
contig_spec = ZarrArraySpec.new(
174+
# Contig_id and name_id specs unnecessary?
175+
contig_id_spec = ZarrArraySpec.new(
174176
name="contig_id",
175177
dtype="O",
176-
shape=[len(contigs)],
177-
chunks=[len(contigs)],
178+
shape=[metadata.num_contigs],
179+
chunks=[metadata.num_contigs],
178180
dimensions=["contigs"],
179181
description="Contig ID",
180182
)
181-
feature_spec = ZarrArraySpec.new(
182-
name="feature_id",
183+
name_id_spec = ZarrArraySpec.new(
184+
name="name_id",
183185
dtype="O",
184-
shape=[len(features)],
185-
chunks=[len(features)],
186-
dimensions=["features"],
187-
description="Feature ID",
186+
shape=[metadata.num_names],
187+
chunks=[metadata.num_names],
188+
dimensions=["names"],
189+
description="Name ID",
188190
)
189-
fields.extend([contig_spec, feature_spec])
191+
specs.extend([contig_id_spec, name_id_spec])
190192
return BedZarrSchema(
191193
format_version=ZARR_SCHEMA_FORMAT_VERSION,
192194
fields=specs,
193-
contigs=contigs,
194-
features=features,
195-
bed_type=bed_type.name,
195+
contigs=metadata.contigs,
196+
names=metadata.names,
197+
bed_type=metadata.bed_type.name,
196198
records_chunk_size=records_chunk_size,
197199
)
198200

@@ -206,17 +208,17 @@ class BedFieldSummary(core.JsonDataclass):
206208
min_value: Any = math.inf
207209

208210
def update(self, other):
209-
self.num_chunks = array.nchunks
210-
self.compressed_size = array.nbytes
211-
self.uncompressed_size = array.nbytes
211+
self.num_chunks = other.num_chunks
212+
self.compressed_size = other.compressed_size
213+
self.uncompressed_size = other.uncompressed_size
212214
self.min_value = min(self.min_value, other.min_value)
213215
self.max_value = max(self.max_value, other.max_value)
214216

215217
@staticmethod
216218
def fromdict(d):
217219
return BedFieldSummary(**d)
218220

219-
221+
220222
@dataclasses.dataclass
221223
class BedField:
222224
category: str
@@ -227,7 +229,8 @@ class BedField:
227229
summary: BedFieldSummary
228230

229231
def smallest_dtype(self):
230-
"""Return the smallest dtype suitable for this field based on type and values"""
232+
"""Return the smallest dtype suitable for this field based on
233+
type and values"""
231234
s = self.summary
232235
if self.bed_dtype == "Integer":
233236
if not math.isfinite(s.max_value):
@@ -255,6 +258,8 @@ def guess_bed_file_type(path):
255258
raise ValueError(f"Expected at most 12 columns in BED file, got {num_cols}")
256259
if num_cols in (10, 11):
257260
raise ValueError(f"BED10 and BED11 are prohibited, got {num_cols} columns")
261+
if num_cols in (9, 12):
262+
raise ValueError("BED9 and BED12 are valid but currently unsupported formats")
258263
return BedType(num_cols)
259264

260265

@@ -266,7 +271,7 @@ class BedZarrWriterMetadata(core.JsonDataclass):
266271
format_version: str
267272
fields: list
268273
contigs: list
269-
features: list
274+
names: list
270275
bed_type: str
271276
schema: BedZarrSchema
272277

@@ -319,7 +324,7 @@ def init(
319324
bed_type=self.bed_metadata.bed_type.name,
320325
fields=self.fields,
321326
contigs=self.bed_metadata.contigs,
322-
features=self.bed_metadata.features,
327+
names=self.bed_metadata.names,
323328
schema=schema,
324329
)
325330
self.path.mkdir()
@@ -337,14 +342,14 @@ def write(self):
337342
"source": f"bio2zarr-{provenance.__version__}",
338343
}
339344
)
340-
d = {i: f.name for i, f in enumerate(self.schema.fields)}
345+
datafields = self.schema.fields[0 : BedType[self.schema.bed_type].value]
346+
d = {i: f.name for i, f in enumerate(datafields)}
341347
self.data.rename(columns=d, inplace=True)
342-
self.encode_fields(root)
348+
self.encode_fields(root, datafields)
343349
self.encode_contig_id(root)
344-
self.encode_feature_id(root)
350+
self.encode_name_id(root)
345351

346352
def encode_contig_id(self, root):
347-
print(self.schema.contigs)
348353
array = root.array(
349354
"contig_id",
350355
[contig.id for contig in self.schema.contigs],
@@ -353,28 +358,29 @@ def encode_contig_id(self, root):
353358
)
354359
array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
355360

356-
357-
def encode_feature_id(self, root):
361+
def encode_name_id(self, root):
358362
array = root.array(
359-
"feature_id",
360-
[feature.id for feature in self.schema.features],
363+
"name_id",
364+
[name.id for name in self.schema.names],
361365
dtype="str",
362366
compressor=DEFAULT_ZARR_COMPRESSOR,
363367
)
364-
array.attrs["_ARRAY_DIMENSIONS"] = ["features"]
368+
array.attrs["_ARRAY_DIMENSIONS"] = ["names"]
365369

366-
367370
def finalise(self):
371+
self.load_metadata()
368372
logger.debug("Removing %s", self.wip_path)
369373
shutil.rmtree(self.wip_path)
374+
logger.info("Consolidating Zarr metadata")
375+
zarr.consolidate_metadata(self.path)
370376

371377
def load_metadata(self):
372378
if self.metadata is None:
373379
with open(self.wip_path / "metadata.json") as f:
374380
self.metadata = BedZarrWriterMetadata.fromdict(json.load(f))
375381

376-
def encode_fields(self, root):
377-
for field in self.schema.fields:
382+
def encode_fields(self, root, datafields):
383+
for field in datafields:
378384
object_codec = None
379385
if field.dtype == "O":
380386
object_codec = numcodecs.VLenUTF8()
@@ -405,8 +411,8 @@ def make_field_def(name, alias, bed_dtype, description=""):
405411

406412
fields = [
407413
make_field_def("contig", "chrom", "Category", "Chromosome name"),
408-
make_field_def("start", "chromStart", "Integer", "Feature start position"),
409-
make_field_def("end", "chromEnd", "Integer", "Feature end position"),
414+
make_field_def("start", "chromStart", "Integer", "Name start position"),
415+
make_field_def("end", "chromEnd", "Integer", "Name end position"),
410416
]
411417
return fields
412418

@@ -423,12 +429,10 @@ def make_field_def(name, alias, bed_dtype, description=""):
423429
)
424430

425431
fields = [
426-
make_field_def("name", "name", "Category", "Feature description"),
432+
make_field_def("name", "name", "Category", "Name description"),
427433
make_field_def("score", "score", "Integer", "A numerical value"),
428-
make_field_def("strand", "strand", "Character", "Feature strand"),
429-
make_field_def(
430-
"thickStart", "thickStart", "Integer", "Thick start position"
431-
),
434+
make_field_def("strand", "strand", "Character", "Name strand"),
435+
make_field_def("thickStart", "thickStart", "Integer", "Thick start position"),
432436
make_field_def("thickEnd", "thickEnd", "Integer", "Thick end position"),
433437
make_field_def("itemRgb", "itemRgb", "Integer", "Display"),
434438
make_field_def("blockCount", "blockCount", "Integer", "Number of blocks"),
@@ -447,11 +451,41 @@ def mkfields(bed_type):
447451
return mandatory + optional
448452

449453

450-
def read_bed(bed_path, bed_type):
451-
"""Read BED file."""
452-
fields = mkfields(bed_type)
453-
data = pd.read_table(bed_path, header=None,
454-
names=[f.name for f in fields])
454+
def parse_bed(bed_path, records_chunk_size=None):
455+
"""Read and parse BED file and return data frame and metadata."""
456+
bed_type = guess_bed_file_type(bed_path)
457+
fields = mkfields(bed_type)
458+
data = pd.read_table(bed_path, header=None, names=[f.name for f in fields])
459+
data, contig_id, name_id = encode_categoricals(data, bed_type)
460+
data = parse_csv_fields(data, bed_type)
461+
fields = update_field_bounds(bed_type, data)
462+
metadata = BedMetadata(
463+
contigs=[Contig(c) for c in contig_id],
464+
names=[Name(f) for f in name_id],
465+
fields=fields,
466+
bed_type=bed_type,
467+
num_records=data.shape[0],
468+
records_chunk_size=records_chunk_size,
469+
)
470+
return data, metadata
471+
472+
473+
def parse_csv_fields(data, bed_type):
474+
if bed_type.value < BedType.BED8.value:
475+
return data
476+
477+
def _convert_csv_data(values):
478+
if values.dtype == "O":
479+
ret = values.str.split(",").apply(lambda x: np.array([int(y) for y in x]))
480+
else:
481+
ret = values
482+
return ret
483+
484+
if bed_type.value >= BedType.BED9.value:
485+
data["itemRgb"] = _convert_csv_data(data["itemRgb"])
486+
if bed_type.value == BedType.BED12.value:
487+
data["blockSizes"] = _convert_csv_data(data["blockSizes"])
488+
data["blockStarts"] = _convert_csv_data(data["blockStarts"])
455489
return data
456490

457491

@@ -463,14 +497,14 @@ def encode_categoricals(data, bed_type):
463497
contig_id = contig.categories.values
464498
data["contig"] = contig.codes
465499
if bed_type.value >= BedType.BED4.value:
466-
feature = pd.Categorical(
500+
name = pd.Categorical(
467501
data["name"], categories=data["name"].unique(), ordered=True
468502
)
469-
feature_id = feature.categories.values
470-
data["name"] = feature.codes
503+
name_id = name.categories.values
504+
data["name"] = name.codes
471505
else:
472-
feature_id = None
473-
return data, contig_id, feature_id
506+
name_id = []
507+
return data, contig_id, name_id
474508

475509

476510
def update_field_bounds(bed_type, data):
@@ -481,10 +515,7 @@ def update_field_bounds(bed_type, data):
481515
if f.bed_dtype == "Integer":
482516
if f.name in ("itemRgb", "blockSizes", "blockStarts"):
483517
if data[f.name].dtype == "O":
484-
values = np.concatenate(
485-
data[f.name].str.split(",").apply(
486-
lambda x: np.array([int(y) for y in x])
487-
))
518+
values = np.concatenate(data[f.name])
488519
else:
489520
values = data[f.name]
490521
f.summary.min_value = min(values)
@@ -499,16 +530,19 @@ def update_field_bounds(bed_type, data):
499530
return ret
500531

501532

502-
def mkschema(schema_path, metadata):
503-
"""Read schema or make schema from fields."""
533+
def mkschema(bed_path, out):
534+
"""Write schema to file"""
535+
_, metadata = parse_bed(bed_path)
536+
spec = BedZarrSchema.generate(
537+
metadata, records_chunk_size=metadata.records_chunk_size
538+
)
539+
out.write(spec.asjson())
540+
541+
542+
def init_schema(schema_path, metadata):
504543
if schema_path is None:
505544
schema = BedZarrSchema.generate(
506-
num_records=metadata.num_records,
507-
bed_type=metadata.bed_type,
508-
fields=metadata.fields,
509-
contigs=metadata.contigs,
510-
features=metadata.features,
511-
records_chunk_size=metadata.records_chunk_size,
545+
metadata, records_chunk_size=metadata.records_chunk_size
512546
)
513547
else:
514548
logger.info("Reading schema from %s", schema_path)
@@ -520,28 +554,15 @@ def mkschema(schema_path, metadata):
520554
schema = BedZarrSchema.fromjson(f.read())
521555
return schema
522556

557+
523558
def bed2zarr(
524559
bed_path,
525560
zarr_path,
526561
schema_path=None,
527562
records_chunk_size=None,
528563
):
529-
bed_type = guess_bed_file_type(bed_path)
530-
data = read_bed(bed_path, bed_type)
531-
data, contig_id, feature_id = encode_categoricals(data, bed_type)
532-
fields = update_field_bounds(bed_type, data)
533-
metadata = BedMetadata(
534-
contigs=[Contig(c) for c in contig_id],
535-
features=[Feature(f) for f in feature_id],
536-
fields=fields,
537-
bed_type=bed_type,
538-
num_records=data.shape[0],
539-
records_chunk_size=records_chunk_size,
540-
)
541-
schema = mkschema(
542-
schema_path,
543-
metadata,
544-
)
564+
data, metadata = parse_bed(bed_path, records_chunk_size)
565+
schema = init_schema(schema_path, metadata)
545566
bedzw = BedZarrWriter(zarr_path)
546567
bedzw.init(
547568
data=data,

0 commit comments

Comments (0)