Skip to content

Commit c3b1ef1

Browse files
committed
Add more logging
1 parent ed48856 commit c3b1ef1

File tree

1 file changed

+40
-23
lines changed

1 file changed

+40
-23
lines changed

bio2zarr/vcf.py

Lines changed: 40 additions and 23 deletions
Original file line numberDiff line numberDiff line change
@@ -273,7 +273,7 @@ def scan_vcf(path, target_num_partitions):
273273

274274

275275
def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
276-
logger.info(f"Scanning {len(paths)} VCFs")
276+
logger.info(f"Scanning {len(paths)} VCFs attempting to split into {target_num_partitions} partitions.")
277277
progress_config = core.ProgressConfig(
278278
total=len(paths),
279279
units="files",
@@ -311,6 +311,7 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
311311
key=lambda x: (contig_index_map[x.region.contig], x.region.start)
312312
)
313313
vcf_metadata.partitions = all_partitions
314+
logger.info(f"Scan complete, resulting in {len(all_partitions)} partitions.")
314315
return vcf_metadata, header
315316

316317

@@ -875,6 +876,7 @@ def num_columns(self):
875876
return len(self.columns)
876877

877878
def mkdirs(self):
879+
logger.info(f"Creating {len(self.columns) * self.num_partitions} directories")
878880
self.path.mkdir()
879881
for col in self.columns.values():
880882
col.path.mkdir(parents=True)
@@ -883,14 +885,12 @@ def mkdirs(self):
883885
part_path.mkdir()
884886

885887
def write_metadata(self):
888+
logger.info(f"Writing metadata")
886889
with open(self.path / "metadata.json", "w") as f:
887890
json.dump(self.metadata.asdict(), f, indent=4)
888-
# Write number of partitions in a convenience file for
889-
# workflows
890-
with open(self.path / "num_partitions.txt", "w") as f:
891-
f.write(str(self.num_partitions))
892891

893892
def write_header(self):
893+
logger.info(f"Writing header")
894894
with open(self.path / "header.txt", "w") as f:
895895
f.write(self.vcf_header)
896896

@@ -1012,35 +1012,52 @@ def convert_init(vcfs, out_path, *, num_partitions=1, worker_processes=1, show_p
10121012
return pcvcf
10131013

10141014
def convert_slice(self, start, stop, *, worker_processes=1, show_progress=False, column_chunk_size=16):
1015-
logger.info(
1016-
f"Exploding {self.num_columns} columns {self.metadata.num_records} variants "
1017-
f"{self.num_samples} samples"
1018-
)
10191015
if start < 0:
10201016
raise ValueError(f"start={start} must be non-negative")
10211017
if stop > self.num_partitions:
10221018
raise ValueError(f"stop={stop} must be less than the number of partitions")
1023-
num_records_to_progress = sum([partition.num_records for partition in self.metadata.partitions[start:stop]])
1019+
if start == 0 and stop == self.num_partitions:
1020+
num_records_to_process = self.num_records
1021+
logger.info(
1022+
f"Exploding {self.num_columns} columns {self.metadata.num_records} variants "
1023+
f"{self.num_samples} samples"
1024+
)
1025+
else:
1026+
num_records_to_process = None
1027+
logger.info(
1028+
f"Exploding {self.num_columns} columns {self.num_samples} samples"
1029+
f" from partitions {start} to {stop}"
1030+
)
1031+
10241032
progress_config = core.ProgressConfig(
1025-
total=num_records_to_progress,
1033+
total=num_records_to_process,
10261034
units="vars",
10271035
title="Explode",
10281036
show=show_progress,
10291037
)
1030-
with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
1031-
for j in range(start, stop):
1032-
pwm.submit(
1033-
PickleChunkedVcf.convert_partition,
1034-
self.metadata,
1035-
j,
1036-
self.path,
1037-
column_chunk_size=column_chunk_size,
1038-
)
1039-
for _ in pwm.results_as_completed():
1040-
pass
1038+
if stop-start == 1:
1039+
PickleChunkedVcf.convert_partition(
1040+
self.metadata,
1041+
start,
1042+
self.path,
1043+
column_chunk_size=column_chunk_size,
1044+
)
1045+
else:
1046+
with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
1047+
for j in range(start, stop):
1048+
pwm.submit(
1049+
PickleChunkedVcf.convert_partition,
1050+
self.metadata,
1051+
j,
1052+
self.path,
1053+
column_chunk_size=column_chunk_size,
1054+
)
1055+
for _ in pwm.results_as_completed():
1056+
pass
10411057

10421058
def convert_finalise(self):
1043-
assert not self.metadata.finalised
1059+
if self.metadata.finalised:
1060+
raise ValueError("Already finalised")
10441061

10451062
partition_summaries = self.load_partition_summaries()
10461063
for index, summary in enumerate(partition_summaries):

0 commit comments

Comments (0)