Skip to content

Commit cb4ddc8

Browse files
Merge pull request #96 from jeromekelleher/explode-parition
Explode partition + other ICF refactoring
2 parents d3601c4 + 4f08a57 commit cb4ddc8

17 files changed

+532
-225
lines changed

CHANGELOG.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
1-
# 0.0.2 2024-03-xx
1+
# 0.0.2 2024-03-27
22

33
- Merged 1D and 2D encode steps into one, and changed rate reporting to bytes
44
- Add --max-memory for encode
55
- Change `chunk_width` to `samples_chunk_size` and `chunk_length` to `variants_chunk_size`
6+
- Various updates to the intermediate chunked format, with breaking change to version 0.2
7+
- Add distributed explode commands

bio2zarr/cli.py

Lines changed: 15 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -88,44 +88,40 @@ def explode(vcfs, zarr_path, verbose, worker_processes, column_chunk_size):
8888
@click.argument("vcfs", nargs=-1, required=True)
8989
@click.argument("icf_path", type=click.Path())
9090
@click.argument("num_partitions", type=int)
91+
@column_chunk_size
9192
@verbose
9293
@worker_processes
93-
def dexplode_init(vcfs, icf_path, num_partitions, verbose, worker_processes):
94+
def dexplode_init(
95+
vcfs, icf_path, num_partitions, column_chunk_size, verbose, worker_processes
96+
):
9497
"""
9598
Initial step for parallel conversion of VCF(s) to intermediate columnar format
9699
over the requested number of partitions.
97100
"""
98101
setup_logging(verbose)
99102
num_partitions = vcf.explode_init(
100-
vcfs,
101103
icf_path,
104+
vcfs,
102105
target_num_partitions=num_partitions,
106+
column_chunk_size=column_chunk_size,
103107
worker_processes=worker_processes,
104108
show_progress=True,
105109
)
106110
click.echo(num_partitions)
107111

108112

109113
@click.command
110-
@click.argument("path", type=click.Path(), required=True)
111-
@click.argument("start", type=int)
112-
@click.argument("end", type=int)
114+
@click.argument("icf_path", type=click.Path())
115+
@click.argument("partition", type=int)
113116
@verbose
114-
@worker_processes
115-
@column_chunk_size
116-
def dexplode_slice(path, start, end, verbose, worker_processes, column_chunk_size):
117+
def dexplode_partition(icf_path, partition, verbose):
117118
"""
118-
Convert VCF(s) to intermediate columnar format
119+
Convert a VCF partition into intermediate columnar format. Must be called *after*
120+
the ICF path has been initialised with dexplode_init. Partition indexes must be
121+
from 0 (inclusive) to the number of partitions returned by dexplode_init (exclusive).
119122
"""
120123
setup_logging(verbose)
121-
vcf.explode_slice(
122-
path,
123-
start,
124-
end,
125-
worker_processes=worker_processes,
126-
column_chunk_size=column_chunk_size,
127-
show_progress=True,
128-
)
124+
vcf.explode_partition(icf_path, partition, show_progress=True)
129125

130126

131127
@click.command
@@ -297,7 +293,7 @@ def vcf2zarr():
297293
298294
\b
299295
$ vcf2zarr dexplode-init [VCF_FILE_1] ... [VCF_FILE_N] [ICF_PATH] [NUM_PARTITIONS]
300-
$ vcf2zarr dexplode-slice [ICF_PATH] [START] [STOP]
296+
$ vcf2zarr dexplode-partition [ICF_PATH] [PARTITION_INDEX]
301297
$ vcf2zarr dexplode-finalise [ICF_PATH]
302298
303299
See the online documentation at [FIXME] for more details on distributed explode.
@@ -311,7 +307,7 @@ def vcf2zarr():
311307
vcf2zarr.add_command(mkschema)
312308
vcf2zarr.add_command(encode)
313309
vcf2zarr.add_command(dexplode_init)
314-
vcf2zarr.add_command(dexplode_slice)
310+
vcf2zarr.add_command(dexplode_partition)
315311
vcf2zarr.add_command(dexplode_finalise)
316312
vcf2zarr.add_command(validate)
317313

0 commit comments

Comments
 (0)