Skip to content

Commit 9840d49

Browse files
Merge pull request #110 from jeromekelleher/update-defaults
Update defaults
2 parents b62bb60 + 0fd66c2 commit 9840d49

File tree

8 files changed

+75
-24
lines changed

8 files changed

+75
-24
lines changed

bio2zarr/cli.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ def list_commands(self, ctx):
7575
"--compressor",
7676
type=click.Choice(["lz4", "zstd"]),
7777
default=None,
78-
help="Codec to use for compressing column chunks",
78+
help="Codec to use for compressing column chunks (Default=zstd)."
7979
)
8080

8181
# Note: -l and -w were chosen when these were called "width" and "length".
@@ -282,7 +282,7 @@ def encode(
282282
worker_processes,
283283
):
284284
"""
285-
Encode intermediate columnar format (see explode) to vcfzarr.
285+
Convert intermediate columnar format to vcfzarr.
286286
"""
287287
setup_logging(verbose)
288288
check_overwrite_dir(zarr_path, force)

bio2zarr/core.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,8 @@ def wait_on_futures(futures):
5050
cancel_futures(futures)
5151
if isinstance(exception, cf.process.BrokenProcessPool):
5252
raise RuntimeError(
53-
"Worker process died: you may have run out of memory") from exception
53+
"Worker process died: you may have run out of memory"
54+
) from exception
5455
else:
5556
raise exception
5657

bio2zarr/vcf.py

Lines changed: 31 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ class VcfPartition:
151151

152152
ICF_METADATA_FORMAT_VERSION = "0.2"
153153
ICF_DEFAULT_COMPRESSOR = numcodecs.Blosc(
154-
cname="lz4", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE
154+
cname="zstd", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE
155155
)
156156

157157

@@ -890,6 +890,15 @@ def num_columns(self):
890890
return len(self.columns)
891891

892892

893+
894+
def mkdir_with_progress(path):
    """
    Create the directory ``path`` and record one unit of progress.

    Missing parent directories are created as well (``parents=True``).
    ``exist_ok`` is left at its default, so (assuming ``path`` is a
    ``pathlib.Path`` -- TODO confirm against callers) an already existing
    ``path`` makes ``mkdir`` raise ``FileExistsError``.

    :param path: Directory to create; must provide a ``mkdir`` method that
        accepts the ``parents`` keyword.
    """
    logger.debug(f"mkdir {path}")
    # NOTE we may have race-conditions here, I'm not sure. Hopefully allowing
    # parents=True will take care of it.
    path.mkdir(parents=True)
    # Report exactly one completed directory to the progress machinery.
    core.update_progress(1)
900+
901+
893902
class IntermediateColumnarFormatWriter:
894903
def __init__(self, path):
895904
self.path = pathlib.Path(path)
@@ -932,7 +941,7 @@ def init(
932941
# dependencies as well.
933942
self.metadata.provenance = {"source": f"bio2zarr-{provenance.__version__}"}
934943

935-
self.mkdirs(worker_processes)
944+
self.mkdirs(worker_processes, show_progress=show_progress)
936945

937946
# Note: this is needed for the current version of the vcfzarr spec, but it's
938947
# probably going to be dropped.
@@ -947,30 +956,30 @@ def init(
947956
json.dump(self.metadata.asdict(), f, indent=4)
948957
return self.num_partitions
949958

950-
def mkdirs(self, worker_processes=1, show_progress=False):
    """
    Create the output directory tree for every field and partition.

    The root (``self.path``) and work-in-progress (``self.wip_path``)
    directories are created synchronously; both calls fail if the
    directory already exists. The per-field directories and their
    ``p<j>`` partition directories are then created through a
    ``core.ParallelWorkManager``.

    :param worker_processes: Passed through to the work manager.
    :param show_progress: Passed as ``show`` to the progress configuration.
    """
    num_dirs = len(self.metadata.fields) * self.num_partitions
    logger.info(f"Creating {num_dirs} directories")
    self.path.mkdir()
    self.wip_path.mkdir()
    # Due to high latency batch system filesystems, we create all the
    # directories in parallel
    progress_config = core.ProgressConfig(
        total=num_dirs,
        units="dirs",
        title="Mkdirs",
        show=show_progress,
    )
    with core.ParallelWorkManager(
        worker_processes=worker_processes, progress_config=progress_config
    ) as manager:
        for field in self.metadata.fields:
            col_path = get_vcf_field_path(self.path, field)
            # Don't bother trying to count the intermediate directories towards
            # progress.
            # exist_ok=True: the partition tasks submitted below also create
            # col_path as a parent, so (assuming tasks may complete out of
            # submission order -- TODO confirm) col_path can already exist
            # by the time this task runs.
            manager.submit(col_path.mkdir, parents=True, exist_ok=True)
            for j in range(self.num_partitions):
                part_path = col_path / f"p{j}"
                manager.submit(mkdir_with_progress, part_path)
974983

975984
def load_partition_summaries(self):
976985
summaries = []
@@ -1499,15 +1508,17 @@ def parse_max_memory(max_memory):
14991508

15001509

15011510
class VcfZarrWriter:
1502-
def __init__(self, path, icf, schema, dimension_separator=None):
    """
    Set up a writer that stores its arrays in a Zarr directory store.

    :param path: Location of the Zarr directory store; converted to a
        ``pathlib.Path``.
    :param icf: Stored as ``self.icf`` for later use.
    :param schema: Stored as ``self.schema`` for later use.
    :param dimension_separator: Chunk key separator kept on the instance.
        ``None`` selects ``"/"``.
    """
    if dimension_separator is None:
        # Default to using nested directories following the Zarr v3 default.
        # This seems to require version 2.17+ to work properly
        dimension_separator = "/"
    self.path = pathlib.Path(path)
    self.icf = icf
    self.schema = schema
    self.dimension_separator = dimension_separator
    self.root = zarr.group(store=zarr.DirectoryStore(self.path))
15081520

15091521
def init_array(self, variable):
1510-
# print("CREATE", variable)
15111522
object_codec = None
15121523
if variable.dtype == "O":
15131524
object_codec = numcodecs.VLenUTF8()
@@ -1519,7 +1530,9 @@ def init_array(self, variable):
15191530
compressor=numcodecs.get_codec(variable.compressor),
15201531
filters=[numcodecs.get_codec(filt) for filt in variable.filters],
15211532
object_codec=object_codec,
1533+
dimension_separator=self.dimension_separator,
15221534
)
1535+
# Dimension names are part of the spec in Zarr v3
15231536
a.attrs["_ARRAY_DIMENSIONS"] = variable.dimensions
15241537

15251538
def get_array(self, name):
@@ -1657,6 +1670,7 @@ def encode_contig_id(self):
16571670
"contig_length",
16581671
self.schema.contig_length,
16591672
dtype=np.int64,
1673+
compressor=DEFAULT_ZARR_COMPRESSOR,
16601674
)
16611675
array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
16621676
return {v: j for j, v in enumerate(self.schema.contig_id)}
@@ -1849,6 +1863,7 @@ def encode(
18491863
variants_chunk_size=None,
18501864
samples_chunk_size=None,
18511865
max_v_chunks=None,
1866+
dimension_separator=None,
18521867
max_memory=None,
18531868
worker_processes=1,
18541869
show_progress=False,
@@ -1872,7 +1887,7 @@ def encode(
18721887
if zarr_path.exists():
18731888
logger.warning(f"Deleting existing {zarr_path}")
18741889
shutil.rmtree(zarr_path)
1875-
vzw = VcfZarrWriter(zarr_path, icf, schema)
1890+
vzw = VcfZarrWriter(zarr_path, icf, schema, dimension_separator=dimension_separator)
18761891
vzw.init()
18771892
vzw.encode(
18781893
max_v_chunks=max_v_chunks,

requirements/development.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,5 @@ pytest-coverage
88
pytest-xdist
99
sgkit
1010
tabulate
11-
tqdm
11+
tqdm
12+
zarr>=2.17

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ include_package_data = True
2828
python_requires = >=3.9
2929
install_requires =
3030
numpy
31-
zarr >= 2.10.0, != 2.11.0, != 2.11.1, != 2.11.2
31+
zarr >= 2.17
3232
click
3333
tabulate
3434
tqdm

tests/test_icf.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -275,6 +275,16 @@ def test_compressor_explode(self, tmp_path, compressor):
275275
icf = self.run_explode(tmp_path, compressor=compressor)
276276
assert icf.metadata.compressor == compressor.get_config()
277277

278+
def test_default_compressor_explode(self, tmp_path):
    """Exploding without a compressor argument records the zstd default."""
    recorded = self.run_explode(tmp_path).metadata.compressor
    assert recorded == vcf.ICF_DEFAULT_COMPRESSOR.get_config()
    assert recorded["cname"] == "zstd"
282+
283+
def test_default_compressor_dexplode(self, tmp_path):
    """The dexplode path without a compressor also records the zstd default."""
    recorded = self.run_dexplode(tmp_path).metadata.compressor
    assert recorded == vcf.ICF_DEFAULT_COMPRESSOR.get_config()
    assert recorded["cname"] == "zstd"
287+
278288
@pytest.mark.parametrize(
279289
"compressor",
280290
[
@@ -340,7 +350,8 @@ def test_empty_chunk_file(self, tmp_path):
340350
with pytest.raises(RuntimeError, match="blosc"):
341351
icf["POS"].values
342352

343-
@pytest.mark.parametrize("length", [10, 100, 200, 210])
353+
# Chunk file is 195 long
354+
@pytest.mark.parametrize("length", [10, 100, 190, 194])
344355
def test_truncated_chunk_file(self, tmp_path, length):
345356
icf_path = tmp_path / "icf"
346357
vcf.explode(icf_path, [self.data_path])
@@ -359,7 +370,7 @@ def test_chunk_incorrect_length(self, tmp_path):
359370
icf_path = tmp_path / "icf"
360371
vcf.explode(icf_path, [self.data_path])
361372
chunk_file = icf_path / "POS" / "p0" / "2"
362-
compressor = numcodecs.Blosc(cname="lz4")
373+
compressor = numcodecs.Blosc(cname="zstd")
363374
with open(chunk_file, "rb") as f:
364375
pkl = compressor.decode(f.read())
365376
x = pickle.loads(pkl)

tests/test_vcf.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,29 @@ def test_exploded_metadata_mismatch(self, tmpdir, icf_path, version):
9999
vcf.IcfMetadata.fromdict(d)
100100

101101

102+
class TestEncodeDimensionSeparator:
    """Check how ``dimension_separator`` affects the chunk layout on disk."""

    @pytest.mark.parametrize("dimension_separator", [None, "/"])
    def test_directories(self, tmp_path, icf_path, dimension_separator):
        # Both None and "/" are expected to give one nested directory level
        # per chunk index.
        out_path = tmp_path / "zarr"
        vcf.encode(icf_path, out_path, dimension_separator=dimension_separator)
        first_chunk = out_path.joinpath("call_genotype", "0", "0", "0")
        assert first_chunk.exists()

    def test_files(self, tmp_path, icf_path):
        # With "." the chunk indexes are joined into a single flat file name.
        out_path = tmp_path / "zarr"
        vcf.encode(icf_path, out_path, dimension_separator=".")
        first_chunk = out_path.joinpath("call_genotype", "0.0.0")
        assert first_chunk.exists()

    @pytest.mark.parametrize("dimension_separator", ["\\", "X", []])
    def test_bad_value(self, tmp_path, icf_path, dimension_separator):
        # Anything other than the supported separators must be rejected.
        out_path = tmp_path / "zarr"
        with pytest.raises(ValueError):
            vcf.encode(icf_path, out_path, dimension_separator=dimension_separator)
123+
124+
102125
class TestDefaultSchema:
103126
def test_format_version(self, schema):
104127
assert schema["format_version"] == vcf.ZARR_SCHEMA_FORMAT_VERSION

tests/test_vcf_examples.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -841,7 +841,7 @@ def test_split_explode(tmp_path):
841841
pcvcf = vcf.IntermediateColumnarFormat(out)
842842
assert pcvcf.columns["POS"].vcf_field.summary.asdict() == {
843843
"num_chunks": 3,
844-
"compressed_size": 630,
844+
"compressed_size": 587,
845845
"uncompressed_size": 1008,
846846
"max_number": 1,
847847
"max_value": 1235237,

0 commit comments

Comments
 (0)