Skip to content

Commit bae2f43

Browse files
Merge pull request #84 from jeromekelleher/rename-width-length
Rename width length
2 parents 20e8c58 + 7b32b35 commit bae2f43

File tree

7 files changed

+208
-186
lines changed

7 files changed

+208
-186
lines changed

bio2zarr/cli.py

Lines changed: 29 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -14,18 +14,19 @@
1414
"-p", "--worker-processes", type=int, default=1, help="Number of worker processes"
1515
)
1616

17-
# TODO help text
18-
chunk_length = click.option(
17+
# Note: -l and -w were chosen when these were called "length" and "width".
18+
# Possibly there are better letters now.
19+
variants_chunk_size = click.option(
1920
"-l",
20-
"--chunk-length",
21+
"--variants-chunk-size",
2122
type=int,
2223
default=None,
2324
help="Chunk size in the variants dimension",
2425
)
2526

26-
chunk_width = click.option(
27+
samples_chunk_size = click.option(
2728
"-w",
28-
"--chunk-width",
29+
"--samples-chunk-size",
2930
type=int,
3031
default=None,
3132
help="Chunk size in the samples dimension",
@@ -96,8 +97,8 @@ def mkschema(if_path):
9697
@click.argument("zarr_path", type=click.Path())
9798
@verbose
9899
@click.option("-s", "--schema", default=None, type=click.Path(exists=True))
99-
@chunk_length
100-
@chunk_width
100+
@variants_chunk_size
101+
@samples_chunk_size
101102
@click.option(
102103
"-V",
103104
"--max-variant-chunks",
@@ -122,8 +123,8 @@ def encode(
122123
zarr_path,
123124
verbose,
124125
schema,
125-
chunk_length,
126-
chunk_width,
126+
variants_chunk_size,
127+
samples_chunk_size,
127128
max_variant_chunks,
128129
max_memory,
129130
worker_processes,
@@ -136,8 +137,8 @@ def encode(
136137
if_path,
137138
zarr_path,
138139
schema,
139-
chunk_length=chunk_length,
140-
chunk_width=chunk_width,
140+
variants_chunk_size=variants_chunk_size,
141+
samples_chunk_size=samples_chunk_size,
141142
max_v_chunks=max_variant_chunks,
142143
worker_processes=worker_processes,
143144
max_memory=max_memory,
@@ -148,20 +149,22 @@ def encode(
148149
@click.command(name="convert")
149150
@click.argument("vcfs", nargs=-1, required=True)
150151
@click.argument("out_path", type=click.Path())
151-
@chunk_length
152-
@chunk_width
152+
@variants_chunk_size
153+
@samples_chunk_size
153154
@verbose
154155
@worker_processes
155-
def convert_vcf(vcfs, out_path, chunk_length, chunk_width, verbose, worker_processes):
156+
def convert_vcf(
157+
vcfs, out_path, variants_chunk_size, samples_chunk_size, verbose, worker_processes
158+
):
156159
"""
157160
Convert input VCF(s) directly to vcfzarr (not recommended for large files)
158161
"""
159162
setup_logging(verbose)
160163
vcf.convert(
161164
vcfs,
162165
out_path,
163-
chunk_length=chunk_length,
164-
chunk_width=chunk_width,
166+
variants_chunk_size=variants_chunk_size,
167+
samples_chunk_size=samples_chunk_size,
165168
show_progress=True,
166169
worker_processes=worker_processes,
167170
)
@@ -198,10 +201,15 @@ def vcf2zarr():
198201
@click.argument("out_path", type=click.Path())
199202
@worker_processes
200203
@verbose
201-
@chunk_length
202-
@chunk_width
204+
@variants_chunk_size
205+
@samples_chunk_size
203206
def convert_plink(
204-
in_path, out_path, verbose, worker_processes, chunk_length, chunk_width
207+
in_path,
208+
out_path,
209+
verbose,
210+
worker_processes,
211+
variants_chunk_size,
212+
samples_chunk_size,
205213
):
206214
"""
207215
In development; DO NOT USE!
@@ -212,8 +220,8 @@ def convert_plink(
212220
out_path,
213221
show_progress=True,
214222
worker_processes=worker_processes,
215-
chunk_width=chunk_width,
216-
chunk_length=chunk_length,
223+
samples_chunk_size=samples_chunk_size,
224+
variants_chunk_size=variants_chunk_size,
217225
)
218226

219227

bio2zarr/core.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -86,11 +86,11 @@ def __init__(self, array, offset):
8686
self.buffer_row = 0
8787

8888
@property
89-
def chunk_length(self):
89+
def variants_chunk_size(self):
9090
return self.buff.shape[0]
9191

9292
def next_buffer_row(self):
93-
if self.buffer_row == self.chunk_length:
93+
if self.buffer_row == self.variants_chunk_size:
9494
self.flush()
9595
row = self.buffer_row
9696
self.buffer_row += 1
@@ -112,7 +112,7 @@ def flush(self):
112112
f"{self.array_offset}:{self.array_offset + self.buffer_row}"
113113
f"{self.buff.nbytes / 2**20: .2f}Mb"
114114
)
115-
self.array_offset += self.chunk_length
115+
self.array_offset += self.variants_chunk_size
116116
self.buffer_row = 0
117117

118118

@@ -126,13 +126,13 @@ def sync_flush_2d_array(np_buffer, zarr_array, offset):
126126
# incremental, and to avoid large memcopies in the underlying
127127
# encoder implementations.
128128
s = slice(offset, offset + np_buffer.shape[0])
129-
chunk_width = zarr_array.chunks[1]
129+
samples_chunk_size = zarr_array.chunks[1]
130130
# TODO use zarr chunks here to support non-uniform chunking later
131131
# and for simplicity
132132
zarr_array_width = zarr_array.shape[1]
133133
start = 0
134134
while start < zarr_array_width:
135-
stop = min(start + chunk_width, zarr_array_width)
135+
stop = min(start + samples_chunk_size, zarr_array_width)
136136
chunk_buffer = np_buffer[:, start:stop]
137137
zarr_array[s, start:stop] = chunk_buffer
138138
update_progress(chunk_buffer.nbytes)

bio2zarr/plink.py

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,14 @@ def encode_genotypes_slice(bed_path, zarr_path, start, stop):
2222
gt = core.BufferedArray(root["call_genotype"], start)
2323
gt_mask = core.BufferedArray(root["call_genotype_mask"], start)
2424
gt_phased = core.BufferedArray(root["call_genotype_phased"], start)
25-
chunk_length = gt.array.chunks[0]
25+
variants_chunk_size = gt.array.chunks[0]
2626
n = gt.array.shape[1]
27-
assert start % chunk_length == 0
27+
assert start % variants_chunk_size == 0
2828

2929
logger.debug(f"Reading slice {start}:{stop}")
3030
chunk_start = start
3131
while chunk_start < stop:
32-
chunk_stop = min(chunk_start + chunk_length, stop)
32+
chunk_stop = min(chunk_start + variants_chunk_size, stop)
3333
logger.debug(f"Reading bed slice {chunk_start}:{chunk_stop}")
3434
bed_chunk = bed.read(slice(chunk_start, chunk_stop), dtype=np.int8).T
3535
logger.debug(f"Got bed slice {humanfriendly.format_size(bed_chunk.nbytes)}")
@@ -60,34 +60,34 @@ def convert(
6060
*,
6161
show_progress=False,
6262
worker_processes=1,
63-
chunk_length=None,
64-
chunk_width=None,
63+
variants_chunk_size=None,
64+
samples_chunk_size=None,
6565
):
6666
bed = bed_reader.open_bed(bed_path, num_threads=1)
6767
n = bed.iid_count
6868
m = bed.sid_count
6969
logging.info(f"Scanned plink with {n} samples and {m} variants")
7070

7171
# FIXME
72-
if chunk_width is None:
73-
chunk_width = 1000
74-
if chunk_length is None:
75-
chunk_length = 10_000
72+
if samples_chunk_size is None:
73+
samples_chunk_size = 1000
74+
if variants_chunk_size is None:
75+
variants_chunk_size = 10_000
7676

7777
store = zarr.DirectoryStore(zarr_path)
7878
root = zarr.group(store=store, overwrite=True)
7979

8080
ploidy = 2
8181
shape = [m, n]
82-
chunks = [chunk_length, chunk_width]
82+
chunks = [variants_chunk_size, samples_chunk_size]
8383
dimensions = ["variants", "samples"]
8484

8585
a = root.array(
8686
"sample_id",
8787
bed.iid,
8888
dtype="str",
8989
compressor=core.default_compressor,
90-
chunks=(chunk_width,),
90+
chunks=(samples_chunk_size,),
9191
)
9292
a.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
9393
logger.debug(f"Encoded samples")
@@ -99,7 +99,7 @@ def convert(
9999
bed.bp_position,
100100
dtype=np.int32,
101101
compressor=core.default_compressor,
102-
chunks=(chunk_length,),
102+
chunks=(variants_chunk_size,),
103103
)
104104
a.attrs["_ARRAY_DIMENSIONS"] = ["variants"]
105105
logger.debug(f"encoded variant_position")
@@ -110,7 +110,7 @@ def convert(
110110
alleles,
111111
dtype="str",
112112
compressor=core.default_compressor,
113-
chunks=(chunk_length,),
113+
chunks=(variants_chunk_size,),
114114
)
115115
a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"]
116116
logger.debug(f"encoded variant_allele")

bio2zarr/vcf.py

Lines changed: 26 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1063,8 +1063,8 @@ def __post_init__(self):
10631063
@dataclasses.dataclass
10641064
class ZarrConversionSpec:
10651065
format_version: str
1066-
chunk_width: int
1067-
chunk_length: int
1066+
samples_chunk_size: int
1067+
variants_chunk_size: int
10681068
dimensions: list
10691069
sample_id: list
10701070
contig_id: list
@@ -1091,15 +1091,17 @@ def fromjson(s):
10911091
return ZarrConversionSpec.fromdict(json.loads(s))
10921092

10931093
@staticmethod
1094-
def generate(pcvcf, chunk_length=None, chunk_width=None):
1094+
def generate(pcvcf, variants_chunk_size=None, samples_chunk_size=None):
10951095
m = pcvcf.num_records
10961096
n = pcvcf.num_samples
10971097
# FIXME
1098-
if chunk_width is None:
1099-
chunk_width = 1000
1100-
if chunk_length is None:
1101-
chunk_length = 10_000
1102-
logger.info(f"Generating schema with chunks={chunk_length, chunk_width}")
1098+
if samples_chunk_size is None:
1099+
samples_chunk_size = 1000
1100+
if variants_chunk_size is None:
1101+
variants_chunk_size = 10_000
1102+
logger.info(
1103+
f"Generating schema with chunks={variants_chunk_size, samples_chunk_size}"
1104+
)
11031105
compressor = core.default_compressor.get_config()
11041106

11051107
def fixed_field_spec(
@@ -1112,7 +1114,7 @@ def fixed_field_spec(
11121114
shape=shape,
11131115
description="",
11141116
dimensions=dimensions,
1115-
chunks=[chunk_length],
1117+
chunks=[variants_chunk_size],
11161118
compressor=compressor,
11171119
)
11181120

@@ -1170,11 +1172,11 @@ def fixed_field_spec(
11701172
shape = [m]
11711173
prefix = "variant_"
11721174
dimensions = ["variants"]
1173-
chunks = [chunk_length]
1175+
chunks = [variants_chunk_size]
11741176
if field.category == "FORMAT":
11751177
prefix = "call_"
11761178
shape.append(n)
1177-
chunks.append(chunk_width),
1179+
chunks.append(samples_chunk_size),
11781180
dimensions.append("samples")
11791181
# TODO make an option to add in the empty extra dimension
11801182
if field.summary.max_number > 1:
@@ -1196,7 +1198,7 @@ def fixed_field_spec(
11961198
if gt_field is not None:
11971199
ploidy = gt_field.summary.max_number - 1
11981200
shape = [m, n]
1199-
chunks = [chunk_length, chunk_width]
1201+
chunks = [variants_chunk_size, samples_chunk_size]
12001202
dimensions = ["variants", "samples"]
12011203

12021204
colspecs.append(
@@ -1241,8 +1243,8 @@ def fixed_field_spec(
12411243
return ZarrConversionSpec(
12421244
# TODO do something systematic
12431245
format_version="0.1",
1244-
chunk_width=chunk_width,
1245-
chunk_length=chunk_length,
1246+
samples_chunk_size=samples_chunk_size,
1247+
variants_chunk_size=variants_chunk_size,
12461248
columns={col.name: col for col in colspecs},
12471249
dimensions=["variants", "samples", "ploidy", "alleles", "filters"],
12481250
sample_id=pcvcf.metadata.samples,
@@ -1431,7 +1433,7 @@ def encode_samples(self):
14311433
self.schema.sample_id,
14321434
dtype="str",
14331435
compressor=core.default_compressor,
1434-
chunks=(self.schema.chunk_width,),
1436+
chunks=(self.schema.samples_chunk_size,),
14351437
)
14361438
array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
14371439
logger.debug("Samples done")
@@ -1639,8 +1641,8 @@ def encode(
16391641
if_path,
16401642
zarr_path,
16411643
schema_path=None,
1642-
chunk_length=None,
1643-
chunk_width=None,
1644+
variants_chunk_size=None,
1645+
samples_chunk_size=None,
16441646
max_v_chunks=None,
16451647
max_memory=None,
16461648
worker_processes=1,
@@ -1650,12 +1652,12 @@ def encode(
16501652
if schema_path is None:
16511653
schema = ZarrConversionSpec.generate(
16521654
pcvcf,
1653-
chunk_length=chunk_length,
1654-
chunk_width=chunk_width,
1655+
variants_chunk_size=variants_chunk_size,
1656+
samples_chunk_size=samples_chunk_size,
16551657
)
16561658
else:
16571659
logger.info(f"Reading schema from {schema_path}")
1658-
if chunk_length is not None or chunk_width is not None:
1660+
if variants_chunk_size is not None or samples_chunk_size is not None:
16591661
raise ValueError("Cannot specify schema along with chunk sizes")
16601662
with open(schema_path, "r") as f:
16611663
schema = ZarrConversionSpec.fromjson(f.read())
@@ -1678,8 +1680,8 @@ def convert(
16781680
vcfs,
16791681
out_path,
16801682
*,
1681-
chunk_length=None,
1682-
chunk_width=None,
1683+
variants_chunk_size=None,
1684+
samples_chunk_size=None,
16831685
worker_processes=1,
16841686
show_progress=False,
16851687
# TODO add arguments to control location of tmpdir
@@ -1694,8 +1696,8 @@ def convert(
16941696
encode(
16951697
if_dir,
16961698
out_path,
1697-
chunk_length=chunk_length,
1698-
chunk_width=chunk_width,
1699+
variants_chunk_size=variants_chunk_size,
1700+
samples_chunk_size=samples_chunk_size,
16991701
worker_processes=worker_processes,
17001702
show_progress=show_progress,
17011703
)

0 commit comments

Comments
 (0)