5
5
import numpy as np
6
6
import zarr
7
7
8
+ from bio2zarr import constants , schema , writer
9
+
8
10
logger = logging .getLogger (__name__ )
9
11
10
12
@@ -21,7 +23,6 @@ def iter_alleles(self, start, stop, num_alleles):
21
23
ref_field = self .bed .allele_1
22
24
alt_field = self .bed .allele_2
23
25
24
- # TODO - should be doing whole chunks rather than one at a time
25
26
for ref , alt in zip (
26
27
ref_field [start :stop ],
27
28
alt_field [start :stop ],
@@ -49,10 +50,6 @@ def iter_genotypes(self, shape, start, stop):
49
50
yield gt , phased
50
51
51
52
52
- # Import here to avoid circular import
53
- from bio2zarr import constants , schema , writer # noqa: E402
54
-
55
-
56
53
def generate_schema (
57
54
bed ,
58
55
variants_chunk_size = None ,
@@ -147,7 +144,7 @@ def convert(
147
144
samples_chunk_size = samples_chunk_size ,
148
145
)
149
146
zarr_path = pathlib .Path (zarr_path )
150
- vzw = writer .VcfZarrWriter ("plink" , zarr_path )
147
+ vzw = writer .VcfZarrWriter (PlinkFormat , zarr_path )
151
148
# Rough heuristic to split work up enough to keep utilisation high
152
149
target_num_partitions = max (1 , worker_processes * 4 )
153
150
vzw .init (
@@ -168,146 +165,6 @@ def convert(
168
165
# vzw.create_index()
169
166
170
167
171
- # def encode_genotypes_slice(bed_path, zarr_path, start, stop):
172
- # # We need to count the A2 alleles here if we want to keep the
173
- # # alleles reported as allele_1, allele_2. It's obvious here what
174
- # # the correct approach is, but it is important to note that the
175
- # # 0th allele is *not* necessarily the REF for these datasets.
176
- # bed = bed_reader.open_bed(bed_path, num_threads=1, count_A1=False)
177
- # root = zarr.open(store=zarr_path, mode="a", **ZARR_FORMAT_KWARGS)
178
- # gt = core.BufferedArray(root["call_genotype"], start)
179
- # gt_mask = core.BufferedArray(root["call_genotype_mask"], start)
180
- # gt_phased = core.BufferedArray(root["call_genotype_phased"], start)
181
- # variants_chunk_size = gt.array.chunks[0]
182
- # assert start % variants_chunk_size == 0
183
-
184
- # logger.debug(f"Reading slice {start}:{stop}")
185
- # chunk_start = start
186
- # while chunk_start < stop:
187
- # chunk_stop = min(chunk_start + variants_chunk_size, stop)
188
- # logger.debug(f"Reading bed slice {chunk_start}:{chunk_stop}")
189
- # bed_chunk = bed.read(slice(chunk_start, chunk_stop), dtype=np.int8).T
190
- # logger.debug(f"Got bed slice {humanfriendly.format_size(bed_chunk.nbytes)}")
191
- # # Probably should do this without iterating over rows, but it's a bit
192
- # # simpler and lines up better with the array buffering API. The bottleneck
193
- # # is in the encoding anyway.
194
- # for values in bed_chunk:
195
- # j = gt.next_buffer_row()
196
- # g = np.zeros_like(gt.buff[j])
197
- # g[values == -127] = -1
198
- # g[values == 2] = 1
199
- # g[values == 1, 0] = 1
200
- # gt.buff[j] = g
201
- # j = gt_phased.next_buffer_row()
202
- # gt_phased.buff[j] = False
203
- # j = gt_mask.next_buffer_row()
204
- # gt_mask.buff[j] = gt.buff[j] == -1
205
- # chunk_start = chunk_stop
206
- # gt.flush()
207
- # gt_phased.flush()
208
- # gt_mask.flush()
209
- # logger.debug(f"GT slice {start}:{stop} done")
210
-
211
- # root = zarr.open_group(store=zarr_path, mode="w", **ZARR_FORMAT_KWARGS)
212
-
213
- # ploidy = 2
214
- # shape = [m, n]
215
- # chunks = [variants_chunk_size, samples_chunk_size]
216
- # dimensions = ["variants", "samples"]
217
-
218
- # # TODO we should be reusing some logic from vcfzarr here on laying
219
- # # out the basic dataset, and using the schema generator. Currently
220
- # # we're not using the best Blosc settings for genotypes here.
221
- # default_compressor = numcodecs.Blosc(cname="zstd", clevel=7)
222
-
223
- # a = root.array(
224
- # "sample_id",
225
- # data=bed.iid,
226
- # shape=bed.iid.shape,
227
- # dtype="str",
228
- # compressor=default_compressor,
229
- # chunks=(samples_chunk_size,),
230
- # )
231
- # a.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
232
- # logger.debug("Encoded samples")
233
-
234
- # # TODO encode these in slices - but read them in one go to avoid
235
- # # fetching repeatedly from bim file
236
- # a = root.array(
237
- # "variant_position",
238
- # data=bed.bp_position,
239
- # shape=bed.bp_position.shape,
240
- # dtype=np.int32,
241
- # compressor=default_compressor,
242
- # chunks=(variants_chunk_size,),
243
- # )
244
- # a.attrs["_ARRAY_DIMENSIONS"] = ["variants"]
245
- # logger.debug("encoded variant_position")
246
-
247
- # alleles = np.stack([bed.allele_1, bed.allele_2], axis=1)
248
- # a = root.array(
249
- # "variant_allele",
250
- # data=alleles,
251
- # shape=alleles.shape,
252
- # dtype="str",
253
- # compressor=default_compressor,
254
- # chunks=(variants_chunk_size, alleles.shape[1]),
255
- # )
256
- # a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"]
257
- # logger.debug("encoded variant_allele")
258
-
259
- # # TODO remove this?
260
- # a = root.empty(
261
- # name="call_genotype_phased",
262
- # dtype="bool",
263
- # shape=list(shape),
264
- # chunks=list(chunks),
265
- # compressor=default_compressor,
266
- # **ZARR_FORMAT_KWARGS,
267
- # )
268
- # a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
269
-
270
- # shape += [ploidy]
271
- # dimensions += ["ploidy"]
272
- # a = root.empty(
273
- # name="call_genotype",
274
- # dtype="i1",
275
- # shape=list(shape),
276
- # chunks=list(chunks),
277
- # compressor=default_compressor,
278
- # **ZARR_FORMAT_KWARGS,
279
- # )
280
- # a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
281
-
282
- # a = root.empty(
283
- # name="call_genotype_mask",
284
- # dtype="bool",
285
- # shape=list(shape),
286
- # chunks=list(chunks),
287
- # compressor=default_compressor,
288
- # **ZARR_FORMAT_KWARGS,
289
- # )
290
- # a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
291
-
292
- # del bed
293
-
294
- # num_slices = max(1, worker_processes * 4)
295
- # slices = core.chunk_aligned_slices(a, num_slices)
296
-
297
- # total_chunks = sum(a.nchunks for _, a in root.arrays())
298
-
299
- # progress_config = core.ProgressConfig(
300
- # total=total_chunks, title="Convert", units="chunks", show=show_progress
301
- # )
302
- # with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
303
- # for start, stop in slices:
304
- # pwm.submit(encode_genotypes_slice, bed_path, zarr_path, start, stop)
305
-
306
- # # TODO also add atomic swap like VCF. Should be abstracted to
307
- # # share basic code for setting up the variation dataset zarr
308
- # zarr.consolidate_metadata(zarr_path)
309
-
310
-
311
168
# FIXME do this more efficiently - currently reading the whole thing
312
169
# in for convenience, and also comparing call-by-call
313
170
def validate (bed_path , zarr_path ):
0 commit comments