Skip to content

Commit e90796b

Browse files
committed
Set Dask's dataframe.convert-string to False to retain previous behaviour
1 parent 11c0a5f commit e90796b

File tree

2 files changed: 104 additions (+104) and 99 deletions (-99)

sgkit/io/bgen/bgen_reader.py

Lines changed: 46 additions & 43 deletions
Original file line number | Diff line number | Diff line change
@@ -273,50 +273,53 @@ def read_bgen(
273273
f"`contig_dtype` must be of string or int type, not {contig_dtype}"
274274
)
275275

276-
path = Path(path)
277-
sample_path = Path(sample_path) if sample_path else path.with_suffix(".sample")
278-
279-
if sample_path.exists():
280-
sample_id = read_samples(sample_path).sample_id.values.astype("U")
281-
else:
282-
sample_id = _default_sample_ids(path)
283-
284-
bgen_reader = BgenReader(path, metafile_path=metafile_path, dtype=gp_dtype)
285-
286-
df = read_metafile(bgen_reader.metafile_path)
287-
if persist:
288-
df = df.persist()
289-
arrs = dataframe_to_dict(df, METAFILE_DTYPE)
290-
291-
variant_id = arrs["id"]
292-
variant_contig: ArrayLike = arrs["chrom"].astype(contig_dtype)
293-
variant_contig, variant_contig_names = encode_contigs(variant_contig)
294-
variant_contig_names = list(variant_contig_names)
295-
variant_position = arrs["pos"]
296-
variant_allele = da.hstack((arrs["a1"][:, np.newaxis], arrs["a2"][:, np.newaxis]))
297-
298-
call_genotype_probability = da.from_array(
299-
bgen_reader,
300-
chunks=chunks,
301-
lock=lock,
302-
fancy=False,
303-
asarray=False,
304-
name=f"{bgen_reader.name}:read_bgen:{path}",
305-
)
306-
call_dosage = _to_dosage(call_genotype_probability)
307-
308-
ds: Dataset = create_genotype_dosage_dataset(
309-
variant_contig_names=variant_contig_names,
310-
variant_contig=variant_contig,
311-
variant_position=variant_position,
312-
variant_allele=variant_allele,
313-
sample_id=sample_id,
314-
call_dosage=call_dosage,
315-
call_genotype_probability=call_genotype_probability,
316-
variant_id=variant_id,
317-
)
276+
with dask.config.set({"dataframe.convert-string": False}):
277+
path = Path(path)
278+
sample_path = Path(sample_path) if sample_path else path.with_suffix(".sample")
318279

319-
return ds
280+
if sample_path.exists():
281+
sample_id = read_samples(sample_path).sample_id.values.astype("U")
282+
else:
283+
sample_id = _default_sample_ids(path)
284+
285+
bgen_reader = BgenReader(path, metafile_path=metafile_path, dtype=gp_dtype)
286+
287+
df = read_metafile(bgen_reader.metafile_path)
288+
if persist:
289+
df = df.persist()
290+
arrs = dataframe_to_dict(df, METAFILE_DTYPE)
291+
292+
variant_id = arrs["id"]
293+
variant_contig: ArrayLike = arrs["chrom"].astype(contig_dtype)
294+
variant_contig, variant_contig_names = encode_contigs(variant_contig)
295+
variant_contig_names = list(variant_contig_names)
296+
variant_position = arrs["pos"]
297+
variant_allele = da.hstack(
298+
(arrs["a1"][:, np.newaxis], arrs["a2"][:, np.newaxis])
299+
)
300+
301+
call_genotype_probability = da.from_array(
302+
bgen_reader,
303+
chunks=chunks,
304+
lock=lock,
305+
fancy=False,
306+
asarray=False,
307+
name=f"{bgen_reader.name}:read_bgen:{path}",
308+
)
309+
call_dosage = _to_dosage(call_genotype_probability)
310+
311+
ds: Dataset = create_genotype_dosage_dataset(
312+
variant_contig_names=variant_contig_names,
313+
variant_contig=variant_contig,
314+
variant_position=variant_position,
315+
variant_allele=variant_allele,
316+
sample_id=sample_id,
317+
call_dosage=call_dosage,
318+
call_genotype_probability=call_genotype_probability,
319+
variant_id=variant_id,
320+
)
321+
322+
return ds
320323

321324

322325
def _default_sample_ids(path: PathType) -> ArrayLike:

sgkit/io/plink/plink_reader.py

Lines changed: 58 additions & 56 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
from pathlib import Path
33
from typing import Any, Dict, List, MutableMapping, Optional, Tuple, Union
44

5+
import dask
56
import dask.array as da
67
import dask.dataframe as dd
78
import numpy as np
@@ -245,64 +246,65 @@ def read_plink(
245246
f"{path}.{ext}" for ext in ["bed", "bim", "fam"]
246247
]
247248

248-
# Load axis data first to determine dimension sizes
249-
df_fam = read_fam(fam_path, sep=fam_sep) # type: ignore[arg-type]
250-
df_bim = read_bim(bim_path, sep=bim_sep) # type: ignore[arg-type]
251-
252-
if persist:
253-
df_fam = df_fam.persist()
254-
df_bim = df_bim.persist()
255-
256-
arr_fam = dataframe_to_dict(df_fam, dtype=FAM_ARRAY_DTYPE)
257-
arr_bim = dataframe_to_dict(df_bim, dtype=BIM_ARRAY_DTYPE)
258-
259-
# Load genotyping data
260-
call_genotype = da.from_array(
261-
# Make sure to use asarray=False in order for masked arrays to propagate
262-
BedReader(bed_path, (len(df_bim), len(df_fam)), count_A1=count_a1), # type: ignore[arg-type]
263-
chunks=chunks,
264-
# Lock must be true with multiprocessing dask scheduler
265-
# to not get bed-reader errors (it works w/ threading backend though)
266-
lock=lock,
267-
asarray=False,
268-
name=f"bed_reader:read_plink:{bed_path}",
269-
)
249+
with dask.config.set({"dataframe.convert-string": False}):
250+
# Load axis data first to determine dimension sizes
251+
df_fam = read_fam(fam_path, sep=fam_sep) # type: ignore[arg-type]
252+
df_bim = read_bim(bim_path, sep=bim_sep) # type: ignore[arg-type]
253+
254+
if persist:
255+
df_fam = df_fam.persist()
256+
df_bim = df_bim.persist()
257+
258+
arr_fam = dataframe_to_dict(df_fam, dtype=FAM_ARRAY_DTYPE)
259+
arr_bim = dataframe_to_dict(df_bim, dtype=BIM_ARRAY_DTYPE)
260+
261+
# Load genotyping data
262+
call_genotype = da.from_array(
263+
# Make sure to use asarray=False in order for masked arrays to propagate
264+
BedReader(bed_path, (len(df_bim), len(df_fam)), count_A1=count_a1), # type: ignore[arg-type]
265+
chunks=chunks,
266+
# Lock must be true with multiprocessing dask scheduler
267+
# to not get bed-reader errors (it works w/ threading backend though)
268+
lock=lock,
269+
asarray=False,
270+
name=f"bed_reader:read_plink:{bed_path}",
271+
)
270272

271-
# If contigs are already integers, use them as-is
272-
if bim_int_contig:
273-
variant_contig = arr_bim["contig"].astype("int16")
274-
variant_contig_names = da.unique(variant_contig).astype(str)
275-
variant_contig_names = list(variant_contig_names.compute())
276-
# Otherwise create index for contig names based
277-
# on order of appearance in underlying .bim file
278-
else:
279-
variant_contig, variant_contig_names = encode_array(arr_bim["contig"].compute()) # type: ignore
280-
variant_contig = variant_contig.astype("int16")
281-
variant_contig_names = list(variant_contig_names)
282-
283-
variant_position = arr_bim["pos"]
284-
a1: ArrayLike = arr_bim["a1"].astype("str")
285-
a2: ArrayLike = arr_bim["a2"].astype("str")
286-
287-
# Note: column_stack not implemented in Dask, must use [v|h]stack
288-
variant_allele = da.hstack((a1[:, np.newaxis], a2[:, np.newaxis]))
289-
variant_allele = variant_allele.astype("S")
290-
variant_id = arr_bim["variant_id"]
291-
292-
sample_id = arr_fam["member_id"]
293-
294-
ds = create_genotype_call_dataset(
295-
variant_contig_names=variant_contig_names,
296-
variant_contig=variant_contig,
297-
variant_position=variant_position,
298-
variant_allele=variant_allele,
299-
sample_id=sample_id,
300-
call_genotype=call_genotype,
301-
variant_id=variant_id,
302-
)
273+
# If contigs are already integers, use them as-is
274+
if bim_int_contig:
275+
variant_contig = arr_bim["contig"].astype("int16")
276+
variant_contig_names = da.unique(variant_contig).astype(str)
277+
variant_contig_names = list(variant_contig_names.compute())
278+
# Otherwise create index for contig names based
279+
# on order of appearance in underlying .bim file
280+
else:
281+
variant_contig, variant_contig_names = encode_array(arr_bim["contig"].compute()) # type: ignore
282+
variant_contig = variant_contig.astype("int16")
283+
variant_contig_names = list(variant_contig_names)
284+
285+
variant_position = arr_bim["pos"]
286+
a1: ArrayLike = arr_bim["a1"].astype("str")
287+
a2: ArrayLike = arr_bim["a2"].astype("str")
288+
289+
# Note: column_stack not implemented in Dask, must use [v|h]stack
290+
variant_allele = da.hstack((a1[:, np.newaxis], a2[:, np.newaxis]))
291+
variant_allele = variant_allele.astype("S")
292+
variant_id = arr_bim["variant_id"]
293+
294+
sample_id = arr_fam["member_id"]
295+
296+
ds = create_genotype_call_dataset(
297+
variant_contig_names=variant_contig_names,
298+
variant_contig=variant_contig,
299+
variant_position=variant_position,
300+
variant_allele=variant_allele,
301+
sample_id=sample_id,
302+
call_genotype=call_genotype,
303+
variant_id=variant_id,
304+
)
303305

304-
# Assign PLINK-specific pedigree fields
305-
return ds.assign({f"sample_{f}": (DIM_SAMPLE, arr_fam[f]) for f in arr_fam})
306+
# Assign PLINK-specific pedigree fields
307+
return ds.assign({f"sample_{f}": (DIM_SAMPLE, arr_fam[f]) for f in arr_fam})
306308

307309

308310
def plink_to_zarr(

0 commit comments

Comments
 (0)