Skip to content

Commit a076108

Browse files
Silence some pandas warnings and add chunk tests for plink
1 parent b98762c commit a076108

File tree

2 files changed: +17 -5 lines changed

2 files changed: +17 -5 lines changed

bio2zarr/plink.py

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -33,17 +33,19 @@
3333
BIM_ARRAY_DTYPE = dict([(f[0], f[2]) for f in BIM_FIELDS])
3434

3535

36-
def read_fam(path, sep=None):
36+
# See https://github.com/sgkit-dev/bio2zarr/issues/409 for discussion
37+
# on the parameters to Pandas here.
38+
def read_fam(path):
3739
# See: https://www.cog-genomics.org/plink/1.9/formats#fam
3840
names = [f[0] for f in FAM_FIELDS]
39-
df = pd.read_csv(path, sep=sep, names=names, dtype=FAM_DF_DTYPE)
41+
df = pd.read_csv(path, sep=None, names=names, dtype=FAM_DF_DTYPE, engine="python")
4042
return df
4143

4244

43-
def read_bim(path, sep=None):
45+
def read_bim(path):
4446
# See: https://www.cog-genomics.org/plink/1.9/formats#bim
4547
names = [f[0] for f in BIM_FIELDS]
46-
df = pd.read_csv(str(path), sep=sep, names=names, dtype=BIM_DF_DTYPE)
48+
df = pd.read_csv(path, sep=None, names=names, dtype=BIM_DF_DTYPE, engine="python")
4749
return df
4850

4951

@@ -102,6 +104,10 @@ def decode(self, start, stop):
102104
start_offset = 3 + (start * self.bytes_per_variant)
103105
bytes_to_read = chunk_size * self.bytes_per_variant
104106

107+
logger.debug(
108+
f"Reading {chunk_size} variants ({bytes_to_read} bytes) "
109+
f"from {self.path}"
110+
)
105111
# TODO make it possible to read sequentially from the same file handle,
106112
# seeking only when necessary.
107113
with open(self.path, "rb") as f:

tests/test_plink.py

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -356,7 +356,13 @@ def test_chunk_size(
356356
ds = ds.drop_vars("region_index")
357357
ds2 = ds2.drop_vars("region_index")
358358
xt.assert_equal(ds, ds2)
359-
# TODO check array chunks
359+
root = zarr.open(out, mode="r")
360+
v = variants_chunk_size
361+
s = samples_chunk_size
362+
assert root["call_genotype"].chunks == (v, s, 2)
363+
assert root["call_genotype_phased"].chunks == (v, s)
364+
assert root["variant_position"].chunks == (v,)
365+
assert root["variant_contig"].chunks == (v,)
360366

361367

362368
def validate(bed_path, zarr_path):

0 commit comments

Comments
 (0)