Silence some pandas warnings and add chunk tests for plink

jeromekelleher · jeromekelleher · commit a0761082c18e · 2025-05-23T14:13:13.000Z
diff --git a/bio2zarr/plink.py b/bio2zarr/plink.py
@@ -33,17 +33,19 @@
 BIM_ARRAY_DTYPE = dict([(f[0], f[2]) for f in BIM_FIELDS])
 
 
-def read_fam(path, sep=None):
+# See https://github.com/sgkit-dev/bio2zarr/issues/409 for discussion of the
+# pandas read_csv parameters (sep=None, engine="python") used by these readers.
+def read_fam(path):
     # See: https://www.cog-genomics.org/plink/1.9/formats#fam
     names = [f[0] for f in FAM_FIELDS]
-    df = pd.read_csv(path, sep=sep, names=names, dtype=FAM_DF_DTYPE)
+    df = pd.read_csv(path, sep=None, names=names, dtype=FAM_DF_DTYPE, engine="python")
     return df
 
 
-def read_bim(path, sep=None):
+def read_bim(path):
     # See: https://www.cog-genomics.org/plink/1.9/formats#bim
     names = [f[0] for f in BIM_FIELDS]
-    df = pd.read_csv(str(path), sep=sep, names=names, dtype=BIM_DF_DTYPE)
+    df = pd.read_csv(path, sep=None, names=names, dtype=BIM_DF_DTYPE, engine="python")
     return df
 
 
@@ -102,6 +104,10 @@ def decode(self, start, stop):
         start_offset = 3 + (start * self.bytes_per_variant)
         bytes_to_read = chunk_size * self.bytes_per_variant
 
+        logger.debug(
+            f"Reading {chunk_size} variants ({bytes_to_read} bytes) "
+            f"from {self.path}"
+        )
         # TODO make it possible to read sequentially from the same file handle,
         # seeking only when necessary.
         with open(self.path, "rb") as f:
diff --git a/tests/test_plink.py b/tests/test_plink.py
@@ -356,7 +356,13 @@ def test_chunk_size(
         ds = ds.drop_vars("region_index")
         ds2 = ds2.drop_vars("region_index")
         xt.assert_equal(ds, ds2)
-        # TODO check array chunks
+        root = zarr.open(out, mode="r")
+        v = variants_chunk_size
+        s = samples_chunk_size
+        assert root["call_genotype"].chunks == (v, s, 2)
+        assert root["call_genotype_phased"].chunks == (v, s)
+        assert root["variant_position"].chunks == (v,)
+        assert root["variant_contig"].chunks == (v,)
 
 
 def validate(bed_path, zarr_path):