|
33 | 33 | BIM_ARRAY_DTYPE = dict([(f[0], f[2]) for f in BIM_FIELDS])
|
34 | 34 |
|
35 | 35 |
|
36 |
| -def read_fam(path, sep=None): |
| 36 | +# See https://github.com/sgkit-dev/bio2zarr/issues/409 for discussion |
| 37 | +# on the parameters to Pandas here. |
| 38 | +def read_fam(path): |
37 | 39 | # See: https://www.cog-genomics.org/plink/1.9/formats#fam
|
38 | 40 | names = [f[0] for f in FAM_FIELDS]
|
39 |
| - df = pd.read_csv(path, sep=sep, names=names, dtype=FAM_DF_DTYPE) |
| 41 | + df = pd.read_csv(path, sep=None, names=names, dtype=FAM_DF_DTYPE, engine="python") |
40 | 42 | return df
|
41 | 43 |
|
42 | 44 |
|
43 |
| -def read_bim(path, sep=None): |
| 45 | +def read_bim(path): |
44 | 46 | # See: https://www.cog-genomics.org/plink/1.9/formats#bim
|
45 | 47 | names = [f[0] for f in BIM_FIELDS]
|
46 |
| - df = pd.read_csv(str(path), sep=sep, names=names, dtype=BIM_DF_DTYPE) |
| 48 | + df = pd.read_csv(path, sep=None, names=names, dtype=BIM_DF_DTYPE, engine="python") |
47 | 49 | return df
|
48 | 50 |
|
49 | 51 |
|
@@ -102,6 +104,10 @@ def decode(self, start, stop):
|
102 | 104 | start_offset = 3 + (start * self.bytes_per_variant)
|
103 | 105 | bytes_to_read = chunk_size * self.bytes_per_variant
|
104 | 106 |
|
| 107 | + logger.debug( |
| 108 | + f"Reading {chunk_size} variants ({bytes_to_read} bytes) " |
| 109 | + f"from {self.path}" |
| 110 | + ) |
105 | 111 | # TODO make it possible to read sequentially from the same file handle,
|
106 | 112 | # seeking only when necessary.
|
107 | 113 | with open(self.path, "rb") as f:
|
|
0 commit comments