Merge pull request #24 from jeromekelleher/basic-plink

jeromekelleher · web-flow · commit 5abbfada036d · 2024-02-21T16:57:01.000Z
Basic plink tests
diff --git a/bio2zarr/cli.py b/bio2zarr/cli.py
@@ -3,6 +3,7 @@
 import coloredlogs
 
 from . import vcf
+from . import plink
 
 # Common arguments/options
 verbose = click.option("-v", "--verbose", count=True, help="Increase verbosity")
@@ -112,14 +113,14 @@ def vcf2zarr():
 
 
 @click.command(name="convert")
-@click.argument("plink", type=click.Path())
+@click.argument("in_path", type=click.Path())
 @click.argument("out_path", type=click.Path())
 @worker_processes
 @click.option("--chunk-width", type=int, default=None)
 @click.option("--chunk-length", type=int, default=None)
-def convert_plink(plink, out_path, worker_processes, chunk_width, chunk_length):
-    vcf.convert_plink(
-        plink,
+def convert_plink(in_path, out_path, worker_processes, chunk_width, chunk_length):
+    plink.convert(
+        in_path,
         out_path,
         show_progress=True,
         worker_processes=worker_processes,
diff --git a/bio2zarr/core.py b/bio2zarr/core.py
@@ -10,10 +10,19 @@
 import zarr
 import numpy as np
 import tqdm
+import numcodecs
 
 
 logger = logging.getLogger(__name__)
 
+numcodecs.blosc.use_threads = False
+
+# TODO this should probably go in another module where we abstract
+# out the zarr defaults
+default_compressor = numcodecs.Blosc(
+    cname="zstd", clevel=7, shuffle=numcodecs.Blosc.AUTOSHUFFLE
+)
+
 
 class SynchronousExecutor(cf.Executor):
     def submit(self, fn, /, *args, **kwargs):
diff --git a/bio2zarr/vcf.py b/bio2zarr/vcf.py
@@ -20,8 +20,6 @@
 import tqdm
 import zarr
 
-import bed_reader
-
 from . import core
 
 logger = logging.getLogger(__name__)
@@ -38,12 +36,6 @@
     [0x7F800001, 0x7F800002], dtype=np.int32
 )
 
-numcodecs.blosc.use_threads = False
-
-default_compressor = numcodecs.Blosc(
-    cname="zstd", clevel=7, shuffle=numcodecs.Blosc.AUTOSHUFFLE
-)
-
 
 def assert_all_missing_float(a):
     v = np.array(a, dtype=np.float32).view(np.int32)
@@ -437,6 +429,8 @@ def __init__(self, vcf_field, base_path):
         else:
             self.path = base_path / vcf_field.category / vcf_field.name
 
+        # TODO Check if other compressors would give reasonable compression
+        # with significantly faster times
         self.compressor = numcodecs.Blosc(cname="zstd", clevel=7)
         # TODO have a clearer way of defining this state between
         # read and write mode.
@@ -905,7 +899,7 @@ def generate(pcvcf, chunk_length=None, chunk_width=None):
         if chunk_length is None:
             chunk_length = 10_000
 
-        compressor = default_compressor.get_config()
+        compressor = core.default_compressor.get_config()
 
         def fixed_field_spec(
             name, dtype, vcf_field=None, shape=(m,), dimensions=("variants",)
@@ -1136,7 +1130,7 @@ def encode_samples(self, pcvcf, sample_id, chunk_width):
             "sample_id",
             sample_id,
             dtype="str",
-            compressor=default_compressor,
+            compressor=core.default_compressor,
             chunks=(chunk_width,),
         )
         array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
@@ -1147,7 +1141,7 @@ def encode_contig(self, pcvcf, contig_names, contig_lengths):
             "contig_id",
             contig_names,
             dtype="str",
-            compressor=default_compressor,
+            compressor=core.default_compressor,
         )
         array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
 
@@ -1181,7 +1175,7 @@ def encode_filters(self, pcvcf, filter_names):
             "filter_id",
             filter_names,
             dtype="str",
-            compressor=default_compressor,
+            compressor=core.default_compressor,
         )
         array.attrs["_ARRAY_DIMENSIONS"] = ["filters"]
 
@@ -1339,41 +1333,6 @@ def convert_vcf(
         )
 
 
-def encode_bed_partition_genotypes(
-    bed_path, zarr_path, start_variant, end_variant, encoder_threads=8
-):
-    bed = bed_reader.open_bed(bed_path, num_threads=1)
-
-    store = zarr.DirectoryStore(zarr_path)
-    root = zarr.group(store=store)
-    gt = core.BufferedArray(root["call_genotype"])
-    gt_mask = core.BufferedArray(root["call_genotype_mask"])
-    gt_phased = core.BufferedArray(root["call_genotype_phased"])
-    chunk_length = gt.array.chunks[0]
-    assert start_variant % chunk_length == 0
-
-    buffered_arrays = [gt, gt_phased, gt_mask]
-
-    with core.ThreadedZarrEncoder(buffered_arrays, encoder_threads) as te:
-        start = start_variant
-        while start < end_variant:
-            stop = min(start + chunk_length, end_variant)
-            bed_chunk = bed.read(index=slice(start, stop), dtype="int8").T
-            # Note could do this without iterating over rows, but it's a bit
-            # simpler and the bottleneck is in the encoding step anyway. It's
-            # also nice to have updates on the progress monitor.
-            for values in bed_chunk:
-                j = te.next_buffer_row()
-                dest = gt.buff[j]
-                dest[values == -127] = -1
-                dest[values == 2] = 1
-                dest[values == 1, 0] = 1
-                gt_phased.buff[j] = False
-                gt_mask.buff[j] = dest == -1
-                core.update_progress(1)
-            start = stop
-
-
 def validate(vcf_path, zarr_path, show_progress=False):
     store = zarr.DirectoryStore(zarr_path)
 
@@ -1508,89 +1467,3 @@ def validate(vcf_path, zarr_path, show_progress=False):
                     print(vcf_val)
                     print(zarr_val)
                     assert False
-
-
-def convert_plink(
-    bed_path,
-    zarr_path,
-    *,
-    show_progress,
-    worker_processes=1,
-    chunk_length=None,
-    chunk_width=None,
-):
-    bed = bed_reader.open_bed(bed_path, num_threads=1)
-    n = bed.iid_count
-    m = bed.sid_count
-    del bed
-
-    # FIXME
-    if chunk_width is None:
-        chunk_width = 1000
-    if chunk_length is None:
-        chunk_length = 10_000
-
-    store = zarr.DirectoryStore(zarr_path)
-    root = zarr.group(store=store, overwrite=True)
-
-    ploidy = 2
-    shape = [m, n]
-    chunks = [chunk_length, chunk_width]
-    dimensions = ["variants", "samples"]
-
-    a = root.empty(
-        "call_genotype_phased",
-        dtype="bool",
-        shape=list(shape),
-        chunks=list(chunks),
-        compressor=default_compressor,
-    )
-    a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
-
-    shape += [ploidy]
-    dimensions += ["ploidy"]
-    a = root.empty(
-        "call_genotype",
-        dtype="i8",
-        shape=list(shape),
-        chunks=list(chunks),
-        compressor=default_compressor,
-    )
-    a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
-
-    a = root.empty(
-        "call_genotype_mask",
-        dtype="bool",
-        shape=list(shape),
-        chunks=list(chunks),
-        compressor=default_compressor,
-    )
-    a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
-
-    num_chunks = max(1, m // chunk_length)
-    worker_processes = min(worker_processes, num_chunks)
-    if num_chunks == 1 or worker_processes == 1:
-        partitions = [(0, m)]
-    else:
-        # Generate num_workers partitions
-        # TODO finer grained might be better.
-        partitions = []
-        chunk_boundaries = [
-            p[0] for p in np.array_split(np.arange(num_chunks), worker_processes)
-        ]
-        for j in range(len(chunk_boundaries) - 1):
-            start = chunk_boundaries[j] * chunk_length
-            end = chunk_boundaries[j + 1] * chunk_length
-            end = min(end, m)
-            partitions.append((start, end))
-        last_stop = partitions[-1][-1]
-        if last_stop != m:
-            partitions.append((last_stop, m))
-    # print(partitions)
-
-    progress_config = core.ProgressConfig(
-        total=m, title="Convert", units="vars", show=show_progress
-    )
-    with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
-        for start, end in partitions:
-            pwm.submit(encode_bed_partition_genotypes, bed_path, zarr_path, start, end)
diff --git a/tests/data/plink/example.bed b/tests/data/plink/example.bed
diff --git a/tests/data/plink/example.bim b/tests/data/plink/example.bim
@@ -0,0 +1,2 @@
+1	1_10	0	10	A	G
+1	1_20	0	20	T	C
diff --git a/tests/data/plink/example.fam b/tests/data/plink/example.fam
@@ -0,0 +1,10 @@
+ind0 ind0 0 0 0 -9
+ind1 ind1 0 0 0 -9
+ind2 ind2 0 0 0 -9
+ind3 ind3 0 0 0 -9
+ind4 ind4 0 0 0 -9
+ind5 ind5 0 0 0 -9
+ind6 ind6 0 0 0 -9
+ind7 ind7 0 0 0 -9
+ind8 ind8 0 0 0 -9
+ind9 ind9 0 0 0 -9
diff --git a/tests/data/plink/example.map b/tests/data/plink/example.map
@@ -0,0 +1,2 @@
+1 1_10 0 10
+1 1_20 0 20
diff --git a/tests/data/plink/example.nosex b/tests/data/plink/example.nosex
@@ -0,0 +1,10 @@
+ind0	ind0
+ind1	ind1
+ind2	ind2
+ind3	ind3
+ind4	ind4
+ind5	ind5
+ind6	ind6
+ind7	ind7
+ind8	ind8
+ind9	ind9
diff --git a/tests/data/plink/example.ped b/tests/data/plink/example.ped
@@ -0,0 +1,10 @@
+ind0 ind0 0 0 0 0 A A T T
+ind1 ind1 0 0 0 0 A A T T
+ind2 ind2 0 0 0 0 A A T T
+ind3 ind3 0 0 0 0 G G T T
+ind4 ind4 0 0 0 0 G G C C
+ind5 ind5 0 0 0 0 G G C C
+ind6 ind6 0 0 0 0 G G C C
+ind7 ind7 0 0 0 0 G G C C
+ind8 ind8 0 0 0 0 G G C C
+ind9 ind9 0 0 0 0 G G C C
diff --git a/tests/data/plink/example_with_fam.bed b/tests/data/plink/example_with_fam.bed
diff --git a/tests/data/plink/example_with_fam.bim b/tests/data/plink/example_with_fam.bim
@@ -0,0 +1,2 @@
+1	1_10	0	10	A	G
+1	1_20	0	20	T	C
diff --git a/tests/data/plink/example_with_fam.fam b/tests/data/plink/example_with_fam.fam
@@ -0,0 +1,10 @@
+ind0 ind0 0 0 0 -9
+ind1 ind1 0 0 0 -9
+ind2 ind2 ind1 ind0 2 1
+ind3 ind3 ind1 ind0 1 2
+ind4 ind4 0 0 0 -9
+ind5 ind5 0 0 0 -9
+ind6 ind6 0 0 0 -9
+ind7 ind7 0 0 0 -9
+ind8 ind8 0 0 0 -9
+ind9 ind9 0 0 0 -9
diff --git a/tests/data/plink/plink_sim_10s_100v_10pmiss.bed b/tests/data/plink/plink_sim_10s_100v_10pmiss.bed
diff --git a/tests/data/plink/plink_sim_10s_100v_10pmiss.bim b/tests/data/plink/plink_sim_10s_100v_10pmiss.bim
@@ -0,0 +1,100 @@
+1	1:1:G:CGCGCG	0.0	1	CGCGCG	G
+1	1:2:ACT:G	0.0	2	G	ACT
+1	1:3:ACT:G	0.0	3	G	ACT
+1	1:4:G:CGCGCG	0.0	4	CGCGCG	G
+1	1:5:G:CGCGCG	0.0	5	CGCGCG	G
+1	1:6:ACT:G	0.0	6	G	ACT
+1	1:7:G:CGCGCG	0.0	7	CGCGCG	G
+1	1:8:T:GTGG	0.0	8	GTGG	T
+1	1:9:T:GTGG	0.0	9	GTGG	T
+1	1:10:A:C	0.0	10	C	A
+1	1:11:ACT:G	0.0	11	G	ACT
+1	1:12:G:CGCGCG	0.0	12	CGCGCG	G
+1	1:13:G:CGCGCG	0.0	13	CGCGCG	G
+1	1:14:T:GTGG	0.0	14	GTGG	T
+1	1:15:ACT:G	0.0	15	G	ACT
+1	1:16:A:C	0.0	16	C	A
+1	1:17:ACT:G	0.0	17	G	ACT
+1	1:18:T:GTGG	0.0	18	GTGG	T
+1	1:19:A:C	0.0	19	C	A
+1	1:20:A:C	0.0	20	C	A
+1	1:21:T:GTGG	0.0	21	GTGG	T
+1	1:22:G:CGCGCG	0.0	22	CGCGCG	G
+1	1:23:T:GTGG	0.0	23	GTGG	T
+1	1:24:A:C	0.0	24	C	A
+1	1:25:A:C	0.0	25	C	A
+1	1:26:ACT:G	0.0	26	G	ACT
+1	1:27:G:CGCGCG	0.0	27	CGCGCG	G
+1	1:28:ACT:G	0.0	28	G	ACT
+1	1:29:T:GTGG	0.0	29	GTGG	T
+1	1:30:A:C	0.0	30	C	A
+1	1:31:T:GTGG	0.0	31	GTGG	T
+1	1:32:G:CGCGCG	0.0	32	CGCGCG	G
+1	1:33:ACT:G	0.0	33	G	ACT
+1	1:34:G:CGCGCG	0.0	34	CGCGCG	G
+1	1:35:A:C	0.0	35	C	A
+1	1:36:G:CGCGCG	0.0	36	CGCGCG	G
+1	1:37:T:GTGG	0.0	37	GTGG	T
+1	1:38:A:C	0.0	38	C	A
+1	1:39:A:C	0.0	39	C	A
+1	1:40:T:GTGG	0.0	40	GTGG	T
+1	1:41:A:C	0.0	41	C	A
+1	1:42:G:CGCGCG	0.0	42	CGCGCG	G
+1	1:43:T:GTGG	0.0	43	GTGG	T
+1	1:44:ACT:G	0.0	44	G	ACT
+1	1:45:G:CGCGCG	0.0	45	CGCGCG	G
+1	1:46:ACT:G	0.0	46	G	ACT
+1	1:47:G:CGCGCG	0.0	47	CGCGCG	G
+1	1:48:A:C	0.0	48	C	A
+1	1:49:A:C	0.0	49	C	A
+1	1:50:A:C	0.0	50	C	A
+1	1:51:G:CGCGCG	0.0	51	CGCGCG	G
+1	1:52:A:C	0.0	52	C	A
+1	1:53:ACT:G	0.0	53	G	ACT
+1	1:54:A:C	0.0	54	C	A
+1	1:55:G:CGCGCG	0.0	55	CGCGCG	G
+1	1:56:T:GTGG	0.0	56	GTGG	T
+1	1:57:G:CGCGCG	0.0	57	CGCGCG	G
+1	1:58:A:C	0.0	58	C	A
+1	1:59:T:GTGG	0.0	59	GTGG	T
+1	1:60:G:CGCGCG	0.0	60	CGCGCG	G
+1	1:61:ACT:G	0.0	61	G	ACT
+1	1:62:A:C	0.0	62	C	A
+1	1:63:G:CGCGCG	0.0	63	CGCGCG	G
+1	1:64:T:GTGG	0.0	64	GTGG	T
+1	1:65:T:GTGG	0.0	65	GTGG	T
+1	1:66:ACT:G	0.0	66	G	ACT
+1	1:67:T:GTGG	0.0	67	GTGG	T
+1	1:68:ACT:G	0.0	68	G	ACT
+1	1:69:G:CGCGCG	0.0	69	CGCGCG	G
+1	1:70:G:CGCGCG	0.0	70	CGCGCG	G
+1	1:71:ACT:G	0.0	71	G	ACT
+1	1:72:G:CGCGCG	0.0	72	CGCGCG	G
+1	1:73:A:C	0.0	73	C	A
+1	1:74:A:C	0.0	74	C	A
+1	1:75:T:GTGG	0.0	75	GTGG	T
+1	1:76:A:C	0.0	76	C	A
+1	1:77:ACT:G	0.0	77	G	ACT
+1	1:78:ACT:G	0.0	78	G	ACT
+1	1:79:A:C	0.0	79	C	A
+1	1:80:A:C	0.0	80	C	A
+1	1:81:A:C	0.0	81	C	A
+1	1:82:T:GTGG	0.0	82	GTGG	T
+1	1:83:A:C	0.0	83	C	A
+1	1:84:ACT:G	0.0	84	G	ACT
+1	1:85:A:C	0.0	85	C	A
+1	1:86:G:CGCGCG	0.0	86	CGCGCG	G
+1	1:87:ACT:G	0.0	87	G	ACT
+1	1:88:A:C	0.0	88	C	A
+1	1:89:A:C	0.0	89	C	A
+1	1:90:T:GTGG	0.0	90	GTGG	T
+1	1:91:T:GTGG	0.0	91	GTGG	T
+1	1:92:T:GTGG	0.0	92	GTGG	T
+1	1:93:A:C	0.0	93	C	A
+1	1:94:A:C	0.0	94	C	A
+1	1:95:A:C	0.0	95	C	A
+1	1:96:A:C	0.0	96	C	A
+1	1:97:T:GTGG	0.0	97	GTGG	T
+1	1:98:ACT:G	0.0	98	G	ACT
+1	1:99:T:GTGG	0.0	99	GTGG	T
+1	1:100:A:C	0.0	100	C	A
diff --git a/tests/data/plink/plink_sim_10s_100v_10pmiss.fam b/tests/data/plink/plink_sim_10s_100v_10pmiss.fam
diff --git a/tests/test_plink.py b/tests/test_plink.py