Change default worker processes to 0

jeromekelleher · jeromekelleher · commit bf317ff0cbb5 · 2025-05-23T12:16:18.000+01:00
Closes #404
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,9 @@
 
 - Make format-specific dependencies optional (#385)
 
+- Change default number of worker processes to zero (#404) to simplify
+  debugging
+
 Breaking changes
 
 - Remove explicit sample, contig and filter lists from the schema.
diff --git a/bio2zarr/cli.py b/bio2zarr/cli.py
@@ -8,7 +8,7 @@
 import numcodecs
 import tabulate
 
-from . import plink, provenance, vcf_utils
+from . import core, plink, provenance, vcf_utils
 from . import tskit as tskit_mod
 from . import vcf as vcf_mod
 
@@ -89,7 +89,12 @@ def list_commands(self, ctx):
 version = click.version_option(version=f"{provenance.__version__}")
 
 worker_processes = click.option(
-    "-p", "--worker-processes", type=int, default=1, help="Number of worker processes"
+    "-p",
+    "--worker-processes",
+    type=int,
+    default=core.DEFAULT_WORKER_PROCESSES,
+    help="Number of worker processes",
+    show_default=True,
 )
 
 column_chunk_size = click.option(
diff --git a/bio2zarr/core.py b/bio2zarr/core.py
@@ -130,12 +130,20 @@ def du(path):
     return total
 
 
+# We set the default number of worker processes to 0 because it avoids
+# complexity in the call chain and makes things easier to debug by
+# default. However, it does use the SynchronousExecutor here, which
+# is technically not recommended by the Python docs.
+DEFAULT_WORKER_PROCESSES = 0
+
+
 class SynchronousExecutor(cf.Executor):
-    # Arguably we should use workers=0 as the default and use this
+    # Since https://github.com/sgkit-dev/bio2zarr/issues/404 we
+    # set worker_processes=0 as the default and use this
     # executor implementation. However, the docs are fairly explicit
     # about saying we shouldn't instantiate Future objects directly,
-    # so it's best to keep this as a semi-secret debugging interface
-    # for now.
+    # so we may need to revisit this if obscure problems start to
+    # arise.
     def submit(self, fn, /, *args, **kwargs):
         future = cf.Future()
         future.set_result(fn(*args, **kwargs))
diff --git a/bio2zarr/plink.py b/bio2zarr/plink.py
@@ -291,7 +291,7 @@ def convert(
     *,
     variants_chunk_size=None,
     samples_chunk_size=None,
-    worker_processes=1,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
     show_progress=False,
 ):
     plink_format = PlinkFormat(prefix)
diff --git a/bio2zarr/tskit.py b/bio2zarr/tskit.py
@@ -20,7 +20,7 @@ def __init__(
     ):
         import tskit
 
-        self._path = None  # Not sure what we're using this for?
+        self._path = None
         # Future versions here will need to deal with the complexities of
         # having lists of tree sequences for multiple chromosomes.
         if isinstance(ts, tskit.TreeSequence):
@@ -256,7 +256,7 @@ def convert(
     isolated_as_missing=False,
     variants_chunk_size=None,
     samples_chunk_size=None,
-    worker_processes=1,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
     show_progress=False,
 ):
     """
@@ -265,6 +265,15 @@ def convert(
 
     .. todo:: Document parameters
     """
+    # FIXME there are some tricky details here in how we're handling
+    # parallelism that we'll need to tackle properly, and maybe
+    # review the current structures a bit. Basically, it looks like
+    # we're pickling/unpickling the format object when we have
+    # multiple workers, and this results in several copies of the
+    # tree sequence object being passed around. This is fine most
+    # of the time, but results in lots of memory being used when
+    # we're dealing with really massive files.
+    # See https://github.com/sgkit-dev/bio2zarr/issues/403
     tskit_format = TskitFormat(
         ts_or_path,
         model_mapping=model_mapping,
diff --git a/bio2zarr/vcf.py b/bio2zarr/vcf.py
@@ -285,7 +285,12 @@ def scan_vcf(path, target_num_partitions):
         return metadata, vcf.raw_header
 
 
-def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
+def scan_vcfs(
+    paths,
+    show_progress,
+    target_num_partitions,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
+):
     logger.info(
         f"Scanning {len(paths)} VCFs attempting to split into {target_num_partitions}"
         f" partitions."
@@ -1298,7 +1303,7 @@ def init(
         vcfs,
         *,
         column_chunk_size=16,
-        worker_processes=1,
+        worker_processes=core.DEFAULT_WORKER_PROCESSES,
         target_num_partitions=None,
         show_progress=False,
         compressor=None,
@@ -1450,7 +1455,9 @@ def process_partition(self, partition_index):
             f"{num_records} records last_pos={last_position}"
         )
 
-    def explode(self, *, worker_processes=1, show_progress=False):
+    def explode(
+        self, *, worker_processes=core.DEFAULT_WORKER_PROCESSES, show_progress=False
+    ):
         self.load_metadata()
         num_records = self.metadata.num_records
         if np.isinf(num_records):
@@ -1518,7 +1525,7 @@ def explode(
     vcfs,
     *,
     column_chunk_size=16,
-    worker_processes=1,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
     show_progress=False,
     compressor=None,
 ):
@@ -1543,7 +1550,7 @@ def explode_init(
     *,
     column_chunk_size=16,
     target_num_partitions=1,
-    worker_processes=1,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
     show_progress=False,
     compressor=None,
 ):
@@ -1605,7 +1612,7 @@ def convert(
     *,
     variants_chunk_size=None,
     samples_chunk_size=None,
-    worker_processes=1,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
     local_alleles=None,
     show_progress=False,
     icf_path=None,
@@ -1649,7 +1656,7 @@ def encode(
     dimension_separator=None,
     max_memory=None,
     local_alleles=None,
-    worker_processes=1,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
     show_progress=False,
 ):
     # Rough heuristic to split work up enough to keep utilisation high
@@ -1687,7 +1694,7 @@ def encode_init(
     max_variant_chunks=None,
     dimension_separator=None,
     max_memory=None,
-    worker_processes=1,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
     show_progress=False,
 ):
     icf_store = IntermediateColumnarFormat(icf_path)
diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -7,19 +7,19 @@
 import pytest
 
 from bio2zarr import __main__ as main
-from bio2zarr import cli, provenance
+from bio2zarr import cli, core, provenance
 
 DEFAULT_EXPLODE_ARGS = dict(
     column_chunk_size=64,
     compressor=None,
-    worker_processes=1,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
     show_progress=True,
 )
 
 DEFAULT_DEXPLODE_PARTITION_ARGS = dict()
 
 DEFAULT_DEXPLODE_INIT_ARGS = dict(
-    worker_processes=1,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
     column_chunk_size=64,
     compressor=None,
     show_progress=True,
@@ -30,7 +30,7 @@
     variants_chunk_size=None,
     samples_chunk_size=None,
     max_variant_chunks=None,
-    worker_processes=1,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
     max_memory=None,
     show_progress=True,
 )
@@ -57,7 +57,7 @@
     variants_chunk_size=None,
     samples_chunk_size=None,
     show_progress=True,
-    worker_processes=1,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
     local_alleles=False,
 )
 
@@ -67,14 +67,14 @@
     variants_chunk_size=None,
     samples_chunk_size=None,
     show_progress=True,
-    worker_processes=1,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
 )
 
 DEFAULT_PLINK_CONVERT_ARGS = dict(
     variants_chunk_size=None,
     samples_chunk_size=None,
     show_progress=True,
-    worker_processes=1,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
 )