Some basic testing for dencode, plus tidying

jeromekelleher · jeromekelleher · commit 687c715c94e7 · 2024-04-24T12:39:37.000+01:00
diff --git a/bio2zarr/cli.py b/bio2zarr/cli.py
@@ -357,7 +357,7 @@ def dencode_partition(zarr_path, partition, verbose):
     TODO DOCUMENT
     """
     setup_logging(verbose)
-    vcf.encode_partition(zarr_path, partition, show_progress=False)
+    vcf.encode_partition(zarr_path, partition)
 
 
 @click.command
diff --git a/bio2zarr/vcf.py b/bio2zarr/vcf.py
@@ -1540,15 +1540,6 @@ def summary_table(self):
         return data
 
 
-@dataclasses.dataclass
-class EncodingWork:
-    func: callable = dataclasses.field(repr=False)
-    start: int
-    stop: int
-    columns: list[str]
-    memory: int = 0
-
-
 def parse_max_memory(max_memory):
     if max_memory is None:
         # Effectively unbounded
@@ -1640,7 +1631,7 @@ def init(
     ):
         self.icf = icf
         if self.path.exists():
-            raise ValueError("Zarr path already exists")
+            raise ValueError("Zarr path already exists")  # NEEDS TEST
         partitions = VcfZarrPartition.generate_partitions(
             self.icf.num_records,
             schema.variants_chunk_size,
@@ -1807,6 +1798,7 @@ def finalise_partition_array(self, partition_index, name):
         wip_path = self.wip_partition_array_path(partition_index, name)
         final_path = self.partition_array_path(partition_index, name)
         if final_path.exists():
+            # NEEDS TEST
             logger.warning(f"Removing existing {final_path}")
             shutil.rmtree(final_path)
         # Atomic swap
@@ -1923,7 +1915,7 @@ def encode_filters_partition(self, partition_index):
                     var_filter.buff[j, lookup[f]] = True
                 except KeyError:
                     raise ValueError(
-                        f"Filter '{f}' was not defined " f"in the header."
+                        f"Filter '{f}' was not defined in the header."
                     ) from None
         var_filter.flush()
 
@@ -1956,6 +1948,7 @@ def finalise_array(self, name):
         logger.info(f"Finalising {name}")
         final_path = self.path / name
         if final_path.exists():
+            # NEEDS TEST
             raise ValueError(f"Array {name} already exists")
         for partition in range(len(self.metadata.partitions)):
             # Move all the files in partition dir to dest dir
@@ -1992,7 +1985,12 @@ def finalise(self, show_progress=False):
         # NOTE: it's not clear that adding more workers will make this quicker,
         # as it's just going to be causing contention on the file system.
         # Something to check empirically in some deployments.
-        with core.ParallelWorkManager(1, progress_config) as pwm:
+        # FIXME we're just using worker_processes=0 here to hook into the
+        # SynchronousExecutor which is intended for testing purposes so
+        # that we get test coverage. Should fix this either by allowing
+        # for multiple workers, or making a standard wrapper for tqdm
+        # that allows us to have a consistent look and feel.
+        with core.ParallelWorkManager(0, progress_config) as pwm:
             for name in self.metadata.schema.columns:
                 pwm.submit(self.finalise_array, name)
         zarr.consolidate_metadata(self.path)
@@ -2131,11 +2129,9 @@ def encode_init(
     )
 
 
-def encode_partition(zarr_path, partition, *, show_progress=False, worker_processes=1):
+def encode_partition(zarr_path, partition):
     writer = VcfZarrWriter(zarr_path)
-    writer.encode_partition(
-        partition, show_progress=show_progress, worker_processes=worker_processes
-    )
+    writer.encode_partition(partition)
 
 
 def encode_finalise(zarr_path, show_progress=False):
diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -41,6 +41,8 @@
     show_progress=True,
 )
 
+DEFAULT_DENCODE_PARTITION_ARGS = dict()
+
 
 class TestWithMocks:
     vcf_path = "tests/data/vcf/sample.vcf.gz"
@@ -395,7 +397,7 @@ def test_encode(self, mocked, tmp_path):
         )
 
     @mock.patch("bio2zarr.vcf.encode_init", return_value=10)
-    def test_dencode(self, mocked, tmp_path):
+    def test_dencode_init(self, mocked, tmp_path):
         icf_path = tmp_path / "icf"
         icf_path.mkdir()
         zarr_path = tmp_path / "zarr"
@@ -429,7 +431,7 @@ def test_vcf_dencode_partition(self, mocked, tmp_path):
         assert len(result.stdout) == 0
         assert len(result.stderr) == 0
         mocked.assert_called_once_with(
-            str(zarr_path), 1, **DEFAULT_DEXPLODE_PARTITION_ARGS
+            str(zarr_path), 1, **DEFAULT_DENCODE_PARTITION_ARGS
         )
 
     @mock.patch("bio2zarr.vcf.encode_finalise")
@@ -548,6 +550,42 @@ def test_encode(self, tmp_path):
         # Arbitrary check
         assert "variant_position" in result.stdout
 
+    def test_dencode(self, tmp_path):
+        icf_path = tmp_path / "icf"
+        zarr_path = tmp_path / "zarr"
+        runner = ct.CliRunner(mix_stderr=False)
+        result = runner.invoke(
+            cli.vcf2zarr, f"explode {self.vcf_path} {icf_path}", catch_exceptions=False
+        )
+        assert result.exit_code == 0
+        result = runner.invoke(
+            cli.vcf2zarr,
+            f"dencode-init {icf_path} {zarr_path} 5 --variants-chunk-size=3",
+            catch_exceptions=False,
+        )
+        assert result.exit_code == 0
+        assert result.stdout.strip() == "3"
+
+        for j in range(3):
+            result = runner.invoke(
+                cli.vcf2zarr,
+                f"dencode-partition {zarr_path} {j}",
+                catch_exceptions=False,
+            )
+            assert result.exit_code == 0  # every partition must succeed
+
+        result = runner.invoke(
+            cli.vcf2zarr, f"dencode-finalise {zarr_path}", catch_exceptions=False
+        )
+        assert result.exit_code == 0
+
+        result = runner.invoke(
+            cli.vcf2zarr, f"inspect {zarr_path}", catch_exceptions=False
+        )
+        assert result.exit_code == 0
+        # Arbitrary check
+        assert "variant_position" in result.stdout
+
     def test_convert(self, tmp_path):
         zarr_path = tmp_path / "zarr"
         runner = ct.CliRunner(mix_stderr=False)