@@ -1,29 +1,27 @@
import collections
+import contextlib
import dataclasses
import functools
+import json
import logging
+import math
import os
import pathlib
import pickle
-import sys
import shutil
-import json
-import math
+import sys
import tempfile
-import contextlib
from typing import Any, List

-import humanfriendly
import cyvcf2
+import humanfriendly
import numcodecs
import numpy as np
import numpy.testing as nt
import tqdm
import zarr

-from . import core
-from . import provenance
-from . import vcf_utils
+from . import core, provenance, vcf_utils

logger = logging.getLogger(__name__)

@@ -301,7 +299,8 @@ def check_overlap(partitions):

def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
    logger.info(
-        f"Scanning {len(paths)} VCFs attempting to split into {target_num_partitions} partitions."
+        f"Scanning {len(paths)} VCFs attempting to split into "
+        f"{target_num_partitions} partitions."
    )
    # An easy mistake to make is to pass the same file twice. Check this early on.
    for path, count in collections.Counter(paths).items():
@@ -850,7 +849,7 @@ def __init__(self, path):
            partition.num_records for partition in self.metadata.partitions
        ]
        # Allow us to find which partition a given record is in
-        self.partition_record_index = np.cumsum([0] + partition_num_records)
+        self.partition_record_index = np.cumsum([0, *partition_num_records])
        for field in self.metadata.fields:
            self.columns[field.full_name] = IntermediateColumnarFormatField(self, field)
        logger.info(
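
The cumulative index built here gives cheap record-to-partition lookup. A minimal sketch of the idea with invented partition sizes, assuming the index is consumed via np.searchsorted (the lookup site is not part of this hunk):

    import numpy as np

    partition_num_records = [100, 250, 50]  # hypothetical sizes
    partition_record_index = np.cumsum([0, *partition_num_records])
    # -> array([  0, 100, 350, 400])

    def find_partition(record_id):
        # Records 0..99 fall in partition 0, 100..349 in partition 1, ...
        return np.searchsorted(partition_record_index, record_id, side="right") - 1

    assert find_partition(99) == 0
    assert find_partition(350) == 2
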
@@ -860,7 +859,8 @@ def __init__(self, path):

    def __repr__(self):
        return (
-            f"IntermediateColumnarFormat(fields={len(self)}, partitions={self.num_partitions}, "
+            f"IntermediateColumnarFormat(fields={len(self)}, "
+            f"partitions={self.num_partitions}, "
            f"records={self.num_records}, path={self.path})"
        )

@@ -956,11 +956,11 @@ def init(
        # probably going to be dropped.
        # https://github.com/pystatgen/vcf-zarr-spec/issues/15
        # May be useful to keep lying around still though?
-        logger.info(f"Writing VCF header")
+        logger.info("Writing VCF header")
        with open(self.path / "header.txt", "w") as f:
            f.write(header)

-        logger.info(f"Writing WIP metadata")
+        logger.info("Writing WIP metadata")
        with open(self.wip_path / "metadata.json", "w") as f:
            json.dump(self.metadata.asdict(), f, indent=4)
        return self.num_partitions
@@ -988,13 +988,14 @@ def load_partition_summaries(self):
                not_found.append(j)
        if len(not_found) > 0:
            raise FileNotFoundError(
-                f"Partition metadata not found for {len(not_found)} partitions: {not_found}"
+                f"Partition metadata not found for {len(not_found)} "
+                f"partitions: {not_found}"
            )
        return summaries

    def load_metadata(self):
        if self.metadata is None:
-            with open(self.wip_path / f"metadata.json") as f:
+            with open(self.wip_path / "metadata.json") as f:
                self.metadata = IcfMetadata.fromdict(json.load(f))

    def process_partition(self, partition_index):
@@ -1043,12 +1044,14 @@ def process_partition(self, partition_index):
            for field in format_fields:
                val = variant.format(field.name)
                tcw.append(field.full_name, val)
-            # Note: an issue with updating the progress per variant here like this
-            # is that we get a significant pause at the end of the counter while
-            # all the "small" fields get flushed. Possibly not much to be done about it.
+            # Note: an issue with updating the progress per variant here like
+            # this is that we get a significant pause at the end of the counter
+            # while all the "small" fields get flushed. Possibly not much to be
+            # done about it.
            core.update_progress(1)
        logger.info(
-            f"Finished reading VCF for partition {partition_index}, flushing buffers"
+            f"Finished reading VCF for partition {partition_index}, "
+            "flushing buffers"
        )

        partition_metadata = {
@@ -1130,11 +1133,11 @@ def finalise(self):
            for summary in partition_summaries:
                field.summary.update(summary["field_summaries"][field.full_name])

-        logger.info(f"Finalising metadata")
+        logger.info("Finalising metadata")
        with open(self.path / "metadata.json", "w") as f:
            json.dump(self.metadata.asdict(), f, indent=4)

-        logger.debug(f"Removing WIP directory")
+        logger.debug("Removing WIP directory")
        shutil.rmtree(self.wip_path)

@@ -1148,7 +1151,7 @@ def explode(
    compressor=None,
):
    writer = IntermediateColumnarFormatWriter(icf_path)
-    num_partitions = writer.init(
+    writer.init(
        vcfs,
        # Heuristic to get reasonable worker utilisation with lumpy partition sizing
        target_num_partitions=max(1, worker_processes * 4),
@@ -1381,7 +1384,7 @@ def fixed_field_spec(
        if field.category == "FORMAT":
            prefix = "call_"
            shape.append(n)
-            chunks.append(samples_chunk_size),
+            chunks.append(samples_chunk_size)
            dimensions.append("samples")
        # TODO make an option to add in the empty extra dimension
        if field.summary.max_number > 1:
@@ -1633,7 +1636,9 @@ def encode_filters_slice(self, lookup, start, stop):
                try:
                    var_filter.buff[j, lookup[f]] = True
                except KeyError:
-                    raise ValueError(f"Filter '{f}' was not defined in the header.")
+                    raise ValueError(
+                        f"Filter '{f}' was not defined in the header."
+                    ) from None
        var_filter.flush()
        logger.debug(f"Encoded FILTERS slice {start}:{stop}")

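
The ") from None" added here suppresses implicit exception chaining, so the user-facing ValueError is not followed by the internal KeyError traceback. A standalone illustration of the pattern (names invented):

    lookup = {"PASS": 0, "q10": 1}

    def filter_index(f):
        try:
            return lookup[f]
        except KeyError:
            # Without "from None", the KeyError would also be printed.
            raise ValueError(f"Filter '{f}' was not defined in the header.") from None
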
@@ -1736,7 +1741,8 @@ def encode(
            variant_chunk_size = array.blocks[0].nbytes
            encoding_memory_requirements[col.name] = variant_chunk_size
            logger.debug(
-                f"{col.name} requires at least {display_size(variant_chunk_size)} per worker"
+                f"{col.name} requires at least {display_size(variant_chunk_size)} "
+                "per worker"
            )
            total_bytes += array.nbytes

@@ -1845,8 +1851,9 @@ def service_completed_futures():
                or len(future_to_work) > max_queued
            ):
                logger.debug(
-                    f"Wait: mem_required={used_memory + wp.memory} max_mem={max_memory} "
-                    f"queued={len(future_to_work)} max_queued={max_queued}"
+                    f"Wait: mem_required={used_memory + wp.memory} "
+                    f"max_mem={max_memory} queued={len(future_to_work)} "
+                    f"max_queued={max_queued}"
                )
                service_completed_futures()
            future = pwm.submit(wp.func, wp.start, wp.stop)
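
For context, the surrounding code is a memory-aware backpressure loop: a work item is only submitted once the estimated memory of in-flight futures fits the budget and the queue is short enough. A simplified, self-contained sketch of that pattern; the names mirror the diff, but this is not the module's actual implementation:

    import concurrent.futures as cf

    def submit_with_backpressure(executor, work, max_memory, max_queued):
        used_memory = 0
        future_to_work = {}

        def service_completed_futures():
            nonlocal used_memory
            done, _ = cf.wait(future_to_work, return_when=cf.FIRST_COMPLETED)
            for future in done:
                used_memory -= future_to_work.pop(future).memory
                future.result()  # re-raise any worker exception

        for wp in work:
            while (
                used_memory + wp.memory > max_memory
                or len(future_to_work) > max_queued
            ):
                service_completed_futures()
            future = executor.submit(wp.func, wp.start, wp.stop)
            used_memory += wp.memory
            future_to_work[future] = wp
        while future_to_work:
            service_completed_futures()
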
@@ -1890,7 +1897,7 @@ def encode(
            raise ValueError(
                "Cannot specify schema along with chunk sizes"
            )  # NEEDS TEST
-        with open(schema_path, "r") as f:
+        with open(schema_path) as f:
            schema = VcfZarrSchema.fromjson(f.read())
    zarr_path = pathlib.Path(zarr_path)
    if zarr_path.exists():
@@ -1971,7 +1978,7 @@ def assert_all_fill(zarr_val, vcf_type):
    elif vcf_type == "Float":
        assert_all_fill_float(zarr_val)
    else:  # pragma: no cover
-        assert False
+        assert False  # noqa: PT015


def assert_all_missing(zarr_val, vcf_type):
@@ -1984,7 +1991,7 @@ def assert_all_missing(zarr_val, vcf_type):
    elif vcf_type == "Float":
        assert_all_missing_float(zarr_val)
    else:  # pragma: no cover
-        assert False
+        assert False  # noqa: PT015


def assert_info_val_missing(zarr_val, vcf_type):
@@ -2123,7 +2130,7 @@ def validate(vcf_path, zarr_path, show_progress=False):
        assert vid[j] == ("." if row.ID is None else row.ID)
        assert allele[j, 0] == row.REF
        k = len(row.ALT)
-        nt.assert_array_equal(allele[j, 1 : k + 1], row.ALT),
+        nt.assert_array_equal(allele[j, 1 : k + 1], row.ALT)
        assert np.all(allele[j, k + 1 :] == "")
        # TODO FILTERS

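
The assertions above pin down the fixed-width allele layout in the Zarr array: column 0 holds REF, columns 1..k hold the ALT alleles, and any remaining columns are padded with empty strings. A tiny worked example (contents invented):

    import numpy as np

    # Room for three ALT alleles: REF="A", ALT=["T", "G"], one padding column.
    allele_row = np.array(["A", "T", "G", ""], dtype="U10")
    row_alt = ["T", "G"]
    k = len(row_alt)
    assert allele_row[0] == "A"
    np.testing.assert_array_equal(allele_row[1 : k + 1], row_alt)
    assert np.all(allele_row[k + 1 :] == "")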