Quick hacks to see if uncompressed VCF is possible

jeromekelleher · jeromekelleher · commit e74103887fc5 · 2025-03-27T16:59:00.000Z
diff --git a/bio2zarr/vcf2zarr/icf.py b/bio2zarr/vcf2zarr/icf.py
@@ -354,13 +354,13 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
     # Note: this will be infinity here if any of the chunks has an index
     # that doesn't keep track of the number of records per-contig
     icf_metadata.num_records = total_records
-
-    # Sort by contig (in the order they appear in the header) first,
-    # then by start coordinate
-    contig_index_map = {contig.id: j for j, contig in enumerate(metadata.contigs)}
-    all_partitions.sort(
-        key=lambda x: (contig_index_map[x.region.contig], x.region.start)
-    )
+    if len(all_partitions) > 1:
+        # Sort by contig (in the order they appear in the header) first,
+        # then by start coordinate
+        contig_index_map = {contig.id: j for j, contig in enumerate(metadata.contigs)}
+        all_partitions.sort(
+            key=lambda x: (contig_index_map[x.region.contig], x.region.start)
+        )
     icf_metadata.partitions = all_partitions
     logger.info(f"Scan complete, resulting in {len(all_partitions)} partitions.")
     return icf_metadata, header
diff --git a/bio2zarr/vcf_utils.py b/bio2zarr/vcf_utils.py
@@ -84,11 +84,14 @@ class Region:
     A htslib style region, where coordinates are 1-based and inclusive.
     """
 
-    contig: str
+    contig: Optional[str] = None
     start: Optional[int] = None
     end: Optional[int] = None
 
     def __post_init__(self):
+        if self.contig is None:
+            return
+
         if self.start is not None:
             self.start = int(self.start)
             assert self.start > 0
@@ -408,9 +411,8 @@ def __init__(self, vcf_path, index_path=None):
                     vcf_path.suffix + VcfIndexType.CSI.value
                 )
                 if not index_path.exists():
-                    raise FileNotFoundError(
-                        f"Cannot find .tbi or .csi file for {vcf_path}"
-                    )
+                    # No .tbi/.csi found: the VCF path itself is a proxy for "no index"
+                    index_path = vcf_path
         else:
             index_path = pathlib.Path(index_path)
 
@@ -424,14 +426,18 @@ def __init__(self, vcf_path, index_path=None):
         elif index_path.suffix == VcfIndexType.TABIX.value:
             self.index_type = VcfIndexType.TABIX
             self.file_type = VcfFileType.VCF
-        else:
-            raise ValueError("Only .tbi or .csi indexes are supported.")
+        # else:
+
+        #     raise ValueError("Only .tbi or .csi indexes are supported.")
 
         self.vcf = cyvcf2.VCF(vcf_path)
-        self.vcf.set_index(str(self.index_path))
+        if self.index_type is not None:
+            self.vcf.set_index(str(self.index_path))
+
         logger.debug(f"Loaded {vcf_path} with index {self.index_path}")
         self.sequence_names = None
 
+        self.index = None
         if self.index_type == VcfIndexType.CSI:
             # Determine the file-type based on the "aux" field.
             self.index = read_csi(self.index_path)
@@ -441,7 +447,7 @@ def __init__(self, vcf_path, index_path=None):
                 self.sequence_names = self.index.parse_vcf_aux()
             else:
                 self.sequence_names = self.vcf.seqnames
-        else:
+        elif self.index_type == VcfIndexType.TABIX:
             self.index = read_tabix(self.index_path)
             self.sequence_names = self.index.sequence_names
 
@@ -452,6 +458,8 @@ def __exit__(self, exc_type, exc_val, exc_tb):
         return False
 
     def contig_record_counts(self):
+        if self.index is None:
+            return {None: np.inf}
         d = dict(zip(self.sequence_names, self.index.record_counts))
         if self.file_type == VcfFileType.BCF:
             d = {k: v for k, v in d.items() if v > 0}
@@ -461,11 +469,15 @@ def count_variants(self, region):
         return sum(1 for _ in self.variants(region))
 
     def variants(self, region):
-        start = 1 if region.start is None else region.start
-        for var in self.vcf(str(region)):
-            # Need to filter because of indels overlapping the region
-            if var.POS >= start:
-                yield var
+        if self.index is None:
+            assert region.contig is None
+            yield from self.vcf
+        else:
+            start = 1 if region.start is None else region.start
+            for var in self.vcf(str(region)):
+                # Need to filter because of indels overlapping the region
+                if var.POS >= start:
+                    yield var
 
     def _filter_empty_and_refine(self, regions):
         """
@@ -486,6 +498,9 @@ def partition_into_regions(
         num_parts: Optional[int] = None,
         target_part_size: Union[None, int, str] = None,
     ):
+        if self.index is None:
+            return [Region()]
+
         if num_parts is None and target_part_size is None:
             raise ValueError("One of num_parts or target_part_size must be specified")
 
diff --git a/tests/test_simulated_data.py b/tests/test_simulated_data.py
@@ -1,7 +1,6 @@
 import sys
 
 import numpy.testing as nt
-import pysam
 import pytest
 import sgkit as sg
 
@@ -45,9 +44,10 @@ def assert_ts_ds_equal(ts, ds, ploidy=1):
 def write_vcf(ts, vcf_path, contig_id="1"):
     with open(vcf_path, "w") as f:
         ts.write_vcf(f, contig_id=contig_id)
-    # This also compresses the input file
-    pysam.tabix_index(str(vcf_path), preset="vcf")
-    return vcf_path.with_suffix(vcf_path.suffix + ".gz")
+    # # This also compresses the input file
+    # pysam.tabix_index(str(vcf_path), preset="vcf")
+    # return vcf_path.with_suffix(vcf_path.suffix + ".gz")
+    return vcf_path
 
 
 # https://github.com/sgkit-dev/bio2zarr/issues/336