
Commit b2ccec2

Change chunk files to store cumulative record counts
1 parent d73578a commit b2ccec2
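
The idea behind this commit: each column chunk file is named with the cumulative record count at the end of that chunk (e.g. 118, 236, 354, ...) rather than a sequential c0, c1, c2. A sorted directory listing then doubles as a per-partition record index, so a record offset can be mapped to its chunk with a binary search. The sketch below illustrates the idea only; the filenames and the record offset k are invented, and this is not code from the commit.

    import numpy as np

    # Chunk filenames as they might appear in one partition directory,
    # in arbitrary listdir() order (values invented for illustration).
    filenames = ["236", "118", "708", "354", "472", "590", "826", "933"]

    # Parsing the names as integers and sorting recovers both the chunk
    # order and the cumulative record counts.
    cumulative = np.sort(np.array(filenames, dtype=int))
    print(cumulative)  # [118 236 354 472 590 708 826 933]

    # The chunk containing 0-based record k is the first whose cumulative
    # count exceeds k.
    k = 400
    chunk = int(np.searchsorted(cumulative, k, side="right"))
    print(chunk)  # 3, i.e. the file named "472" (records 354..471)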

File tree

bio2zarr/vcf.py
tests/test_pcvcf.py

2 files changed: +101 −26 lines


bio2zarr/vcf.py

Lines changed: 54 additions & 18 deletions
@@ -528,6 +528,10 @@ def __init__(self, pcvcf, vcf_field):
         self.compressor = pcvcf.compressor
         self.num_partitions = pcvcf.num_partitions
         self.num_records = pcvcf.num_records
+        self.partition_record_index = pcvcf.partition_record_index
+        # A map of partition index to the cumulative number of records
+        # in chunks
+        self._chunk_cumulative_records = {}

     @staticmethod
     def get_path(base_path, vcf_field):
@@ -536,17 +540,29 @@ def get_path(base_path, vcf_field):
         return base_path / vcf_field.category / vcf_field.name

     def __repr__(self):
-        return f"PickleChunkedVcfField(path={self.path})"
+        partition_chunks = [self.num_chunks(j) for j in range(self.num_partitions)]
+        return f"PickleChunkedVcfField(partition_chunks={partition_chunks}, path={self.path})"

     def num_chunks(self, partition_index):
+        return len(self.chunk_files(partition_index))
+
+    def chunk_cumulative_records(self, partition_index):
+        if partition_index not in self._chunk_cumulative_records:
+            partition_path = self.path / f"p{partition_index}"
+            # Let numpy do the string->int parsing
+            a = np.array(os.listdir(partition_path), dtype=int)
+            a.sort()
+            self._chunk_cumulative_records[partition_index] = a
+        return self._chunk_cumulative_records[partition_index]
+
+    def chunk_files(self, partition_index):
         partition_path = self.path / f"p{partition_index}"
-        return len(list(partition_path.iterdir()))
-
-    def chunk_path(self, partition_index, chunk_index):
-        return self.path / f"p{partition_index}" / f"c{chunk_index}"
+        return [
+            partition_path / str(n)
+            for n in self.chunk_cumulative_records(partition_index)
+        ]

-    def read_chunk(self, partition_index, chunk_index):
-        path = self.chunk_path(partition_index, chunk_index)
+    def read_chunk(self, path):
         with open(path, "rb") as f:
             pkl = self.compressor.decode(f.read())
         return pickle.loads(pkl), len(pkl)
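
chunk_cumulative_records above leans on numpy to parse and sort the numeric filenames in one vectorised call, caching the result per partition. A self-contained demonstration of that directory-listing trick (the counts are invented for illustration):

    import os
    import tempfile

    import numpy as np

    with tempfile.TemporaryDirectory() as partition_path:
        # Simulate chunk files flushed at these cumulative record counts.
        for count in [354, 118, 236]:
            open(os.path.join(partition_path, str(count)), "wb").close()

        # One string->int parse over the whole listing, then sort to restore
        # chunk order; os.listdir() makes no ordering guarantees.
        a = np.array(os.listdir(partition_path), dtype=int)
        a.sort()
        print(a)  # [118 236 354]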
@@ -555,8 +571,8 @@ def iter_values_bytes(self):
         num_records = 0
         bytes_read = 0
         for partition_index in range(self.num_partitions):
-            for chunk_index in range(self.num_chunks(partition_index)):
-                chunk, chunk_bytes = self.read_chunk(partition_index, chunk_index)
+            for chunk_path in self.chunk_files(partition_index):
+                chunk, chunk_bytes = self.read_chunk(chunk_path)
                 bytes_read += chunk_bytes
                 for record in chunk:
                     yield record, bytes_read
@@ -569,13 +585,21 @@ def iter_values_bytes(self):
     def iter_values(self, start=None, stop=None):
         start = 0 if start is None else start
         stop = self.num_records if stop is None else stop
-        num_records = 0
-        for partition_index in range(self.num_partitions):
-            for chunk_index in range(self.num_chunks(partition_index)):
-                chunk, chunk_bytes = self.read_chunk(partition_index, chunk_index)
+        start_partition = (
+            np.searchsorted(self.partition_record_index, start, side="right") - 1
+        )
+        num_records = self.partition_record_index[start_partition]
+        assert num_records <= start
+        for partition_index in range(start_partition, self.num_partitions):
+            # TODO use the offsets from the partition chunk counts to seek to
+            # the first chunk
+            for chunk_path in self.chunk_files(partition_index):
+                chunk, _ = self.read_chunk(chunk_path)
                 for record in chunk:
                     if start <= num_records < stop:
                         yield record
+                    if num_records >= stop:
+                        return
                     num_records += 1

     # Note: this involves some computation so should arguably be a method,
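
The searchsorted call in iter_values maps a start offset to the partition that contains it: side="right" places an offset sitting exactly on a partition boundary into the following partition, and the - 1 turns the insertion point into a partition index. A worked example using the partition_record_index values from the TestSlicing cases below:

    import numpy as np

    # Five partitions of 933 records each (see test_partition_record_index).
    partition_record_index = np.array([0, 933, 1866, 2799, 3732, 4665])

    for start in [0, 932, 933, 1000, 4664]:
        p = np.searchsorted(partition_record_index, start, side="right") - 1
        print(start, "->", int(p))
    # 0 -> 0, 932 -> 0, 933 -> 1, 1000 -> 1, 4664 -> 4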
@@ -627,13 +651,15 @@ class PcvcfFieldWriter:
     buff: list = dataclasses.field(default_factory=list)
     buffered_bytes: int = 0
     chunk_index: int = 0
+    num_records: int = 0

     def append(self, val):
         val = self.transformer.transform_and_update_bounds(val)
         assert val is None or isinstance(val, np.ndarray)
         self.buff.append(val)
         val_bytes = sys.getsizeof(val)
         self.buffered_bytes += val_bytes
+        self.num_records += 1
         if self.buffered_bytes >= self.max_buffered_bytes:
             logger.debug(
                 f"Flush {self.path} buffered={self.buffered_bytes} max={self.max_buffered_bytes}"
@@ -644,7 +670,7 @@ def append(self, val):
             self.chunk_index += 1

     def write_chunk(self):
-        path = self.path / f"c{self.chunk_index}"
+        path = self.path / f"{self.num_records}"
         logger.debug(f"Start write: {path}")
         pkl = pickle.dumps(self.buff)
         compressed = self.compressor.encode(pkl)
@@ -667,7 +693,7 @@ def flush(self):

 class PcvcfPartitionWriter(contextlib.AbstractContextManager):
     """
-    Writes the data for a PickleChunkedVcf for a given partition.
+    Writes the data for a PickleChunkedVcf partition.
     """

     def __init__(
@@ -724,11 +750,21 @@ def __init__(self, path, metadata, vcf_header):
         self.metadata = metadata
         self.vcf_header = vcf_header
         self.compressor = self.DEFAULT_COMPRESSOR
-
         self.columns = {}
+        partition_num_records = [
+            partition.num_records for partition in self.metadata.partitions
+        ]
+        # Allow us to find which partition a given record is in
+        self.partition_record_index = np.cumsum([0] + partition_num_records)
         for field in self.metadata.fields:
             self.columns[field.full_name] = PickleChunkedVcfField(self, field)

+    def __repr__(self):
+        return (
+            f"PickleChunkedVcf(fields={len(self)}, partitions={self.num_partitions}, "
+            f"records={self.num_records}, path={self.path})"
+        )
+
     def __getitem__(self, key):
         return self.columns[key]

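Prepending the zero before the cumulative sum is what lets the same array serve both lookups: partition_record_index[j] is the first record of partition j, and the final entry is the total record count. With the per-partition counts from the test data below:

    import numpy as np

    partition_num_records = [933, 933, 933, 933, 933]
    partition_record_index = np.cumsum([0] + partition_num_records)
    print(partition_record_index)  # [   0  933 1866 2799 3732 4665]
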
@@ -931,7 +967,6 @@ def convert(
             json.dump(vcf_metadata.asdict(), f, indent=4)
         with open(out_path / "header.txt", "w") as f:
             f.write(header)
-        return pcvcf


 def explode(
@@ -946,13 +981,14 @@ def explode(
     if out_path.exists():
         shutil.rmtree(out_path)

-    return PickleChunkedVcf.convert(
+    PickleChunkedVcf.convert(
         vcfs,
         out_path,
         column_chunk_size=column_chunk_size,
         worker_processes=worker_processes,
         show_progress=show_progress,
     )
+    return PickleChunkedVcf.load(out_path)


 def inspect(if_path):
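
Since convert no longer returns the in-memory object, explode now round-trips through PickleChunkedVcf.load so callers always get an object constructed from the on-disk metadata, as the new TestSlicing fixture below exercises. A hypothetical call, assuming the module is imported as in the test file and with placeholder paths:

    from bio2zarr import vcf

    # explode() converts the VCFs, then loads the result back from disk.
    pcvcf = vcf.explode(["sample.vcf.gz"], "sample.exploded")
    print(pcvcf)  # PickleChunkedVcf(fields=..., partitions=..., records=..., path=...)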

tests/test_pcvcf.py

Lines changed: 47 additions & 8 deletions
@@ -56,14 +56,6 @@ def test_POS(self, pcvcf):
             [111, 112, 14370, 17330, 1110696, 1230237, 1234567, 1235237, 10],
         )

-    def test_POS_slice(self, pcvcf):
-        col = pcvcf["POS"]
-        v = [row[0] for row in col.values]
-        start = 1
-        stop = 6
-        s = [row[0] for row in col.iter_values(start, stop)]
-        assert v[start:stop] == s
-
     def test_REF(self, pcvcf):
         ref = ["A", "A", "G", "T", "A", "T", "G", "T", "AC"]
         assert pcvcf["REF"].values == ref
@@ -156,3 +148,50 @@ def test_format_string2(self, pcvcf):
         non_missing = [v for v in pcvcf["FORMAT/FS2"].values if v is not None]
         nt.assert_array_equal(non_missing[0], [["bc", "op"], [".", "op"]])
         nt.assert_array_equal(non_missing[1], [["bc", "."], [".", "."]])
+
+
+class TestSlicing:
+    data_path = "tests/data/vcf/multi_contig.vcf.gz"
+
+    @pytest.fixture(scope="class")
+    def pcvcf(self, tmp_path_factory):
+        out = tmp_path_factory.mktemp("data") / "example.exploded"
+        return vcf.explode([self.data_path], out, column_chunk_size=0.0125)
+
+    def test_repr(self, pcvcf):
+        assert repr(pcvcf).startswith(
+            "PickleChunkedVcf(fields=7, partitions=5, records=4665, path="
+        )
+
+    def test_partition_record_index(self, pcvcf):
+        nt.assert_array_equal(
+            pcvcf.partition_record_index, [0, 933, 1866, 2799, 3732, 4665]
+        )
+
+    def test_pos_chunk_records(self, pcvcf):
+        pos = pcvcf["POS"]
+        for j in range(pos.num_partitions):
+            a = pos.chunk_cumulative_records(j)
+            nt.assert_array_equal(a, [118, 236, 354, 472, 590, 708, 826, 933])
+
+    @pytest.mark.parametrize(
+        ["start", "stop"],
+        [
+            (0, 1),
+            (0, 4665),
+            (100, 200),
+            (118, 237),
+            (710, 850),
+            (931, 1000),
+            (1865, 1867),
+            (1866, 2791),
+            (2732, 3200),
+            (4664, 4665),
+        ],
+    )
+    def test_slice(self, pcvcf, start, stop):
+        # TODO put in the actual values here, 5 copies of 0-933
+        col = pcvcf["POS"]
+        pos = np.array(col.values)
+        pos_slice = np.array(list(col.iter_values(start, stop)))
+        nt.assert_array_equal(pos[start:stop], pos_slice)
