@@ -1548,10 +1548,8 @@ def parse_max_memory(max_memory):
1548
1548
1549
1549
@dataclasses .dataclass
1550
1550
class VcfZarrPartition :
1551
- start_index : int
1552
- stop_index : int
1553
- start_chunk : int
1554
- stop_chunk : int
1551
+ start : int
1552
+ stop : int
1555
1553
1556
1554
@staticmethod
1557
1555
def generate_partitions (num_records , chunk_size , num_partitions , max_chunks = None ):
@@ -1565,9 +1563,7 @@ def generate_partitions(num_records, chunk_size, num_partitions, max_chunks=None
1565
1563
stop_chunk = int (chunk_slice [- 1 ]) + 1
1566
1564
start_index = start_chunk * chunk_size
1567
1565
stop_index = min (stop_chunk * chunk_size , num_records )
1568
- partitions .append (
1569
- VcfZarrPartition (start_index , stop_index , start_chunk , stop_chunk )
1570
- )
1566
+ partitions .append (VcfZarrPartition (start_index , stop_index ))
1571
1567
return partitions
1572
1568
1573
1569
@@ -1590,7 +1586,7 @@ def asdict(self):
1590
1586
def fromdict (d ):
1591
1587
if d ["format_version" ] != VZW_METADATA_FORMAT_VERSION :
1592
1588
raise ValueError (
1593
- "VcfZarrWriter format version mismatch: "
1589
+ "VcfZarrWriter format version mismatch: "
1594
1590
f"{ d ['format_version' ]} != { VZW_METADATA_FORMAT_VERSION } "
1595
1591
)
1596
1592
ret = VcfZarrWriterMetadata (** d )
@@ -1675,7 +1671,7 @@ def init(
1675
1671
root = zarr .group (store = store )
1676
1672
1677
1673
for column in self .schema .columns .values ():
1678
- self .init_array (root , column , partitions [- 1 ].stop_index )
1674
+ self .init_array (root , column , partitions [- 1 ].stop )
1679
1675
1680
1676
logger .info ("Writing WIP metadata" )
1681
1677
with open (self .wip_path / "metadata.json" , "w" ) as f :
@@ -1809,13 +1805,11 @@ def encode_array_partition(self, column, partition_index):
1809
1805
array = self .init_partition_array (partition_index , column .name )
1810
1806
1811
1807
partition = self .metadata .partitions [partition_index ]
1812
- ba = core .BufferedArray (array , partition .start_index )
1808
+ ba = core .BufferedArray (array , partition .start )
1813
1809
source_col = self .icf .columns [column .vcf_field ]
1814
1810
sanitiser = source_col .sanitiser_factory (ba .buff .shape )
1815
1811
1816
- for value in source_col .iter_values (
1817
- partition .start_index , partition .stop_index
1818
- ):
1812
+ for value in source_col .iter_values (partition .start , partition .stop ):
1819
1813
# We write directly into the buffer in the sanitiser function
1820
1814
# to make it easier to reason about dimension padding
1821
1815
j = ba .next_buffer_row ()
@@ -1831,14 +1825,12 @@ def encode_genotypes_partition(self, partition_index):
1831
1825
)
1832
1826
1833
1827
partition = self .metadata .partitions [partition_index ]
1834
- gt = core .BufferedArray (gt_array , partition .start_index )
1835
- gt_mask = core .BufferedArray (gt_mask_array , partition .start_index )
1836
- gt_phased = core .BufferedArray (gt_phased_array , partition .start_index )
1828
+ gt = core .BufferedArray (gt_array , partition .start )
1829
+ gt_mask = core .BufferedArray (gt_mask_array , partition .start )
1830
+ gt_phased = core .BufferedArray (gt_phased_array , partition .start )
1837
1831
1838
1832
source_col = self .icf .columns ["FORMAT/GT" ]
1839
- for value in source_col .iter_values (
1840
- partition .start_index , partition .stop_index
1841
- ):
1833
+ for value in source_col .iter_values (partition .start , partition .stop ):
1842
1834
j = gt .next_buffer_row ()
1843
1835
sanitise_value_int_2d (gt .buff , j , value [:, :- 1 ])
1844
1836
j = gt_phased .next_buffer_row ()
@@ -1859,13 +1851,13 @@ def encode_alleles_partition(self, partition_index):
1859
1851
array_name = "variant_allele"
1860
1852
alleles_array = self .init_partition_array (partition_index , array_name )
1861
1853
partition = self .metadata .partitions [partition_index ]
1862
- alleles = core .BufferedArray (alleles_array , partition .start_index )
1854
+ alleles = core .BufferedArray (alleles_array , partition .start )
1863
1855
ref_col = self .icf .columns ["REF" ]
1864
1856
alt_col = self .icf .columns ["ALT" ]
1865
1857
1866
1858
for ref , alt in zip (
1867
- ref_col .iter_values (partition .start_index , partition .stop_index ),
1868
- alt_col .iter_values (partition .start_index , partition .stop_index ),
1859
+ ref_col .iter_values (partition .start , partition .stop ),
1860
+ alt_col .iter_values (partition .start , partition .stop ),
1869
1861
):
1870
1862
j = alleles .next_buffer_row ()
1871
1863
alleles .buff [j , :] = STR_FILL
@@ -1879,11 +1871,11 @@ def encode_id_partition(self, partition_index):
1879
1871
vid_array = self .init_partition_array (partition_index , "variant_id" )
1880
1872
vid_mask_array = self .init_partition_array (partition_index , "variant_id_mask" )
1881
1873
partition = self .metadata .partitions [partition_index ]
1882
- vid = core .BufferedArray (vid_array , partition .start_index )
1883
- vid_mask = core .BufferedArray (vid_mask_array , partition .start_index )
1874
+ vid = core .BufferedArray (vid_array , partition .start )
1875
+ vid_mask = core .BufferedArray (vid_mask_array , partition .start )
1884
1876
col = self .icf .columns ["ID" ]
1885
1877
1886
- for value in col .iter_values (partition .start_index , partition .stop_index ):
1878
+ for value in col .iter_values (partition .start , partition .stop ):
1887
1879
j = vid .next_buffer_row ()
1888
1880
k = vid_mask .next_buffer_row ()
1889
1881
assert j == k
@@ -1904,10 +1896,10 @@ def encode_filters_partition(self, partition_index):
1904
1896
array_name = "variant_filter"
1905
1897
array = self .init_partition_array (partition_index , array_name )
1906
1898
partition = self .metadata .partitions [partition_index ]
1907
- var_filter = core .BufferedArray (array , partition .start_index )
1899
+ var_filter = core .BufferedArray (array , partition .start )
1908
1900
1909
1901
col = self .icf .columns ["FILTERS" ]
1910
- for value in col .iter_values (partition .start_index , partition .stop_index ):
1902
+ for value in col .iter_values (partition .start , partition .stop ):
1911
1903
j = var_filter .next_buffer_row ()
1912
1904
var_filter .buff [j ] = False
1913
1905
for f in value :
@@ -1926,10 +1918,10 @@ def encode_contig_partition(self, partition_index):
1926
1918
array_name = "variant_contig"
1927
1919
array = self .init_partition_array (partition_index , array_name )
1928
1920
partition = self .metadata .partitions [partition_index ]
1929
- contig = core .BufferedArray (array , partition .start_index )
1921
+ contig = core .BufferedArray (array , partition .start )
1930
1922
col = self .icf .columns ["CHROM" ]
1931
1923
1932
- for value in col .iter_values (partition .start_index , partition .stop_index ):
1924
+ for value in col .iter_values (partition .start , partition .stop ):
1933
1925
j = contig .next_buffer_row ()
1934
1926
# Note: because we are using the indexes to define the lookups
1935
1927
# and we always have an index, it seems that the contig lookup
0 commit comments