@@ -294,17 +294,18 @@ def scan_vcf(path, target_num_partitions):
 
 def check_overlap(partitions):
     for i in range(1, len(partitions)):
-        prev_partition = partitions[i - 1]
-        current_partition = partitions[i]
-        if (
-            prev_partition.region.contig == current_partition.region.contig
-            and prev_partition.region.end > current_partition.region.start
-        ):
-            raise ValueError(
-                f"Multiple VCFs have the region "
-                f"{prev_partition.region.contig}:{prev_partition.region.start}-"
-                f"{current_partition.region.end}"
-            )
+        prev_region = partitions[i - 1].region
+        current_region = partitions[i].region
+        if prev_region.contig == current_region.contig:
+            if prev_region.end is None:
+                logger.warning("Cannot check overlaps; issue #146")
+                continue
+            if prev_region.end > current_region.start:
+                raise ValueError(
+                    f"Multiple VCFs have the region "
+                    f"{prev_region.contig}:{prev_region.start}-"
+                    f"{current_region.end}"
+                )
 
 
 def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
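A minimal standalone sketch of the behaviour this hunk introduces, using hypothetical stand-in classes rather than the module's own region and partition types: a region whose end is unknown is now warned about and skipped, where the old code would have raised TypeError from the None > int comparison.

# Illustration only: Region/Partition here are hypothetical stand-ins.
import dataclasses
import logging
from typing import Optional

logging.basicConfig()
logger = logging.getLogger(__name__)

@dataclasses.dataclass
class Region:
    contig: str
    start: int
    end: Optional[int]  # None means the end is unknown (issue #146)

@dataclasses.dataclass
class Partition:
    region: Region

def check_overlap_sketch(partitions):
    for i in range(1, len(partitions)):
        prev_region = partitions[i - 1].region
        current_region = partitions[i].region
        if prev_region.contig == current_region.contig:
            if prev_region.end is None:
                # Unknown end: warn and carry on instead of crashing.
                logger.warning("Cannot check overlaps; issue #146")
                continue
            if prev_region.end > current_region.start:
                raise ValueError("Multiple VCFs have the same region")

# Unknown end: warns and continues.
check_overlap_sketch([Partition(Region("chr1", 1, None)), Partition(Region("chr1", 500, 900))])
# Genuine overlap on the same contig: rejected.
try:
    check_overlap_sketch([Partition(Region("chr1", 1, 600)), Partition(Region("chr1", 500, 900))])
except ValueError as e:
    print(e)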
@@ -453,7 +454,7 @@ def sanitise_value_float_2d(buff, j, value):
 
 def sanitise_int_array(value, ndmin, dtype):
     if isinstance(value, tuple):
-        value = [VCF_INT_MISSING if x is None else x for x in value] # NEEDS TEST
+        value = [VCF_INT_MISSING if x is None else x for x in value]  # NEEDS TEST
     value = np.array(value, ndmin=ndmin, copy=False)
     value[value == VCF_INT_MISSING] = -1
     value[value == VCF_INT_FILL] = -2
@@ -1548,10 +1549,8 @@ def parse_max_memory(max_memory):
 
 @dataclasses.dataclass
 class VcfZarrPartition:
-    start_index: int
-    stop_index: int
-    start_chunk: int
-    stop_chunk: int
+    start: int
+    stop: int
 
     @staticmethod
     def generate_partitions(num_records, chunk_size, num_partitions, max_chunks=None):
@@ -1565,9 +1564,7 @@ def generate_partitions(num_records, chunk_size, num_partitions, max_chunks=None
             stop_chunk = int(chunk_slice[-1]) + 1
             start_index = start_chunk * chunk_size
             stop_index = min(stop_chunk * chunk_size, num_records)
-            partitions.append(
-                VcfZarrPartition(start_index, stop_index, start_chunk, stop_chunk)
-            )
+            partitions.append(VcfZarrPartition(start_index, stop_index))
         return partitions
 
 
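A rough sketch of the start/stop arithmetic used in generate_partitions above, assuming chunk indices are divided with numpy.array_split (as the chunk_slice variable suggests) and that the chunk count is the ceiling of num_records / chunk_size; this is an illustration under those assumptions, not the module's implementation.

import numpy as np

def sketch_partitions(num_records, chunk_size, num_partitions):
    num_chunks = -(-num_records // chunk_size)  # ceiling division
    partitions = []
    for chunk_slice in np.array_split(np.arange(num_chunks), num_partitions):
        if len(chunk_slice) == 0:
            continue
        start_chunk = int(chunk_slice[0])
        stop_chunk = int(chunk_slice[-1]) + 1
        # Each partition covers whole chunks, clamped to the record count.
        start = start_chunk * chunk_size
        stop = min(stop_chunk * chunk_size, num_records)
        partitions.append((start, stop))
    return partitions

print(sketch_partitions(25, 10, 2))  # [(0, 20), (20, 25)]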
@@ -1590,7 +1587,7 @@ def asdict(self):
     def fromdict(d):
         if d["format_version"] != VZW_METADATA_FORMAT_VERSION:
             raise ValueError(
-                "VcfZarrWriter format version mismatch: "
+                "VcfZarrWriter format version mismatch: "
                 f"{d['format_version']} != {VZW_METADATA_FORMAT_VERSION}"
             )
         ret = VcfZarrWriterMetadata(**d)
@@ -1675,7 +1672,7 @@ def init(
         root = zarr.group(store=store)
 
         for column in self.schema.columns.values():
-            self.init_array(root, column, partitions[-1].stop_index)
+            self.init_array(root, column, partitions[-1].stop)
 
         logger.info("Writing WIP metadata")
         with open(self.wip_path / "metadata.json", "w") as f:
@@ -1762,28 +1759,42 @@ def load_metadata(self):
     def partition_path(self, partition_index):
         return self.partitions_path / f"p{partition_index}"
 
+    def wip_partition_path(self, partition_index):
+        return self.partitions_path / f"wip_p{partition_index}"
+
     def wip_partition_array_path(self, partition_index, name):
-        return self.partition_path(partition_index) / f"wip_{name}"
+        return self.wip_partition_path(partition_index) / name
 
     def partition_array_path(self, partition_index, name):
         return self.partition_path(partition_index) / name
 
     def encode_partition(self, partition_index):
         self.load_metadata()
-        partition_path = self.partition_path(partition_index)
+        if partition_index < 0 or partition_index >= self.num_partitions:
+            raise ValueError(
+                "Partition index must be in the range 0 <= index < num_partitions"
+            )
+        partition_path = self.wip_partition_path(partition_index)
         partition_path.mkdir(exist_ok=True)
         logger.info(f"Encoding partition {partition_index} to {partition_path}")
 
-        self.encode_alleles_partition(partition_index)
         self.encode_id_partition(partition_index)
         self.encode_filters_partition(partition_index)
         self.encode_contig_partition(partition_index)
+        self.encode_alleles_partition(partition_index)
         for col in self.schema.columns.values():
             if col.vcf_field is not None:
                 self.encode_array_partition(col, partition_index)
         if "call_genotype" in self.schema.columns:
             self.encode_genotypes_partition(partition_index)
 
+        final_path = self.partition_path(partition_index)
+        logger.info(f"Finalising {partition_index} at {final_path}")
+        if final_path.exists():
+            logger.warning(f"Removing existing partition at {final_path}")
+            shutil.rmtree(final_path)
+        os.rename(partition_path, final_path)
+
     def init_partition_array(self, partition_index, name):
         wip_path = self.wip_partition_array_path(partition_index, name)
         # Create an empty array like the definition
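The encode_partition change above moves to a write-then-rename pattern at the partition level: all output lands under a wip_pN directory and only becomes visible as pN via a single os.rename once the partition is complete. A minimal sketch of that idea, with a hypothetical write_arrays callback and path layout (not the class's actual API):

import os
import pathlib
import shutil

def encode_partition_sketch(partitions_path, index, write_arrays):
    wip = pathlib.Path(partitions_path) / f"wip_p{index}"
    final = pathlib.Path(partitions_path) / f"p{index}"
    wip.mkdir(parents=True, exist_ok=True)
    write_arrays(wip)           # every per-array file goes under wip_pN
    if final.exists():          # a previous run left a completed partition
        shutil.rmtree(final)
    os.rename(wip, final)       # pN only ever appears fully written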
@@ -1795,27 +1806,17 @@ def init_partition_array(self, partition_index, name):
         return array
 
     def finalise_partition_array(self, partition_index, name):
-        wip_path = self.wip_partition_array_path(partition_index, name)
-        final_path = self.partition_array_path(partition_index, name)
-        if final_path.exists():
-            # NEEDS TEST
-            logger.warning(f"Removing existing {final_path}")
-            shutil.rmtree(final_path)
-        # Atomic swap
-        os.rename(wip_path, final_path)
         logger.debug(f"Encoded {name} partition {partition_index}")
 
     def encode_array_partition(self, column, partition_index):
         array = self.init_partition_array(partition_index, column.name)
 
         partition = self.metadata.partitions[partition_index]
-        ba = core.BufferedArray(array, partition.start_index)
+        ba = core.BufferedArray(array, partition.start)
         source_col = self.icf.columns[column.vcf_field]
         sanitiser = source_col.sanitiser_factory(ba.buff.shape)
 
-        for value in source_col.iter_values(
-            partition.start_index, partition.stop_index
-        ):
+        for value in source_col.iter_values(partition.start, partition.stop):
             # We write directly into the buffer in the sanitiser function
             # to make it easier to reason about dimension padding
             j = ba.next_buffer_row()
@@ -1831,14 +1832,12 @@ def encode_genotypes_partition(self, partition_index):
         )
 
         partition = self.metadata.partitions[partition_index]
-        gt = core.BufferedArray(gt_array, partition.start_index)
-        gt_mask = core.BufferedArray(gt_mask_array, partition.start_index)
-        gt_phased = core.BufferedArray(gt_phased_array, partition.start_index)
+        gt = core.BufferedArray(gt_array, partition.start)
+        gt_mask = core.BufferedArray(gt_mask_array, partition.start)
+        gt_phased = core.BufferedArray(gt_phased_array, partition.start)
 
         source_col = self.icf.columns["FORMAT/GT"]
-        for value in source_col.iter_values(
-            partition.start_index, partition.stop_index
-        ):
+        for value in source_col.iter_values(partition.start, partition.stop):
             j = gt.next_buffer_row()
             sanitise_value_int_2d(gt.buff, j, value[:, :-1])
             j = gt_phased.next_buffer_row()
@@ -1859,13 +1858,13 @@ def encode_alleles_partition(self, partition_index):
         array_name = "variant_allele"
         alleles_array = self.init_partition_array(partition_index, array_name)
         partition = self.metadata.partitions[partition_index]
-        alleles = core.BufferedArray(alleles_array, partition.start_index)
+        alleles = core.BufferedArray(alleles_array, partition.start)
         ref_col = self.icf.columns["REF"]
         alt_col = self.icf.columns["ALT"]
 
         for ref, alt in zip(
-            ref_col.iter_values(partition.start_index, partition.stop_index),
-            alt_col.iter_values(partition.start_index, partition.stop_index),
+            ref_col.iter_values(partition.start, partition.stop),
+            alt_col.iter_values(partition.start, partition.stop),
         ):
             j = alleles.next_buffer_row()
             alleles.buff[j, :] = STR_FILL
@@ -1879,11 +1878,11 @@ def encode_id_partition(self, partition_index):
         vid_array = self.init_partition_array(partition_index, "variant_id")
         vid_mask_array = self.init_partition_array(partition_index, "variant_id_mask")
         partition = self.metadata.partitions[partition_index]
-        vid = core.BufferedArray(vid_array, partition.start_index)
-        vid_mask = core.BufferedArray(vid_mask_array, partition.start_index)
+        vid = core.BufferedArray(vid_array, partition.start)
+        vid_mask = core.BufferedArray(vid_mask_array, partition.start)
         col = self.icf.columns["ID"]
 
-        for value in col.iter_values(partition.start_index, partition.stop_index):
+        for value in col.iter_values(partition.start, partition.stop):
             j = vid.next_buffer_row()
             k = vid_mask.next_buffer_row()
             assert j == k
@@ -1904,10 +1903,10 @@ def encode_filters_partition(self, partition_index):
         array_name = "variant_filter"
         array = self.init_partition_array(partition_index, array_name)
         partition = self.metadata.partitions[partition_index]
-        var_filter = core.BufferedArray(array, partition.start_index)
+        var_filter = core.BufferedArray(array, partition.start)
 
         col = self.icf.columns["FILTERS"]
-        for value in col.iter_values(partition.start_index, partition.stop_index):
+        for value in col.iter_values(partition.start, partition.stop):
             j = var_filter.next_buffer_row()
             var_filter.buff[j] = False
             for f in value:
@@ -1926,10 +1925,10 @@ def encode_contig_partition(self, partition_index):
         array_name = "variant_contig"
         array = self.init_partition_array(partition_index, array_name)
         partition = self.metadata.partitions[partition_index]
-        contig = core.BufferedArray(array, partition.start_index)
+        contig = core.BufferedArray(array, partition.start)
         col = self.icf.columns["CHROM"]
 
-        for value in col.iter_values(partition.start_index, partition.stop_index):
+        for value in col.iter_values(partition.start, partition.stop):
             j = contig.next_buffer_row()
             # Note: because we are using the indexes to define the lookups
             # and we always have an index, it seems that we the contig lookup
@@ -1950,7 +1949,7 @@ def finalise_array(self, name):
         if final_path.exists():
             # NEEDS TEST
             raise ValueError(f"Array {name} already exists")
-        for partition in range(len(self.metadata.partitions)):
+        for partition in range(self.num_partitions):
             # Move all the files in partition dir to dest dir
             src = self.partition_array_path(partition, name)
             if not src.exists():
@@ -1977,6 +1976,15 @@ def finalise_array(self, name):
     def finalise(self, show_progress=False):
         self.load_metadata()
 
+        logger.info(f"Scanning {self.num_partitions} partitions")
+        missing = []
+        # TODO may need a progress bar here
+        for partition_id in range(self.num_partitions):
+            if not self.partition_path(partition_id).exists():
+                missing.append(partition_id)
+        if len(missing) > 0:
+            raise FileNotFoundError(f"Partitions not encoded: {missing}")
+
         progress_config = core.ProgressConfig(
             total=len(self.schema.columns),
             title="Finalise",
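A small sketch of the pre-flight check finalise now performs, refusing to proceed until every pN partition directory exists; the helper name and the partitions_path argument are hypothetical, but the pN naming follows the partition_path convention above.

import pathlib

def assert_all_partitions_encoded(partitions_path, num_partitions):
    # Collect indices whose pN directory is missing before touching anything.
    missing = [
        i
        for i in range(num_partitions)
        if not (pathlib.Path(partitions_path) / f"p{i}").exists()
    ]
    if missing:
        raise FileNotFoundError(f"Partitions not encoded: {missing}")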
@@ -1994,6 +2002,9 @@ def finalise(self, show_progress=False):
         with core.ParallelWorkManager(0, progress_config) as pwm:
             for name in self.schema.columns:
                 pwm.submit(self.finalise_array, name)
+        logger.debug(f"Removing {self.wip_path}")
+        shutil.rmtree(self.wip_path)
+        logger.info("Consolidating Zarr metadata")
         zarr.consolidate_metadata(self.path)
 
 ######################