@@ -144,6 +144,7 @@ class VcfPartition:
144
144
num_records : int = - 1
145
145
146
146
147
+ # TODO bump this before current PR is done!
147
148
ICF_METADATA_FORMAT_VERSION = "0.2"
148
149
ICF_DEFAULT_COMPRESSOR = numcodecs .Blosc (
149
150
cname = "zstd" , clevel = 7 , shuffle = numcodecs .Blosc .NOSHUFFLE
@@ -903,6 +904,40 @@ def num_columns(self):
903
904
return len (self .columns )
904
905
905
906
907
+ @dataclasses .dataclass
908
+ class IcfPartitionMetadata :
909
+ num_records : int
910
+ last_position : int
911
+ field_summaries : dict
912
+
913
+ def asdict (self ):
914
+ return dataclasses .asdict (self )
915
+
916
+ def asjson (self ):
917
+ return json .dumps (self .asdict (), indent = 4 )
918
+
919
+ @staticmethod
920
+ def fromdict (d ):
921
+ md = IcfPartitionMetadata (** d )
922
+ for k , v in md .field_summaries .items ():
923
+ md .field_summaries [k ] = VcfFieldSummary .fromdict (v )
924
+ return md
925
+
926
+
927
+ def check_overlapping_partitions (partitions ):
928
+ for i in range (1 , len (partitions )):
929
+ prev_region = partitions [i - 1 ].region
930
+ current_region = partitions [i ].region
931
+ if prev_region .contig == current_region .contig :
932
+ assert prev_region .end is not None
933
+ # Regions are *inclusive*
934
+ if prev_region .end >= current_region .start :
935
+ raise ValueError (
936
+ f"Overlapping VCF regions in partitions { i - 1 } and { i } : "
937
+ f"{ prev_region } and { current_region } "
938
+ )
939
+
940
+
906
941
class IntermediateColumnarFormatWriter :
907
942
def __init__ (self , path ):
908
943
self .path = pathlib .Path (path )
@@ -974,11 +1009,8 @@ def load_partition_summaries(self):
974
1009
not_found = []
975
1010
for j in range (self .num_partitions ):
976
1011
try :
977
- with open (self .wip_path / f"p{ j } _summary.json" ) as f :
978
- summary = json .load (f )
979
- for k , v in summary ["field_summaries" ].items ():
980
- summary ["field_summaries" ][k ] = VcfFieldSummary .fromdict (v )
981
- summaries .append (summary )
1012
+ with open (self .wip_path / f"p{ j } .json" ) as f :
1013
+ summaries .append (IcfPartitionMetadata .fromdict (json .load (f )))
982
1014
except FileNotFoundError :
983
1015
not_found .append (j )
984
1016
if len (not_found ) > 0 :
@@ -995,7 +1027,7 @@ def load_metadata(self):
995
1027
996
1028
def process_partition (self , partition_index ):
997
1029
self .load_metadata ()
998
- summary_path = self .wip_path / f"p{ partition_index } _summary .json"
1030
+ summary_path = self .wip_path / f"p{ partition_index } .json"
999
1031
# If someone is rewriting a summary path (for whatever reason), make sure it
1000
1032
# doesn't look like it's already been completed.
1001
1033
# NOTE to do this properly we probably need to take a lock on this file - but
@@ -1016,6 +1048,7 @@ def process_partition(self, partition_index):
1016
1048
else :
1017
1049
format_fields .append (field )
1018
1050
1051
+ last_position = None
1019
1052
with IcfPartitionWriter (
1020
1053
self .metadata ,
1021
1054
self .path ,
@@ -1025,6 +1058,7 @@ def process_partition(self, partition_index):
1025
1058
num_records = 0
1026
1059
for variant in ivcf .variants (partition .region ):
1027
1060
num_records += 1
1061
+ last_position = variant .POS
1028
1062
tcw .append ("CHROM" , variant .CHROM )
1029
1063
tcw .append ("POS" , variant .POS )
1030
1064
tcw .append ("QUAL" , variant .QUAL )
@@ -1049,15 +1083,16 @@ def process_partition(self, partition_index):
1049
1083
f"flushing buffers"
1050
1084
)
1051
1085
1052
- partition_metadata = {
1053
- "num_records" : num_records ,
1054
- "field_summaries" : {k : v .asdict () for k , v in tcw .field_summaries .items ()},
1055
- }
1086
+ partition_metadata = IcfPartitionMetadata (
1087
+ num_records = num_records ,
1088
+ last_position = last_position ,
1089
+ field_summaries = tcw .field_summaries ,
1090
+ )
1056
1091
with open (summary_path , "w" ) as f :
1057
- json . dump (partition_metadata , f , indent = 4 )
1092
+ f . write (partition_metadata . asjson () )
1058
1093
logger .info (
1059
- f"Finish p{ partition_index } { partition .vcf_path } __{ partition .region } = "
1060
- f"{ num_records } records"
1094
+ f"Finish p{ partition_index } { partition .vcf_path } __{ partition .region } "
1095
+ f"{ num_records } records last_pos= { last_position } "
1061
1096
)
1062
1097
1063
1098
def explode (self , * , worker_processes = 1 , show_progress = False ):
@@ -1099,8 +1134,9 @@ def finalise(self):
1099
1134
partition_summaries = self .load_partition_summaries ()
1100
1135
total_records = 0
1101
1136
for index , summary in enumerate (partition_summaries ):
1102
- partition_records = summary [ " num_records" ]
1137
+ partition_records = summary . num_records
1103
1138
self .metadata .partitions [index ].num_records = partition_records
1139
+ self .metadata .partitions [index ].region .end = summary .last_position
1104
1140
total_records += partition_records
1105
1141
if not np .isinf (self .metadata .num_records ):
1106
1142
# Note: this is just telling us that there's a bug in the
@@ -1110,9 +1146,11 @@ def finalise(self):
1110
1146
assert total_records == self .metadata .num_records
1111
1147
self .metadata .num_records = total_records
1112
1148
1149
+ check_overlapping_partitions (self .metadata .partitions )
1150
+
1113
1151
for field in self .metadata .fields :
1114
1152
for summary in partition_summaries :
1115
- field .summary .update (summary [ " field_summaries" ] [field .full_name ])
1153
+ field .summary .update (summary . field_summaries [field .full_name ])
1116
1154
1117
1155
logger .info ("Finalising metadata" )
1118
1156
with open (self .path / "metadata.json" , "w" ) as f :
@@ -1756,7 +1794,7 @@ def encode_partition(self, partition_index):
1756
1794
final_path = self .partition_path (partition_index )
1757
1795
logger .info (f"Finalising { partition_index } at { final_path } " )
1758
1796
if final_path .exists ():
1759
- logger .warning ("Removing existing partition at {final_path}" )
1797
+ logger .warning (f "Removing existing partition at { final_path } " )
1760
1798
shutil .rmtree (final_path )
1761
1799
os .rename (partition_path , final_path )
1762
1800
0 commit comments