@@ -31,17 +31,28 @@ class Field:
3131 vcf_key : str
3232 vcf_type : str
3333 vcf_number : str
34+ description : str = "Generated field"
3435
3536 def get_header (self ):
3637 return (
3738 f"##{ self .category } =<"
3839 f"ID={ self .vcf_key } ,"
3940 f"Type={ self .vcf_type } ,"
4041 f"Number={ self .vcf_number } ,"
41- f'Description="{ self .category } ,Type= { self . vcf_type } ,Number= { self . vcf_number } ">'
42+ f'Description="{ self .description } ">'
4243 )
4344
4445
46+ # GT is a special case, since it has a special syntax, and must be listed as the first
47+ # format field (if present)
48+ GT = Field (
49+ category = "FORMAT" ,
50+ vcf_key = "GT" ,
51+ vcf_type = "String" ,
52+ vcf_number = "1" ,
53+ description = "Genotype" ,
54+ )
55+
4556# references to the VCF spec are for https://samtools.github.io/hts-specs/VCFv4.3.pdf
4657
4758# [Table 1: Reserved INFO keys]
@@ -133,7 +144,7 @@ def vcf_numbers(category, max_number):
133144def vcf_fields (category , max_number ):
134145 # info flag fields must have number 0
135146 # non-flag fields can't have number 0
136- return builds (
147+ general_fields = builds (
137148 Field ,
138149 category = just (category ),
139150 vcf_key = vcf_field_keys (category ),
@@ -143,6 +154,11 @@ def vcf_fields(category, max_number):
143154 lambda field : (field .vcf_type == "Flag" and field .vcf_number == "0" )
144155 or (field .vcf_type != "Flag" and field .vcf_number != "0" )
145156 )
157+ if category == "INFO" :
158+ return general_fields
159+ else :
160+ # FORMAT: GT special case
161+ return one_of (just (GT ), general_fields )
146162
147163
148164# [1.6.1 Fixed fields]
@@ -183,8 +199,31 @@ def qualities():
183199 )
184200
185201
202+ # [1.6.2 Genotype fields]
203+
204+
205+ def genotypes (alleles , ploidy ):
206+ def gt_str (allele_indexes , phased ):
207+ sep = "|" if phased else "/"
208+ return sep .join (
209+ [str (idx ) if idx is not None else "." for idx in allele_indexes ]
210+ )
211+
212+ return builds (
213+ gt_str ,
214+ lists (
215+ one_of (integers (0 , alleles - 1 ), none ()), min_size = ploidy , max_size = ploidy
216+ ),
217+ booleans (),
218+ )
219+
220+
186221@composite
187222def vcf_values (draw , field , * , max_number , alt_alleles , ploidy ):
223+ # GT special case
224+ if field is GT :
225+ return [draw (genotypes (alleles = alt_alleles + 1 , ploidy = ploidy ))]
226+
188227 # [1.3 Data types]
189228 if field .vcf_type == "Integer" :
190229 # some integer values at lower end of range are not allowed
@@ -231,6 +270,15 @@ def vcf_number_to_ints(vcf_number, *, max_number, alt_alleles, ploidy):
231270 raise ValueError (f"Number '{ vcf_number } ' is not supported." )
232271
233272
273+ def ensure_gt_first (format_fields ):
274+ # GT must be the first field if present [1.6.2 Genotype fields]
275+ try :
276+ i = format_fields .index (GT )
277+ format_fields .insert (0 , format_fields .pop (i ))
278+ except ValueError :
279+ pass
280+
281+
234282@composite
235283def vcf (
236284 draw ,
@@ -290,6 +338,7 @@ def vcf(
290338 unique_by = lambda f : f .vcf_key .lower (),
291339 )
292340 )
341+ ensure_gt_first (format_fields )
293342 sample_ids = draw (
294343 lists (
295344 text (alphabet = ALPHANUMERIC , min_size = 1 ), max_size = max_samples , unique = True
0 commit comments