2
2
import shutil
3
3
import string
4
4
from pathlib import Path
5
- from random import choice , choices , randint
6
5
from typing import Any , Dict , Tuple
7
6
8
7
import numpy as np
@@ -40,7 +39,7 @@ def fixture_dir():
40
39
41
40
42
41
def simulate_contig (* , low , high , base_composition ):
43
- size = rng .integers (low = low , high = high )
42
+ size = int ( rng .integers (low = low , high = high ) )
44
43
bases = np .array ([b"a" , b"c" , b"g" , b"t" , b"n" , b"A" , b"C" , b"G" , b"T" , b"N" ])
45
44
p = np .array ([base_composition [b ] for b in bases ])
46
45
seq = rng .choice (bases , size = size , replace = True , p = p )
@@ -151,9 +150,9 @@ def simulate_genes(self, *, contig, contig_size):
151
150
# Simulate genes.
152
151
for gene_ix in range (self .max_genes ):
153
152
gene_id = f"gene-{ contig } -{ gene_ix } "
154
- strand = choice (["+" , "-" ])
155
- inter_size = randint ( self .inter_size_low , self .inter_size_high )
156
- gene_size = randint ( self .gene_size_low , self .gene_size_high )
153
+ strand = rng . choice (["+" , "-" ])
154
+ inter_size = int ( rng . integers ( self .inter_size_low , self .inter_size_high ) )
155
+ gene_size = int ( rng . integers ( self .gene_size_low , self .gene_size_high ) )
157
156
if strand == "+" :
158
157
gene_start = cur_fwd + inter_size
159
158
else :
@@ -166,7 +165,11 @@ def simulate_genes(self, *, contig, contig_size):
166
165
gene_attrs = f"ID={ gene_id } "
167
166
for attr in self .attrs :
168
167
random_str = "" .join (
169
- choices (string .ascii_uppercase + string .digits , k = 5 )
168
+ rng .choice (
169
+ list (string .ascii_uppercase + string .digits ),
170
+ size = 5 ,
171
+ replace = True ,
172
+ )
170
173
)
171
174
gene_attrs += f";{ attr } ={ random_str } "
172
175
gene = (
@@ -212,7 +215,7 @@ def simulate_transcripts(
212
215
# accurate in real data.
213
216
214
217
for transcript_ix in range (
215
- randint ( self .n_transcripts_low , self .n_transcripts_high )
218
+ int ( rng . integers ( self .n_transcripts_low , self .n_transcripts_high ) )
216
219
):
217
220
transcript_id = f"transcript-{ contig } -{ gene_ix } -{ transcript_ix } "
218
221
transcript_start = gene_start
@@ -260,13 +263,16 @@ def simulate_exons(
260
263
transcript_size = transcript_end - transcript_start
261
264
exons = []
262
265
exon_end = transcript_start
263
- n_exons = randint ( self .n_exons_low , self .n_exons_high )
266
+ n_exons = int ( rng . integers ( self .n_exons_low , self .n_exons_high ) )
264
267
for exon_ix in range (n_exons ):
265
268
exon_id = f"exon-{ contig } -{ gene_ix } -{ transcript_ix } -{ exon_ix } "
266
269
if exon_ix > 0 :
267
270
# Insert an intron between this exon and the previous one.
268
- intron_size = randint (
269
- self .intron_size_low , min (transcript_size , self .intron_size_high )
271
+ intron_size = int (
272
+ rng .integers (
273
+ self .intron_size_low ,
274
+ min (transcript_size , self .intron_size_high ),
275
+ )
270
276
)
271
277
exon_start = exon_end + intron_size
272
278
if exon_start >= transcript_end :
@@ -275,7 +281,7 @@ def simulate_exons(
275
281
else :
276
282
# First exon, assume exon starts where the transcript starts.
277
283
exon_start = transcript_start
278
- exon_size = randint ( self .exon_size_low , self .exon_size_high )
284
+ exon_size = int ( rng . integers ( self .exon_size_low , self .exon_size_high ) )
279
285
exon_end = min (exon_start + exon_size , transcript_end )
280
286
assert exon_end > exon_start
281
287
exon = (
@@ -311,7 +317,7 @@ def simulate_exons(
311
317
else :
312
318
feature_type = self .cds_type
313
319
# Cheat a little, random phase.
314
- phase = choice ([1 , 2 , 3 ])
320
+ phase = rng . choice ([1 , 2 , 3 ])
315
321
feature = (
316
322
contig ,
317
323
self .source ,
@@ -549,7 +555,7 @@ def simulate_aim_variants(path, contigs, snp_sites, n_sites_low, n_sites_high):
549
555
# Simulate AIM positions variable.
550
556
snp_pos = snp_sites [f"{ contig } /variants/POS" ][:]
551
557
loc_aim_sites = rng .choice (
552
- snp_pos .shape [0 ], size = rng .integers (n_sites_low , n_sites_high )
558
+ snp_pos .shape [0 ], size = int ( rng .integers (n_sites_low , n_sites_high ) )
553
559
)
554
560
loc_aim_sites .sort ()
555
561
aim_pos = snp_pos [loc_aim_sites ]
@@ -731,11 +737,10 @@ def simulate_cnv_coverage_calls(zarr_path, metadata_path, contigs, contig_sizes)
731
737
contig_length_bp = contig_sizes [contig ]
732
738
733
739
# Get a random number of CNV alleles ("variants") to simulate.
734
- n_cnv_alleles = rng .integers (1 , 5_000 )
740
+ n_cnv_alleles = int ( rng .integers (1 , 5_000 ) )
735
741
736
742
# Produce a set of random start positions for each allele as a sorted list.
737
743
allele_start_pos = sorted (rng .integers (1 , contig_length_bp , size = n_cnv_alleles ))
738
-
739
744
# Produce a set of random allele lengths for each allele, according to a range.
740
745
allele_length_bp_min = 100
741
746
allele_length_bp_max = 100_000
@@ -874,7 +879,7 @@ def simulate_cnv_discordant_read_calls(zarr_path, metadata_path, contigs, contig
874
879
contig_length_bp = contig_sizes [contig ]
875
880
876
881
# Get a random number of CNV variants to simulate.
877
- n_cnv_variants = rng .integers (1 , 100 )
882
+ n_cnv_variants = int ( rng .integers (1 , 100 ) )
878
883
879
884
# Produce a set of random start positions for each variant as a sorted list.
880
885
variant_start_pos = sorted (
@@ -1010,28 +1015,28 @@ def contigs(self) -> Tuple[str, ...]:
1010
1015
return tuple (self .config ["CONTIGS" ])
1011
1016
1012
1017
def random_contig (self ):
1013
- return choice (self .contigs )
1018
+ return rng . choice (self .contigs )
1014
1019
1015
1020
def random_transcript_id (self ):
1016
1021
df_transcripts = self .genome_features .query ("type == 'mRNA'" )
1017
1022
transcript_ids = [
1018
1023
gff3_parse_attributes (t )["ID" ] for t in df_transcripts .loc [:, "attributes" ]
1019
1024
]
1020
- transcript_id = choice (transcript_ids )
1025
+ transcript_id = rng . choice (transcript_ids )
1021
1026
return transcript_id
1022
1027
1023
1028
def random_region_str (self , region_size = None ):
1024
1029
contig = self .random_contig ()
1025
1030
contig_size = self .contig_sizes [contig ]
1026
- region_start = randint ( 1 , contig_size )
1031
+ region_start = int ( rng . integers ( 1 , contig_size ) )
1027
1032
if region_size :
1028
1033
# Ensure the region span doesn't exceed the contig size.
1029
1034
if contig_size - region_start < region_size :
1030
1035
region_start = contig_size - region_size
1031
1036
1032
1037
region_end = region_start + region_size
1033
1038
else :
1034
- region_end = randint ( region_start , contig_size )
1039
+ region_end = int ( rng . integers ( region_start , contig_size ) )
1035
1040
region = f"{ contig } :{ region_start :,} -{ region_end :,} "
1036
1041
return region
1037
1042
@@ -1133,7 +1138,7 @@ def init_public_release_manifest(self):
1133
1138
manifest = pd .DataFrame (
1134
1139
{
1135
1140
"sample_set" : ["AG1000G-AO" , "AG1000G-BF-A" ],
1136
- "sample_count" : [randint ( 10 , 50 ), randint ( 10 , 40 )],
1141
+ "sample_count" : [int ( rng . integers ( 10 , 50 )), int ( rng . integers ( 10 , 40 ) )],
1137
1142
"study_id" : ["AG1000G-AO" , "AG1000G-BF-1" ],
1138
1143
"study_url" : [
1139
1144
"https://www.malariagen.net/network/where-we-work/AG1000G-AO" ,
@@ -1165,7 +1170,7 @@ def init_pre_release_manifest(self):
1165
1170
"1177-VO-ML-LEHMANN-VMF00004" ,
1166
1171
],
1167
1172
# Make sure we have some gambiae, coluzzii and arabiensis.
1168
- "sample_count" : [randint ( 20 , 60 )],
1173
+ "sample_count" : [int ( rng . integers ( 20 , 60 ) )],
1169
1174
"study_id" : ["1177-VO-ML-LEHMANN" ],
1170
1175
"study_url" : [
1171
1176
"https://www.malariagen.net/network/where-we-work/1177-VO-ML-LEHMANN"
0 commit comments