Skip to content

Commit 1d1e204

Browse files
authored
Fix bug with identifier validation in mhpl8r seq (#190)
1 parent 149cbac commit 1d1e204

File tree

11 files changed

+273
-4
lines changed

11 files changed

+273
-4
lines changed

.github/workflows/cibuild.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ jobs:
88
strategy:
99
max-parallel: 4
1010
matrix:
11-
python-version: ["3.8", "3.9", "3.10", "3.11"]
11+
python-version: ["3.9", "3.10", "3.11"]
1212
steps:
1313
- uses: actions/checkout@v1
1414
- name: Set up Python ${{ matrix.python-version }}

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@ This project adheres to [Semantic Versioning](http://semver.org/).
88
### Changed
99
- Intermediate FASTQ files are now bgzip compressed to reduce storage requirements (#189).
1010

11+
### Fixed
12+
- Bug with handling marker vs. locus identifiers when running `mhpl8r seq` (#190).
13+
1114

1215
## [0.8.4] 2024-09-16
1316

microhapulator/api.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -449,6 +449,7 @@ def seq(
449449
:param str sig: an optional signature (prefix) to apply to all simulated reads
450450
:yields: for each read pair, a tuple (I, X, Y) where I is a serial index, X is the forward read (R1), and Y is the reverse read (R2)
451451
"""
452+
check_profiles_for_multiple_alleles(profiles)
452453
n = len(profiles)
453454
if seeds is None:
454455
seeds = [np.random.randint(1, 2**32 - 1) for _ in range(n)]
@@ -476,6 +477,13 @@ def seq(
476477
reads_sequenced = data[0]
477478

478479

480+
def check_profiles_for_multiple_alleles(profiles):
481+
for profile in profiles:
482+
if len(profile.loci()) < len(profile.markers()):
483+
msg = "simulating sequencing of profiles with multiple allele definitions at a locus is not permitted"
484+
raise ValueError(msg)
485+
486+
479487
def sim(frequencies, seed=None):
480488
"""Simulate a diploid genotype from the specified microhaplotype frequencies
481489
@@ -484,6 +492,7 @@ def sim(frequencies, seed=None):
484492
:returns: a simulated genotype profile for all markers specified in the haplotype frequencies
485493
:rtype: microhapulator.profile.SimulatedProfile
486494
"""
495+
check_frequencies_for_multiple_alleles(frequencies)
487496
profile = SimulatedProfile(ploidy=2)
488497
if seed is None:
489498
seed = np.random.randint(2**32 - 1)
@@ -505,6 +514,14 @@ def sim(frequencies, seed=None):
505514
return profile
506515

507516

517+
def check_frequencies_for_multiple_alleles(frequencies):
518+
marker_ids = set(frequencies.Marker)
519+
locus_ids = set(frequencies.Marker.apply(lambda m: m.split(".")[0]))
520+
if len(locus_ids) < len(marker_ids):
521+
msg = "simulating profiles with multiple allele definitions at a locus is not permitted"
522+
raise ValueError(msg)
523+
524+
508525
def check_index(bamfile):
509526
index1 = bamfile + ".bai"
510527
index2 = re.sub(r".bam$", ".bai", bamfile)

microhapulator/profile.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,9 @@ def haploindexes(self):
7979
def markers(self):
8080
return set(self.data["markers"])
8181

82+
def loci(self):
83+
return set(m.split(".")[0] for m in self.markers())
84+
8285
def haplotypes(self, markerid, index=None):
8386
if markerid not in self.data["markers"]:
8487
return set()
@@ -176,7 +179,7 @@ def __str__(self):
176179
return json.dumps(self.data, indent=4, sort_keys=True)
177180

178181
def bedstream(self, mhindex):
179-
mhindex.validate(refrids=self.markers(), symmetric=True)
182+
mhindex.validate(refrids=self.loci(), symmetric=True)
180183
for markerid in sorted(self.markers()):
181184
marker = mhindex.markers[markerid]
182185
offsets = marker.offsets_locus
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
Marker Offset Chrom OffsetHg38
2+
mh03USC-3qC.v2 47 chr3 196652865
3+
mh03USC-3qC.v2 153 chr3 196652971
4+
mh03USC-3qC.v2 207 chr3 196653025
5+
mh03USC-3qC.v2 216 chr3 196653034
6+
mh03USC-3qC.v2 226 chr3 196653044
7+
mh03USC-3qC.v2 266 chr3 196653084
8+
mh03USC-3qC.v2 303 chr3 196653121
9+
mh04WL-052.v1 54 chr4 2303788
10+
mh04WL-052.v1 61 chr4 2303795
11+
mh04WL-052.v1 99 chr4 2303833
12+
mh04WL-052.v1 105 chr4 2303839
13+
mh04WL-052.v1 172 chr4 2303906
14+
mh04WL-052.v1 185 chr4 2303919
15+
mh04WL-052.v1 210 chr4 2303944
16+
mh04WL-052.v1 272 chr4 2304006
17+
mh04WL-052.v1 296 chr4 2304030
18+
mh06SCUZJ-0528857 92 chr6 73429488
19+
mh06SCUZJ-0528857 157 chr6 73429553
20+
mh06SCUZJ-0528857 167 chr6 73429563
21+
mh06SCUZJ-0528857 200 chr6 73429596
22+
mh06SCUZJ-0528857 226 chr6 73429622
23+
mh06SCUZJ-0528857 233 chr6 73429629
24+
mh06SCUZJ-0528857 258 chr6 73429654
25+
mh17FHL-005.v3 51 chr17 78268164
26+
mh17FHL-005.v3 92 chr17 78268205
27+
mh17FHL-005.v3 200 chr17 78268313
28+
mh17FHL-005.v3 235 chr17 78268348
29+
mh17FHL-005.v3 265 chr17 78268378
30+
mh17FHL-005.v3 299 chr17 78268412
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
{
2+
"markers": {
3+
"mh13KK-218.v1": {
4+
"genotype": [
5+
{
6+
"haplotype": "C,T,C,C",
7+
"index": 0
8+
},
9+
{
10+
"haplotype": "C,T,C,T",
11+
"index": 1
12+
}
13+
]
14+
},
15+
"mh13KK-218.v6": {
16+
"genotype": [
17+
{
18+
"haplotype": "G,C,C,C,T",
19+
"index": 0
20+
},
21+
{
22+
"haplotype": "G,T,T,T,T",
23+
"index": 1
24+
}
25+
]
26+
},
27+
"mh21KK-320.v1": {
28+
"genotype": [
29+
{
30+
"haplotype": "G,G,C,G",
31+
"index": 0
32+
},
33+
{
34+
"haplotype": "A,A,C,A",
35+
"index": 1
36+
}
37+
]
38+
},
39+
"mh21KK-320.v5": {
40+
"genotype": [
41+
{
42+
"haplotype": "A,A,C,A,A",
43+
"index": 0
44+
},
45+
{
46+
"haplotype": "A,A,C,A,A",
47+
"index": 1
48+
}
49+
]
50+
}
51+
},
52+
"metadata": {
53+
"HaploSeed": 3674659513
54+
},
55+
"ploidy": 2,
56+
"type": "SimulatedProfile",
57+
"version": "0.8.4+4.g45db76e.dirty"
58+
}
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
Marker Population Haplotype Frequency Count Source
2+
mh13KK-218.v1 ITU C,C,C,C 0.01442 208 Byrska-Bishop2022
3+
mh13KK-218.v1 ITU C,C,C,T 0.14904 208 Byrska-Bishop2022
4+
mh13KK-218.v1 ITU C,C,T,T 0.00481 208 Byrska-Bishop2022
5+
mh13KK-218.v1 ITU C,T,C,C 0.07212 208 Byrska-Bishop2022
6+
mh13KK-218.v1 ITU C,T,C,T 0.07692 208 Byrska-Bishop2022
7+
mh13KK-218.v1 ITU C,T,T,C 0.07692 208 Byrska-Bishop2022
8+
mh13KK-218.v1 ITU C,T,T,T 0.03846 208 Byrska-Bishop2022
9+
mh13KK-218.v1 ITU T,C,C,C 0.02404 208 Byrska-Bishop2022
10+
mh13KK-218.v1 ITU T,C,C,T 0.04327 208 Byrska-Bishop2022
11+
mh13KK-218.v1 ITU T,C,T,C 0.00481 208 Byrska-Bishop2022
12+
mh13KK-218.v1 ITU T,C,T,T 0.01923 208 Byrska-Bishop2022
13+
mh13KK-218.v1 ITU T,T,C,C 0.05769 208 Byrska-Bishop2022
14+
mh13KK-218.v1 ITU T,T,C,T 0.13942 208 Byrska-Bishop2022
15+
mh13KK-218.v1 ITU T,T,T,C 0.06731 208 Byrska-Bishop2022
16+
mh13KK-218.v1 ITU T,T,T,T 0.21154 208 Byrska-Bishop2022
17+
mh13KK-218.v6 ITU G,C,C,C,C 0.01442 208 Byrska-Bishop2022
18+
mh13KK-218.v6 ITU G,C,C,C,T 0.14904 208 Byrska-Bishop2022
19+
mh13KK-218.v6 ITU G,C,C,T,T 0.00481 208 Byrska-Bishop2022
20+
mh13KK-218.v6 ITU G,C,T,C,C 0.07212 208 Byrska-Bishop2022
21+
mh13KK-218.v6 ITU G,C,T,C,T 0.07692 208 Byrska-Bishop2022
22+
mh13KK-218.v6 ITU G,C,T,T,C 0.07692 208 Byrska-Bishop2022
23+
mh13KK-218.v6 ITU G,C,T,T,T 0.03846 208 Byrska-Bishop2022
24+
mh13KK-218.v6 ITU G,T,C,C,C 0.02404 208 Byrska-Bishop2022
25+
mh13KK-218.v6 ITU G,T,C,C,T 0.04327 208 Byrska-Bishop2022
26+
mh13KK-218.v6 ITU G,T,C,T,C 0.00481 208 Byrska-Bishop2022
27+
mh13KK-218.v6 ITU G,T,C,T,T 0.01923 208 Byrska-Bishop2022
28+
mh13KK-218.v6 ITU G,T,T,C,C 0.05769 208 Byrska-Bishop2022
29+
mh13KK-218.v6 ITU G,T,T,C,T 0.13942 208 Byrska-Bishop2022
30+
mh13KK-218.v6 ITU G,T,T,T,C 0.06731 208 Byrska-Bishop2022
31+
mh13KK-218.v6 ITU G,T,T,T,T 0.21154 208 Byrska-Bishop2022
32+
mh21KK-320.v1 ITU A,A,C,A 0.26442 208 Byrska-Bishop2022
33+
mh21KK-320.v1 ITU A,A,C,G 0.25 208 Byrska-Bishop2022
34+
mh21KK-320.v1 ITU G,A,C,A 0.16346 208 Byrska-Bishop2022
35+
mh21KK-320.v1 ITU G,A,C,G 0.02885 208 Byrska-Bishop2022
36+
mh21KK-320.v1 ITU G,A,T,A 0.01923 208 Byrska-Bishop2022
37+
mh21KK-320.v1 ITU G,G,C,A 0.17788 208 Byrska-Bishop2022
38+
mh21KK-320.v1 ITU G,G,C,G 0.09615 208 Byrska-Bishop2022
39+
mh21KK-320.v5 ITU A,A,C,A,A 0.26442 208 Byrska-Bishop2022
40+
mh21KK-320.v5 ITU A,A,C,G,A 0.25 208 Byrska-Bishop2022
41+
mh21KK-320.v5 ITU G,A,C,A,A 0.16346 208 Byrska-Bishop2022
42+
mh21KK-320.v5 ITU G,A,C,G,A 0.02885 208 Byrska-Bishop2022
43+
mh21KK-320.v5 ITU G,A,T,A,A 0.01923 208 Byrska-Bishop2022
44+
mh21KK-320.v5 ITU G,G,C,A,A 0.17788 208 Byrska-Bishop2022
45+
mh21KK-320.v5 ITU G,G,C,G,A 0.09615 208 Byrska-Bishop2022
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
{
2+
"markers": {
3+
"mh03USC-3qC.v2": {
4+
"genotype": [
5+
{
6+
"haplotype": "C,T,A,C,C,G,G",
7+
"index": 0
8+
},
9+
{
10+
"haplotype": "G,T,G,T,C,A,G",
11+
"index": 1
12+
}
13+
]
14+
},
15+
"mh04WL-052.v1": {
16+
"genotype": [
17+
{
18+
"haplotype": "A,C,C,G,A,G,T,C,C",
19+
"index": 0
20+
},
21+
{
22+
"haplotype": "G,T,C,A,A,G,C,T,C",
23+
"index": 1
24+
}
25+
]
26+
},
27+
"mh06SCUZJ-0528857": {
28+
"genotype": [
29+
{
30+
"haplotype": "A,A,C,T,G,T,T",
31+
"index": 0
32+
},
33+
{
34+
"haplotype": "G,A,C,T,G,T,C",
35+
"index": 1
36+
}
37+
]
38+
},
39+
"mh17FHL-005.v3": {
40+
"genotype": [
41+
{
42+
"haplotype": "A,G,C,C,T,T",
43+
"index": 0
44+
},
45+
{
46+
"haplotype": "A,G,T,T,T,T",
47+
"index": 1
48+
}
49+
]
50+
}
51+
},
52+
"metadata": {
53+
"HaploSeed": 1278493438
54+
},
55+
"ploidy": 2,
56+
"type": "SimulatedProfile",
57+
"version": "0.8.4+3.g41bc147"
58+
}
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
>mh03USC-3qC GRCh38:chr3:196652818-196653169 mh03USC-3qC.v2=47,153,207,216,226,266,303
2+
CTGGAGTCACAGCAGCAGGTACCACCCCACACCTGACAAGAGGTGGCCAGGGAGGAAGGGAGGGTTCTTACCTCCCCATC
3+
TCCCCTCAGTAAAATTCAGGATGCCCAGTGAAGTTTGAATGTCAGATAAACAATTTGTTAGTATAAGGATGTATCTAGCA
4+
TTGAAATGATGCCTTGTAATTTACTAAATCTGCAACTATGCAGCCTTATTTCATGGCGGGCAGTGGTGGTGATCCCAGGT
5+
TTCAGGGGCGGGGAAGGGTGCTGGGGGGATCCTGAGGTCAGGAACCCGTACACCTCTGCTTCTGCCCTCTCTTCCCTGTG
6+
CCGGCCACAAGGCAATGACTCCTGTGTGGGT
7+
>mh04WL-052 GRCh38:chr4:2303734-2304085 mh04WL-052.v1=54,61,99,105,172,185,210,272,296
8+
CCCCTGGGAATTTGCCCATGGGTGGTCTCAGCACCCCTCATGGCCCCAGTCTCCGCGCCCACGGCCACGGGTCTGCGTCT
9+
CGTTTCAGCCCGGTGTGAGCCAGGCAGCCACAGTGGTCAGCAAAGAGAAGTGGGCAGTGCGCTCGGATGCGGCTCAGGGC
10+
AAGGGGCCCGGCGCTGGCACAGCCCGCTCCCACTGCCCCGCACAGGCACGCGGCTCACACCAGGGGCGGGATGGGCCGGG
11+
GCTGCGGCAGGTGAGCTGAGGGCTTGGCACTGCGCCCATGTCACCTGACAGGCACCCGGAACATCAGAAACAGGGAAGCG
12+
GGCCCGCTGGCGCACATTTTCTGGTGCATGA
13+
>mh06SCUZJ-0528857 GRCh38:chr6:73429396-73429747 mh06SCUZJ-0528857=92,157,167,200,226,233,258
14+
GTTAAATATTTTGTTTCAATAATAATTTGAATCAATTTATTCACTTTTTCTGTACCATCCTTACAGGTTTTATATTAATA
15+
CAAAATGAATTGGGGACCATTTCCTTTATTCTTATTTATGAGAGTTTGTGTAAATATAGAATGATTTCACCTGGGCGCGG
16+
TGGCTCACGCCTGTAATCCCAGCACCTTGGGAGGCCAAGGTGGGCGGATCACAAGGTCGGGAGATCAAGACCATCCTGGC
17+
TAACACAGTGAAACCCTGCCTCTACCAAAAATACAAAAAATTAGCCGGGCGTGGTGGCGGGCGCCTGTAATCCCAGCTAC
18+
TTGGGAGGCTGAGGCAGGAGAATGGCGCGAA
19+
>mh17FHL-005 GRCh38:chr17:78268113-78268464 mh17FHL-005.v3=51,92,200,235,265,299
20+
GGTGCCCAGGTGGGGGGCGGGCGGGTGGTCTTTCTTTTGGGGCAGCATCCCAAAAGAAAGGAAAAACAGCAGGGGGCGTG
21+
GCTCCCCTCACTGCCCCGCAACCATCCCTGCCTGGCCAGGGCCTTTTGGGCAGTGCCACTGGAGATTCCTGGAAGTGCCT
22+
TCGGGCCACACACACACACATCCTCCGGGACCCTCCCAGCCCCCCACTCTGCGCTTCCCGGGTTTGCAGGAGGGCCCCAA
23+
GGCCAGCCGGGAGCCCTGCAGGACCTGGGAGGAACCTGAGCCTGAAGGGTCTAGGAGCCCGCCGTGTCTGCTCTCAAGTG
24+
CCCCGCGTTCCCTTGGCCTGATCCCCGATCC

microhapulator/tests/test_seq.py

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def test_proportions(n, totalreads, prop, result):
3636

3737
def test_proportions_failure_modes():
3838
message = r"mismatch between contributor number and proportions"
39-
with pytest.raises(ValueError, match=message) as ve:
39+
with pytest.raises(ValueError, match=message):
4040
mhapi.calc_n_reads_from_proportions(3, 1000, [0.6, 0.4])
4141

4242

@@ -250,6 +250,30 @@ def test_main_out_three_filenames(capsys):
250250
data_file("prof/orange-sim-profile.json"),
251251
]
252252
with pytest.raises(SystemExit):
253-
args = microhapulator.cli.get_parser().parse_args(arglist)
253+
_ = microhapulator.cli.get_parser().parse_args(arglist)
254254
terminal = capsys.readouterr()
255255
assert "expected 1 or 2 output filenames, got 3" in terminal.err
256+
257+
258+
def test_regression_seq_locus_names_in_refr():
259+
profile = Profile(fromfile=data_file("prof/mwg4-sim.json"))
260+
index = MicrohapIndex.from_files(
261+
data_file("def/mwg4-offsets.tsv"), data_file("refr/mwg4-refr.fasta")
262+
)
263+
sequencer = mhapi.seq([profile], index, totalreads=1000)
264+
for n, read1, read2 in sequencer:
265+
pass
266+
numfragments = n * 2
267+
assert numfragments == pytest.approx(1000, abs=50)
268+
269+
270+
def test_seq_multidef():
271+
profile = Profile(fromfile=data_file("pashtun-sim/tiny-panel-multidef-bogus-profile.json"))
272+
index = MicrohapIndex.from_files(
273+
data_file("pashtun-sim/tiny-panel-multidef.tsv"),
274+
data_file("pashtun-sim/tiny-panel-multidef.fasta"),
275+
)
276+
sequencer = mhapi.seq([profile], index, totalreads=1000)
277+
msg = "simulating sequencing of profiles with multiple allele definitions at a locus is not permitted"
278+
with pytest.raises(ValueError, match=msg):
279+
_ = list(sequencer)

0 commit comments

Comments
 (0)