Skip to content

Commit 135cdb1

Browse files
authored
Allele format (#192)
1 parent 63d995d commit 135cdb1

File tree

95 files changed

+7814
-7814
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

95 files changed

+7814
-7814
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ This project adheres to [Semantic Versioning](http://semver.org/).
77

88
### Changed
99
- Intermediate FASTQ files are now bgzip compressed to reduce storage requirements (#189).
10+
- Colons are now used instead of commas to separate the SNP alleles within a microhap allele, e.g. `T:G:G:C` instead of `T,G,G,C` (#192).
1011

1112
### Fixed
1213
- Bug with handling marker vs. locus identifiers when running `mhpl8r seq` (#190).

microhapulator/api.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -504,7 +504,7 @@ def sim(frequencies, seed=None):
504504
for haploindex in range(2):
505505
for marker in markers:
506506
haplofreqs = frequencies[frequencies.Marker == marker]
507-
haplotypes = list(haplofreqs.Haplotype)
507+
haplotypes = list(haplofreqs.Allele)
508508
freqs = list(haplofreqs.Frequency)
509509
freqs = [x / sum(freqs) for x in freqs]
510510
sampled_haplotype = np.random.choice(haplotypes, p=freqs)
@@ -561,7 +561,7 @@ def tally_haplotypes(bam, mhindex, minbasequal=10, max_depth=1e6):
561561
if len(htlist) < len(marker.offsets_locus):
562562
discarded += 1
563563
continue
564-
htstr = ",".join(htlist)
564+
htstr = ":".join(htlist)
565565
haplotypes[htstr] += 1
566566
yield marker.id, cov_pos, haplotypes, discarded
567567
totaldiscarded += discarded

microhapulator/cli/sim.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,9 +59,9 @@ def main(args):
5959
profile = mhapi.sim(frequencies, seed=args.seed)
6060
with mhopen(args.out, "w") as fh:
6161
profile.dump(fh)
62-
if hasattr(fh, "name"):
63-
message = "profile JSON written to {:s}".format(fh.name)
64-
print("[MicroHapulator::sim]", message, file=sys.stderr)
62+
fh_name = fh.name if hasattr(fh, "name") else "stdout"
63+
message = "profile JSON written to {:s}".format(fh_name)
64+
print("[MicroHapulator::sim]", message, file=sys.stderr)
6565
if args.haplo_seq:
6666
index = MicrohapIndex.from_files(args.markers, fasta_path=args.sequences)
6767
index.validate()

microhapulator/load.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ def open(filename, mode):
3535

3636
def load_marker_frequencies(tsvfile):
3737
frequencies = pd.read_csv(tsvfile, sep="\t")
38-
missing = set(["Marker", "Haplotype", "Frequency"]) - set(frequencies.columns)
38+
missing = set(["Marker", "Allele", "Frequency"]) - set(frequencies.columns)
3939
if len(missing) > 0:
4040
message = "column(s) missing from marker frequency file: " + ", ".join(sorted(missing))
4141
raise ValueError(message)

microhapulator/profile.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ def rand_match_prob(self, freqs, minfreq=0.01):
110110
nhaps = len(haplotypes)
111111
msg = f"cannot compute random match prob. for marker with {nhaps} haplotypes"
112112
raise RandomMatchError(msg)
113-
result = freqs[(freqs.Marker == marker) & (freqs.Haplotype.isin(haplotypes))]
113+
result = freqs[(freqs.Marker == marker) & (freqs.Allele.isin(haplotypes))]
114114
if len(haplotypes) == 1:
115115
p = minfreq
116116
if len(result) == 1:
@@ -186,7 +186,7 @@ def bedstream(self, mhindex):
186186
variants = [list() for _ in range(len(offsets))]
187187
for i in sorted(self.haploindexes()):
188188
haplotype = self.haplotypes(markerid, index=i).pop()
189-
for snp, allelelist in zip(haplotype.split(","), variants):
189+
for snp, allelelist in zip(haplotype.split(":"), variants):
190190
allelelist.append(snp)
191191
for offset, snps in zip(offsets, variants):
192192
haplostr = "|".join(snps)
@@ -273,7 +273,7 @@ def populate_from_bed(bedfile):
273273
profile = SimulatedProfile(ploidy=ploidy)
274274
for marker, allele_list in marker_alleles.items():
275275
for i, haplotype in enumerate(allele_list):
276-
profile.add(i, marker, ",".join(haplotype))
276+
profile.add(i, marker, ":".join(haplotype))
277277
return profile
278278

279279
def merge(profiles):
Lines changed: 77 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -1,117 +1,117 @@
11
MHDBL000002
2-
>>> T,G,G,C
3-
<<< T,G,C,C
2+
>>> T:G:G:C
3+
<<< T:G:C:C
44
MHDBL000003
5-
<<< C,A,G,G
5+
<<< C:A:G:G
66
MHDBL000007
7-
>>> T,T,G,G
7+
>>> T:T:G:G
88
MHDBL000013
9-
>>> A,G,C,C
9+
>>> A:G:C:C
1010
MHDBL000017
11-
>>> A,A,G,C,T
12-
>>> T,A,A,T,T
13-
<<< T,G,G,C,C
11+
>>> A:A:G:C:T
12+
>>> T:A:A:T:T
13+
<<< T:G:G:C:C
1414
MHDBL000018
15-
>>> C,A,C,C,G
16-
<<< C,A,C,T,G
17-
<<< T,A,T,T,G
15+
>>> C:A:C:C:G
16+
<<< C:A:C:T:G
17+
<<< T:A:T:T:G
1818
MHDBL000030
19-
>>> A,C,C,C
20-
<<< A,A,T,C
19+
>>> A:C:C:C
20+
<<< A:A:T:C
2121
MHDBL000036
22-
>>> A,C,G
23-
>>> G,C,G
24-
<<< G,T,G
22+
>>> A:C:G
23+
>>> G:C:G
24+
<<< G:T:G
2525
MHDBL000038
26-
>>> T,A,A,T
26+
>>> T:A:A:T
2727
MHDBL000047
28-
<<< T,T
28+
<<< T:T
2929
MHDBL000058
30-
>>> A,G,C,G
31-
>>> A,G,T,G
32-
<<< A,A,C,G
33-
<<< G,G,C,G
30+
>>> A:G:C:G
31+
>>> A:G:T:G
32+
<<< A:A:C:G
33+
<<< G:G:C:G
3434
MHDBL000061
35-
<<< G,A,A
35+
<<< G:A:A
3636
MHDBL000076
37-
>>> G,T
37+
>>> G:T
3838
MHDBL000079
39-
>>> C,T
40-
<<< A,G
39+
>>> C:T
40+
<<< A:G
4141
MHDBL000082
42-
>>> A,C,T,T
43-
<<< G,C,T,T
44-
<<< G,T,A,T
42+
>>> A:C:T:T
43+
<<< G:C:T:T
44+
<<< G:T:A:T
4545
MHDBL000085
46-
>>> A,C,T,G
47-
>>> G,A,C,A
48-
<<< G,A,T,G
49-
<<< G,C,T,G
46+
>>> A:C:T:G
47+
>>> G:A:C:A
48+
<<< G:A:T:G
49+
<<< G:C:T:G
5050
MHDBL000088
51-
<<< C,T
51+
<<< C:T
5252
MHDBL000101
53-
>>> C,C,C,T
54-
<<< T,C,C,C
53+
>>> C:C:C:T
54+
<<< T:C:C:C
5555
MHDBL000106
56-
>>> C,G,T,G
57-
<<< A,G,C,G
56+
>>> C:G:T:G
57+
<<< A:G:C:G
5858
MHDBL000108
59-
<<< A,T,G,A
59+
<<< A:T:G:A
6060
MHDBL000111
61-
>>> G,C,A,A,G
62-
<<< A,C,A,A,A
61+
>>> G:C:A:A:G
62+
<<< A:C:A:A:A
6363
MHDBL000112
64-
>>> G,G,A,C
64+
>>> G:G:A:C
6565
MHDBL000122
66-
>>> T,G,C
67-
<<< G,A,C
66+
>>> T:G:C
67+
<<< G:A:C
6868
MHDBL000124
69-
<<< G,A,A
69+
<<< G:A:A
7070
MHDBL000128
71-
>>> A,T,C,G
72-
<<< T,T,T,G
71+
>>> A:T:C:G
72+
<<< T:T:T:G
7373
MHDBL000129
74-
>>> G,T,C
75-
<<< G,T,A
74+
>>> G:T:C
75+
<<< G:T:A
7676
MHDBL000135
77-
>>> G,C,C
78-
>>> G,T,T
79-
<<< A,T,C
77+
>>> G:C:C
78+
>>> G:T:T
79+
<<< A:T:C
8080
MHDBL000136
81-
<<< A,C,G,C
81+
<<< A:C:G:C
8282
MHDBL000138
83-
>>> A,A,C,A
84-
<<< G,A,C,G
85-
<<< G,G,C,G
83+
>>> A:A:C:A
84+
<<< G:A:C:G
85+
<<< G:G:C:G
8686
MHDBL000140
87-
>>> C,C,A,A
88-
<<< C,C,T,A
87+
>>> C:C:A:A
88+
<<< C:C:T:A
8989
MHDBL000144
90-
>>> A,G
91-
<<< G,G
90+
>>> A:G
91+
<<< G:G
9292
MHDBL000152
93-
<<< A,G
93+
<<< A:G
9494
MHDBL000154
95-
>>> C,C
96-
<<< T,T
95+
>>> C:C
96+
<<< T:T
9797
MHDBL000163
98-
>>> A,A,G,A,T
99-
>>> C,A,G,A,T
100-
<<< C,G,A,A,T
101-
<<< C,G,G,G,T
98+
>>> A:A:G:A:T
99+
>>> C:A:G:A:T
100+
<<< C:G:A:A:T
101+
<<< C:G:G:G:T
102102
MHDBL000181
103-
<<< C,C
103+
<<< C:C
104104
MHDBL000183
105-
>>> C,G
106-
>>> T,G
107-
<<< C,A
108-
<<< T,A
105+
>>> C:G
106+
>>> T:G
107+
<<< C:A
108+
<<< T:A
109109
MHDBL000194
110-
<<< G,C
110+
<<< G:C
111111
MHDBL000210
112-
>>> A,T,T,G
112+
>>> A:T:T:G
113113
MHDBL000211
114-
>>> T,A,A
115-
<<< T,G,A
114+
>>> T:A:A
115+
<<< T:G:A
116116
MHDBL000212
117-
>>> G,C,C,C,T
117+
>>> G:C:C:C:T
Lines changed: 62 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -1,62 +1,62 @@
1-
Marker Haplotype Frequency
2-
mh07USC-7qB C,A,A 0.448
3-
mh07USC-7qB C,A,G 0.292
4-
mh07USC-7qB C,C,A 0.135
5-
mh07USC-7qB T,C,A 0.125
6-
mh08USC-8qC A,A,A,C 0.13
7-
mh08USC-8qC C,A,A,C 0.297
8-
mh08USC-8qC C,A,G,C 0.411
9-
mh08USC-8qC C,C,G,A 0.042
10-
mh08USC-8qC C,C,G,C 0.12
11-
mh09USC-9qB C,A,T,T 0.104
12-
mh09USC-9qB C,G,C,T 0.052
13-
mh09USC-9qB C,G,T,C 0.005
14-
mh09USC-9qB C,G,T,T 0.359
15-
mh09USC-9qB T,G,C,C 0.104
16-
mh09USC-9qB T,G,C,T 0.208
17-
mh09USC-9qB T,G,T,T 0.167
18-
mh10USC-10pA A,G,G 0.177
19-
mh10USC-10pA T,A,G 0.323
20-
mh10USC-10pA T,G,A 0.104
21-
mh10USC-10pA T,G,G 0.396
22-
mh11USC-11qA C,C 0.297
23-
mh11USC-11qA C,T 0.323
24-
mh11USC-11qA G,C 0.198
25-
mh11USC-11qA T,C 0.182
26-
mh12USC-12qC A,C,A,C 0.172
27-
mh12USC-12qC A,C,A,T 0.104
28-
mh12USC-12qC A,T,A,C 0.354
29-
mh12USC-12qC A,T,A,T 0.182
30-
mh12USC-12qC G,C,A,C 0.042
31-
mh12USC-12qC G,C,G,C 0.146
32-
mh13USC-13qC C,C,T,C 0.115
33-
mh13USC-13qC C,G,T,C 0.495
34-
mh13USC-13qC T,G,G,C 0.177
35-
mh13USC-13qC T,G,T,C 0.047
36-
mh13USC-13qC T,G,T,G 0.167
37-
mh14USC-14qA G,C,C 0.005
38-
mh14USC-14qA G,C,G 0.141
39-
mh14USC-14qA G,T,G 0.146
40-
mh14USC-14qA T,C,C 0.208
41-
mh14USC-14qA T,C,G 0.49
42-
mh14USC-14qA T,T,G 0.01
43-
mh14USC-14qD G,A,G 0.182
44-
mh14USC-14qD G,A,T 0.172
45-
mh14USC-14qD G,T,G 0.391
46-
mh14USC-14qD T,A,T 0.25
47-
mh14USC-14qD T,T,T 0.005
48-
mh15USC-15qA A,C,A,A 0.318
49-
mh15USC-15qA A,T,A,A 0.036
50-
mh15USC-15qA A,T,A,G 0.193
51-
mh15USC-15qA A,T,C,A 0.292
52-
mh15USC-15qA G,T,A,A 0.161
53-
mh18USC-18pA C,A,A,T,G 0.219
54-
mh18USC-18pA C,C,A,G,A 0.031
55-
mh18USC-18pA C,C,A,G,G 0.214
56-
mh18USC-18pA T,C,A,T,A 0.057
57-
mh18USC-18pA T,C,A,T,G 0.318
58-
mh18USC-18pA T,C,G,T,G 0.161
59-
mh21USC-21qB C,A,C 0.333
60-
mh21USC-21qB T,A,C 0.203
61-
mh21USC-21qB T,A,T 0.281
62-
mh21USC-21qB T,G,C 0.182
1+
Marker Allele Frequency
2+
mh07USC-7qB C:A:A 0.448
3+
mh07USC-7qB C:A:G 0.292
4+
mh07USC-7qB C:C:A 0.135
5+
mh07USC-7qB T:C:A 0.125
6+
mh08USC-8qC A:A:A:C 0.13
7+
mh08USC-8qC C:A:A:C 0.297
8+
mh08USC-8qC C:A:G:C 0.411
9+
mh08USC-8qC C:C:G:A 0.042
10+
mh08USC-8qC C:C:G:C 0.12
11+
mh09USC-9qB C:A:T:T 0.104
12+
mh09USC-9qB C:G:C:T 0.052
13+
mh09USC-9qB C:G:T:C 0.005
14+
mh09USC-9qB C:G:T:T 0.359
15+
mh09USC-9qB T:G:C:C 0.104
16+
mh09USC-9qB T:G:C:T 0.208
17+
mh09USC-9qB T:G:T:T 0.167
18+
mh10USC-10pA A:G:G 0.177
19+
mh10USC-10pA T:A:G 0.323
20+
mh10USC-10pA T:G:A 0.104
21+
mh10USC-10pA T:G:G 0.396
22+
mh11USC-11qA C:C 0.297
23+
mh11USC-11qA C:T 0.323
24+
mh11USC-11qA G:C 0.198
25+
mh11USC-11qA T:C 0.182
26+
mh12USC-12qC A:C:A:C 0.172
27+
mh12USC-12qC A:C:A:T 0.104
28+
mh12USC-12qC A:T:A:C 0.354
29+
mh12USC-12qC A:T:A:T 0.182
30+
mh12USC-12qC G:C:A:C 0.042
31+
mh12USC-12qC G:C:G:C 0.146
32+
mh13USC-13qC C:C:T:C 0.115
33+
mh13USC-13qC C:G:T:C 0.495
34+
mh13USC-13qC T:G:G:C 0.177
35+
mh13USC-13qC T:G:T:C 0.047
36+
mh13USC-13qC T:G:T:G 0.167
37+
mh14USC-14qA G:C:C 0.005
38+
mh14USC-14qA G:C:G 0.141
39+
mh14USC-14qA G:T:G 0.146
40+
mh14USC-14qA T:C:C 0.208
41+
mh14USC-14qA T:C:G 0.49
42+
mh14USC-14qA T:T:G 0.01
43+
mh14USC-14qD G:A:G 0.182
44+
mh14USC-14qD G:A:T 0.172
45+
mh14USC-14qD G:T:G 0.391
46+
mh14USC-14qD T:A:T 0.25
47+
mh14USC-14qD T:T:T 0.005
48+
mh15USC-15qA A:C:A:A 0.318
49+
mh15USC-15qA A:T:A:A 0.036
50+
mh15USC-15qA A:T:A:G 0.193
51+
mh15USC-15qA A:T:C:A 0.292
52+
mh15USC-15qA G:T:A:A 0.161
53+
mh18USC-18pA C:A:A:T:G 0.219
54+
mh18USC-18pA C:C:A:G:A 0.031
55+
mh18USC-18pA C:C:A:G:G 0.214
56+
mh18USC-18pA T:C:A:T:A 0.057
57+
mh18USC-18pA T:C:A:T:G 0.318
58+
mh18USC-18pA T:C:G:T:G 0.161
59+
mh21USC-21qB C:A:C 0.333
60+
mh21USC-21qB T:A:C 0.203
61+
mh21USC-21qB T:A:T 0.281
62+
mh21USC-21qB T:G:C 0.182

0 commit comments

Comments
 (0)