Skip to content

Commit 12dcb65

Browse files
committed
Add BioStockholm extension
This supports MSAs loaded by BioStockholm.
1 parent d54d832 commit 12dcb65

File tree

6 files changed

+155
-17
lines changed

6 files changed

+155
-17
lines changed

Project.toml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "GPCRAnalysis"
22
uuid = "c1d73f9e-d42a-418a-8d5b-c7b00ec0358f"
3-
authors = ["Tim Holy <[email protected]> and contributors"]
43
version = "0.6.0"
4+
authors = ["Tim Holy <[email protected]> and contributors"]
55

66
[deps]
77
BioStructures = "de9282ab-8554-53be-b2d6-f6c222edabfc"
@@ -27,15 +27,18 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
2727
TravelingSalesmanHeuristics = "8c8f4381-2cdd-507c-846c-be2bcff6f45f"
2828

2929
[weakdeps]
30+
BioStockholm = "eeb925a3-6f9d-43e6-829e-e0ea03b76ecf"
3031
HiGHS = "87dc4568-4c63-4d18-b0c0-bb2238e4078b"
3132
JuMP = "4076af6c-e467-56ae-b986-b466b2749572"
3233
MIToS = "51bafb47-8a16-5ded-8b04-24ef4eede0b5"
3334

3435
[extensions]
36+
GPCRAnalysisBioStockholmExt = "BioStockholm"
3537
GPCRAnalysisJuMPExt = ["JuMP", "HiGHS"]
3638
GPCRAnalysisMIToSExt = "MIToS"
3739

3840
[compat]
41+
BioStockholm = "0.2.1"
3942
BioStructures = "4.2"
4043
ColorTypes = "0.11, 0.12"
4144
Distances = "0.10"
@@ -63,11 +66,12 @@ TravelingSalesmanHeuristics = "0.3"
6366
julia = "1.10"
6467

6568
[extras]
69+
BioStockholm = "eeb925a3-6f9d-43e6-829e-e0ea03b76ecf"
6670
HiGHS = "87dc4568-4c63-4d18-b0c0-bb2238e4078b"
6771
InvertedIndices = "41ab1584-1d38-5bbf-9106-f11c6c58b48f"
6872
JuMP = "4076af6c-e467-56ae-b986-b466b2749572"
6973
MIToS = "51bafb47-8a16-5ded-8b04-24ef4eede0b5"
7074
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
7175

7276
[targets]
73-
test = ["HiGHS", "InvertedIndices", "JuMP", "MIToS", "Test"]
77+
test = ["BioStockholm", "HiGHS", "InvertedIndices", "JuMP", "MIToS", "Test"]

ext/GPCRAnalysisBioStockholmExt.jl

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
module GPCRAnalysisBioStockholmExt
2+
3+
using GPCRAnalysis
4+
using BioStockholm
5+
using BioStockholm: OrderedDict # from OrderedCollections.jl
6+
7+
function conscols(msa::MSA)
8+
ss = msa.GC["SS_cons"]
9+
return findfirst(!=( '.'), ss):findlast(!=( '.'), ss)
10+
end
11+
12+
# Low-level API implementation
13+
# GPCRAnalysis.sequenceindexes(msaseq::AnnotatedAlignedSequence) = getsequencemapping(msaseq)
14+
# GPCRAnalysis.sequenceindexes(msaseq::MSA, i::Int) = getsequencemapping(msaseq, i)
15+
GPCRAnalysis.sequencekeys(msa::MSA) = collect(keys(msa.seq))
16+
GPCRAnalysis.msasequence(msa::MSA, key) = msa.seq[key][conscols(msa)]
17+
function GPCRAnalysis.residuematrix(msa::MSA)
18+
keepcols = conscols(msa)
19+
reduce(vcat, [permutedims(seq[keepcols]) for (_, seq) in msa.seq])
20+
end
21+
GPCRAnalysis.subseqs(msa::MSA{T}, rowmask::AbstractVector{Bool}) where T = MSA{T}(OrderedDict(pr for (pr, keep) in zip(msa.seq, rowmask) if keep), msa.GF, OrderedDict(pr for (pr, keep) in zip(msa.GS, rowmask) if keep), msa.GC, msa.GR)
22+
function GPCRAnalysis.subseqs!(msa::MSA, rowmask::AbstractVector{Bool})
23+
for ((key, _), keep) in zip(msa.seq, rowmask)
24+
if !keep
25+
delete!(msa.seq, key)
26+
delete!(msa.GS, key)
27+
end
28+
end
29+
return msa
30+
end
31+
GPCRAnalysis.columnindexes(msa::BioStockholm.MSA) = conscols(msa)
32+
33+
Base.getindex(msa::MSA, seqname::MSACode) = msa.seq[seqname.name][conscols(msa)]
34+
Base.getindex(msa::MSA, seqname::AccessionCode) = msa[MSACode(msa, seqname)]
35+
36+
37+
function GPCRAnalysis.AccessionCode(msa::MSA, seqname::AbstractString)
38+
AccessionCode(split(msa.GS[seqname]["AC"], '.')[1])
39+
end
40+
GPCRAnalysis.AccessionCode(msa::MSA, seqname::MSACode) = AccessionCode(msa, seqname.name)
41+
GPCRAnalysis.AccessionCode(::MSA, seqname::AccessionCode) = seqname
42+
43+
function GPCRAnalysis.MSACode(msa::MSA, accession::AbstractString)
44+
acs = [split(ac["AC"], '.')[1] for (_, ac) in msa.GS]
45+
i = findfirst(==(accession), acs)
46+
return MSACode(GPCRAnalysis.sequencekeys(msa)[i])
47+
end
48+
GPCRAnalysis.MSACode(msa::MSA, accession::AccessionCode) = MSACode(msa, accession.name)
49+
GPCRAnalysis.MSACode(::MSA, accession::MSACode) = accession
50+
51+
52+
function reduced_alphabet(r::Char)
53+
if r == '-'
54+
return 0
55+
elseif r in ('A','I','L','M','V')
56+
return 1 # hydrophobic
57+
elseif r in ('N','Q','S','T')
58+
return 2 # polar
59+
elseif r in ('R','H','K')
60+
return 3 # charged
61+
elseif r in ('D','E')
62+
return 4 # charged
63+
elseif r in ('F','W','Y')
64+
return 5 # aromatic
65+
end
66+
offset = findfirst(==(r), ('C','G','P'))
67+
offset === nothing && throw(ArgumentError("Unknown residue '$r'"))
68+
return 5 + offset # special or unknown
69+
end
70+
71+
GPCRAnalysis.columnwise_entropy(msa) = columnwise_entropy(reduced_alphabet, msa)
72+
73+
function GPCRAnalysis.percent_similarity(f, msa::MSA)
74+
# This mimics MIToS's implementation
75+
function pctsim(v1, v2)
76+
same = l = 0
77+
for (a, b) in zip(v1, v2)
78+
a == b == 0 && continue # skip gaps
79+
same += a == b
80+
l += 1
81+
end
82+
return 100 * same / l
83+
end
84+
85+
M = f.(GPCRAnalysis.residuematrix(msa))
86+
n = size(M, 1)
87+
S = zeros(Float64, n, n)
88+
for i in 1:n
89+
for j in i:n
90+
S[i, j] = pctsim(M[i, :], M[j, :])
91+
S[j, i] = S[i, j]
92+
end
93+
end
94+
return S
95+
end
96+
GPCRAnalysis.percent_similarity(msa::MSA) = GPCRAnalysis.percent_similarity(reduced_alphabet, msa)
97+
98+
99+
end

ext/GPCRAnalysisMIToSExt.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ using MIToS: MIToS, Pfam, MSA
1111
using MIToS.MSA: AbstractMultipleSequenceAlignment, AnnotatedAlignedSequence, AnnotatedMultipleSequenceAlignment,
1212
ReducedAlphabet, ResidueAlphabet, GAP, XAA
1313
using MIToS.MSA: getsequence, getannotsequence, getsequencemapping, getresidues, three2residue, sequencenames,
14-
filtersequences, filtersequences!, percentsimilarity
14+
filtersequences, filtersequences!, percentsimilarity, getcolumnmapping
1515

1616

1717
# Low-level API implementation
@@ -25,6 +25,7 @@ GPCRAnalysis.residuematrix(msa::AbstractMultipleSequenceAlignment) = getresidues
2525
GPCRAnalysis.subseqs(msa::AbstractMultipleSequenceAlignment, rowmask) = filtersequences(msa, rowmask)
2626
GPCRAnalysis.subseqs!(msa::AbstractMultipleSequenceAlignment, rowmask) = filtersequences!(msa, rowmask)
2727
GPCRAnalysis.percent_similarity(msa::AbstractMultipleSequenceAlignment) = percentsimilarity(msa)
28+
GPCRAnalysis.columnindexes(msa::MSA.AbstractMultipleSequenceAlignment) = getcolumnmapping(msa)
2829

2930
Base.getindex(msa::AbstractMultipleSequenceAlignment, seqname::MSACode) = getsequence(msa, seqname.name)
3031
Base.getindex(msa::AbstractMultipleSequenceAlignment, seqname::AccessionCode) = getsequence(msa, MSACode(msa, seqname).name)

src/msa.jl

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,13 @@ The two-argument form retrieves the sequenceindexes for the `i`th sequence in `m
1414
"""
1515
function sequenceindexes end
1616

17+
"""
18+
idxs = columnindexes(msa)
19+
20+
Return the indices (within the reference sequence) covered by the conserved columns of the MSA.
21+
"""
22+
function columnindexes end
23+
1724
"""
1825
isgap(res)
1926
@@ -108,11 +115,11 @@ Remove all sequences from `msa` with fewer than `minres` matching residues.
108115
"""
109116
function filter_long!(msa, minres::Real)
110117
# Get rid of short sequences
111-
nresidues = map(eachrow(msa)) do v
112-
sum(!isgap, v)
118+
nresidues = map(sequencekeys(msa)) do key
119+
sum(!isgap, msasequence(msa, key))
113120
end
114121
rowmask = nresidues .> minres
115-
subseqs(msa, rowmask)
122+
subseqs!(msa, rowmask)
116123
end
117124

118125
struct SequenceMapping <: AbstractVector{Int}

test/PF09645_full.stockholm

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,4 @@ F112_SSV1/3-112 .....QTLNSYKMAEIMYKILEKKGELTLEDILAQFEISVPSAYNIQRA
1111
#=GR F112_SSV1/3-112 SS .....X---HHHHHHHHHHHHHHHSEE-HHHHHHHH---HHHHHHHHHHHHHHHHH-TTTEEEEE-SS-EEEEE--XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX.....
1212
#=GC SS_cons .....X---HHHHHHHHHHHHHHHSEE-HHHHHHHH---HHHHHHHHHHHHHHHHH-TTTEEEEE-SS-EEEEE--XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX.....
1313
#=GC seq_cons ........NshphAclhaKILppKtElolEDIlAQFEISsosAYsI.+sL+hICEpH.-ECpsppKsRKTlhh.hKpEphppptpEp..ppItKIhsAp................h....
14-
//
14+
//

test/runtests.jl

Lines changed: 37 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
using GPCRAnalysis
22
using GPCRAnalysis: three2char
33
# MSA interface functions
4-
using GPCRAnalysis: sequenceindexes, isgap, isunknown, sequencekeys, msasequence, residuematrix,
4+
using GPCRAnalysis: sequenceindexes, columnindexes, isgap, isunknown, sequencekeys, msasequence, residuematrix,
55
subseqs, subseqs!, percent_similarity
66
using MIToS: MSA, Pfam
77
using MIToS.MSA: coverage, GappedAlphabet, nsequences
8-
nsequences, sequencenames, getsequencemapping, getcolumnmapping
8+
using BioStockholm: BioStockholm
99
using BioStructures
1010
using FASTX
1111
using GaussianMixtureAlignment
@@ -16,8 +16,6 @@ using JuMP, HiGHS
1616
using ColorTypes
1717
using Test
1818

19-
columnindexes(msa::MSA.AbstractMultipleSequenceAlignment) = MSA.getcolumnmapping(msa)
20-
2119
# skip the network-hitting components by setting `skip_download = true` in the global namespace
2220

2321
@testset "GPCRAnalysis.jl" begin
@@ -66,6 +64,7 @@ columnindexes(msa::MSA.AbstractMultipleSequenceAlignment) = MSA.getcolumnmapping
6664
@testset "MSA" begin
6765
# The test file is copied from MIToS/test/data, with gratitude
6866
pf09645_sto = "PF09645_full.stockholm"
67+
## First in MIToS format
6968
msa = MSA.read_file(pf09645_sto, Pfam.Stockholm)
7069
@test MSA.nsequences(filter_species!(deepcopy(msa), "ATV")) == 1
7170
@test MSA.nsequences(filter_long!(deepcopy(msa), 70)) == 3
@@ -91,15 +90,43 @@ columnindexes(msa::MSA.AbstractMultipleSequenceAlignment) = MSA.getcolumnmapping
9190
@test AccessionCode(msa, MSACode("Y070_ATV/2-70")) == AccessionCode("Q3V4T1")
9291
@test MSACode(msa, AccessionCode("Q3V4T1")) == MSACode("Y070_ATV/2-70")
9392
@test msa[MSACode("Y070_ATV/2-70")][8] == msa[AccessionCode("Q3V4T1")][8] == MSA.Residue('V')
93+
94+
## Now in BioStockholm format
95+
msa = read(pf09645_sto, BioStockholm.MSA)
96+
@test length(sequencekeys(filter_species!(deepcopy(msa), "ATV"))) == 1
97+
@test length(sequencekeys(filter_long!(deepcopy(msa), 70))) == 3
98+
99+
idx = SequenceMapping([0, 4, 5, 0])
100+
seqvals = fill(NaN, 9)
101+
seqvals[idx] = [0.1, 0.2, 0.3, 0.4]
102+
@test seqvals[4] == 0.2
103+
@test seqvals[5] == 0.3
104+
@test all(isnan, seqvals[1:3])
105+
@test all(isnan, seqvals[6:end])
106+
107+
# analyze
108+
e = columnwise_entropy(msa)
109+
@test length(e) == size(residuematrix(msa), 2) && e[9] == 0
110+
e2 = columnwise_entropy(identity, msa)
111+
@test all(e2 .>= e)
112+
@test !all(e2 .== e)
113+
114+
@test size(project_sequences(msa)) == (3, 4)
115+
@test size(project_sequences(msa; fracvar=0.5)) == (1, 4)
116+
117+
@test AccessionCode(msa, MSACode("Y070_ATV/2-70")) == AccessionCode("Q3V4T1")
118+
@test MSACode(msa, AccessionCode("Q3V4T1")) == MSACode("Y070_ATV/2-70")
119+
@test msa[MSACode("Y070_ATV/2-70")][8] == msa[AccessionCode("Q3V4T1")][8] == 'V'
94120
end
95121
@testset "Properties" begin
96122
pf09645_sto = "PF09645_full.stockholm"
97-
msa = MSA.read_file(pf09645_sto, Pfam.Stockholm)
98-
X = aa_properties_matrix(msa)
99-
ΔX = X .- mean(X, dims=2)
100-
i = findfirst(==(14), columnindexes(msa))
101-
@test all(iszero, ΔX[i, :])
102-
@test !all(iszero, ΔX[i-1, :])
123+
for msa in (MSA.read_file(pf09645_sto, Pfam.Stockholm), read(pf09645_sto, BioStockholm.MSA))
124+
X = aa_properties_matrix(msa)
125+
ΔX = X .- mean(X, dims=2)
126+
i = findfirst(==(14), columnindexes(msa))
127+
@test all(iszero, ΔX[i, :])
128+
@test !all(iszero, ΔX[i-1, :])
129+
end
103130
seqs = FASTAReader(open("test.fasta")) do io
104131
collect(io)
105132
end

0 commit comments

Comments
 (0)