Skip to content

Commit ad99603

Browse files
committed
WIP: Add BioStockholm extension
1 parent d54d832 commit ad99603

File tree

4 files changed

+140
-10
lines changed

4 files changed

+140
-10
lines changed

Project.toml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "GPCRAnalysis"
22
uuid = "c1d73f9e-d42a-418a-8d5b-c7b00ec0358f"
3-
authors = ["Tim Holy <[email protected]> and contributors"]
43
version = "0.6.0"
4+
authors = ["Tim Holy <[email protected]> and contributors"]
55

66
[deps]
77
BioStructures = "de9282ab-8554-53be-b2d6-f6c222edabfc"
@@ -27,15 +27,18 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
2727
TravelingSalesmanHeuristics = "8c8f4381-2cdd-507c-846c-be2bcff6f45f"
2828

2929
[weakdeps]
30+
BioStockholm = "eeb925a3-6f9d-43e6-829e-e0ea03b76ecf"
3031
HiGHS = "87dc4568-4c63-4d18-b0c0-bb2238e4078b"
3132
JuMP = "4076af6c-e467-56ae-b986-b466b2749572"
3233
MIToS = "51bafb47-8a16-5ded-8b04-24ef4eede0b5"
3334

3435
[extensions]
36+
GPCRAnalysisBioStockholmExt = "BioStockholm"
3537
GPCRAnalysisJuMPExt = ["JuMP", "HiGHS"]
3638
GPCRAnalysisMIToSExt = "MIToS"
3739

3840
[compat]
41+
BioStockholm = "0.2.1"
3942
BioStructures = "4.2"
4043
ColorTypes = "0.11, 0.12"
4144
Distances = "0.10"
@@ -63,11 +66,12 @@ TravelingSalesmanHeuristics = "0.3"
6366
julia = "1.10"
6467

6568
[extras]
69+
BioStockholm = "eeb925a3-6f9d-43e6-829e-e0ea03b76ecf"
6670
HiGHS = "87dc4568-4c63-4d18-b0c0-bb2238e4078b"
6771
InvertedIndices = "41ab1584-1d38-5bbf-9106-f11c6c58b48f"
6872
JuMP = "4076af6c-e467-56ae-b986-b466b2749572"
6973
MIToS = "51bafb47-8a16-5ded-8b04-24ef4eede0b5"
7074
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
7175

7276
[targets]
73-
test = ["HiGHS", "InvertedIndices", "JuMP", "MIToS", "Test"]
77+
test = ["BioStockholm", "HiGHS", "InvertedIndices", "JuMP", "MIToS", "Test"]

ext/GPCRAnalysisBioStockholmExt.jl

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
module GPCRAnalysisBioStockholmExt
2+
3+
using GPCRAnalysis
4+
using BioStockholm
5+
using BioStockholm: OrderedDict # from OrderedCollections.jl
6+
7+
function conscols(msa::MSA)
8+
ss = msa.GC["SS_cons"]
9+
return findfirst(!=( '.'), ss):findlast(!=( '.'), ss)
10+
end
11+
12+
# Low-level API implementation
13+
# GPCRAnalysis.sequenceindexes(msaseq::AnnotatedAlignedSequence) = getsequencemapping(msaseq)
14+
# GPCRAnalysis.sequenceindexes(msaseq::MSA, i::Int) = getsequencemapping(msaseq, i)
15+
# GPCRAnalysis.isgap(res::MSA.Residue) = res == GAP
16+
# GPCRAnalysis.isunknown(res::MSA.Residue) = res == XAA
17+
GPCRAnalysis.sequencekeys(msa::MSA) = collect(keys(msa.seq))
18+
# GPCRAnalysis.msasequence(msa::AbstractMultipleSequenceAlignment, key) = getsequence(msa, key)
19+
function GPCRAnalysis.residuematrix(msa::MSA)
20+
keepcols = conscols(msa)
21+
reduce(vcat, [permutedims(seq[keepcols]) for (_, seq) in msa.seq])
22+
end
23+
GPCRAnalysis.subseqs(msa::MSA{T}, rowmask::AbstractVector{Bool}) where T = MSA{T}(OrderedDict(pr for (pr, keep) in zip(msa.seq, rowmask) if keep), msa.GF, OrderedDict(pr for (pr, keep) in zip(msa.GS, rowmask) if keep), msa.GC, msa.GR)
24+
function GPCRAnalysis.subseqs!(msa::MSA, rowmask::AbstractVector{Bool})
25+
for ((key, _), keep) in zip(msa.seq, rowmask)
26+
if !keep
27+
delete!(msa.seq, key)
28+
delete!(msa.GS, key)
29+
end
30+
end
31+
return msa
32+
end
33+
34+
# These are piracy
35+
function Base.eachrow(msa::MSA)
36+
keepcols = conscols(msa)
37+
return Iterators.map(values(msa.seq)) do s
38+
return s[keepcols]
39+
end
40+
end
41+
Base.size(msa::MSA) = (length(msa.seq), length(conscols(msa)))
42+
Base.size(msa::MSA, dim::Int) = dim == 1 ? length(msa.seq) : length(conscols(msa))
43+
44+
Base.getindex(msa::MSA, seqname::MSACode) = msa.seq[seqname.name]
45+
Base.getindex(msa::MSA, seqname::AccessionCode) = msa[MSACode(msa, seqname)]
46+
47+
48+
function GPCRAnalysis.AccessionCode(msa::MSA, seqname::AbstractString)
49+
AccessionCode(split(msa.GS[seqname]["AC"], '.')[1])
50+
end
51+
GPCRAnalysis.AccessionCode(msa::MSA, seqname::MSACode) = AccessionCode(msa, seqname.name)
52+
GPCRAnalysis.AccessionCode(::MSA, seqname::AccessionCode) = seqname
53+
54+
function GPCRAnalysis.MSACode(msa::MSA, accession::AbstractString)
55+
acs = [split(ac["AC"], '.')[1] for (_, ac) in msa.GS]
56+
i = findfirst(==(accession), acs)
57+
return MSACode(GPCRAnalysis.sequencekeys(msa)[i])
58+
end
59+
GPCRAnalysis.MSACode(msa::MSA, accession::AccessionCode) = MSACode(msa, accession.name)
60+
GPCRAnalysis.MSACode(::MSA, accession::MSACode) = accession
61+
62+
63+
function reduced_alphabet(r::Char)
64+
if r in ('A','I','L','M','V')
65+
return 1 # hydrophobic
66+
elseif r in ('N','Q','S','T')
67+
return 2 # polar
68+
elseif r in ('R','H','K')
69+
return 3 # charged
70+
elseif r in ('D','E')
71+
return 4 # charged
72+
elseif r in ('F','W','Y')
73+
return 5 # aromatic
74+
end
75+
offset = findfirst(==(r), ('C','G','P','-'))
76+
offset === nothing && throw(ArgumentError("Unknown residue '$r'"))
77+
return 5 + offset # special or unknown
78+
end
79+
80+
GPCRAnalysis.columnwise_entropy(msa) = columnwise_entropy(reduced_alphabet, msa)
81+
82+
function GPCRAnalysis.percent_similarity(msa)
83+
M = reduced_alphabet.(GPCRAnalysis.residuematrix(msa))
84+
n = size(M, 1)
85+
S = zeros(Float64, n, n)
86+
for i in 1:n
87+
for j in i:n
88+
S[i, j] = 100 * sum(M[i, :] .== M[j, :]) / size(M, 2)
89+
S[j, i] = S[i, j]
90+
end
91+
end
92+
return S
93+
end
94+
95+
96+
end

test/PF09645_full.stockholm

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,4 @@ F112_SSV1/3-112 .....QTLNSYKMAEIMYKILEKKGELTLEDILAQFEISVPSAYNIQRA
1111
#=GR F112_SSV1/3-112 SS .....X---HHHHHHHHHHHHHHHSEE-HHHHHHHH---HHHHHHHHHHHHHHHHH-TTTEEEEE-SS-EEEEE--XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX.....
1212
#=GC SS_cons .....X---HHHHHHHHHHHHHHHSEE-HHHHHHHH---HHHHHHHHHHHHHHHHH-TTTEEEEE-SS-EEEEE--XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX.....
1313
#=GC seq_cons ........NshphAclhaKILppKtElolEDIlAQFEISsosAYsI.+sL+hICEpH.-ECpsppKsRKTlhh.hKpEphppptpEp..ppItKIhsAp................h....
14-
//
14+
//

test/runtests.jl

Lines changed: 37 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ using GPCRAnalysis: sequenceindexes, isgap, isunknown, sequencekeys, msasequence
55
subseqs, subseqs!, percent_similarity
66
using MIToS: MSA, Pfam
77
using MIToS.MSA: coverage, GappedAlphabet, nsequences
8-
nsequences, sequencenames, getsequencemapping, getcolumnmapping
8+
using BioStockholm: BioStockholm
99
using BioStructures
1010
using FASTX
1111
using GaussianMixtureAlignment
@@ -17,6 +17,7 @@ using ColorTypes
1717
using Test
1818

1919
columnindexes(msa::MSA.AbstractMultipleSequenceAlignment) = MSA.getcolumnmapping(msa)
20+
columnindexes(msa::BioStockholm.MSA) = eachindex(first(msa.seq).second)
2021

2122
# skip the network-hitting components by setting `skip_download = true` in the global namespace
2223

@@ -66,6 +67,7 @@ columnindexes(msa::MSA.AbstractMultipleSequenceAlignment) = MSA.getcolumnmapping
6667
@testset "MSA" begin
6768
# The test file is copied from MIToS/test/data, with gratitude
6869
pf09645_sto = "PF09645_full.stockholm"
70+
## First in MIToS format
6971
msa = MSA.read_file(pf09645_sto, Pfam.Stockholm)
7072
@test MSA.nsequences(filter_species!(deepcopy(msa), "ATV")) == 1
7173
@test MSA.nsequences(filter_long!(deepcopy(msa), 70)) == 3
@@ -91,15 +93,43 @@ columnindexes(msa::MSA.AbstractMultipleSequenceAlignment) = MSA.getcolumnmapping
9193
@test AccessionCode(msa, MSACode("Y070_ATV/2-70")) == AccessionCode("Q3V4T1")
9294
@test MSACode(msa, AccessionCode("Q3V4T1")) == MSACode("Y070_ATV/2-70")
9395
@test msa[MSACode("Y070_ATV/2-70")][8] == msa[AccessionCode("Q3V4T1")][8] == MSA.Residue('V')
96+
97+
## Now in BioStockholm format
98+
msa = read(pf09645_sto, BioStockholm.MSA)
99+
@test length(sequencekeys(filter_species!(deepcopy(msa), "ATV"))) == 1
100+
@test length(sequencekeys(filter_long!(deepcopy(msa), 70))) == 3
101+
102+
idx = SequenceMapping([0, 4, 5, 0])
103+
seqvals = fill(NaN, 9)
104+
seqvals[idx] = [0.1, 0.2, 0.3, 0.4]
105+
@test seqvals[4] == 0.2
106+
@test seqvals[5] == 0.3
107+
@test all(isnan, seqvals[1:3])
108+
@test all(isnan, seqvals[6:end])
109+
110+
# analyze
111+
e = columnwise_entropy(msa)
112+
@test length(e) == size(msa, 2) && e[9] == 0
113+
e2 = columnwise_entropy(identity, msa)
114+
@test all(e2 .>= e)
115+
@test !all(e2 .== e)
116+
117+
@test size(project_sequences(msa)) == (3, 4)
118+
@test size(project_sequences(msa; fracvar=0.5)) == (1, 4)
119+
120+
@test AccessionCode(msa, MSACode("Y070_ATV/2-70")) == AccessionCode("Q3V4T1")
121+
@test MSACode(msa, AccessionCode("Q3V4T1")) == MSACode("Y070_ATV/2-70")
122+
@test msa[MSACode("Y070_ATV/2-70")][8] == msa[AccessionCode("Q3V4T1")][8] == MSA.Residue('V')
94123
end
95124
@testset "Properties" begin
96125
pf09645_sto = "PF09645_full.stockholm"
97-
msa = MSA.read_file(pf09645_sto, Pfam.Stockholm)
98-
X = aa_properties_matrix(msa)
99-
ΔX = X .- mean(X, dims=2)
100-
i = findfirst(==(14), columnindexes(msa))
101-
@test all(iszero, ΔX[i, :])
102-
@test !all(iszero, ΔX[i-1, :])
126+
for msa in (MSA.read_file(pf09645_sto, Pfam.Stockholm), read(pf09645_sto, BioStockholm.MSA))
127+
X = aa_properties_matrix(msa)
128+
ΔX = X .- mean(X, dims=2)
129+
i = findfirst(==(14), columnindexes(msa))
130+
@test all(iszero, ΔX[i, :])
131+
@test !all(iszero, ΔX[i-1, :])
132+
end
103133
seqs = FASTAReader(open("test.fasta")) do io
104134
collect(io)
105135
end

0 commit comments

Comments
 (0)