Skip to content

Commit 65249bd

Browse files
authored
Improve bit-twiddling for counting (#335)
This adds a more generic encoding counting function and uses it.
1 parent d4b1426 commit 65249bd

File tree

3 files changed

+31
-34
lines changed

3 files changed

+31
-34
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "BioSequences"
22
uuid = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
33
authors = ["Sabrina Jaye Ward <sabrinajward@protonmail.com>", "Jakob Nissen <jakobnybonissen@gmail.com>"]
4-
version = "3.4.1"
4+
version = "3.4.2"
55

66
[deps]
77
BioSymbols = "3c28c6f8-a34d-59c4-9654-267d177fcfa9"

src/bit-manipulation/bit-manipulation.jl

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,26 @@ end
6868
@inline sum(map(i -> f(i.value), z))
6969
end
7070

71+
pattern(::BitsPerSymbol{1}) = typemax(UInt128)
72+
pattern(::BitsPerSymbol{2}) = 0x55555555555555555555555555555555
73+
pattern(::BitsPerSymbol{4}) = 0x11111111111111111111111111111111
74+
pattern(::BitsPerSymbol{8}) = 0x01010101010101010101010101010101
75+
pattern(::BitsPerSymbol{16}) = 0x00010001000100010001000100010001
76+
pattern(::BitsPerSymbol{32}) = 0x00000001000000010000000100000001
77+
pattern(::BitsPerSymbol{64}) = 0x00000000000000010000000000000001
78+
pattern(::BitsPerSymbol{128}) = 0x00000000000000000000000000000001
79+
80+
function count_encoding(chunk::T, encoding::T, b::BitsPerSymbol{B}) where {T <: Unsigned, B}
81+
pat = pattern(b) % typeof(encoding)
82+
u = chunk ⊻ (encoding * pat)
83+
for i in 1:trailing_zeros(B)
84+
shift = (1 << (i - 1)) & (8*sizeof(T) - 1)
85+
u |= (u >> shift)
86+
end
87+
u = ~u & pat
88+
return count_ones(u)
89+
end
90+
7191
@inline function certain_bitcount(x::UInt64, ::T) where {T<:NucleicAcidAlphabet{4}}
7292
x = enumerate_nibbles(x)
7393
x = x ⊻ 0x1111111111111111

src/counting.jl

Lines changed: 10 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@ end
2121
y += @inline body(data[i])
2222
end
2323
y
24-
y
2524
end
2625

2726
"""
@@ -356,44 +355,22 @@ function count_symbol(seq::BioSequence, sym::BioSymbol)
356355
n
357356
end
358357

359-
function Base.count(
360-
pred::Base.Fix2{<:Union{typeof(==), typeof(isequal)}, <: BioSymbol},
361-
s::BioSequence
362-
)
363-
count_symbol(s, pred.x)
364-
end
365-
366-
function count_symbol(seq::FourBit, s::Union{RNA, DNA})
367-
pattern = encode(Alphabet(seq), s)::UInt64 * 0x1111111111111111
358+
function count_symbol(seq::Union{LongSubSeq, LongSequence}, sym::BioSymbol)
359+
enc = encode(Alphabet(seq), sym)
368360
tail = (chunk, rm) -> begin
369361
mask = UInt64(1) << (rm & 63) - 1
370-
masked = iszero(pattern) ? chunk | ~mask : mask & chunk
371-
count_0000_nibbles(masked ⊻ pattern)
362+
masked = iszero(enc) ? chunk | ~mask : mask & chunk
363+
count_encoding(masked, enc, BitsPerSymbol(seq))
372364
end
373-
body = i -> count_0000_nibbles(i ⊻ pattern)
365+
body = i -> count_encoding(i, enc, BitsPerSymbol(seq))
374366
counter_1seq(tail, body, seq)
375367
end
376368

377-
function count_symbol(seq::TwoBit, s::Union{RNA, DNA})
378-
pattern = encode(Alphabet(seq), s)::UInt64 * 0x5555555555555555
379-
tail = (chunk, rm) -> begin
380-
mask = UInt64(1) << (rm & 63) - 1
381-
masked = iszero(pattern) ? chunk | ~mask : mask & chunk
382-
count_00_bitpairs(masked ⊻ pattern)
383-
end
384-
body = i -> count_00_bitpairs(i ⊻ pattern)
385-
counter_1seq(tail, body, seq)
386-
end
387-
388-
function count_symbol(seq::SeqOrView{AminoAcidAlphabet}, s::AminoAcid)
389-
byte = encode(Alphabet(seq), s) % UInt8
390-
tail = (chunk, rm) -> begin
391-
mask = UInt64(1) << (rm & 63) - 1
392-
masked = iszero(byte) ? chunk | ~mask : mask & chunk
393-
count_compared_bytes(==(byte), masked)
394-
end
395-
body = i -> count_compared_bytes(==(byte), i)
396-
counter_1seq(tail, body, seq)
369+
function Base.count(
370+
pred::Base.Fix2{<:Union{typeof(==), typeof(isequal)}, <: BioSymbol},
371+
s::BioSequence
372+
)
373+
count_symbol(s, pred.x)
397374
end
398375

399376
## Deprecate weird two-arg methods

0 commit comments

Comments
 (0)