Skip to content

Commit 8a18615

Browse files
committed
Index: keep track of duplicates and implement O(1) Base.allunique()
this comes at the cost of slower value replacement, see the playground branch for benchmarks
1 parent 0595e87 commit 8a18615

File tree

4 files changed

+141
-113
lines changed

4 files changed

+141
-113
lines changed

src/anndata.jl

Lines changed: 27 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -462,39 +462,38 @@ end
462462

463463
function attr_make_unique!(ad::AnnData, namesattr::Symbol, join)
464464
index = getproperty(ad, namesattr)
465-
duplicates = duplicateindices(index)
466-
467-
if isempty(duplicates)
468-
@info "var names are already unique, doing nothing"
469-
return ad
470-
end
471-
472-
example_colliding_names = []
473-
for (name, positions) duplicates
474-
i = 1
475-
for pos Iterators.rest(positions, 2)
476-
while true
477-
potential = string(index[pos], join, i)
478-
i += 1
479-
if potential index
480-
if length(example_colliding_names) <= 5
481-
push!(example_colliding_names, potential)
465+
if !allunique(index)
466+
duplicates = duplicateindices(index)
467+
468+
example_colliding_names = []
469+
for (name, positions) duplicates
470+
i = 1
471+
for pos Iterators.rest(positions, 2)
472+
while true
473+
potential = string(index[pos], join, i)
474+
i += 1
475+
if potential index
476+
if length(example_colliding_names) <= 5
477+
push!(example_colliding_names, potential)
478+
end
479+
else
480+
index[pos] = potential
481+
break
482482
end
483-
else
484-
index[pos] = potential
485-
break
486483
end
487484
end
488485
end
489-
end
490486

491-
if !isempty(example_colliding_names)
492-
@warn """
493-
Appending $(join)[1-9...] to duplicates caused collision with another name.
494-
Example(s): $example_colliding_names
495-
This may make the names hard to interperet.
496-
Consider setting a different delimiter with `join={delimiter}`
497-
"""
487+
if !isempty(example_colliding_names)
488+
@warn """
489+
Appending $(join)[1-9...] to duplicates caused collision with another name.
490+
Example(s): $example_colliding_names
491+
This may make the names hard to interperet.
492+
Consider setting a different delimiter with `join={delimiter}`
493+
"""
494+
end
495+
else
496+
@info "var names are already unique, doing nothing"
498497
end
499498
return ad
500499
end

src/index.jl

Lines changed: 70 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ mutable struct Index{T, V} <: AbstractIndex{T, V}
99
longestprobe::V
1010
initiallongestprobe::V
1111
deletions::V
12+
duplicates::V
1213

1314
function Index{T}(nelements::Integer) where {T}
1415
size = ceil(nelements / 0.9) # 0.9 load factor
@@ -18,9 +19,10 @@ mutable struct Index{T, V} <: AbstractIndex{T, V}
1819
Vector{T}(undef, nelements),
1920
zeros(mintype, mintype(size)),
2021
zeros(mintype, mintype(size)),
21-
0,
22-
0,
23-
0,
22+
0x0,
23+
0x0,
24+
0x0,
25+
0x0,
2426
)
2527
end
2628
end
@@ -33,9 +35,14 @@ function _setindex!(idx::Index{T, V}, elem::T, position::Unsigned) where {T, V}
3335
k = position
3436
location = hash(elem) % _length(idx) + 0x1
3537
prevlongestprobe = idx.longestprobe
38+
is_duplicate = false
3639
@inbounds while k != 0x0
3740
probeposition += 0x1
3841
recordpos = idx.probepositions[location]
42+
if !is_duplicate && recordpos > 0 && isequal(idx.vals[idx.indices[location]], elem)
43+
idx.duplicates += 0x1
44+
is_duplicate = true
45+
end
3946
if probeposition > recordpos
4047
k, idx.indices[location] = idx.indices[location], k
4148
idx.probepositions[location] = probeposition
@@ -45,7 +52,7 @@ function _setindex!(idx::Index{T, V}, elem::T, position::Unsigned) where {T, V}
4552
location = location == _length(idx) ? UInt(0x1) : location + 0x1
4653
end
4754

48-
# reset longestprobe after some deletions deletions, this should maintain stable performance for _getindex and _getindex_array
55+
# reset longestprobe after some deletions, this should maintain stable performance for _getindex and _getindex_array
4956
if idx.longestprobe == prevlongestprobe &&
5057
idx.longestprobe > idx.initiallongestprobe &&
5158
idx.deletions >= length(idx) ÷ 2
@@ -54,24 +61,28 @@ function _setindex!(idx::Index{T, V}, elem::T, position::Unsigned) where {T, V}
5461
end
5562
end
5663

57-
function _getindex(idx::Index{T}, elem::T) where {T}
64+
function _getindex(idx::Index{T, V}, elem::T) where {T, V}
5865
ilength = _length(idx)
5966
if ilength > 0x0
60-
location = hash(elem) % ilength + 0x1
67+
location::V = hash(elem) % ilength + 0x1
6168
@inbounds for probeposition 0x1:(idx.longestprobe)
6269
pos = idx.indices[location]
6370
if pos > 0x0 && isequal(idx.vals[pos], elem)
6471
return location
6572
elseif pos == 0x0
66-
return 0x0
73+
return V(0x0)
6774
end
6875
location = location == ilength ? UInt(0x1) : location + 0x1
6976
end
7077
end
71-
return 0x0
78+
return V(0x0)
7279
end
7380

7481
function _getindex_array(idx::Index{T, V}, elem::T) where {T, V}
82+
if allunique(idx)
83+
ret = _getindex(idx, elem)
84+
return ret > 0x0 ? [ret] : V[]
85+
end
7586
locations = Vector{V}()
7687
ilength = _length(idx)
7788
if ilength > 0x0
@@ -89,45 +100,68 @@ function _getindex_array(idx::Index{T, V}, elem::T) where {T, V}
89100
return locations
90101
end
91102

92-
function _getindex_byposition(idx::Index{T}, i::Integer) where {T}
103+
function _getindex_byposition(idx::Index{T, V}, i::Integer) where {T, V}
93104
elem = idx.vals[i]
94105
ilength = _length(idx)
106+
is_duplicate = 0x0
95107
if ilength > 0x0
96-
location = hash(elem) % ilength + 0x1
108+
location::V = hash(elem) % ilength + 0x1
97109
@inbounds for probeposition 0x1:(idx.longestprobe)
98110
pos = idx.indices[location]
99-
if pos > 0x0 && pos == i
100-
return location
111+
if pos > 0x0
112+
if is_duplicate < 0x2
113+
is_duplicate += isequal(elem, idx.vals[pos])
114+
end
115+
if pos == i
116+
return location, is_duplicate > 0x1
117+
end
101118
elseif pos == 0x0
102119
break
103120
end
104-
location = location == ilength ? UInt64(0x1) : location + 0x1
121+
location = location == ilength ? V(0x1) : location + 0x1
105122
end
106123
end
107124
error("Element not found. This should never happen.")
108125
end
109126

110-
function _delete!(idx::Index, oldkeyindex::Integer)
127+
function _delete!(idx::Index, oldkeyindex::Integer, is_duplicate::Bool=false)
111128
lastidx = findnext((0x1), idx.probepositions, oldkeyindex + 0x1)
129+
has_duplicates = !allunique(idx)
112130
@inbounds if !isnothing(lastidx)
113131
lastidx -= 0x1
132+
133+
if !is_duplicate && has_duplicates
134+
subvals = @view idx.vals[idx.indices[oldkeyindex:lastidx]]
135+
is_duplicate = !isnothing(findnext(isequal(subvals[1]), subvals, 2))
136+
end
137+
114138
idx.indices[oldkeyindex:(lastidx - 0x1)] .= @view idx.indices[(oldkeyindex + 0x1):lastidx]
115139
idx.probepositions[oldkeyindex:(lastidx - 0x1)] .= @view(idx.probepositions[(oldkeyindex + 0x1):lastidx]) .- 0x1
116140
idx.indices[lastidx] = idx.probepositions[lastidx] = 0x0
117141
else
118142
if oldkeyindex < _length(idx)
143+
if !is_duplicate && has_duplicates
144+
subvals = @view idx.vals[idx.indices[oldkeyindex:end]]
145+
is_duplicate = !isnothing(findnext(isequal(subvals[1]), subvals, 2))
146+
end
147+
119148
idx.indices[oldkeyindex:(end - 0x1)] .= @view idx.indices[(oldkeyindex + 0x1):end]
120149
idx.probepositions[oldkeyindex:(end - 0x1)] .= @view(idx.probepositions[(oldkeyindex + 0x1):end]) .- 0x1
121150
end
122151
if idx.probepositions[1] > 0x1
123-
idx.indices[end] = idx.indices[1]
124-
idx.probepositions[end] = idx.probepositions[1] - 0x1
125-
126152
lastidx = findnext((0x1), idx.probepositions, 2)
127153
if isnothing(lastidx)
128154
lastidx = oldkeyindex
129155
end
130156
lastidx -= 0x1
157+
158+
if !is_duplicate && has_duplicates
159+
subvals = @view idx.vals[idx.indices[1:lastidx]]
160+
is_duplicate = !isnothing(findnext(isequal(subvals[1]), subvals, 2))
161+
end
162+
163+
idx.indices[end] = idx.indices[1]
164+
idx.probepositions[end] = idx.probepositions[1] - 0x1
131165
idx.indices[0x1:(lastidx - 0x1)] .= @view idx.indices[0x2:lastidx]
132166
idx.probepositions[0x1:(lastidx - 0x1)] .= @view(idx.probepositions[0x2:lastidx]) .- 0x1
133167
idx.indices[lastidx] = idx.probepositions[lastidx] = 0x0
@@ -136,6 +170,7 @@ function _delete!(idx::Index, oldkeyindex::Integer)
136170
end
137171
end
138172

173+
idx.duplicates -= is_duplicate
139174
idx.deletions += 0x1
140175
end
141176

@@ -157,6 +192,7 @@ Base.getindex(idx::AbstractIndex{T, V}, elems::AbstractVector{T}, x::Union{Val{t
157192
isempty(elems) ? V[] : reduce(vcat, (getindex(idx, elem, x) for elem elems))
158193
Base.in(elem::T, idx::AbstractIndex{T}) where {T} = getindex(idx, elem, Val(false), Val(false)) != 0x0
159194

195+
Base.allunique(idx::Index) = idx.duplicates == 0x0
160196
Base.getindex(idx::Index{T}, elem::T, ::Val{true}, ::Val{false}) where {T} =
161197
@inbounds idx.indices[_getindex_array(idx, elem)] # exceptions may be undesirable in high-performance scenarios
162198
function Base.getindex(idx::Index{T}, elem::T, ::Val{true}) where {T}
@@ -185,9 +221,9 @@ end
185221

186222
@inline function Base.setindex!(idx::Index{T}, newval::T, i::Integer) where {T}
187223
@boundscheck checkbounds(idx.vals, i)
188-
oldkeyindex = _getindex_byposition(idx, i)
224+
oldkeyindex, is_duplicate = _getindex_byposition(idx, i)
189225
validx = idx.indices[oldkeyindex]
190-
_delete!(idx, oldkeyindex)
226+
_delete!(idx, oldkeyindex, is_duplicate)
191227
_setindex!(idx, newval, validx)
192228
return idx
193229
end
@@ -198,7 +234,7 @@ function Base.setindex!(idx::Index{T}, newval::T, oldval::T) where {T}
198234
throw(KeyError(oldval))
199235
end
200236
validx = idx.indices[oldkeyindex]
201-
_delete!(idx, oldkeyindex)
237+
_delete!(idx, oldkeyindex, false)
202238
_setindex!(idx, newval, validx)
203239
return idx
204240
end
@@ -209,7 +245,7 @@ Base.values(idx::Index) = idx.vals
209245
Base.convert(::Type{<:Index}, x::AbstractArray) = Index(x)
210246
Base.convert(::Type{<:Index}, x::Index) = x
211247

212-
struct SubIndex{T, V, I} <: AbstractIndex{T, V}
248+
mutable struct SubIndex{T, V, I} <: AbstractIndex{T, V}
213249
parent::Index{T, V}
214250
indices::I
215251
revmapping::Union{Nothing, Index}
@@ -224,9 +260,9 @@ end
224260
@boundscheck checkbounds(idx, I)
225261
return SubIndex(idx, I, Index(V.(I)))
226262
end
227-
@inline function Base.view(idx::Index{T, V}, I::AbstractArray{Bool}) where {T, V}
263+
@inline function Base.view(idx::Index, I::AbstractArray{Bool})
228264
@boundscheck checkbounds(idx, I)
229-
return SubIndex(idx, I, Index(V.(findall(I))))
265+
return @inbounds view(idx, findall(I))
230266
end
231267
@inline function Base.view(idx::Index, I::Integer)
232268
@boundscheck checkbounds(idx, I)
@@ -242,9 +278,17 @@ Base.parent(si::SubIndex) = si.parent
242278
Base.parentindices(si::SubIndex) = si.indices
243279
Base.length(si::SubIndex) = length(parentindices(si))
244280
Base.length(si::SubIndex{T, V, Colon}) where {T, V} = length(parent(si))
245-
Base.length(si::SubIndex{T, V, I}) where {T, V, I <: AbstractArray{Bool}} = length(si.revmapping)
246281
Base.size(si::SubIndex) = (length(si),)
247-
Base.values(si::SubIndex) = parent(si)[parentindices(si)]
282+
Base.values(si::SubIndex) = @view parent(si)[parentindices(si)]
283+
284+
Base.allunique(si::SubIndex{T, V, Colon}) where {T, V} = allunique(parent(si))
285+
# it is impossible to accurately keep track of duplicates for views: The parent object may be changed by another view or by itself
286+
# thus we really need to always recheck for duplicates in the view if the parent has any
287+
function Base.allunique(si::SubIndex)
288+
return allunique(parent(si)) ? true :
289+
length(parentindices(si)) - length(unique(view(parent(si).vals, parentindices(si)))) == 0x0
290+
end
291+
248292
function Base.getindex(si::SubIndex{T}, elem::T, ::Val{true}, ::Val{false}) where {T}
249293
res = getindex(parent(si), elem, Val(true), Val(false))
250294
i = 0
@@ -260,15 +304,6 @@ function Base.getindex(si::SubIndex{T}, elem::T, ::Val{true}, ::Val{false}) wher
260304
end
261305
Base.getindex(si::SubIndex{T, V, Colon}, elem::T, ::Val{true}, ::Val{false}) where {T, V} =
262306
getindex(parent(si), elem, Val(true), Val(false))
263-
function Base.getindex(si::SubIndex{T, V, I}, elem::T, ::Val{true}, ::Val{false}) where {T, V, I <: AbstractArray{Bool}}
264-
res = getindex(parent(si), elem, Val(true), Val(false))
265-
res = res[parentindices(si)[res]]
266-
@inbounds for (i, r) enumerate(res)
267-
res[i] = si.revmapping[r, false, false]
268-
end
269-
270-
return res
271-
end
272307
function Base.getindex(
273308
si::SubIndex{T, V, I},
274309
elem::T,
@@ -313,18 +348,6 @@ function Base.getindex(si::SubIndex{T, V}, elem::T, ::Val{false}, ::Val{false})
313348
end
314349
Base.getindex(si::SubIndex{T, V, Colon}, elem::T, ::Val{false}, ::Val{false}) where {T, V} =
315350
getindex(parent(si), elem, Val(false), Val(false))
316-
function Base.getindex(
317-
si::SubIndex{T, V, I},
318-
elem::T,
319-
::Val{false},
320-
::Val{false},
321-
) where {T, V, I <: AbstractArray{Bool}}
322-
res = getindex(parent(si, elem, Val(false), Val(false)))
323-
if res == 0x0 || res > 0x0 && !(@inbounds parentindices(si)[res])
324-
return V(0x0)
325-
end
326-
return si.revmapping[res, false, false]
327-
end
328351
function Base.getindex(
329352
si::SubIndex{T, V, I},
330353
elem::T,
@@ -369,7 +392,7 @@ Base.@propagate_inbounds function Base.setindex!(si::SubIndex{T}, newval::T, i::
369392
end
370393
Base.@propagate_inbounds function Base.setindex!(si::SubIndex{T, V, Colon}, newval::T, i::Integer) where {T, V}
371394
@boundscheck checkbounds(si, i)
372-
setindex!(parent(si), newval, i)
395+
return setindex!(parent(si), newval, i)
373396
end
374397
Base.@propagate_inbounds function Base.setindex!(si::SubIndex{T}, newval::T, oldval::T) where {T}
375398
oldidx = parent(si)[oldval, true]
@@ -380,19 +403,6 @@ Base.@propagate_inbounds function Base.setindex!(si::SubIndex{T}, newval::T, old
380403
parent(si)[oldidx[foldidx]] = newval
381404
return si
382405
end
383-
Base.@propagate_inbounds function Base.setindex!(
384-
si::SubIndex{T, V, I},
385-
newval::T,
386-
oldval::T,
387-
) where {T, V, I <: AbstractArray{Bool}}
388-
oldidx = parent(si)[oldval, true]
389-
oldidx = oldidx[parentindices(si)[oldix]]
390-
if length(oldidx) == 0
391-
throw(KeyError(oldval))
392-
end
393-
parent(si)[oldidx[1]] = newval
394-
return si
395-
end
396406
Base.@propagate_inbounds function Base.setindex!(
397407
si::SubIndex{T, V, I},
398408
newval::T,

0 commit comments

Comments
 (0)