Skip to content

improve median for pooled vectors #73

@bkamins

Description

@bkamins

If length of pool is much smaller than the number of entries we can run the following (working code):

function median_fast(x::PooledVector)
    n = length(x)
    p = sortperm(x.pool)
    counts = zeros(Int, length(p))
    for v in x.refs
        counts[v] += 1
    end
    cum = 0
    for (j,i) in enumerate(p)
        cum += counts[i]
        if isodd(n)
            if cum >= div(n + 1, 2)
                return middle(x.pool[i])
            end
        else
            if cum >= div(n, 2)
                if cum == div(n, 2)
                    return middle(x.pool[i], x.pool[p[j+1]])
                else
                    return middle(x.pool[i])
                end
            end
        end
    end
    error("unreachable reached")
end

Example timings:

julia> x = PooledArray(rand(1:100, 10^8));

julia> @time median_fast(x);
  0.063734 seconds (4 allocations: 1.781 KiB)

julia> @time median_fast(x);
  0.070556 seconds (4 allocations: 1.781 KiB)

julia> @time median(x);
  0.931585 seconds (3 allocations: 762.940 MiB, 9.86% gc time)

julia> @time median(x);
  0.955555 seconds (3 allocations: 762.940 MiB, 14.29% gc time)

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions