From 94e117b096a6775d62289f697d736192487ba8dd Mon Sep 17 00:00:00 2001 From: Jacob Quinn Date: Tue, 6 Sep 2022 10:33:35 -0600 Subject: [PATCH] Change getrows -> subset Also adds a generic implementation of subset for column-oriented tables and row-oriented tables that are AbstractVector. Adds implementation and tests for DictRowTable/DictColumnTable. --- src/Tables.jl | 36 +++++++++++++++++++++++++++----- src/dicts.jl | 12 +++++++++++ src/namedtuples.jl | 24 ++++++++-------------- test/runtests.jl | 51 ++++++++++++++++++++++++++++++---------------- 4 files changed, 84 insertions(+), 39 deletions(-) diff --git a/src/Tables.jl b/src/Tables.jl index 8efaf41..219d725 100644 --- a/src/Tables.jl +++ b/src/Tables.jl @@ -569,15 +569,15 @@ struct Partitioner{T} end """ - Tables.getrows(x, inds; view=nothing) + Tables.subset(x, inds; view=nothing) Return one or more rows from table `x` according to the position(s) specified by `inds`: - If `inds` is a single non-boolean integer return a row object. -- If `inds` is a vector of non-boolean integers, a vector of booleans, or a `:`, return an indexable object of rows. +- If `inds` is a vector of non-boolean integers, a vector of booleans, or a `:`, return a subset of the original table according to the indices. In this case, the returned type is not necessarily the same as the original table type. -If other type of `inds` is passed than specified above the behavior is undefined. +If `inds` is of any type other than those specified above, the behavior is undefined. The `view` argument influences whether the returned object is a view of the original table or an independent copy: @@ -587,11 +587,37 @@ or an independent copy: - If `view=true` then a view is returned and if `view=false` a copy is returned. This applies both to returning a row or a table. -Any specialized implementation of `getrows` must support the `view=nothing` argument. 
Support for `view=true` or `view=false` is optional (i.e. implementations might error on them if they are not supported). """ -function getrows end +function subset(x::T, inds; view::Union{Bool, Nothing}=nothing) where {T} + # because this method is being called, we know `x` didn't define its own Tables.subset + # first check if it supports column access, and if so, apply inds and wrap columns in a DictColumnTable + if columnaccess(x) + cols = columns(x) + if inds isa Integer + return ColumnsRow(cols, inds) + else + ret = view === true ? _map(c -> Base.view(c, inds), cols) : _map(c -> c[inds], cols) + return DictColumnTable(schema(cols), ret) + end + end + # otherwise, let's get the rows and see if we can apply inds to them + r = rows(x) + if r isa AbstractVector + inds isa Integer && return r[inds] + # index `r`, not `x`: only `rows(x)` is known to be an AbstractVector here + ret = view === true ? Base.view(r, inds) : r[inds] + return vectorcheck(ret) + end + throw(ArgumentError("no default `Tables.subset` implementation for type: $T")) +end + +vectorcheck(x::AbstractVector) = x +vectorcheck(x) = throw(ArgumentError("`Tables.subset`: invalid `inds` argument, expected `AbstractVector` output, got $(typeof(x))")) +_map(f, cols) = OrderedDict(nm => vectorcheck(f(getcolumn(cols, nm))) for nm in columnnames(cols)) + """ Tables.partitioner(f, itr) diff --git a/src/dicts.jl b/src/dicts.jl index 292adb8..ee5e6db 100644 --- a/src/dicts.jl +++ b/src/dicts.jl @@ -1,3 +1,4 @@ +# Dict of Vectors as table struct DictColumnTable <: AbstractColumns schema::Schema values::OrderedDict{Symbol, AbstractVector} @@ -94,6 +95,7 @@ columnnames(x::DictColumnTable) = getfield(x, :schema).names getcolumn(x::DictColumnTable, i::Int) = getfield(x, :values)[columnnames(x)[i]] getcolumn(x::DictColumnTable, nm::Symbol) = getfield(x, :values)[nm] +# Vector of Dicts as table struct DictRowTable names::Vector{Symbol} types::Dict{Symbol, 
Type} @@ -122,6 +124,16 @@ function Base.iterate(x::DictRowTable, st=1) return DictRow(x.names, x.values[st]), st + 1 end +function subset(x::DictRowTable, inds; view::Union{Bool,Nothing} = nothing) + dicts = getfield(x, :values) + # a single row is indexed directly, as `iterate` does; a 0-dimensional view is never passed to `DictRow` + inds isa Integer && return DictRow(getfield(x, :names), dicts[inds]) + values = view === true ? Base.view(dicts, inds) : dicts[inds] + # e.g. matrix-valued `inds` would not produce a vector of rows + values isa AbstractVector || throw(ArgumentError("`Tables.subset`: invalid `inds` argument, expected `RowTable` output, got $(typeof(values))")) + return DictRowTable(getfield(x, :names), getfield(x, :types), values) +end + """ Tables.dictrowtable(x) => Tables.DictRowTable diff --git a/src/namedtuples.jl b/src/namedtuples.jl index 7ac24fd..f063d30 100644 --- a/src/namedtuples.jl +++ b/src/namedtuples.jl @@ -106,18 +106,18 @@ function rowtable(itr::T) where {T} return collect(namedtupleiterator(eltype(r), r)) end -function getrows(x::RowTable, inds; view::Union{Bool,Nothing} = nothing) - if view === true - return Base.view(x, inds) +# NamedTuple of AbstractVector columns +const ColumnTable = NamedTuple{names, T} where {names, T <: NTuple{N, AbstractVector{S} where S}} where {N} +rowcount(c::ColumnTable) = length(c) == 0 ? 0 : length(c[1]) + +function subset(x::ColumnTable, inds; view::Union{Bool,Nothing}=nothing) + if inds isa Integer + return map(c -> c[inds], x) else - return x[inds] + return view === true ? map(c -> vectorcheck(Base.view(c, inds)), x) : map(c -> vectorcheck(c[inds]), x) end end -# NamedTuple of arrays of matching dimensionality -const ColumnTable = NamedTuple{names, T} where {names, T <: NTuple{N, AbstractArray{S, D} where S}} where {N, D} -rowcount(c::ColumnTable) = length(c) == 0 ? 
0 : length(c[1]) - # interface implementation istable(::Type{<:ColumnTable}) = true columnaccess(::Type{<:ColumnTable}) = true @@ -181,11 +181,3 @@ function columntable(itr::T) where {T} return columntable(schema(cols), cols) end columntable(x::ColumnTable) = x - -function getrows(x::ColumnTable, inds; view::Union{Bool,Nothing} = nothing) - if view === true - return map(c -> Base.view(c, inds), x) - else - return map(c -> c[inds], x) - end -end diff --git a/test/runtests.jl b/test/runtests.jl index 999a95a..c4819e0 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -145,30 +145,32 @@ end @test Tables.buildcolumns(nothing, rt) == nt @test Tables.columntable(nothing, nt) == nt - @testset "columntable getrows" begin - @test Tables.getrows(nt, 1) == (a=1, b=4.0, c="7") - @test Tables.getrows(nt, 1, view=false) == (a=1, b=4.0, c="7") - @test Tables.getrows(nt, 1, view=nothing) == (a=1, b=4.0, c="7") - @test Tables.getrows(nt, 1:2) == (a=[1,2], b=[4.0, 5.0], c=["7","8"]) - @test Tables.getrows(nt, 1:2, view=false) == (a=[1,2], b=[4.0, 5.0], c=["7","8"]) - @test Tables.getrows(nt, 1:2, view=nothing) == (a=[1,2], b=[4.0, 5.0], c=["7","8"]) + @testset "columntable subset" begin + @test Tables.subset(nt, 1) == (a=1, b=4.0, c="7") + @test Tables.subset(nt, 1, view=false) == (a=1, b=4.0, c="7") + @test Tables.subset(nt, 1, view=nothing) == (a=1, b=4.0, c="7") + @test Tables.subset(nt, 1:2) == (a=[1,2], b=[4.0, 5.0], c=["7","8"]) + @test Tables.subset(nt, 1:2, view=false) == (a=[1,2], b=[4.0, 5.0], c=["7","8"]) + @test Tables.subset(nt, 1:2, view=nothing) == (a=[1,2], b=[4.0, 5.0], c=["7","8"]) + @test_throws ArgumentError Tables.subset(nt, [1:2 1:2]) - @test Tables.getrows(nt, 1, view=true) == (a = fill(1), b = fill(4.0), c = fill("7")) - rs = Tables.getrows(nt, 1:2, view=true) + @test Tables.subset(nt, 1, view=true) == (a=1, b=4.0, c="7") + rs = Tables.subset(nt, 1:2, view=true) @test rs == (a=[1,2], b=[4.0, 5.0], c=["7","8"]) @test rs.a.parent === nt.a end - @testset 
"rowtable getrows" begin - @test Tables.getrows(rt, 1) == (a=1, b=4.0, c="7") - @test Tables.getrows(rt, 1, view=false) == (a=1, b=4.0, c="7") - @test Tables.getrows(rt, 1, view=nothing) == (a=1, b=4.0, c="7") - @test Tables.getrows(rt, 1:2) == [(a=1, b=4.0, c="7"), (a=2, b=5.0, c="8")] - @test Tables.getrows(rt, 1:2, view=false) == [(a=1, b=4.0, c="7"), (a=2, b=5.0, c="8")] - @test Tables.getrows(rt, 1:2, view=nothing) == [(a=1, b=4.0, c="7"), (a=2, b=5.0, c="8")] + @testset "rowtable subset" begin + @test Tables.subset(rt, 1) == (a=1, b=4.0, c="7") + @test Tables.subset(rt, 1, view=false) == (a=1, b=4.0, c="7") + @test Tables.subset(rt, 1, view=nothing) == (a=1, b=4.0, c="7") + @test Tables.subset(rt, 1:2) == [(a=1, b=4.0, c="7"), (a=2, b=5.0, c="8")] + @test Tables.subset(rt, 1:2, view=false) == [(a=1, b=4.0, c="7"), (a=2, b=5.0, c="8")] + @test Tables.subset(rt, 1:2, view=nothing) == [(a=1, b=4.0, c="7"), (a=2, b=5.0, c="8")] + @test_throws ArgumentError Tables.subset(rt, [1:2 1:2]) - @test Tables.getrows(rt, 1, view=true) == fill((a = 1, b = 4.0, c = "7")) - rs = Tables.getrows(rt, 1:2, view=true) + @test Tables.subset(rt, 1, view=true) == (a=1, b=4.0, c="7") + rs = Tables.subset(rt, 1:2, view=true) @test rs == [(a=1, b=4.0, c="7"), (a=2, b=5.0, c="8")] @test rs.parent === rt end @@ -714,6 +716,12 @@ end @test dct.a == [1, 2, 3] @test dct.b == [4.0, 5.0, 6.0] @test dct.c == ["7", "8", "9"] + # Tables.subset: an integer index yields one row, a vector of indices yields a two-row table + drow = Tables.subset(dct, 1) + @test drow.a == 1 && drow.b == 4.0 && drow.c == "7" + drows = Tables.subset(dct, [1, 2]) + @test drows.a == [1, 2] && drows.b == [4.0, 5.0] && drows.c == ["7", "8"] + @test Tables.rowcount(drows) == 2 dct = Tables.dictcolumntable(ct) @test dct.a == [1, 2, 3] @@ -760,6 +768,13 @@ end # https://github.com/JuliaData/Tables.jl/issues/286 dta = Tables.dictcolumntable([(; a="hey"), (; b=2)]).a @test isequal(dta, ["hey", missing]) + # Tables.subset: one row by integer index, then two rows by index vector (copy and view=true) + drow = Tables.subset(drt, 1) + @test drow.a == 1 && drow.b == 2 && drow.c == 3 + 
drows = Tables.subset(drt, [1, 2]) + @test length(drows) == 2 + drowsv = Tables.subset(drt, [1, 2]; view=true) + @test length(drowsv) == 2 end # extremely wide tables