JuliaData · quinnj · Sep 6, 2022 · Sep 6, 2022 · bkamins · Sep 6, 2022
diff --git a/src/Tables.jl b/src/Tables.jl
@@ -569,15 +569,15 @@ struct Partitioner{T}
 end
 
 """
-    Tables.getrows(x, inds; view=nothing)
+    Tables.subset(x, inds; view=nothing)
 
 Return one or more rows from table `x` according to the position(s) specified by `inds`:
 
 - If `inds` is a single non-boolean integer return a row object.
-- If `inds` is a vector of non-boolean integers, a vector of booleans, or a `:`, return an indexable object of rows. 
+- If `inds` is a vector of non-boolean integers, a vector of booleans, or a `:`, return a subset of the original table according to the indices.
   In this case, the returned type is not necessarily the same as the original table type.
 
-If other type of `inds` is passed than specified above the behavior is undefined.
+If `inds` is of any type other than those specified above, the behavior is undefined.
 
 The `view` argument influences whether the returned object is a view of the original table
 or an independent copy:
@@ -587,11 +587,37 @@ or an independent copy:
 - If `view=true` then a view is returned and if `view=false` a copy is returned.
   This applies both to returning a row or a table.
 
-Any specialized implementation of `getrows` must support the `view=nothing` argument. 
+Any specialized implementation of `subset` must support the `view=nothing` argument.
 Support for `view=true` or `view=false` is optional
 (i.e. implementations might error on them if they are not supported).
 """
-function getrows end
+function subset(x::T, inds; view::Union{Bool, Nothing}=nothing) where {T}
+    # Generic fallback: reaching this method means `x` did not define its own `Tables.subset`.
+    # Column-access sources: a single integer yields a `ColumnsRow`; any other `inds` is applied
+    # to every column and the results are wrapped in a `DictColumnTable`.
+    if columnaccess(x)
+        cols = columns(x)
+        inds isa Integer && return ColumnsRow(cols, inds)
+        ret = view === true ? _map(c -> Base.view(c, inds), cols) : _map(c -> c[inds], cols)
+        return DictColumnTable(schema(cols), ret)
+    end
+    # Otherwise fall back to the rows, which can only be indexed when they form an `AbstractVector`.
+    r = rows(x)
+    if r isa AbstractVector
+        inds isa Integer && return r[inds]
+        # Index the rows vector `r`, not `x` itself: only `r` is known to be an `AbstractVector`,
+        # and `x` need not support `getindex`/`view` at all.
+        ret = view === true ? Base.view(r, inds) : r[inds]
+        (ret isa AbstractVector) || throw(ArgumentError("`Tables.subset`: invalid `inds` argument, expected `AbstractVector` output, got $(typeof(ret))"))
+        return ret
+    end
+    throw(ArgumentError("no default `Tables.subset` implementation for type: $T"))
+end
+
+vectorcheck(col::AbstractVector) = col
+vectorcheck(other) = throw(ArgumentError("`Tables.subset`: invalid `inds` argument, expected `AbstractVector` output, got $(typeof(other))"))
+_map(f, cols) = OrderedDict(name => (vectorcheck ∘ f)(getcolumn(cols, name)) for name in columnnames(cols))
+
 
 """
     Tables.partitioner(f, itr)

diff --git a/src/dicts.jl b/src/dicts.jl
@@ -1,3 +1,4 @@
+# Dict of Vectors as table
 struct DictColumnTable <: AbstractColumns
     schema::Schema
     values::OrderedDict{Symbol, AbstractVector}
@@ -94,6 +95,7 @@ columnnames(x::DictColumnTable) = getfield(x, :schema).names
 getcolumn(x::DictColumnTable, i::Int) = getfield(x, :values)[columnnames(x)[i]]
 getcolumn(x::DictColumnTable, nm::Symbol) = getfield(x, :values)[nm]
 
+# Vector of Dicts as table
 struct DictRowTable
     names::Vector{Symbol}
     types::Dict{Symbol, Type}
@@ -122,6 +124,16 @@ function Base.iterate(x::DictRowTable, st=1)
     return DictRow(x.names, x.values[st]), st + 1
 end
 
+function subset(x::DictRowTable, inds; view::Union{Bool,Nothing} = nothing)
+    rowdicts = getfield(x, :values)
+    # A single row is always materialized: `Base.view` with an integer index is a 0-dim array, not a row.
+    inds isa Integer && return DictRow(getfield(x, :names), rowdicts[inds])
+    vals = view === true ? Base.view(rowdicts, inds) : rowdicts[inds]
+    # Report the type of the rejected result itself, so bad `inds` surface as an `ArgumentError`.
+    vals isa AbstractVector || throw(ArgumentError("`Tables.subset`: invalid `inds` argument, expected `RowTable` output, got $(typeof(vals))"))
+    return DictRowTable(getfield(x, :names), getfield(x, :types), vals)
+end
+
 """
     Tables.dictrowtable(x) => Tables.DictRowTable
 

diff --git a/src/namedtuples.jl b/src/namedtuples.jl
@@ -106,18 +106,18 @@ function rowtable(itr::T) where {T}
     return collect(namedtupleiterator(eltype(r), r))
 end
 
-function getrows(x::RowTable, inds; view::Union{Bool,Nothing} = nothing)
-    if view === true
-        return Base.view(x, inds)
+# NamedTuple of arrays of matching dimensionality
+const ColumnTable = NamedTuple{names, T} where {names, T <: NTuple{N, AbstractVector{S} where S}} where {N}
+rowcount(tbl::ColumnTable) = isempty(tbl) ? 0 : length(first(tbl))
+
+function subset(x::ColumnTable, inds; view::Union{Bool,Nothing}=nothing)  # `Tables.subset` for a NamedTuple of vectors
+    if inds isa Integer  # single row: index every column, giving a NamedTuple of elements (`view` is not consulted)
+        return map(c -> c[inds], x)
     else
-        return x[inds]
+        return view === true ? map(c -> vectorcheck(Base.view(c, inds)), x) : map(c -> vectorcheck(c[inds]), x)  # views only when `view === true`; `vectorcheck` validates each column result
     end
 end
 
-# NamedTuple of arrays of matching dimensionality
-const ColumnTable = NamedTuple{names, T} where {names, T <: NTuple{N, AbstractArray{S, D} where S}} where {N, D}
-rowcount(c::ColumnTable) = length(c) == 0 ? 0 : length(c[1])
-
 # interface implementation
 istable(::Type{<:ColumnTable}) = true
 columnaccess(::Type{<:ColumnTable}) = true
@@ -181,11 +181,3 @@ function columntable(itr::T) where {T}
     return columntable(schema(cols), cols)
 end
 columntable(x::ColumnTable) = x
-
-function getrows(x::ColumnTable, inds; view::Union{Bool,Nothing} = nothing)
-    if view === true
-        return map(c -> Base.view(c, inds), x)
-    else
-        return map(c -> c[inds], x)
-    end
-end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -145,30 +145,32 @@ end
     @test Tables.buildcolumns(nothing, rt) == nt
     @test Tables.columntable(nothing, nt) == nt
 
-    @testset "columntable getrows" begin
-        @test Tables.getrows(nt, 1) == (a=1, b=4.0, c="7")
-        @test Tables.getrows(nt, 1, view=false) == (a=1, b=4.0, c="7")
-        @test Tables.getrows(nt, 1, view=nothing) == (a=1, b=4.0, c="7")
-        @test Tables.getrows(nt, 1:2) == (a=[1,2], b=[4.0, 5.0], c=["7","8"])
-        @test Tables.getrows(nt, 1:2, view=false) == (a=[1,2], b=[4.0, 5.0], c=["7","8"])
-        @test Tables.getrows(nt, 1:2, view=nothing) == (a=[1,2], b=[4.0, 5.0], c=["7","8"])
+    @testset "columntable subset" begin
+        @test Tables.subset(nt, 1) == (a=1, b=4.0, c="7")
+        @test Tables.subset(nt, 1, view=false) == (a=1, b=4.0, c="7")
+        @test Tables.subset(nt, 1, view=nothing) == (a=1, b=4.0, c="7")
+        @test Tables.subset(nt, 1:2) == (a=[1,2], b=[4.0, 5.0], c=["7","8"])
+        @test Tables.subset(nt, 1:2, view=false) == (a=[1,2], b=[4.0, 5.0], c=["7","8"])
+        @test Tables.subset(nt, 1:2, view=nothing) == (a=[1,2], b=[4.0, 5.0], c=["7","8"])
+        @test_throws ArgumentError Tables.subset(nt, [1:2 1:2])
 
-        @test Tables.getrows(nt, 1, view=true) == (a = fill(1), b = fill(4.0), c = fill("7"))
-        rs = Tables.getrows(nt, 1:2, view=true)
+        @test Tables.subset(nt, 1, view=true) == (a=1, b=4.0, c="7")
+        rs = Tables.subset(nt, 1:2, view=true)
         @test rs == (a=[1,2], b=[4.0, 5.0], c=["7","8"])
         @test rs.a.parent === nt.a
     end
 
-    @testset "rowtable getrows" begin
-        @test Tables.getrows(rt, 1) == (a=1, b=4.0, c="7")
-        @test Tables.getrows(rt, 1, view=false) == (a=1, b=4.0, c="7")
-        @test Tables.getrows(rt, 1, view=nothing) == (a=1, b=4.0, c="7")
-        @test Tables.getrows(rt, 1:2) == [(a=1, b=4.0, c="7"), (a=2, b=5.0, c="8")]
-        @test Tables.getrows(rt, 1:2, view=false) == [(a=1, b=4.0, c="7"), (a=2, b=5.0, c="8")]
-        @test Tables.getrows(rt, 1:2, view=nothing) == [(a=1, b=4.0, c="7"), (a=2, b=5.0, c="8")]
+    @testset "rowtable subset" begin
+        @test Tables.subset(rt, 1) == (a=1, b=4.0, c="7")
+        @test Tables.subset(rt, 1, view=false) == (a=1, b=4.0, c="7")
+        @test Tables.subset(rt, 1, view=nothing) == (a=1, b=4.0, c="7")
+        @test Tables.subset(rt, 1:2) == [(a=1, b=4.0, c="7"), (a=2, b=5.0, c="8")]
+        @test Tables.subset(rt, 1:2, view=false) == [(a=1, b=4.0, c="7"), (a=2, b=5.0, c="8")]
+        @test Tables.subset(rt, 1:2, view=nothing) == [(a=1, b=4.0, c="7"), (a=2, b=5.0, c="8")]
+        @test_throws ArgumentError Tables.subset(rt, [1:2 1:2])
 
-        @test Tables.getrows(rt, 1, view=true) == fill((a = 1, b = 4.0, c = "7"))
-        rs = Tables.getrows(rt, 1:2, view=true)
+        @test Tables.subset(rt, 1, view=true) == (a=1, b=4.0, c="7")
+        rs = Tables.subset(rt, 1:2, view=true)
         @test rs == [(a=1, b=4.0, c="7"), (a=2, b=5.0, c="8")]
         @test rs.parent === rt
     end
@@ -714,6 +716,12 @@ end
     @test dct.a == [1, 2, 3]
     @test dct.b == [4.0, 5.0, 6.0]
     @test dct.c == ["7", "8", "9"]
+    # Tables.subset
+    drow = Tables.subset(dct, 1)
+    @test drow.a == 1 && drow.b == 4.0 && drow.c == "7"
+    drows = Tables.subset(dct, [1, 2])
+    @test drows.a == [1, 2] && drows.b == [4.0, 5.0] && drows.c == ["7", "8"]
+    @test Tables.rowcount(drows) == 2
 
     dct = Tables.dictcolumntable(ct)
     @test dct.a == [1, 2, 3]
@@ -760,6 +768,13 @@ end
     # https://github.com/JuliaData/Tables.jl/issues/286
     dta = Tables.dictcolumntable([(; a="hey"), (; b=2)]).a
     @test isequal(dta, ["hey", missing])
+    # Tables.subset
+    drow = Tables.subset(drt, 1)
+    @test drow.a == 1 && drow.b == 2 && drow.c == 3
+    drows = Tables.subset(drt, [1, 2])
+    @test length(drows) == 2
+    drowsv = Tables.subset(drt, [1, 2]; view=true)
+    @test length(drowsv) == 2
 end
 
 # extremely wide tables