Skip to content

Commit 4446a3d

Browse files
authored
add keep to nonunique, unique, and unique! (#3260)
1 parent 70d1e23 commit 4446a3d

File tree

8 files changed

+566
-365
lines changed

8 files changed

+566
-365
lines changed

NEWS.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@
1717
* Joining functions now support `order` keyword argument allowing the user
1818
to specify the order of the rows in the produced table
1919
([#3233](https://github.com/JuliaData/DataFrames.jl/pull/3233))
20+
* Add `keep` keyword argument to `nonunique`, `unique`, and `unique!`
21+
allowing the user to specify which duplicate rows should be kept
22+
([#3260](https://github.com/JuliaData/DataFrames.jl/pull/3260))
2023

2124
## Bug fixes
2225

src/DataFrames.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,7 @@ include("other/utils.jl")
134134
include("other/index.jl")
135135

136136
include("abstractdataframe/abstractdataframe.jl")
137+
include("abstractdataframe/unique.jl")
137138
include("dataframe/dataframe.jl")
138139
include("subdataframe/subdataframe.jl")
139140
include("dataframerow/dataframerow.jl")

src/abstractdataframe/abstractdataframe.jl

Lines changed: 3 additions & 274 deletions
Original file line numberDiff line numberDiff line change
@@ -1369,278 +1369,6 @@ end
13691369
Base.Array(df::AbstractDataFrame) = Matrix(df)
13701370
Base.Array{T}(df::AbstractDataFrame) where {T} = Matrix{T}(df)
13711371

1372-
"""
1373-
nonunique(df::AbstractDataFrame)
1374-
nonunique(df::AbstractDataFrame, cols)
1375-
1376-
Return a `Vector{Bool}` in which `true` entries indicate duplicate rows.
1377-
A row is a duplicate if there exists a prior row with all columns containing
1378-
equal values (according to `isequal`).
1379-
1380-
See also [`unique`](@ref) and [`unique!`](@ref).
1381-
1382-
# Arguments
1383-
- `df` : `AbstractDataFrame`
1384-
- `cols` : a selector specifying the column(s) or their transformations to compare.
1385-
Can be any column selector or transformation accepted by [`select`](@ref) that
1386-
returns at least one column if `df` has at least one column.
1387-
1388-
# Examples
1389-
1390-
```jldoctest
1391-
julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2])
1392-
4×2 DataFrame
1393-
Row │ i x
1394-
│ Int64 Int64
1395-
─────┼──────────────
1396-
1 │ 1 1
1397-
2 │ 2 2
1398-
3 │ 3 1
1399-
4 │ 4 2
1400-
1401-
julia> df = vcat(df, df)
1402-
8×2 DataFrame
1403-
Row │ i x
1404-
│ Int64 Int64
1405-
─────┼──────────────
1406-
1 │ 1 1
1407-
2 │ 2 2
1408-
3 │ 3 1
1409-
4 │ 4 2
1410-
5 │ 1 1
1411-
6 │ 2 2
1412-
7 │ 3 1
1413-
8 │ 4 2
1414-
1415-
julia> nonunique(df)
1416-
8-element Vector{Bool}:
1417-
0
1418-
0
1419-
0
1420-
0
1421-
1
1422-
1
1423-
1
1424-
1
1425-
1426-
julia> nonunique(df, 2)
1427-
8-element Vector{Bool}:
1428-
0
1429-
0
1430-
1
1431-
1
1432-
1
1433-
1
1434-
1
1435-
1
1436-
```
1437-
"""
1438-
function nonunique(df::AbstractDataFrame)
1439-
ncol(df) == 0 && return Bool[]
1440-
gslots = row_group_slots(ntuple(i -> df[!, i], ncol(df)), Val(true), nothing, false, nothing)[3]
1441-
# unique rows are the first encountered group representatives,
1442-
# nonunique are everything else
1443-
res = fill(true, nrow(df))
1444-
@inbounds for g_row in gslots
1445-
(g_row > 0) && (res[g_row] = false)
1446-
end
1447-
return res
1448-
end
1449-
1450-
function nonunique(df::AbstractDataFrame, cols)
1451-
udf = _try_select_no_copy(df, cols)
1452-
if ncol(df) > 0 && ncol(udf) == 0
1453-
throw(ArgumentError("finding duplicate rows in data frame when " *
1454-
"`cols` selects no columns is not allowed"))
1455-
else
1456-
return nonunique(udf)
1457-
end
1458-
end
1459-
1460-
"""
1461-
allunique(df::AbstractDataFrame, cols=:)
1462-
1463-
Return `true` if no row of `df` is duplicated. Two rows are duplicates if
1464-
all their columns contain equal values (according to `isequal`).
1465-
1466-
See also [`unique`](@ref) and [`nonunique`](@ref).
1467-
1468-
# Arguments
1469-
- `df` : `AbstractDataFrame`
1470-
- `cols` : a selector specifying the column(s) or their transformations to compare.
1471-
Can be any column selector or transformation accepted by [`select`](@ref).
1472-
1473-
# Examples
1474-
1475-
```jldoctest
1476-
julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2])
1477-
4×2 DataFrame
1478-
Row │ i x
1479-
│ Int64 Int64
1480-
─────┼──────────────
1481-
1 │ 1 1
1482-
2 │ 2 2
1483-
3 │ 3 1
1484-
4 │ 4 2
1485-
1486-
julia> allunique(df)
1487-
true
1488-
1489-
julia> allunique(df, :x)
1490-
false
1491-
1492-
julia> allunique(df, :i => ByRow(isodd))
1493-
false
1494-
```
1495-
"""
1496-
function Base.allunique(df::AbstractDataFrame, cols=:)
1497-
udf = _try_select_no_copy(df, cols)
1498-
nrow(udf) == 0 && return true
1499-
return row_group_slots(ntuple(i -> udf[!, i], ncol(udf)),
1500-
Val(false), nothing, false, nothing)[1] == nrow(df)
1501-
end
1502-
1503-
"""
1504-
unique(df::AbstractDataFrame; view::Bool=false)
1505-
unique(df::AbstractDataFrame, cols; view::Bool=false)
1506-
1507-
Return a data frame containing only the first occurrence of unique rows in `df`.
1508-
When `cols` is specified, the returned `DataFrame` contains complete rows,
1509-
retaining in each case the first occurrence of a given combination of values
1510-
in selected columns or their transformations. `cols` can be any column
1511-
selector or transformation accepted by [`select`](@ref).
1512-
1513-
If `view=false` a freshly allocated `DataFrame` is returned,
1514-
and if `view=true` then a `SubDataFrame` view into `df` is returned.
1515-
1516-
# Arguments
1517-
- `df` : the AbstractDataFrame
1518-
- `cols` : column indicator (`Symbol`, `Int`, `Vector{Symbol}`, `Regex`, etc.)
1519-
specifying the column(s) to compare.
1520-
1521-
$METADATA_FIXED
1522-
1523-
See also: [`unique!`](@ref), [`nonunique`](@ref).
1524-
1525-
# Examples
1526-
1527-
```jldoctest
1528-
julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2])
1529-
4×2 DataFrame
1530-
Row │ i x
1531-
│ Int64 Int64
1532-
─────┼──────────────
1533-
1 │ 1 1
1534-
2 │ 2 2
1535-
3 │ 3 1
1536-
4 │ 4 2
1537-
1538-
julia> df = vcat(df, df)
1539-
8×2 DataFrame
1540-
Row │ i x
1541-
│ Int64 Int64
1542-
─────┼──────────────
1543-
1 │ 1 1
1544-
2 │ 2 2
1545-
3 │ 3 1
1546-
4 │ 4 2
1547-
5 │ 1 1
1548-
6 │ 2 2
1549-
7 │ 3 1
1550-
8 │ 4 2
1551-
1552-
julia> unique(df) # doesn't modify df
1553-
4×2 DataFrame
1554-
Row │ i x
1555-
│ Int64 Int64
1556-
─────┼──────────────
1557-
1 │ 1 1
1558-
2 │ 2 2
1559-
3 │ 3 1
1560-
4 │ 4 2
1561-
1562-
julia> unique(df, 2)
1563-
2×2 DataFrame
1564-
Row │ i x
1565-
│ Int64 Int64
1566-
─────┼──────────────
1567-
1 │ 1 1
1568-
2 │ 2 2
1569-
```
1570-
"""
1571-
@inline function Base.unique(df::AbstractDataFrame; view::Bool=false)
1572-
rowidxs = (!).(nonunique(df))
1573-
return view ? Base.view(df, rowidxs, :) : df[rowidxs, :]
1574-
end
1575-
1576-
@inline function Base.unique(df::AbstractDataFrame, cols; view::Bool=false)
1577-
rowidxs = (!).(nonunique(df, cols))
1578-
return view ? Base.view(df, rowidxs, :) : df[rowidxs, :]
1579-
end
1580-
1581-
"""
1582-
unique!(df::AbstractDataFrame)
1583-
unique!(df::AbstractDataFrame, cols)
1584-
1585-
Update `df` in-place to contain only the first occurrence of unique rows in `df`.
1586-
When `cols` is specified, the returned `DataFrame` contains complete rows,
1587-
retaining in each case the first occurrence of a given combination of values
1588-
in selected columns or their transformations. `cols` can be any column
1589-
selector or transformation accepted by [`select`](@ref).
1590-
1591-
# Arguments
1592-
- `df` : the AbstractDataFrame
1593-
- `cols` : column indicator (`Symbol`, `Int`, `Vector{Symbol}`, `Regex`, etc.)
1594-
specifying the column(s) to compare.
1595-
1596-
$METADATA_FIXED
1597-
1598-
See also: [`unique`](@ref), [`nonunique`](@ref).
1599-
1600-
# Examples
1601-
1602-
```jldoctest
1603-
julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2])
1604-
4×2 DataFrame
1605-
Row │ i x
1606-
│ Int64 Int64
1607-
─────┼──────────────
1608-
1 │ 1 1
1609-
2 │ 2 2
1610-
3 │ 3 1
1611-
4 │ 4 2
1612-
1613-
julia> df = vcat(df, df)
1614-
8×2 DataFrame
1615-
Row │ i x
1616-
│ Int64 Int64
1617-
─────┼──────────────
1618-
1 │ 1 1
1619-
2 │ 2 2
1620-
3 │ 3 1
1621-
4 │ 4 2
1622-
5 │ 1 1
1623-
6 │ 2 2
1624-
7 │ 3 1
1625-
8 │ 4 2
1626-
1627-
julia> unique!(df) # modifies df
1628-
4×2 DataFrame
1629-
Row │ i x
1630-
│ Int64 Int64
1631-
─────┼──────────────
1632-
1 │ 1 1
1633-
2 │ 2 2
1634-
3 │ 3 1
1635-
4 │ 4 2
1636-
```
1637-
"""
1638-
Base.unique!(df::AbstractDataFrame) = deleteat!(df, _findall(nonunique(df)))
1639-
Base.unique!(df::AbstractDataFrame, cols::AbstractVector) =
1640-
deleteat!(df, _findall(nonunique(df, cols)))
1641-
Base.unique!(df::AbstractDataFrame, cols) =
1642-
deleteat!(df, _findall(nonunique(df, cols)))
1643-
16441372
"""
16451373
fillcombinations(df::AbstractDataFrame, indexcols;
16461374
allowduplicates::Bool=false,
@@ -1703,8 +1431,9 @@ function fillcombinations(df::AbstractDataFrame, indexcols;
17031431
"must be specified"))
17041432
end
17051433

1706-
has_duplicates = row_group_slots(ntuple(i -> df[!, colind[i]], length(colind)),
1707-
Val(false), nothing, false, nothing)[1] != nrow(df)
1434+
# we use the hashing algorithm here because we assume that the tables we work with are not huge
1435+
has_duplicates = row_group_slots!(ntuple(i -> df[!, colind[i]], length(colind)),
1436+
Val(false), nothing, false, nothing, true)[1] != nrow(df)
17081437
if has_duplicates && !allowduplicates
17091438
throw(ArgumentError("duplicate combinations of `indexcols` are not " *
17101439
"allowed in input when `allowduplicates=false`"))

0 commit comments

Comments
 (0)