@@ -1369,278 +1369,6 @@ end
function Base.Array(df::AbstractDataFrame)
    # converting a data frame to an `Array` delegates to the `Matrix` constructor
    return Matrix(df)
end

function Base.Array{T}(df::AbstractDataFrame) where {T}
    # same delegation, forwarding the requested element type `T`
    return Matrix{T}(df)
end
13711371
1372- """
1373- nonunique(df::AbstractDataFrame)
1374- nonunique(df::AbstractDataFrame, cols)
1375-
1376- Return a `Vector{Bool}` in which `true` entries indicate duplicate rows.
1377- A row is a duplicate if there exists a prior row with all columns containing
1378- equal values (according to `isequal`).
1379-
1380- See also [`unique`](@ref) and [`unique!`](@ref).
1381-
1382- # Arguments
1383- - `df` : `AbstractDataFrame`
1384- - `cols` : a selector specifying the column(s) or their transformations to compare.
1385- Can be any column selector or transformation accepted by [`select`](@ref) that
1386- returns at least one column if `df` has at least one column.
1387-
1388- # Examples
1389-
1390- ```jldoctest
1391- julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2])
1392- 4×2 DataFrame
1393- Row │ i x
1394- │ Int64 Int64
1395- ─────┼──────────────
1396- 1 │ 1 1
1397- 2 │ 2 2
1398- 3 │ 3 1
1399- 4 │ 4 2
1400-
1401- julia> df = vcat(df, df)
1402- 8×2 DataFrame
1403- Row │ i x
1404- │ Int64 Int64
1405- ─────┼──────────────
1406- 1 │ 1 1
1407- 2 │ 2 2
1408- 3 │ 3 1
1409- 4 │ 4 2
1410- 5 │ 1 1
1411- 6 │ 2 2
1412- 7 │ 3 1
1413- 8 │ 4 2
1414-
1415- julia> nonunique(df)
1416- 8-element Vector{Bool}:
1417- 0
1418- 0
1419- 0
1420- 0
1421- 1
1422- 1
1423- 1
1424- 1
1425-
1426- julia> nonunique(df, 2)
1427- 8-element Vector{Bool}:
1428- 0
1429- 0
1430- 1
1431- 1
1432- 1
1433- 1
1434- 1
1435- 1
1436- ```
1437- """
function nonunique(df::AbstractDataFrame)
    # with no columns there is nothing to compare: return an empty mask
    if ncol(df) == 0
        return Bool[]
    end
    columns = ntuple(i -> df[!, i], ncol(df))
    # the third element returned by `row_group_slots` is the collection iterated below
    slots = row_group_slots(columns, Val(true), nothing, false, nothing)[3]
    # every row starts out flagged as a duplicate; rows referenced by a positive
    # slot entry (the first encountered group representatives) are then cleared
    isdup = fill(true, nrow(df))
    @inbounds for rowidx in slots
        if rowidx > 0
            isdup[rowidx] = false
        end
    end
    return isdup
end
1449-
# Variant restricted to `cols`: duplicates are detected on the result of selecting `cols`.
function nonunique(df::AbstractDataFrame, cols)
    selected = _try_select_no_copy(df, cols)
    # an empty selection is rejected unless `df` itself has no columns
    if ncol(selected) == 0 && ncol(df) > 0
        throw(ArgumentError(" finding duplicate rows in data frame when " *
                            " `cols` selects no columns is not allowed"))
    end
    return nonunique(selected)
end
1459-
1460- """
1461- allunique(df::AbstractDataFrame, cols=:)
1462-
1463- Return `true` if none of the rows of `df` are duplicated. Two rows are duplicates if
1464- all their columns contain equal values (according to `isequal`).
1465-
1466- See also [`unique`](@ref) and [`nonunique`](@ref).
1467-
1468- # Arguments
1469- - `df` : `AbstractDataFrame`
1470- - `cols` : a selector specifying the column(s) or their transformations to compare.
1471- Can be any column selector or transformation accepted by [`select`](@ref).
1472-
1473- # Examples
1474-
1475- ```jldoctest
1476- julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2])
1477- 4×2 DataFrame
1478- Row │ i x
1479- │ Int64 Int64
1480- ─────┼──────────────
1481- 1 │ 1 1
1482- 2 │ 2 2
1483- 3 │ 3 1
1484- 4 │ 4 2
1485-
1486- julia> allunique(df)
1487- true
1488-
1489- julia> allunique(df, :x)
1490- false
1491-
1492- julia> allunique(df, :i => ByRow(isodd))
1493- false
1494- ```
1495- """
function Base.allunique(df::AbstractDataFrame, cols=:)
    selected = _try_select_no_copy(df, cols)
    # a selection without rows cannot contain duplicates
    if nrow(selected) == 0
        return true
    end
    columns = ntuple(i -> selected[!, i], ncol(selected))
    # first element returned by `row_group_slots`; presumably the number of
    # distinct rows (TODO confirm against its definition)
    ngroups = row_group_slots(columns, Val(false), nothing, false, nothing)[1]
    return ngroups == nrow(df)
end
1502-
1503- """
1504- unique(df::AbstractDataFrame; view::Bool=false)
1505- unique(df::AbstractDataFrame, cols; view::Bool=false)
1506-
1507- Return a data frame containing only the first occurrence of unique rows in `df`.
1508- When `cols` is specified, the returned `DataFrame` contains complete rows,
1509- retaining in each case the first occurrence of a given combination of values
1510- in selected columns or their transformations. `cols` can be any column
1511- selector or transformation accepted by [`select`](@ref).
1512-
1513- If `view=false` a freshly allocated `DataFrame` is returned,
1514- and if `view=true` then a `SubDataFrame` view into `df` is returned.
1515-
1516- # Arguments
1517- - `df` : the AbstractDataFrame
1518- - `cols` : column indicator (`Symbol`, `Int`, `Vector{Symbol}`, `Regex`, etc.)
1519- specifying the column(s) to compare.
1520-
1521- $METADATA_FIXED
1522-
1523- See also: [`unique!`](@ref), [`nonunique`](@ref).
1524-
1525- # Examples
1526-
1527- ```jldoctest
1528- julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2])
1529- 4×2 DataFrame
1530- Row │ i x
1531- │ Int64 Int64
1532- ─────┼──────────────
1533- 1 │ 1 1
1534- 2 │ 2 2
1535- 3 │ 3 1
1536- 4 │ 4 2
1537-
1538- julia> df = vcat(df, df)
1539- 8×2 DataFrame
1540- Row │ i x
1541- │ Int64 Int64
1542- ─────┼──────────────
1543- 1 │ 1 1
1544- 2 │ 2 2
1545- 3 │ 3 1
1546- 4 │ 4 2
1547- 5 │ 1 1
1548- 6 │ 2 2
1549- 7 │ 3 1
1550- 8 │ 4 2
1551-
1552- julia> unique(df) # doesn't modify df
1553- 4×2 DataFrame
1554- Row │ i x
1555- │ Int64 Int64
1556- ─────┼──────────────
1557- 1 │ 1 1
1558- 2 │ 2 2
1559- 3 │ 3 1
1560- 4 │ 4 2
1561-
1562- julia> unique(df, 2)
1563- 2×2 DataFrame
1564- Row │ i x
1565- │ Int64 Int64
1566- ─────┼──────────────
1567- 1 │ 1 1
1568- 2 │ 2 2
1569- ```
1570- """
@inline function Base.unique(df::AbstractDataFrame; view::Bool=false)
    # keep exactly the rows that `nonunique` does not flag
    keep = .!nonunique(df)
    if view
        # the `view` keyword shadows the function name, hence the qualified call
        return Base.view(df, keep, :)
    end
    return df[keep, :]
end
1575-
# Variant that decides uniqueness on `cols` only; complete rows of `df` are returned.
@inline function Base.unique(df::AbstractDataFrame, cols; view::Bool=false)
    # keep exactly the rows that `nonunique(df, cols)` does not flag
    keep = .!nonunique(df, cols)
    if view
        # the `view` keyword shadows the function name, hence the qualified call
        return Base.view(df, keep, :)
    end
    return df[keep, :]
end
1580-
1581- """
1582- unique!(df::AbstractDataFrame)
1583- unique!(df::AbstractDataFrame, cols)
1584-
1585- Update `df` in-place to contain only the first occurrence of unique rows in `df`.
1586- When `cols` is specified, the returned `DataFrame` contains complete rows,
1587- retaining in each case the first occurrence of a given combination of values
1588- in selected columns or their transformations. `cols` can be any column
1589- selector or transformation accepted by [`select`](@ref).
1590-
1591- # Arguments
1592- - `df` : the AbstractDataFrame
1593- - `cols` : column indicator (`Symbol`, `Int`, `Vector{Symbol}`, `Regex`, etc.)
1594- specifying the column(s) to compare.
1595-
1596- $METADATA_FIXED
1597-
1598- See also: [`unique`](@ref), [`nonunique`](@ref).
1599-
1600- # Examples
1601-
1602- ```jldoctest
1603- julia> df = DataFrame(i=1:4, x=[1, 2, 1, 2])
1604- 4×2 DataFrame
1605- Row │ i x
1606- │ Int64 Int64
1607- ─────┼──────────────
1608- 1 │ 1 1
1609- 2 │ 2 2
1610- 3 │ 3 1
1611- 4 │ 4 2
1612-
1613- julia> df = vcat(df, df)
1614- 8×2 DataFrame
1615- Row │ i x
1616- │ Int64 Int64
1617- ─────┼──────────────
1618- 1 │ 1 1
1619- 2 │ 2 2
1620- 3 │ 3 1
1621- 4 │ 4 2
1622- 5 │ 1 1
1623- 6 │ 2 2
1624- 7 │ 3 1
1625- 8 │ 4 2
1626-
1627- julia> unique!(df) # modifies df
1628- 4×2 DataFrame
1629- Row │ i x
1630- │ Int64 Int64
1631- ─────┼──────────────
1632- 1 │ 1 1
1633- 2 │ 2 2
1634- 3 │ 3 1
1635- 4 │ 4 2
1636- ```
1637- """
function Base.unique!(df::AbstractDataFrame)
    # delete, in place, every row flagged by `nonunique`
    return deleteat!(df, _findall(nonunique(df)))
end

function Base.unique!(df::AbstractDataFrame, cols::AbstractVector)
    # same body as the untyped `cols` method below; presumably kept as a separate
    # method to avoid a dispatch ambiguity with Base's `unique!` — TODO confirm
    return deleteat!(df, _findall(nonunique(df, cols)))
end

function Base.unique!(df::AbstractDataFrame, cols)
    # delete, in place, every row flagged by `nonunique(df, cols)`
    return deleteat!(df, _findall(nonunique(df, cols)))
end
1643-
16441372"""
16451373 fillcombinations(df::AbstractDataFrame, indexcols;
16461374 allowduplicates::Bool=false,
@@ -1703,8 +1431,9 @@ function fillcombinations(df::AbstractDataFrame, indexcols;
17031431 " must be specified" ))
17041432 end
17051433
1706- has_duplicates = row_group_slots (ntuple (i -> df[! , colind[i]], length (colind)),
1707- Val (false ), nothing , false , nothing )[1 ] != nrow (df)
1434+ # we use the hashing algorithm here, because we assume that the tables we work with are not huge
1435+ has_duplicates = row_group_slots! (ntuple (i -> df[! , colind[i]], length (colind)),
1436+ Val (false ), nothing , false , nothing , true )[1 ] != nrow (df)
17081437 if has_duplicates && ! allowduplicates
17091438 throw (ArgumentError (" duplicate combinations of `indexcols` are not " *
17101439 " allowed in input when `allowduplicates=false`" ))
0 commit comments