@@ -190,21 +190,19 @@ struct DataFrame <: AbstractDataFrame
190190
191191 # we write into columns as we know that it is guaranteed
192192 # that it was freshly allocated in the outer constructor
193- for (i, col) in enumerate (columns)
194- # check for vectors first as they are most common
195- if col isa AbstractRange
196- columns[i] = collect (col)
197- elseif col isa AbstractVector
198- columns[i] = copycols ? copy (col) : col
199- elseif col isa Union{AbstractArray{<: Any , 0 }, Ref}
200- x = col[]
201- columns[i] = fill! (Tables. allocatecolumn (typeof (x), len), x)
193+ @static if VERSION >= v " 1.4"
194+ if copycols && len >= 1_000_000 && length (columns) > 1 && Threads. nthreads () > 1
195+ @sync for i in eachindex (columns)
196+ Threads. @spawn columns[i] = _preprocess_column (columns[i], len, copycols)
197+ end
202198 else
203- if col isa AbstractArray
204- throw (ArgumentError (" adding AbstractArray other than AbstractVector " *
205- " as a column of a data frame is not allowed" ))
199+ for i in eachindex (columns)
200+ columns[i] = _preprocess_column (columns[i], len, copycols)
206201 end
207- columns[i] = fill! (Tables. allocatecolumn (typeof (col), len), col)
202+ end
203+ else
204+ for i in eachindex (columns)
205+ columns[i] = _preprocess_column (columns[i], len, copycols)
208206 end
209207 end
210208
@@ -216,6 +214,22 @@ struct DataFrame <: AbstractDataFrame
216214 end
217215end
218216
"""
    _preprocess_column(col, len::Integer, copycols::Bool)

Convert `col` into the object stored as a column of a data frame with `len` rows.

- An `AbstractRange` is materialized with `collect`.
- Any other `AbstractVector` is copied when `copycols` is `true` and returned
  as-is otherwise.
- A zero-dimensional `AbstractArray` or a `Ref` is unwrapped and the value it
  holds is used to fill a newly allocated column of length `len`.
- Any other `AbstractArray` is rejected with an `ArgumentError`.
- Every remaining value is used to fill a newly allocated column of length `len`.
"""
function _preprocess_column(col::Any, len::Integer, copycols::Bool)
    # ranges are always materialized, independently of `copycols`
    col isa AbstractRange && return collect(col)

    if col isa AbstractVector
        copycols || return col
        return copy(col)
    end

    if col isa Union{AbstractArray{<:Any, 0}, Ref}
        value = col[]
        return fill!(Tables.allocatecolumn(typeof(value), len), value)
    end

    if col isa AbstractArray
        throw(ArgumentError("adding AbstractArray other than AbstractVector " *
                            "as a column of a data frame is not allowed"))
    end

    return fill!(Tables.allocatecolumn(typeof(col), len), col)
end
232+
# Build a `DataFrame` from an existing one by delegating to `copy`,
# forwarding the `copycols` keyword unchanged.
function DataFrame(df::DataFrame; copycols::Bool=true)
    return copy(df; copycols=copycols)
end
220234
221235function DataFrame (pairs:: Pair{Symbol, <:Any} ...; makeunique:: Bool = false ,
@@ -502,34 +516,75 @@ end
502516 throw (BoundsError (df, (row_inds, col_inds)))
503517 end
504518 selected_columns = index (df)[col_inds]
505- # Computing integer indices once for all columns is faster
506- selected_rows = T === Bool ? findall (row_inds) : row_inds
507- new_columns = AbstractVector[dv[selected_rows] for dv in _columns (df)[selected_columns]]
508- return DataFrame (new_columns, Index (_names (df)[selected_columns]), copycols= false )
519+
520+ u = _names (df)[selected_columns]
521+ lookup = Dict {Symbol, Int} (zip (u, 1 : length (u)))
522+ # use this constructor to avoid checking twice if column names are not
523+ # duplicate as index(df)[col_inds] already checks this
524+ idx = Index (lookup, u)
525+
526+ if length (selected_columns) == 1
527+ return DataFrame (AbstractVector[_columns (df)[selected_columns[1 ]][row_inds]],
528+ idx, copycols= false )
529+ else
530+ # Computing integer indices once for all columns is faster
531+ selected_rows = T === Bool ? findall (row_inds) : row_inds
532+ @static if VERSION >= v " 1.4"
533+ if length (selected_rows) >= 1_000_000 && Threads. nthreads () > 1
534+ new_columns = Vector {AbstractVector} (undef, length (selected_columns))
535+ @sync for i in eachindex (new_columns)
536+ Threads. @spawn new_columns[i] = _columns (df)[selected_columns[i]][selected_rows]
537+ end
538+ return DataFrame (new_columns, idx, copycols= false )
539+ else
540+ return DataFrame (AbstractVector[_columns (df)[i][selected_rows] for i in selected_columns],
541+ idx, copycols= false )
542+ end
543+ else
544+ return DataFrame (AbstractVector[_columns (df)[i][selected_rows] for i in selected_columns],
545+ idx, copycols= false )
546+ end
547+ end
509548end
510549
# df[rows, :] => DataFrame
@inline function Base.getindex(df::DataFrame, row_inds::AbstractVector{T}, ::Colon) where T
    @boundscheck checkindex(Bool, axes(df, 1), row_inds) ||
        throw(BoundsError(df, (row_inds, :)))

    new_index = copy(index(df))
    cols = _columns(df)

    # A single column is indexed directly with `row_inds`; the conversion of a
    # Bool mask to integer indices below is only done when it can be shared
    # between several columns.
    if ncol(df) == 1
        return DataFrame(AbstractVector[cols[1][row_inds]], new_index, copycols=false)
    end

    # Computing integer indices once for all columns is faster
    rows = T === Bool ? findall(row_inds) : row_inds

    @static if VERSION >= v"1.4"
        # On Julia 1.4+ spawn one task per column when the selection is large
        # and more than one thread is available.
        if length(rows) >= 1_000_000 && Threads.nthreads() > 1
            out = Vector{AbstractVector}(undef, ncol(df))
            @sync for i in eachindex(out)
                Threads.@spawn out[i] = cols[i][rows]
            end
            return DataFrame(out, new_index, copycols=false)
        end
    end

    # Sequential path: older Julia versions, small selections, or one thread.
    return DataFrame(AbstractVector[col[rows] for col in cols],
                     new_index, copycols=false)
end
520578
# df[Not(...), MultiColumnIndex] => DataFrame
@inline function Base.getindex(df::DataFrame, row_inds::Not, col_inds::MultiColumnIndex)
    # Resolve the `Not` selector against the row axis first, then index `df`
    # again with the resulting rows.
    kept_rows = axes(df, 1)[row_inds]
    return df[kept_rows, col_inds]
end
524581
# df[:, MultiColumnIndex] => DataFrame
# `:` as the row selector delegates to `select` with `copycols=true`.
function Base.getindex(df::DataFrame, row_ind::Colon, col_inds::MultiColumnIndex)
    return select(df, col_inds; copycols=true)
end
529585
# df[!, MultiColumnIndex] => DataFrame
# `!` as the row selector delegates to `select` with `copycols=false`.
function Base.getindex(df::DataFrame, row_ind::typeof(!), col_inds::MultiColumnIndex)
    return select(df, col_inds; copycols=false)
end
534589
535590# #############################################################################
@@ -875,11 +930,7 @@ copies of column vectors in `df`.
875930If `copycols=false`, return a new `DataFrame` sharing column vectors with `df`.
876931"""
function Base.copy(df::DataFrame; copycols::Bool=true)
    # The vector holding the columns and the index are always duplicated here;
    # `copycols` is forwarded to the constructor, which handles the individual
    # column vectors.
    columns = copy(_columns(df))
    new_index = copy(index(df))
    return DataFrame(columns, new_index, copycols=copycols)
end
884935
885936"""
0 commit comments