JuliaParallel
diff --git a/‎docs/src/dtable.md‎
Lines changed: 69 additions & 15 deletions b/‎docs/src/dtable.md‎
Lines changed: 69 additions & 15 deletions
diff --git a/‎src/table/dataframes_interface_utils.jl‎
Lines changed: 1 addition & 1 deletion b/‎src/table/dataframes_interface_utils.jl‎
Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ Provide a `Tables.jl` compatible source, as well as a `chunksize`, which is the
 maximum number of rows of each partition:
 
 ```julia
-julia> using Dagger
+julia> using DTables
 
 julia> table = (a=[1, 2, 3, 4, 5], b=[6, 7, 8, 9, 10]);
 
@@ -28,7 +28,7 @@ Provide a `loader_function` and a list of filenames, which are parts of the
 full table:
 
 ```julia
-julia> using Dagger, CSV
+julia> using DTables, CSV
 
 julia> files = ["1.csv", "2.csv", "3.csv"];
 
@@ -87,6 +87,60 @@ julia> fetch(d, NamedTuple)
 (a = [1, 2, 3, 4, 5], b = [6, 7, 8, 9, 10])
 ```
 
+## Behavior of the `interpartition_merges` kwarg
+
+If a source supports the `Tables.partitions` interface, the `DTable`
+will adopt the partition sizes of the source. However, if you explicitly
+specify a `chunksize`, the `DTable` will attempt to create chunks of exactly
+that size, even if that means merging data between partitions.
+This behavior is controlled by the `interpartition_merges` kwarg (`true` by default).
+When it is set to `false`, partitions larger than `chunksize` are still split,
+but data from different partitions is never merged into a single chunk.
+The difference is best illustrated by the following example.
+
+```julia
+julia> using DTables, CSV
+
+julia> DTable(CSV.Chunks("test.csv", ntasks=4)) |> DTables.chunk_lengths
+4-element Vector{Int64}:
+ 249995
+ 250005
+ 249995
+ 250005
+
+julia> DTable(CSV.Chunks("test.csv", ntasks=4), 200_000) |> DTables.chunk_lengths
+5-element Vector{Int64}:
+ 200000
+ 200000
+ 200000
+ 200000
+ 200000
+
+julia> DTable(CSV.Chunks("test.csv", ntasks=4), 200_000, interpartition_merges=false) |> DTables.chunk_lengths
+8-element Vector{Int64}:
+ 200000
+  49995
+ 200000
+  50005
+ 200000
+  49995
+ 200000
+  50005
+
+julia> DTable(CSV.Chunks("test.csv", ntasks=4), 300_000) |> DTables.chunk_lengths
+4-element Vector{Int64}:
+ 300000
+ 300000
+ 300000
+ 100000
+
+julia> DTable(CSV.Chunks("test.csv", ntasks=4), 300_000, interpartition_merges=false) |> DTables.chunk_lengths
+4-element Vector{Int64}:
+ 249995
+ 250005
+ 249995
+ 250005
+
+```
+
 # Table operations
 
 **Warning: this interface is experimental and may change at any time**
@@ -99,7 +153,7 @@ Below is an example of their usage.
 For more information please refer to the API documentation and unit tests.
 
 ```julia
-julia> using Dagger
+julia> using DTables
 
 julia> d = DTable((k = repeat(['a', 'b'], 500), v = repeat(1:10, 100)), 100)
 DTable with 10 partitions
@@ -153,16 +207,16 @@ It lets you transform a row to the required format before applying the reduce fu
 As a consequence, a lot of memory should be saved, because there is no intermediate `map` step that allocates a full column.
 
 ```julia
-julia> using Dagger, OnlineStats
+julia> using DTables, OnlineStats
 
-julia> fetch(Dagger.mapreduce(sum, fit!, d1, init = Mean()))
+julia> fetch(DTables.mapreduce(sum, fit!, d1, init = Mean()))
 Mean: n=100 | value=1.50573
 
 julia> d1 = DTable((a=collect(1:100).%3, b=rand(100)), 25);
 
 julia> gg = GroupBy(Int, Mean());
 
-julia> fetch(Dagger.mapreduce(x-> (x.a, x.b), fit!, d1, init=gg))
+julia> fetch(DTables.mapreduce(x-> (x.a, x.b), fit!, d1, init=gg))
 GroupBy: Int64 => Mean
 ├─ 1
 │  └─ Mean: n=34 | value=0.491379
@@ -175,7 +229,7 @@ julia> d2 = DTable((;a1=abs.(rand(Int, 100).%2), [Symbol("a\$(i)") => rand(100)
 
 julia> gb = GroupBy(Int, Group([Series(Mean(), Variance(), Extrema()) for _ in 1:3]...));
 
-julia> fetch(Dagger.mapreduce(r -> (r.a1, tuple(r...)), fit!, d2, init = gb))
+julia> fetch(DTables.mapreduce(r -> (r.a1, tuple(r...)), fit!, d2, init = gb))
 GroupBy: Int64 => Group
 ├─ 1
 │  └─ Group
@@ -208,7 +262,7 @@ GroupBy: Int64 => Group
 ```
 
 
-# Dagger.groupby interface
+# DTables.groupby interface
 
 A `DTable` can be grouped, which results in the creation of a `GDTable`.
 The distinct sets of values contained in one or more columns can be used as grouping keys.
@@ -224,22 +278,22 @@ julia> d = DTable((a=shuffle(repeat('a':'d', inner=4, outer=4)),b=repeat(1:4, 16
 DTable with 16 partitions
 Tabletype: NamedTuple
 
-julia> Dagger.groupby(d, :a)
+julia> DTables.groupby(d, :a)
 GDTable with 4 partitions and 4 keys
 Tabletype: NamedTuple
 Grouped by: [:a]
 
-julia> Dagger.groupby(d, [:a, :b])
+julia> DTables.groupby(d, [:a, :b])
 GDTable with 16 partitions and 16 keys
 Tabletype: NamedTuple
 Grouped by: [:a, :b]
 
-julia> Dagger.groupby(d, row -> row.a + row.b)
+julia> DTables.groupby(d, row -> row.a + row.b)
 GDTable with 7 partitions and 7 keys
 Tabletype: NamedTuple
 Grouped by: #5
 
-julia> g = Dagger.groupby(d, :a); keys(g)
+julia> g = DTables.groupby(d, :a); keys(g)
 KeySet for a Dict{Char, Vector{UInt64}} with 4 entries. Keys:
   'c'
   'd'
@@ -256,7 +310,7 @@ Tabletype: NamedTuple
 Operations such as `map`, `filter`, and `reduce` can be performed on a `GDTable`.
 
 ```julia
-julia> g = Dagger.groupby(d, [:a, :b])
+julia> g = DTables.groupby(d, [:a, :b])
 GDTable with 16 partitions and 16 keys
 Tabletype: NamedTuple
 Grouped by: [:a, :b]
@@ -308,7 +362,7 @@ julia> d = DTable((a=repeat('a':'b', inner=2),b=1:4), 2)
 DTable with 2 partitions
 Tabletype: NamedTuple
 
-julia> g = Dagger.groupby(d, :a)
+julia> g = DTables.groupby(d, :a)
 GDTable with 2 partitions and 2 keys
 Tabletype: NamedTuple
 Grouped by: [:a]
@@ -355,7 +409,7 @@ the join functions coming from the `DataFrames.jl` package for the per chunk joi
 In the future, this behavior will be extended to any type that implements its own join methods, but for now it is limited to `DataFrame`.
 
 Please note that using any of the keyword arguments described above will always result in the use of the generic join methods
-defined in `Dagger` regardless of the availability of specialized methods.
+defined in `DTables` regardless of the availability of specialized methods.
 
 ```julia
 julia> using Tables; pp = d -> for x in Tables.rows(d) println("$(x.a), $(x.b), $(x.c)") end;
 
@@ -101,4 +101,4 @@ function fillcolumns(
 end
 
 ncol(d::DTable) = length(Tables.columns(d))
-index(df::DTable) = Index(_columnnames_svector(df))
+index(df::DTable) = Index(columnnames_svector(df))