Skip to content

Commit 55f3a14

Browse files
authored
Faster groupby (#1096)
* Tweaked the `sort.max_radix_bits` constant to speed up sorting.
* Groupby operations are now significantly faster (especially the `first()` function) because columns are no longer materialized before performing the reduction.

Speed improvements:
* sort(): 221.671 ms -> 85.602 ms
* first(): 76.152 ms -> 0.084 ms
* sum(): 179.940 ms -> 69.260 ms
1 parent b4f3f2b commit 55f3a14

File tree

10 files changed

+71
-52
lines changed

10 files changed

+71
-52
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
2020
- during grouping, group keys are now added automatically to the select list.
2121
- implement `sum()` reducer.
2222
- `==` operator now works for string columns too.
23+
- Improved performance of groupby operations.
2324

2425
#### Fixed
2526
- fread will no longer emit an error if there is an NA string in the header.

c/datatable.cc

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,9 @@ DataTable::DataTable(Column** cols)
3131
nrows = cols[0]->nrows;
3232

3333
for (Column* col = cols[++ncols]; cols[ncols] != nullptr; ++ncols) {
34-
// TODO: restore, once Column also uses RowIndex
35-
// if (rowindex != col->rowindex()) {
36-
// throw ValueError() << "Mismatched RowIndex in Column " << ncols;
37-
// }
34+
if (rowindex != col->rowindex()) {
35+
throw ValueError() << "Mismatched RowIndex in Column " << ncols;
36+
}
3837
if (nrows != col->nrows) {
3938
throw ValueError() << "Mismatched length in Column " << ncols << ": "
4039
<< "found " << col->nrows << ", expected " << nrows;

c/expr/binaryop.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -475,6 +475,8 @@ static mapperfn resolve0(SType lhs_type, SType rhs_type, int opcode, void** para
475475

476476
Column* binaryop(int opcode, Column* lhs, Column* rhs)
477477
{
478+
lhs->reify();
479+
rhs->reify();
478480
int64_t lhs_nrows = lhs->nrows;
479481
int64_t rhs_nrows = rhs->nrows;
480482
SType lhs_type = lhs->stype();

c/expr/py_expr.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ PyObject* expr_column(PyObject*, PyObject* args)
5858
PyErr_Format(PyExc_ValueError, "Invalid column index %lld", index);
5959
}
6060
Column* col = dt->columns[index]->shallowcopy(ri);
61-
col->reify();
61+
// col->reify();
6262
return pycolumn::from_column(col, nullptr, 0);
6363
}
6464

c/expr/reduceop.cc

Lines changed: 48 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -40,13 +40,14 @@ static Column* reduce_first(Column* arg, const Groupby& groupby) {
4040
return Column::new_data_column(arg->stype(), 0);
4141
}
4242
size_t ngrps = groupby.ngroups();
43-
arr32_t indices(ngrps);
44-
// TODO: avoid copy (by allowing RowIndex to be created from a MemoryRange)
45-
std::memcpy(indices.data(), groupby.offsets_r(), ngrps * sizeof(int32_t));
46-
RowIndex ri = RowIndex::from_array32(std::move(indices), true);
47-
Column* res = arg->shallowcopy(ri);
48-
res->reify();
49-
return res;
43+
// groupby.offsets array has length `ngrps + 1` and contains offsets of the
44+
// beginning of each group. We will take this array and reinterpret it as a
45+
// RowIndex (taking only the first `ngrps` elements). Applying this rowindex
46+
// to the column will produce the vector of first elements in that column.
47+
arr32_t indices(ngrps, groupby.offsets_r());
48+
RowIndex ri = RowIndex::from_array32(std::move(indices), true)
49+
.uplift(arg->rowindex());
50+
return arg->shallowcopy(ri);
5051
}
5152

5253

@@ -65,11 +66,12 @@ static void sum_skipna(const int32_t* groups, int32_t grp, void** params) {
6566
OT sum = 0;
6667
int32_t row0 = groups[grp];
6768
int32_t row1 = groups[grp + 1];
68-
for (int32_t i = row0; i < row1; ++i) {
69-
IT x = inputs[i];
70-
if (ISNA<IT>(x)) continue;
71-
sum += static_cast<OT>(x);
72-
}
69+
col0->rowindex().strided_loop(row0, row1, 1,
70+
[&](int64_t i) {
71+
IT x = inputs[i];
72+
if (!ISNA<IT>(x))
73+
sum += static_cast<OT>(x);
74+
});
7375
outputs[grp] = sum;
7476
}
7577

@@ -90,15 +92,16 @@ static void mean_skipna(const int32_t* groups, int32_t grp, void** params) {
9092
OT delta = 0;
9193
int32_t row0 = groups[grp];
9294
int32_t row1 = groups[grp + 1];
93-
for (int32_t i = row0; i < row1; ++i) {
94-
IT x = inputs[i];
95-
if (ISNA<IT>(x)) continue;
96-
OT y = static_cast<OT>(x) - delta;
97-
OT t = sum + y;
98-
delta = (t - sum) - y;
99-
sum = t;
100-
cnt++;
101-
}
95+
col0->rowindex().strided_loop(row0, row1, 1,
96+
[&](int64_t i) {
97+
IT x = inputs[i];
98+
if (ISNA<IT>(x)) return;
99+
OT y = static_cast<OT>(x) - delta;
100+
OT t = sum + y;
101+
delta = (t - sum) - y;
102+
sum = t;
103+
cnt++;
104+
});
102105
outputs[grp] = cnt == 0? GETNA<OT>() : sum / cnt;
103106
}
104107

@@ -120,15 +123,16 @@ static void stdev_skipna(const int32_t* groups, int32_t grp, void** params) {
120123
int64_t cnt = 0;
121124
int32_t row0 = groups[grp];
122125
int32_t row1 = groups[grp + 1];
123-
for (int32_t i = row0; i < row1; ++i) {
124-
IT x = inputs[i];
125-
if (ISNA<IT>(x)) continue;
126-
cnt++;
127-
OT t1 = x - mean;
128-
mean += t1 / cnt;
129-
OT t2 = x - mean;
130-
m2 += t1 * t2;
131-
}
126+
col0->rowindex().strided_loop(row0, row1, 1,
127+
[&](int64_t i) {
128+
IT x = inputs[i];
129+
if (ISNA<IT>(x)) return;
130+
cnt++;
131+
OT t1 = x - mean;
132+
mean += t1 / cnt;
133+
OT t2 = x - mean;
134+
m2 += t1 * t2;
135+
});
132136
outputs[grp] = cnt <= 1? GETNA<OT>() : std::sqrt(m2 / (cnt - 1));
133137
}
134138

@@ -147,12 +151,13 @@ static void min_skipna(const int32_t* groups, int32_t grp, void** params) {
147151
T res = infinity<T>();
148152
int32_t row0 = groups[grp];
149153
int32_t row1 = groups[grp + 1];
150-
for (int32_t i = row0; i < row1; ++i) {
151-
T x = inputs[i];
152-
if (!ISNA<T>(x) && x < res) {
153-
res = x;
154-
}
155-
}
154+
col0->rowindex().strided_loop(row0, row1, 1,
155+
[&](int64_t i) {
156+
T x = inputs[i];
157+
if (!ISNA<T>(x) && x < res) {
158+
res = x;
159+
}
160+
});
156161
outputs[grp] = res;
157162
}
158163

@@ -171,12 +176,13 @@ static void max_skipna(const int32_t* groups, int32_t grp, void** params) {
171176
T res = -infinity<T>();
172177
int32_t row0 = groups[grp];
173178
int32_t row1 = groups[grp + 1];
174-
for (int32_t i = row0; i < row1; ++i) {
175-
T x = inputs[i];
176-
if (!ISNA<T>(x) && x > res) {
177-
res = x;
178-
}
179-
}
179+
col0->rowindex().strided_loop(row0, row1, 1,
180+
[&](int64_t i) {
181+
T x = inputs[i];
182+
if (!ISNA<T>(x) && x > res) {
183+
res = x;
184+
}
185+
});
180186
outputs[grp] = res;
181187
}
182188

c/expr/unaryop.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@ static mapperfn resolve0(SType stype, int opcode) {
135135
Column* unaryop(int opcode, Column* arg)
136136
{
137137
if (opcode == OpCode::Plus) return arg->shallowcopy();
138+
arg->reify();
138139

139140
SType arg_type = arg->stype();
140141
SType res_type = arg_type;

c/py_columnset.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,7 @@ PyObject* columns_from_columns(PyObject*, PyObject* args)
140140
return nullptr;
141141
}
142142
reinterpret_cast<pycolumn::obj*>(elem)->ref = nullptr;
143+
columns[i]->reify();
143144
}
144145
columns[ncols] = nullptr;
145146

c/utils/array.h

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,10 +48,14 @@ template <typename T> class array
4848
private:
4949
T* x;
5050
size_t n;
51+
bool owned;
52+
int64_t : 56;
5153

5254
public:
53-
array(size_t len = 0) : x(nullptr), n(0) { resize(len); }
54-
~array() { std::free(x); }
55+
array(size_t len = 0) : x(nullptr), n(0), owned(true) { resize(len); }
56+
array(size_t len, const T* ptr)
57+
: x(const_cast<T*>(ptr)), n(len), owned(false) {}
58+
~array() { if (owned) std::free(x); }
5559
// copy-constructor and assignment are forbidden
5660
array(const array<T>&) = delete;
5761
array<T>& operator=(const array<T>&) = delete;
@@ -64,12 +68,14 @@ template <typename T> class array
6468
using std::swap;
6569
swap(first.x, second.x);
6670
swap(first.n, second.n);
71+
swap(first.owned, second.owned);
6772
}
6873

6974
template <typename S> array<S> cast() {
7075
array<S> res;
7176
res.n = n * sizeof(T) / sizeof(S);
7277
res.x = reinterpret_cast<S*>(x);
78+
res.owned = owned;
7379
x = nullptr;
7480
return res;
7581
}
@@ -79,7 +85,7 @@ template <typename T> class array
7985
size_t size = sizeof(T) * n;
8086
x = nullptr;
8187
n = 0;
82-
return MemoryRange(size, ptr, /* own = */ true);
88+
return MemoryRange(size, ptr, /* own = */ owned);
8389
}
8490

8591
// Standard operators
@@ -95,6 +101,9 @@ template <typename T> class array
95101

96102
void resize(size_t newn) {
97103
if (newn == n) return;
104+
if (!owned) {
105+
throw MemoryError() << "Cannot resize array: not owned";
106+
}
98107
if (newn == 0) {
99108
std::free(x);
100109
x = nullptr;

datatable/frame.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -981,7 +981,7 @@ def data_viewer(row0, row1, col0, col1):
981981
"sort.max_chunk_length", xtype=int, default=1024, core=True)
982982

983983
options.register_option(
984-
"sort.max_radix_bits", xtype=int, default=8, core=True)
984+
"sort.max_radix_bits", xtype=int, default=12, core=True)
985985

986986
options.register_option(
987987
"sort.over_radix_bits", xtype=int, default=8, core=True)

tests/fread/test_fread_api.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -592,7 +592,7 @@ def test_fread_skip_blank_lines_true():
592592
assert d0.topython() == [[1, 3], [2, 4]]
593593

594594

595-
@pytest.mark.xfail()
595+
@pytest.mark.skip("Issue #838")
596596
def test_fread_skip_blank_lines_false():
597597
inp = "A,B\n1,2\n \n\n3,4\n"
598598
with pytest.warns(DatatableWarning) as ws:

0 commit comments

Comments
 (0)