Skip to content

Commit 2a41d70

Browse files
authored
Merge pull request #1 from jangorecki/polars
add polars rolling statistics benchmark
2 parents 08a5e0f + baff06c commit 2a41d70

File tree

2 files changed

+113
-44
lines changed

2 files changed

+113
-44
lines changed

README.md

Lines changed: 52 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1,77 +1,85 @@
11

22
# mini rolling statistics benchmark
33

4-
Compares python pandas and R data.table by:
4+
Compares Python pandas, R data.table, and polars (via its R package) by:
55

66
- input size: `1e6, 1e7, 1e8`
77
- rolling window size: `1e2, 1e3, 1e4`
88
- rolling functions: `mean` and `median`
99
- batching: `single` computation and `quadruple` (2 columns x 2 windows) computation
1010

11+
For a comprehensive benchmark of rolling functions, see the [db-benchmark](https://github.com/duckdblabs/db-benchmark/pull/9) project.
12+
1113
## running
1214

1315
```sh
14-
python3 pandas2.py
16+
python pandas2.py
1517
Rscript data.table.R
18+
Rscript polars.R
1619
```
1720

1821
## results
1922

20-
As of 2023-10-04 using
23+
As of 2023-11-21 using
2124

2225
```
2326
pandas 2.0.3
2427
data.table@rollmedian
28+
polars 0.10.1
2529
```
2630

27-
Timings are in seconds. Column `pd2dt` is time of pandas divided by time of data.table to easily show how many times data.table is faster than pandas.
31+
Measured on Fedora 39 (i.e. a recent toolchain) with an i5-10210Y CPU.
32+
33+
Timings are in seconds. Columns `pd2dt` and `ps2dt` are the pandas and polars timings, respectively, divided by the data.table timing, i.e. they show how many times faster data.table is than each of them.
2834

2935
### single computation
3036

3137
```
32-
rolling length window data.table pandas pd2dt
33-
<char> <num> <int> <num> <num> <num>
34-
1: mean 1e+06 100 0.005 0.023 4.60
35-
2: mean 1e+06 1000 0.006 0.017 2.83
36-
3: mean 1e+06 10000 0.005 0.017 3.40
37-
4: mean 1e+07 100 0.071 0.175 2.46
38-
5: mean 1e+07 1000 0.046 0.190 4.13
39-
6: mean 1e+07 10000 0.046 0.188 4.09
40-
7: mean 1e+08 100 0.393 1.727 4.39
41-
8: mean 1e+08 1000 0.368 1.746 4.74
42-
9: mean 1e+08 10000 0.367 1.741 4.74
43-
10: median 1e+06 100 0.063 0.485 7.70
44-
11: median 1e+06 1000 0.064 0.710 11.09
45-
12: median 1e+06 10000 0.089 1.077 12.10
46-
13: median 1e+07 100 0.601 4.833 8.04
47-
14: median 1e+07 1000 0.683 7.038 10.30
48-
15: median 1e+07 10000 0.859 10.851 12.63
49-
16: median 1e+08 100 6.037 47.857 7.93
50-
17: median 1e+08 1000 7.027 69.763 9.93
51-
18: median 1e+08 10000 8.594 94.668 11.02
38+
rolling length window data.table pandas polars pd2dt ps2dt
39+
1: mean 1e+06 100 0.004 0.031 0.008 7.75 2.00
40+
2: mean 1e+06 1000 0.005 0.022 0.009 4.40 1.80
41+
3: mean 1e+06 10000 0.004 0.022 0.009 5.50 2.25
42+
4: mean 1e+07 100 0.062 0.174 0.071 2.81 1.15
43+
5: mean 1e+07 1000 0.040 0.187 0.072 4.68 1.80
44+
6: mean 1e+07 10000 0.039 0.187 0.074 4.79 1.90
45+
7: mean 1e+08 100 0.392 1.933 0.722 4.93 1.84
46+
8: mean 1e+08 1000 0.370 2.435 0.712 6.58 1.92
47+
9: mean 1e+08 10000 0.368 2.094 0.829 5.69 2.25
48+
10: median 1e+06 100 0.065 0.491 0.167 7.55 2.57
49+
11: median 1e+06 1000 0.067 0.739 0.311 11.03 4.64
50+
12: median 1e+06 10000 0.079 1.115 1.517 14.11 19.20
51+
13: median 1e+07 100 0.590 4.906 1.481 8.32 2.51
52+
14: median 1e+07 1000 0.729 7.042 2.798 9.66 3.84
53+
15: median 1e+07 10000 0.866 10.849 14.316 12.53 16.53
54+
16: median 1e+08 100 6.253 48.645 14.580 7.78 2.33
55+
17: median 1e+08 1000 7.350 68.938 27.551 9.38 3.75
56+
18: median 1e+08 10000 9.104 95.918 145.899 10.54 16.03
5257
```
5358

5459
### quadruple computation
5560

5661
```
57-
rolling length window data.table pandas pd2dt
58-
<char> <num> <int> <num> <num> <num>
59-
1: mean 1e+06 100 0.006 0.076 12.67
60-
2: mean 1e+06 1000 0.006 0.078 13.00
61-
3: mean 1e+06 10000 0.004 0.079 19.75
62-
4: mean 1e+07 100 0.052 0.841 16.17
63-
5: mean 1e+07 1000 0.054 0.850 15.74
64-
6: mean 1e+07 10000 0.052 0.853 16.40
65-
7: mean 1e+08 100 0.557 8.374 15.03
66-
8: mean 1e+08 1000 0.557 9.711 17.43
67-
9: mean 1e+08 10000 0.544 9.420 17.32
68-
10: median 1e+06 100 0.109 1.888 17.32
69-
11: median 1e+06 1000 0.134 2.776 20.72
70-
12: median 1e+06 10000 0.198 3.683 18.60
71-
13: median 1e+07 100 1.347 19.097 14.18
72-
14: median 1e+07 1000 2.066 27.983 13.54
73-
15: median 1e+07 10000 2.742 43.061 15.70
74-
16: median 1e+08 100 16.265 194.275 11.94
75-
17: median 1e+08 1000 20.835 281.547 13.51
76-
18: median 1e+08 10000 27.177 434.483 15.99
62+
rolling length window data.table pandas polars pd2dt ps2dt
63+
1: mean 1e+06 100 0.006 0.098 0.019 16.33 3.17
64+
2: mean 1e+06 1000 0.006 0.096 0.020 16.00 3.33
65+
3: mean 1e+06 10000 0.010 0.097 0.021 9.70 2.10
66+
4: mean 1e+07 100 0.055 0.944 0.180 17.16 3.27
67+
5: mean 1e+07 1000 0.059 0.969 0.195 16.42 3.31
68+
6: mean 1e+07 10000 0.056 0.969 0.208 17.30 3.71
69+
7: mean 1e+08 100 0.657 14.427 2.177 21.96 3.31
70+
8: mean 1e+08 1000 0.807 27.017 2.363 33.48 2.93
71+
9: mean 1e+08 10000 0.776 12.753 2.317 16.43 2.99
72+
10: median 1e+06 100 0.107 1.928 0.324 18.02 3.03
73+
11: median 1e+06 1000 0.138 2.770 0.656 20.07 4.75
74+
12: median 1e+06 10000 0.222 3.881 3.656 17.48 16.47
75+
13: median 1e+07 100 1.344 19.269 3.635 14.34 2.70
76+
14: median 1e+07 1000 2.234 27.619 7.015 12.36 3.14
77+
15: median 1e+07 10000 2.852 37.945 38.491 13.30 13.50
78+
16: median 1e+08 100 16.539 193.638 37.390 11.71 2.26
79+
17: median 1e+08 1000 22.452 290.607 70.992 12.94 3.16
80+
18: median 1e+08 10000 27.820 406.971 377.634 14.63 13.57
7781
```
82+
83+
## license
84+
85+
MIT

polars.R

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
# Load the polars R package from an explicit library directory instead of the
# default library paths. The argument is spelled out in full as `lib.loc`
# rather than relying on partial matching of `lib`.
# NOTE(review): the path is machine-specific; adjust it (or drop `lib.loc`)
# when polars is installed in a regular library.
library(polars, lib.loc = "/home/jan/git/r-polars/lib")
# Fix the RNG seed so the rnorm() inputs generated below are reproducible.
set.seed(108)
3+
# mean
4+
# Benchmark a single rolling mean with polars.
#
# N: vector of input lengths; for each one a one-column polars DataFrame
#    (column `V1`, filled with rnorm() draws) is created once.
# W: vector of rolling window sizes; each is timed against the same input.
#
# For every (length, window) pair the elapsed wall-clock time of one
# `rolling_mean` over `V1` is printed as a CSV line:
#   polars,mean,single,<length>,<window>,<seconds>
test1 <- function(N, W) {
  for (len in N) {
    frame <- pl$DataFrame(list(V1 = rnorm(len)))
    for (win in W) {
      # The assignment lives inside system.time() so only the query is timed.
      timing <- system.time(
        ans <- frame$select(pl$col("V1")$rolling_mean(window_size = win))
      )
      elapsed <- timing[["elapsed"]]
      cat("polars,mean,single,", len, ",", win, ",", elapsed, "\n", sep = "")
    }
  }
}
15+
# Benchmark a "quadruple" rolling mean with polars: 2 columns x 2 windows.
#
# N: vector of input lengths; for each one a two-column polars DataFrame
#    (`V1`, `V2`, each filled with rnorm() draws) is created once.
# W: vector of nominal window sizes; the two windows actually computed are
#    W - 10 and W + 10.
#
# For every (length, window) pair the combined elapsed time of both selects
# is printed as a CSV line:
#   polars,mean,quadruple,<length>,<window>,<seconds>
test4 <- function(N, W) {
  cols <- c("V1", "V2")
  for (len in N) {
    # V1 is drawn before V2, matching the order of two consecutive rnorm() calls.
    frame <- pl$DataFrame(list(V1 = rnorm(len), V2 = rnorm(len)))
    for (win in W) {
      narrow <- win - 10L
      wide <- win + 10L
      timing <- system.time(
        ans <- list(
          frame$select(pl$col(cols)$rolling_mean(window_size = narrow)),
          frame$select(pl$col(cols)$rolling_mean(window_size = wide))
        )
      )
      elapsed <- timing[["elapsed"]]
      cat("polars,mean,quadruple,", len, ",", win, ",", elapsed, "\n", sep = "")
    }
  }
}
30+
# Run the mean benchmarks (single, then quadruple) over every combination of
# input length and rolling window size.
sizes <- c(1e6, 1e7, 1e8)
windows <- c(1e2, 1e3, 1e4)
test1(N = sizes, W = windows)
test4(N = sizes, W = windows)
32+
# median
33+
# Benchmark a single rolling median with polars.
#
# N: vector of input lengths; for each one a one-column polars DataFrame
#    (column `V1`, filled with rnorm() draws) is created once.
# W: vector of rolling window sizes; each is timed against the same input.
#
# For every (length, window) pair the elapsed wall-clock time of one
# `rolling_median` over `V1` is printed as a CSV line:
#   polars,median,single,<length>,<window>,<seconds>
test1 <- function(N, W) {
  for (len in N) {
    frame <- pl$DataFrame(list(V1 = rnorm(len)))
    for (win in W) {
      # The assignment lives inside system.time() so only the query is timed.
      timing <- system.time(
        ans <- frame$select(pl$col("V1")$rolling_median(window_size = win))
      )
      elapsed <- timing[["elapsed"]]
      cat("polars,median,single,", len, ",", win, ",", elapsed, "\n", sep = "")
    }
  }
}
44+
# Benchmark a "quadruple" rolling median with polars: 2 columns x 2 windows.
#
# N: vector of input lengths; for each one a two-column polars DataFrame
#    (`V1`, `V2`, each filled with rnorm() draws) is created once.
# W: vector of nominal window sizes; the two windows actually computed are
#    W - 10 and W + 10.
#
# For every (length, window) pair the combined elapsed time of both selects
# is printed as a CSV line:
#   polars,median,quadruple,<length>,<window>,<seconds>
test4 <- function(N, W) {
  cols <- c("V1", "V2")
  for (len in N) {
    # V1 is drawn before V2, matching the order of two consecutive rnorm() calls.
    frame <- pl$DataFrame(list(V1 = rnorm(len), V2 = rnorm(len)))
    for (win in W) {
      narrow <- win - 10L
      wide <- win + 10L
      timing <- system.time(
        ans <- list(
          frame$select(pl$col(cols)$rolling_median(window_size = narrow)),
          frame$select(pl$col(cols)$rolling_median(window_size = wide))
        )
      )
      elapsed <- timing[["elapsed"]]
      cat("polars,median,quadruple,", len, ",", win, ",", elapsed, "\n", sep = "")
    }
  }
}
59+
# Run the median benchmarks (single, then quadruple) over every combination
# of input length and rolling window size, then exit with status 0.
sizes <- c(1e6, 1e7, 1e8)
windows <- c(1e2, 1e3, 1e4)
test1(N = sizes, W = windows)
test4(N = sizes, W = windows)
quit(status = 0)

0 commit comments

Comments
 (0)