Skip to content

Commit d8baf65

Browse files
committed
Add support for running pandas queries with cudf.pandas enabled
1 parent 53012e2 commit d8baf65

File tree

5 files changed

+104
-9
lines changed

5 files changed

+104
-9
lines changed

Makefile

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,20 @@ run-duckdb: .venv data/tables/ ## Run DuckDB benchmarks
9292
run-pandas: .venv data/tables/ ## Run pandas benchmarks
9393
$(VENV_BIN)/python -m queries.pandas
9494

95+
.PHONY: run-pandas-gpu
96+
run-pandas-gpu: .venv ## Run cudf.pandas benchmarks
97+
# TODO: Change this to use $(PYTHON) once
98+
# https://github.com/pola-rs/polars-benchmark/pull/146 is merged
99+
RUN_PANDAS_GPU=true $(VENV_BIN)/python -m queries.pandas
100+
101+
.PHONY: run-pandas-no-env
102+
run-pandas-no-env: ## Run pandas benchmarks
103+
python -m queries.pandas
104+
105+
.PHONY: run-pandas-gpu-no-env
106+
run-pandas-gpu-no-env: ## Run cudf.pandas benchmarks
107+
RUN_PANDAS_GPU=true python -m queries.pandas
108+
95109
.PHONY: run-pyspark
96110
run-pyspark: .venv data/tables/ ## Run PySpark benchmarks
97111
$(VENV_BIN)/python -m queries.pyspark

queries/pandas/utils.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,12 @@ def get_part_supp_ds() -> pd.DataFrame:
8080

8181

8282
def run_query(query_number: int, query: Callable[..., Any]) -> None:
83+
if settings.run.pandas_gpu:
84+
# Note that this has a global effect. Using it in this way should be fine though
85+
# given that execute_all launches a separate Python process for each query.
86+
import cudf.pandas
87+
cudf.pandas.install()
88+
8389
run_query_generic(
8490
query, query_number, "pandas", query_checker=check_query_result_pd
8591
)

requirements.in

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ pandas>=2.0
66
polars
77
polars_cloud
88
pyspark
9+
cudf-cu12
10+
cudf-polars-cu12
911

1012
pyarrow # Required by duckdb/pandas
1113
fastparquet # Required by pandas

requirements.txt

Lines changed: 80 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ attrs==25.3.0
88
# via
99
# jsonschema
1010
# referencing
11+
cachetools==5.5.2
12+
# via cudf-cu12
1113
certifi==2025.4.26
1214
# via requests
1315
charset-normalizer==3.4.1
@@ -22,6 +24,19 @@ contourpy==1.3.2
2224
# via matplotlib
2325
cramjam==2.10.0
2426
# via fastparquet
27+
cuda-bindings==12.8.0
28+
# via cuda-python
29+
cuda-python==12.8.0
30+
# via
31+
# cudf-cu12
32+
# pylibcudf-cu12
33+
# rmm-cu12
34+
cudf-cu12==25.4.0
35+
# via -r requirements.in
36+
cudf-polars-cu12==25.4.0
37+
# via -r requirements.in
38+
cupy-cuda12x==13.4.1
39+
# via cudf-cu12
2540
cycler==0.12.1
2641
# via matplotlib
2742
dask==2025.4.1
@@ -34,6 +49,8 @@ duckdb==1.2.2
3449
# via -r requirements.in
3550
fastparquet==2024.11.0
3651
# via -r requirements.in
52+
fastrlock==0.8.3
53+
# via cupy-cuda12x
3754
filelock==3.18.0
3855
# via ray
3956
fonttools==4.57.0
@@ -44,58 +61,92 @@ frozenlist==1.6.0
4461
# ray
4562
fsspec==2025.3.2
4663
# via
64+
# cudf-cu12
4765
# dask
4866
# fastparquet
4967
# modin
5068
idna==3.10
5169
# via requests
52-
importlib-metadata==8.6.1
53-
# via dask
5470
jsonschema==4.23.0
5571
# via ray
5672
jsonschema-specifications==2025.4.1
5773
# via jsonschema
5874
kiwisolver==1.4.8
5975
# via matplotlib
76+
libcudf-cu12==25.4.0
77+
# via
78+
# cudf-cu12
79+
# pylibcudf-cu12
80+
libkvikio-cu12==25.4.0
81+
# via libcudf-cu12
82+
librmm-cu12==25.4.0
83+
# via
84+
# libcudf-cu12
85+
# rmm-cu12
6086
linetimer==0.1.5
6187
# via -r requirements.in
88+
llvmlite==0.44.0
89+
# via numba
6290
locket==1.0.0
6391
# via partd
92+
markdown-it-py==3.0.0
93+
# via rich
6494
matplotlib==3.10.1
6595
# via plotnine
96+
mdurl==0.1.2
97+
# via markdown-it-py
6698
mizani==0.13.3
6799
# via plotnine
68100
modin==0.32.0
69101
# via -r requirements.in
70102
msgpack==1.1.0
71103
# via ray
72-
narwhals==1.36.0
104+
narwhals==1.37.0
73105
# via plotly
106+
numba==0.61.2
107+
# via
108+
# cudf-cu12
109+
# numba-cuda
110+
numba-cuda==0.4.0
111+
# via cudf-cu12
74112
numpy==2.2.5
75113
# via
76114
# contourpy
115+
# cudf-cu12
116+
# cupy-cuda12x
77117
# dask
78118
# fastparquet
79119
# matplotlib
80120
# mizani
81121
# modin
122+
# numba
82123
# pandas
83124
# patsy
84125
# plotnine
126+
# rmm-cu12
85127
# scipy
86128
# statsmodels
129+
nvidia-nvcomp-cu12==4.2.0.11
130+
# via libcudf-cu12
131+
nvtx==0.2.11
132+
# via
133+
# cudf-cu12
134+
# pylibcudf-cu12
87135
packaging==25.0
88136
# via
137+
# cudf-cu12
89138
# dask
90139
# fastparquet
91140
# matplotlib
92141
# modin
93142
# plotly
143+
# pylibcudf-cu12
94144
# ray
95145
# statsmodels
96146
pandas==2.2.3
97147
# via
98148
# -r requirements.in
149+
# cudf-cu12
99150
# dask
100151
# fastparquet
101152
# mizani
@@ -112,11 +163,12 @@ plotly==6.0.1
112163
# via -r requirements.in
113164
plotnine==0.14.5
114165
# via -r requirements.in
115-
polars==1.28.0
166+
polars==1.25.2
116167
# via
117168
# -r requirements.in
169+
# cudf-polars-cu12
118170
# polars-cloud
119-
polars-cloud==0.0.4
171+
polars-cloud==0.0.5
120172
# via -r requirements.in
121173
protobuf==6.30.2
122174
# via ray
@@ -127,8 +179,10 @@ py4j==0.10.9.7
127179
pyarrow==19.0.1
128180
# via
129181
# -r requirements.in
182+
# cudf-cu12
130183
# dask
131184
# modin
185+
# pylibcudf-cu12
132186
pydantic==2.11.3
133187
# via
134188
# -r requirements.in
@@ -137,6 +191,14 @@ pydantic-core==2.33.1
137191
# via pydantic
138192
pydantic-settings==2.9.1
139193
# via -r requirements.in
194+
pygments==2.19.1
195+
# via rich
196+
pylibcudf-cu12==25.4.0
197+
# via
198+
# cudf-cu12
199+
# cudf-polars-cu12
200+
pynvjitlink-cu12==0.5.2
201+
# via cudf-cu12
140202
pyparsing==3.2.3
141203
# via matplotlib
142204
pyspark==3.5.5
@@ -153,6 +215,10 @@ pyyaml==6.0.2
153215
# via
154216
# dask
155217
# ray
218+
rapids-logger==0.1.1
219+
# via
220+
# libcudf-cu12
221+
# librmm-cu12
156222
ray==2.44.1
157223
# via modin
158224
referencing==0.36.2
@@ -161,6 +227,12 @@ referencing==0.36.2
161227
# jsonschema-specifications
162228
requests==2.32.3
163229
# via ray
230+
rich==14.0.0
231+
# via cudf-cu12
232+
rmm-cu12==25.4.0
233+
# via
234+
# cudf-cu12
235+
# pylibcudf-cu12
164236
rpds-py==0.24.0
165237
# via
166238
# jsonschema
@@ -170,7 +242,7 @@ scipy==1.15.2
170242
# mizani
171243
# plotnine
172244
# statsmodels
173-
setuptools==79.0.1
245+
setuptools==80.0.0
174246
# via -r requirements.in
175247
six==1.17.0
176248
# via python-dateutil
@@ -182,9 +254,10 @@ toolz==1.0.0
182254
# partd
183255
typing-extensions==4.13.2
184256
# via
185-
# polars-cloud
257+
# cudf-cu12
186258
# pydantic
187259
# pydantic-core
260+
# pylibcudf-cu12
188261
# referencing
189262
# typing-inspection
190263
typing-inspection==0.4.0
@@ -195,5 +268,3 @@ tzdata==2025.2
195268
# via pandas
196269
urllib3==2.4.0
197270
# via requests
198-
zipp==3.21.0
199-
# via importlib-metadata

settings.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ class Run(BaseSettings):
5555
spark_executor_memory: str = "1g" # Tune as needed for optimal performance
5656
spark_log_level: str = "ERROR"
5757

58+
pandas_gpu: bool = False # Use cudf.pandas to run pandas benchmarks
59+
5860
@computed_field # type: ignore[prop-decorator]
5961
@property
6062
def include_io(self) -> bool:

0 commit comments

Comments
 (0)