Skip to content

Commit 30d38e2

Browse files
committed
Various fixes and amendments
1 parent 3264c49 commit 30d38e2

File tree

8 files changed

+239
-216
lines changed

8 files changed

+239
-216
lines changed

polars-vs-pandas/DataFrame_Plots.ipynb

Lines changed: 10 additions & 122 deletions
Large diffs are not rendered by default.

polars-vs-pandas/README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@ Your download bundle contains the following files:
66

77
| File | Description |
88
|-----------------------------------------|------------------------------------------------------------------------------------------------------------|
9-
| `online_retail.parquet` | This parquet file contains retail data used in some of the queries. |
109
| `data_generation.py` | This script contains the `generate_data()` function used to generate different quantities of data. |
1110
| `dataframe_and_lazyframe_time_tests.py` | This script performs time tests for DataFrames and a LazyFrame. |
12-
| `streaming_test.py` | This script performs time tests for a LazyFrame with streaming enabled. |
1311
| `dataframe_conversions.py` | This file contains the code used to convert between pandas and Polars DataFrames, plus a Narwhals example. |
12+
| `DataFrame_Plots.ipynb` | This Jupyter Notebook file contains the plotting code to demonstrate default plotting capabilities. |
13+
| `online_retail.parquet` | This parquet file contains retail data used in some of the queries. |
1414
| `sample_pandas_and_polars_code.py` | This file contains the code used to illustrate the differences between pandas and Polars syntax. |
15-
| `dataFrame_plots.ipynb` | This Jupyter Notebook file contains the plotting code to demonstrate default plotting capabilities. |
15+
| `streaming_test.py` | This script performs time tests for a LazyFrame with streaming enabled. |

polars-vs-pandas/data_generation.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import numpy as np
22

3+
34
def generate_data(number_of_rows):
45
rng = np.random.default_rng()
56

@@ -15,4 +16,4 @@ def generate_data(number_of_rows):
1516
["Helmet", "Oxygen", "Boots", "Gloves"], size=number_of_rows
1617
),
1718
"sales_income": rng.integers(1, 5001, size=number_of_rows),
18-
}
19+
}

polars-vs-pandas/dataframe_and_lazyframe_time_tests.py

Lines changed: 73 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,7 @@
44

55
import pandas as pd
66
import polars as pl
7-
8-
from data_generation import data_generation
7+
from data_generation import generate_data
98

109

1110
def create_pandas_dataframe(test_data):
@@ -25,73 +24,90 @@ def analyze_pandas_dataframe(pandas_df):
2524
"sales_income"
2625
].sum()
2726

27+
2828
def analyze_polars_dataframe(polars_df):
2929
polars_df.group_by(["region", "product", "sales_person"]).agg(
3030
total_sales=pl.col("sales_income").sum()
3131
)
3232

33+
3334
def analyze_polars_lazyframe(polars_lf):
3435
polars_lf.group_by(["region", "product", "sales_person"]).agg(
3536
total_sales=pl.col("sales_income").sum()
3637
).collect()
3738

3839

39-
test_data = generate_data(int(sys.argv[1]))
40-
41-
print(f"\nPandas dataframe creation time for {int(sys.argv[1])} rows:")
42-
print(Timer(functools.partial(create_pandas_dataframe, test_data)).timeit(100))
43-
print(f"\nPolars dataframe creation time for {int(sys.argv[1])} rows:")
44-
print(Timer(functools.partial(create_polars_dataframe, test_data)).timeit(100))
45-
print(f"\nPolars lazyframe creation time for {int(sys.argv[1])} rows:")
46-
print(Timer(functools.partial(create_polars_lazyframe, test_data)).timeit(100))
47-
48-
print()
49-
50-
pandas_df = create_pandas_dataframe(test_data)
51-
polars_df = create_polars_dataframe(test_data)
52-
polars_lf = create_polars_lazyframe(test_data)
53-
54-
print(f"Pandas dataframe analysis time for {int(sys.argv[1])} rows:")
55-
print(
56-
Timer(functools.partial(analyze_pandas_dataframe, pandas_df)).timeit(100)
57-
)
58-
59-
print()
60-
print(f"Polars dataframe analysis time for {int(sys.argv[1])} rows:")
61-
print(
62-
Timer(functools.partial(analyze_polars_dataframe, polars_df)).timeit(100)
63-
)
64-
65-
print()
66-
print(f"Polars lazyframe analysis time for {int(sys.argv[1])} rows:")
67-
print(
68-
Timer(functools.partial(analyze_polars_lazyframe, polars_lf)).timeit(100)
69-
)
70-
71-
print()
72-
print("\nShow Boots sales in the East region for pandas DataFrame")
73-
print(
74-
analyze_pandas_dataframe(pandas_df)["East"]["Boots"]
75-
)
76-
77-
print("\nShow Boots sales in the East region for Polars DataFrame")
78-
print(
79-
(
80-
analyze_polars_dataframe(polars_df)
81-
.filter(
82-
pl.col("region") == "East",
83-
pl.col("product") == "Boots",
40+
if __name__ == "__main__":
41+
test_data = generate_data(int(sys.argv[1]))
42+
43+
print(f"\nPandas dataframe creation time for {int(sys.argv[1])} rows:")
44+
print(
45+
Timer(functools.partial(create_pandas_dataframe, test_data)).timeit(
46+
100
47+
)
48+
)
49+
print(f"\nPolars dataframe creation time for {int(sys.argv[1])} rows:")
50+
print(
51+
Timer(functools.partial(create_polars_dataframe, test_data)).timeit(
52+
100
8453
)
8554
)
86-
)
87-
88-
print("\nShow Boots sales in the East region for Polars LazyFrame")
89-
print(
90-
(
91-
analyze_polars_lazyframe(polars_lf)
92-
.filter(
93-
pl.col("region") == "East",
94-
pl.col("product") == "Boots",
55+
print(f"\nPolars lazyframe creation time for {int(sys.argv[1])} rows:")
56+
print(
57+
Timer(functools.partial(create_polars_lazyframe, test_data)).timeit(
58+
100
59+
)
60+
)
61+
62+
print()
63+
64+
pandas_df = create_pandas_dataframe(test_data)
65+
polars_df = create_polars_dataframe(test_data)
66+
polars_lf = create_polars_lazyframe(test_data)
67+
68+
print(f"Pandas dataframe analysis time for {int(sys.argv[1])} rows:")
69+
print(
70+
Timer(functools.partial(analyze_pandas_dataframe, pandas_df)).timeit(
71+
100
72+
)
73+
)
74+
75+
print()
76+
print(f"Polars dataframe analysis time for {int(sys.argv[1])} rows:")
77+
print(
78+
Timer(functools.partial(analyze_polars_dataframe, polars_df)).timeit(
79+
100
80+
)
81+
)
82+
83+
print()
84+
print(f"Polars lazyframe analysis time for {int(sys.argv[1])} rows:")
85+
print(
86+
Timer(functools.partial(analyze_polars_lazyframe, polars_lf)).timeit(
87+
100
88+
)
89+
)
90+
91+
print()
92+
print("\nShow Boots sales in the East region for pandas DataFrame")
93+
print(analyze_pandas_dataframe(pandas_df)["East"]["Boots"])
94+
95+
print("\nShow Boots sales in the East region for Polars DataFrame")
96+
print(
97+
(
98+
analyze_polars_dataframe(polars_df).filter(
99+
pl.col("region") == "East",
100+
pl.col("product") == "Boots",
101+
)
102+
)
103+
)
104+
105+
print("\nShow Boots sales in the East region for Polars LazyFrame")
106+
print(
107+
(
108+
analyze_polars_lazyframe(polars_lf).filter(
109+
pl.col("region") == "East",
110+
pl.col("product") == "Boots",
111+
)
95112
)
96113
)
97-
)

polars-vs-pandas/dataframe_conversions.py

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,6 @@
22
import polars as pl
33
from data_generation import generate_data
44

5-
polars_df = pl.DataFrame(data_generation(4))
6-
polars_df
7-
8-
pandas_df = polars_df.to_pandas()
9-
type(pandas_df)
10-
pandas_df
11-
12-
polars_df = pl.from_pandas(pandas_df)
13-
type(polars_df)
14-
polars_df
15-
165

176
def universal_groupby(df):
187
return (
@@ -24,6 +13,17 @@ def universal_groupby(df):
2413
)
2514

2615

27-
universal_groupby(pandas_df)
16+
if __name__ == "__main__":
17+
polars_df = pl.DataFrame(generate_data(4))
18+
print(polars_df)
19+
20+
pandas_df = polars_df.to_pandas()
21+
print(type(pandas_df))
22+
print(pandas_df)
23+
24+
polars_df = pl.from_pandas(pandas_df)
25+
print(type(polars_df))
26+
print(polars_df)
2827

29-
universal_groupby(polars_df)
28+
print(universal_groupby(pandas_df))
29+
print(universal_groupby(polars_df))

polars-vs-pandas/requirements.txt

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
altair==5.5.0
2+
anyio==4.10.0
3+
argon2-cffi==25.1.0
4+
argon2-cffi-bindings==25.1.0
5+
arrow==1.3.0
6+
asttokens==3.0.0
7+
async-lru==2.0.5
8+
attrs==25.3.0
9+
babel==2.17.0
10+
beautifulsoup4==4.13.5
11+
bleach==6.2.0
12+
certifi==2025.8.3
13+
cffi==2.0.0
14+
charset-normalizer==3.4.3
15+
comm==0.2.3
16+
contourpy==1.3.3
17+
cycler==0.12.1
18+
debugpy==1.8.16
19+
decorator==5.2.1
20+
defusedxml==0.7.1
21+
executing==2.2.1
22+
fastjsonschema==2.21.2
23+
fonttools==4.59.2
24+
fqdn==1.5.1
25+
h11==0.16.0
26+
httpcore==1.0.9
27+
httpx==0.28.1
28+
idna==3.10
29+
ipykernel==6.30.1
30+
ipython==9.5.0
31+
ipython_pygments_lexers==1.1.1
32+
ipywidgets==8.1.7
33+
isoduration==20.11.0
34+
jedi==0.19.2
35+
Jinja2==3.1.6
36+
json5==0.12.1
37+
jsonpointer==3.0.0
38+
jsonschema==4.25.1
39+
jsonschema-specifications==2025.9.1
40+
jupyter==1.1.1
41+
jupyter-console==6.6.3
42+
jupyter-events==0.12.0
43+
jupyter-lsp==2.3.0
44+
jupyter_client==8.6.3
45+
jupyter_core==5.8.1
46+
jupyter_server==2.17.0
47+
jupyter_server_terminals==0.5.3
48+
jupyterlab==4.4.7
49+
jupyterlab_pygments==0.3.0
50+
jupyterlab_server==2.27.3
51+
jupyterlab_widgets==3.0.15
52+
kiwisolver==1.4.9
53+
lab==8.4
54+
lark==1.2.2
55+
MarkupSafe==3.0.2
56+
matplotlib==3.10.6
57+
matplotlib-inline==0.1.7
58+
mistune==3.1.4
59+
narwhals==2.5.0
60+
nbclient==0.10.2
61+
nbconvert==7.16.6
62+
nbformat==5.10.4
63+
nest-asyncio==1.6.0
64+
notebook==7.4.5
65+
notebook_shim==0.2.4
66+
numpy==2.3.3
67+
packaging==25.0
68+
pandas==2.3.2
69+
pandocfilters==1.5.1
70+
parso==0.8.5
71+
pexpect==4.9.0
72+
pillow==11.3.0
73+
platformdirs==4.4.0
74+
polars==1.33.1
75+
prometheus_client==0.22.1
76+
prompt_toolkit==3.0.52
77+
psutil==7.0.0
78+
ptyprocess==0.7.0
79+
pure_eval==0.2.3
80+
pyarrow==21.0.0
81+
pycparser==2.23
82+
Pygments==2.19.2
83+
pyparsing==3.2.4
84+
python-dateutil==2.9.0.post0
85+
python-json-logger==3.3.0
86+
pytz==2025.2
87+
PyYAML==6.0.2
88+
pyzmq==27.1.0
89+
referencing==0.36.2
90+
requests==2.32.5
91+
rfc3339-validator==0.1.4
92+
rfc3986-validator==0.1.1
93+
rfc3987-syntax==1.1.0
94+
rpds-py==0.27.1
95+
Send2Trash==1.8.3
96+
setuptools==80.9.0
97+
simplejson==3.20.1
98+
six==1.17.0
99+
sniffio==1.3.1
100+
soupsieve==2.8
101+
stack-data==0.6.3
102+
terminado==0.18.1
103+
tinycss2==1.4.0
104+
tornado==6.5.2
105+
traitlets==5.14.3
106+
txt2tags==3.9
107+
types-python-dateutil==2.9.0.20250822
108+
typing_extensions==4.15.0
109+
tzdata==2025.2
110+
uri-template==1.3.0
111+
urllib3==2.5.0
112+
wcwidth==0.2.13
113+
webcolors==24.11.1
114+
webencodings==0.5.1
115+
websocket-client==1.8.0
116+
widgetsnbextension==4.0.14
Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,22 @@
11
import pandas as pd
22
import polars as pl
33

4-
54
orders_pandas = pd.read_parquet("online_retail.parquet")
65

7-
orders_pandas["Total"] = (
8-
orders_pandas["Quantity"] * orders_pandas["UnitPrice"]
9-
)
6+
orders_pandas["Total"] = orders_pandas["Quantity"] * orders_pandas["UnitPrice"]
107

118
orders_pandas[["InvoiceNo", "Quantity", "UnitPrice", "Total"]][
129
orders_pandas["Total"] > 100
1310
].head(3)
1411

15-
1612
(
17-
orders_pandas
18-
.assign(Total=orders_pandas["Quantity"] * orders_pandas["UnitPrice"])
13+
orders_pandas.assign(
14+
Total=orders_pandas["Quantity"] * orders_pandas["UnitPrice"]
15+
)
1916
.filter(["InvoiceNo", "Quantity", "UnitPrice", "Total"])
2017
.query("Total > 100")
2118
).head(3)
2219

23-
2420
orders_polars = pl.read_parquet("online_retail.parquet")
2521

2622
(
@@ -29,4 +25,3 @@
2925
Total=pl.col("Quantity") * pl.col("UnitPrice"),
3026
).filter(pl.col("Total") > 100)
3127
).head(3)
32-

0 commit comments

Comments
 (0)