Skip to content

Commit 5704140

Browse files
eyrei123bzaczynskistephengruppetta
authored
Polars vs pandas (#697)
* addition of deleted file * code error fix * Rename agnositic_groupby to universal_groupby * Upload files from a ZIP archive * Various fixes and amendments * Final QA * Fix linter issues --------- Co-authored-by: Bartosz Zaczyński <[email protected]> Co-authored-by: stephengruppetta <[email protected]>
1 parent 9c3ef87 commit 5704140

File tree

9 files changed

+454
-0
lines changed

9 files changed

+454
-0
lines changed

polars-vs-pandas/README.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Polars vs pandas: What's the Difference?
2+
3+
The materials contained in this folder are designed to complement the Real Python tutorial [Polars vs pandas: What's the Difference?](https://realpython.com/polars-vs-pandas/).
4+
5+
Your download bundle contains the following files:
6+
7+
| File | Description |
8+
|-----------------------------------------|------------------------------------------------------------------------------------------------------------|
9+
| `benchmark.py` | This script performs time tests for DataFrames and a LazyFrame. |
10+
| `conversions.py` | This file contains the code used to convert between pandas and Polars DataFrames, plus a Narwhals example. |
11+
| `data_generation.py` | This script contains the `generate_data()` function used to generate different quantities of data. |
12+
| `online_retail.parquet` | This Parquet file contains retail data used in some of the queries. |
13+
| `pandas_polars_demo.py` | This file contains the code used to illustrate the differences between pandas and Polars syntax. |
14+
| `plots.ipynb` | This Jupyter Notebook file contains the plotting code to demonstrate default plotting capabilities. |
15+
| `streaming_test.py` | This script performs time tests for a LazyFrame with streaming enabled. |

polars-vs-pandas/benchmark.py

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
"""
2+
Running:
3+
$ python benchmark.py 500
4+
"""
5+
6+
import functools
7+
import sys
8+
from timeit import Timer
9+
10+
import pandas as pd
11+
import polars as pl
12+
from data_generation import generate_data
13+
14+
15+
def create_pandas_dataframe(test_data):
16+
return pd.DataFrame(test_data).convert_dtypes(dtype_backend="pyarrow")
17+
18+
19+
def create_polars_dataframe(test_data):
    """Build an eager Polars DataFrame from ``test_data``."""
    eager_frame = pl.DataFrame(test_data)
    return eager_frame
21+
22+
23+
def create_polars_lazyframe(test_data):
    """Build a Polars LazyFrame from ``test_data``."""
    lazy_frame = pl.LazyFrame(test_data)
    return lazy_frame
25+
26+
27+
def analyze_pandas_dataframe(pandas_df):
    """Total ``sales_income`` per region, product, and sales person.

    Returns a pandas Series of summed ``sales_income`` values indexed by the
    three grouping columns.
    """
    grouping_columns = ["region", "product", "sales_person"]
    grouped = pandas_df.groupby(grouping_columns)
    return grouped["sales_income"].sum()
31+
32+
33+
def analyze_polars_dataframe(polars_df):
    """Total ``sales_income`` per region, product, and sales person.

    Returns a Polars DataFrame with the three grouping columns plus a
    ``total_sales`` column holding the summed income.
    """
    grouping_columns = ["region", "product", "sales_person"]
    income_sum = pl.col("sales_income").sum()
    return polars_df.group_by(grouping_columns).agg(total_sales=income_sum)
37+
38+
39+
def analyze_polars_lazyframe(polars_lf):
    """Total ``sales_income`` per region, product, and sales person, lazily.

    The aggregation is declared on the LazyFrame and then executed with
    ``collect()``, so the result is a materialized frame with the grouping
    columns plus ``total_sales``.
    """
    grouping_columns = ["region", "product", "sales_person"]
    income_sum = pl.col("sales_income").sum()
    lazy_totals = polars_lf.group_by(grouping_columns).agg(
        total_sales=income_sum
    )
    return lazy_totals.collect()
45+
46+
47+
# Parse the row count once up front. A missing or non-numeric argument gets a
# short usage hint instead of a bare IndexError/ValueError traceback.
try:
    number_of_rows = int(sys.argv[1])
except (IndexError, ValueError):
    sys.exit("Usage: python benchmark.py <number_of_rows>")

# How many times timeit runs each measured call.
REPETITIONS = 100

print("Creating DataFrames...")

test_data = generate_data(number_of_rows)

print(f"Pandas dataframe creation time for {number_of_rows:,} rows:")
print(
    Timer(functools.partial(create_pandas_dataframe, test_data)).timeit(
        REPETITIONS
    )
)
print(f"\nPolars dataframe creation time for {number_of_rows:,} rows:")
print(
    Timer(functools.partial(create_polars_dataframe, test_data)).timeit(
        REPETITIONS
    )
)
print(f"\nPolars lazyframe creation time for {number_of_rows:,} rows:")
print(
    Timer(functools.partial(create_polars_lazyframe, test_data)).timeit(
        REPETITIONS
    )
)

print("-" * 50)
print("Analyzing DataFrames...")

# Build each frame once so the analysis timings exclude construction cost.
pandas_df = create_pandas_dataframe(test_data)
polars_df = create_polars_dataframe(test_data)
polars_lf = create_polars_lazyframe(test_data)

print(f"Pandas dataframe analysis time for {number_of_rows:,} rows:")
print(
    Timer(functools.partial(analyze_pandas_dataframe, pandas_df)).timeit(
        REPETITIONS
    )
)

print()
print(f"Polars dataframe analysis time for {number_of_rows:,} rows:")
print(
    Timer(functools.partial(analyze_polars_dataframe, polars_df)).timeit(
        REPETITIONS
    )
)

print()
print(f"Polars lazyframe analysis time for {number_of_rows:,} rows:")
print(
    Timer(functools.partial(analyze_polars_lazyframe, polars_lf)).timeit(
        REPETITIONS
    )
)

print("\nShow Boots sales in the East region for pandas DataFrame")
pandas_totals = analyze_pandas_dataframe(pandas_df)
try:
    east_boots_sales = pandas_totals["East"]["Boots"]
except KeyError:
    # A small random sample may contain no East/Boots rows. The Polars
    # filters below simply print an empty frame in that case, so the pandas
    # lookup must not crash the script either.
    east_boots_sales = "No Boots sales in the East region"
print(east_boots_sales)

print("\nShow Boots sales in the East region for Polars DataFrame")
print(
    analyze_polars_dataframe(polars_df).filter(
        pl.col("region") == "East",
        pl.col("product") == "Boots",
    )
)

print("\nShow Boots sales in the East region for Polars LazyFrame")
print(
    analyze_polars_lazyframe(polars_lf).filter(
        pl.col("region") == "East",
        pl.col("product") == "Boots",
    )
)

polars-vs-pandas/conversions.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
import narwhals as nw
2+
import polars as pl
3+
from data_generation import generate_data
4+
5+
6+
def universal_groupby(df):
    """Sum ``sales_income`` per region for a dataframe wrapped by Narwhals.

    The native frame is wrapped with ``nw.from_native()``, grouped by
    ``region``, aggregated, sorted by ``region``, and finally converted back
    with ``to_native()``.
    """
    wrapped = nw.from_native(df)
    income_by_region = wrapped.group_by("region").agg(
        nw.col("sales_income").sum()
    )
    return income_by_region.sort("region").to_native()
14+
15+
16+
# Start from a tiny Polars DataFrame so the printed frames stay readable.
sample_rows = generate_data(4)
polars_df = pl.DataFrame(sample_rows)
print(polars_df)

# Polars -> pandas
print("\nPolars to pandas:")
pandas_df = polars_df.to_pandas()
print(type(pandas_df))
print(pandas_df)

# pandas -> Polars (rebinds polars_df to the round-tripped frame)
print("\npandas to Polars:")
polars_df = pl.from_pandas(pandas_df)
print(type(polars_df))
print(polars_df)

# The same Narwhals-based function is applied to both dataframe flavors.
print("\nNarwhals with pandas:")
pandas_result = universal_groupby(pandas_df)
print(pandas_result)

print("\nNarwhals with Polars:")
polars_result = universal_groupby(polars_df)
print(polars_result)
polars-vs-pandas/data_generation.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
import numpy as np
2+
3+
4+
def generate_data(number_of_rows, seed=None):
    """Generate random sales records as a column-oriented dictionary.

    Args:
        number_of_rows: Number of records to generate.
        seed: Optional seed passed to ``np.random.default_rng()``. The default
            ``None`` keeps the original behavior of drawing fresh random data
            on every call; pass a fixed value to get reproducible data, for
            example when comparing benchmark runs.

    Returns:
        A dict with one equally sized column per key:
        ``order_id`` (a ``range`` starting at 1), ``region``,
        ``sales_person`` and ``product`` (NumPy arrays drawn from fixed lists
        of names), and ``sales_income`` (NumPy array of integers from 1 to
        5000 inclusive).
    """
    rng = np.random.default_rng(seed)

    # The columns are drawn in a fixed order so a given seed always yields
    # the same dataset.
    return {
        "order_id": range(1, number_of_rows + 1),
        "region": rng.choice(
            ["North", "South", "East", "West"], size=number_of_rows
        ),
        "sales_person": rng.choice(
            ["Armstrong", "Aldrin", "Collins"], size=number_of_rows
        ),
        "product": rng.choice(
            ["Helmet", "Oxygen", "Boots", "Gloves"], size=number_of_rows
        ),
        # The upper bound of integers() is exclusive, so 5001 allows 5000.
        "sales_income": rng.integers(1, 5001, size=number_of_rows),
    }
polars-vs-pandas/online_retail.parquet

3.49 MB
Binary file not shown.

polars-vs-pandas/pandas_polars_demo.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
import pandas as pd
2+
import polars as pl
3+
4+
# --- pandas, index-based style: add a column in place, then slice ---------
print("Index-Based syntax in pandas:")
orders_pandas = pd.read_parquet("online_retail.parquet")
orders_pandas["Total"] = orders_pandas["Quantity"] * orders_pandas["UnitPrice"]
selected_columns = orders_pandas[
    ["InvoiceNo", "Quantity", "UnitPrice", "Total"]
]
print(selected_columns[orders_pandas["Total"] > 100].head(3))

print()

# --- pandas, method-chaining style -----------------------------------------
# The file is read again so this example starts from a frame that does not
# already carry the "Total" column added above.
print("Method-chaining syntax in pandas:")
orders_pandas = pd.read_parquet("online_retail.parquet")
print(
    orders_pandas.assign(
        Total=orders_pandas["Quantity"] * orders_pandas["UnitPrice"]
    )
    .filter(["InvoiceNo", "Quantity", "UnitPrice", "Total"])
    .query("Total > 100")
    .head(3)
)

print()

# --- Polars, method-chaining style -----------------------------------------
print("Method-chaining syntax in Polars:")
orders_polars = pl.read_parquet("online_retail.parquet")
print(
    orders_polars.select(
        pl.col(["InvoiceNo", "Quantity", "UnitPrice"]),
        Total=pl.col("Quantity") * pl.col("UnitPrice"),
    )
    .filter(pl.col("Total") > 100)
    .head(3)
)

polars-vs-pandas/plots.ipynb

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"id": "8ced8243-d770-437e-a90d-f794ffa57fc0",
6+
"metadata": {},
7+
"source": [
8+
"# Dataframe Plots"
9+
]
10+
},
11+
{
12+
"cell_type": "code",
13+
"execution_count": null,
14+
"id": "1cf56fd3-605c-4449-8a5e-d0fd94b49080",
15+
"metadata": {},
16+
"outputs": [],
17+
"source": [
18+
"from data_generation import generate_data\n",
19+
"\n",
20+
"sales_data = generate_data(50)"
21+
]
22+
},
23+
{
24+
"cell_type": "markdown",
25+
"id": "913c18ed-373b-400e-ba27-38ca8e7b70b9",
26+
"metadata": {},
27+
"source": [
28+
"## polars Plotting"
29+
]
30+
},
31+
{
32+
"cell_type": "code",
33+
"execution_count": null,
34+
"id": "014e9e56-8fff-45ab-85ff-eb51840f2bc7",
35+
"metadata": {},
36+
"outputs": [],
37+
"source": [
38+
"import polars as pl\n",
39+
"\n",
40+
"orders_polars = pl.DataFrame(sales_data)\n",
41+
"\n",
42+
"(\n",
43+
" orders_polars.group_by(\"region\")\n",
44+
" .agg(total_sales=pl.col(\"sales_income\").sum())\n",
45+
" .plot.bar(x=\"region\", y=\"total_sales\")\n",
46+
" .properties(width=200, height=200, title=\"Total Sales per Region ($)\")\n",
47+
")"
48+
]
49+
},
50+
{
51+
"cell_type": "markdown",
52+
"id": "a62335eb-4763-4c46-adb2-d7386914f56b",
53+
"metadata": {},
54+
"source": [
55+
"## Pandas Plotting"
56+
]
57+
},
58+
{
59+
"cell_type": "code",
60+
"execution_count": null,
61+
"id": "85929590-e514-4497-b396-58cfe26e59d3",
62+
"metadata": {},
63+
"outputs": [],
64+
"source": [
65+
"import pandas as pd\n",
66+
"\n",
67+
"orders_pandas = pd.DataFrame(sales_data)\n",
68+
"\n",
69+
"(\n",
70+
" orders_pandas.groupby(\n",
71+
" [\n",
72+
" \"region\",\n",
73+
" ]\n",
74+
" )[\"sales_income\"]\n",
75+
" .sum()\n",
76+
" .plot(kind=\"bar\", title=\"Total Sales per Region ($)\", ylabel=\"total_sales\")\n",
77+
")"
78+
]
79+
}
80+
],
81+
"metadata": {
82+
"kernelspec": {
83+
"display_name": "Python 3 (ipykernel)",
84+
"language": "python",
85+
"name": "python3"
86+
},
87+
"language_info": {
88+
"codemirror_mode": {
89+
"name": "ipython",
90+
"version": 3
91+
},
92+
"file_extension": ".py",
93+
"mimetype": "text/x-python",
94+
"name": "python",
95+
"nbconvert_exporter": "python",
96+
"pygments_lexer": "ipython3",
97+
"version": "3.13.7"
98+
}
99+
},
100+
"nbformat": 4,
101+
"nbformat_minor": 5
102+
}

0 commit comments

Comments
 (0)