Skip to content

Commit 57bd323

Browse files
committed
Upload files from a ZIP archive
1 parent cff52e6 commit 57bd323

File tree

8 files changed

+432
-29
lines changed

8 files changed

+432
-29
lines changed

polars-vs-pandas/DataFrame_Plots.ipynb

Lines changed: 207 additions & 0 deletions
Large diffs are not rendered by default.

polars-vs-pandas/README.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Polars vs pandas: What's the Difference?
2+
3+
The materials contained in this folder are designed to complement the Real Python tutorial [Polars vs pandas: What's the Difference?](https://realpython.com/polars-vs-pandas/).
4+
5+
Your download bundle contains the following files:
6+
7+
| File | Description |
8+
|-----------------------------------------|------------------------------------------------------------------------------------------------------------|
9+
| `online_retail.parquet` | This parquet file contains retail data used in some of the queries. |
10+
| `data_generation.py` | This script contains the `generate_data()` function used to generate different quantities of data. |
11+
| `dataframe_and_lazyframe_time_tests.py` | This script performs time tests for DataFrames and a LazyFrame. |
12+
| `streaming_test.py` | This script performs time tests for a LazyFrame with streaming enabled. |
13+
| `dataframe_conversions.py` | This file contains the code used to convert between pandas and Polars DataFrames, plus a Narwhals example. |
14+
| `sample_pandas_and_polars_code.py` | This file contains the code used to illustrate the differences between pandas and Polars syntax. |
15+
| `DataFrame_Plots.ipynb`                 | This Jupyter Notebook file contains the plotting code to demonstrate default plotting capabilities. |
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
import numpy as np
2+
3+
def generate_data(number_of_rows, seed=None):
    """Build a dictionary of synthetic sales columns.

    Args:
        number_of_rows: Number of entries to generate for every column.
        seed: Optional seed forwarded to ``np.random.default_rng()``. The
            default, ``None``, keeps the original behavior of drawing fresh
            random data on every call; pass an integer to get reproducible
            data, for example for comparable benchmark runs.

    Returns:
        A dict mapping column names to equally sized sequences:
        ``order_id`` (a ``range`` starting at 1), ``region``,
        ``sales_person`` and ``product`` (NumPy arrays of randomly chosen
        labels) and ``sales_income`` (random integers from 1 to 5000).
    """
    rng = np.random.default_rng(seed)

    return {
        "order_id": range(1, number_of_rows + 1),
        "region": rng.choice(
            ["North", "South", "East", "West"], size=number_of_rows
        ),
        "sales_person": rng.choice(
            ["Armstrong", "Aldrin", "Collins"], size=number_of_rows
        ),
        "product": rng.choice(
            ["Helmet", "Oxygen", "Boots", "Gloves"], size=number_of_rows
        ),
        # The upper bound of rng.integers() is exclusive: 5001 gives 1..5000.
        "sales_income": rng.integers(1, 5001, size=number_of_rows),
    }
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
import functools
2+
import sys
3+
from timeit import Timer
4+
5+
import pandas as pd
6+
import polars as pl
7+
8+
from data_generation import generate_data
9+
10+
11+
def create_pandas_dataframe(test_data):
    """Return ``test_data`` as a pandas DataFrame using the pyarrow dtype backend."""
    pandas_frame = pd.DataFrame(test_data)
    return pandas_frame.convert_dtypes(dtype_backend="pyarrow")
13+
14+
15+
def create_polars_dataframe(test_data):
    """Return ``test_data`` wrapped in an eager Polars DataFrame."""
    eager_frame = pl.DataFrame(test_data)
    return eager_frame
17+
18+
19+
def create_polars_lazyframe(test_data):
    """Return ``test_data`` wrapped in a Polars LazyFrame."""
    lazy_frame = pl.LazyFrame(test_data)
    return lazy_frame
21+
22+
23+
def analyze_pandas_dataframe(pandas_df):
    """Total ``sales_income`` per region, product and sales person.

    Args:
        pandas_df: pandas DataFrame with ``region``, ``product``,
            ``sales_person`` and ``sales_income`` columns.

    Returns:
        The grouped sums as a Series indexed by
        ``(region, product, sales_person)``. The original computed this
        value but dropped it (returning ``None``), which broke callers that
        subscript the result.
    """
    return pandas_df.groupby(["region", "product", "sales_person"])[
        "sales_income"
    ].sum()
27+
28+
def analyze_polars_dataframe(polars_df):
    """Total ``sales_income`` per region, product and sales person.

    Args:
        polars_df: Polars DataFrame with ``region``, ``product``,
            ``sales_person`` and ``sales_income`` columns.

    Returns:
        The frame produced by ``group_by(...).agg(...)``, with the sums in a
        ``total_sales`` column. The original dropped this value (returning
        ``None``), which broke callers that chain ``.filter()`` on it.
    """
    return polars_df.group_by(["region", "product", "sales_person"]).agg(
        total_sales=pl.col("sales_income").sum()
    )
32+
33+
def analyze_polars_lazyframe(polars_lf):
    """Total ``sales_income`` per region, product and sales person.

    Args:
        polars_lf: Polars LazyFrame with ``region``, ``product``,
            ``sales_person`` and ``sales_income`` columns.

    Returns:
        The result of ``collect()`` on the grouped query, with the sums in a
        ``total_sales`` column. The original dropped this value (returning
        ``None``), which broke callers that chain ``.filter()`` on it.
    """
    return (
        polars_lf.group_by(["region", "product", "sales_person"])
        .agg(total_sales=pl.col("sales_income").sum())
        .collect()
    )
37+
38+
39+
# Benchmark driver: the row count is read from the first command-line argument.
row_count = int(sys.argv[1])
test_data = generate_data(row_count)

# Time 100 constructions of each frame type from the same generated data.
for label, factory in (
    ("Pandas dataframe", create_pandas_dataframe),
    ("Polars dataframe", create_polars_dataframe),
    ("Polars lazyframe", create_polars_lazyframe),
):
    print(f"\n{label} creation time for {row_count} rows:")
    print(Timer(functools.partial(factory, test_data)).timeit(100))

print()

# Build each frame once so the analysis timings exclude construction cost.
pandas_df = create_pandas_dataframe(test_data)
polars_df = create_polars_dataframe(test_data)
polars_lf = create_polars_lazyframe(test_data)

pandas_timer = Timer(functools.partial(analyze_pandas_dataframe, pandas_df))
print(f"Pandas dataframe analysis time for {row_count} rows:")
print(pandas_timer.timeit(100))

print()
polars_df_timer = Timer(
    functools.partial(analyze_polars_dataframe, polars_df)
)
print(f"Polars dataframe analysis time for {row_count} rows:")
print(polars_df_timer.timeit(100))

print()
polars_lf_timer = Timer(
    functools.partial(analyze_polars_lazyframe, polars_lf)
)
print(f"Polars lazyframe analysis time for {row_count} rows:")
print(polars_lf_timer.timeit(100))

# NOTE: the lookups below rely on the analyze_* helpers returning their
# aggregated result; they subscript or filter whatever those calls return.
print()
print("\nShow Boots sales in the East region for pandas DataFrame")
pandas_totals = analyze_pandas_dataframe(pandas_df)
print(pandas_totals["East"]["Boots"])

print("\nShow Boots sales in the East region for Polars DataFrame")
polars_df_totals = analyze_polars_dataframe(polars_df)
print(
    polars_df_totals.filter(
        pl.col("region") == "East",
        pl.col("product") == "Boots",
    )
)

print("\nShow Boots sales in the East region for Polars LazyFrame")
polars_lf_totals = analyze_polars_lazyframe(polars_lf)
print(
    polars_lf_totals.filter(
        pl.col("region") == "East",
        pl.col("product") == "Boots",
    )
)
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,29 @@
1-
import narwhals as nw
2-
import polars as pl
3-
from data_generation import generate_data
4-
5-
polars_df = pl.DataFrame(generate_data(4))
6-
polars_df
7-
8-
pandas_df = polars_df.to_pandas()
9-
type(pandas_df)
10-
pandas_df
11-
12-
polars_df = pl.from_pandas(pandas_df)
13-
type(polars_df)
14-
polars_df
15-
16-
17-
def universal_groupby(df):
18-
return (
19-
nw.from_native(df)
20-
.group_by("region")
21-
.agg(nw.col("sales_income").sum())
22-
.sort("region")
23-
.to_native()
24-
)
25-
26-
27-
universal_groupby(pandas_df)
28-
29-
universal_groupby(polars_df)
1+
import narwhals as nw
2+
import polars as pl
3+
from data_generation import generate_data
4+
5+
# Build a four-row Polars DataFrame from the generated sample data.
# The helper imported at the top of this file is generate_data; the previous
# call used the module name (data_generation) and raised NameError.
polars_df = pl.DataFrame(generate_data(4))
polars_df

# Polars -> pandas. The bare expressions only display a value when the code
# is run interactively (REPL or notebook).
pandas_df = polars_df.to_pandas()
type(pandas_df)
pandas_df

# pandas -> Polars.
polars_df = pl.from_pandas(pandas_df)
type(polars_df)
polars_df
15+
16+
17+
def universal_groupby(df):
    """Sum ``sales_income`` per ``region`` for a frame Narwhals can wrap.

    The frame is wrapped with ``nw.from_native()``, grouped and aggregated,
    sorted by ``region`` and handed back through ``to_native()``.
    """
    narwhals_frame = nw.from_native(df)
    totals_by_region = narwhals_frame.group_by("region").agg(
        nw.col("sales_income").sum()
    )
    return totals_by_region.sort("region").to_native()


# Run the same function against the pandas frame and the Polars frame.
universal_groupby(pandas_df)

universal_groupby(polars_df)
3.49 MB
Binary file not shown.
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import pandas as pd
2+
import polars as pl
3+
4+
5+
# Load the retail data into a pandas DataFrame.
orders_pandas = pd.read_parquet("online_retail.parquet")
6+
7+
# pandas, bracket style (step 1): add a "Total" column in place as
# Quantity * UnitPrice.
orders_pandas["Total"] = (
8+
orders_pandas["Quantity"] * orders_pandas["UnitPrice"]
9+
)
10+
11+
# pandas, bracket style (step 2): pick four columns, keep the rows whose
# Total exceeds 100 via a boolean mask, and show the first three.
orders_pandas[["InvoiceNo", "Quantity", "UnitPrice", "Total"]][
12+
orders_pandas["Total"] > 100
13+
].head(3)
14+
15+
16+
# pandas, method-chaining style: the same steps written as
# assign() -> filter() -> query(), followed by head(3).
(
17+
orders_pandas
18+
.assign(Total=orders_pandas["Quantity"] * orders_pandas["UnitPrice"])
19+
.filter(["InvoiceNo", "Quantity", "UnitPrice", "Total"])
20+
.query("Total > 100")
21+
).head(3)
22+
23+
24+
# Load the same Parquet file into a Polars DataFrame.
orders_polars = pl.read_parquet("online_retail.parquet")
25+
26+
# Polars: select the three columns plus a computed Total expression, keep the
# rows whose Total exceeds 100, and show the first three.
(
27+
orders_polars.select(
28+
pl.col(["InvoiceNo", "Quantity", "UnitPrice"]),
29+
Total=pl.col("Quantity") * pl.col("UnitPrice"),
30+
).filter(pl.col("Total") > 100)
31+
).head(3)
32+

polars-vs-pandas/streaming_test.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import functools
2+
import sys
3+
from timeit import Timer
4+
5+
import polars as pl
6+
7+
from data_generation import generate_data
8+
9+
def create_polars_lazyframe(test_data):
    """Return ``test_data`` wrapped in a Polars LazyFrame."""
    lazy_frame = pl.LazyFrame(test_data)
    return lazy_frame
11+
12+
def analyze_polars_lazyframe(polars_lf):
    """Run the grouped ``sales_income`` sum with a plain ``collect()``.

    The collected result is discarded; the function exists to be timed.
    """
    grouping_columns = ["region", "product", "sales_person"]
    grouped_query = polars_lf.group_by(grouping_columns).agg(
        total_sales=pl.col("sales_income").sum()
    )
    grouped_query.collect()
16+
17+
def analyze_polars_streaming(polars_lf):
    """Run the grouped ``sales_income`` sum with ``collect(engine="streaming")``.

    The collected result is discarded; the function exists to be timed.
    """
    grouping_columns = ["region", "product", "sales_person"]
    grouped_query = polars_lf.group_by(grouping_columns).agg(
        total_sales=pl.col("sales_income").sum()
    )
    grouped_query.collect(engine="streaming")
21+
22+
# Benchmark driver: the row count is read from the first command-line argument.
row_count = int(sys.argv[1])
test_data = generate_data(row_count)

polars_lf = create_polars_lazyframe(test_data)

# Time 100 runs of the same query, first with the default collect() ...
default_timer = Timer(functools.partial(analyze_polars_lazyframe, polars_lf))
print(f"Polars lazyframe analysis time for {row_count} rows:")
print(default_timer.timeit(100))

# ... then with collect(engine="streaming").
streaming_timer = Timer(
    functools.partial(analyze_polars_streaming, polars_lf)
)
print(f"\nPolars streaming analysis time for {row_count} rows:")
print(streaming_timer.timeit(100))

0 commit comments

Comments
 (0)