Skip to content

Commit d50dcef

Browse files
committed
Post TR1 Commit
1 parent 67525c7 commit d50dcef

File tree

8 files changed

+432
-0
lines changed

8 files changed

+432
-0
lines changed

polars_v_pandas/DataFrame_Plots.ipynb

Lines changed: 207 additions & 0 deletions
Large diffs are not rendered by default.

polars_v_pandas/README.md

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
The materials contained in this download are designed to complement the Real Python tutorial [Polars vs pandas: What's the Difference?](https://realpython.com/polars-vs-pandas-difference/).
2+
3+
You should create a new folder named polars\_v\_pandas on your computer and place each of these files inside it. You may also consider creating a [Python virtual environment](https://realpython.com/python-virtual-environments-a-primer/) within this folder.
4+
5+
Your download bundle contains the following files:
6+
7+
8+
9+
online\_retail.parquet - This Parquet file contains retail data used in some of the queries.
10+
11+
data\_generation.py - This script contains the data\_generation() function used to generate different quantities of data.
12+
13+
code\_speed\_test.py - This script performs time tests for pandas and Polars DataFrames.
14+
15+
dataframe\_and\_lazyframe\_time\_tests.py - This script performs time tests for DataFrames and a LazyFrame.
16+
17+
streaming\_test.py - This script performs time tests for a LazyFrame with streaming enabled.
18+
19+
20+
21+
dataframe\_conversions.py - This file contains the code used to convert between pandas and Polars DataFrames, plus a Narwhals example.
22+
23+
sample\_pandas\_and\_polars\_code.py - This file contains the code used to illustrate the differences between pandas and Polars syntax.
24+
25+
DataFrame\_Plots.ipynb - This Jupyter Notebook file contains the plotting code to demonstrate default plotting capabilities.
26+

polars_v_pandas/data_generation.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
import numpy as np
2+
3+
4+
def data_generation(number_of_rows, seed=None):
    """Generate random sales data as a dictionary of equal-length columns.

    Args:
        number_of_rows: Number of rows to generate for every column.
        seed: Optional seed forwarded to ``numpy.random.default_rng``.
            The default, ``None``, keeps the original behavior of drawing
            fresh data on every call; passing a value makes the generated
            data reproducible between runs.

    Returns:
        A dict with the keys ``order_id`` (a ``range`` running from 1 to
        ``number_of_rows``), ``region``, ``sales_person`` and ``product``
        (NumPy arrays of randomly chosen labels), and ``sales_income``
        (a NumPy array of random integers from 1 to 5000 inclusive).
    """
    rng = np.random.default_rng(seed)

    return {
        "order_id": range(1, number_of_rows + 1),
        "region": rng.choice(
            ["North", "South", "East", "West"], size=number_of_rows
        ),
        "sales_person": rng.choice(
            ["Armstrong", "Aldrin", "Collins"], size=number_of_rows
        ),
        "product": rng.choice(
            ["Helmet", "Oxygen", "Boots", "Gloves"], size=number_of_rows
        ),
        # The upper bound of ``integers`` is exclusive, so values are 1..5000.
        "sales_income": rng.integers(1, 5001, size=number_of_rows),
    }
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
import functools
2+
import sys
3+
from timeit import Timer
4+
5+
import pandas as pd
6+
import polars as pl
7+
8+
from data_generation import data_generation
9+
10+
11+
def create_pandas_dataframe(test_data):
    """Build a pandas DataFrame from ``test_data`` using PyArrow-backed dtypes."""
    plain_frame = pd.DataFrame(test_data)
    arrow_frame = plain_frame.convert_dtypes(dtype_backend="pyarrow")
    return arrow_frame
13+
14+
15+
def create_polars_dataframe(test_data):
    """Build an eager Polars DataFrame from ``test_data``."""
    polars_frame = pl.DataFrame(test_data)
    return polars_frame
17+
18+
19+
def create_polars_lazyframe(test_data):
    """Build a Polars LazyFrame from ``test_data``."""
    lazy_frame = pl.LazyFrame(test_data)
    return lazy_frame
21+
22+
23+
def analyze_pandas_dataframe(pandas_df):
    """Sum ``sales_income`` for each region/product/sales person group.

    The aggregated result is discarded on purpose: the function is only
    called so that its run time can be measured.
    """
    grouping_keys = ["region", "product", "sales_person"]
    grouped = pandas_df.groupby(grouping_keys)
    grouped["sales_income"].sum()
27+
28+
29+
def analyze_polars_dataframe(polars_df):
    """Sum ``sales_income`` for each region/product/sales person group.

    The aggregated frame is discarded on purpose: the function is only
    called so that its run time can be measured.
    """
    grouping_keys = ["region", "product", "sales_person"]
    income_sum = pl.col("sales_income").sum()
    polars_df.group_by(grouping_keys).agg(total_sales=income_sum)
33+
34+
35+
def analyze_polars_lazyframe(polars_lf):
    """Build the grouped ``sales_income`` sum lazily, then collect it.

    The collected result is discarded on purpose: the function is only
    called so that its run time can be measured.
    """
    grouping_keys = ["region", "product", "sales_person"]
    income_sum = pl.col("sales_income").sum()
    lazy_query = polars_lf.group_by(grouping_keys).agg(total_sales=income_sum)
    lazy_query.collect()
39+
40+
41+
# Fail with a usage hint rather than a bare IndexError when the row count
# is missing from the command line.
if len(sys.argv) < 2:
    sys.exit(f"Usage: python {sys.argv[0]} <number_of_rows>")

# Parse the row count once and reuse it for data generation and every label.
number_of_rows = int(sys.argv[1])
test_data = data_generation(number_of_rows)

# --- Creation timings: each constructor is called 100 times. ---
print(f"Pandas dataframe creation time for {number_of_rows} rows:")
print(Timer(functools.partial(create_pandas_dataframe, test_data)).timeit(100))
print()
print(f"Polars dataframe creation time for {number_of_rows} rows:")
print(Timer(functools.partial(create_polars_dataframe, test_data)).timeit(100))
print()
print(f"Polars lazyframe creation time for {number_of_rows} rows:")
print(Timer(functools.partial(create_polars_lazyframe, test_data)).timeit(100))

print()

# Build each frame once, outside the timers, so the analysis timings below
# measure only the group-by work.
pandas_df = create_pandas_dataframe(test_data)
polars_df = create_polars_dataframe(test_data)
polars_lf = create_polars_lazyframe(test_data)

# --- Analysis timings: each aggregation is called 100 times. ---
print(f"Pandas dataframe analysis time for {number_of_rows} rows:")
print(
    Timer(functools.partial(analyze_pandas_dataframe, pandas_df)).timeit(100)
)

print()
print(f"Polars dataframe analysis time for {number_of_rows} rows:")
print(
    Timer(functools.partial(analyze_polars_dataframe, polars_df)).timeit(100)
)

print()
print(f"Polars lazyframe analysis time for {number_of_rows} rows:")
print(
    Timer(functools.partial(analyze_polars_lazyframe, polars_lf)).timeit(100)
)
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import narwhals as nw
2+
import polars as pl
3+
4+
from data_generation import data_generation
5+
6+
# NOTE: the bare expressions below (``polars_df``, ``type(pandas_df)``, ...)
# only display a value in an interactive session such as a REPL or notebook;
# when the file is run as a script they are evaluated and discarded.

# Build a small Polars DataFrame from four rows of generated sales data.
polars_df = pl.DataFrame(data_generation(4))
polars_df

# Convert Polars -> pandas.
pandas_df = polars_df.to_pandas()
type(pandas_df)
pandas_df

# Convert pandas -> Polars.
polars_df = pl.from_pandas(pandas_df)
type(polars_df)
polars_df


def agnostic_groupby(df):
    """Total ``sales_income`` per ``region`` through the Narwhals API.

    ``df`` is wrapped with ``nw.from_native``, grouped by ``region``,
    aggregated with a sum over ``sales_income``, sorted by ``region`` and
    finally handed back through ``to_native()``.
    """
    return (
        nw.from_native(df)
        .group_by("region")
        .agg(nw.col("sales_income").sum())
        .sort("region")
        .to_native()
    )


# Backward-compatible alias for the original, misspelled function name.
agnositic_groupby = agnostic_groupby

# The same function is applied to both the pandas and the Polars frame.
agnostic_groupby(pandas_df)

agnostic_groupby(polars_df)
31+
3.49 MB
Binary file not shown.
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
import pandas as pd
2+
import polars as pl
3+
4+
# Side-by-side examples of pandas and Polars syntax on the retail data set.
# The bare expressions only display a value in an interactive session (REPL
# or notebook); run as a script they are evaluated and discarded.

# Load the retail orders into a pandas DataFrame.
orders_pandas = pd.read_parquet("online_retail.parquet")

# pandas, bracket style: add a "Total" column in place ...
orders_pandas["Total"] = orders_pandas["Quantity"] * orders_pandas["UnitPrice"]

# ... then select four columns, keep rows with Total > 10, and take the
# first three rows.
orders_pandas[["InvoiceNo", "Quantity", "UnitPrice", "Total"]][
    orders_pandas["Total"] > 10
].head(3)


# pandas, method-chaining style: assign -> filter (column selection) -> query.
# NOTE(review): this example filters on Total > 100 while the bracket-style
# and Polars examples use 10 - confirm the difference is intended.
(
    orders_pandas
    .assign(Total=orders_pandas["Quantity"] * orders_pandas["UnitPrice"])
    .filter(["InvoiceNo", "Quantity", "UnitPrice", "Total"])
    .query("Total > 100")
).head(3)


# The same chain with the row filter disabled, so the first three rows are
# shown regardless of their Total.
(
    orders_pandas
    .assign(Total=orders_pandas["Quantity"] * orders_pandas["UnitPrice"])
    .filter(["InvoiceNo", "Quantity", "UnitPrice", "Total"])
    # .query("Total > 100")
).head(3)


# Load the same file into a Polars DataFrame.
orders_polars = pl.read_parquet("online_retail.parquet")

# Polars: select three columns plus a computed "total" column, keep rows
# with total > 10, and take the first three rows.
(
    orders_polars.select(
        pl.col(["InvoiceNo", "Quantity", "UnitPrice"]),
        total=pl.col("Quantity") * pl.col("UnitPrice"),
    ).filter(pl.col("total") > 10)
).head(3)

polars_v_pandas/streaming_test.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
import functools
2+
import sys
3+
from timeit import Timer
4+
5+
import pandas as pd
6+
import polars as pl
7+
8+
from data_generation import data_generation
9+
10+
11+
def create_polars_lazyframe(test_data):
    """Build a Polars LazyFrame from ``test_data``."""
    lazy_frame = pl.LazyFrame(test_data)
    return lazy_frame
13+
14+
15+
def analyze_polars_lazyframe(polars_lf):
    """Build the grouped ``sales_income`` sum lazily, then collect it.

    The collected result is discarded on purpose: the function is only
    called so that its run time can be measured.
    """
    grouping_keys = ["region", "product", "sales_person"]
    income_sum = pl.col("sales_income").sum()
    lazy_query = polars_lf.group_by(grouping_keys).agg(total_sales=income_sum)
    lazy_query.collect()
19+
20+
21+
def analyze_polars_streaming(polars_lf):
    """Build the grouped ``sales_income`` sum lazily, then collect it with
    ``engine="streaming"``.

    The collected result is discarded on purpose: the function is only
    called so that its run time can be measured.
    """
    grouping_keys = ["region", "product", "sales_person"]
    income_sum = pl.col("sales_income").sum()
    lazy_query = polars_lf.group_by(grouping_keys).agg(total_sales=income_sum)
    lazy_query.collect(engine="streaming")
25+
26+
27+
# Fail with a usage hint rather than a bare IndexError when the row count
# is missing from the command line.
if len(sys.argv) < 2:
    sys.exit(f"Usage: python {sys.argv[0]} <number_of_rows>")

# Parse the row count once and reuse it for data generation and both labels.
number_of_rows = int(sys.argv[1])
test_data = data_generation(number_of_rows)

# Build the LazyFrame once, outside the timers, so only the query execution
# is measured.
polars_lf = create_polars_lazyframe(test_data)

print()
print(f"Polars lazyframe analysis time for {number_of_rows} rows:")
print(
    Timer(functools.partial(analyze_polars_lazyframe, polars_lf)).timeit(100)
)

print(f"Polars streaming analysis time for {number_of_rows} rows:")
print(
    Timer(functools.partial(analyze_polars_streaming, polars_lf)).timeit(100)
)

0 commit comments

Comments
 (0)