Skip to content

Commit 9ab1006

Browse files
Update materials post SG update (pre-DR)
1 parent 6b97d8c commit 9ab1006

9 files changed

+188
-284
lines changed

polars-vs-pandas/DataFrame_Plots.ipynb

Lines changed: 77 additions & 80 deletions
Large diffs are not rendered by default.

polars-vs-pandas/code_speed_test.py

Lines changed: 0 additions & 44 deletions
This file was deleted.
Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,17 @@
11
import numpy as np
22

33

4-
def data_generation(number_of_rows):
4+
def generate_data(number_of_rows, seed=None):
    """Build a dictionary of randomly generated sales-order columns.

    Args:
        number_of_rows: Number of rows (values per column) to generate.
        seed: Optional seed forwarded to ``np.random.default_rng``. Leave it
            as ``None`` for fresh, unpredictable data on every call, or pass
            a fixed value to make benchmark inputs reproducible between runs.

    Returns:
        A dict mapping column names to equally sized columns:
        ``order_id`` (a ``range`` running from 1 to ``number_of_rows``),
        ``region``, ``sales_person`` and ``product`` (NumPy arrays of
        strings drawn from fixed choices) and ``sales_income`` (a NumPy
        array of integers from 1 to 5000 inclusive).
    """
    rng = np.random.default_rng(seed)

    # The columns are drawn in a fixed order so that a given seed always
    # produces the same dataset.
    return {
        "order_id": range(1, number_of_rows + 1),
        "region": rng.choice(
            ["North", "South", "East", "West"], size=number_of_rows
        ),
        "sales_person": rng.choice(
            ["Armstrong", "Aldrin", "Collins"], size=number_of_rows
        ),
        "product": rng.choice(
            ["Helmet", "Oxygen", "Boots", "Gloves"], size=number_of_rows
        ),
        # The upper bound of rng.integers is exclusive, so values top out at 5000.
        "sales_income": rng.integers(1, 5001, size=number_of_rows),
    }

polars-vs-pandas/dataframe_and_lazyframe_time_tests.py

Lines changed: 58 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1,82 +1,88 @@
1+
import functools
12
import sys
2-
import time
3+
from timeit import Timer
34

45
import pandas as pd
56
import polars as pl
6-
from data_generation import data_generation
77

8-
# Data Generation
8+
from data_generation import generate_data
99

10-
test_data = data_generation(int(sys.argv[1]))
1110

12-
# Polars DataFrame Test
11+
def create_pandas_dataframe(test_data):
12+
return pd.DataFrame(test_data).convert_dtypes(dtype_backend="pyarrow")
1313

14-
overall_time_start = time.time()
1514

16-
polars_dataframe = pl.DataFrame(test_data)
15+
def create_polars_dataframe(test_data):
    """Build an eager Polars DataFrame from ``test_data``.

    Args:
        test_data: Column data accepted by the ``pl.DataFrame`` constructor.

    Returns:
        The constructed ``pl.DataFrame``.
    """
    eager_frame = pl.DataFrame(test_data)
    return eager_frame
1717

18-
processing_time_start = time.time()
1918

20-
(
21-
polars_dataframe.group_by(["region", "product", "sales_person"]).agg(
22-
total_sales=pl.col("sales_income").sum()
23-
)
24-
)
25-
26-
end_time = time.time()
27-
28-
del polars_dataframe
29-
30-
print(
31-
f"Polars DataFrame creation: {processing_time_start - overall_time_start}"
32-
)
33-
print(f"Polars DataFrame query runtime: {end_time - processing_time_start}")
34-
print(f"Polars DataFrame overall time: {end_time - overall_time_start}")
35-
print()
36-
37-
# Polars LazyFrame Test
19+
def create_polars_lazyframe(test_data):
    """Build a Polars LazyFrame from ``test_data``.

    Args:
        test_data: Column data accepted by the ``pl.LazyFrame`` constructor.

    Returns:
        The constructed ``pl.LazyFrame``.
    """
    lazy_frame = pl.LazyFrame(test_data)
    return lazy_frame
3821

39-
overall_time_start = time.time()
4022

41-
polars_lazyframe = pl.LazyFrame(test_data)
23+
def analyze_pandas_dataframe(pandas_df):
    """Sum ``sales_income`` per region / product / sales person with pandas.

    Args:
        pandas_df: DataFrame containing the columns ``region``, ``product``,
            ``sales_person`` and ``sales_income``.

    Returns:
        The summed ``sales_income`` for each
        (region, product, sales_person) group.
    """
    group_keys = ["region", "product", "sales_person"]
    grouped = pandas_df.groupby(group_keys)
    return grouped["sales_income"].sum()
4227

43-
processing_time_start = time.time()
4428

45-
(
46-
polars_lazyframe.group_by(["region", "product", "sales_person"]).agg(
29+
def analyze_polars_dataframe(polars_df):
    """Sum ``sales_income`` per region / product / sales person with Polars.

    Args:
        polars_df: Eager frame containing the columns ``region``,
            ``product``, ``sales_person`` and ``sales_income``.

    Returns:
        The aggregated frame, with the per-group sum in a ``total_sales``
        column.
    """
    group_keys = ["region", "product", "sales_person"]
    grouped = polars_df.group_by(group_keys)
    return grouped.agg(total_sales=pl.col("sales_income").sum())
49-
).collect()
5033

51-
end_time = time.time()
5234

53-
del polars_lazyframe
35+
def analyze_polars_lazyframe(polars_lf):
    """Sum ``sales_income`` per region / product / sales person on a LazyFrame.

    Args:
        polars_lf: Lazy frame containing the columns ``region``, ``product``,
            ``sales_person`` and ``sales_income``.

    Returns:
        The collected result of the query, with the per-group sum in a
        ``total_sales`` column.
    """
    group_keys = ["region", "product", "sales_person"]
    total_sales = pl.col("sales_income").sum()
    lazy_query = polars_lf.group_by(group_keys).agg(total_sales=total_sales)
    # collect() is what actually runs the lazily built query.
    return lazy_query.collect()
5441

55-
print(
56-
f"Polars LazyFrame creation: {processing_time_start - overall_time_start}"
57-
)
58-
print(f"Polars LazyFrame query runtime: {end_time - processing_time_start}")
59-
print(f"Polars LazyFrame overall time: {end_time - overall_time_start}")
60-
print()
6142

62-
# Pandas DataFrame Test
43+
# Benchmark driver: times DataFrame creation and the group-by analysis for
# pandas, Polars DataFrames and Polars LazyFrames, then prints one sample
# result per library.

# Parse the requested row count once rather than re-reading sys.argv for
# every label.
row_count = int(sys.argv[1])

# Number of calls per measurement; Timer.timeit() reports the total number
# of seconds taken by all of them.
repetitions = 100

test_data = generate_data(row_count)

print("Creating Dataframes...")
print(f"Pandas dataframe creation time for {row_count} rows:")
print(
    Timer(functools.partial(create_pandas_dataframe, test_data)).timeit(
        repetitions
    )
)
print(f"\nPolars dataframe creation time for {row_count} rows:")
print(
    Timer(functools.partial(create_polars_dataframe, test_data)).timeit(
        repetitions
    )
)
print(f"\nPolars lazyframe creation time for {row_count} rows:")
print(
    Timer(functools.partial(create_polars_lazyframe, test_data)).timeit(
        repetitions
    )
)

# Build each frame once so the analysis timings below exclude creation cost.
pandas_df = create_pandas_dataframe(test_data)
polars_df = create_polars_dataframe(test_data)
polars_lf = create_polars_lazyframe(test_data)

print("\nAnalyzing Dataframes...")
print(f"Pandas dataframe analysis time for {row_count} rows:")
print(
    Timer(functools.partial(analyze_pandas_dataframe, pandas_df)).timeit(
        repetitions
    )
)

print(f"\nPolars dataframe analysis time for {row_count} rows:")
print(
    Timer(functools.partial(analyze_polars_dataframe, polars_df)).timeit(
        repetitions
    )
)

print(f"\nPolars lazyframe analysis time for {row_count} rows:")
print(
    Timer(functools.partial(analyze_polars_lazyframe, polars_lf)).timeit(
        repetitions
    )
)

print("\nShow Boots sales in the East region for pandas DataFrame")
print(analyze_pandas_dataframe(pandas_df)["East"]["Boots"])

print("\nShow Boots sales in the East region for polars DataFrame")
print(
    analyze_polars_dataframe(polars_df).filter(
        pl.col("region") == "East",
        pl.col("product") == "Boots",
    )
)

# This section runs the Polars LazyFrame query, so the heading names Polars
# (it previously said "pandas LazyFrame").
print("\nShow Boots sales in the East region for polars LazyFrame")
print(
    analyze_polars_lazyframe(polars_lf).filter(
        pl.col("region") == "East",
        pl.col("product") == "Boots",
    )
)
81-
print(f"Pandas DataFrame query runtime: {end_time - processing_time_start}")
82-
print(f"Pandas DataFrame overall time: {end_time - overall_time_start}")

polars-vs-pandas/dataframe_and_lazyframe_time_tests_v2.py

Lines changed: 0 additions & 66 deletions
This file was deleted.

polars-vs-pandas/dataframe_conversions.py

Lines changed: 0 additions & 31 deletions
This file was deleted.

polars-vs-pandas/git_issue.txt

-968 KB
Binary file not shown.
Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,30 @@
11
import pandas as pd
22
import polars as pl
33

4-
orders_pandas = pd.read_parquet("Online_Retail.parquet")
4+
# Pandas index-based syntax: add a Total column, then select columns and
# rows with square-bracket indexing.
orders_pandas = pd.read_parquet("online_retail.parquet")

orders_pandas["Total"] = orders_pandas["Quantity"] * orders_pandas["UnitPrice"]

report_columns = ["InvoiceNo", "Quantity", "UnitPrice", "Total"]
is_large_order = orders_pandas["Total"] > 100
orders_pandas[report_columns][is_large_order].head(3)

# Pandas method chaining syntax: the same query as a single chained
# expression on a freshly loaded frame.
orders_pandas = pd.read_parquet("online_retail.parquet")

(
    orders_pandas.assign(
        Total=orders_pandas["Quantity"] * orders_pandas["UnitPrice"]
    )
    .filter(["InvoiceNo", "Quantity", "UnitPrice", "Total"])
    .query("Total > 100")
    .head(3)
)

# Polars method chaining syntax: select the columns plus a computed
# lowercase "total" column, then filter on it.
orders_polars = pl.read_parquet("online_retail.parquet")

line_total = pl.col("Quantity") * pl.col("UnitPrice")
(
    orders_polars.select(
        pl.col(["InvoiceNo", "Quantity", "UnitPrice"]),
        total=line_total,
    )
    .filter(pl.col("total") > 100)
    .head(3)
)

polars-vs-pandas/streaming_test.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import functools
2+
import sys
3+
from timeit import Timer
4+
5+
import polars as pl
6+
7+
from data_generation import generate_data
8+
9+
10+
def create_polars_lazyframe(test_data):
    """Build a Polars LazyFrame from ``test_data``.

    Args:
        test_data: Column data accepted by the ``pl.LazyFrame`` constructor.

    Returns:
        The constructed ``pl.LazyFrame``.
    """
    lazy_frame = pl.LazyFrame(test_data)
    return lazy_frame
12+
13+
14+
def analyze_polars_lazyframe(polars_lf):
    """Sum ``sales_income`` per region / product / sales person on a LazyFrame.

    Args:
        polars_lf: Lazy frame containing the columns ``region``, ``product``,
            ``sales_person`` and ``sales_income``.

    Returns:
        The collected result of the query, with the per-group sum in a
        ``total_sales`` column. Returning it (rather than discarding it)
        lets callers inspect the output; timing callers can ignore it.
    """
    return (
        polars_lf.group_by(["region", "product", "sales_person"])
        .agg(total_sales=pl.col("sales_income").sum())
        .collect()
    )
18+
19+
20+
def analyze_polars_streaming(polars_lf):
    """Run the same group-by aggregation, collected with the streaming engine.

    Args:
        polars_lf: Lazy frame containing the columns ``region``, ``product``,
            ``sales_person`` and ``sales_income``.

    Returns:
        The collected result of the query, with the per-group sum in a
        ``total_sales`` column. Returning it (rather than discarding it)
        lets callers inspect the output; timing callers can ignore it.
    """
    return (
        polars_lf.group_by(["region", "product", "sales_person"])
        .agg(total_sales=pl.col("sales_income").sum())
        # engine="streaming" is the only difference from the default collect().
        .collect(engine="streaming")
    )
24+
25+
26+
# Benchmark driver: times the default and streaming collection of the same
# LazyFrame query.
row_count = int(sys.argv[1])
test_data = generate_data(row_count)

polars_lf = create_polars_lazyframe(test_data)

default_timer = Timer(functools.partial(analyze_polars_lazyframe, polars_lf))
print(f"Polars lazyframe analysis time for {row_count} rows:")
print(default_timer.timeit(100))

streaming_timer = Timer(functools.partial(analyze_polars_streaming, polars_lf))
print(f"\nPolars streaming analysis time for {row_count} rows:")
print(streaming_timer.timeit(100))

0 commit comments

Comments
 (0)