Skip to content

Commit 77a3b7f

Browse files
committed
Initial Commit TR1
1 parent 757644a commit 77a3b7f

10 files changed

+495
-0
lines changed

polars_vs_pandas/DataFrame_Plots.ipynb

Lines changed: 207 additions & 0 deletions
Large diffs are not rendered by default.
3.49 MB
Binary file not shown.

polars_vs_pandas/README.md

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
The materials contained in this download are designed to complement the Real Python tutorial [Polars vs pandas - What's the Difference](https://realpython.com/polars-vs-pandas-difference/).
2+
3+
You should create a new folder named polars_vs_pandas on your computer and place each of these files inside it. You may also consider creating a [Python virtual environment](https://realpython.com/python-virtual-environments-a-primer/) within this folder.
4+
5+
Your download bundle contains the following files:
6+
7+
Online_Retail.parquet - This parquet file contains retail data used in some of the queries.
8+
9+
data_generation.py - This script contains the data_generation() function used to generate different quantities of data.
10+
11+
code_speed_test.py - This script performs time tests for pandas and Polars DataFrames.
12+
13+
dataframe_and_lazyframe_time_tests.py - This script performs time tests for DataFrames and a LazyFrame.
14+
15+
dataframe_and_lazyframe_time_tests_v2.py - This script performs time tests for a LazyFrame with streaming enabled.
16+
17+
18+
dataframe_conversions.py - This file contains the code used to convert between pandas and Polars DataFrames, plus a Narwhals example.
19+
20+
sample_pandas_and_polars_code.py - This file contains the code used to illustrate the differences between pandas and Polars syntax.
21+
22+
DataFrame_Plots.ipynb - This Jupyter Notebook file contains the plotting code to demonstrate default plotting capabilities.
23+
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
import sys
2+
import time
3+
4+
import pandas as pd
5+
import polars as pl
6+
from data_generation import data_generation
7+
8+
# Create DataFrames
#
# The number of rows to generate is read from the first command-line
# argument, so the script must be started with an integer row count.

data_source = data_generation(int(sys.argv[1]))

orders_pandas = pd.DataFrame(data_source).convert_dtypes(
    dtype_backend="pyarrow"
)

orders_polars = pl.DataFrame(data_source)

# pandas DataFrame Test
#
# time.perf_counter() is used for all measurements: it is monotonic and has
# the highest available resolution, whereas time.time() follows the system
# wall clock and can jump if that clock is adjusted mid-run.

start_time = time.perf_counter()

orders_pandas.groupby("region")["sales_income"].sum()

end_time = time.perf_counter()

print(f"pandas Time Taken: {end_time-start_time}.")

# Polars DataFrame Test

start_time = time.perf_counter()

(
    orders_polars.group_by("region").agg(
        total=pl.col("sales_income").sum(),
    )
)

end_time = time.perf_counter()

# Results

print(f"polars Time Taken: {end_time-start_time}.")

print(f"----- For {sys.argv[1]} rows")
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
import numpy as np
2+
3+
4+
def data_generation(number_of_rows):
    """Build a dictionary of random order data with ``number_of_rows`` rows.

    The returned dictionary maps column names to column values:

    * ``order_id`` - ``range`` counting from 1 up to ``number_of_rows``.
    * ``region``, ``sales_person``, ``product`` - NumPy arrays of labels
      drawn at random from a fixed list of choices.
    * ``sales_income`` - NumPy array of random integers from 1 to 5000
      inclusive.
    """
    regions = ["North", "South", "East", "West"]
    sales_people = ["Armstrong", "Aldrin", "Collins"]
    products = ["Helmet", "Oxygen", "Boots", "Gloves"]

    # The columns are drawn in the same order as they appear in the result
    # so that a seeded NumPy global generator yields the same data.
    orders = {"order_id": range(1, number_of_rows + 1)}
    orders["region"] = np.random.choice(regions, size=number_of_rows)
    orders["sales_person"] = np.random.choice(
        sales_people, size=number_of_rows
    )
    orders["product"] = np.random.choice(products, size=number_of_rows)
    orders["sales_income"] = np.random.randint(1, 5001, size=number_of_rows)
    return orders
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
import sys
2+
import time
3+
4+
import pandas as pd
5+
import polars as pl
6+
7+
from data_generation import data_generation
8+
9+
# Data Generation
#
# The number of rows to generate is read from the first command-line
# argument. The same generated data is reused by every test below.

test_data = data_generation(int(sys.argv[1]))

# All timings use time.perf_counter(): it is monotonic and has the highest
# available resolution, whereas time.time() follows the system wall clock
# and can jump if that clock is adjusted while a test is running.

# Polars DataFrame Test

overall_time_start = time.perf_counter()

polars_dataframe = pl.DataFrame(test_data)

processing_time_start = time.perf_counter()

(
    polars_dataframe.group_by(["region", "product", "sales_person"]).agg(
        total_sales=pl.col("sales_income").sum()
    )
)

end_time = time.perf_counter()

# Drop the frame before the next test builds its own copy of the data.
del polars_dataframe

print(
    f"Polars DataFrame creation: {processing_time_start - overall_time_start}"
)
print(f"Polars DataFrame query runtime: {end_time - processing_time_start}")
print(f"Polars DataFrame overall time: {end_time - overall_time_start}")
print()

# Polars LazyFrame Test

overall_time_start = time.perf_counter()

polars_lazyframe = pl.LazyFrame(test_data)

processing_time_start = time.perf_counter()

# The lazy query only executes when .collect() is called, so the call is
# kept inside the timed region.
(
    polars_lazyframe.group_by(["region", "product", "sales_person"]).agg(
        total_sales=pl.col("sales_income").sum()
    )
).collect()

end_time = time.perf_counter()

del polars_lazyframe

print(
    f"Polars LazyFrame creation: {processing_time_start - overall_time_start}"
)
print(f"Polars LazyFrame query runtime: {end_time - processing_time_start}")
print(f"Polars LazyFrame overall time: {end_time - overall_time_start}")
print()

# Pandas DataFrame Test

overall_time_start = time.perf_counter()

pandas_dataframe = pd.DataFrame(test_data)

processing_time_start = time.perf_counter()

pandas_dataframe.groupby(["region", "product", "sales_person"])[
    "sales_income"
].sum()

end_time = time.perf_counter()

del pandas_dataframe

print(
    f"Pandas DataFrame creation: {processing_time_start - overall_time_start}"
)
print(f"Pandas DataFrame query runtime: {end_time - processing_time_start}")
print(f"Pandas DataFrame overall time: {end_time - overall_time_start}")
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
import sys
2+
import time
3+
4+
import pandas as pd
5+
import polars as pl
6+
7+
from data_generation import data_generation
8+
9+
# Data Generation
#
# The number of rows to generate is read from the first command-line
# argument. The same generated data is reused by every section below.

test_data = data_generation(int(sys.argv[1]))

# The first three sections run their queries without any timing; only the
# streaming LazyFrame test at the end is measured and reported.

# Polars DataFrame Test

polars_dataframe = pl.DataFrame(test_data)

(
    polars_dataframe.group_by(["region", "product", "sales_person"]).agg(
        total_sales=pl.col("sales_income").sum()
    )
)

# Polars LazyFrame Test

polars_lazyframe = pl.LazyFrame(test_data)

(
    polars_lazyframe.group_by(["region", "product", "sales_person"]).agg(
        total_sales=pl.col("sales_income").sum()
    )
).collect()


# Pandas DataFrame Test

pandas_dataframe = pd.DataFrame(test_data)

pandas_dataframe.groupby(["region", "product", "sales_person"])[
    "sales_income"
].sum()

# Polars LazyFrame Streaming Test
#
# time.perf_counter() is used for the measurements: it is monotonic and has
# the highest available resolution, whereas time.time() follows the system
# wall clock and can jump if that clock is adjusted mid-run.

overall_time_start = time.perf_counter()

polars_lazyframe = pl.LazyFrame(test_data)

processing_time_start = time.perf_counter()

# The lazy query only executes when .collect() is called; here it is run
# with the streaming engine selected.
(
    polars_lazyframe.group_by(["region", "product", "sales_person"]).agg(
        total_sales=pl.col("sales_income").sum()
    )
).collect(engine="streaming")

end_time = time.perf_counter()

print(
    f"Polars Streaming LazyFrame creation: {processing_time_start - overall_time_start}"
)
print(
    f"Polars Streaming LazyFrame query runtime: {end_time - processing_time_start}"
)
print(
    f"Polars Streaming LazyFrame overall time: {end_time - overall_time_start}"
)
print()
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import narwhals as nw
2+
import pandas as pd
3+
import polars as pl
4+
5+
# Start from a small three-column Polars DataFrame.
polars_df = pl.DataFrame(
    dict(
        a=["a", "b", "a", "b", "c"],
        b=[1, 2, 1, 3, 3],
        c=[5, 4, 3, 2, 1],
    )
)

# Polars -> pandas. The bare type() expression only shows output when run
# interactively (for example in a REPL).
pandas_df = polars_df.to_pandas()
type(pandas_df)

# pandas -> Polars, rebinding polars_df to the round-tripped frame.
polars_df = pl.from_pandas(pandas_df)
type(polars_df)
18+
19+
20+
def agnositic_groupby(df):
    """Sum column "b" for each value of column "a" using Narwhals.

    ``df`` is wrapped with ``nw.from_native``, grouped by "a", aggregated,
    sorted by "a", and converted back with ``to_native`` before being
    returned.
    """
    wrapped = nw.from_native(df)
    totals = wrapped.group_by("a").agg(nw.col("b").sum())
    ordered = totals.sort("a")
    return ordered.to_native()


# The same function is called with both a pandas and a Polars DataFrame.
agnositic_groupby(pandas_df)

agnositic_groupby(polars_df)

polars_vs_pandas/git_issue.txt

968 KB
Binary file not shown.
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import pandas as pd
2+
import polars as pl
3+
4+
# pandas version: add a "Total" column, then show the first three rows whose
# total exceeds 10.

orders_pandas = pd.read_parquet("Online_Retail.parquet")

orders_pandas["Total"] = orders_pandas["Quantity"] * orders_pandas["UnitPrice"]

orders_pandas[["InvoiceNo", "Quantity", "UnitPrice", "Total"]][
    orders_pandas["Total"] > 10
].head(3)


# Polars version of the same query.
#
# NOTE: the original script first called pl.read_csv("online_retail.csv")
# and immediately overwrote the result with the parquet read below. That
# call is dropped because the result was never used and the script would
# fail if no such CSV file is present next to it.
#
# The file name uses the same capitalisation as the pandas read above so
# that the script also works on case-sensitive file systems.

orders_polars = pl.read_parquet("Online_Retail.parquet")

(
    orders_polars.select(
        pl.col(["InvoiceNo", "Quantity", "UnitPrice"]),
        total=pl.col("Quantity") * pl.col("UnitPrice"),
    ).filter(pl.col("total") > 10)
).head(3)

0 commit comments

Comments
 (0)