Skip to content

Commit f823d8e

Browse files
committed
folder name correction
1 parent a17347a commit f823d8e

10 files changed

+492
-0
lines changed

polars-vs-pandas/DataFrame_Plots.ipynb

Lines changed: 207 additions & 0 deletions
Large diffs are not rendered by default.
3.49 MB
Binary file not shown.

polars-vs-pandas/README.md

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
The materials contained in this download are designed to complement the Real Python tutorial [Polars vs pandas - What's the Difference](https://realpython.com/polars-vs-pandas-difference/).
2+
3+
You should create a new folder named polars-vs-pandas on your computer and place each of these files inside it. You may also consider creating a [Python virtual environment](https://realpython.com/python-virtual-environments-a-primer/) within this folder.
4+
5+
Your download bundle contains the following files:
6+
7+
Online_Retail.parquet - This parquet file contains retail data used in some of the queries.
8+
9+
data_generation.py - This script contains the data_generation() function used to generate different quantities of data.
10+
11+
code_speed_test.py - This script performs time tests for pandas and Polars DataFrames.
12+
13+
dataframe_and_lazyframe_time_tests.py - This script performs time tests for DataFrames and a LazyFrame.
14+
15+
dataframe_and_lazyframe_time_tests_v2.py - This script performs time tests for a LazyFrame with streaming enabled.
16+
17+
18+
dataframe_conversions.py - This file contains the code used to convert between pandas and Polars DataFrames, plus a Narwhals example.
19+
20+
sample_pandas_and_polars_code.py - This file contains the code used to illustrate the differences between pandas and Polars syntax.
21+
22+
DataFrame_Plots.ipynb - This Jupyter Notebook file contains the plotting code to demonstrate default plotting capabilities.
23+
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
import sys
2+
import time
3+
4+
import pandas as pd
5+
import polars as pl
6+
from data_generation import data_generation
7+
8+
# Create DataFrames

# The number of rows to generate is read from the first command-line argument.
data_source = data_generation(int(sys.argv[1]))

orders_pandas = pd.DataFrame(data_source).convert_dtypes(
    dtype_backend="pyarrow"
)

orders_polars = pl.DataFrame(data_source)

# pandas DataFrame Test

# time.perf_counter() is a monotonic, high-resolution clock meant for timing
# short intervals; time.time() follows the wall clock, which can be adjusted
# while the benchmark is running.
start_time = time.perf_counter()

orders_pandas.groupby("region")["sales_income"].sum()

end_time = time.perf_counter()

print(f"pandas Time Taken: {end_time-start_time}.")

# Polars DataFrame Test

start_time = time.perf_counter()

(
    orders_polars.group_by("region").agg(
        total=pl.col("sales_income").sum(),
    )
)

end_time = time.perf_counter()

# Results

print(f"polars Time Taken: {end_time-start_time}.")

print(f"----- For {sys.argv[1]} rows")
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
import numpy as np
2+
3+
4+
def data_generation(number_of_rows, seed=None):
    """Build a dictionary of randomly generated order data.

    Args:
        number_of_rows: Number of values to generate for every column.
        seed: Optional seed. When given, a private ``RandomState`` seeded
            with it is used, so repeated calls with the same seed return
            identical data. When omitted, NumPy's global random state is
            used, exactly as in the original implementation.

    Returns:
        A dict mapping column names to equal-length columns: ``order_id``
        (a ``range`` running from 1 to ``number_of_rows``), ``region``,
        ``sales_person`` and ``product`` (arrays drawn from fixed lists of
        names), and ``sales_income`` (an array of integers from 1 to 5000
        inclusive).
    """
    # The np.random module and a RandomState instance expose the same
    # choice()/randint() functions, so either can serve as the source.
    rng = np.random if seed is None else np.random.RandomState(seed)
    return {
        "order_id": range(1, number_of_rows + 1),
        "region": rng.choice(
            ["North", "South", "East", "West"], size=number_of_rows
        ),
        "sales_person": rng.choice(
            ["Armstrong", "Aldrin", "Collins"], size=number_of_rows
        ),
        "product": rng.choice(
            ["Helmet", "Oxygen", "Boots", "Gloves"], size=number_of_rows
        ),
        # The upper bound of randint() is exclusive, hence 5001.
        "sales_income": rng.randint(1, 5001, size=number_of_rows),
    }
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
import sys
2+
import time
3+
4+
import pandas as pd
5+
import polars as pl
6+
from data_generation import data_generation
7+
8+
# Data Generation

# The number of rows to generate is read from the first command-line argument.
test_data = data_generation(int(sys.argv[1]))

# All timings below use time.perf_counter(): it is monotonic and
# high-resolution, whereas time.time() follows the adjustable wall clock.

# Polars DataFrame Test

overall_time_start = time.perf_counter()

polars_dataframe = pl.DataFrame(test_data)

processing_time_start = time.perf_counter()

(
    polars_dataframe.group_by(["region", "product", "sales_person"]).agg(
        total_sales=pl.col("sales_income").sum()
    )
)

end_time = time.perf_counter()

del polars_dataframe

print(
    f"Polars DataFrame creation: {processing_time_start - overall_time_start}"
)
print(f"Polars DataFrame query runtime: {end_time - processing_time_start}")
print(f"Polars DataFrame overall time: {end_time - overall_time_start}")
print()

# Polars LazyFrame Test

overall_time_start = time.perf_counter()

polars_lazyframe = pl.LazyFrame(test_data)

processing_time_start = time.perf_counter()

# The lazy query only runs when .collect() is called.
(
    polars_lazyframe.group_by(["region", "product", "sales_person"]).agg(
        total_sales=pl.col("sales_income").sum()
    )
).collect()

end_time = time.perf_counter()

del polars_lazyframe

print(
    f"Polars LazyFrame creation: {processing_time_start - overall_time_start}"
)
print(f"Polars LazyFrame query runtime: {end_time - processing_time_start}")
print(f"Polars LazyFrame overall time: {end_time - overall_time_start}")
print()

# Pandas DataFrame Test

overall_time_start = time.perf_counter()

pandas_dataframe = pd.DataFrame(test_data)

processing_time_start = time.perf_counter()

pandas_dataframe.groupby(["region", "product", "sales_person"])[
    "sales_income"
].sum()

end_time = time.perf_counter()

del pandas_dataframe

print(
    f"Pandas DataFrame creation: {processing_time_start - overall_time_start}"
)
print(f"Pandas DataFrame query runtime: {end_time - processing_time_start}")
print(f"Pandas DataFrame overall time: {end_time - overall_time_start}")
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
import sys
2+
import time
3+
4+
import pandas as pd
5+
import polars as pl
6+
from data_generation import data_generation
7+
8+
# Data Generation

# The number of rows to generate is read from the first command-line argument.
test_data = data_generation(int(sys.argv[1]))

# The first three sections run the same queries as the timed section but are
# not measured; only the streaming LazyFrame run at the end is timed.

# Polars DataFrame Test

polars_dataframe = pl.DataFrame(test_data)

(
    polars_dataframe.group_by(["region", "product", "sales_person"]).agg(
        total_sales=pl.col("sales_income").sum()
    )
)

# Polars LazyFrame Test

polars_lazyframe = pl.LazyFrame(test_data)

(
    polars_lazyframe.group_by(["region", "product", "sales_person"]).agg(
        total_sales=pl.col("sales_income").sum()
    )
).collect()


# Pandas DataFrame Test

pandas_dataframe = pd.DataFrame(test_data)

pandas_dataframe.groupby(["region", "product", "sales_person"])[
    "sales_income"
].sum()

# Polars LazyFrame Streaming Test

# time.perf_counter() is monotonic and high-resolution, which makes it the
# appropriate clock for interval timing; time.time() follows the wall clock.
overall_time_start = time.perf_counter()

polars_lazyframe = pl.LazyFrame(test_data)

processing_time_start = time.perf_counter()

(
    polars_lazyframe.group_by(["region", "product", "sales_person"]).agg(
        total_sales=pl.col("sales_income").sum()
    )
).collect(engine="streaming")

end_time = time.perf_counter()

print(
    f"Polars Streaming LazyFrame creation: {processing_time_start - overall_time_start}"
)
print(
    f"Polars Streaming LazyFrame query runtime: {end_time - processing_time_start}"
)
print(
    f"Polars Streaming LazyFrame overall time: {end_time - overall_time_start}"
)
print()
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import narwhals as nw
2+
import polars as pl
3+
4+
polars_df = pl.DataFrame(
    {
        "a": ["a", "b", "a", "b", "c"],
        "b": [1, 2, 1, 3, 3],
        "c": [5, 4, 3, 2, 1],
    }
)

# Polars -> pandas
pandas_df = polars_df.to_pandas()
type(pandas_df)

# pandas -> Polars
polars_df = pl.from_pandas(pandas_df)
type(polars_df)


def agnostic_groupby(df):
    """Sum column "b" per value of column "a", sorted by "a".

    The frame is wrapped with Narwhals via ``nw.from_native()`` and the
    result is unwrapped again with ``to_native()``, so the same function
    is called below with both the pandas and the Polars DataFrame.

    Args:
        df: A DataFrame accepted by ``nw.from_native()`` that has columns
            "a" and "b".

    Returns:
        The grouped, summed and sorted result as returned by
        ``to_native()``.
    """
    return (
        nw.from_native(df)
        .group_by("a")
        .agg(nw.col("b").sum())
        .sort("a")
        .to_native()
    )


# Backward-compatible alias for the original, misspelled function name.
agnositic_groupby = agnostic_groupby


agnostic_groupby(pandas_df)

agnostic_groupby(polars_df)

polars-vs-pandas/git_issue.txt

968 KB
Binary file not shown.
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import pandas as pd
2+
import polars as pl
3+
4+
# pandas: add a "Total" column, then show the first three rows whose total
# exceeds 10.
orders_pandas = pd.read_parquet("Online_Retail.parquet")

orders_pandas["Total"] = orders_pandas["Quantity"] * orders_pandas["UnitPrice"]

orders_pandas[["InvoiceNo", "Quantity", "UnitPrice", "Total"]][
    orders_pandas["Total"] > 10
].head(3)


# NOTE(review): this CSV read is immediately overwritten by the Parquet read
# below, and the README does not list a CSV file in the bundle - confirm the
# file exists (and its exact name) or drop this line.
orders_polars = pl.read_csv("online_retail.csv")

# The file name matches the pandas read above and the README exactly; the
# lowercase spelling fails on case-sensitive filesystems.
orders_polars = pl.read_parquet("Online_Retail.parquet")

# Polars: compute the total inside select() and filter on it in one chain.
(
    orders_polars.select(
        pl.col(["InvoiceNo", "Quantity", "UnitPrice"]),
        total=pl.col("Quantity") * pl.col("UnitPrice"),
    ).filter(pl.col("total") > 10)
).head(3)

0 commit comments

Comments
 (0)