Skip to content

Commit 492d2d6

Browse files
committed
Final QA
1 parent 30d38e2 commit 492d2d6

File tree

10 files changed

+244
-242
lines changed

10 files changed

+244
-242
lines changed

polars-vs-pandas/README.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@ Your download bundle contains the following files:
66

77
| File | Description |
88
|-----------------------------------------|------------------------------------------------------------------------------------------------------------|
9+
| `benchmark.py` | This script performs time tests for DataFrames and a LazyFrame. |
10+
| `conversions.py` | This file contains the code used to convert between pandas and Polars DataFrames, plus a Narwhals example. |
911
| `data_generation.py` | This script contains the `generate_data()` function used to generate different quantities of data. |
10-
| `dataframe_and_lazyframe_time_tests.py` | This script performs time tests for DataFrames and a LazyFrame. |
11-
| `dataframe_conversions.py` | This file contains the code used to convert between pandas and Polars DataFrames, plus a Narwhals example. |
12-
| `DataFrame_plots.ipynb` | This Jupyter Notebook file contains the plotting code to demonstrate default plotting capabilities. |
13-
| `online_retail.parquet` | This parquet file contains retail data used in some of the queries. |
14-
| `sample_pandas_and_polars_code.py` | This file contains the code used to illustrate the differences between pandas and Polars syntax. |
12+
| `online_retail.parquet` | This Parquet file contains retail data used in some of the queries. |
13+
| `pandas_polars_demo.py` | This file contains the code used to illustrate the differences between pandas and Polars syntax. |
14+
| `plots.ipynb` | This Jupyter Notebook file contains the plotting code to demonstrate default plotting capabilities. |
1515
| `streaming_test.py` | This script performs time tests for a LazyFrame with streaming enabled. |

polars-vs-pandas/benchmark.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
"""
Running:
$ python benchmark.py 500
"""
5+
6+
import functools
7+
import sys
8+
from timeit import Timer
9+
10+
import pandas as pd
11+
import polars as pl
12+
13+
from data_generation import generate_data
14+
15+
16+
def create_pandas_dataframe(test_data):
17+
return pd.DataFrame(test_data).convert_dtypes(dtype_backend="pyarrow")
18+
19+
20+
def create_polars_dataframe(test_data):
    """Build an eager Polars DataFrame from ``test_data``."""
    return pl.DataFrame(test_data)
22+
23+
24+
def create_polars_lazyframe(test_data):
    """Build a Polars LazyFrame from ``test_data``."""
    return pl.LazyFrame(test_data)
26+
27+
28+
def analyze_pandas_dataframe(pandas_df):
    """Sum ``sales_income`` per (region, product, sales_person) group.

    Returns a Series indexed by the three grouping columns.
    """
    return pandas_df.groupby(["region", "product", "sales_person"])[
        "sales_income"
    ].sum()
32+
33+
34+
def analyze_polars_dataframe(polars_df):
    """Sum ``sales_income`` per (region, product, sales_person) group.

    The aggregated column is named ``total_sales``.
    """
    return polars_df.group_by(["region", "product", "sales_person"]).agg(
        total_sales=pl.col("sales_income").sum()
    )
38+
39+
40+
def analyze_polars_lazyframe(polars_lf):
    """Sum ``sales_income`` per (region, product, sales_person) group.

    The query is built lazily and executed by ``collect()``, so the returned
    value is the materialized result with a ``total_sales`` column.
    """
    return (
        polars_lf.group_by(["region", "product", "sales_person"])
        .agg(total_sales=pl.col("sales_income").sum())
        .collect()
    )
46+
47+
48+
# Fail with a usage hint rather than a bare IndexError when the row count
# argument is missing.
if len(sys.argv) < 2:
    sys.exit("Usage: python benchmark.py <row_count>")

# Parse the requested row count once and reuse it in every report line.
row_count = int(sys.argv[1])

print("Creating DataFrames...")

test_data = generate_data(row_count)

# Each Timer(...).timeit(100) call runs the callable 100 times and returns the
# total elapsed time in seconds.
print(f"Pandas dataframe creation time for {row_count:,} rows:")
print(Timer(functools.partial(create_pandas_dataframe, test_data)).timeit(100))
print(f"\nPolars dataframe creation time for {row_count:,} rows:")
print(Timer(functools.partial(create_polars_dataframe, test_data)).timeit(100))
print(f"\nPolars lazyframe creation time for {row_count:,} rows:")
print(Timer(functools.partial(create_polars_lazyframe, test_data)).timeit(100))

print("-" * 50)
print("Analyzing DataFrames...")

# Build each frame once up front so the analysis timings below exclude
# construction cost.
pandas_df = create_pandas_dataframe(test_data)
polars_df = create_polars_dataframe(test_data)
polars_lf = create_polars_lazyframe(test_data)

print(f"Pandas dataframe analysis time for {row_count:,} rows:")
print(
    Timer(functools.partial(analyze_pandas_dataframe, pandas_df)).timeit(100)
)

print()
print(f"Polars dataframe analysis time for {row_count:,} rows:")
print(
    Timer(functools.partial(analyze_polars_dataframe, polars_df)).timeit(100)
)

print()
print(f"Polars lazyframe analysis time for {row_count:,} rows:")
print(
    Timer(functools.partial(analyze_polars_lazyframe, polars_lf)).timeit(100)
)

# Show the same slice of the aggregated result from all three
# implementations.
print("\nShow Boots sales in the East region for pandas DataFrame")
print(analyze_pandas_dataframe(pandas_df)["East"]["Boots"])

print("\nShow Boots sales in the East region for Polars DataFrame")
print(
    (
        analyze_polars_dataframe(polars_df).filter(
            pl.col("region") == "East",
            pl.col("product") == "Boots",
        )
    )
)

print("\nShow Boots sales in the East region for Polars LazyFrame")
print(
    (
        analyze_polars_lazyframe(polars_lf).filter(
            pl.col("region") == "East",
            pl.col("product") == "Boots",
        )
    )
)

polars-vs-pandas/conversions.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import narwhals as nw
2+
import polars as pl
3+
4+
from data_generation import generate_data
5+
6+
7+
def universal_groupby(df):
    """Sum ``sales_income`` per ``region`` through Narwhals.

    ``df`` is wrapped with ``nw.from_native``, grouped and aggregated, sorted
    by ``region``, and converted back with ``to_native()`` before returning.
    """
    return (
        nw.from_native(df)
        .group_by("region")
        .agg(nw.col("sales_income").sum())
        .sort("region")
        .to_native()
    )
15+
16+
17+
# Start from a small Polars DataFrame built from four generated rows.
polars_df = pl.DataFrame(generate_data(4))
print(polars_df)

print("\nPolars to pandas:")
pandas_df = polars_df.to_pandas()
print(type(pandas_df))
print(pandas_df)

# Round trip: polars_df is rebound to the frame converted back from pandas.
print("\npandas to Polars:")
polars_df = pl.from_pandas(pandas_df)
print(type(polars_df))
print(polars_df)

# The same Narwhals-based function is applied to both frame types.
print("\nNarwhals with pandas:")
print(universal_groupby(pandas_df))

print("\nNarwhals with Polars:")
print(universal_groupby(polars_df))

polars-vs-pandas/dataframe_and_lazyframe_time_tests.py

Lines changed: 0 additions & 113 deletions
This file was deleted.

polars-vs-pandas/dataframe_conversions.py

Lines changed: 0 additions & 29 deletions
This file was deleted.
polars-vs-pandas/pandas_polars_demo.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
import pandas as pd
2+
import polars as pl
3+
4+
# Demo 1: index-based pandas. The new column is assigned onto the DataFrame,
# then columns and rows are picked with [] indexing.
print("Index-Based syntax in pandas:")
orders_pandas = pd.read_parquet("online_retail.parquet")
orders_pandas["Total"] = orders_pandas["Quantity"] * orders_pandas["UnitPrice"]
print(
    orders_pandas[["InvoiceNo", "Quantity", "UnitPrice", "Total"]][
        orders_pandas["Total"] > 100
    ].head(3)
)

print()

# Demo 2: the same query as a single pandas method chain. The file is re-read
# so this demo starts from data without the "Total" column assigned above.
print("Method-chaining syntax in pandas:")
orders_pandas = pd.read_parquet("online_retail.parquet")
print(
    (
        orders_pandas.assign(
            Total=orders_pandas["Quantity"] * orders_pandas["UnitPrice"]
        )
        .filter(["InvoiceNo", "Quantity", "UnitPrice", "Total"])
        .query("Total > 100")
    ).head(3)
)

print()

# Demo 3: the same query as a Polars method chain using expressions.
print("Method-chaining syntax in Polars:")
orders_polars = pl.read_parquet("online_retail.parquet")
print(
    (
        orders_polars.select(
            pl.col(["InvoiceNo", "Quantity", "UnitPrice"]),
            Total=pl.col("Quantity") * pl.col("UnitPrice"),
        ).filter(pl.col("Total") > 100)
    ).head(3)
)

0 commit comments

Comments
 (0)