|
| 1 | +""" |
| 2 | +Running: |
| 3 | +$ python benchmark.py 500 |
| 4 | +""" |
| 5 | + |
| 6 | +import functools |
| 7 | +import sys |
| 8 | +from timeit import Timer |
| 9 | + |
| 10 | +import pandas as pd |
| 11 | +import polars as pl |
| 12 | + |
| 13 | +from data_generation import generate_data |
| 14 | + |
| 15 | + |
| 16 | +def create_pandas_dataframe(test_data): |
| 17 | + return pd.DataFrame(test_data).convert_dtypes(dtype_backend="pyarrow") |
| 18 | + |
| 19 | + |
def create_polars_dataframe(test_data):
    """Return a Polars ``DataFrame`` constructed from ``test_data``."""
    eager_frame = pl.DataFrame(data=test_data)
    return eager_frame
| 22 | + |
| 23 | + |
def create_polars_lazyframe(test_data):
    """Return a Polars ``LazyFrame`` constructed from ``test_data``."""
    lazy_frame = pl.LazyFrame(data=test_data)
    return lazy_frame
| 26 | + |
| 27 | + |
def analyze_pandas_dataframe(pandas_df):
    """Sum ``sales_income`` per (region, product, sales_person) group.

    Returns the pandas Series produced by the grouped ``sum()``; its index is
    built from the three grouping columns.
    """
    group_keys = ["region", "product", "sales_person"]
    income_by_group = pandas_df.groupby(group_keys)["sales_income"]
    return income_by_group.sum()
| 32 | + |
| 33 | + |
def analyze_polars_dataframe(polars_df):
    """Sum ``sales_income`` per (region, product, sales_person) group.

    The aggregated column is named ``total_sales`` in the returned frame.
    """
    group_keys = ["region", "product", "sales_person"]
    income_total = pl.col("sales_income").sum()
    return polars_df.group_by(group_keys).agg(total_sales=income_total)
| 38 | + |
| 39 | + |
def analyze_polars_lazyframe(polars_lf):
    """Sum ``sales_income`` per (region, product, sales_person) group.

    The grouping and aggregation are declared on the lazy frame and then
    materialized with ``collect()``; the aggregated column is named
    ``total_sales``.
    """
    group_keys = ["region", "product", "sales_person"]
    income_total = pl.col("sales_income").sum()
    lazy_totals = polars_lf.group_by(group_keys).agg(total_sales=income_total)
    return lazy_totals.collect()
| 46 | + |
| 47 | + |
# ---------------------------------------------------------------------------
# Benchmark driver
#
# Times DataFrame/LazyFrame creation and a grouped-sum analysis for pandas and
# Polars, then prints the East/Boots slice of each result so the outputs can
# be compared by eye.
# ---------------------------------------------------------------------------

# Number of executions handed to Timer.timeit(); each printed figure is the
# total time in seconds for this many calls, not a per-call average.
_REPETITIONS = 100

# Parse the row count exactly once and fail with a readable message instead
# of a bare IndexError/ValueError traceback.
if len(sys.argv) < 2:
    sys.exit("Usage: python benchmark.py <number_of_rows>")
try:
    row_count = int(sys.argv[1])
except ValueError:
    sys.exit(f"The number of rows must be an integer, got {sys.argv[1]!r}")


def _report_timing(label, func, argument):
    """Print ``label``, then the total time of repeated ``func(argument)`` calls."""
    print(label)
    print(Timer(functools.partial(func, argument)).timeit(_REPETITIONS))


print("Creating DataFrames...")

test_data = generate_data(row_count)

_report_timing(
    f"Pandas dataframe creation time for {row_count:,} rows:",
    create_pandas_dataframe,
    test_data,
)
_report_timing(
    f"\nPolars dataframe creation time for {row_count:,} rows:",
    create_polars_dataframe,
    test_data,
)
_report_timing(
    f"\nPolars lazyframe creation time for {row_count:,} rows:",
    create_polars_lazyframe,
    test_data,
)

print("-" * 50)
print("Analyzing DataFrames...")

# Build each frame once, outside the timed region, so the analysis timings
# below do not include construction cost.
pandas_df = create_pandas_dataframe(test_data)
polars_df = create_polars_dataframe(test_data)
polars_lf = create_polars_lazyframe(test_data)

_report_timing(
    f"Pandas dataframe analysis time for {row_count:,} rows:",
    analyze_pandas_dataframe,
    pandas_df,
)

print()
_report_timing(
    f"Polars dataframe analysis time for {row_count:,} rows:",
    analyze_polars_dataframe,
    polars_df,
)

print()
_report_timing(
    f"Polars lazyframe analysis time for {row_count:,} rows:",
    analyze_polars_lazyframe,
    polars_lf,
)

print("\nShow Boots sales in the East region for pandas DataFrame")
print(analyze_pandas_dataframe(pandas_df)["East"]["Boots"])

print("\nShow Boots sales in the East region for Polars DataFrame")
print(
    analyze_polars_dataframe(polars_df).filter(
        pl.col("region") == "East",
        pl.col("product") == "Boots",
    )
)

print("\nShow Boots sales in the East region for Polars LazyFrame")
print(
    analyze_polars_lazyframe(polars_lf).filter(
        pl.col("region") == "East",
        pl.col("product") == "Boots",
    )
)
0 commit comments