"""Benchmark pandas against polars on frame creation and a group-by sum.

The script reads a single command-line argument, converts it to ``int`` and
passes it to ``generate_data``; the printed labels refer to that number as the
row count.  It then times frame construction and a grouped sum for a pandas
DataFrame, a polars DataFrame and a polars LazyFrame, and finally prints one
slice of each aggregated result.
"""

import functools
import sys
from timeit import Timer

import pandas as pd
import polars as pl

from data_generation import generate_data

# Number of times each measured call is executed by ``timeit``.
TIMING_REPETITIONS = 100

# Columns used as the grouping key in every analysis function.
GROUP_COLUMNS = ["region", "product", "sales_person"]


def create_pandas_dataframe(test_data):
    """Build a pandas DataFrame from *test_data* using pyarrow-backed dtypes.

    NOTE(review): *test_data* is whatever ``generate_data`` returns; it is
    presumably column-oriented data accepted by ``pd.DataFrame`` -- confirm
    against ``data_generation``.
    """
    return pd.DataFrame(test_data).convert_dtypes(dtype_backend="pyarrow")


def create_polars_dataframe(test_data):
    """Build an eager polars DataFrame from *test_data*."""
    return pl.DataFrame(test_data)


def create_polars_lazyframe(test_data):
    """Build a polars LazyFrame from *test_data*."""
    return pl.LazyFrame(test_data)


def analyze_pandas_dataframe(pandas_df):
    """Return ``sales_income`` summed per region / product / sales_person.

    The result is the object produced by the pandas group-by ``sum`` on the
    single selected column, indexed by the grouping columns.
    """
    return pandas_df.groupby(GROUP_COLUMNS)["sales_income"].sum()


def analyze_polars_dataframe(polars_df):
    """Return a polars DataFrame with a ``total_sales`` column per group.

    ``total_sales`` is the sum of ``sales_income`` for each combination of
    region, product and sales_person.
    """
    return polars_df.group_by(GROUP_COLUMNS).agg(
        total_sales=pl.col("sales_income").sum()
    )


def analyze_polars_lazyframe(polars_lf):
    """Run the same grouped sum on a LazyFrame and collect the result.

    ``collect()`` is called inside the function so that timing it measures
    the executed query rather than only the construction of the lazy plan.
    """
    return (
        polars_lf.group_by(GROUP_COLUMNS)
        .agg(total_sales=pl.col("sales_income").sum())
        .collect()
    )


def _time_call(func, argument):
    """Return the total seconds for TIMING_REPETITIONS calls of ``func(argument)``."""
    return Timer(functools.partial(func, argument)).timeit(TIMING_REPETITIONS)


# Parse the command-line argument once; every label below reuses it.
row_count = int(sys.argv[1])
test_data = generate_data(row_count)

print("Creating Dataframes...")
print(f"Pandas dataframe creation time for {row_count} rows:")
print(_time_call(create_pandas_dataframe, test_data))
print(f"\nPolars dataframe creation time for {row_count} rows:")
print(_time_call(create_polars_dataframe, test_data))
print(f"\nPolars lazyframe creation time for {row_count} rows:")
print(_time_call(create_polars_lazyframe, test_data))

# Build each frame once, outside the timed region, so the analysis timings
# below measure only the query.
pandas_df = create_pandas_dataframe(test_data)
polars_df = create_polars_dataframe(test_data)
polars_lf = create_polars_lazyframe(test_data)

print("\nAnalyzing Dataframes...")
print(f"Pandas dataframe analysis time for {row_count} rows:")
print(_time_call(analyze_pandas_dataframe, pandas_df))

print(f"\nPolars dataframe analysis time for {row_count} rows:")
print(_time_call(analyze_polars_dataframe, polars_df))

print(f"\nPolars lazyframe analysis time for {row_count} rows:")
print(_time_call(analyze_polars_lazyframe, polars_lf))

print("\nShow Boots sales in the East region for pandas DataFrame")
print(analyze_pandas_dataframe(pandas_df)["East"]["Boots"])

print("\nShow Boots sales in the East region for polars DataFrame")
print(
    analyze_polars_dataframe(polars_df).filter(
        pl.col("region") == "East",
        pl.col("product") == "Boots",
    )
)

# The original label said "pandas LazyFrame"; the data shown is the collected
# polars LazyFrame result.
print("\nShow Boots sales in the East region for polars LazyFrame")
print(
    analyze_polars_lazyframe(polars_lf).filter(
        pl.col("region") == "East",
        pl.col("product") == "Boots",
    )
)