Polars vs pandas (#697)

eyrei123 · bzaczynski · stephengruppetta · web-flow · commit 570414056e4c · 2025-09-21T21:31:15.000+02:00
* addition of deleted file

* code error fix

* Rename agnositic_groupby to universal_groupby

* Upload files from a ZIP archive

* Various fixes and amendments

* Final QA

* Fix linter issues

---------

Co-authored-by: Bartosz Zaczyński <bartosz.zaczynski@gmail.com>
Co-authored-by: stephengruppetta <51741022+stephengruppetta@users.noreply.github.com>
diff --git a/polars-vs-pandas/README.md b/polars-vs-pandas/README.md
@@ -0,0 +1,15 @@
+# Polars vs pandas: What's the Difference?
+
+The materials contained in this folder are designed to complement the Real Python tutorial [Polars vs pandas: What's the Difference?](https://realpython.com/polars-vs-pandas/).
+
+Your download bundle contains the following files:
+
+| File                                    | Description                                                                                                |
+|-----------------------------------------|------------------------------------------------------------------------------------------------------------|
+| `benchmark.py`                          | This script performs time tests for DataFrames and a LazyFrame.                                            |
+| `conversions.py`                        | This file contains the code used to convert between pandas and Polars DataFrames, plus a Narwhals example. |
+| `data_generation.py`                    | This script contains the `generate_data()` function used to generate different quantities of data.         |
+| `online_retail.parquet`                 | This Parquet file contains retail data used in some of the queries.                                        |
+| `pandas_polars_demo.py`                 | This file contains the code used to illustrate the differences between pandas and Polars syntax.           |
+| `plots.ipynb`                           | This Jupyter Notebook file contains the plotting code to demonstrate default plotting capabilities.        |
+| `streaming_test.py`                     | This script performs time tests for a LazyFrame with streaming enabled.                                    |
diff --git a/polars-vs-pandas/benchmark.py b/polars-vs-pandas/benchmark.py
@@ -0,0 +1,103 @@
+"""
+Running:
+$ python benchmark.py 500
+"""
+
+import functools
+import sys
+from timeit import Timer
+
+import pandas as pd
+import polars as pl
+from data_generation import generate_data
+
+
+def create_pandas_dataframe(test_data):  # pandas, pyarrow-backed dtypes
+    return pd.DataFrame(test_data).convert_dtypes(dtype_backend="pyarrow")
+
+
+def create_polars_dataframe(test_data):  # eager Polars DataFrame
+    return pl.DataFrame(test_data)
+
+
+def create_polars_lazyframe(test_data):  # lazy Polars LazyFrame
+    return pl.LazyFrame(test_data)
+
+
+def analyze_pandas_dataframe(pandas_df):
+    # Sum sales_income within each (region, product, sales_person) group.
+    group_keys = ["region", "product", "sales_person"]
+    return pandas_df.groupby(group_keys)["sales_income"].sum()
+
+
+def analyze_polars_dataframe(polars_df):
+    # Sum sales_income within each (region, product, sales_person) group.
+    grouped = polars_df.group_by(["region", "product", "sales_person"])
+    return grouped.agg(total_sales=pl.col("sales_income").sum())
+
+
+def analyze_polars_lazyframe(polars_lf):
+    # Nothing is computed until collect() executes the lazy query.
+    keys = ["region", "product", "sales_person"]
+    total_sales = pl.col("sales_income").sum()
+    lazy_totals = polars_lf.group_by(keys).agg(total_sales=total_sales)
+    return lazy_totals.collect()
+
+
+print("Creating DataFrames...")
+
+# Parse the row count once; every label below reuses it.
+number_of_rows = int(sys.argv[1])
+test_data = generate_data(number_of_rows)
+
+# Each Timer runs its callable 100 times and reports the total seconds.
+print(f"Pandas dataframe creation time for {number_of_rows:,} rows:")
+print(Timer(functools.partial(create_pandas_dataframe, test_data)).timeit(100))
+
+print(f"\nPolars dataframe creation time for {number_of_rows:,} rows:")
+print(Timer(functools.partial(create_polars_dataframe, test_data)).timeit(100))
+
+print(f"\nPolars lazyframe creation time for {number_of_rows:,} rows:")
+print(Timer(functools.partial(create_polars_lazyframe, test_data)).timeit(100))
+
+print("-" * 50)
+print("Analyzing DataFrames...")
+
+# Build every frame up front so only the analysis itself is timed.
+pandas_df = create_pandas_dataframe(test_data)
+polars_df = create_polars_dataframe(test_data)
+polars_lf = create_polars_lazyframe(test_data)
+
+print(f"Pandas dataframe analysis time for {number_of_rows:,} rows:")
+pandas_timer = Timer(functools.partial(analyze_pandas_dataframe, pandas_df))
+print(pandas_timer.timeit(100))
+
+print()
+print(f"Polars dataframe analysis time for {number_of_rows:,} rows:")
+polars_df_timer = Timer(functools.partial(analyze_polars_dataframe, polars_df))
+print(polars_df_timer.timeit(100))
+
+# The LazyFrame measurement includes collect(), which executes the query.
+print()
+print(f"Polars lazyframe analysis time for {number_of_rows:,} rows:")
+polars_lf_timer = Timer(functools.partial(analyze_polars_lazyframe, polars_lf))
+print(polars_lf_timer.timeit(100))
+
+# The pandas result is indexed by group keys, so chained lookups narrow it.
+print("\nShow Boots sales in the East region for pandas DataFrame")
+pandas_result = analyze_pandas_dataframe(pandas_df)
+print(pandas_result["East"]["Boots"])
+
+# Polars takes the same two predicates for both of its results.
+east_boots = (
+    pl.col("region") == "East",
+    pl.col("product") == "Boots",
+)
+
+print("\nShow Boots sales in the East region for Polars DataFrame")
+polars_df_result = analyze_polars_dataframe(polars_df)
+print(polars_df_result.filter(*east_boots))
+
+print("\nShow Boots sales in the East region for Polars LazyFrame")
+polars_lf_result = analyze_polars_lazyframe(polars_lf)
+print(polars_lf_result.filter(*east_boots))
diff --git a/polars-vs-pandas/conversions.py b/polars-vs-pandas/conversions.py
@@ -0,0 +1,33 @@
+import narwhals as nw
+import polars as pl
+from data_generation import generate_data
+
+
+def universal_groupby(df):
+    """Sum sales_income per region for a Narwhals-compatible DataFrame.
+
+    The totals are sorted by region and handed back via to_native().
+    """
+    frame = nw.from_native(df)
+    totals = frame.group_by("region").agg(nw.col("sales_income").sum())
+    return totals.sort("region").to_native()
+
+
+polars_df = pl.DataFrame(generate_data(4))  # four generated sales rows
+print(polars_df)
+
+print("\nPolars to pandas:")
+pandas_df = polars_df.to_pandas()
+print(type(pandas_df))
+print(pandas_df)
+
+print("\npandas to Polars:")
+polars_df = pl.from_pandas(pandas_df)  # convert the pandas copy back
+print(type(polars_df))
+print(polars_df)
+
+print("\nNarwhals with pandas:")
+print(universal_groupby(pandas_df))  # same function, pandas input
+
+print("\nNarwhals with Polars:")
+print(universal_groupby(polars_df))  # same function, Polars input
diff --git a/polars-vs-pandas/data_generation.py b/polars-vs-pandas/data_generation.py
@@ -0,0 +1,19 @@
+import numpy as np
+
+
+def generate_data(number_of_rows):
+    """Return a dict of randomly generated sales columns.
+
+    Each column holds number_of_rows values; order_id counts up from 1.
+    """
+    rng = np.random.default_rng()
+    regions = ["North", "South", "East", "West"]
+    sales_people = ["Armstrong", "Aldrin", "Collins"]
+    products = ["Helmet", "Oxygen", "Boots", "Gloves"]
+    return {
+        "order_id": range(1, number_of_rows + 1),
+        "region": rng.choice(regions, size=number_of_rows),
+        "sales_person": rng.choice(sales_people, size=number_of_rows),
+        "product": rng.choice(products, size=number_of_rows),
+        "sales_income": rng.integers(1, 5001, size=number_of_rows),
+    }
diff --git a/polars-vs-pandas/online_retail.parquet b/polars-vs-pandas/online_retail.parquet
diff --git a/polars-vs-pandas/pandas_polars_demo.py b/polars-vs-pandas/pandas_polars_demo.py
@@ -0,0 +1,38 @@
+import pandas as pd
+import polars as pl
+
+print("Index-Based syntax in pandas:")
+orders_pandas = pd.read_parquet("online_retail.parquet")
+orders_pandas["Total"] = orders_pandas["Quantity"] * orders_pandas["UnitPrice"]
+print(
+    orders_pandas[["InvoiceNo", "Quantity", "UnitPrice", "Total"]][
+        orders_pandas["Total"] > 100
+    ].head(3)
+)
+
+print()
+# Re-reading the file discards the "Total" column assigned above.
+print("Method-chaining syntax in pandas:")
+orders_pandas = pd.read_parquet("online_retail.parquet")
+print(
+    (
+        orders_pandas.assign(
+            Total=orders_pandas["Quantity"] * orders_pandas["UnitPrice"]
+        )
+        .filter(["InvoiceNo", "Quantity", "UnitPrice", "Total"])
+        .query("Total > 100")
+    ).head(3)
+)
+
+print()
+# Polars builds "Total" as an expression inside select(), then filters on it.
+print("Method-chaining syntax in Polars:")
+orders_polars = pl.read_parquet("online_retail.parquet")
+print(
+    (
+        orders_polars.select(
+            pl.col(["InvoiceNo", "Quantity", "UnitPrice"]),
+            Total=pl.col("Quantity") * pl.col("UnitPrice"),
+        ).filter(pl.col("Total") > 100)
+    ).head(3)
+)
diff --git a/polars-vs-pandas/plots.ipynb b/polars-vs-pandas/plots.ipynb
@@ -0,0 +1,102 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "8ced8243-d770-437e-a90d-f794ffa57fc0",
+   "metadata": {},
+   "source": [
+    "# DataFrame Plots"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1cf56fd3-605c-4449-8a5e-d0fd94b49080",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from data_generation import generate_data\n",
+    "\n",
+    "sales_data = generate_data(50)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "913c18ed-373b-400e-ba27-38ca8e7b70b9",
+   "metadata": {},
+   "source": [
+    "## Polars Plotting"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "014e9e56-8fff-45ab-85ff-eb51840f2bc7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import polars as pl\n",
+    "\n",
+    "orders_polars = pl.DataFrame(sales_data)\n",
+    "\n",
+    "(\n",
+    "    orders_polars.group_by(\"region\")\n",
+    "    .agg(total_sales=pl.col(\"sales_income\").sum())\n",
+    "    .plot.bar(x=\"region\", y=\"total_sales\")\n",
+    "    .properties(width=200, height=200, title=\"Total Sales per Region ($)\")\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a62335eb-4763-4c46-adb2-d7386914f56b",
+   "metadata": {},
+   "source": [
+    "## pandas Plotting"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "85929590-e514-4497-b396-58cfe26e59d3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "orders_pandas = pd.DataFrame(sales_data)\n",
+    "\n",
+    "(\n",
+    "    orders_pandas.groupby(\n",
+    "        [\n",
+    "            \"region\",\n",
+    "        ]\n",
+    "    )[\"sales_income\"]\n",
+    "    .sum()\n",
+    "    .plot(kind=\"bar\", title=\"Total Sales per Region ($)\", ylabel=\"total_sales\")\n",
+    ")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/polars-vs-pandas/requirements.txt b/polars-vs-pandas/requirements.txt
diff --git a/polars-vs-pandas/streaming_test.py b/polars-vs-pandas/streaming_test.py