Merge pull request #420 from realpython/python_polars

KateFinegan · web-flow · commit e129aaeaa0f8 · 2023-08-09T12:51:54.000-06:00
Python polars article code
diff --git a/python-polars/README.md b/python-polars/README.md
@@ -0,0 +1,17 @@
+# Python Polars: A Lightning-Fast DataFrame Library
+
+Supporting code for the Real Python tutorial [Python Polars: A Lightning-Fast DataFrame Library](https://realpython.com/polars-python/).
+
+To run the code in this tutorial, you should have `polars`, `pandas`, `numpy`, `requests`, and `matplotlib` installed in your environment. The `show_graph()` call in `lazy_api.py` additionally needs Graphviz installed.
+
+If you want to install Polars with all of the library's optional dependencies, then you can run:
+
+```console
+$ python -m pip install "polars[all]" requests matplotlib
+```
+
+Otherwise, you'll at least need to include the `pandas` and `numpy` feature flags:
+
+```console
+$ python -m pip install "polars[numpy, pandas]" requests matplotlib
+```
diff --git a/python-polars/downloads.py b/python-polars/downloads.py
@@ -0,0 +1,14 @@
+import pathlib
+import requests
+
+
+def download_file(file_url: str, local_file_path: pathlib.Path) -> None:
+    """Download a file and save it with the specified file name."""
+    response = requests.get(file_url)
+    if response:
+        local_file_path.write_bytes(response.content)
+        print(f"File successfully downloaded and stored at: {local_file_path}")
+    else:
+        raise requests.exceptions.RequestException(
+            f"Failed to download the file. Status code: {response.status_code}"
+        )
diff --git a/python-polars/expressions_and_contexts.py b/python-polars/expressions_and_contexts.py
@@ -0,0 +1,32 @@
+import numpy as np
+import polars as pl
+
+num_rows = 5000
+rng = np.random.default_rng(seed=7)
+# Build three random columns from the seeded generator above.
+buildings_data = {
+    "sqft": rng.exponential(scale=1000, size=num_rows),
+    "year": rng.integers(low=1995, high=2023, size=num_rows),
+    "building_type": rng.choice(a=["A", "B", "C"], size=num_rows),
+}
+buildings = pl.DataFrame(buildings_data)
+# Select context: refer to the column by its name ...
+print(buildings.select("sqft"))
+# ... or through a pl.col() expression.
+print(buildings.select(pl.col("sqft")))
+# Expressions chain: sort the column, then divide every value by 1000.
+print(buildings.select(pl.col("sqft").sort() / 1000))
+# Filter context: keep only the rows whose year is greater than 2015.
+after_2015 = buildings.filter(pl.col("year") > 2015)
+print(after_2015.shape)
+print(after_2015.select(pl.col("year").min()))
+# Groupby context: one row per building_type with three aggregates.
+print(
+    buildings.groupby("building_type").agg(
+        [
+            pl.mean("sqft").alias("mean_sqft"),
+            pl.median("year").alias("median_year"),
+            pl.count(),
+        ]
+    )
+)
diff --git a/python-polars/getting_started.py b/python-polars/getting_started.py
@@ -0,0 +1,20 @@
+import numpy as np
+import polars as pl
+
+num_rows = 5000
+rng = np.random.default_rng(seed=7)
+# Build three random columns from the seeded generator above.
+buildings_data = {
+    "sqft": rng.exponential(scale=1000, size=num_rows),
+    "year": rng.integers(low=1995, high=2023, size=num_rows),
+    "building_type": rng.choice(a=["A", "B", "C"], size=num_rows),
+}
+# Turn the dict of NumPy arrays into a DataFrame, one column per key.
+buildings = pl.DataFrame(buildings_data)
+print(buildings)
+# Column names mapped to their data types.
+print(buildings.schema)
+# head() with no argument shows the first rows (five by default).
+print(buildings.head())
+# Summary statistics for each column.
+print(buildings.describe())
diff --git a/python-polars/lazy_api.py b/python-polars/lazy_api.py
@@ -0,0 +1,41 @@
+import numpy as np
+import polars as pl
+
+num_rows = 5000
+rng = np.random.default_rng(seed=7)
+
+buildings = {
+    "sqft": rng.exponential(scale=1000, size=num_rows),
+    "price": rng.exponential(scale=100_000, size=num_rows),
+    "year": rng.integers(low=1995, high=2023, size=num_rows),
+    "building_type": rng.choice(a=["A", "B", "C"], size=num_rows),
+}
+buildings_lazy = pl.LazyFrame(buildings)
+print(buildings_lazy)
+
+lazy_query = (
+    buildings_lazy.with_columns(
+        (pl.col("price") / pl.col("sqft")).alias("price_per_sqft")
+    )
+    .filter(pl.col("price_per_sqft") > 100)
+    .filter(pl.col("year") < 2010)
+)
+print(lazy_query)
+
+lazy_query.show_graph()
+
+print(lazy_query.explain())
+
+lazy_query = (
+    buildings_lazy.with_columns(
+        (pl.col("price") / pl.col("sqft")).alias("price_per_sqft")
+    )
+    .filter(pl.col("price_per_sqft") > 100)
+    .filter(pl.col("year") < 2010)
+)
+
+print(lazy_query.collect().select(pl.col(["price_per_sqft", "year"])))
+
+print(
+    lazy_query.collect().select(pl.col(["price_per_sqft", "year"])).describe()
+)
diff --git a/python-polars/polars_integration.py b/python-polars/polars_integration.py
@@ -0,0 +1,35 @@
+import numpy as np
+import pandas as pd
+import polars as pl
+
+data = pl.DataFrame({"A": [1, 2, 3, 4, 5], "B": [6, 7, 8, 9, 10]})
+# Write the frame to the current working directory in three formats.
+data.write_csv("data.csv")
+data.write_ndjson("data.json")
+data.write_parquet("data.parquet")
+# Each format has an eager read_* and a lazy scan_* counterpart.
+data_csv = pl.read_csv("data.csv")
+data_csv_lazy = pl.scan_csv("data.csv")
+print(data_csv_lazy.schema)
+
+data_json = pl.read_ndjson("data.json")
+data_json_lazy = pl.scan_ndjson("data.json")
+print(data_json_lazy.schema)
+
+data_parquet = pl.read_parquet("data.parquet")
+data_parquet_lazy = pl.scan_parquet("data.parquet")
+print(data_parquet_lazy.schema)
+# The same two-column table built with Polars, pandas, and NumPy.
+polars_data = pl.DataFrame({"A": [1, 2, 3, 4, 5], "B": [6, 7, 8, 9, 10]})
+
+pandas_data = pd.DataFrame({"A": [1, 2, 3, 4, 5], "B": [6, 7, 8, 9, 10]})
+# .T turns the 2x5 array into 5 rows by 2 columns to match the frames above.
+numpy_data = np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]).T
+# Convert pandas and NumPy objects into Polars DataFrames ...
+print(pl.from_pandas(pandas_data))
+
+print(pl.from_numpy(numpy_data, schema={"A": pl.Int64, "B": pl.Int64}))
+# ... and convert a Polars DataFrame back to pandas and NumPy.
+print(polars_data.to_pandas())
+
+print(polars_data.to_numpy())
diff --git a/python-polars/scanning_data.py b/python-polars/scanning_data.py
@@ -0,0 +1,31 @@
+import pathlib
+import polars as pl
+from downloads import download_file
+
+url = "https://data.wa.gov/api/views/f6w7-q2d2/rows.csv?accessType=DOWNLOAD"
+local_file_path = pathlib.Path("electric_cars.csv")
+# Fetch the CSV into the current working directory on every run.
+download_file(url, local_file_path)
+# scan_csv() returns a LazyFrame instead of reading the whole file eagerly.
+lazy_car_data = pl.scan_csv(local_file_path)
+print(lazy_car_data)
+
+print(lazy_car_data.schema)
+# Lazy query: 2018+ battery EVs aggregated per (State, Make) pair.
+lazy_car_query = (
+    lazy_car_data.filter((pl.col("Model Year") >= 2018))
+    .filter(
+        pl.col("Electric Vehicle Type") == "Battery Electric Vehicle (BEV)"
+    )
+    .groupby(["State", "Make"])
+    .agg(
+        pl.mean("Electric Range").alias("Average Electric Range"),
+        pl.min("Model Year").alias("Oldest Model Year"),
+        pl.count().alias("Number of Cars"),
+    )
+    .filter(pl.col("Average Electric Range") > 0)
+    .filter(pl.col("Number of Cars") > 5)
+    .sort(pl.col("Number of Cars"), descending=True)
+)
+# collect() is what actually runs the query and returns a DataFrame.
+print(lazy_car_query.collect())