Skip to content

Commit e129aae

Browse files
authored
Merge pull request #420 from realpython/python_polars
Python polars article code
2 parents ad8d6c4 + 9de0b12 commit e129aae

File tree

7 files changed

+190
-0
lines changed

7 files changed

+190
-0
lines changed

python-polars/README.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Python Polars: A Lightning-Fast DataFrame Library
2+
3+
Supporting code for the Real Python tutorial [Python Polars: A Lightning-Fast DataFrame Library](https://realpython.com/polars-python/).
4+
5+
To run the code in this tutorial, you should have `polars`, `pandas`, `numpy`, `requests`, and `matplotlib` installed in your environment.
6+
7+
If you want to install Polars with all of the library's optional dependencies, then you can run:
8+
9+
```console
10+
$ python -m pip install "polars[all]" requests matplotlib
11+
```
12+
13+
Otherwise, you'll at least need to include the `pandas` and `numpy` feature flags:
14+
15+
```console
16+
$ python -m pip install "polars[numpy, pandas]" requests matplotlib
17+
```

python-polars/downloads.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
import pathlib
2+
import requests
3+
4+
5+
def download_file(
    file_url: str, local_file_path: pathlib.Path, timeout: float = 30.0
) -> None:
    """Download a file and save it at the given local path.

    Args:
        file_url: URL that is fetched with an HTTP GET request.
        local_file_path: Destination path. An existing file is overwritten.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, a stalled connection would block the call forever.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with a 4xx/5xx status code.
    """
    response = requests.get(file_url, timeout=timeout)

    # A Response is falsy when its status code is 400 or above.
    if not response:
        raise requests.exceptions.RequestException(
            f"Failed to download the file. Status code: {response.status_code}"
        )

    local_file_path.write_bytes(response.content)
    print(f"File successfully downloaded and stored at: {local_file_path}")
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import numpy as np
2+
import polars as pl
3+
4+
# Build a reproducible synthetic data set of buildings.
num_rows = 5000
rng = np.random.default_rng(seed=7)

buildings_data = {
    "sqft": rng.exponential(scale=1000, size=num_rows),
    "year": rng.integers(low=1995, high=2023, size=num_rows),
    "building_type": rng.choice(a=["A", "B", "C"], size=num_rows),
}
buildings = pl.DataFrame(buildings_data)

# Select context: by column name, by expression, and with a computation.
print(buildings.select("sqft"))

print(buildings.select(pl.col("sqft")))

print(buildings.select(pl.col("sqft").sort() / 1000))

# Filter context: keep only the rows whose year is greater than 2015.
after_2015 = buildings.filter(pl.col("year") > 2015)
print(after_2015.shape)
print(after_2015.select(pl.col("year").min()))

# Aggregation context. `groupby` was renamed to `group_by`, and calling
# `pl.count()` without arguments is deprecated in favor of `pl.len()`.
# The alias keeps the output column name "count".
print(
    buildings.group_by("building_type").agg(
        [
            pl.mean("sqft").alias("mean_sqft"),
            pl.median("year").alias("median_year"),
            pl.len().alias("count"),
        ]
    )
)

python-polars/getting_started.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
import numpy as np
2+
import polars as pl
3+
4+
num_rows = 5000
rng = np.random.default_rng(seed=7)

# Draw the columns in this fixed order so the seeded generator always
# produces the same values for each column.
sqft = rng.exponential(scale=1000, size=num_rows)
year = rng.integers(low=1995, high=2023, size=num_rows)
building_type = rng.choice(a=["A", "B", "C"], size=num_rows)
buildings_data = {"sqft": sqft, "year": year, "building_type": building_type}

buildings = pl.DataFrame(buildings_data)

# Print the frame itself, then its schema, first rows, and summary statistics.
views = (
    buildings,
    buildings.schema,
    buildings.head(),
    buildings.describe(),
)
for view in views:
    print(view)

python-polars/lazy_api.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
import numpy as np
2+
import polars as pl
3+
4+
# Build a reproducible synthetic data set of buildings.
num_rows = 5000
rng = np.random.default_rng(seed=7)

buildings = {
    "sqft": rng.exponential(scale=1000, size=num_rows),
    "price": rng.exponential(scale=100_000, size=num_rows),
    "year": rng.integers(low=1995, high=2023, size=num_rows),
    "building_type": rng.choice(a=["A", "B", "C"], size=num_rows),
}
buildings_lazy = pl.LazyFrame(buildings)
print(buildings_lazy)

# Nothing is computed here: the query is only a plan until it is collected.
lazy_query = (
    buildings_lazy.with_columns(
        (pl.col("price") / pl.col("sqft")).alias("price_per_sqft")
    )
    .filter(pl.col("price_per_sqft") > 100)
    .filter(pl.col("year") < 2010)
)
print(lazy_query)

# Rendering the plan as a graph needs Graphviz and matplotlib installed.
lazy_query.show_graph()

print(lazy_query.explain())

# Select the two columns as part of the lazy plan and collect once, so the
# query runs a single time and both printouts reuse the same result.
price_and_year = lazy_query.select(
    pl.col(["price_per_sqft", "year"])
).collect()

print(price_and_year)

print(price_and_year.describe())
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
import numpy as np
2+
import pandas as pd
3+
import polars as pl
4+
5+
# One shared sample: every frame and array below holds these two columns.
columns = {"A": [1, 2, 3, 4, 5], "B": [6, 7, 8, 9, 10]}

data = pl.DataFrame(columns)

# Write the sample in three formats.
data.write_csv("data.csv")
data.write_ndjson("data.json")
data.write_parquet("data.parquet")

# Read each file back eagerly, then scan it lazily and print its schema.
data_csv = pl.read_csv("data.csv")
data_csv_lazy = pl.scan_csv("data.csv")
print(data_csv_lazy.schema)

data_json = pl.read_ndjson("data.json")
data_json_lazy = pl.scan_ndjson("data.json")
print(data_json_lazy.schema)

data_parquet = pl.read_parquet("data.parquet")
data_parquet_lazy = pl.scan_parquet("data.parquet")
print(data_parquet_lazy.schema)

# The same sample as a Polars frame, a pandas frame, and a NumPy array.
polars_data = pl.DataFrame(columns)

pandas_data = pd.DataFrame(columns)

# Transpose so that each input list becomes a column of the array.
numpy_data = np.array([columns["A"], columns["B"]]).T

# Convert pandas and NumPy data to Polars.
print(pl.from_pandas(pandas_data))

int_schema = {"A": pl.Int64, "B": pl.Int64}
print(pl.from_numpy(numpy_data, schema=int_schema))

# Convert Polars data to pandas and NumPy.
print(polars_data.to_pandas())

print(polars_data.to_numpy())

python-polars/scanning_data.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import pathlib
2+
import polars as pl
3+
from downloads import download_file
4+
5+
# Download the CSV once, then scan it lazily instead of reading it eagerly.
url = "https://data.wa.gov/api/views/f6w7-q2d2/rows.csv?accessType=DOWNLOAD"
local_file_path = pathlib.Path("electric_cars.csv")

download_file(url, local_file_path)

lazy_car_data = pl.scan_csv(local_file_path)
print(lazy_car_data)

print(lazy_car_data.schema)

# Keep battery electric vehicles from model year 2018 onward, aggregate per
# state and make, and drop small or zero-range groups. `groupby` was renamed
# to `group_by`, and argument-less `pl.count()` is deprecated in favor of
# `pl.len()`.
lazy_car_query = (
    lazy_car_data.filter(pl.col("Model Year") >= 2018)
    .filter(
        pl.col("Electric Vehicle Type") == "Battery Electric Vehicle (BEV)"
    )
    .group_by(["State", "Make"])
    .agg(
        pl.mean("Electric Range").alias("Average Electric Range"),
        pl.min("Model Year").alias("Oldest Model Year"),
        pl.len().alias("Number of Cars"),
    )
    .filter(pl.col("Average Electric Range") > 0)
    .filter(pl.col("Number of Cars") > 5)
    .sort(pl.col("Number of Cars"), descending=True)
)

# The query only runs here, when the result is collected.
print(lazy_car_query.collect())

0 commit comments

Comments
 (0)