Skip to content

Commit c5e5377

Browse files
authored
Merge branch 'master' into build-a-rest-api-frontend
2 parents b213f13 + 97809cc commit c5e5377

34 files changed

+19614
-1
lines changed

pandas-iterate-over-rows/README.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# How to Iterate Over Rows in pandas, and Why You Shouldn't
2+
3+
In these supplementary materials, you'll find the examples discussed in the Real Python tutorial [How to Iterate Over Rows in pandas, and Why You Shouldn't](https://realpython.com/pandas-iterate-over-rows/), along with some bonus materials.
4+
5+
In addition to the third-party packages used in the tutorial, you should also install [perfplot](https://github.com/nschloe/perfplot) to play with the bonus examples. To install all dependencies into your virtual environment, you can run the following command:
6+
7+
```console
8+
$ python -m pip install pandas httpx codetiming perfplot
9+
```
10+
11+
You'll also find some additional code samples along the way that show alternatives to explicit loops, such as vectorized pandas methods, as well as other ways of iterating over rows, such as list comprehensions.
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
import pandas as pd
2+
from codetiming import Timer
3+
4+
5+
def loop_cumsum(products):
    """Add a ``cumulative_income`` column computed with an explicit row loop.

    Each row's income is ``sales * unit_price``; the new column holds the
    running total of those incomes in row order. The result comes from
    ``DataFrame.assign``, so it is returned as a new frame.
    """
    running_totals = []
    latest = None  # None until the first row has been processed
    for row in products.itertuples():
        income = row.sales * row.unit_price
        # The first row starts the series; later rows extend the last total
        latest = income if latest is None else latest + income
        running_totals.append(latest)
    return products.assign(cumulative_income=running_totals)
14+
15+
16+
def pandas_cumsum(products):
    """Add a ``cumulative_income`` column using vectorized pandas operations.

    A temporary ``income`` column (``sales * unit_price``) is created,
    accumulated with ``.cumsum()``, and then dropped from the result.
    """
    with_income = products.assign(
        income=products["sales"] * products["unit_price"]
    )
    with_running_total = with_income.assign(
        cumulative_income=with_income["income"].cumsum()
    )
    # income was only a stepping stone for the cumulative sum
    return with_running_total.drop(columns="income")
21+
22+
23+
# Time each implementation on the products table repeated 1000 times.
for func in (loop_cumsum, pandas_cumsum):
    # Reload and rebuild the frame for every run so each function gets
    # its own freshly constructed input.
    products = pd.concat([pd.read_csv("resources/products.csv")] * 1000)
    with Timer(text="{name:20}: {milliseconds:.2f} ms", name=func.__name__):
        func(products)
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
import pandas as pd
2+
import perfplot
3+
4+
5+
def loop_cumsum(products):
    """Return ``products`` with a ``cumulative_income`` column built row by row.

    Iterates over the rows with ``.itertuples()``, multiplies ``sales`` by
    ``unit_price`` for each one, and records the running total so far.
    """
    totals = []
    for record in products.itertuples():
        income = record.sales * record.unit_price
        # An empty list means this is the first row: nothing to add to yet
        totals.append(totals[-1] + income if totals else income)
    return products.assign(cumulative_income=totals)
14+
15+
16+
def pandas_cumsum(products):
    """Return ``products`` with a ``cumulative_income`` column, without looping.

    The per-row income is stored in a temporary ``income`` column, summed
    cumulatively with ``.cumsum()``, and the temporary column is removed
    before returning.
    """
    frame = products.assign(income=products["sales"] * products["unit_price"])
    frame = frame.assign(cumulative_income=frame["income"].cumsum())
    return frame.drop(columns="income")
21+
22+
23+
def get_products(n):
    """Load the products CSV and return a frame with exactly ``n`` rows.

    When ``n`` is smaller than the file, the first ``n`` rows are returned.
    Otherwise the table is repeated often enough to reach ``n`` rows and
    then truncated, so the original index labels repeat in the result.
    """
    base = pd.read_csv("resources/products.csv")
    available = len(base)
    if n < available:
        return base.iloc[:n]
    # One copy more than the integer quotient always yields at least n rows
    copies = n // available + 1
    return pd.concat([base] * copies).iloc[:n]
30+
31+
32+
# Benchmark both implementations on inputs of 1, 2, 4, ... 2**19 rows.
plot = perfplot.bench(
    setup=get_products,
    n_range=[2**i for i in range(20)],
    kernels=[pandas_cumsum, loop_cumsum],
    labels=["pandas cumsum", "loop cumsum"],  # same order as kernels
    # NOTE(review): None presumably turns off perfplot's comparison of the
    # kernels' outputs - confirm against the perfplot documentation.
    equality_check=None,
    xlabel="Number of Rows",
    title="Loop vs pandas Cumulative Sum",
)

plot.show()
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
"""
2+
Plotting how long it takes to do a complex string replace on a whole column
3+
with various methods.
4+
5+
In the dataset, in the `place_of_publication` column, you've got entries like
6+
these:
7+
8+
London
9+
London; Virtue & Yorston
10+
Oxford
11+
pp. 40. G. Bryan & Co: Oxford, 1898
12+
Plymouth
13+
pp. 40. W. Cann: Plymouth, [1876?]
14+
15+
Most of these are just city names, but some have additional and unwanted
16+
information. For these, you want to detect if it has one of the city names,
17+
replacing the whole value with just the city name.
18+
"""
19+
20+
import pandas as pd
21+
import perfplot
22+
23+
# NOTE(review): no function visible in this file reads this module-level
# frame; get_books() loads the CSV again on every call.
books = pd.read_csv("resources/books.csv")
24+
25+
# City names to look for inside each place_of_publication value; the cleaning
# functions below try them in this order.
CITIES = ["London", "Plymouth", "Oxford", "Boston"]
26+
27+
28+
def _replace_city(text):
    """Return the first entry of ``CITIES`` contained in ``text``.

    If none of the city names occurs as a substring, ``text`` is returned
    unchanged.
    """
    return next((city for city in CITIES if city in text), text)
33+
34+
35+
def clean_pub_replace(df):
    """Clean ``place_of_publication`` with one regex ``.replace()`` per city.

    Every value that matches ``.*<city>.*`` is replaced by the bare city
    name. Returns the cleaned column.
    """
    places = df["place_of_publication"]
    for city in CITIES:
        pattern = rf".*{city}.*"
        places = places.replace(pattern, city, regex=True)
    return places
40+
41+
42+
def clean_pub_itertuples(df):
    """Clean ``place_of_publication`` by looping over ``df.itertuples()``.

    Returns a list with one cleaned value per row.
    """
    records = df.itertuples()
    return [_replace_city(record.place_of_publication) for record in records]
44+
45+
46+
def clean_pub_iterrows(df):
    """Clean ``place_of_publication`` by looping over ``df.iterrows()``.

    Returns a list with one cleaned value per row.
    """
    column = "place_of_publication"
    # iterrows() yields (index label, row Series); only the row is needed
    return [_replace_city(series[column]) for _, series in df.iterrows()]
50+
51+
52+
def clean_pub_apply(df):
    """Clean ``place_of_publication`` with one ``Series.apply()`` pass per city.

    In each pass, values containing the current city are replaced by the
    city name and all other values are kept. Returns the cleaned column.
    """
    places = df["place_of_publication"]
    for city in CITIES:

        def keep_or_swap(value, target=city):
            return target if target in value else value

        places = places.apply(keep_or_swap)
    return places
57+
58+
59+
def clean_pub_list_comp(df):
    """Clean ``place_of_publication`` with a list comprehension over the column.

    Returns a list with one cleaned value per row.
    """
    places = df["place_of_publication"]
    return [_replace_city(entry) for entry in places]
61+
62+
63+
def get_books(n):
    """Load the books CSV and return a frame with exactly ``n`` rows.

    When ``n`` is smaller than the file, the first ``n`` rows are returned.
    Otherwise the table is repeated often enough to reach ``n`` rows and
    then truncated.
    """
    base = pd.read_csv("resources/books.csv")
    available = len(base)
    if n < available:
        return base.iloc[:n]
    # One copy more than the integer quotient always yields at least n rows
    copies = n // available + 1
    return pd.concat([base] * copies).iloc[:n]
68+
69+
70+
# Benchmark every cleaning strategy on inputs of 1, 9, 25, ... 39**2 rows.
plot = perfplot.bench(
    # get_books already takes the row count as its only argument, so it can
    # be passed directly instead of being wrapped in a lambda.
    setup=get_books,
    kernels=[
        clean_pub_replace,
        clean_pub_itertuples,
        clean_pub_iterrows,
        clean_pub_apply,
        clean_pub_list_comp,
    ],
    labels=["replace", "itertuples", "iterrows", "apply", "list comp"],
    n_range=[i**2 for i in range(1, 40, 2)],
    # NOTE(review): the kernels return either a column or a list; None
    # presumably turns off perfplot's comparison of their outputs - confirm
    # against the perfplot documentation.
    equality_check=None,
)

# Show the results twice: default axes first, then with logy=True
plot.show()
plot.show(logy=True)
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# %%
2+
import httpx
3+
import pandas as pd
4+
5+
# %% Read CSV into a DataFrame
6+
# index_col=0: use the first CSV column as the row index, not as a data column
websites = pd.read_csv("resources/popular_websites.csv", index_col=0)
7+
print(websites)
8+
9+
10+
# %% Define function to check connection
11+
def check_connection(name, url):
    """Request ``url`` and print whether the site called ``name`` is reachable.

    Returns ``True`` when a response was received and ``False`` when
    ``httpx`` raised ``ConnectError``. If the response carries a
    ``location`` header that does not start with ``url``, the printed
    message also mentions the redirect target.
    """
    try:
        response = httpx.get(url)
    except httpx.ConnectError:
        print(f"Failed to establish a connection with {url}")
        return False

    location = response.headers.get("location")
    redirects_elsewhere = location is not None and not location.startswith(url)
    if redirects_elsewhere:
        print(f"{name} is online! But redirects to {location}")
    else:
        print(f"{name} is online!")
    return True
23+
24+
25+
# %% Use .itertuples() to iterate through all rows
for website in websites.itertuples():
    check_connection(website.name, website.url)

# %% You may use .iterrows() if you have dynamic column names
name_column = "name"
url_column = "url"
for _, website in websites.iterrows():
    check_connection(website[name_column], website[url_column])

# %% Use list comprehension to iterate through all rows
# Note that this creates a list that is thrown away again
[
    check_connection(website.name, website.url)
    for website in websites.itertuples()
]

# %% Use the index to iterate through rows
# websites.index yields index labels, so rows are looked up by label with
# .loc; positional .iloc would only work if the labels were exactly 0..n-1.
for i in websites.index:
    print({**websites.loc[i]})

# %% Transpose and cast to dictionary to iterate through rows
for website in websites.T.to_dict().values():
    print(website)

# %% Use .agg() to aggregate over columns
websites.agg(
    total_views=("total_views", "sum"),
    average_views=("total_views", "mean"),
)
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# %%
2+
import pandas as pd
3+
4+
# %% Get the cumulative sum with .itertuples()
5+
products = pd.read_csv("resources/products.csv")

# Running total of sales * unit_price, one entry per row in row order
cumulative_sum = []

for product in products.itertuples():
    income = product.sales * product.unit_price
    # The first row starts the series; later rows extend the previous total
    cumulative_sum.append(
        (cumulative_sum[-1] + income) if cumulative_sum else income
    )

products = products.assign(cumulative_income=cumulative_sum)
17+
18+
# %% To get cumulative sum, instead of looping, you can create intermediate
19+
# columns and use .cumsum()
20+
products = (
    pd.read_csv("resources/products.csv")
    .assign(
        income=lambda df: df["sales"] * df["unit_price"],
        # Same column name as the loop-based version earlier in this file
        # (previously misspelled as "cumulatative_income")
        cumulative_income=lambda df: df["income"].cumsum(),
    )
    # income is only an intermediate column for .cumsum()
    .drop(columns="income")
)

0 commit comments

Comments
 (0)