
Commit cc1f668

reworking to include perfplot and integrating gahjelle's changes
1 parent 5d47f57 commit cc1f668

14 files changed: +8533 -167 lines
Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
people = [
    ["name", "age", "job"],
    ["Stephanie Gould", 100, "Banker"],
    ["Christopher Ward", 115, "Doctor, general practice"],
    ["Jill Santiago", 80, "Insurance account manager"],
    ["John Lewis", 58, "Animal technologist"],
    ["Bianca Moore", 39, "Plant breeder/geneticist"],
]


for row in people[1:]:
    print(row)

import pandas as pd

people = pd.DataFrame(people[1:], columns=people[0])

print(people)

for row in people:
    print(row)

for row in people.itertuples():
    print(row)

for row in people.itertuples():
    print(f"{row[1]} is a {row.job}")

people = [
    [
        "name",
        "Stephanie Gould",
        "Christopher Ward",
        "Jill Santiago",
        "John Lewis",
        "Bianca Moore",
    ],
    ["age", 44, 85, 27, 21, 112],
    [
        "job",
        "Banker",
        "Doctor, general practice",
        "Insurance account manager",
        "Animal technologist",
        "Plant breeder/geneticist",
    ],
]
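
A side note on the loops in the new script above (an observation, not part of the committed file): iterating a DataFrame directly walks its column labels, while .itertuples() yields one namedtuple per row with the index as the first field, which is why the loops print very different things. A minimal self-contained sketch:

import pandas as pd

people = pd.DataFrame(
    [["Stephanie Gould", 100, "Banker"]], columns=["name", "age", "job"]
)

for row in people:
    # Plain iteration over a DataFrame yields the column labels.
    print(row)  # -> name, age, job

for row in people.itertuples():
    # .itertuples() yields one namedtuple per row, with Index as the first
    # field, so both row[1] and row.job work as in the script above.
    print(row[1], row.job)  # -> Stephanie Gould Banker
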
Lines changed: 61 additions & 66 deletions
@@ -1,88 +1,83 @@
 """
-Timing how long it takes to do a complex string replace on a whole column
-with various methods
+Plotting how long it takes to do a complex string replace on a whole column
+with various methods.
+
+In the dataset, in the `place_of_publication` column, you've got entries like
+these:
+
+    London
+    London; Virtue & Yorston
+    Oxford
+    pp. 40. G. Bryan & Co: Oxford, 1898
+    Plymouth
+    pp. 40. W. Cann: Plymouth, [1876?]
+
+Most of these are just city names, but some have additional and unwanted
+information. For these, you want to detect if it has one of the city names,
+replacing the whole value with just the city name.
 """

-import codetiming
 import pandas as pd
+import perfplot

 books = pd.read_csv("resources/books.csv")

 CITIES = ["London", "Plymouth", "Oxford", "Boston"]


-def clean_pub_replace(df):
-    def clean_pub_replace_inner(df):
-        col = df["place_of_publication"]
-        for city in CITIES:
-            col = col.replace(rf".*{city}.*", city, regex=True)
-        return col
+def _replace_city(text):
+    for city in CITIES:
+        if city in text:
+            return city
+    return text

-    return df.assign(place_of_publication=clean_pub_replace_inner)

+def clean_pub_replace(df):
+    col = df["place_of_publication"]
+    for city in CITIES:
+        col = col.replace(rf".*{city}.*", city, regex=True)
+    return col

-def clean_pub_apply(df):
-    def clean_pub_apply_inner(df):
-        col = df["place_of_publication"]
-        for city in CITIES:
-            col = col.apply(lambda val: city if city in val else val)
-        return col

-    return df.assign(place_of_publication=clean_pub_apply_inner)
+def clean_pub_itertuples(df):
+    return [_replace_city(row.place_of_publication) for row in df.itertuples()]


 def clean_pub_iterrows(df):
-    def clean_pub_iterrows_inner(df):
-        col = []
-        for _, row in df.iterrows():
-            place = row["place_of_publication"]
-
-            for name in CITIES:
-                place = name if name in place else place
-
-            col.append(place)
+    return [
+        _replace_city(row["place_of_publication"]) for _, row in df.iterrows()
+    ]

-        return col

-    return df.assign(place_of_publication=clean_pub_iterrows_inner)
-
-
-def clean_pub_itertuples(df):
-    def clean_pub_itertuples_inner(df):
-        col = []
-        for row in df.itertuples():
-            place = row.place_of_publication
-            for name in CITIES:
-                place = name if name in place else place
-
-            col.append(place)
-        return col
-
-    return df.assign(place_of_publication=clean_pub_itertuples_inner)
+def clean_pub_apply(df):
+    col = df["place_of_publication"]
+    for city in CITIES:
+        col = col.apply(lambda val: city if city in val else val)
+    return col


 def clean_pub_list_comp(df):
-    def replace_city(text):
-        for city in CITIES:
-            if city in text:
-                return city
-
-        return text
-
-    def clean_pub_list_comp_inner(df):
-        return [replace_city(place) for place in df["place_of_publication"]]
-
-    return df.assign(place_of_publication=clean_pub_list_comp_inner)
-
-
-for f in [
-    clean_pub_apply,
-    clean_pub_iterrows,
-    clean_pub_itertuples,
-    clean_pub_replace,
-    clean_pub_list_comp,
-]:
-    with codetiming.Timer(
-        name=f.__name__, text="{name:20}: {milliseconds:.2f} ms"
-    ):
-        print(f(books).head())
+    return [_replace_city(place) for place in df["place_of_publication"]]
+
+
+def get_books(n):
+    books = pd.read_csv("resources/books.csv")
+    if n < len(books):
+        return books.iloc[:n]
+    return pd.concat([books for _ in range((n // len(books)) + 1)]).iloc[:n]
+
+
+perfplot.live(
+    setup=lambda n: get_books(n),
+    kernels=[
+        clean_pub_replace,
+        clean_pub_itertuples,
+        clean_pub_iterrows,
+        clean_pub_apply,
+        clean_pub_list_comp,
+    ],
+    labels=["replace", "itertuples", "iterrows", "apply", "list comp"],
+    n_range=[i**2 for i in range(1, 40, 2)],
+    equality_check=None,
+    logy=True,
+)
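
For readers skimming the diff, here is a rough, standalone sketch (not part of the commit) of what the cleaning rule in the new docstring does, using the committed _replace_city helper on a few hand-made values in place of resources/books.csv:

import pandas as pd

CITIES = ["London", "Plymouth", "Oxford", "Boston"]


def _replace_city(text):
    # If any known city name appears in the raw value, keep just the city name.
    for city in CITIES:
        if city in text:
            return city
    return text


# Hand-made sample standing in for the place_of_publication column.
places = pd.Series(
    [
        "London; Virtue & Yorston",
        "pp. 40. W. Cann: Plymouth, [1876?]",
        "Oxford",
    ]
)
print([_replace_city(place) for place in places])
# ['London', 'Plymouth', 'Oxford']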

pandas-iterate-over-rows/fix_place_of_pub_no_inner.py

Lines changed: 0 additions & 85 deletions
This file was deleted.
Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
import random

from faker import Faker

fake = Faker()
Faker.seed(2)

people = [
    ["name", "age", "job"],
    *[
        [
            f"{fake.first_name()} {fake.last_name()}",
            random.randint(18, 120),
            fake.job(),
        ]
        for _ in range(5)
    ],
]

print(people)

people_transposed = list(map(list, zip(*people)))

print(people_transposed)

pandas-iterate-over-rows/how_to_loop.py

Lines changed: 6 additions & 13 deletions
@@ -3,44 +3,37 @@
 import pandas as pd

 # %% Read CSV and rename headers
-
-webs = pd.read_csv("resources/popular_websites.csv")
+webs = pd.read_csv("resources/popular_websites.csv", index_col=0)

 # %% Define function to check connection
-
-
 def check_connection(name, url):
     try:
         httpx.get(url)
-
     except httpx.ConnectError:
         print("Failed to establish a connection")
         return False
-    else:
-        print(f"{name} is online!")
-        return True
+    print(f"{name} is online!")
+    return True


 # %% Use .itertuples() to iterate through all rows
-
 for web in webs.itertuples():
     check_connection(web.website, web.url)

+# %%
+for _, web in webs.iterrows():
+    check_connection(web["website"], web["url"])

 # %% Use list comprehension to iterate through all rows
-
 [check_connection(web.website, web.url) for web in webs.itertuples()]

 # %% Use the index to iterate through rows
-
 for i in webs.index:
     print({**webs.iloc[i]})

 # %% Transpose and cast to dictionary to iterate through rows
-
 for row in webs.T.to_dict().values():
     print(row)

-
 # %%
 webs.aggregate(["sum"])
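
The new iterrows cell accesses columns with square brackets while the itertuples cells use attribute access; a short standalone sketch (with made-up data, not the repository's popular_websites.csv) of why the two differ:

import pandas as pd

# Made-up stand-in for resources/popular_websites.csv.
webs = pd.DataFrame(
    {
        "website": ["Example", "Test"],
        "url": ["https://example.com", "https://test.invalid"],
    }
)

for web in webs.itertuples():
    # .itertuples() yields namedtuples, so columns are attributes.
    print(web.website, web.url)

for _, web in webs.iterrows():
    # .iterrows() yields (index, Series) pairs, so columns are looked up by key.
    print(web["website"], web["url"])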

pandas-iterate-over-rows/products.py

Lines changed: 0 additions & 2 deletions
@@ -4,7 +4,6 @@
 
 # %% To get cumulative sum, instead of looping, you can create intermediate
 # columns and use .cumsum()
-
 products = (
     pd.read_csv("resources/products.csv")
     .assign(
@@ -15,7 +14,6 @@
 )

 # %% The equivalent way to do that with only .itertuples()
-
 products = pd.read_csv("resources/products.csv")

 cumulative_sum = []
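
The .assign(...) body is cut off by the diff context above; as a minimal sketch of the pattern the comment describes (with hypothetical column names, since resources/products.csv is not shown here), the cumulative sum without an explicit loop might look like:

import pandas as pd

# Hypothetical stand-in for resources/products.csv.
products = pd.DataFrame({"price": [2.5, 4.0, 1.25], "quantity": [3, 1, 4]})

# Intermediate column plus .cumsum(), instead of accumulating in a Python loop.
products = products.assign(
    income=lambda df: df["price"] * df["quantity"],
    cumulative_income=lambda df: df["income"].cumsum(),
)
print(products)
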
Binary file not shown (-683 KB).

0 commit comments