Skip to content

Commit 5d47f57

Browse files
committed
Bugfixes and non-inner versions
1 parent 235c1ad commit 5d47f57

File tree

2 files changed

+89
-3
lines changed

2 files changed

+89
-3
lines changed

pandas-iterate-over-rows/fix_place_of_pub.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ def clean_pub_replace(df):
1515
def clean_pub_replace_inner(df):
1616
col = df["place_of_publication"]
1717
for city in CITIES:
18-
col.replace(rf".*{city}.*", city, regex=True)
18+
col = col.replace(rf".*{city}.*", city, regex=True)
1919
return col
2020

2121
return df.assign(place_of_publication=clean_pub_replace_inner)
@@ -55,7 +55,8 @@ def clean_pub_itertuples_inner(df):
5555
for name in CITIES:
5656
place = name if name in place else place
5757

58-
col.append(place)
58+
col.append(place)
59+
return col
5960

6061
return df.assign(place_of_publication=clean_pub_itertuples_inner)
6162

@@ -84,4 +85,4 @@ def clean_pub_list_comp_inner(df):
8485
with codetiming.Timer(
8586
name=f.__name__, text="{name:20}: {milliseconds:.2f} ms"
8687
):
87-
f(books)
88+
print(f(books).head())
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
"""
2+
Timing how long it takes to do a complex string replace on a whole column
3+
with various methods
4+
"""
5+
6+
import re

import codetiming
import pandas as pd
8+
9+
# Load the book records used by every timing run below. Three columns are
# renamed to snake_case and only those three are kept.
# NOTE: this statement runs when the module is executed and fetches the CSV
# over the network. The commented-out line reads from a local path instead.
# books = pd.read_csv("resources/books.csv")
books = (
    pd.read_csv(
        "https://github.com/realpython/python-data-cleaning/raw/master/Datasets/BL-Flickr-Images-Book.csv"
    )
    .rename(
        columns={
            "Place of Publication": "place_of_publication",
            "Title": "title",
            "Author": "author",
        }
    )
    .loc[:, ["title", "author", "place_of_publication"]]
)
23+
24+
# City names that publication-place strings are collapsed to. Order matters:
# _replace_city returns the first of these names it finds in the text.
CITIES = ["London", "Plymouth", "Oxford", "Boston"]
25+
26+
27+
def _replace_city(text):
    """Return the first city in CITIES that occurs in *text*, else *text*.

    CITIES is scanned in order and the first name that is a substring of
    *text* is returned. When no name matches, *text* is returned unchanged.

    Values that cannot be searched with a string (for example the float NaN
    that pandas uses for a missing cell, or None) are also returned
    unchanged instead of raising ``TypeError``.
    """
    try:
        for city in CITIES:
            if city in text:
                return city
    except TypeError:
        # ``city in text`` is not defined for non-string cells such as NaN.
        # Pass them through so one missing value does not abort a whole
        # column. try/except keeps the common all-strings path free of an
        # extra per-row type check.
        return text

    return text
33+
34+
35+
def clean_pub_replace(df):
    """Collapse publication places to bare city names with ``Series.replace``.

    For each name in CITIES, text in ``df["place_of_publication"]`` that
    matches ``.*<city>.*`` is replaced by the city name itself. The name is
    passed through ``re.escape`` so that any regex metacharacter in a city
    name (for example the dot in "St. Louis") is matched literally.

    Returns the cleaned column as a Series.
    """
    col = df["place_of_publication"]
    for city in CITIES:
        # ``replace`` returns a new Series, so the result must be rebound.
        col = col.replace(rf".*{re.escape(city)}.*", city, regex=True)
    return col
40+
41+
42+
def clean_pub_apply(df):
    """Collapse publication places to bare city names with ``Series.apply``.

    Makes one pass over ``df["place_of_publication"]`` per name in CITIES.
    In each pass, every value containing the name is replaced by the name
    and every other value is kept as is. Returns the resulting Series.
    """
    places = df["place_of_publication"]
    for city in CITIES:

        def collapse(value, target=city):
            # Same decision the per-city pass makes for each cell.
            if target in value:
                return target
            return value

        places = places.apply(collapse)
    return places
47+
48+
49+
def clean_pub_iterrows(df):
    """Clean the publication places row by row using ``DataFrame.iterrows``.

    Returns a list with one entry per row of *df*: the result of
    ``_replace_city`` applied to that row's ``place_of_publication`` value.
    """
    return [
        _replace_city(row["place_of_publication"]) for _, row in df.iterrows()
    ]

    # Equivalent explicit-loop form, kept for reference (never executed):
    # col = []
    # for _, row in df.iterrows():
    #     place = row["place_of_publication"]
    #     col.append(_replace_city(place))
    # return col
59+
60+
61+
def clean_pub_itertuples(df):
    """Clean the publication places row by row using ``DataFrame.itertuples``.

    Returns a list with one entry per row of *df*: the result of
    ``_replace_city`` applied to that row's ``place_of_publication`` attribute.
    """
    return [_replace_city(row.place_of_publication) for row in df.itertuples()]

    # Equivalent explicit-loop form, kept for reference (never executed):
    # col = []
    # for row in df.itertuples():
    #     place = row.place_of_publication
    #     col.append(_replace_city(place))
    # return col
69+
70+
71+
def clean_pub_list_comp(df):
    """Clean the publication places by iterating the column directly.

    Returns a list holding ``_replace_city(place)`` for each value in
    ``df["place_of_publication"]``, in column order.
    """
    return [_replace_city(place) for place in df["place_of_publication"]]
73+
74+
75+
# Time each cleaning strategy against the full ``books`` frame. The function
# itself is handed to ``assign``, which calls it with ``books`` to compute the
# new column; the resulting frame is discarded because only the elapsed time
# matters. The Timer's text template formats the function name (padded to 20
# characters) and the elapsed milliseconds.
for clean_func in [
    clean_pub_replace,
    clean_pub_apply,
    clean_pub_iterrows,
    clean_pub_itertuples,
    clean_pub_list_comp,
]:
    with codetiming.Timer(
        name=clean_func.__name__, text="{name:20}: {milliseconds:.2f} ms"
    ):
        books.assign(place_of_publication=clean_func)

0 commit comments

Comments
 (0)