Skip to content

Commit 3f21736

Browse files
committed
first upload
1 parent 3e797d0 commit 3f21736

File tree

5 files changed

+188
-0
lines changed

5 files changed

+188
-0
lines changed
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
"""
2+
Timing how long it takes to do a complex string replace on a whole column
3+
with various methods
4+
"""
5+
6+
import codetiming
7+
import pandas as pd
8+
9+
books = pd.read_csv("resources/books.csv")
10+
11+
CITIES = ["London", "Plymouth", "Oxford", "Boston"]
12+
13+
14+
def clean_pub_replace(df, cities=None):
    """Normalize place_of_publication using pandas regex .replace().

    Any value containing a known city name is collapsed to just that
    city name. Returns a new DataFrame; *df* is not modified.

    :param df: DataFrame with a "place_of_publication" string column
    :param cities: iterable of city names to normalize on
        (defaults to the module-level CITIES list)
    :return: copy of *df* with the cleaned column
    """
    if cities is None:
        cities = CITIES

    def clean_pub_replace_inner(df):
        col = df["place_of_publication"]
        for city in cities:
            # BUG FIX: Series.replace() returns a NEW Series; the original
            # code discarded the result, so nothing was ever replaced.
            # Re-bind col each iteration.
            col = col.replace(rf".*{city}.*", city, regex=True)
        return col

    return df.assign(place_of_publication=clean_pub_replace_inner)
22+
23+
24+
def clean_pub_apply(df):
    """Normalize place_of_publication with Series.apply(), one pass per city.

    A value containing a city name from CITIES is collapsed to that name;
    returns a new DataFrame, leaving *df* untouched.
    """

    def _normalize(frame):
        series = frame["place_of_publication"]
        # One full .apply() pass over the column for every city.
        for name in CITIES:
            series = series.apply(lambda value: name if name in value else value)
        return series

    return df.assign(place_of_publication=_normalize)
32+
33+
34+
def clean_pub_iterrows(df):
    """Normalize place_of_publication by iterating rows with .iterrows().

    Builds the cleaned column as a plain list and hands it to .assign();
    returns a new DataFrame, leaving *df* untouched.
    """

    def _normalize(frame):
        cleaned = []
        for _, record in frame.iterrows():
            place = record["place_of_publication"]
            # Keep overwriting, so the LAST matching city wins
            # (same semantics as the original conditional expression).
            for city in CITIES:
                if city in place:
                    place = city
            cleaned.append(place)
        return cleaned

    return df.assign(place_of_publication=_normalize)
48+
49+
50+
def clean_pub_itertuples(df, cities=None):
    """Normalize place_of_publication by iterating rows with .itertuples().

    Any value containing a known city name is collapsed to just that
    city name (the last match wins). Returns a new DataFrame; *df* is
    not modified.

    :param df: DataFrame with a "place_of_publication" string column
    :param cities: iterable of city names to normalize on
        (defaults to the module-level CITIES list)
    :return: copy of *df* with the cleaned column
    """
    if cities is None:
        cities = CITIES

    def clean_pub_itertuples_inner(df):
        col = []
        for row in df.itertuples():
            place = row.place_of_publication
            for name in cities:
                place = name if name in place else place
            col.append(place)
        # BUG FIX: the original inner function never returned col, so
        # .assign() received None and wiped out the column.
        return col

    return df.assign(place_of_publication=clean_pub_itertuples_inner)
61+
62+
63+
def clean_pub_list_comp(df):
    """Normalize place_of_publication using a list comprehension.

    Each value containing a known city name is collapsed to the FIRST
    matching city; returns a new DataFrame, leaving *df* untouched.
    """

    def _first_city(text):
        # First city found wins; unmatched text passes through unchanged
        # (same early-return semantics as the original helper).
        return next((city for city in CITIES if city in text), text)

    return df.assign(
        place_of_publication=lambda frame: [
            _first_city(place) for place in frame["place_of_publication"]
        ]
    )
75+
76+
77+
# Time each cleaning strategy on the full dataset and print one line per
# function with its elapsed time in milliseconds.
CLEANERS = (
    clean_pub_apply,
    clean_pub_iterrows,
    clean_pub_itertuples,
    clean_pub_replace,
    clean_pub_list_comp,
)

for cleaner in CLEANERS:
    timer = codetiming.Timer(
        name=cleaner.__name__, text="{name:20}: {milliseconds:.2f} ms"
    )
    with timer:
        cleaner(books)
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
# %%
2+
import httpx
3+
import pandas as pd
4+
5+
# %% Read CSV
# NOTE(review): the original comment said "rename headers", but no
# renaming is performed here — confirm whether a rename step was intended.

webs = pd.read_csv("resources/popular_websites.csv")
9+
# %% Define function to check connection
10+
11+
12+
def check_connection(name, url):
    """Report whether *url* responds to an HTTP GET.

    Prints a status line and returns True when the request completes,
    False when the connection cannot be established. Only connection
    failures are handled; other httpx errors propagate to the caller.
    """
    try:
        httpx.get(url)
    except httpx.ConnectError:
        online = False
    else:
        online = True

    if online:
        print(f"{name} is online!")
    else:
        print("Failed to establish a connection")
    return online
22+
23+
24+
# %% Use .itertuples() to iterate through all rows
# Each row is a namedtuple, so columns are attribute accesses.

for web in webs.itertuples():
    check_connection(web.website, web.url)

# %% Use list comprehension to iterate through all rows
# Calling a side-effecting function from a comprehension is shown only
# for comparison — a plain loop is the recommended style for side effects.

[check_connection(web.website, web.url) for web in webs.itertuples()]

# %% Use the index to iterate through rows
# .iloc[i] yields a Series; ** unpacks it into a plain dict for printing.

for i in webs.index:
    print({**webs.iloc[i]})

# %% Transpose and cast to dictionary to iterate through rows
# After transposing, each dict value is one row as a {column: value} mapping.

for row in webs.T.to_dict().values():
    print(row)

# %%
# Column-wise aggregation over the whole frame.
webs.aggregate(["sum"])
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# %%

import pandas as pd

# %% To get cumulative sum, instead of looping, you can create intermediate
# columns and use .cumsum()

products = (
    pd.read_csv("resources/products.csv")
    .assign(
        income=lambda df: df["sales"] * df["unit_price"],
        # FIX: column name was misspelled "cumulatative_income"; renamed to
        # match the "cumulative_income" column used in the loop version below.
        cumulative_income=lambda df: df["income"].cumsum(),
    )
    .drop(columns="income")
)

# %% The equivalent way to do that with only .itertuples()

products = pd.read_csv("resources/products.csv")

cumulative_sum = []

for row in products.itertuples():
    # Per-row income; add it to the running total accumulated so far.
    income = row.sales * row.unit_price
    if cumulative_sum:
        cumulative_sum.append(cumulative_sum[-1] + income)
    else:
        cumulative_sum.append(income)

products.assign(cumulative_income=cumulative_sum)
# %%
683 KB
Binary file not shown.
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# %% Different ways to take a sum of a column

import pandas as pd

# %%

webs = pd.read_csv("resources/popular_websites.csv")

# %% Best way: use the dedicated pandas method

webs["total_views"].sum()

# %% Generator expression over .itertuples()
# (originally labeled "List comprehension" — no intermediate list is built)

sum(row.total_views for row in webs.itertuples())

# %% Manual accumulation with .itertuples()

total = 0
for row in webs.itertuples():
    total += row.total_views

total

0 commit comments

Comments
 (0)