Skip to content

Commit 3f21736

Browse files
committed
first upload
1 parent 3e797d0 commit 3f21736

File tree

5 files changed

+188
-0
lines changed

5 files changed

+188
-0
lines changed
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
"""
2+
Timing how long it takes to do a complex string replace on a whole column
3+
with various methods
4+
"""
5+
6+
import codetiming
7+
import pandas as pd
8+
9+
books = pd.read_csv("resources/books.csv")
10+
11+
CITIES = ["London", "Plymouth", "Oxford", "Boston"]
12+
13+
14+
def clean_pub_replace(df, cities=None):
    """Normalize place_of_publication using pandas regex .replace().

    Any value containing a known city name is collapsed to just that
    city name. Returns a new DataFrame; *df* is not modified.

    :param df: DataFrame with a "place_of_publication" string column
    :param cities: iterable of city names to normalize on
        (defaults to the module-level CITIES list)
    :return: copy of *df* with the cleaned column
    """
    if cities is None:
        cities = CITIES

    def clean_pub_replace_inner(df):
        col = df["place_of_publication"]
        for city in cities:
            # BUG FIX: Series.replace() returns a NEW Series; the original
            # code discarded the result, so nothing was ever replaced.
            # Re-bind col each iteration.
            col = col.replace(rf".*{city}.*", city, regex=True)
        return col

    return df.assign(place_of_publication=clean_pub_replace_inner)
22+
23+
24+
def clean_pub_apply(df):
    """Normalize place_of_publication with Series.apply(), one pass per city.

    A value containing a city name from CITIES is collapsed to that name;
    returns a new DataFrame, leaving *df* untouched.
    """

    def _normalize(frame):
        series = frame["place_of_publication"]
        # One full .apply() pass over the column for every city.
        for name in CITIES:
            series = series.apply(lambda value: name if name in value else value)
        return series

    return df.assign(place_of_publication=_normalize)
32+
33+
34+
def clean_pub_iterrows(df):
    """Normalize place_of_publication by iterating rows with .iterrows().

    Builds the cleaned column as a plain list and hands it to .assign();
    returns a new DataFrame, leaving *df* untouched.
    """

    def _normalize(frame):
        cleaned = []
        for _, record in frame.iterrows():
            place = record["place_of_publication"]
            # Keep overwriting, so the LAST matching city wins
            # (same semantics as the original conditional expression).
            for city in CITIES:
                if city in place:
                    place = city
            cleaned.append(place)
        return cleaned

    return df.assign(place_of_publication=_normalize)
48+
49+
50+
def clean_pub_itertuples(df, cities=None):
    """Normalize place_of_publication by iterating rows with .itertuples().

    Any value containing a known city name is collapsed to just that
    city name (the last match wins). Returns a new DataFrame; *df* is
    not modified.

    :param df: DataFrame with a "place_of_publication" string column
    :param cities: iterable of city names to normalize on
        (defaults to the module-level CITIES list)
    :return: copy of *df* with the cleaned column
    """
    if cities is None:
        cities = CITIES

    def clean_pub_itertuples_inner(df):
        col = []
        for row in df.itertuples():
            place = row.place_of_publication
            for name in cities:
                place = name if name in place else place
            col.append(place)
        # BUG FIX: the original inner function never returned col, so
        # .assign() received None and wiped out the column.
        return col

    return df.assign(place_of_publication=clean_pub_itertuples_inner)
61+
62+
63+
def clean_pub_list_comp(df):
    """Normalize place_of_publication using a list comprehension.

    Each value containing a known city name is collapsed to the FIRST
    matching city; returns a new DataFrame, leaving *df* untouched.
    """

    def _first_city(text):
        # First city found wins; unmatched text passes through unchanged
        # (same early-return semantics as the original helper).
        return next((city for city in CITIES if city in text), text)

    return df.assign(
        place_of_publication=lambda frame: [
            _first_city(place) for place in frame["place_of_publication"]
        ]
    )
75+
76+
77+
# Time each cleaning strategy on the full dataset and print one line per
# function with its elapsed time in milliseconds.
CLEANERS = (
    clean_pub_apply,
    clean_pub_iterrows,
    clean_pub_itertuples,
    clean_pub_replace,
    clean_pub_list_comp,
)

for cleaner in CLEANERS:
    timer = codetiming.Timer(
        name=cleaner.__name__, text="{name:20}: {milliseconds:.2f} ms"
    )
    with timer:
        cleaner(books)
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
# %%
2+
import httpx
3+
import pandas as pd
4+
5+
# %% Read CSV
# NOTE(review): the original comment said "rename headers", but no
# renaming is performed here — confirm whether a rename step was intended.

webs = pd.read_csv("resources/popular_websites.csv")
9+
# %% Define function to check connection
10+
11+
12+
def check_connection(name, url):
    """Report whether *url* responds to an HTTP GET.

    Prints a status line and returns True when the request completes,
    False when the connection cannot be established. Only connection
    failures are handled; other httpx errors propagate to the caller.
    """
    try:
        httpx.get(url)
    except httpx.ConnectError:
        online = False
    else:
        online = True

    if online:
        print(f"{name} is online!")
    else:
        print("Failed to establish a connection")
    return online
22+
23+
24+
# %% Use .itertuples() to iterate through all rows
# Each row is a namedtuple, so columns are attribute accesses.

for web in webs.itertuples():
    check_connection(web.website, web.url)

# %% Use list comprehension to iterate through all rows
# Calling a side-effecting function from a comprehension is shown only
# for comparison — a plain loop is the recommended style for side effects.

[check_connection(web.website, web.url) for web in webs.itertuples()]

# %% Use the index to iterate through rows
# .iloc[i] yields a Series; ** unpacks it into a plain dict for printing.

for i in webs.index:
    print({**webs.iloc[i]})

# %% Transpose and cast to dictionary to iterate through rows
# After transposing, each dict value is one row as a {column: value} mapping.

for row in webs.T.to_dict().values():
    print(row)

# %%
# Column-wise aggregation over the whole frame.
webs.aggregate(["sum"])
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# %%

import pandas as pd

# %% To get cumulative sum, instead of looping, you can create intermediate
# columns and use .cumsum()

products = (
    pd.read_csv("resources/products.csv")
    .assign(
        income=lambda df: df["sales"] * df["unit_price"],
        # FIX: column name was misspelled "cumulatative_income"; renamed to
        # match the "cumulative_income" column used in the loop version below.
        cumulative_income=lambda df: df["income"].cumsum(),
    )
    .drop(columns="income")
)

# %% The equivalent way to do that with only .itertuples()

products = pd.read_csv("resources/products.csv")

cumulative_sum = []

for row in products.itertuples():
    # Per-row income; add it to the running total accumulated so far.
    income = row.sales * row.unit_price
    if cumulative_sum:
        cumulative_sum.append(cumulative_sum[-1] + income)
    else:
        cumulative_sum.append(income)

products.assign(cumulative_income=cumulative_sum)
# %%
683 KB
Binary file not shown.
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# %% Different ways to take a sum of a column

import pandas as pd

# %%

webs = pd.read_csv("resources/popular_websites.csv")

# %% Best way: use the dedicated pandas method

webs["total_views"].sum()

# %% Generator expression over .itertuples()
# (originally labeled "List comprehension" — no intermediate list is built)

sum(row.total_views for row in webs.itertuples())

# %% Manual accumulation with .itertuples()

total = 0
for row in webs.itertuples():
    total += row.total_views

total

0 commit comments

Comments
 (0)