realpython
diff --git a/‎pandas-iterate-over-rows/examples.py‎
Lines changed: 47 additions & 0 deletions b/‎pandas-iterate-over-rows/examples.py‎
Lines changed: 47 additions & 0 deletions
diff --git a/‎pandas-iterate-over-rows/fix_place_of_pub.py‎
Lines changed: 61 additions & 66 deletions b/‎pandas-iterate-over-rows/fix_place_of_pub.py‎
Lines changed: 61 additions & 66 deletions
diff --git a/‎pandas-iterate-over-rows/fix_place_of_pub_no_inner.py‎
Lines changed: 0 additions & 85 deletions b/‎pandas-iterate-over-rows/fix_place_of_pub_no_inner.py‎
Lines changed: 0 additions & 85 deletions
diff --git a/‎pandas-iterate-over-rows/generate_data.py‎
Lines changed: 24 additions & 0 deletions b/‎pandas-iterate-over-rows/generate_data.py‎
Lines changed: 24 additions & 0 deletions
diff --git a/‎pandas-iterate-over-rows/how_to_loop.py‎
Lines changed: 6 additions & 13 deletions b/‎pandas-iterate-over-rows/how_to_loop.py‎
Lines changed: 6 additions & 13 deletions
diff --git a/‎pandas-iterate-over-rows/products.py‎
Lines changed: 0 additions & 2 deletions b/‎pandas-iterate-over-rows/products.py‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎pandas-iterate-over-rows/resources.zip‎
-683 KB b/‎pandas-iterate-over-rows/resources.zip‎
-683 KB
@@ -0,0 +1,47 @@
+people = [
+    ["name", "age", "job"],  # header row
+    ["Stephanie Gould", 100, "Banker"],
+    ["Christopher Ward", 115, "Doctor, general practice"],
+    ["Jill Santiago", 80, "Insurance account manager"],
+    ["John Lewis", 58, "Animal technologist"],
+    ["Bianca Moore", 39, "Plant breeder/geneticist"],
+]
+
+
+for row in people[1:]:  # skip the header row
+    print(row)
+
+import pandas as pd
+
+people = pd.DataFrame(people[1:], columns=people[0])  # header row -> columns
+
+print(people)
+
+for row in people:  # iterating a DataFrame yields column labels, not rows
+    print(row)
+
+for row in people.itertuples():  # one namedtuple per row, index first
+    print(row)
+
+for row in people.itertuples():
+    print(f"{row[1]} is a {row.job}")  # row[0] is the index, row[1] the name
+
+people = [  # column-wise layout: each inner list is one column
+    [
+        "name",
+        "Stephanie Gould",
+        "Christopher Ward",
+        "Jill Santiago",
+        "John Lewis",
+        "Bianca Moore",
+    ],
+    ["age", 44, 85, 27, 21, 112],
+    [
+        "job",
+        "Banker",
+        "Doctor, general practice",
+        "Insurance account manager",
+        "Animal technologist",
+        "Plant breeder/geneticist",
+    ],
+]
@@ -1,88 +1,83 @@
 """
-Timing how long it takes to do a complex string replace on a whole column
-with various methods
+Plotting how long it takes to do a complex string replace on a whole column
+with various methods.
+
+In the dataset, in the `place_of_publication` column, you've got entries like
+these:
+
+London
+London; Virtue & Yorston
+Oxford
+pp. 40. G. Bryan & Co: Oxford, 1898
+Plymouth
+pp. 40. W. Cann: Plymouth, [1876?]
+
+Most of these are just city names, but some have additional and unwanted
+information. For these, you want to detect whether the value contains one of
+the city names and, if so, replace the whole value with just the city name.
 """
 
-import codetiming
 import pandas as pd
+import perfplot
 
 books = pd.read_csv("resources/books.csv")
 
 CITIES = ["London", "Plymouth", "Oxford", "Boston"]
 
 
-def clean_pub_replace(df):
-    def clean_pub_replace_inner(df):
-        col = df["place_of_publication"]
-        for city in CITIES:
-            col = col.replace(rf".*{city}.*", city, regex=True)
-        return col
+def _replace_city(text):
+    for city in CITIES:
+        if city in text:  # substring test, so extra text around it is fine
+            return city  # first match in CITIES order wins
+    return text  # no known city found: keep the value unchanged
 
-    return df.assign(place_of_publication=clean_pub_replace_inner)
 
+def clean_pub_replace(df):
+    col = df["place_of_publication"]
+    for city in CITIES:  # .*city.* also swallows the text around the city
+        col = col.replace(rf".*{city}.*", city, regex=True)
+    return col  # a Series; the list-comprehension variants return lists
 
-def clean_pub_apply(df):
-    def clean_pub_apply_inner(df):
-        col = df["place_of_publication"]
-        for city in CITIES:
-            col = col.apply(lambda val: city if city in val else val)
-        return col
 
-    return df.assign(place_of_publication=clean_pub_apply_inner)
+def clean_pub_itertuples(df):  # returns a list with one cleaned value per row
+    return [_replace_city(row.place_of_publication) for row in df.itertuples()]
 
 
 def clean_pub_iterrows(df):
-    def clean_pub_iterrows_inner(df):
-        col = []
-        for _, row in df.iterrows():
-            place = row["place_of_publication"]
-
-            for name in CITIES:
-                place = name if name in place else place
-
-            col.append(place)
+    return [
+        _replace_city(row["place_of_publication"]) for _, row in df.iterrows()
+    ]
 
-        return col
 
-    return df.assign(place_of_publication=clean_pub_iterrows_inner)
-
-
-def clean_pub_itertuples(df):
-    def clean_pub_itertuples_inner(df):
-        col = []
-        for row in df.itertuples():
-            place = row.place_of_publication
-            for name in CITIES:
-                place = name if name in place else place
-
-            col.append(place)
-        return col
-
-    return df.assign(place_of_publication=clean_pub_itertuples_inner)
+def clean_pub_apply(df):
+    col = df["place_of_publication"]
+    for city in CITIES:  # lambda runs right away, so capturing city is safe
+        col = col.apply(lambda val: city if city in val else val)
+    return col
 
 
 def clean_pub_list_comp(df):
-    def replace_city(text):
-        for city in CITIES:
-            if city in text:
-                return city
-
-        return text
-
-    def clean_pub_list_comp_inner(df):
-        return [replace_city(place) for place in df["place_of_publication"]]
-
-    return df.assign(place_of_publication=clean_pub_list_comp_inner)
-
-
-for f in [
-    clean_pub_apply,
-    clean_pub_iterrows,
-    clean_pub_itertuples,
-    clean_pub_replace,
-    clean_pub_list_comp,
-]:
-    with codetiming.Timer(
-        name=f.__name__, text="{name:20}: {milliseconds:.2f} ms"
-    ):
-        print(f(books).head())
+    return [_replace_city(place) for place in df["place_of_publication"]]
+
+
+def get_books(n):
+    """Return the first n rows of books, repeating the data if n is larger."""
+    if n < len(books):
+        return books.iloc[:n]
+    return pd.concat([books] * (n // len(books) + 1)).iloc[:n]
+
+
+perfplot.live(
+    setup=get_books,  # builds the input DataFrame for each n
+    kernels=[
+        clean_pub_replace,
+        clean_pub_itertuples,
+        clean_pub_iterrows,
+        clean_pub_apply,
+        clean_pub_list_comp,
+    ],
+    labels=["replace", "itertuples", "iterrows", "apply", "list comp"],
+    n_range=[i**2 for i in range(1, 40, 2)],  # 1, 9, 25, ..., 1521
+    equality_check=None,  # kernels return a mix of Series and lists
+    logy=True,
+)
@@ -0,0 +1,25 @@
+import random
+
+from faker import Faker
+
+fake = Faker()
+Faker.seed(2)
+random.seed(2)  # Faker.seed() leaves the global random module unseeded
+
+people = [
+    ["name", "age", "job"],  # header row
+    *[
+        [
+            f"{fake.first_name()} {fake.last_name()}",
+            random.randint(18, 120),
+            fake.job(),
+        ]
+        for _ in range(5)
+    ],
+]
+
+print(people)
+
+people_transposed = list(map(list, zip(*people)))  # rows -> columns
+
+print(people_transposed)
@@ -3,44 +3,37 @@
 import pandas as pd
 
 # %% Read CSV and rename headers
-
-webs = pd.read_csv("resources/popular_websites.csv")
+webs = pd.read_csv("resources/popular_websites.csv", index_col=0)
 
 # %% Define function to check connection
-
-
 def check_connection(name, url):
     try:
         httpx.get(url)
-
     except httpx.ConnectError:
         print("Failed to establish a connection")
         return False
-    else:
-        print(f"{name} is online!")
-        return True
+    print(f"{name} is online!")
+    return True
 
 
 # %% Use .itertuples() to iterate through all rows
-
 for web in webs.itertuples():
     check_connection(web.website, web.url)
 
+# %% Use .iterrows() to iterate through all rows
+for _, web in webs.iterrows():
+    check_connection(web["website"], web["url"])
 
 # %% Use list comprehension to iterate through all rows
-
 [check_connection(web.website, web.url) for web in webs.itertuples()]
 
 # %% Use the index to iterate through rows
-
 for i in webs.index:
     print({**webs.iloc[i]})
 
 # %% Transpose and cast to dictionary to iterate through rows
-
 for row in webs.T.to_dict().values():
     print(row)
 
-
 # %%
 webs.aggregate(["sum"])
@@ -4,7 +4,6 @@
 
 # %% To get cumulative sum, instead of looping, you can create intermediate
 # columns and use .cumsum()
-
 products = (
     pd.read_csv("resources/products.csv")
     .assign(
@@ -15,7 +14,6 @@
 )
 
 # %% The equivalent way to do that with only .itertuples()
-
 products = pd.read_csv("resources/products.csv")
 
 cumulative_sum = []