realpython
diff --git a/‎pandas-iterate-over-rows/README.md‎
Lines changed: 11 additions & 0 deletions b/‎pandas-iterate-over-rows/README.md‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎pandas-iterate-over-rows/cumulative_sum_codetiming.py‎
Lines changed: 27 additions & 0 deletions b/‎pandas-iterate-over-rows/cumulative_sum_codetiming.py‎
Lines changed: 27 additions & 0 deletions
diff --git a/‎pandas-iterate-over-rows/cumulative_sum_perfplot.py‎
Lines changed: 42 additions & 0 deletions b/‎pandas-iterate-over-rows/cumulative_sum_perfplot.py‎
Lines changed: 42 additions & 0 deletions
diff --git a/‎pandas-iterate-over-rows/fix_place_of_pub_perfplot.py‎
Lines changed: 85 additions & 0 deletions b/‎pandas-iterate-over-rows/fix_place_of_pub_perfplot.py‎
Lines changed: 85 additions & 0 deletions
diff --git a/‎pandas-iterate-over-rows/how_to_loop.py‎
Lines changed: 54 additions & 0 deletions b/‎pandas-iterate-over-rows/how_to_loop.py‎
Lines changed: 54 additions & 0 deletions
diff --git a/‎pandas-iterate-over-rows/products.py‎
Lines changed: 27 additions & 0 deletions b/‎pandas-iterate-over-rows/products.py‎
Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,11 @@
+# How to Iterate Over Rows in pandas, and Why You Shouldn't
+
+In these supplementary materials, you'll find the examples discussed in the Real Python tutorial [How to Iterate Over Rows in pandas, and Why You Shouldn't](https://realpython.com/pandas-iterate-over-rows/), along with some bonus materials.
+
+In addition to the third-party packages used in the tutorial, you should also install [perfplot](https://github.com/nschloe/perfplot) to play with the bonus examples. To install all dependencies into your virtual environment, you can run the following command:
+
+```console
+$ python -m pip install pandas httpx codetiming perfplot
+```
+
+Along the way, you'll also find additional code samples that show alternatives to looping, such as list comprehensions, as well as alternative ways of looping.
@@ -0,0 +1,27 @@
+import pandas as pd
+from codetiming import Timer
+
+
+def loop_cumsum(products):
+    cumulative_sum = []
+    for product in products.itertuples():
+        income = product.sales * product.unit_price
+        if cumulative_sum:
+            cumulative_sum.append(cumulative_sum[-1] + income)
+        else:
+            cumulative_sum.append(income)
+    return products.assign(cumulative_income=cumulative_sum)
+
+
+def pandas_cumsum(products):
+    return products.assign(
+        income=lambda df: df["sales"] * df["unit_price"],
+        cumulative_income=lambda df: df["income"].cumsum(),
+    ).drop(columns="income")
+
+
+for func in [loop_cumsum, pandas_cumsum]:
+    products = pd.read_csv("resources/products.csv")
+    products = pd.concat(products for _ in range(1000))
+    with Timer(name=func.__name__, text="{name:20}: {milliseconds:.2f} ms"):
+        func(products)
@@ -0,0 +1,42 @@
+import pandas as pd
+import perfplot
+
+
+def loop_cumsum(products):
+    cumulative_sum = []
+    for product in products.itertuples():
+        income = product.sales * product.unit_price
+        if cumulative_sum:
+            cumulative_sum.append(cumulative_sum[-1] + income)
+        else:
+            cumulative_sum.append(income)
+    return products.assign(cumulative_income=cumulative_sum)
+
+
+def pandas_cumsum(products):
+    return products.assign(
+        income=lambda df: df["sales"] * df["unit_price"],
+        cumulative_income=lambda df: df["income"].cumsum(),
+    ).drop(columns="income")
+
+
+def get_products(n):
+    products = pd.read_csv("resources/products.csv")
+    if n < len(products):
+        return products.iloc[:n]
+    return pd.concat([products for _ in range((n // len(products)) + 1)]).iloc[
+        :n
+    ]
+
+
+plot = perfplot.bench(
+    n_range=[2**i for i in range(20)],
+    setup=get_products,
+    kernels=[pandas_cumsum, loop_cumsum],
+    labels=["pandas cumsum", "loop cumsum"],
+    equality_check=None,
+    title="Loop vs pandas Cumulative Sum",
+    xlabel="Number of Rows",
+)
+
+plot.show()
@@ -0,0 +1,85 @@
+"""
+Plotting how long it takes to do a complex string replace on a whole column
+with various methods.
+
+In the dataset, in the `place_of_publication` column, you've got entries like
+these:
+
+London
+London; Virtue & Yorston
+Oxford
+pp. 40. G. Bryan & Co: Oxford, 1898
+Plymouth
+pp. 40. W. Cann: Plymouth, [1876?]
+
+Most of these are just city names, but some have additional and unwanted
+information. For these, you want to detect if it has one of the city names,
+replacing the whole value with just the city name.
+"""
+
+import pandas as pd
+import perfplot
+
+# NOTE(review): this module-level frame is not referenced by any function
+# shown in this file; get_books() reads the CSV again itself.
+books = pd.read_csv("resources/books.csv")
+
+# City names to look for inside the place_of_publication values.
+CITIES = ["London", "Plymouth", "Oxford", "Boston"]
+
+
+def _replace_city(text):
+    for city in CITIES:
+        if city in text:
+            return city
+    return text
+
+
+def clean_pub_replace(df):
+    col = df["place_of_publication"]
+    for city in CITIES:
+        col = col.replace(rf".*{city}.*", city, regex=True)
+    return col
+
+
+def clean_pub_itertuples(df):
+    """Return a list of cleaned places, iterating rows with .itertuples()."""
+    return [_replace_city(row.place_of_publication) for row in df.itertuples()]
+
+
+def clean_pub_iterrows(df):
+    """Return a list of cleaned places, iterating rows with .iterrows()."""
+    return [
+        _replace_city(row["place_of_publication"]) for _, row in df.iterrows()
+    ]
+
+
+def clean_pub_apply(df):
+    col = df["place_of_publication"]
+    for city in CITIES:
+        col = col.apply(lambda val: city if city in val else val)
+    return col
+
+
+def clean_pub_list_comp(df):
+    """Return a list of cleaned places, iterating the column directly."""
+    return [_replace_city(place) for place in df["place_of_publication"]]
+
+
+def get_books(n):
+    books = pd.read_csv("resources/books.csv")
+    if n < len(books):
+        return books.iloc[:n]
+    return pd.concat([books for _ in range((n // len(books)) + 1)]).iloc[:n]
+
+
+plot = perfplot.bench(
+    setup=lambda n: get_books(n),
+    kernels=[
+        clean_pub_replace,
+        clean_pub_itertuples,
+        clean_pub_iterrows,
+        clean_pub_apply,
+        clean_pub_list_comp,
+    ],
+    labels=["replace", "itertuples", "iterrows", "apply", "list comp"],
+    n_range=[i**2 for i in range(1, 40, 2)],
+    equality_check=None,
+)
+
+plot.show()
+plot.show(logy=True)
@@ -0,0 +1,54 @@
+# %%
+import httpx
+import pandas as pd
+
+# %% Read CSV, using the first column as the index
+websites = pd.read_csv("resources/popular_websites.csv", index_col=0)
+print(websites)
+
+
+# %% Define function to check connection
+def check_connection(name, url):
+    try:
+        response = httpx.get(url)
+        location = response.headers.get("location")
+        if location is None or location.startswith(url):
+            print(f"{name} is online!")
+        else:
+            print(f"{name} is online! But redirects to {location}")
+        return True
+    except httpx.ConnectError:
+        print(f"Failed to establish a connection with {url}")
+        return False
+
+
+# %% Use .itertuples() to iterate through all rows
+for website in websites.itertuples():
+    check_connection(website.name, website.url)
+
+# %% You may use .iterrows() if you have dynamic column names
+name_column = "name"
+url_column = "url"
+for _, website in websites.iterrows():
+    check_connection(website[name_column], website[url_column])
+
+# %% Use list comprehension to iterate through all rows
+#    Note that this creates a list that is thrown away again
+[
+    check_connection(website.name, website.url)
+    for website in websites.itertuples()
+]
+
+# %% Use the index to iterate through rows
+for i in websites.index:
+    print({**websites.iloc[i]})
+
+# %% Transpose and cast to dictionary to iterate through rows
+#    After transposing, each original row becomes one inner dictionary
+for website in websites.T.to_dict().values():
+    print(website)
+
+# %% Use .agg() to aggregate over columns
+#    Each keyword pairs a result label with a (column, function) tuple
+websites.agg(
+    total_views=("total_views", "sum"),
+    average_views=("total_views", "mean"),
+)
@@ -0,0 +1,27 @@
+# %%
+import pandas as pd
+
+# %% Get the cumulative sum with .itertuples()
+products = pd.read_csv("resources/products.csv")
+
+cumulative_sum = []
+
+for product in products.itertuples():
+    income = product.sales * product.unit_price
+    if cumulative_sum:
+        cumulative_sum.append(cumulative_sum[-1] + income)
+    else:
+        cumulative_sum.append(income)
+
+products = products.assign(cumulative_income=cumulative_sum)
+
+# %% To get cumulative sum, instead of looping, you can create intermediate
+# columns and use .cumsum()
+products = (
+    pd.read_csv("resources/products.csv")
+    .assign(
+        income=lambda df: df["sales"] * df["unit_price"],
+        cumulatative_income=lambda df: df["income"].cumsum(),
+    )
+    .drop(columns="income")
+)