Merge pull request #254 from Joyakis/error-handling

TimidRobot · web-flow · commit 0266293e87e5 · 2025-12-16T23:54:24.000-08:00
Add shared function to open data files
diff --git a/scripts/2-process/gcs_process.py b/scripts/2-process/gcs_process.py
@@ -311,7 +311,9 @@ def main():
 
     # Count data
     file1_count = shared.path_join(PATHS["data_1-fetch"], "gcs_1_count.csv")
-    count_data = pd.read_csv(file1_count, usecols=["TOOL_IDENTIFIER", "COUNT"])
+    count_data = shared.open_data_file(
+        LOGGER, file1_count, usecols=["TOOL_IDENTIFIER", "COUNT"]
+    )
     process_product_totals(args, count_data)
     process_latest_prior_retired_totals(args, count_data)
     process_totals_by_free_cultural(args, count_data)
@@ -321,17 +323,19 @@ def main():
     file2_language = shared.path_join(
         PATHS["data_1-fetch"], "gcs_2_count_by_language.csv"
     )
-    language_data = pd.read_csv(
-        file2_language, usecols=["TOOL_IDENTIFIER", "LANGUAGE", "COUNT"]
+    language_data = shared.open_data_file(
+        LOGGER,
+        file2_language,
+        usecols=["TOOL_IDENTIFIER", "LANGUAGE", "COUNT"],
     )
     process_totals_by_language(args, language_data)
 
     # Country data
     file3_country = shared.path_join(
         PATHS["data_1-fetch"], "gcs_3_count_by_country.csv"
     )
-    country_data = pd.read_csv(
-        file3_country, usecols=["TOOL_IDENTIFIER", "COUNTRY", "COUNT"]
+    country_data = shared.open_data_file(
+        LOGGER, file3_country, usecols=["TOOL_IDENTIFIER", "COUNTRY", "COUNT"]
     )
     process_totals_by_country(args, country_data)
 
diff --git a/scripts/2-process/github_process.py b/scripts/2-process/github_process.py
@@ -178,7 +178,9 @@ def main():
     shared.git_fetch_and_merge(args, PATHS["repo"])
 
     file_count = shared.path_join(PATHS["data_1-fetch"], "github_1_count.csv")
-    count_data = pd.read_csv(file_count, usecols=["TOOL_IDENTIFIER", "COUNT"])
+    count_data = shared.open_data_file(
+        LOGGER, file_count, usecols=["TOOL_IDENTIFIER", "COUNT"]
+    )
     process_totals_by_license(args, count_data)
     process_totals_by_restriction(args, count_data)
 
diff --git a/scripts/2-process/wikipedia_process.py b/scripts/2-process/wikipedia_process.py
@@ -151,7 +151,9 @@ def main():
     file_count = shared.path_join(
         PATHS["data_1-fetch"], "wikipedia_count_by_languages.csv"
     )
-    count_data = pd.read_csv(file_count, usecols=["LANGUAGE_NAME_EN", "COUNT"])
+    count_data = shared.open_data_file(
+        LOGGER, file_count, usecols=["LANGUAGE_NAME_EN", "COUNT"]
+    )
     process_language_representation(args, count_data)
     process_highest_language_usage(args, count_data)
     process_least_language_usage(args, count_data)
diff --git a/scripts/3-report/gcs_report.py b/scripts/3-report/gcs_report.py
@@ -11,7 +11,6 @@
 import traceback
 
 # Third-party
-import pandas as pd
 from pygments import highlight
 from pygments.formatters import TerminalFormatter
 from pygments.lexers import PythonTracebackLexer
@@ -80,7 +79,7 @@ def gcs_intro(args):
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "CC legal tool product"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     total_count = f"{data['Count'].sum():,d}"
     shared.update_readme(
         args,
@@ -111,7 +110,8 @@ def plot_products(args):
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "CC legal tool product"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
+
     data = data[::-1]  # reverse order
 
     title = "Products totals and percentages"
@@ -156,7 +156,7 @@ def plot_tool_status(args):
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "CC legal tool"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     data.sort_values(name_label, ascending=False, inplace=True)
 
     title = "CC legal tools status"
@@ -199,7 +199,7 @@ def plot_latest_tools(args):
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "CC legal tool"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     data.sort_values(name_label, ascending=False, inplace=True)
 
     title = "Latest CC legal tools"
@@ -241,7 +241,7 @@ def plot_prior_tools(args):
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "CC legal tool"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     data.sort_values(name_label, ascending=False, inplace=True)
 
     title = "Prior CC legal tools"
@@ -286,7 +286,7 @@ def plot_retired_tools(args):
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "CC legal tool"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     data.sort_values(name_label, ascending=False, inplace=True)
 
     title = "Retired CC legal tools"
@@ -332,7 +332,7 @@ def plot_countries_highest_usage(args):
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "Country"
     data_label = "Count"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     total_count = f"{data['Count'].sum():,d}"
     data.sort_values(data_label, ascending=False, inplace=True)
     data = data[:10]  # limit to highest 10
@@ -385,7 +385,7 @@ def plot_languages_highest_usage(args):
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "Language"
     data_label = "Count"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     total_count = f"{data['Count'].sum():,d}"
     data.sort_values(data_label, ascending=False, inplace=True)
     data = data[:10]  # limit to highest 10
@@ -439,7 +439,7 @@ def plot_free_culture(args):
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "Category"
     data_label = "Count"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
 
     title = "Approved for Free Cultural Works"
     plt = plot.combined_plot(
diff --git a/scripts/3-report/github_report.py b/scripts/3-report/github_report.py
@@ -11,7 +11,6 @@
 import traceback
 
 # Third-party
-import pandas as pd
 from pygments import highlight
 from pygments.formatters import TerminalFormatter
 from pygments.lexers import PythonTracebackLexer
@@ -77,11 +76,8 @@ def load_data(args):
         PATHS["data"], f"{selected_quarter}", "1-fetch", "github_1_count.csv"
     )
 
-    if not os.path.exists(file_path):
-        LOGGER.error(f"Data file not found: {file_path}")
-        return pd.DataFrame()
+    data = shared.open_data_file(LOGGER, file_path)
 
-    data = pd.read_csv(file_path)
     LOGGER.info(f"Data loaded from {file_path}")
     return data
 
@@ -97,7 +93,7 @@ def github_intro(args):
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "TOOL_IDENTIFIER"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     total_repositories = data.loc["Total public repositories", "COUNT"]
     cc_total = data[data.index.str.startswith("CC")]["COUNT"].sum()
     cc_percentage = f"{(cc_total / total_repositories) * 100:.2f}%"
@@ -152,7 +148,7 @@ def plot_totals_by_license_type(args):
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "License"
     data_label = "Count"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     data.sort_values(data_label, ascending=True, inplace=True)
     title = "Totals by license type"
     plt = plot.combined_plot(
@@ -201,7 +197,7 @@ def plot_totals_by_restriction(args):
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "Category"
     data_label = "Count"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     data.sort_values(name_label, ascending=False, inplace=True)
     title = "Totals by restriction"
     plt = plot.combined_plot(
diff --git a/scripts/3-report/wikipedia_report.py b/scripts/3-report/wikipedia_report.py
@@ -11,7 +11,6 @@
 import traceback
 
 # Third-party
-import pandas as pd
 from pygments import highlight
 from pygments.formatters import TerminalFormatter
 from pygments.lexers import PythonTracebackLexer
@@ -87,9 +86,11 @@ def wikipedia_intro(args):
     )
     name_label = "LANGUAGE_NAME_EN"
     name_label_top10 = "Language"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     total_articles = data["COUNT"].sum()
-    top10 = pd.read_csv(file_path_top10, index_col=name_label_top10)
+    top10 = shared.open_data_file(
+        LOGGER, file_path_top10, index_col=name_label_top10
+    )
     top10_articles = top10["Count"].sum()
     top10_percentage = (top10_articles / total_articles) * 100
     average_articles = total_articles / len(data)
@@ -131,7 +132,7 @@ def plot_language_representation(args):
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "Category"
     data_label = "Count"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     data.sort_values(data_label, ascending=True, inplace=True)
     title = "Language Representation"
     plt = plot.combined_plot(
@@ -176,7 +177,7 @@ def plot_highest_language_usage(args):
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "Language"
     data_label = "Count"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     data.sort_values(data_label, ascending=True, inplace=True)
     title = "Most represented languages"
     plt = plot.combined_plot(
@@ -219,7 +220,7 @@ def plot_least_language_usage(args):
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "Language"
     data_label = "Count"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     data.sort_values(data_label, ascending=True, inplace=True)
     title = "Least represented languages"
     plt = plot.combined_plot(
diff --git a/scripts/shared.py b/scripts/shared.py
@@ -6,6 +6,7 @@
 from datetime import datetime, timezone
 
 # Third-party
+import pandas as pd
 from git import InvalidGitRepositoryError, NoSuchPathError, Repo
 from pandas import PeriodIndex
 from requests import Session
@@ -66,6 +67,38 @@ def get_session(accept_header=None, session=None):
     return session
 
 
+def open_data_file(
+    logger,
+    file_path,
+    usecols=None,
+    index_col=None,
+):
+    """
+    Read the CSV at file_path into a pandas DataFrame; usecols and index_col
+    are passed through to pandas.read_csv. The logger argument is accepted
+    but not used by this function.
+
+    Raises QuantifyingException (exit_code=1) when the file is missing,
+    unreadable due to permissions, empty, or cannot be parsed as CSV.
+    """
+    try:
+        return pd.read_csv(file_path, usecols=usecols, index_col=index_col)
+    except FileNotFoundError as err:
+        raise QuantifyingException(
+            message=f"Data file not found: {file_path}", exit_code=1
+        ) from err
+    # ParserError (malformed CSV) is caught too, so "invalid" is accurate
+    except (pd.errors.EmptyDataError, pd.errors.ParserError) as err:
+        raise QuantifyingException(
+            message=f"CSV file is empty or invalid: {file_path}", exit_code=1
+        ) from err
+    except PermissionError as err:
+        raise QuantifyingException(
+            message=f"Permission denied when accessing data file: {file_path}",
+            exit_code=1,
+        ) from err
+
+
 def git_fetch_and_merge(args, repo_path, branch=None):
     if not args.enable_git:
         return