Skip to content

Commit 0266293

Browse files
authored
Merge pull request #254 from Joyakis/error-handling
Add Shared function to open data files
2 parents b0aa349 + 3ddc5c9 commit 0266293

File tree

7 files changed

+69
-31
lines changed

7 files changed

+69
-31
lines changed

scripts/2-process/gcs_process.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -311,7 +311,9 @@ def main():
311311

312312
# Count data
313313
file1_count = shared.path_join(PATHS["data_1-fetch"], "gcs_1_count.csv")
314-
count_data = pd.read_csv(file1_count, usecols=["TOOL_IDENTIFIER", "COUNT"])
314+
count_data = shared.open_data_file(
315+
LOGGER, file1_count, usecols=["TOOL_IDENTIFIER", "COUNT"]
316+
)
315317
process_product_totals(args, count_data)
316318
process_latest_prior_retired_totals(args, count_data)
317319
process_totals_by_free_cultural(args, count_data)
@@ -321,17 +323,19 @@ def main():
321323
file2_language = shared.path_join(
322324
PATHS["data_1-fetch"], "gcs_2_count_by_language.csv"
323325
)
324-
language_data = pd.read_csv(
325-
file2_language, usecols=["TOOL_IDENTIFIER", "LANGUAGE", "COUNT"]
326+
language_data = shared.open_data_file(
327+
LOGGER,
328+
file2_language,
329+
usecols=["TOOL_IDENTIFIER", "LANGUAGE", "COUNT"],
326330
)
327331
process_totals_by_language(args, language_data)
328332

329333
# Country data
330334
file3_country = shared.path_join(
331335
PATHS["data_1-fetch"], "gcs_3_count_by_country.csv"
332336
)
333-
country_data = pd.read_csv(
334-
file3_country, usecols=["TOOL_IDENTIFIER", "COUNTRY", "COUNT"]
337+
country_data = shared.open_data_file(
338+
LOGGER, file3_country, usecols=["TOOL_IDENTIFIER", "COUNTRY", "COUNT"]
335339
)
336340
process_totals_by_country(args, country_data)
337341

scripts/2-process/github_process.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,9 @@ def main():
178178
shared.git_fetch_and_merge(args, PATHS["repo"])
179179

180180
file_count = shared.path_join(PATHS["data_1-fetch"], "github_1_count.csv")
181-
count_data = pd.read_csv(file_count, usecols=["TOOL_IDENTIFIER", "COUNT"])
181+
count_data = shared.open_data_file(
182+
LOGGER, file_count, usecols=["TOOL_IDENTIFIER", "COUNT"]
183+
)
182184
process_totals_by_license(args, count_data)
183185
process_totals_by_restriction(args, count_data)
184186

scripts/2-process/wikipedia_process.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,9 @@ def main():
151151
file_count = shared.path_join(
152152
PATHS["data_1-fetch"], "wikipedia_count_by_languages.csv"
153153
)
154-
count_data = pd.read_csv(file_count, usecols=["LANGUAGE_NAME_EN", "COUNT"])
154+
count_data = shared.open_data_file(
155+
LOGGER, file_count, usecols=["LANGUAGE_NAME_EN", "COUNT"]
156+
)
155157
process_language_representation(args, count_data)
156158
process_highest_language_usage(args, count_data)
157159
process_least_language_usage(args, count_data)

scripts/3-report/gcs_report.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
import traceback
1212

1313
# Third-party
14-
import pandas as pd
1514
from pygments import highlight
1615
from pygments.formatters import TerminalFormatter
1716
from pygments.lexers import PythonTracebackLexer
@@ -80,7 +79,7 @@ def gcs_intro(args):
8079
)
8180
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
8281
name_label = "CC legal tool product"
83-
data = pd.read_csv(file_path, index_col=name_label)
82+
data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
8483
total_count = f"{data['Count'].sum():,d}"
8584
shared.update_readme(
8685
args,
@@ -111,7 +110,8 @@ def plot_products(args):
111110
)
112111
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
113112
name_label = "CC legal tool product"
114-
data = pd.read_csv(file_path, index_col=name_label)
113+
data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
114+
115115
data = data[::-1] # reverse order
116116

117117
title = "Products totals and percentages"
@@ -156,7 +156,7 @@ def plot_tool_status(args):
156156
)
157157
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
158158
name_label = "CC legal tool"
159-
data = pd.read_csv(file_path, index_col=name_label)
159+
data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
160160
data.sort_values(name_label, ascending=False, inplace=True)
161161

162162
title = "CC legal tools status"
@@ -199,7 +199,7 @@ def plot_latest_tools(args):
199199
)
200200
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
201201
name_label = "CC legal tool"
202-
data = pd.read_csv(file_path, index_col=name_label)
202+
data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
203203
data.sort_values(name_label, ascending=False, inplace=True)
204204

205205
title = "Latest CC legal tools"
@@ -241,7 +241,7 @@ def plot_prior_tools(args):
241241
)
242242
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
243243
name_label = "CC legal tool"
244-
data = pd.read_csv(file_path, index_col=name_label)
244+
data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
245245
data.sort_values(name_label, ascending=False, inplace=True)
246246

247247
title = "Prior CC legal tools"
@@ -286,7 +286,7 @@ def plot_retired_tools(args):
286286
)
287287
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
288288
name_label = "CC legal tool"
289-
data = pd.read_csv(file_path, index_col=name_label)
289+
data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
290290
data.sort_values(name_label, ascending=False, inplace=True)
291291

292292
title = "Retired CC legal tools"
@@ -332,7 +332,7 @@ def plot_countries_highest_usage(args):
332332
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
333333
name_label = "Country"
334334
data_label = "Count"
335-
data = pd.read_csv(file_path, index_col=name_label)
335+
data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
336336
total_count = f"{data['Count'].sum():,d}"
337337
data.sort_values(data_label, ascending=False, inplace=True)
338338
data = data[:10] # limit to highest 10
@@ -385,7 +385,7 @@ def plot_languages_highest_usage(args):
385385
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
386386
name_label = "Language"
387387
data_label = "Count"
388-
data = pd.read_csv(file_path, index_col=name_label)
388+
data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
389389
total_count = f"{data['Count'].sum():,d}"
390390
data.sort_values(data_label, ascending=False, inplace=True)
391391
data = data[:10] # limit to highest 10
@@ -439,7 +439,7 @@ def plot_free_culture(args):
439439
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
440440
name_label = "Category"
441441
data_label = "Count"
442-
data = pd.read_csv(file_path, index_col=name_label)
442+
data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
443443

444444
title = "Approved for Free Cultural Works"
445445
plt = plot.combined_plot(

scripts/3-report/github_report.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
import traceback
1212

1313
# Third-party
14-
import pandas as pd
1514
from pygments import highlight
1615
from pygments.formatters import TerminalFormatter
1716
from pygments.lexers import PythonTracebackLexer
@@ -77,11 +76,8 @@ def load_data(args):
7776
PATHS["data"], f"{selected_quarter}", "1-fetch", "github_1_count.csv"
7877
)
7978

80-
if not os.path.exists(file_path):
81-
LOGGER.error(f"Data file not found: {file_path}")
82-
return pd.DataFrame()
79+
data = shared.open_data_file(LOGGER, file_path)
8380

84-
data = pd.read_csv(file_path)
8581
LOGGER.info(f"Data loaded from {file_path}")
8682
return data
8783

@@ -97,7 +93,7 @@ def github_intro(args):
9793
)
9894
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
9995
name_label = "TOOL_IDENTIFIER"
100-
data = pd.read_csv(file_path, index_col=name_label)
96+
data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
10197
total_repositories = data.loc["Total public repositories", "COUNT"]
10298
cc_total = data[data.index.str.startswith("CC")]["COUNT"].sum()
10399
cc_percentage = f"{(cc_total / total_repositories) * 100:.2f}%"
@@ -152,7 +148,7 @@ def plot_totals_by_license_type(args):
152148
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
153149
name_label = "License"
154150
data_label = "Count"
155-
data = pd.read_csv(file_path, index_col=name_label)
151+
data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
156152
data.sort_values(data_label, ascending=True, inplace=True)
157153
title = "Totals by license type"
158154
plt = plot.combined_plot(
@@ -201,7 +197,7 @@ def plot_totals_by_restriction(args):
201197
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
202198
name_label = "Category"
203199
data_label = "Count"
204-
data = pd.read_csv(file_path, index_col=name_label)
200+
data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
205201
data.sort_values(name_label, ascending=False, inplace=True)
206202
title = "Totals by restriction"
207203
plt = plot.combined_plot(

scripts/3-report/wikipedia_report.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
import traceback
1212

1313
# Third-party
14-
import pandas as pd
1514
from pygments import highlight
1615
from pygments.formatters import TerminalFormatter
1716
from pygments.lexers import PythonTracebackLexer
@@ -87,9 +86,11 @@ def wikipedia_intro(args):
8786
)
8887
name_label = "LANGUAGE_NAME_EN"
8988
name_label_top10 = "Language"
90-
data = pd.read_csv(file_path, index_col=name_label)
89+
data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
9190
total_articles = data["COUNT"].sum()
92-
top10 = pd.read_csv(file_path_top10, index_col=name_label_top10)
91+
top10 = shared.open_data_file(
92+
LOGGER, file_path_top10, index_col=name_label_top10
93+
)
9394
top10_articles = top10["Count"].sum()
9495
top10_percentage = (top10_articles / total_articles) * 100
9596
average_articles = total_articles / len(data)
@@ -131,7 +132,7 @@ def plot_language_representation(args):
131132
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
132133
name_label = "Category"
133134
data_label = "Count"
134-
data = pd.read_csv(file_path, index_col=name_label)
135+
data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
135136
data.sort_values(data_label, ascending=True, inplace=True)
136137
title = "Language Representation"
137138
plt = plot.combined_plot(
@@ -176,7 +177,7 @@ def plot_highest_language_usage(args):
176177
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
177178
name_label = "Language"
178179
data_label = "Count"
179-
data = pd.read_csv(file_path, index_col=name_label)
180+
data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
180181
data.sort_values(data_label, ascending=True, inplace=True)
181182
title = "Most represented languages"
182183
plt = plot.combined_plot(
@@ -219,7 +220,7 @@ def plot_least_language_usage(args):
219220
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
220221
name_label = "Language"
221222
data_label = "Count"
222-
data = pd.read_csv(file_path, index_col=name_label)
223+
data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
223224
data.sort_values(data_label, ascending=True, inplace=True)
224225
title = "Least represented languages"
225226
plt = plot.combined_plot(

scripts/shared.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from datetime import datetime, timezone
77

88
# Third-party
9+
import pandas as pd
910
from git import InvalidGitRepositoryError, NoSuchPathError, Repo
1011
from pandas import PeriodIndex
1112
from requests import Session
@@ -66,6 +67,38 @@ def get_session(accept_header=None, session=None):
6667
return session
6768

6869

70+
def open_data_file(
    logger,
    file_path,
    usecols=None,
    index_col=None,
):
    """
    Read a CSV data file with pandas and convert expected failures into
    QuantifyingException so that every process/report script reports data
    file problems the same way.

    Args:
        logger: Logger of the calling script. It is not used by this
            function; it is accepted so all call sites share one signature.
        file_path: Location of the CSV file; passed to pandas.read_csv().
        usecols: Optional column subset, forwarded to pandas.read_csv().
        index_col: Optional index column, forwarded to pandas.read_csv().

    Returns:
        The pandas.DataFrame produced by pandas.read_csv().

    Raises:
        QuantifyingException: With exit_code=1 when the file is missing,
            cannot be read due to permissions, is empty, or cannot be
            parsed as CSV. The underlying exception is attached as the
            cause so the original details are not lost.
    """
    try:
        return pd.read_csv(file_path, usecols=usecols, index_col=index_col)
    # File does not exist
    except FileNotFoundError as err:
        raise QuantifyingException(
            message=f"Data file not found: {file_path}", exit_code=1
        ) from err
    # File exists but has no content to parse
    except pd.errors.EmptyDataError as err:
        raise QuantifyingException(
            message=f"CSV file is empty or invalid: {file_path}", exit_code=1
        ) from err
    # File has content, but pandas cannot tokenize it as CSV (for example,
    # rows with an inconsistent number of fields)
    except pd.errors.ParserError as err:
        raise QuantifyingException(
            message=f"CSV file is corrupted or malformed: {file_path}",
            exit_code=1,
        ) from err
    # Permission denied
    except PermissionError as err:
        raise QuantifyingException(
            message=f"Permission denied when accessing data file: {file_path}",
            exit_code=1,
        ) from err
100+
101+
69102
def git_fetch_and_merge(args, repo_path, branch=None):
70103
if not args.enable_git:
71104
return

0 commit comments

Comments
 (0)