diff --git a/dev/create_gcs_query_plan.py b/dev/create_gcs_query_plan.py
index b2cab29f..1c3fc6a7 100755
--- a/dev/create_gcs_query_plan.py
+++ b/dev/create_gcs_query_plan.py
@@ -116,7 +116,7 @@ def sort_tools(url):
 
 def get_tool_urls():
     LOGGER.info("Loading CC Legal Tool paths and adding prefix")
-    file_path = os.path.join(PATHS["data"], "legal-tool-paths.txt")
+    file_path = shared.path_join(PATHS["data"], "legal-tool-paths.txt")
     prefix = "//creativecommons.org/"
     tool_urls = []
     with open(file_path, "r") as file_obj:
@@ -128,14 +128,14 @@ def get_tool_urls():
 
 
 def load_countries():
-    file_path = os.path.join(PATHS["data"], "gcs_country_collection.yaml")
+    file_path = shared.path_join(PATHS["data"], "gcs_country_collection.yaml")
     with open(file_path, "r") as file_obj:
         countries = yaml.safe_load(file_obj)
     return countries
 
 
 def load_languages():
-    file_path = os.path.join(PATHS["data"], "gcs_language_collection.yaml")
+    file_path = shared.path_join(PATHS["data"], "gcs_language_collection.yaml")
     with open(file_path, "r") as file_obj:
         languages = yaml.safe_load(file_obj)
     return languages
@@ -202,7 +202,7 @@ def create_query_plan(tool_urls, countries, languages):
 
 def save_plan(plan):
     LOGGER.info("Saving Google query plan to CSV")
-    file_path = os.path.join(PATHS["data"], "gcs_query_plan.csv")
+    file_path = shared.path_join(PATHS["data"], "gcs_query_plan.csv")
     fieldnames = [
         "TOOL_URL",
         "TOOL_IDENTIFIER",
diff --git a/dev/prioritize_tools.py b/dev/prioritize_tools.py
index 6320b4a4..5a04e4c7 100755
--- a/dev/prioritize_tools.py
+++ b/dev/prioritize_tools.py
@@ -39,7 +39,7 @@
 
 def get_tool_urls():
     LOGGER.info("Loading CC Legal Tool paths and adding prefix")
-    file_path = os.path.join(PATHS["data"], "legal-tool-paths.txt")
+    file_path = shared.path_join(PATHS["data"], "legal-tool-paths.txt")
     prefix = "//creativecommons.org/"
     tool_urls = []
     with open(file_path, "r") as file_obj:
@@ -110,7 +110,7 @@ def sort_tools(path):
 
 def save_tools_list(tool_urls):
     LOGGER.info("Saving prioritized CC Legal Tool URLs")
-    file_path = os.path.join(PATHS["data"], "prioritized-tool-urls.txt")
+    file_path = shared.path_join(PATHS["data"], "prioritized-tool-urls.txt")
     tool_urls.append("")  # ensure file has end of file newline
     with open(file_path, "w") as file_obj:
         file_obj.writelines("\n".join(tool_urls))
diff --git a/scripts/1-fetch/gcs_fetch.py b/scripts/1-fetch/gcs_fetch.py
index 29c8039b..49a2886d 100755
--- a/scripts/1-fetch/gcs_fetch.py
+++ b/scripts/1-fetch/gcs_fetch.py
@@ -143,7 +143,7 @@ def get_last_completed_plan_index():
 
 def load_plan():
     plan = []
-    file_path = os.path.join(PATHS["data"], "gcs_query_plan.csv")
+    file_path = shared.path_join(PATHS["data"], "gcs_query_plan.csv")
     with open(file_path, "r", newline="") as file_obj:
         plan = list(csv.DictReader(file_obj, dialect="unix"))
     return plan
diff --git a/scripts/2-process/gcs_process.py b/scripts/2-process/gcs_process.py
index 09aab97e..9396a995 100755
--- a/scripts/2-process/gcs_process.py
+++ b/scripts/2-process/gcs_process.py
@@ -4,11 +4,18 @@ for analysis and comparison between quarters.
""" # Standard library +import argparse +import csv import os import sys +import textwrap import traceback -# import pandas as pd +# Third-party +import pandas as pd +from pygments import highlight +from pygments.formatters import TerminalFormatter +from pygments.lexers import PythonTracebackLexer # Add parent directory so shared can be imported sys.path.append(os.path.join(os.path.dirname(__file__), "..")) @@ -19,6 +26,36 @@ # Setup LOGGER, PATHS = shared.setup(__file__) +# Constants +FILE1_COUNT = shared.path_join(PATHS["data_1-fetch"], "gcs_1_count.csv") +FILE2_LANGUAGE = shared.path_join( + PATHS["data_1-fetch"], "gcs_2_count_by_language.csv" +) +FILE3_COUNTRY = shared.path_join( + PATHS["data_1-fetch"], "gcs_3_count_by_country.csv" +) +QUARTER = os.path.basename(PATHS["data_quarter"]) + + +def parse_arguments(): + """ + Parse command-line options, returns parsed argument namespace. + """ + LOGGER.info("Parsing command-line options") + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--enable-save", + action="store_true", + help="Enable saving results", + ) + parser.add_argument( + "--enable-git", + action="store_true", + help="Enable git actions (fetch, merge, add, commit, and push)", + ) + return parser.parse_args() + + # def load_quarter_data(quarter): # """ # Load data for a specific quarter. @@ -157,19 +194,184 @@ # return parser.parse_args() -def main(): - raise shared.QuantifyingException("No current code for Phase 2", 0) +def data_to_csv(args, data, file_path): + if not args.enable_save: + return + os.makedirs(PATHS["data_phase"], exist_ok=True) + # emulate csv.unix_dialect + data.to_csv( + file_path, index=False, quoting=csv.QUOTE_ALL, lineterminator="\n" + ) + + +def process_top_25_tools(args, count_data): + LOGGER.info("Processing top 25 tools") + data = count_data.sort_values("COUNT", ascending=False) + data.reset_index(drop=True, inplace=True) + data = data.iloc[:25] + data.rename( + columns={"TOOL_IDENTIFIER": "CC legal tool", "COUNT": "Count"}, + inplace=True, + ) + file_path = shared.path_join(PATHS["data_phase"], "gcs_top_25_tools.csv") + data_to_csv(args, data, file_path) + + +def process_totals_by_product(args, count_data): + LOGGER.info("Processing totals by product") + data = { + "Licenses version 4.0": 0, + "Licenses version 3.0": 0, + "Licenses version 2.x": 0, + "Licenses version 1.0": 0, + "CC0 1.0": 0, + "Public Domain Mark 1.0": 0, + "Certification 1.0 US": 0, + } + for row in count_data.itertuples(index=False): + tool = row[0] + count = row[1] + if tool.startswith("PDM"): + key = "Public Domain Mark 1.0" + elif "CC0" in tool: + key = "CC0 1.0" + elif "PUBLICDOMAIN" in tool: + key = "Certification 1.0 US" + elif "4.0" in tool: + key = "Licenses version 4.0" + elif "3.0" in tool: + key = "Licenses version 3.0" + elif "2." 
+            key = "Licenses version 2.x"
+        elif "1.0" in tool:
+            key = "Licenses version 1.0"
+        else:
+            raise shared.QuantifyingException("Invalid TOOL_IDENTIFIER")
+        data[key] += count
+
+    data = pd.DataFrame(
+        data.items(), columns=["CC legal tool product", "Count"]
+    )
+    file_path = shared.path_join(
+        PATHS["data_phase"], "gcs_totals_by_product.csv"
+    )
+    data_to_csv(args, data, file_path)
+
+
+def process_totals_by_unit(args, count_data):
+    LOGGER.info("Processing totals by unit")
+    data = {}
+    for row in count_data.itertuples(index=False):
+        tool = row[0]
+        count = row[1]
+        if tool.startswith("PDM"):
+            key = "mark"
+        elif "CC0" in tool:
+            key = "cc0"
+        elif "PUBLICDOMAIN" in tool:
+            key = "certification"
+        else:
+            parts = tool.split()
+            key = parts[1].lower()
+            if key == "by-nd-nc":
+                key = "by-nc-nd"
+        if key not in data.keys():
+            data[key] = count
+        else:
+            data[key] += count
+
+    data = pd.DataFrame(data.items(), columns=["Legal Tool Unit", "Count"])
+    data.sort_values("Count", ascending=False, inplace=True)
+    data.reset_index(drop=True, inplace=True)
+    file_path = shared.path_join(PATHS["data_phase"], "gcs_totals_by_unit.csv")
+    data_to_csv(args, data, file_path)
+
+
+def process_totals_by_free_cultural(args, count_data):
+    LOGGER.info("Processing totals by Approved for Free Cultural Works")
+    data = {
+        "Approved for Free Cultural Works": 0,
+        "Limited uses": 0,
+    }
+    for row in count_data.itertuples(index=False):
+        tool = row[0]
+        count = row[1]
+        if tool.startswith("PDM") or "CC0" in tool or "PUBLICDOMAIN" in tool:
+            key = "Approved for Free Cultural Works"
+        else:
+            parts = tool.split()
+            unit = parts[1].lower()
+            if unit in ["by-sa", "by", "sa", "sampling+"]:
+                key = "Approved for Free Cultural Works"
+            else:
+                key = "Limited uses"
+        data[key] += count
+
+    data = pd.DataFrame(data.items(), columns=["Category", "Count"])
+    data.sort_values("Count", ascending=False, inplace=True)
+    data.reset_index(drop=True, inplace=True)
+    file_path = shared.path_join(
+        PATHS["data_phase"], "gcs_totals_by_free_cultural.csv"
+    )
+    data_to_csv(args, data, file_path)
+
+
+def process_totals_by_restrictions(args, count_data):
+    LOGGER.info("Processing totals by restriction")
+    data = {"level 0": 0, "level 1": 0, "level 2": 0, "level 3": 0}
+    for row in count_data.itertuples(index=False):
+        tool = row[0]
+        count = row[1]
+        if tool.startswith("PDM") or "CC0" in tool or "PUBLICDOMAIN" in tool:
+            key = "level 0"
+        else:
+            parts = tool.split()
+            unit = parts[1].lower()
+            if unit in ["by-sa", "by", "sa", "sampling+"]:
+                key = "level 1"
+            elif unit in ["by-nc", "by-nc-sa", "sampling", "nc", "nc-sa"]:
+                key = "level 2"
+            else:
+                key = "level 3"
+        data[key] += count
+
+    data = pd.DataFrame(data.items(), columns=["Category", "Count"])
+    file_path = shared.path_join(
+        PATHS["data_phase"], "gcs_totals_by_restrictions.csv"
+    )
+    data_to_csv(args, data, file_path)
+
 
-    # # Fetch and merge changes
-    # shared.fetch_and_merge(PATHS["repo"])
+def main():
+    args = parse_arguments()
+    shared.log_paths(LOGGER, PATHS)
+    shared.git_fetch_and_merge(args, PATHS["repo"])
+
+    # Count data
+    count_data = pd.read_csv(FILE1_COUNT, usecols=["TOOL_IDENTIFIER", "COUNT"])
+    process_top_25_tools(args, count_data)
+    process_totals_by_product(args, count_data)
+    process_totals_by_unit(args, count_data)
+    process_totals_by_free_cultural(args, count_data)
+    process_totals_by_restrictions(args, count_data)
+
+    # # Language data
+    # language_data = pd.read_csv(
+    #     FILE2_LANGUAGE, usecols=["TOOL_IDENTIFIER", "LANGUAGE", "COUNT"]
+    # )
 
-    # # Add and commit changes
-    # shared.add_and_commit(
-    #     PATHS["repo"], PATHS["data_quarter"], "Fetched and updated new data"
+    # # Country data
+    # country_data = pd.read_csv(
+    #     FILE3_COUNTRY, usecols=["TOOL_IDENTIFIER", "COUNTRY", "COUNT"]
     # )
 
-    # # Push changes
-    # shared.push_changes(PATHS["repo"])
+    args = shared.git_add_and_commit(
+        args,
+        PATHS["repo"],
+        PATHS["data_quarter"],
+        f"Add and commit new Google Custom Search (GCS) data for {QUARTER}",
+    )
+    shared.git_push_changes(args, PATHS["repo"])
 
 
 if __name__ == "__main__":
@@ -188,5 +390,13 @@ def main():
         LOGGER.info("(130) Halted via KeyboardInterrupt.")
         sys.exit(130)
     except Exception:
-        LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}")
+        traceback_formatted = textwrap.indent(
+            highlight(
+                traceback.format_exc(),
+                PythonTracebackLexer(),
+                TerminalFormatter(),
+            ),
+            "    ",
+        )
+        LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}")
         sys.exit(1)
diff --git a/scripts/shared.py b/scripts/shared.py
index 51f9c678..dc682669 100644
--- a/scripts/shared.py
+++ b/scripts/shared.py
@@ -31,20 +31,19 @@ def setup(current_file):
 
     # Paths
    paths = {}
-    paths["repo"] = os.path.dirname(
-        os.path.abspath(os.path.realpath(os.path.join(__file__, "..")))
-    )
-    paths["dotenv"] = os.path.join(paths["repo"], ".env")
+    paths["repo"] = os.path.dirname(path_join(__file__, ".."))
+    paths["dotenv"] = path_join(paths["repo"], ".env")
     paths["data"] = os.path.dirname(
         os.path.abspath(os.path.realpath(current_file))
     )
-    phase = os.path.basename(
+    current_phase = os.path.basename(
         os.path.dirname(os.path.abspath(os.path.realpath(current_file)))
     )
-    paths["data"] = os.path.join(paths["repo"], "data")
-    data_quarter = os.path.join(paths["data"], f"{quarter}")
-    paths["state"] = os.path.join(data_quarter, "state.yaml")
-    paths["data_phase"] = os.path.join(data_quarter, phase)
+    paths["data"] = path_join(paths["repo"], "data")
+    data_quarter = path_join(paths["data"], f"{quarter}")
+    for phase in ["1-fetch", "2-process", "3-report"]:
+        paths[f"data_{phase}"] = path_join(data_quarter, phase)
+    paths["data_phase"] = path_join(data_quarter, current_phase)
     paths["data_quarter"] = data_quarter
 
 
@@ -53,9 +52,14 @@ def setup(current_file):
 
 def log_paths(logger, paths):
     paths_list = []
+    repo_path = paths["repo"]
     for label, path in paths.items():
         label = f"{label}:"
-        paths_list.append(f"\n{' ' * 12}{label:<11} {path}")
+        if label == "repo:":
+            paths_list.append(f"\n{' ' * 4}{label} {path}")
+        else:
+            path_new = path.replace(repo_path, ".")
+            paths_list.append(f"\n{' ' * 8}{label:<15} {path_new}")
     paths_list = "".join(paths_list)
     logger.info(f"PATHS:{paths_list}")
 
@@ -137,7 +141,7 @@ def update_readme(
     """
     Update the README.md file with the generated images and descriptions.
     """
-    readme_path = os.path.join(paths["data"], args.quarter, "README.md")
+    readme_path = path_join(paths["data"], args.quarter, "README.md")
 
     # Define section markers for each data source
     section_marker_start = f"