Commit 1e01660

Merge pull request #147 from creativecommons/initial-process

Initial data processing of fetched GCS count data

2 parents: 9674b08 + f535132
File tree: 5 files changed (+243, −29 lines)

dev/create_gcs_query_plan.py

Lines changed: 4 additions & 4 deletions
@@ -116,7 +116,7 @@ def sort_tools(url):
 
 def get_tool_urls():
     LOGGER.info("Loading CC Legal Tool paths and adding prefix")
-    file_path = os.path.join(PATHS["data"], "legal-tool-paths.txt")
+    file_path = shared.path_join(PATHS["data"], "legal-tool-paths.txt")
     prefix = "//creativecommons.org/"
     tool_urls = []
     with open(file_path, "r") as file_obj:
@@ -128,14 +128,14 @@ def get_tool_urls():
 
 
 def load_countries():
-    file_path = os.path.join(PATHS["data"], "gcs_country_collection.yaml")
+    file_path = shared.path_join(PATHS["data"], "gcs_country_collection.yaml")
     with open(file_path, "r") as file_obj:
         countries = yaml.safe_load(file_obj)
     return countries
 
 
 def load_languages():
-    file_path = os.path.join(PATHS["data"], "gcs_language_collection.yaml")
+    file_path = shared.path_join(PATHS["data"], "gcs_language_collection.yaml")
    with open(file_path, "r") as file_obj:
        languages = yaml.safe_load(file_obj)
    return languages
@@ -202,7 +202,7 @@ def create_query_plan(tool_urls, countries, languages):
 
 def save_plan(plan):
     LOGGER.info("Saving Google query plan to CSV")
-    file_path = os.path.join(PATHS["data"], "gcs_query_plan.csv")
+    file_path = shared.path_join(PATHS["data"], "gcs_query_plan.csv")
     fieldnames = [
         "TOOL_URL",
         "TOOL_IDENTIFIER",

dev/prioritize_tools.py

Lines changed: 2 additions & 2 deletions
@@ -39,7 +39,7 @@
 
 def get_tool_urls():
     LOGGER.info("Loading CC Legal Tool paths and adding prefix")
-    file_path = os.path.join(PATHS["data"], "legal-tool-paths.txt")
+    file_path = shared.path_join(PATHS["data"], "legal-tool-paths.txt")
     prefix = "//creativecommons.org/"
     tool_urls = []
     with open(file_path, "r") as file_obj:
@@ -110,7 +110,7 @@ def sort_tools(path):
 
 def save_tools_list(tool_urls):
     LOGGER.info("Saving prioritized CC Legal Tool URLs")
-    file_path = os.path.join(PATHS["data"], "prioritized-tool-urls.txt")
+    file_path = shared.path_join(PATHS["data"], "prioritized-tool-urls.txt")
     tool_urls.append("")  # ensure file has end of file newline
     with open(file_path, "w") as file_obj:
         file_obj.writelines("\n".join(tool_urls))

scripts/1-fetch/gcs_fetch.py

Lines changed: 1 addition & 1 deletion
@@ -143,7 +143,7 @@ def get_last_completed_plan_index():
 
 def load_plan():
     plan = []
-    file_path = os.path.join(PATHS["data"], "gcs_query_plan.csv")
+    file_path = shared.path_join(PATHS["data"], "gcs_query_plan.csv")
     with open(file_path, "r", newline="") as file_obj:
         plan = list(csv.DictReader(file_obj, dialect="unix"))
     return plan

scripts/2-process/gcs_process.py

Lines changed: 221 additions & 11 deletions
@@ -4,11 +4,18 @@
 for analysis and comparison between quarters.
 """
 # Standard library
+import argparse
+import csv
 import os
 import sys
+import textwrap
 import traceback
 
-# import pandas as pd
+# Third-party
+import pandas as pd
+from pygments import highlight
+from pygments.formatters import TerminalFormatter
+from pygments.lexers import PythonTracebackLexer
 
 # Add parent directory so shared can be imported
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
@@ -19,6 +26,36 @@
 # Setup
 LOGGER, PATHS = shared.setup(__file__)
 
+# Constants
+FILE1_COUNT = shared.path_join(PATHS["data_1-fetch"], "gcs_1_count.csv")
+FILE2_LANGUAGE = shared.path_join(
+    PATHS["data_1-fetch"], "gcs_2_count_by_language.csv"
+)
+FILE3_COUNTRY = shared.path_join(
+    PATHS["data_1-fetch"], "gcs_3_count_by_country.csv"
+)
+QUARTER = os.path.basename(PATHS["data_quarter"])
+
+
+def parse_arguments():
+    """
+    Parse command-line options, returns parsed argument namespace.
+    """
+    LOGGER.info("Parsing command-line options")
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--enable-save",
+        action="store_true",
+        help="Enable saving results",
+    )
+    parser.add_argument(
+        "--enable-git",
+        action="store_true",
+        help="Enable git actions (fetch, merge, add, commit, and push)",
+    )
+    return parser.parse_args()
+
+
 # def load_quarter_data(quarter):
 #     """
 #     Load data for a specific quarter.
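
Given these two opt-in flags, the script presumably defaults to a dry run. A hypothetical set of invocations from the repository root (the internals of the shared.git_* helpers are not shown in this diff):

```python
# Hypothetical invocations of this script (shell commands shown as
# comments, since the surrounding document is Python):
#
#   python scripts/2-process/gcs_process.py
#       dry run: data_to_csv(), added below, returns early without
#       writing anything, and the git helpers presumably no-op
#
#   python scripts/2-process/gcs_process.py --enable-save
#       writes the processed CSV files under data/<quarter>/2-process/
#
#   python scripts/2-process/gcs_process.py --enable-save --enable-git
#       additionally fetches/merges, commits, and pushes the new data
```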
@@ -157,19 +194,184 @@
 #     return parser.parse_args()
 
 
-def main():
-    raise shared.QuantifyingException("No current code for Phase 2", 0)
+def data_to_csv(args, data, file_path):
+    if not args.enable_save:
+        return
+    os.makedirs(PATHS["data_phase"], exist_ok=True)
+    # emulate csv.unix_dialect
+    data.to_csv(
+        file_path, index=False, quoting=csv.QUOTE_ALL, lineterminator="\n"
+    )
+
+
+def process_top_25_tools(args, count_data):
+    LOGGER.info("Processing top 25 tools")
+    data = count_data.sort_values("COUNT", ascending=False)
+    data.reset_index(drop=True, inplace=True)
+    data = data.iloc[:25]
+    data.rename(
+        columns={"TOOL_IDENTIFIER": "CC legal tool", "COUNT": "Count"},
+        inplace=True,
+    )
+    file_path = shared.path_join(PATHS["data_phase"], "gcs_top_25_tools.csv")
+    data_to_csv(args, data, file_path)
+
+
+def process_totals_by_product(args, count_data):
+    LOGGER.info("Processing totals by product")
+    data = {
+        "Licenses version 4.0": 0,
+        "Licenses version 3.0": 0,
+        "Licenses version 2.x": 0,
+        "Licenses version 1.0": 0,
+        "CC0 1.0": 0,
+        "Public Domain Mark 1.0": 0,
+        "Certification 1.0 US": 0,
+    }
+    for row in count_data.itertuples(index=False):
+        tool = row[0]
+        count = row[1]
+        if tool.startswith("PDM"):
+            key = "Public Domain Mark 1.0"
+        elif "CC0" in tool:
+            key = "CC0 1.0"
+        elif "PUBLICDOMAIN" in tool:
+            key = "Certification 1.0 US"
+        elif "4.0" in tool:
+            key = "Licenses version 4.0"
+        elif "3.0" in tool:
+            key = "Licenses version 3.0"
+        elif "2." in tool:
+            key = "Licenses version 2.x"
+        elif "1.0" in tool:
+            key = "Licenses version 1.0"
+        else:
+            raise shared.QuantifyingException("Invalid TOOL_IDENTIFIER")
+        data[key] += count
+
+    data = pd.DataFrame(
+        data.items(), columns=["CC legal tool product", "Count"]
+    )
+    file_path = shared.path_join(
+        PATHS["data_phase"], "gcs_totals_by_product.csv"
+    )
+    data_to_csv(args, data, file_path)
+
+
+def process_totals_by_unit(args, count_data):
+    LOGGER.info("Processing totals by unit")
+    data = {}
+    for row in count_data.itertuples(index=False):
+        tool = row[0]
+        count = row[1]
+        if tool.startswith("PDM"):
+            key = "mark"
+        elif "CC0" in tool:
+            key = "cc0"
+        elif "PUBLICDOMAIN" in tool:
+            key = "certification"
+        else:
+            parts = tool.split()
+            key = parts[1].lower()
+            if key == "by-nd-nc":
+                key = "by-nc-nd"
+        if key not in data.keys():
+            data[key] = count
+        else:
+            data[key] += count
+
+    data = pd.DataFrame(data.items(), columns=["Legal Tool Unit", "Count"])
+    data.sort_values("Count", ascending=False, inplace=True)
+    data.reset_index(drop=True, inplace=True)
+    file_path = shared.path_join(PATHS["data_phase"], "gcs_totals_by_unit.csv")
+    data_to_csv(args, data, file_path)
+
+
+def process_totals_by_free_cultural(args, count_data):
+    LOGGER.info("Processing totals by Approved for Free Cultural Works")
+    data = {
+        "Approved for Free Cultural Works": 0,
+        "Limited uses": 0,
+    }
+    for row in count_data.itertuples(index=False):
+        tool = row[0]
+        count = row[1]
+        if tool.startswith("PDM") or "CC0" in tool or "PUBLICDOMAIN" in tool:
+            key = "Approved for Free Cultural Works"
+        else:
+            parts = tool.split()
+            unit = parts[1].lower()
+            if unit in ["by-sa", "by", "sa", "sampling+"]:
+                key = "Approved for Free Cultural Works"
+            else:
+                key = "Limited uses"
+        data[key] += count
+
+    data = pd.DataFrame(data.items(), columns=["Category", "Count"])
+    data.sort_values("Count", ascending=False, inplace=True)
+    data.reset_index(drop=True, inplace=True)
+    file_path = shared.path_join(
+        PATHS["data_phase"], "gcs_totals_by_free_cultural.csv"
+    )
+    data_to_csv(args, data, file_path)
+
+
+def process_totals_by_restrictions(args, count_data):
+    LOGGER.info("Processing totals by restriction")
+    data = {"level 0": 0, "level 1": 0, "level 2": 0, "level 3": 0}
+    for row in count_data.itertuples(index=False):
+        tool = row[0]
+        count = row[1]
+        if tool.startswith("PDM") or "CC0" in tool or "PUBLICDOMAIN" in tool:
+            key = "level 0"
+        else:
+            parts = tool.split()
+            unit = parts[1].lower()
+            if unit in ["by-sa", "by", "sa", "sampling+"]:
+                key = "level 1"
+            elif unit in ["by-nc", "by-nc-sa", "sampling", "nc", "nc-sa"]:
+                key = "level 2"
+            else:
+                key = "level 3"
+        data[key] += count
+
+    data = pd.DataFrame(data.items(), columns=["Category", "Count"])
+    file_path = shared.path_join(
+        PATHS["data_phase"], "gcs_totals_by_restrictions.csv"
+    )
+    data_to_csv(args, data, file_path)
 
-    # # Fetch and merge changes
-    # shared.fetch_and_merge(PATHS["repo"])
+def main():
+    args = parse_arguments()
+    shared.log_paths(LOGGER, PATHS)
+    shared.git_fetch_and_merge(args, PATHS["repo"])
+
+    # Count data
+    count_data = pd.read_csv(FILE1_COUNT, usecols=["TOOL_IDENTIFIER", "COUNT"])
+    process_top_25_tools(args, count_data)
+    process_totals_by_product(args, count_data)
+    process_totals_by_unit(args, count_data)
+    process_totals_by_free_cultural(args, count_data)
+    process_totals_by_restrictions(args, count_data)
+
+    # # Language data
+    # language_data = pd.read_csv(
+    #     FILE2_LANGUAGE, usecols=["TOOL_IDENTIFIER", "LANGUAGE", "COUNT"]
+    # )
 
-    # # Add and commit changes
-    # shared.add_and_commit(
-    #     PATHS["repo"], PATHS["data_quarter"], "Fetched and updated new data"
+    # # Country data
+    # country_data = pd.read_csv(
+    #     FILE3_COUNTRY, usecols=["TOOL_IDENTIFIER", "COUNTRY", "COUNT"]
     # )
 
-    # # Push changes
-    # shared.push_changes(PATHS["repo"])
+    args = shared.git_add_and_commit(
+        args,
+        PATHS["repo"],
+        PATHS["data_quarter"],
+        f"Add and commit new Google Custom Search (GCS) data for {QUARTER}",
+    )
+    shared.git_push_changes(args, PATHS["repo"])
 
 
 if __name__ == "__main__":
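
The `# emulate csv.unix_dialect` comment in data_to_csv() is worth unpacking: the stdlib unix dialect quotes every field and terminates rows with "\n", and the to_csv() keywords reproduce exactly that. A small self-contained check of the equivalence (the column values here are invented for illustration):

```python
import csv
import io

import pandas as pd

df = pd.DataFrame({"TOOL_IDENTIFIER": ["CC BY 4.0"], "COUNT": [420000]})

# pandas, with the same keyword arguments used by data_to_csv()
buffer_pandas = io.StringIO()
df.to_csv(
    buffer_pandas, index=False, quoting=csv.QUOTE_ALL, lineterminator="\n"
)

# standard library, using the unix dialect directly
buffer_stdlib = io.StringIO()
writer = csv.writer(buffer_stdlib, dialect="unix")
writer.writerow(df.columns)
writer.writerows(df.itertuples(index=False))

# both produce: "TOOL_IDENTIFIER","COUNT"\n"CC BY 4.0","420000"\n
assert buffer_pandas.getvalue() == buffer_stdlib.getvalue()
```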
@@ -188,5 +390,13 @@ def main():
         LOGGER.info("(130) Halted via KeyboardInterrupt.")
         sys.exit(130)
     except Exception:
-        LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}")
+        traceback_formatted = textwrap.indent(
+            highlight(
+                traceback.format_exc(),
+                PythonTracebackLexer(),
+                TerminalFormatter(),
+            ),
+            "    ",
+        )
+        LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}")
         sys.exit(1)
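
The new handler colorizes the traceback with ANSI escapes and indents it before logging at CRITICAL level. A minimal standalone sketch of the same pygments pattern (using print in place of the project's LOGGER):

```python
import textwrap
import traceback

from pygments import highlight
from pygments.formatters import TerminalFormatter
from pygments.lexers import PythonTracebackLexer

try:
    1 / 0  # provoke an exception
except Exception:
    # colorize the traceback for the terminal, then indent every line
    formatted = textwrap.indent(
        highlight(
            traceback.format_exc(),
            PythonTracebackLexer(),
            TerminalFormatter(),
        ),
        "    ",
    )
    print(f"(1) Unhandled exception:\n{formatted}")
```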

scripts/shared.py

Lines changed: 15 additions & 11 deletions
@@ -31,20 +31,19 @@ def setup(current_file):
 
     # Paths
     paths = {}
-    paths["repo"] = os.path.dirname(
-        os.path.abspath(os.path.realpath(os.path.join(__file__, "..")))
-    )
-    paths["dotenv"] = os.path.join(paths["repo"], ".env")
+    paths["repo"] = os.path.dirname(path_join(__file__, ".."))
+    paths["dotenv"] = path_join(paths["repo"], ".env")
     paths["data"] = os.path.dirname(
         os.path.abspath(os.path.realpath(current_file))
     )
-    phase = os.path.basename(
+    current_phase = os.path.basename(
         os.path.dirname(os.path.abspath(os.path.realpath(current_file)))
     )
-    paths["data"] = os.path.join(paths["repo"], "data")
-    data_quarter = os.path.join(paths["data"], f"{quarter}")
-    paths["state"] = os.path.join(data_quarter, "state.yaml")
-    paths["data_phase"] = os.path.join(data_quarter, phase)
+    paths["data"] = path_join(paths["repo"], "data")
+    data_quarter = path_join(paths["data"], f"{quarter}")
+    for phase in ["1-fetch", "2-process", "3-report"]:
+        paths[f"data_{phase}"] = path_join(data_quarter, phase)
+    paths["data_phase"] = path_join(data_quarter, current_phase)
 
     paths["data_quarter"] = data_quarter

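Two behavioral notes on this hunk: the paths["state"] entry (state.yaml) is no longer registered, and the new loop registers every phase's data directory up front, which is what lets scripts/2-process/gcs_process.py reference PATHS["data_1-fetch"] above. For a script under scripts/2-process/ and a hypothetical quarter of 2024Q4, the resulting dict would look roughly like:

```python
# Illustrative only: the repo location and quarter are invented; the
# "data" key is first set to the caller's directory and then
# reassigned to the repo-level data directory, as in setup() above
paths = {
    "repo": "/home/user/quantifying",
    "dotenv": "/home/user/quantifying/.env",
    "data": "/home/user/quantifying/data",
    "data_1-fetch": "/home/user/quantifying/data/2024Q4/1-fetch",
    "data_2-process": "/home/user/quantifying/data/2024Q4/2-process",
    "data_3-report": "/home/user/quantifying/data/2024Q4/3-report",
    "data_phase": "/home/user/quantifying/data/2024Q4/2-process",
    "data_quarter": "/home/user/quantifying/data/2024Q4",
}
```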
@@ -53,9 +52,14 @@ def setup(current_file):
 
 def log_paths(logger, paths):
     paths_list = []
+    repo_path = paths["repo"]
     for label, path in paths.items():
         label = f"{label}:"
-        paths_list.append(f"\n{' ' * 12}{label:<11} {path}")
+        if label == "repo:":
+            paths_list.append(f"\n{' ' * 4}{label} {path}")
+        else:
+            path_new = path.replace(repo_path, ".")
+            paths_list.append(f"\n{' ' * 8}{label:<15} {path_new}")
     paths_list = "".join(paths_list)
     logger.info(f"PATHS:{paths_list}")
 
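
With this branch, log_paths() prints the repo path once in full and every other entry relative to it. Continuing the hypothetical paths from the sketch above, the message portion of the log record would render as:

```
PATHS:
    repo: /home/user/quantifying
        dotenv:         ./.env
        data:           ./data
        data_1-fetch:   ./data/2024Q4/1-fetch
        data_2-process: ./data/2024Q4/2-process
        data_3-report:  ./data/2024Q4/3-report
        data_phase:     ./data/2024Q4/2-process
        data_quarter:   ./data/2024Q4
```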

@@ -137,7 +141,7 @@ def update_readme(
     """
     Update the README.md file with the generated images and descriptions.
     """
-    readme_path = os.path.join(paths["data"], args.quarter, "README.md")
+    readme_path = path_join(paths["data"], args.quarter, "README.md")
 
     # Define section markers for each data source
     section_marker_start = f"<!-- {data_source} Start -->"
