Skip to content

Commit ed7aa89

Browse files
committed
begin processing fetched data
1 parent 42b3d0d commit ed7aa89

File tree

1 file changed

+221
-11
lines changed

1 file changed

+221
-11
lines changed

scripts/2-process/gcs_process.py

Lines changed: 221 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,18 @@
44
for analysis and comparison between quarters.
55
"""
66
# Standard library
7+
import argparse
8+
import csv
79
import os
810
import sys
11+
import textwrap
912
import traceback
1013

11-
# import pandas as pd
14+
# Third-party
15+
import pandas as pd
16+
from pygments import highlight
17+
from pygments.formatters import TerminalFormatter
18+
from pygments.lexers import PythonTracebackLexer
1219

1320
# Add parent directory so shared can be imported
1421
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
@@ -19,6 +26,36 @@
1926
# Setup
2027
LOGGER, PATHS = shared.setup(__file__)
2128

29+
# Constants
# Input CSVs produced by the phase 1 fetch step (see scripts/1-fetch)
FILE1_COUNT = shared.path_join(PATHS["data_1-fetch"], "gcs_1_count.csv")
FILE2_LANGUAGE = shared.path_join(
    PATHS["data_1-fetch"], "gcs_2_count_by_language.csv"
)
FILE3_COUNTRY = shared.path_join(
    PATHS["data_1-fetch"], "gcs_3_count_by_country.csv"
)
# Quarter label derived from the basename of the quarter data directory
# (presumably something like "2024Q3" — confirm against shared.setup)
QUARTER = os.path.basename(PATHS["data_quarter"])
38+
39+
40+
def parse_arguments():
    """
    Parse command-line options, returns parsed argument namespace.
    """
    LOGGER.info("Parsing command-line options")
    parser = argparse.ArgumentParser(description=__doc__)
    # Both options are simple boolean flags; register them uniformly.
    flag_specs = (
        ("--enable-save", "Enable saving results"),
        (
            "--enable-git",
            "Enable git actions (fetch, merge, add, commit, and push)",
        ),
    )
    for flag, help_text in flag_specs:
        parser.add_argument(flag, action="store_true", help=help_text)
    return parser.parse_args()
57+
58+
2259
# def load_quarter_data(quarter):
2360
# """
2461
# Load data for a specific quarter.
@@ -157,19 +194,184 @@
157194
# return parser.parse_args()
158195

159196

160-
def main():
161-
raise shared.QuantifyingException("No current code for Phase 2", 0)
197+
def data_to_csv(args, data, file_path):
    """
    Write ``data`` to ``file_path`` as CSV when saving is enabled.

    Does nothing unless ``args.enable_save`` is set. Ensures the phase
    data directory exists before writing.
    """
    if args.enable_save:
        os.makedirs(PATHS["data_phase"], exist_ok=True)
        # emulate csv.unix_dialect
        data.to_csv(
            file_path,
            index=False,
            quoting=csv.QUOTE_ALL,
            lineterminator="\n",
        )
205+
206+
207+
def process_top_25_tools(args, count_data):
    """
    Rank legal tools by count and save the top 25 as CSV.

    Args:
        args: parsed command-line namespace (``enable_save`` gates writing).
        count_data: DataFrame with TOOL_IDENTIFIER and COUNT columns.
    """
    LOGGER.info("Processing top 25 tools")
    # In-place reset_index/rename on a frame sliced with iloc operates on a
    # derived object: it can emit SettingWithCopyWarning and is rejected
    # under pandas copy-on-write. Build the result by chaining instead,
    # which always yields fresh objects.
    data = (
        count_data.sort_values("COUNT", ascending=False)
        .head(25)
        .reset_index(drop=True)
        .rename(columns={"TOOL_IDENTIFIER": "CC legal tool", "COUNT": "Count"})
    )
    file_path = shared.path_join(PATHS["data_phase"], "gcs_top_25_tools.csv")
    data_to_csv(args, data, file_path)
218+
219+
220+
def process_totals_by_product(args, count_data):
    """
    Aggregate counts into CC legal tool product categories and save them.

    Args:
        args: parsed command-line namespace (``enable_save`` gates writing).
        count_data: DataFrame with TOOL_IDENTIFIER and COUNT columns.

    Raises:
        shared.QuantifyingException: if a TOOL_IDENTIFIER matches no known
            product category.
    """
    LOGGER.info("Processing totals by product")
    data = {
        "Licenses version 4.0": 0,
        "Licenses version 3.0": 0,
        "Licenses version 2.x": 0,
        "Licenses version 1.0": 0,
        "CC0 1.0": 0,
        "Public Domain Mark 1.0": 0,
        "Certification 1.0 US": 0,
    }
    for row in count_data.itertuples(index=False):
        # Named attribute access is robust to CSV column order; positional
        # row[0]/row[1] would silently swap tool and count if the columns
        # ever moved in the fetched file.
        tool = row.TOOL_IDENTIFIER
        count = row.COUNT
        # Order matters: public-domain tools are matched before version
        # substrings so e.g. "CC0 1.0" is not binned as "1.0" licenses.
        if tool.startswith("PDM"):
            key = "Public Domain Mark 1.0"
        elif "CC0" in tool:
            key = "CC0 1.0"
        elif "PUBLICDOMAIN" in tool:
            key = "Certification 1.0 US"
        elif "4.0" in tool:
            key = "Licenses version 4.0"
        elif "3.0" in tool:
            key = "Licenses version 3.0"
        elif "2." in tool:
            key = "Licenses version 2.x"
        elif "1.0" in tool:
            key = "Licenses version 1.0"
        else:
            # Include the offending identifier so the failure is actionable.
            # NOTE(review): another call site passes an exit code as a second
            # argument — confirm QuantifyingException's default.
            raise shared.QuantifyingException(
                f"Invalid TOOL_IDENTIFIER: {tool}"
            )
        data[key] += count

    data = pd.DataFrame(
        data.items(), columns=["CC legal tool product", "Count"]
    )
    file_path = shared.path_join(
        PATHS["data_phase"], "gcs_totals_by_product.csv"
    )
    data_to_csv(args, data, file_path)
259+
260+
261+
def process_totals_by_unit(args, count_data):
    """
    Aggregate counts by legal tool unit (by, by-sa, cc0, mark, ...) and save
    them sorted by descending count.

    Args:
        args: parsed command-line namespace (``enable_save`` gates writing).
        count_data: DataFrame with TOOL_IDENTIFIER and COUNT columns.
    """
    LOGGER.info("Processing totals by unit")
    data = {}
    for row in count_data.itertuples(index=False):
        # Named attribute access is robust to CSV column order.
        tool = row.TOOL_IDENTIFIER
        count = row.COUNT
        if tool.startswith("PDM"):
            key = "mark"
        elif "CC0" in tool:
            key = "cc0"
        elif "PUBLICDOMAIN" in tool:
            key = "certification"
        else:
            # License identifiers look like "CC <unit> <version> ..."; the
            # unit is the second whitespace-separated token.
            parts = tool.split()
            key = parts[1].lower()
            if key == "by-nd-nc":
                # Normalize the legacy unit ordering to the canonical name.
                key = "by-nc-nd"
        # dict.get replaces the membership-test-then-assign branches.
        data[key] = data.get(key, 0) + count

    data = pd.DataFrame(data.items(), columns=["Legal Tool Unit", "Count"])
    data.sort_values("Count", ascending=False, inplace=True)
    data.reset_index(drop=True, inplace=True)
    file_path = shared.path_join(PATHS["data_phase"], "gcs_totals_by_unit.csv")
    data_to_csv(args, data, file_path)
288+
289+
290+
def process_totals_by_free_cultural(args, count_data):
    """
    Tally counts into "Approved for Free Cultural Works" versus
    "Limited uses" and save the result sorted by descending count.
    """
    LOGGER.info("Processing totals by Approved for Free Cultural Works")
    approved = "Approved for Free Cultural Works"
    limited = "Limited uses"
    totals = {approved: 0, limited: 0}
    # Units whose licenses qualify as Free Cultural Works
    free_units = ["by-sa", "by", "sa", "sampling+"]
    for tool, count in count_data.itertuples(index=False):
        # Public-domain tools always qualify; otherwise the unit (second
        # token of the identifier) decides.
        if tool.startswith("PDM") or "CC0" in tool or "PUBLICDOMAIN" in tool:
            category = approved
        elif tool.split()[1].lower() in free_units:
            category = approved
        else:
            category = limited
        totals[category] += count

    data = pd.DataFrame(totals.items(), columns=["Category", "Count"])
    data.sort_values("Count", ascending=False, inplace=True)
    data.reset_index(drop=True, inplace=True)
    file_path = shared.path_join(
        PATHS["data_phase"], "gcs_totals_by_free_cultural.csv"
    )
    data_to_csv(args, data, file_path)
317+
318+
319+
def process_totals_by_restrictions(args, count_data):
    """
    Tally counts into restriction levels 0 (least restrictive) through 3
    (most restrictive) and save the result.
    """
    LOGGER.info("Processing totals by restriction")
    totals = {"level 0": 0, "level 1": 0, "level 2": 0, "level 3": 0}
    # Units grouped by restrictiveness; anything else falls to level 3.
    level_1_units = ("by-sa", "by", "sa", "sampling+")
    level_2_units = ("by-nc", "by-nc-sa", "sampling", "nc", "nc-sa")
    for record in count_data.itertuples(index=False):
        tool, count = record[0], record[1]
        if tool.startswith("PDM") or "CC0" in tool or "PUBLICDOMAIN" in tool:
            level = "level 0"
        else:
            unit = tool.split()[1].lower()
            if unit in level_1_units:
                level = "level 1"
            elif unit in level_2_units:
                level = "level 2"
            else:
                level = "level 3"
        totals[level] += count

    data = pd.DataFrame(totals.items(), columns=["Category", "Count"])
    file_path = shared.path_join(
        PATHS["data_phase"], "gcs_totals_by_restrictions.csv"
    )
    data_to_csv(args, data, file_path)
343+
162344

163-
# # Fetch and merge changes
164-
# shared.fetch_and_merge(PATHS["repo"])
345+
def main():
    """
    Process fetched Google Custom Search (GCS) count data into per-quarter
    CSV reports, then optionally commit and push the results.
    """
    args = parse_arguments()
    shared.log_paths(LOGGER, PATHS)
    # NOTE(review): presumably a no-op unless --enable-git is set —
    # confirm against shared.git_fetch_and_merge
    shared.git_fetch_and_merge(args, PATHS["repo"])

    # Count data
    count_data = pd.read_csv(FILE1_COUNT, usecols=["TOOL_IDENTIFIER", "COUNT"])
    process_top_25_tools(args, count_data)
    process_totals_by_product(args, count_data)
    process_totals_by_unit(args, count_data)
    process_totals_by_free_cultural(args, count_data)
    process_totals_by_restrictions(args, count_data)

    # # Language data (processing not yet implemented)
    # language_data = pd.read_csv(
    #     FILE2_LANGUAGE, usecols=["TOOL_IDENTIFIER", "LANGUAGE", "COUNT"]
    # )

    # # Country data (processing not yet implemented)
    # country_data = pd.read_csv(
    #     FILE3_COUNTRY, usecols=["TOOL_IDENTIFIER", "COUNTRY", "COUNT"]
    # )

    # NOTE(review): git_add_and_commit returns a (possibly updated) args
    # namespace that gates the subsequent push — confirm in shared
    args = shared.git_add_and_commit(
        args,
        PATHS["repo"],
        PATHS["data_quarter"],
        f"Add and commit new Google Custom Search (GCS) data for {QUARTER}",
    )
    shared.git_push_changes(args, PATHS["repo"])
173375

174376

175377
if __name__ == "__main__":
@@ -188,5 +390,13 @@ def main():
188390
LOGGER.info("(130) Halted via KeyboardInterrupt.")
189391
sys.exit(130)
190392
except Exception:
191-
LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}")
393+
traceback_formatted = textwrap.indent(
394+
highlight(
395+
traceback.format_exc(),
396+
PythonTracebackLexer(),
397+
TerminalFormatter(),
398+
),
399+
" ",
400+
)
401+
LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}")
192402
sys.exit(1)

0 commit comments

Comments
 (0)