Skip to content

Commit 253c577

Browse files
authored
Merge pull request #213 from oree-xx/Github/processing
Improve GitHub processing and reporting
2 parents ddb3958 + 12f8799 commit 253c577

File tree

2 files changed

+305
-108
lines changed

2 files changed

+305
-108
lines changed

scripts/2-process/github_process.py

Lines changed: 128 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,18 @@
11
#!/usr/bin/env python
22
"""
3-
This file is dedicated to processing Github data
3+
This file is dedicated to processing GitHub data
44
for analysis and comparison between quarters.
55
"""
66
# Standard library
7+
import argparse
8+
import csv
79
import os
810
import sys
911
import traceback
1012

13+
# Third-party
1114
# import pandas as pd
15+
import pandas as pd
1216

1317
# Add parent directory so shared can be imported
1418
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
@@ -19,6 +23,112 @@
1923
# Setup
2024
LOGGER, PATHS = shared.setup(__file__)
2125

26+
# Constants
27+
QUARTER = os.path.basename(PATHS["data_quarter"])
28+
29+
30+
def parse_arguments():
    """
    Build the command-line parser, parse sys.argv, and return the
    argument namespace augmented with the shared logger and paths.
    """
    global PATHS
    LOGGER.info("Parsing command-line options")
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--quarter",
        default=QUARTER,
        help=f"Data quarter in format YYYYQx (default: {QUARTER})",
    )
    parser.add_argument(
        "--enable-save",
        action="store_true",
        help="Enable saving results (default: False)",
    )
    parser.add_argument(
        "--enable-git",
        action="store_true",
        help="Enable git actions such as fetch, merge, add, commit, and push"
        " (default: False)",
    )
    args = parser.parse_args()
    # Git actions only make sense when results are actually being saved
    if args.enable_git and not args.enable_save:
        parser.error("--enable-git requires --enable-save")
    # Re-point the shared paths when a non-default quarter was requested
    if args.quarter != QUARTER:
        PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, args.quarter)
    args.logger = LOGGER
    args.paths = PATHS
    return args
61+
62+
63+
def data_to_csv(args, data, file_path):
    """
    Write a DataFrame to file_path as a unix-dialect CSV.

    No-op unless saving was enabled via --enable-save. The directory
    containing file_path is created if it does not yet exist.
    """
    if not args.enable_save:
        return
    # Create the directory the file actually goes into, rather than a
    # hard-coded PATHS["data_phase"]: the original created the phase
    # directory even when file_path pointed somewhere else, and coupled
    # this helper to a module-level global for no benefit.
    os.makedirs(os.path.dirname(file_path) or ".", exist_ok=True)
    # emulate csv.unix_dialect: quote every field, LF line terminator
    data.to_csv(
        file_path, index=False, quoting=csv.QUOTE_ALL, lineterminator="\n"
    )
71+
72+
73+
def process_totals_by_license(args, count_data):
    """
    Processing count data: totals by License
    """
    LOGGER.info(process_totals_by_license.__doc__.strip())
    # Normalize every row up front (tool name as str, count as int),
    # then drop the aggregate "Total public repositories" row.
    pairs = [
        (str(row.TOOL_IDENTIFIER), int(row.COUNT))
        for row in count_data.itertuples(index=False)
    ]
    counts = {
        tool: count
        for tool, count in pairs
        if tool != "Total public repositories"
    }
    # Building the frame from pre-sorted pairs yields a clean 0..n index,
    # already ordered by license name.
    frame = pd.DataFrame(sorted(counts.items()), columns=["License", "Count"])
    file_path = shared.path_join(
        PATHS["data_phase"], "github_totals_by_license.csv"
    )
    data_to_csv(args, frame, file_path)
96+
97+
98+
def process_totals_by_restriction(args, count_data):
    """
    Processing count data: totals by restriction
    """
    # Categories per https://creativecommons.org/public-domain/freeworks/
    LOGGER.info(process_totals_by_restriction.__doc__.strip())
    # Lookup table replaces the original if/elif membership chain; tools
    # absent from the table (including the aggregate total row) are skipped.
    category_by_tool = {
        "BSD Zero Clause License": "Public domain",
        "CC0 1.0": "Public domain",
        "Unlicense": "Public domain",
        "MIT No Attribution": "Permissive",
        "CC BY 4.0": "Permissive",
        "CC BY-SA 4.0": "Copyleft",
    }
    totals = {"Copyleft": 0, "Permissive": 0, "Public domain": 0}
    for row in count_data.itertuples(index=False):
        tool = str(row.TOOL_IDENTIFIER)
        count = int(row.COUNT)
        category = category_by_tool.get(tool)
        if category is not None:
            totals[category] += count
    frame = pd.DataFrame(sorted(totals.items()), columns=["Category", "Count"])
    file_path = shared.path_join(
        PATHS["data_phase"], "github_totals_by_restriction.csv"
    )
    data_to_csv(args, frame, file_path)
130+
131+
22132
# def load_quarter_data(quarter):
23133
# """
24134
# Load data for a specific quarter.
@@ -63,18 +173,23 @@
63173

64174

65175
def main():
    """
    Process fetched GitHub count data into per-license and
    per-restriction totals, then commit the results.
    """
    args = parse_arguments()
    shared.paths_log(LOGGER, PATHS)
    shared.git_fetch_and_merge(args, PATHS["repo"])

    # Load the fetch-phase count data produced by the phase-1 script
    file_count = shared.path_join(PATHS["data_1-fetch"], "github_1_count.csv")
    count_data = pd.read_csv(file_count, usecols=["TOOL_IDENTIFIER", "COUNT"])
    process_totals_by_license(args, count_data)
    process_totals_by_restriction(args, count_data)

    # Add, commit, and push changes (presumably gated on --enable-git
    # inside shared — confirm). Use args.quarter, not the module-level
    # QUARTER: when --quarter overrides the default, parse_arguments
    # updates PATHS but QUARTER stays stale and would mislabel the commit.
    args = shared.git_add_and_commit(
        args,
        PATHS["repo"],
        PATHS["data_quarter"],
        f"Add and commit new GitHub data for {args.quarter}",
    )
    shared.git_push_changes(args, PATHS["repo"])
78193

79194

80195
if __name__ == "__main__":

0 commit comments

Comments
 (0)