Skip to content

Commit 253c577

Browse files
authored
Merge pull request #213 from oree-xx/Github/processing
Improve GitHub processing and reporting
2 parents ddb3958 + 12f8799 commit 253c577

File tree

2 files changed

+305
-108
lines changed

2 files changed

+305
-108
lines changed

scripts/2-process/github_process.py

Lines changed: 128 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,18 @@
11
#!/usr/bin/env python
22
"""
3-
This file is dedicated to processing Github data
3+
This file is dedicated to processing GitHub data
44
for analysis and comparison between quarters.
55
"""
66
# Standard library
7+
import argparse
8+
import csv
79
import os
810
import sys
911
import traceback
1012

13+
# Third-party
1114
# import pandas as pd
15+
import pandas as pd
1216

1317
# Add parent directory so shared can be imported
1418
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
@@ -19,6 +23,112 @@
1923
# Setup
2024
LOGGER, PATHS = shared.setup(__file__)
2125

26+
# Constants
27+
QUARTER = os.path.basename(PATHS["data_quarter"])
28+
29+
30+
def parse_arguments():
    """
    Build the command-line parser, parse sys.argv, and return the
    argument namespace augmented with the shared logger and paths.
    """
    global PATHS
    LOGGER.info("Parsing command-line options")
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--quarter",
        default=QUARTER,
        help=f"Data quarter in format YYYYQx (default: {QUARTER})",
    )
    parser.add_argument(
        "--enable-save",
        action="store_true",
        help="Enable saving results (default: False)",
    )
    parser.add_argument(
        "--enable-git",
        action="store_true",
        help="Enable git actions such as fetch, merge, add, commit, and push"
        " (default: False)",
    )
    args = parser.parse_args()
    # Git actions only make sense when results are actually being saved
    if args.enable_git and not args.enable_save:
        parser.error("--enable-git requires --enable-save")
    # Re-point the shared paths when a non-default quarter was requested
    if args.quarter != QUARTER:
        PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, args.quarter)
    args.logger = LOGGER
    args.paths = PATHS
    return args
61+
62+
63+
def data_to_csv(args, data, file_path):
    """
    Write a DataFrame to file_path as a unix-dialect CSV.

    No-op unless saving was enabled via --enable-save. The directory
    containing file_path is created if it does not yet exist.
    """
    if not args.enable_save:
        return
    # Create the directory the file actually goes into, rather than a
    # hard-coded PATHS["data_phase"]: the original created the phase
    # directory even when file_path pointed somewhere else, and coupled
    # this helper to a module-level global for no benefit.
    os.makedirs(os.path.dirname(file_path) or ".", exist_ok=True)
    # emulate csv.unix_dialect: quote every field, LF line terminator
    data.to_csv(
        file_path, index=False, quoting=csv.QUOTE_ALL, lineterminator="\n"
    )
71+
72+
73+
def process_totals_by_license(args, count_data):
    """
    Processing count data: totals by License
    """
    LOGGER.info(process_totals_by_license.__doc__.strip())
    # Normalize every row up front (tool name as str, count as int),
    # then drop the aggregate "Total public repositories" row.
    pairs = [
        (str(row.TOOL_IDENTIFIER), int(row.COUNT))
        for row in count_data.itertuples(index=False)
    ]
    counts = {
        tool: count
        for tool, count in pairs
        if tool != "Total public repositories"
    }
    # Building the frame from pre-sorted pairs yields a clean 0..n index,
    # already ordered by license name.
    frame = pd.DataFrame(sorted(counts.items()), columns=["License", "Count"])
    file_path = shared.path_join(
        PATHS["data_phase"], "github_totals_by_license.csv"
    )
    data_to_csv(args, frame, file_path)
96+
97+
98+
def process_totals_by_restriction(args, count_data):
    """
    Processing count data: totals by restriction
    """
    # Categories per https://creativecommons.org/public-domain/freeworks/
    LOGGER.info(process_totals_by_restriction.__doc__.strip())
    # Lookup table replaces the original if/elif membership chain; tools
    # absent from the table (including the aggregate total row) are skipped.
    category_by_tool = {
        "BSD Zero Clause License": "Public domain",
        "CC0 1.0": "Public domain",
        "Unlicense": "Public domain",
        "MIT No Attribution": "Permissive",
        "CC BY 4.0": "Permissive",
        "CC BY-SA 4.0": "Copyleft",
    }
    totals = {"Copyleft": 0, "Permissive": 0, "Public domain": 0}
    for row in count_data.itertuples(index=False):
        tool = str(row.TOOL_IDENTIFIER)
        count = int(row.COUNT)
        category = category_by_tool.get(tool)
        if category is not None:
            totals[category] += count
    frame = pd.DataFrame(sorted(totals.items()), columns=["Category", "Count"])
    file_path = shared.path_join(
        PATHS["data_phase"], "github_totals_by_restriction.csv"
    )
    data_to_csv(args, frame, file_path)
130+
131+
22132
# def load_quarter_data(quarter):
23133
# """
24134
# Load data for a specific quarter.
@@ -63,18 +173,23 @@
63173

64174

65175
def main():
    """
    Process fetched GitHub count data into per-license and
    per-restriction totals, then commit the results.
    """
    args = parse_arguments()
    shared.paths_log(LOGGER, PATHS)
    shared.git_fetch_and_merge(args, PATHS["repo"])

    # Load the fetch-phase count data produced by the phase-1 script
    file_count = shared.path_join(PATHS["data_1-fetch"], "github_1_count.csv")
    count_data = pd.read_csv(file_count, usecols=["TOOL_IDENTIFIER", "COUNT"])
    process_totals_by_license(args, count_data)
    process_totals_by_restriction(args, count_data)

    # Add, commit, and push changes (presumably gated on --enable-git
    # inside shared — confirm). Use args.quarter, not the module-level
    # QUARTER: when --quarter overrides the default, parse_arguments
    # updates PATHS but QUARTER stays stale and would mislabel the commit.
    args = shared.git_add_and_commit(
        args,
        PATHS["repo"],
        PATHS["data_quarter"],
        f"Add and commit new GitHub data for {args.quarter}",
    )
    shared.git_push_changes(args, PATHS["repo"])
78193

79194

80195
if __name__ == "__main__":

0 commit comments

Comments
 (0)