Skip to content

Commit ed7aa89

Browse files
committed
begin processing fetched data
1 parent 42b3d0d commit ed7aa89

File tree

1 file changed

+221
-11
lines changed

1 file changed

+221
-11
lines changed

scripts/2-process/gcs_process.py

Lines changed: 221 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,18 @@
44
for analysis and comparison between quarters.
55
"""
66
# Standard library
7+
import argparse
8+
import csv
79
import os
810
import sys
11+
import textwrap
912
import traceback
1013

11-
# import pandas as pd
14+
# Third-party
15+
import pandas as pd
16+
from pygments import highlight
17+
from pygments.formatters import TerminalFormatter
18+
from pygments.lexers import PythonTracebackLexer
1219

1320
# Add parent directory so shared can be imported
1421
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
@@ -19,6 +26,36 @@
1926
# Setup
2027
LOGGER, PATHS = shared.setup(__file__)
2128

29+
# Constants
# Input CSVs produced by the phase 1 fetch step (see scripts/1-fetch)
FILE1_COUNT = shared.path_join(PATHS["data_1-fetch"], "gcs_1_count.csv")
FILE2_LANGUAGE = shared.path_join(
    PATHS["data_1-fetch"], "gcs_2_count_by_language.csv"
)
FILE3_COUNTRY = shared.path_join(
    PATHS["data_1-fetch"], "gcs_3_count_by_country.csv"
)
# Quarter label derived from the basename of the quarter data directory
# (presumably something like "2024Q3" — confirm against shared.setup)
QUARTER = os.path.basename(PATHS["data_quarter"])
38+
39+
40+
def parse_arguments():
    """
    Parse command-line options, returns parsed argument namespace.
    """
    LOGGER.info("Parsing command-line options")
    parser = argparse.ArgumentParser(description=__doc__)
    # Both options are simple boolean flags; register them uniformly.
    flag_specs = (
        ("--enable-save", "Enable saving results"),
        (
            "--enable-git",
            "Enable git actions (fetch, merge, add, commit, and push)",
        ),
    )
    for flag, help_text in flag_specs:
        parser.add_argument(flag, action="store_true", help=help_text)
    return parser.parse_args()
57+
58+
2259
# def load_quarter_data(quarter):
2360
# """
2461
# Load data for a specific quarter.
@@ -157,19 +194,184 @@
157194
# return parser.parse_args()
158195

159196

160-
def main():
161-
raise shared.QuantifyingException("No current code for Phase 2", 0)
197+
def data_to_csv(args, data, file_path):
    """
    Write ``data`` to ``file_path`` as CSV when saving is enabled.

    Does nothing unless ``args.enable_save`` is set. Ensures the phase
    data directory exists before writing.
    """
    if args.enable_save:
        os.makedirs(PATHS["data_phase"], exist_ok=True)
        # emulate csv.unix_dialect
        data.to_csv(
            file_path,
            index=False,
            quoting=csv.QUOTE_ALL,
            lineterminator="\n",
        )
205+
206+
207+
def process_top_25_tools(args, count_data):
    """
    Rank legal tools by count and save the top 25 as CSV.

    Args:
        args: parsed command-line namespace (``enable_save`` gates writing).
        count_data: DataFrame with TOOL_IDENTIFIER and COUNT columns.
    """
    LOGGER.info("Processing top 25 tools")
    # In-place reset_index/rename on a frame sliced with iloc operates on a
    # derived object: it can emit SettingWithCopyWarning and is rejected
    # under pandas copy-on-write. Build the result by chaining instead,
    # which always yields fresh objects.
    data = (
        count_data.sort_values("COUNT", ascending=False)
        .head(25)
        .reset_index(drop=True)
        .rename(columns={"TOOL_IDENTIFIER": "CC legal tool", "COUNT": "Count"})
    )
    file_path = shared.path_join(PATHS["data_phase"], "gcs_top_25_tools.csv")
    data_to_csv(args, data, file_path)
218+
219+
220+
def process_totals_by_product(args, count_data):
    """
    Aggregate counts into CC legal tool product categories and save them.

    Args:
        args: parsed command-line namespace (``enable_save`` gates writing).
        count_data: DataFrame with TOOL_IDENTIFIER and COUNT columns.

    Raises:
        shared.QuantifyingException: if a TOOL_IDENTIFIER matches no known
            product category.
    """
    LOGGER.info("Processing totals by product")
    data = {
        "Licenses version 4.0": 0,
        "Licenses version 3.0": 0,
        "Licenses version 2.x": 0,
        "Licenses version 1.0": 0,
        "CC0 1.0": 0,
        "Public Domain Mark 1.0": 0,
        "Certification 1.0 US": 0,
    }
    for row in count_data.itertuples(index=False):
        # Named attribute access is robust to CSV column order; positional
        # row[0]/row[1] would silently swap tool and count if the columns
        # ever moved in the fetched file.
        tool = row.TOOL_IDENTIFIER
        count = row.COUNT
        # Order matters: public-domain tools are matched before version
        # substrings so e.g. "CC0 1.0" is not binned as "1.0" licenses.
        if tool.startswith("PDM"):
            key = "Public Domain Mark 1.0"
        elif "CC0" in tool:
            key = "CC0 1.0"
        elif "PUBLICDOMAIN" in tool:
            key = "Certification 1.0 US"
        elif "4.0" in tool:
            key = "Licenses version 4.0"
        elif "3.0" in tool:
            key = "Licenses version 3.0"
        elif "2." in tool:
            key = "Licenses version 2.x"
        elif "1.0" in tool:
            key = "Licenses version 1.0"
        else:
            # Include the offending identifier so the failure is actionable.
            # NOTE(review): another call site passes an exit code as a second
            # argument — confirm QuantifyingException's default.
            raise shared.QuantifyingException(
                f"Invalid TOOL_IDENTIFIER: {tool}"
            )
        data[key] += count

    data = pd.DataFrame(
        data.items(), columns=["CC legal tool product", "Count"]
    )
    file_path = shared.path_join(
        PATHS["data_phase"], "gcs_totals_by_product.csv"
    )
    data_to_csv(args, data, file_path)
259+
260+
261+
def process_totals_by_unit(args, count_data):
    """
    Aggregate counts by legal tool unit (by, by-sa, cc0, mark, ...) and save
    them sorted by descending count.

    Args:
        args: parsed command-line namespace (``enable_save`` gates writing).
        count_data: DataFrame with TOOL_IDENTIFIER and COUNT columns.
    """
    LOGGER.info("Processing totals by unit")
    data = {}
    for row in count_data.itertuples(index=False):
        # Named attribute access is robust to CSV column order.
        tool = row.TOOL_IDENTIFIER
        count = row.COUNT
        if tool.startswith("PDM"):
            key = "mark"
        elif "CC0" in tool:
            key = "cc0"
        elif "PUBLICDOMAIN" in tool:
            key = "certification"
        else:
            # License identifiers look like "CC <unit> <version> ..."; the
            # unit is the second whitespace-separated token.
            parts = tool.split()
            key = parts[1].lower()
            if key == "by-nd-nc":
                # Normalize the legacy unit ordering to the canonical name.
                key = "by-nc-nd"
        # dict.get replaces the membership-test-then-assign branches.
        data[key] = data.get(key, 0) + count

    data = pd.DataFrame(data.items(), columns=["Legal Tool Unit", "Count"])
    data.sort_values("Count", ascending=False, inplace=True)
    data.reset_index(drop=True, inplace=True)
    file_path = shared.path_join(PATHS["data_phase"], "gcs_totals_by_unit.csv")
    data_to_csv(args, data, file_path)
288+
289+
290+
def process_totals_by_free_cultural(args, count_data):
    """
    Tally counts into "Approved for Free Cultural Works" versus
    "Limited uses" and save the result sorted by descending count.
    """
    LOGGER.info("Processing totals by Approved for Free Cultural Works")
    approved = "Approved for Free Cultural Works"
    limited = "Limited uses"
    totals = {approved: 0, limited: 0}
    # Units whose licenses qualify as Free Cultural Works
    free_units = ["by-sa", "by", "sa", "sampling+"]
    for tool, count in count_data.itertuples(index=False):
        # Public-domain tools always qualify; otherwise the unit (second
        # token of the identifier) decides.
        if tool.startswith("PDM") or "CC0" in tool or "PUBLICDOMAIN" in tool:
            category = approved
        elif tool.split()[1].lower() in free_units:
            category = approved
        else:
            category = limited
        totals[category] += count

    data = pd.DataFrame(totals.items(), columns=["Category", "Count"])
    data.sort_values("Count", ascending=False, inplace=True)
    data.reset_index(drop=True, inplace=True)
    file_path = shared.path_join(
        PATHS["data_phase"], "gcs_totals_by_free_cultural.csv"
    )
    data_to_csv(args, data, file_path)
317+
318+
319+
def process_totals_by_restrictions(args, count_data):
    """
    Tally counts into restriction levels 0 (least restrictive) through 3
    (most restrictive) and save the result.
    """
    LOGGER.info("Processing totals by restriction")
    totals = {"level 0": 0, "level 1": 0, "level 2": 0, "level 3": 0}
    # Units grouped by restrictiveness; anything else falls to level 3.
    level_1_units = ("by-sa", "by", "sa", "sampling+")
    level_2_units = ("by-nc", "by-nc-sa", "sampling", "nc", "nc-sa")
    for record in count_data.itertuples(index=False):
        tool, count = record[0], record[1]
        if tool.startswith("PDM") or "CC0" in tool or "PUBLICDOMAIN" in tool:
            level = "level 0"
        else:
            unit = tool.split()[1].lower()
            if unit in level_1_units:
                level = "level 1"
            elif unit in level_2_units:
                level = "level 2"
            else:
                level = "level 3"
        totals[level] += count

    data = pd.DataFrame(totals.items(), columns=["Category", "Count"])
    file_path = shared.path_join(
        PATHS["data_phase"], "gcs_totals_by_restrictions.csv"
    )
    data_to_csv(args, data, file_path)
343+
162344

163-
# # Fetch and merge changes
164-
# shared.fetch_and_merge(PATHS["repo"])
345+
def main():
    """
    Process fetched Google Custom Search (GCS) count data into per-quarter
    CSV reports, then optionally commit and push the results.
    """
    args = parse_arguments()
    shared.log_paths(LOGGER, PATHS)
    # NOTE(review): presumably a no-op unless --enable-git is set —
    # confirm against shared.git_fetch_and_merge
    shared.git_fetch_and_merge(args, PATHS["repo"])

    # Count data
    count_data = pd.read_csv(FILE1_COUNT, usecols=["TOOL_IDENTIFIER", "COUNT"])
    process_top_25_tools(args, count_data)
    process_totals_by_product(args, count_data)
    process_totals_by_unit(args, count_data)
    process_totals_by_free_cultural(args, count_data)
    process_totals_by_restrictions(args, count_data)

    # # Language data (processing not yet implemented)
    # language_data = pd.read_csv(
    #     FILE2_LANGUAGE, usecols=["TOOL_IDENTIFIER", "LANGUAGE", "COUNT"]
    # )

    # # Country data (processing not yet implemented)
    # country_data = pd.read_csv(
    #     FILE3_COUNTRY, usecols=["TOOL_IDENTIFIER", "COUNTRY", "COUNT"]
    # )

    # NOTE(review): git_add_and_commit returns a (possibly updated) args
    # namespace that gates the subsequent push — confirm in shared
    args = shared.git_add_and_commit(
        args,
        PATHS["repo"],
        PATHS["data_quarter"],
        f"Add and commit new Google Custom Search (GCS) data for {QUARTER}",
    )
    shared.git_push_changes(args, PATHS["repo"])
173375

174376

175377
if __name__ == "__main__":
@@ -188,5 +390,13 @@ def main():
188390
LOGGER.info("(130) Halted via KeyboardInterrupt.")
189391
sys.exit(130)
190392
except Exception:
191-
LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}")
393+
traceback_formatted = textwrap.indent(
394+
highlight(
395+
traceback.format_exc(),
396+
PythonTracebackLexer(),
397+
TerminalFormatter(),
398+
),
399+
" ",
400+
)
401+
LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}")
192402
sys.exit(1)

0 commit comments

Comments
 (0)