Commit 1e01660

Merge pull request #147 from creativecommons/initial-process

Initial data processing of fetched GCS count data

2 parents: 9674b08 + f535132
File tree: 5 files changed (+243, −29 lines)

dev/create_gcs_query_plan.py

Lines changed: 4 additions & 4 deletions
@@ -116,7 +116,7 @@ def sort_tools(url):
 
 def get_tool_urls():
     LOGGER.info("Loading CC Legal Tool paths and adding prefix")
-    file_path = os.path.join(PATHS["data"], "legal-tool-paths.txt")
+    file_path = shared.path_join(PATHS["data"], "legal-tool-paths.txt")
     prefix = "//creativecommons.org/"
     tool_urls = []
     with open(file_path, "r") as file_obj:
@@ -128,14 +128,14 @@ def get_tool_urls():
 
 
 def load_countries():
-    file_path = os.path.join(PATHS["data"], "gcs_country_collection.yaml")
+    file_path = shared.path_join(PATHS["data"], "gcs_country_collection.yaml")
     with open(file_path, "r") as file_obj:
         countries = yaml.safe_load(file_obj)
     return countries
 
 
 def load_languages():
-    file_path = os.path.join(PATHS["data"], "gcs_language_collection.yaml")
+    file_path = shared.path_join(PATHS["data"], "gcs_language_collection.yaml")
    with open(file_path, "r") as file_obj:
        languages = yaml.safe_load(file_obj)
    return languages
@@ -202,7 +202,7 @@ def create_query_plan(tool_urls, countries, languages):
 
 def save_plan(plan):
     LOGGER.info("Saving Google query plan to CSV")
-    file_path = os.path.join(PATHS["data"], "gcs_query_plan.csv")
+    file_path = shared.path_join(PATHS["data"], "gcs_query_plan.csv")
     fieldnames = [
         "TOOL_URL",
         "TOOL_IDENTIFIER",

dev/prioritize_tools.py

Lines changed: 2 additions & 2 deletions
@@ -39,7 +39,7 @@
 
 def get_tool_urls():
     LOGGER.info("Loading CC Legal Tool paths and adding prefix")
-    file_path = os.path.join(PATHS["data"], "legal-tool-paths.txt")
+    file_path = shared.path_join(PATHS["data"], "legal-tool-paths.txt")
     prefix = "//creativecommons.org/"
     tool_urls = []
     with open(file_path, "r") as file_obj:
@@ -110,7 +110,7 @@ def sort_tools(path):
 
 def save_tools_list(tool_urls):
     LOGGER.info("Saving prioritized CC Legal Tool URLs")
-    file_path = os.path.join(PATHS["data"], "prioritized-tool-urls.txt")
+    file_path = shared.path_join(PATHS["data"], "prioritized-tool-urls.txt")
     tool_urls.append("")  # ensure file has end of file newline
     with open(file_path, "w") as file_obj:
         file_obj.writelines("\n".join(tool_urls))

scripts/1-fetch/gcs_fetch.py

Lines changed: 1 addition & 1 deletion
@@ -143,7 +143,7 @@ def get_last_completed_plan_index():
 
 def load_plan():
     plan = []
-    file_path = os.path.join(PATHS["data"], "gcs_query_plan.csv")
+    file_path = shared.path_join(PATHS["data"], "gcs_query_plan.csv")
     with open(file_path, "r", newline="") as file_obj:
         plan = list(csv.DictReader(file_obj, dialect="unix"))
     return plan

scripts/2-process/gcs_process.py

Lines changed: 221 additions & 11 deletions
@@ -4,11 +4,18 @@
 for analysis and comparison between quarters.
 """
 # Standard library
+import argparse
+import csv
 import os
 import sys
+import textwrap
 import traceback
 
-# import pandas as pd
+# Third-party
+import pandas as pd
+from pygments import highlight
+from pygments.formatters import TerminalFormatter
+from pygments.lexers import PythonTracebackLexer
 
 # Add parent directory so shared can be imported
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
@@ -19,6 +26,36 @@
 # Setup
 LOGGER, PATHS = shared.setup(__file__)
 
+# Constants
+FILE1_COUNT = shared.path_join(PATHS["data_1-fetch"], "gcs_1_count.csv")
+FILE2_LANGUAGE = shared.path_join(
+    PATHS["data_1-fetch"], "gcs_2_count_by_language.csv"
+)
+FILE3_COUNTRY = shared.path_join(
+    PATHS["data_1-fetch"], "gcs_3_count_by_country.csv"
+)
+QUARTER = os.path.basename(PATHS["data_quarter"])
+
+
+def parse_arguments():
+    """
+    Parse command-line options, returns parsed argument namespace.
+    """
+    LOGGER.info("Parsing command-line options")
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--enable-save",
+        action="store_true",
+        help="Enable saving results",
+    )
+    parser.add_argument(
+        "--enable-git",
+        action="store_true",
+        help="Enable git actions (fetch, merge, add, commit, and push)",
+    )
+    return parser.parse_args()
+
+
 # def load_quarter_data(quarter):
 #     """
 #     Load data for a specific quarter.
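
Given these two opt-in flags, the script presumably defaults to a dry run. A hypothetical set of invocations from the repository root (the internals of the shared.git_* helpers are not shown in this diff):

```python
# Hypothetical invocations of this script (shell commands shown as
# comments, since the surrounding document is Python):
#
#   python scripts/2-process/gcs_process.py
#       dry run: data_to_csv(), added below, returns early without
#       writing anything, and the git helpers presumably no-op
#
#   python scripts/2-process/gcs_process.py --enable-save
#       writes the processed CSV files under data/<quarter>/2-process/
#
#   python scripts/2-process/gcs_process.py --enable-save --enable-git
#       additionally fetches/merges, commits, and pushes the new data
```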
@@ -157,19 +194,184 @@
 #     return parser.parse_args()
 
 
-def main():
-    raise shared.QuantifyingException("No current code for Phase 2", 0)
+def data_to_csv(args, data, file_path):
+    if not args.enable_save:
+        return
+    os.makedirs(PATHS["data_phase"], exist_ok=True)
+    # emulate csv.unix_dialect
+    data.to_csv(
+        file_path, index=False, quoting=csv.QUOTE_ALL, lineterminator="\n"
+    )
+
+
+def process_top_25_tools(args, count_data):
+    LOGGER.info("Processing top 25 tools")
+    data = count_data.sort_values("COUNT", ascending=False)
+    data.reset_index(drop=True, inplace=True)
+    data = data.iloc[:25]
+    data.rename(
+        columns={"TOOL_IDENTIFIER": "CC legal tool", "COUNT": "Count"},
+        inplace=True,
+    )
+    file_path = shared.path_join(PATHS["data_phase"], "gcs_top_25_tools.csv")
+    data_to_csv(args, data, file_path)
+
+
+def process_totals_by_product(args, count_data):
+    LOGGER.info("Processing totals by product")
+    data = {
+        "Licenses version 4.0": 0,
+        "Licenses version 3.0": 0,
+        "Licenses version 2.x": 0,
+        "Licenses version 1.0": 0,
+        "CC0 1.0": 0,
+        "Public Domain Mark 1.0": 0,
+        "Certification 1.0 US": 0,
+    }
+    for row in count_data.itertuples(index=False):
+        tool = row[0]
+        count = row[1]
+        if tool.startswith("PDM"):
+            key = "Public Domain Mark 1.0"
+        elif "CC0" in tool:
+            key = "CC0 1.0"
+        elif "PUBLICDOMAIN" in tool:
+            key = "Certification 1.0 US"
+        elif "4.0" in tool:
+            key = "Licenses version 4.0"
+        elif "3.0" in tool:
+            key = "Licenses version 3.0"
+        elif "2." in tool:
+            key = "Licenses version 2.x"
+        elif "1.0" in tool:
+            key = "Licenses version 1.0"
+        else:
+            raise shared.QuantifyingException("Invalid TOOL_IDENTIFIER")
+        data[key] += count
+
+    data = pd.DataFrame(
+        data.items(), columns=["CC legal tool product", "Count"]
+    )
+    file_path = shared.path_join(
+        PATHS["data_phase"], "gcs_totals_by_product.csv"
+    )
+    data_to_csv(args, data, file_path)
+
+
+def process_totals_by_unit(args, count_data):
+    LOGGER.info("Processing totals by unit")
+    data = {}
+    for row in count_data.itertuples(index=False):
+        tool = row[0]
+        count = row[1]
+        if tool.startswith("PDM"):
+            key = "mark"
+        elif "CC0" in tool:
+            key = "cc0"
+        elif "PUBLICDOMAIN" in tool:
+            key = "certification"
+        else:
+            parts = tool.split()
+            key = parts[1].lower()
+            if key == "by-nd-nc":
+                key = "by-nc-nd"
+        if key not in data.keys():
+            data[key] = count
+        else:
+            data[key] += count
+
+    data = pd.DataFrame(data.items(), columns=["Legal Tool Unit", "Count"])
+    data.sort_values("Count", ascending=False, inplace=True)
+    data.reset_index(drop=True, inplace=True)
+    file_path = shared.path_join(PATHS["data_phase"], "gcs_totals_by_unit.csv")
+    data_to_csv(args, data, file_path)
+
+
+def process_totals_by_free_cultural(args, count_data):
+    LOGGER.info("Processing totals by Approved for Free Cultural Works")
+    data = {
+        "Approved for Free Cultural Works": 0,
+        "Limited uses": 0,
+    }
+    for row in count_data.itertuples(index=False):
+        tool = row[0]
+        count = row[1]
+        if tool.startswith("PDM") or "CC0" in tool or "PUBLICDOMAIN" in tool:
+            key = "Approved for Free Cultural Works"
+        else:
+            parts = tool.split()
+            unit = parts[1].lower()
+            if unit in ["by-sa", "by", "sa", "sampling+"]:
+                key = "Approved for Free Cultural Works"
+            else:
+                key = "Limited uses"
+        data[key] += count
+
+    data = pd.DataFrame(data.items(), columns=["Category", "Count"])
+    data.sort_values("Count", ascending=False, inplace=True)
+    data.reset_index(drop=True, inplace=True)
+    file_path = shared.path_join(
+        PATHS["data_phase"], "gcs_totals_by_free_cultural.csv"
+    )
+    data_to_csv(args, data, file_path)
+
+
+def process_totals_by_restrictions(args, count_data):
+    LOGGER.info("Processing totals by restriction")
+    data = {"level 0": 0, "level 1": 0, "level 2": 0, "level 3": 0}
+    for row in count_data.itertuples(index=False):
+        tool = row[0]
+        count = row[1]
+        if tool.startswith("PDM") or "CC0" in tool or "PUBLICDOMAIN" in tool:
+            key = "level 0"
+        else:
+            parts = tool.split()
+            unit = parts[1].lower()
+            if unit in ["by-sa", "by", "sa", "sampling+"]:
+                key = "level 1"
+            elif unit in ["by-nc", "by-nc-sa", "sampling", "nc", "nc-sa"]:
+                key = "level 2"
+            else:
+                key = "level 3"
+        data[key] += count
+
+    data = pd.DataFrame(data.items(), columns=["Category", "Count"])
+    file_path = shared.path_join(
+        PATHS["data_phase"], "gcs_totals_by_restrictions.csv"
+    )
+    data_to_csv(args, data, file_path)
 
-    # # Fetch and merge changes
-    # shared.fetch_and_merge(PATHS["repo"])
+def main():
+    args = parse_arguments()
+    shared.log_paths(LOGGER, PATHS)
+    shared.git_fetch_and_merge(args, PATHS["repo"])
+
+    # Count data
+    count_data = pd.read_csv(FILE1_COUNT, usecols=["TOOL_IDENTIFIER", "COUNT"])
+    process_top_25_tools(args, count_data)
+    process_totals_by_product(args, count_data)
+    process_totals_by_unit(args, count_data)
+    process_totals_by_free_cultural(args, count_data)
+    process_totals_by_restrictions(args, count_data)
+
+    # # Language data
+    # language_data = pd.read_csv(
+    #     FILE2_LANGUAGE, usecols=["TOOL_IDENTIFIER", "LANGUAGE", "COUNT"]
+    # )
 
-    # # Add and commit changes
-    # shared.add_and_commit(
-    #     PATHS["repo"], PATHS["data_quarter"], "Fetched and updated new data"
+    # # Country data
+    # country_data = pd.read_csv(
+    #     FILE3_COUNTRY, usecols=["TOOL_IDENTIFIER", "COUNTRY", "COUNT"]
     # )
 
-    # # Push changes
-    # shared.push_changes(PATHS["repo"])
+    args = shared.git_add_and_commit(
+        args,
+        PATHS["repo"],
+        PATHS["data_quarter"],
+        f"Add and commit new Google Custom Search (GCS) data for {QUARTER}",
+    )
+    shared.git_push_changes(args, PATHS["repo"])
 
 
 if __name__ == "__main__":
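
The `# emulate csv.unix_dialect` comment in data_to_csv() is worth unpacking: the stdlib unix dialect quotes every field and terminates rows with "\n", and the to_csv() keywords reproduce exactly that. A small self-contained check of the equivalence (the column values here are invented for illustration):

```python
import csv
import io

import pandas as pd

df = pd.DataFrame({"TOOL_IDENTIFIER": ["CC BY 4.0"], "COUNT": [420000]})

# pandas, with the same keyword arguments used by data_to_csv()
buffer_pandas = io.StringIO()
df.to_csv(
    buffer_pandas, index=False, quoting=csv.QUOTE_ALL, lineterminator="\n"
)

# standard library, using the unix dialect directly
buffer_stdlib = io.StringIO()
writer = csv.writer(buffer_stdlib, dialect="unix")
writer.writerow(df.columns)
writer.writerows(df.itertuples(index=False))

# both produce: "TOOL_IDENTIFIER","COUNT"\n"CC BY 4.0","420000"\n
assert buffer_pandas.getvalue() == buffer_stdlib.getvalue()
```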
@@ -188,5 +390,13 @@ def main():
         LOGGER.info("(130) Halted via KeyboardInterrupt.")
         sys.exit(130)
     except Exception:
-        LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}")
+        traceback_formatted = textwrap.indent(
+            highlight(
+                traceback.format_exc(),
+                PythonTracebackLexer(),
+                TerminalFormatter(),
+            ),
+            "    ",
+        )
+        LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}")
         sys.exit(1)
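
The new handler colorizes the traceback with ANSI escapes and indents it before logging at CRITICAL level. A minimal standalone sketch of the same pygments pattern (using print in place of the project's LOGGER):

```python
import textwrap
import traceback

from pygments import highlight
from pygments.formatters import TerminalFormatter
from pygments.lexers import PythonTracebackLexer

try:
    1 / 0  # provoke an exception
except Exception:
    # colorize the traceback for the terminal, then indent every line
    formatted = textwrap.indent(
        highlight(
            traceback.format_exc(),
            PythonTracebackLexer(),
            TerminalFormatter(),
        ),
        "    ",
    )
    print(f"(1) Unhandled exception:\n{formatted}")
```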

scripts/shared.py

Lines changed: 15 additions & 11 deletions
@@ -31,20 +31,19 @@ def setup(current_file):
 
     # Paths
     paths = {}
-    paths["repo"] = os.path.dirname(
-        os.path.abspath(os.path.realpath(os.path.join(__file__, "..")))
-    )
-    paths["dotenv"] = os.path.join(paths["repo"], ".env")
+    paths["repo"] = os.path.dirname(path_join(__file__, ".."))
+    paths["dotenv"] = path_join(paths["repo"], ".env")
     paths["data"] = os.path.dirname(
         os.path.abspath(os.path.realpath(current_file))
     )
-    phase = os.path.basename(
+    current_phase = os.path.basename(
         os.path.dirname(os.path.abspath(os.path.realpath(current_file)))
     )
-    paths["data"] = os.path.join(paths["repo"], "data")
-    data_quarter = os.path.join(paths["data"], f"{quarter}")
-    paths["state"] = os.path.join(data_quarter, "state.yaml")
-    paths["data_phase"] = os.path.join(data_quarter, phase)
+    paths["data"] = path_join(paths["repo"], "data")
+    data_quarter = path_join(paths["data"], f"{quarter}")
+    for phase in ["1-fetch", "2-process", "3-report"]:
+        paths[f"data_{phase}"] = path_join(data_quarter, phase)
+    paths["data_phase"] = path_join(data_quarter, current_phase)
 
     paths["data_quarter"] = data_quarter

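Two behavioral notes on this hunk: the paths["state"] entry (state.yaml) is no longer registered, and the new loop registers every phase's data directory up front, which is what lets scripts/2-process/gcs_process.py reference PATHS["data_1-fetch"] above. For a script under scripts/2-process/ and a hypothetical quarter of 2024Q4, the resulting dict would look roughly like:

```python
# Illustrative only: the repo location and quarter are invented; the
# "data" key is first set to the caller's directory and then
# reassigned to the repo-level data directory, as in setup() above
paths = {
    "repo": "/home/user/quantifying",
    "dotenv": "/home/user/quantifying/.env",
    "data": "/home/user/quantifying/data",
    "data_1-fetch": "/home/user/quantifying/data/2024Q4/1-fetch",
    "data_2-process": "/home/user/quantifying/data/2024Q4/2-process",
    "data_3-report": "/home/user/quantifying/data/2024Q4/3-report",
    "data_phase": "/home/user/quantifying/data/2024Q4/2-process",
    "data_quarter": "/home/user/quantifying/data/2024Q4",
}
```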
@@ -53,9 +52,14 @@ def setup(current_file):
 
 def log_paths(logger, paths):
     paths_list = []
+    repo_path = paths["repo"]
     for label, path in paths.items():
         label = f"{label}:"
-        paths_list.append(f"\n{' ' * 12}{label:<11} {path}")
+        if label == "repo:":
+            paths_list.append(f"\n{' ' * 4}{label} {path}")
+        else:
+            path_new = path.replace(repo_path, ".")
+            paths_list.append(f"\n{' ' * 8}{label:<15} {path_new}")
     paths_list = "".join(paths_list)
     logger.info(f"PATHS:{paths_list}")
 
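
With this branch, log_paths() prints the repo path once in full and every other entry relative to it. Continuing the hypothetical paths from the sketch above, the message portion of the log record would render as:

```
PATHS:
    repo: /home/user/quantifying
        dotenv:         ./.env
        data:           ./data
        data_1-fetch:   ./data/2024Q4/1-fetch
        data_2-process: ./data/2024Q4/2-process
        data_3-report:  ./data/2024Q4/3-report
        data_phase:     ./data/2024Q4/2-process
        data_quarter:   ./data/2024Q4
```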

@@ -137,7 +141,7 @@ def update_readme(
     """
     Update the README.md file with the generated images and descriptions.
     """
-    readme_path = os.path.join(paths["data"], args.quarter, "README.md")
+    readme_path = path_join(paths["data"], args.quarter, "README.md")
 
     # Define section markers for each data source
     section_marker_start = f"<!-- {data_source} Start -->"
