8 changes: 4 additions & 4 deletions dev/create_gcs_query_plan.py

@@ -116,7 +116,7 @@ def sort_tools(url):

 def get_tool_urls():
     LOGGER.info("Loading CC Legal Tool paths and adding prefix")
-    file_path = os.path.join(PATHS["data"], "legal-tool-paths.txt")
+    file_path = shared.path_join(PATHS["data"], "legal-tool-paths.txt")
     prefix = "//creativecommons.org/"
     tool_urls = []
     with open(file_path, "r") as file_obj:
@@ -128,14 +128,14 @@ def get_tool_urls():


 def load_countries():
-    file_path = os.path.join(PATHS["data"], "gcs_country_collection.yaml")
+    file_path = shared.path_join(PATHS["data"], "gcs_country_collection.yaml")
     with open(file_path, "r") as file_obj:
         countries = yaml.safe_load(file_obj)
     return countries


 def load_languages():
-    file_path = os.path.join(PATHS["data"], "gcs_language_collection.yaml")
+    file_path = shared.path_join(PATHS["data"], "gcs_language_collection.yaml")
     with open(file_path, "r") as file_obj:
         languages = yaml.safe_load(file_obj)
     return languages
@@ -202,7 +202,7 @@ def create_query_plan(tool_urls, countries, languages):

 def save_plan(plan):
     LOGGER.info("Saving Google query plan to CSV")
-    file_path = os.path.join(PATHS["data"], "gcs_query_plan.csv")
+    file_path = shared.path_join(PATHS["data"], "gcs_query_plan.csv")
     fieldnames = [
         "TOOL_URL",
         "TOOL_IDENTIFIER",
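Every hunk in this file (and in the two files that follow) makes the same mechanical swap: `os.path.join` becomes `shared.path_join`. The helper's definition is not shown in this diff; judging from the code it replaces in `scripts/shared.py` below, it is presumably a thin canonicalizing wrapper along these lines (a sketch, not the repository's confirmed implementation):

```python
import os


def path_join(*paths):
    # Hypothetical sketch: join the segments, resolve symlinks, and return
    # an absolute, normalized path so callers always get a canonical result.
    return os.path.abspath(os.path.realpath(os.path.join(*paths)))
```

Centralizing this in one helper means every data file is addressed the same way regardless of the current working directory.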
4 changes: 2 additions & 2 deletions dev/prioritize_tools.py

@@ -39,7 +39,7 @@

 def get_tool_urls():
     LOGGER.info("Loading CC Legal Tool paths and adding prefix")
-    file_path = os.path.join(PATHS["data"], "legal-tool-paths.txt")
+    file_path = shared.path_join(PATHS["data"], "legal-tool-paths.txt")
     prefix = "//creativecommons.org/"
     tool_urls = []
     with open(file_path, "r") as file_obj:
@@ -110,7 +110,7 @@ def sort_tools(path):

 def save_tools_list(tool_urls):
     LOGGER.info("Saving prioritized CC Legal Tool URLs")
-    file_path = os.path.join(PATHS["data"], "prioritized-tool-urls.txt")
+    file_path = shared.path_join(PATHS["data"], "prioritized-tool-urls.txt")
     tool_urls.append("")  # ensure file has end of file newline
     with open(file_path, "w") as file_obj:
         file_obj.writelines("\n".join(tool_urls))
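`save_tools_list` guarantees the file ends with a newline by appending an empty string before the join: `"\n".join` only inserts separators between elements, so the empty final element becomes exactly one trailing newline. A quick check of the idiom (illustrative value):

```python
# "\n".join inserts separators between elements, so a trailing empty
# string yields exactly one end-of-file newline.
tool_urls = ["//creativecommons.org/licenses/by/4.0/", ""]
assert "\n".join(tool_urls) == "//creativecommons.org/licenses/by/4.0/\n"
```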
2 changes: 1 addition & 1 deletion scripts/1-fetch/gcs_fetch.py

@@ -143,7 +143,7 @@ def get_last_completed_plan_index():

 def load_plan():
     plan = []
-    file_path = os.path.join(PATHS["data"], "gcs_query_plan.csv")
+    file_path = shared.path_join(PATHS["data"], "gcs_query_plan.csv")
     with open(file_path, "r", newline="") as file_obj:
         plan = list(csv.DictReader(file_obj, dialect="unix"))
     return plan
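`load_plan` reads back the query plan that `dev/create_gcs_query_plan.py` writes, so each plan entry arrives as a dict keyed by the CSV header. A minimal round-trip of that read path (the header is abbreviated to two of the plan's columns, and the values are illustrative):

```python
import csv
import io

# Two columns from the plan header are shown; the real file has more.
csv_text = (
    '"TOOL_URL","TOOL_IDENTIFIER"\n'
    '"//creativecommons.org/licenses/by/4.0/","CC BY 4.0"\n'
)
plan = list(csv.DictReader(io.StringIO(csv_text), dialect="unix"))
assert plan[0]["TOOL_IDENTIFIER"] == "CC BY 4.0"
```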
232 changes: 221 additions & 11 deletions scripts/2-process/gcs_process.py

@@ -4,11 +4,18 @@
 for analysis and comparison between quarters.
 """
 # Standard library
+import argparse
 import csv
 import os
 import sys
+import textwrap
 import traceback

-# import pandas as pd
+# Third-party
+import pandas as pd
+from pygments import highlight
+from pygments.formatters import TerminalFormatter
+from pygments.lexers import PythonTracebackLexer

 # Add parent directory so shared can be imported
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
@@ -19,6 +26,36 @@
 # Setup
 LOGGER, PATHS = shared.setup(__file__)

+# Constants
+FILE1_COUNT = shared.path_join(PATHS["data_1-fetch"], "gcs_1_count.csv")
+FILE2_LANGUAGE = shared.path_join(
+    PATHS["data_1-fetch"], "gcs_2_count_by_language.csv"
+)
+FILE3_COUNTRY = shared.path_join(
+    PATHS["data_1-fetch"], "gcs_3_count_by_country.csv"
+)
+QUARTER = os.path.basename(PATHS["data_quarter"])
+
+
+def parse_arguments():
+    """
+    Parse command-line options, returns parsed argument namespace.
+    """
+    LOGGER.info("Parsing command-line options")
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--enable-save",
+        action="store_true",
+        help="Enable saving results",
+    )
+    parser.add_argument(
+        "--enable-git",
+        action="store_true",
+        help="Enable git actions (fetch, merge, add, commit, and push)",
+    )
+    return parser.parse_args()
+
+
 # def load_quarter_data(quarter):
 #     """
 #     Load data for a specific quarter.
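The two new flags make side effects opt-in: nothing is written to disk without `--enable-save`, and no git operations run without `--enable-git`. Their standalone behavior, for reference (a hypothetical demo; `parse_arguments()` itself reads `sys.argv`):

```python
import argparse

parser = argparse.ArgumentParser(description="flag demo")
parser.add_argument("--enable-save", action="store_true")
parser.add_argument("--enable-git", action="store_true")

args = parser.parse_args(["--enable-save"])
assert args.enable_save is True
assert args.enable_git is False  # git actions stay off unless requested
```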
@@ -157,19 +194,184 @@
 #     return parser.parse_args()


-def main():
-    raise shared.QuantifyingException("No current code for Phase 2", 0)
+def data_to_csv(args, data, file_path):
+    if not args.enable_save:
+        return
+    os.makedirs(PATHS["data_phase"], exist_ok=True)
+    # emulate csv.unix_dialect
+    data.to_csv(
+        file_path, index=False, quoting=csv.QUOTE_ALL, lineterminator="\n"
+    )
+
+
+def process_top_25_tools(args, count_data):
+    LOGGER.info("Processing top 25 tools")
+    data = count_data.sort_values("COUNT", ascending=False)
+    data.reset_index(drop=True, inplace=True)
+    data = data.iloc[:25]
+    data.rename(
+        columns={"TOOL_IDENTIFIER": "CC legal tool", "COUNT": "Count"},
+        inplace=True,
+    )
+    file_path = shared.path_join(PATHS["data_phase"], "gcs_top_25_tools.csv")
+    data_to_csv(args, data, file_path)
+
+
+def process_totals_by_product(args, count_data):
+    LOGGER.info("Processing totals by product")
+    data = {
+        "Licenses version 4.0": 0,
+        "Licenses version 3.0": 0,
+        "Licenses version 2.x": 0,
+        "Licenses version 1.0": 0,
+        "CC0 1.0": 0,
+        "Public Domain Mark 1.0": 0,
+        "Certification 1.0 US": 0,
+    }
+    for row in count_data.itertuples(index=False):
+        tool = row[0]
+        count = row[1]
+        if tool.startswith("PDM"):
+            key = "Public Domain Mark 1.0"
+        elif "CC0" in tool:
+            key = "CC0 1.0"
+        elif "PUBLICDOMAIN" in tool:
+            key = "Certification 1.0 US"
+        elif "4.0" in tool:
+            key = "Licenses version 4.0"
+        elif "3.0" in tool:
+            key = "Licenses version 3.0"
+        elif "2." in tool:
+            key = "Licenses version 2.x"
+        elif "1.0" in tool:
+            key = "Licenses version 1.0"
+        else:
+            raise shared.QuantifyingException("Invalid TOOL_IDENTIFIER")
+        data[key] += count
+
+    data = pd.DataFrame(
+        data.items(), columns=["CC legal tool product", "Count"]
+    )
+    file_path = shared.path_join(
+        PATHS["data_phase"], "gcs_totals_by_product.csv"
+    )
+    data_to_csv(args, data, file_path)
+
+
+def process_totals_by_unit(args, count_data):
+    LOGGER.info("Processing totals by unit")
+    data = {}
+    for row in count_data.itertuples(index=False):
+        tool = row[0]
+        count = row[1]
+        if tool.startswith("PDM"):
+            key = "mark"
+        elif "CC0" in tool:
+            key = "cc0"
+        elif "PUBLICDOMAIN" in tool:
+            key = "certification"
+        else:
+            parts = tool.split()
+            key = parts[1].lower()
+            if key == "by-nd-nc":
+                key = "by-nc-nd"
+        if key not in data.keys():
+            data[key] = count
+        else:
+            data[key] += count
+
+    data = pd.DataFrame(data.items(), columns=["Legal Tool Unit", "Count"])
+    data.sort_values("Count", ascending=False, inplace=True)
+    data.reset_index(drop=True, inplace=True)
+    file_path = shared.path_join(PATHS["data_phase"], "gcs_totals_by_unit.csv")
+    data_to_csv(args, data, file_path)
+
+
+def process_totals_by_free_cultural(args, count_data):
+    LOGGER.info("Processing totals by Approved for Free Cultural Works")
+    data = {
+        "Approved for Free Cultural Works": 0,
+        "Limited uses": 0,
+    }
+    for row in count_data.itertuples(index=False):
+        tool = row[0]
+        count = row[1]
+        if tool.startswith("PDM") or "CC0" in tool or "PUBLICDOMAIN" in tool:
+            key = "Approved for Free Cultural Works"
+        else:
+            parts = tool.split()
+            unit = parts[1].lower()
+            if unit in ["by-sa", "by", "sa", "sampling+"]:
+                key = "Approved for Free Cultural Works"
+            else:
+                key = "Limited uses"
+        data[key] += count
+
+    data = pd.DataFrame(data.items(), columns=["Category", "Count"])
+    data.sort_values("Count", ascending=False, inplace=True)
+    data.reset_index(drop=True, inplace=True)
+    file_path = shared.path_join(
+        PATHS["data_phase"], "gcs_totals_by_free_cultural.csv"
+    )
+    data_to_csv(args, data, file_path)
+
+
+def process_totals_by_restrictions(args, count_data):
+    LOGGER.info("Processing totals by restriction")
+    data = {"level 0": 0, "level 1": 0, "level 2": 0, "level 3": 0}
+    for row in count_data.itertuples(index=False):
+        tool = row[0]
+        count = row[1]
+        if tool.startswith("PDM") or "CC0" in tool or "PUBLICDOMAIN" in tool:
+            key = "level 0"
+        else:
+            parts = tool.split()
+            unit = parts[1].lower()
+            if unit in ["by-sa", "by", "sa", "sampling+"]:
+                key = "level 1"
+            elif unit in ["by-nc", "by-nc-sa", "sampling", "nc", "nc-sa"]:
+                key = "level 2"
+            else:
+                key = "level 3"
+        data[key] += count
+
+    data = pd.DataFrame(data.items(), columns=["Category", "Count"])
+    file_path = shared.path_join(
+        PATHS["data_phase"], "gcs_totals_by_restrictions.csv"
+    )
+    data_to_csv(args, data, file_path)
+
+
-# # Fetch and merge changes
-# shared.fetch_and_merge(PATHS["repo"])
+def main():
+    args = parse_arguments()
+    shared.log_paths(LOGGER, PATHS)
+    shared.git_fetch_and_merge(args, PATHS["repo"])
+
+    # Count data
+    count_data = pd.read_csv(FILE1_COUNT, usecols=["TOOL_IDENTIFIER", "COUNT"])
+    process_top_25_tools(args, count_data)
+    process_totals_by_product(args, count_data)
+    process_totals_by_unit(args, count_data)
+    process_totals_by_free_cultural(args, count_data)
+    process_totals_by_restrictions(args, count_data)
+
+    # # Language data
+    # language_data = pd.read_csv(
+    #     FILE2_LANGUAGE, usecols=["TOOL_IDENTIFIER", "LANGUAGE", "COUNT"]
+    # )
+
-# # Add and commit changes
-# shared.add_and_commit(
-#     PATHS["repo"], PATHS["data_quarter"], "Fetched and updated new data"
-# )
+    # # Country data
+    # country_data = pd.read_csv(
+    #     FILE3_COUNTRY, usecols=["TOOL_IDENTIFIER", "COUNTRY", "COUNT"]
+    # )
+
-# # Push changes
-# shared.push_changes(PATHS["repo"])
+    args = shared.git_add_and_commit(
+        args,
+        PATHS["repo"],
+        PATHS["data_quarter"],
+        f"Add and commit new Google Custom Search (GCS) data for {QUARTER}",
+    )
+    shared.git_push_changes(args, PATHS["repo"])


 if __name__ == "__main__":
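Two details in this hunk are worth a note. First, `data_to_csv` emulates `csv.unix_dialect` with explicit `to_csv` keywords (quote every field, `\n` line endings); under that assumption the stdlib and pandas outputs should match byte for byte:

```python
import csv
import io

import pandas as pd

# Illustrative frame; the real data comes from gcs_1_count.csv.
data = pd.DataFrame({"TOOL_IDENTIFIER": ["CC BY 4.0"], "COUNT": [42]})

buffer_pandas = io.StringIO()
data.to_csv(
    buffer_pandas, index=False, quoting=csv.QUOTE_ALL, lineterminator="\n"
)

buffer_stdlib = io.StringIO()
writer = csv.writer(buffer_stdlib, dialect="unix")
writer.writerow(["TOOL_IDENTIFIER", "COUNT"])
writer.writerow(["CC BY 4.0", 42])

assert buffer_pandas.getvalue() == buffer_stdlib.getvalue()
```

Second, the unit bucketing appears to assume identifiers shaped like `CC BY-NC-ND 4.0`, so `tool.split()[1].lower()` yields `by-nc-nd`; the extra `by-nd-nc` check then folds a legacy ordering of the same unit into the canonical spelling.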
@@ -188,5 +390,13 @@ def main():
         LOGGER.info("(130) Halted via KeyboardInterrupt.")
         sys.exit(130)
     except Exception:
-        LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}")
+        traceback_formatted = textwrap.indent(
+            highlight(
+                traceback.format_exc(),
+                PythonTracebackLexer(),
+                TerminalFormatter(),
+            ),
+            "    ",
+        )
+        LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}")
         sys.exit(1)
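The new top-level handler swaps `LOGGER.exception` for a colorized, indented traceback logged at critical level. A minimal standalone sketch of the same pygments pipeline:

```python
import textwrap
import traceback

from pygments import highlight
from pygments.formatters import TerminalFormatter
from pygments.lexers import PythonTracebackLexer

try:
    1 / 0
except ZeroDivisionError:
    # Colorize the traceback with ANSI escapes, then indent it so it
    # stands apart from surrounding log output.
    formatted = textwrap.indent(
        highlight(
            traceback.format_exc(),
            PythonTracebackLexer(),
            TerminalFormatter(),
        ),
        "    ",
    )
    print(f"Unhandled exception:\n{formatted}")
```

One trade-off worth noting: the ANSI escape codes end up in the log record, which is ideal for terminals but noisy if the same logs are also written to a file.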
26 changes: 15 additions & 11 deletions scripts/shared.py

@@ -31,20 +31,19 @@ def setup(current_file):

     # Paths
     paths = {}
-    paths["repo"] = os.path.dirname(
-        os.path.abspath(os.path.realpath(os.path.join(__file__, "..")))
-    )
-    paths["dotenv"] = os.path.join(paths["repo"], ".env")
+    paths["repo"] = os.path.dirname(path_join(__file__, ".."))
+    paths["dotenv"] = path_join(paths["repo"], ".env")
     paths["data"] = os.path.dirname(
         os.path.abspath(os.path.realpath(current_file))
     )
-    phase = os.path.basename(
+    current_phase = os.path.basename(
         os.path.dirname(os.path.abspath(os.path.realpath(current_file)))
     )
-    paths["data"] = os.path.join(paths["repo"], "data")
-    data_quarter = os.path.join(paths["data"], f"{quarter}")
-    paths["state"] = os.path.join(data_quarter, "state.yaml")
-    paths["data_phase"] = os.path.join(data_quarter, phase)
+    paths["data"] = path_join(paths["repo"], "data")
+    data_quarter = path_join(paths["data"], f"{quarter}")
+    for phase in ["1-fetch", "2-process", "3-report"]:
+        paths[f"data_{phase}"] = path_join(data_quarter, phase)
+    paths["data_phase"] = path_join(data_quarter, current_phase)

     paths["data_quarter"] = data_quarter
Expand All @@ -53,9 +52,14 @@ def setup(current_file):

def log_paths(logger, paths):
paths_list = []
repo_path = paths["repo"]
for label, path in paths.items():
label = f"{label}:"
paths_list.append(f"\n{' ' * 12}{label:<11} {path}")
if label == "repo:":
paths_list.append(f"\n{' ' * 4}{label} {path}")
else:
path_new = path.replace(repo_path, ".")
paths_list.append(f"\n{' ' * 8}{label:<15} {path_new}")
paths_list = "".join(paths_list)
logger.info(f"PATHS:{paths_list}")
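`log_paths` now prints the repo root once in full and every other path relative to it, with wider label padding. Roughly what one entry should look like (label and path invented for illustration):

```python
# {label:<15} pads the label to 15 characters so the paths line up.
label = "data_quarter:"
path_new = "./data/2024Q4"
print(f"{' ' * 8}{label:<15} {path_new}")
# prints: "        data_quarter:   ./data/2024Q4"
```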

@@ -137,7 +141,7 @@ def update_readme(
     """
     Update the README.md file with the generated images and descriptions.
     """
-    readme_path = os.path.join(paths["data"], args.quarter, "README.md")
+    readme_path = path_join(paths["data"], args.quarter, "README.md")

     # Define section markers for each data source
     section_marker_start = f"<!-- {data_source} Start -->"