diff --git a/scripts/1-fetch/arxiv_fetch.py b/scripts/1-fetch/arxiv_fetch.py new file mode 100755 index 00000000..78a29d35 --- /dev/null +++ b/scripts/1-fetch/arxiv_fetch.py @@ -0,0 +1,428 @@ +#!/usr/bin/env python +""" +Fetch ArXiv papers with CC license information and generate count reports. +""" +# Standard library +import argparse +import csv +import os +import sys +import textwrap +import time +import traceback +import urllib.parse +from collections import defaultdict + +# Third-party +import feedparser +import requests +from pygments import highlight +from pygments.formatters import TerminalFormatter +from pygments.lexers import PythonTracebackLexer +from requests.adapters import HTTPAdapter +from urllib3.util.retry import Retry + +# Add parent directory so shared can be imported +sys.path.append(os.path.join(os.path.dirname(__file__), "..")) + +# First-party/Local +import shared # noqa: E402 + +# Setup +LOGGER, PATHS = shared.setup(__file__) + +# Constants +BASE_URL = "http://export.arxiv.org/api/query?" 
# Output data files, one per breakdown (all within the 1-fetch data dir)
FILE_ARXIV_COUNT = shared.path_join(PATHS["data_1-fetch"], "arxiv_1_count.csv")
FILE_ARXIV_CATEGORY = shared.path_join(
    PATHS["data_1-fetch"], "arxiv_2_count_by_category.csv"
)
FILE_ARXIV_YEAR = shared.path_join(
    PATHS["data_1-fetch"], "arxiv_3_count_by_year.csv"
)
FILE_ARXIV_AUTHOR = shared.path_join(
    PATHS["data_1-fetch"], "arxiv_4_count_by_author_count.csv"
)

# CSV column headers for each output file
HEADER_COUNT = ["TOOL_IDENTIFIER", "COUNT"]
HEADER_CATEGORY = ["TOOL_IDENTIFIER", "CATEGORY", "COUNT"]
HEADER_YEAR = ["TOOL_IDENTIFIER", "YEAR", "COUNT"]
HEADER_AUTHOR = ["TOOL_IDENTIFIER", "AUTHOR_COUNT", "COUNT"]

QUARTER = os.path.basename(PATHS["data_quarter"])

# Log the start of the script execution here
LOGGER.info("Script execution started.")


def parse_arguments():
    """Parse command-line options, returns parsed argument namespace."""
    LOGGER.info("Parsing command-line options")
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--limit",
        type=int,
        default=800,
        help="Limit number of papers to fetch (default: 800)",
    )
    parser.add_argument(
        "--enable-save",
        action="store_true",
        help="Enable saving results",
    )
    parser.add_argument(
        "--enable-git",
        action="store_true",
        help="Enable git actions (fetch, merge, add, commit, and push)",
    )
    args = parser.parse_args()
    # Git actions operate on saved results, so saving must be enabled too
    if not args.enable_save and args.enable_git:
        parser.error("--enable-git requires --enable-save")
    return args


def initialize_data_file(file_path, headers):
    """Initialize CSV file with headers if it doesn't exist."""
    if not os.path.isfile(file_path):
        with open(file_path, "w", newline="") as file_obj:
            writer = csv.DictWriter(
                file_obj, fieldnames=headers, dialect="unix"
            )
            writer.writeheader()


def initialize_all_data_files(args):
    """Initialize all data files (no-op unless saving is enabled)."""
    if not args.enable_save:
        return

    os.makedirs(PATHS["data_1-fetch"], exist_ok=True)
    initialize_data_file(FILE_ARXIV_COUNT, HEADER_COUNT)
    initialize_data_file(FILE_ARXIV_CATEGORY, HEADER_CATEGORY)
    initialize_data_file(FILE_ARXIV_YEAR, HEADER_YEAR)
    initialize_data_file(FILE_ARXIV_AUTHOR, HEADER_AUTHOR)


# (needle, normalized identifier) pairs, ordered most-specific first so
# that e.g. "CC BY-NC-SA" is not misclassified as "CC BY-NC" or "CC BY".
_CC_LICENSE_PATTERNS = (
    ("CC BY-NC-ND", "CC BY-NC-ND"),
    ("CC BY-NC-SA", "CC BY-NC-SA"),
    ("CC BY-ND", "CC BY-ND"),
    ("CC BY-SA", "CC BY-SA"),
    ("CC BY-NC", "CC BY-NC"),
    ("CC BY", "CC BY"),
    ("CREATIVE COMMONS", "Creative Commons"),
)


def _classify_cc_license(text):
    """Return the normalized CC license identifier found in text.

    Matching is case-insensitive. Returns "Unknown" when no CC license
    marker is present.
    """
    text = text.upper()
    # CC0 is checked first, matching either "CC0" or "CC 0" spellings
    if "CC0" in text or "CC 0" in text:
        return "CC0"
    for needle, identifier in _CC_LICENSE_PATTERNS:
        if needle in text:
            return identifier
    return "Unknown"


def extract_license_info(entry):
    """Extract CC license information from ArXiv entry.

    The rights field is checked first; if it yields nothing, the
    summary/abstract text is scanned as a fallback.
    """
    license_info = "Unknown"

    # Check for license in rights field
    if getattr(entry, "rights", None):
        license_info = _classify_cc_license(entry.rights)

    # Check for license in summary/abstract
    if license_info == "Unknown" and hasattr(entry, "summary"):
        license_info = _classify_cc_license(entry.summary)

    return license_info


def extract_category_from_entry(entry):
    """Extract primary category from ArXiv entry."""
    primary = getattr(entry, "arxiv_primary_category", None)
    if primary:
        return primary.get("term", "Unknown")
    if getattr(entry, "tags", None):
        # Fall back to the first tag that carries a category term
        for tag in entry.tags:
            if hasattr(tag, "term"):
                return tag.term
    return "Unknown"


def extract_year_from_entry(entry):
    """Extract publication year from ArXiv entry.

    Returns "Unknown" when the published date is missing or malformed.
    """
    if hasattr(entry, "published"):
        try:
            # Dates are ISO-formatted strings ("YYYY-..."), so the first
            # four characters are the year.
            return entry.published[:4]
        except (AttributeError, TypeError):
            # TypeError covers a non-sliceable value (e.g. None); string
            # slicing itself never raises IndexError, so catching that
            # (as before) was dead code.
            pass
    return "Unknown"


def extract_author_count_from_entry(entry):
    """Extract number of authors from ArXiv entry (as a string)."""
    if hasattr(entry, "authors"):
        return str(len(entry.authors))
    if hasattr(entry, "author"):
        # Only a single-author field is present
        return "1"
    return "Unknown"


def get_requests_session():
    """Create requests session with retry logic for transient failures."""
    retry_strategy = Retry(
        total=5,
        backoff_factor=3,
        status_forcelist=[408, 429, 500, 502, 503, 504],
    )
    session = requests.Session()
    session.mount("http://", HTTPAdapter(max_retries=retry_strategy))
    session.mount("https://", HTTPAdapter(max_retries=retry_strategy))
    return session


def query_arxiv(args):
    """Query ArXiv API for papers with potential CC licenses.

    Pages through a set of CC-related search queries, classifies each
    returned entry's license, and accumulates counts by license,
    category, year, and author count. Saves CSVs when --enable-save.
    """
    LOGGER.info("Beginning to fetch results from ArXiv API")

    session = get_requests_session()
    results_per_iteration = 50

    search_queries = [
        'all:"creative commons"',
        'all:"CC BY"',
        'all:"CC BY-NC"',
        'all:"CC BY-SA"',
        'all:"CC BY-ND"',
        'all:"CC BY-NC-SA"',
        'all:"CC BY-NC-ND"',
        'all:"CC0"',
    ]

    # Data structures for counting: license -> count, and
    # license -> {category/year/author_count -> count}
    license_counts = defaultdict(int)
    category_counts = defaultdict(lambda: defaultdict(int))
    year_counts = defaultdict(lambda: defaultdict(int))
    author_counts = defaultdict(lambda: defaultdict(int))

    total_fetched = 0

    for search_query in search_queries:
        if total_fetched >= args.limit:
            break

        LOGGER.info(f"Searching for: {search_query}")
        consecutive_empty_calls = 0

        for start in range(
            0, min(args.limit - total_fetched, 500), results_per_iteration
        ):
            # Stop paging as soon as the overall limit is reached: the
            # range() bound was computed before this query started, so
            # without this check we would keep issuing API calls (and
            # sleeping 3s each) after the limit was hit mid-page.
            if total_fetched >= args.limit:
                break

            encoded_query = urllib.parse.quote_plus(search_query)
            query = (
                f"search_query={encoded_query}&start={start}"
                f"&max_results={results_per_iteration}"
            )

            papers_found_in_batch = 0

            try:
                LOGGER.info(
                    f"Fetching results {start} - "
                    f"{start + results_per_iteration}"
                )
                response = session.get(BASE_URL + query, timeout=30)
                response.raise_for_status()
                feed = feedparser.parse(response.content)

                for entry in feed.entries:
                    if total_fetched >= args.limit:
                        break

                    license_info = extract_license_info(entry)
                    if license_info == "Unknown":
                        continue

                    category = extract_category_from_entry(entry)
                    year = extract_year_from_entry(entry)
                    author_count = extract_author_count_from_entry(entry)

                    # Count by license, then by each secondary dimension
                    license_counts[license_info] += 1
                    category_counts[license_info][category] += 1
                    year_counts[license_info][year] += 1
                    author_counts[license_info][author_count] += 1

                    total_fetched += 1
                    papers_found_in_batch += 1

                    LOGGER.info(
                        f"Found CC licensed paper: {license_info} - "
                        f"{category} - {year}"
                    )

                # ArXiv recommends 3-second delay between calls
                time.sleep(3)

            except requests.RequestException as e:
                LOGGER.error(f"Request failed: {e}")
                break

            if papers_found_in_batch == 0:
                consecutive_empty_calls += 1
                if consecutive_empty_calls >= 2:
                    LOGGER.info(
                        f"No new papers found in 2 consecutive calls for "
                        f"query: {search_query}. Moving to next query."
                    )
                    break
            else:
                consecutive_empty_calls = 0

    # Save results
    if args.enable_save:
        save_count_data(
            license_counts, category_counts, year_counts, author_counts
        )

    LOGGER.info(f"Total CC licensed papers fetched: {total_fetched}")


def _write_counts_csv(file_path, headers, rows):
    """(Re)write a CSV file with the given headers and dict rows."""
    with open(file_path, "w", newline="") as file_obj:
        writer = csv.DictWriter(
            file_obj, fieldnames=headers, dialect="unix"
        )
        writer.writeheader()
        writer.writerows(rows)


def save_count_data(
    license_counts, category_counts, year_counts, author_counts
):
    """Save count data to CSV files (one file per breakdown)."""
    # Save license counts
    _write_counts_csv(
        FILE_ARXIV_COUNT,
        HEADER_COUNT,
        [
            {"TOOL_IDENTIFIER": license_type, "COUNT": count}
            for license_type, count in license_counts.items()
        ],
    )

    # Save category counts
    _write_counts_csv(
        FILE_ARXIV_CATEGORY,
        HEADER_CATEGORY,
        [
            {
                "TOOL_IDENTIFIER": license_type,
                "CATEGORY": category,
                "COUNT": count,
            }
            for license_type, categories in category_counts.items()
            for category, count in categories.items()
        ],
    )

    # Save year counts
    _write_counts_csv(
        FILE_ARXIV_YEAR,
        HEADER_YEAR,
        [
            {
                "TOOL_IDENTIFIER": license_type,
                "YEAR": year,
                "COUNT": count,
            }
            for license_type, years in year_counts.items()
            for year, count in years.items()
        ],
    )

    # Save author count data
    _write_counts_csv(
        FILE_ARXIV_AUTHOR,
        HEADER_AUTHOR,
        [
            {
                "TOOL_IDENTIFIER": license_type,
                "AUTHOR_COUNT": author_count,
                "COUNT": count,
            }
            for license_type, counts_by_authors in author_counts.items()
            for author_count, count in counts_by_authors.items()
        ],
    )


def main():
    """Main function."""
    args = parse_arguments()
shared.paths_log(LOGGER, PATHS) + shared.git_fetch_and_merge(args, PATHS["repo"]) + initialize_all_data_files(args) + query_arxiv(args) + args = shared.git_add_and_commit( + args, + PATHS["repo"], + PATHS["data_quarter"], + f"Add and commit new ArXiv CC license data for {QUARTER}", + ) + shared.git_push_changes(args, PATHS["repo"]) + + +if __name__ == "__main__": + try: + main() + except shared.QuantifyingException as e: + if e.exit_code == 0: + LOGGER.info(e.message) + else: + LOGGER.error(e.message) + sys.exit(e.exit_code) + except SystemExit as e: + if e.code != 0: + LOGGER.error(f"System exit with code: {e.code}") + sys.exit(e.code) + except KeyboardInterrupt: + LOGGER.info("(130) Halted via KeyboardInterrupt.") + sys.exit(130) + except Exception: + traceback_formatted = textwrap.indent( + highlight( + traceback.format_exc(), + PythonTracebackLexer(), + TerminalFormatter(), + ), + " ", + ) + LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}") + sys.exit(1) diff --git a/scripts/3-report/gcs_report.py b/scripts/3-report/gcs_report.py deleted file mode 100755 index 105313fa..00000000 --- a/scripts/3-report/gcs_report.py +++ /dev/null @@ -1,529 +0,0 @@ -#!/usr/bin/env python -""" -This file is dedicated to visualizing and analyzing the data collected -from Google Custom Search (GCS). 
-""" -# Standard library -import argparse -import os -import sys -import textwrap -import traceback - -# Third-party -import pandas as pd -from pygments import highlight -from pygments.formatters import TerminalFormatter -from pygments.lexers import PythonTracebackLexer - -# Add parent directory so shared can be imported -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) - -# First-party/Local -import plot # noqa: E402 -import shared # noqa: E402 - -# Setup -LOGGER, PATHS = shared.setup(__file__) - -# Constants -QUARTER = os.path.basename(PATHS["data_quarter"]) -SECTION = "Google Custom Search (GCS)" - - -def parse_arguments(): - """ - Parses command-line arguments, returns parsed arguments. - """ - LOGGER.info("Parsing command-line arguments") - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument( - "--quarter", - default=QUARTER, - help=f"Data quarter in format YYYYQx (default: {QUARTER})", - ) - parser.add_argument( - "--show-plots", - action="store_true", - help="Show generated plots (default: False)", - ) - parser.add_argument( - "--enable-save", - action="store_true", - help="Enable saving results (default: False)", - ) - parser.add_argument( - "--enable-git", - action="store_true", - help="Enable git actions such as fetch, merge, add, commit, and push" - " (default: False)", - ) - args = parser.parse_args() - if not args.enable_save and args.enable_git: - parser.error("--enable-git requires --enable-save") - if args.quarter != QUARTER: - global PATHS - PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, args.quarter) - args.logger = LOGGER - args.paths = PATHS - return args - - -def gcs_intro(args): - """ - Write Google Custom Search (GCS) introduction. 
- """ - LOGGER.info(gcs_intro.__doc__.strip()) - file_path = shared.path_join( - PATHS["data_2-process"], - "gcs_product_totals.csv", - ) - LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") - name_label = "CC legal tool product" - data = pd.read_csv(file_path, index_col=name_label) - total_count = f"{data['Count'].sum():,d}" - shared.update_readme( - args, - SECTION, - "Overview", - None, - None, - "Google Custom Search (GCS) data uses the `totalResults` returned by" - " API for search queries of the legal tool URLs (quoted and using" - " `linkSite` for accuracy), countries codes, and language codes.\n" - "\n" - f"**The results indicate there are a total of {total_count} online" - " works in the commons--documents that are licensed or put in the" - " public domain using a Creative Commons (CC) legal tool.**\n" - "\n" - "Thank you Google for providing the Programable Search Engine: Custom" - " Search JSON API!\n", - ) - - -def plot_products(args): - """ - Create plots for CC legal tool product totals and percentages - """ - LOGGER.info(plot_products.__doc__.strip()) - file_path = shared.path_join( - PATHS["data_2-process"], "gcs_product_totals.csv" - ) - LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") - name_label = "CC legal tool product" - data = pd.read_csv(file_path, index_col=name_label) - data = data[::-1] # reverse order - - title = "Products totals and percentages" - plt = plot.combined_plot( - args=args, - data=data, - title=title, - name_label=name_label, - data_label="Count", - bar_xscale="log", - bar_ylabel=name_label, - ) - - image_path = shared.path_join( - PATHS["data_phase"], "gcs_product_totals.png" - ) - LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") - - if args.enable_save: - # Create the directory if it does not exist - os.makedirs(PATHS["data_phase"], exist_ok=True) - plt.savefig(image_path) - - shared.update_readme( - args, - SECTION, - title, - image_path, - "Plots showing Creative 
Commons (CC) legal tool product totals and" - " percentages.", - ) - - -def plot_tool_status(args): - """ - Create plots for the CC legal tool status totals and percentages - """ - LOGGER.info(plot_tool_status.__doc__.strip()) - file_path = shared.path_join( - PATHS["data_2-process"], - "gcs_status_combined_totals.csv", - ) - LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") - name_label = "CC legal tool" - data = pd.read_csv(file_path, index_col=name_label) - data.sort_values(name_label, ascending=False, inplace=True) - - title = "CC legal tools status" - plt = plot.combined_plot( - args=args, - data=data, - title=title, - name_label=name_label, - data_label="Count", - bar_xscale="log", - bar_ylabel="CC legal tool status", - ) - - image_path = shared.path_join(PATHS["data_phase"], "gcs_tool_status.png") - LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") - - if args.enable_save: - # Create the directory if it does not exist - os.makedirs(PATHS["data_phase"], exist_ok=True) - plt.savefig(image_path) - - shared.update_readme( - args, - SECTION, - title, - image_path, - "Plots showing Creative Commons (CC) legal tool status totals and" - " percentages.", - ) - - -def plot_latest_tools(args): - """ - Create plots for latest CC legal tool totals and percentages - """ - LOGGER.info(plot_latest_tools.__doc__.strip()) - file_path = shared.path_join( - PATHS["data_2-process"], - "gcs_status_latest_totals.csv", - ) - LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") - name_label = "CC legal tool" - data = pd.read_csv(file_path, index_col=name_label) - data.sort_values(name_label, ascending=False, inplace=True) - - title = "Latest CC legal tools" - plt = plot.combined_plot( - args=args, - data=data, - title=title, - name_label=name_label, - data_label="Count", - ) - - image_path = shared.path_join( - PATHS["data_phase"], "gcs_status_latest_tools.png" - ) - LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], 
'.')}") - - if args.enable_save: - # Create the directory if it does not exist - os.makedirs(PATHS["data_phase"], exist_ok=True) - plt.savefig(image_path) - - shared.update_readme( - args, - SECTION, - title, - image_path, - "Plots showing latest Creative Commons (CC) legal tool totals and" - " percentages.", - ) - - -def plot_prior_tools(args): - """ - Create plots for prior CC legal tool totals and percentages - """ - LOGGER.info(plot_prior_tools.__doc__.strip()) - file_path = shared.path_join( - PATHS["data_2-process"], "gcs_status_prior_totals.csv" - ) - LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") - name_label = "CC legal tool" - data = pd.read_csv(file_path, index_col=name_label) - data.sort_values(name_label, ascending=False, inplace=True) - - title = "Prior CC legal tools" - plt = plot.combined_plot( - args=args, - data=data, - title=title, - name_label=name_label, - data_label="Count", - ) - - image_path = shared.path_join( - PATHS["data_phase"], "gcs_status_prior_tools.png" - ) - LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") - - if args.enable_save: - # Create the directory if it does not exist - os.makedirs(PATHS["data_phase"], exist_ok=True) - plt.savefig(image_path) - - shared.update_readme( - args, - SECTION, - title, - image_path, - "Plots showing prior Creative Commons (CC) legal tool totals and" - " percentages.", - "The unit names have been normalized (~~`CC BY-ND-NC`~~ =>" - " `CC BY-NC-ND`).", - ) - - -def plot_retired_tools(args): - """ - Create plots for retired CC legal tool totals and percentages - """ - LOGGER.info(plot_retired_tools.__doc__.strip()) - file_path = shared.path_join( - PATHS["data_2-process"], - "gcs_status_retired_totals.csv", - ) - LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") - name_label = "CC legal tool" - data = pd.read_csv(file_path, index_col=name_label) - data.sort_values(name_label, ascending=False, inplace=True) - - title = "Retired CC legal tools" - 
plt = plot.combined_plot( - args=args, - data=data, - title=title, - name_label=name_label, - data_label="Count", - bar_xscale="log", - ) - - image_path = shared.path_join( - PATHS["data_phase"], "gcs_status_retired_tools.png" - ) - LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") - - if args.enable_save: - # Create the directory if it does not exist - os.makedirs(PATHS["data_phase"], exist_ok=True) - plt.savefig(image_path) - - shared.update_readme( - args, - SECTION, - title, - image_path, - "Plots showing retired Creative Commons (CC) legal tools total and" - " percentages.", - "For more information on retired legal tools, see [Retired Legal Tools" - " - Creative Commons](https://creativecommons.org/retiredlicenses/).", - ) - - -def plot_countries_highest_usage(args): - """ - Create plots for the countries with highest usage of latest tools - """ - LOGGER.info(plot_countries_highest_usage.__doc__.strip()) - file_path = shared.path_join( - PATHS["data_2-process"], "gcs_totals_by_country.csv" - ) - LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") - name_label = "Country" - data_label = "Count" - data = pd.read_csv(file_path, index_col=name_label) - total_count = f"{data['Count'].sum():,d}" - data.sort_values(data_label, ascending=False, inplace=True) - data = data[:10] # limit to highest 10 - data = data[::-1] # reverse order - - title = "Countries with highest usage of latest tools" - plt = plot.combined_plot( - args=args, - data=data, - title=title, - name_label=name_label, - data_label=data_label, - bar_xscale="log", - ) - - image_path = shared.path_join( - PATHS["data_phase"], "gcs_countries_highest_usage_latest_tools.png" - ) - LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") - - if args.enable_save: - # Create the directory if it does not exist - os.makedirs(PATHS["data_phase"], exist_ok=True) - plt.savefig(image_path) - - shared.update_readme( - args, - SECTION, - title, - image_path, - "Plots 
showing countries with the highest useage of the latest" - " Creative Commons (CC) legal tools.", - "The latest tools include Licenses version 4.0 (CC BY 4.0, CC BY-NC" - " 4.0, CC BY-NC-ND 4.0, CC BY-NC-SA 4.0, CC-BY-ND 4.0, CC BY-SA 4.0)," - " CC0 1.0, and the Public Domain Mark (PDM 1.0).\n" - "\n" - f"The complete data set indicates there are a total of {total_count}" - " online works using a latest CC legal tool.", - ) - - -def plot_languages_highest_usage(args): - """ - Create plots for the languages with highest usage of latest tools - """ - LOGGER.info(plot_languages_highest_usage.__doc__.strip()) - file_path = shared.path_join( - PATHS["data_2-process"], "gcs_totals_by_language.csv" - ) - LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") - name_label = "Language" - data_label = "Count" - data = pd.read_csv(file_path, index_col=name_label) - total_count = f"{data['Count'].sum():,d}" - data.sort_values(data_label, ascending=False, inplace=True) - data = data[:10] # limit to highest 10 - data = data[::-1] # reverse order - - title = "Languages with highest usage of latest tools" - plt = plot.combined_plot( - args=args, - data=data, - title=title, - name_label=name_label, - data_label=data_label, - bar_xscale="log", - ) - - image_path = shared.path_join( - PATHS["data_phase"], "gcs_languages_highest_usage_latest_tools.png" - ) - LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") - - if args.enable_save: - # Create the directory if it does not exist - os.makedirs(PATHS["data_phase"], exist_ok=True) - plt.savefig(image_path) - - shared.update_readme( - args, - SECTION, - title, - image_path, - "Plots showing languages with the highest useage of the latest" - " Creative Commons (CC) legal tools.", - "The latest tools include Licenses version 4.0 (CC BY 4.0, CC BY-NC" - " 4.0, CC BY-NC-ND 4.0, CC BY-NC-SA 4.0, CC-BY-ND 4.0, CC BY-SA 4.0)," - " CC0 1.0, and the Public Domain Mark (PDM 1.0).\n" - "\n" - f"The complete data set 
indicates there are a total of {total_count}" - " online works using a latest CC legal tool.", - ) - - -def plot_free_culture(args): - """ - Create plots for the languages with highest usage of latest tools - """ - LOGGER.info(plot_free_culture.__doc__.strip()) - file_path = shared.path_join( - PATHS["data_2-process"], - "gcs_totals_by_free_cultural.csv", - ) - LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") - name_label = "Category" - data_label = "Count" - data = pd.read_csv(file_path, index_col=name_label) - - title = "Approved for Free Cultural Works" - plt = plot.combined_plot( - args=args, - data=data, - title=title, - name_label=name_label, - data_label=data_label, - ) - - image_path = shared.path_join(PATHS["data_phase"], "gcs_free_culture.png") - LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") - - if args.enable_save: - # Create the directory if it does not exist - os.makedirs(PATHS["data_phase"], exist_ok=True) - plt.savefig(image_path) - - shared.update_readme( - args, - SECTION, - title, - image_path, - "Plots showing Approved for Free Cultural Works legal tool usage.", - "[Understanding Free Cultural Works - Creative" - " Commons](https://creativecommons.org/public-domain/freeworks/):\n" - "\n" - '> Using [the Freedom Defined definition of a "Free Cultural Work"],' - " material licensed under CC BY or BY-SA is a free cultural work. (So" - " is anything in the worldwide public domain marked with CC0 or the" - " Public Domain Mark.) 
CC’s other licenses– BY-NC, BY-ND, BY-NC-SA," - " and BY-NC-ND–only allow more limited uses, and material under these" - " licenses is not considered a free cultural work.", - ) - - -def main(): - args = parse_arguments() - shared.paths_log(LOGGER, PATHS) - shared.git_fetch_and_merge(args, PATHS["repo"]) - - gcs_intro(args) - plot_products(args) - plot_tool_status(args) - plot_latest_tools(args) - plot_prior_tools(args) - plot_retired_tools(args) - plot_countries_highest_usage(args) - plot_languages_highest_usage(args) - plot_free_culture(args) - - args = shared.git_add_and_commit( - args, - PATHS["repo"], - PATHS["data_quarter"], - f"Add and commit Google Custom Search (GCS) reports for {QUARTER}", - ) - shared.git_push_changes(args, PATHS["repo"]) - - -if __name__ == "__main__": - try: - main() - except shared.QuantifyingException as e: - if e.exit_code == 0: - LOGGER.info(e.message) - else: - LOGGER.error(e.message) - sys.exit(e.exit_code) - except SystemExit as e: - if e.code != 0: - LOGGER.error(f"System exit with code: {e.code}") - sys.exit(e.code) - except KeyboardInterrupt: - LOGGER.info("(130) Halted via KeyboardInterrupt.") - sys.exit(130) - except Exception: - traceback_formatted = textwrap.indent( - highlight( - traceback.format_exc(), - PythonTracebackLexer(), - TerminalFormatter(), - ), - " ", - ) - LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}") - sys.exit(1)