#!/usr/bin/env python
"""
Fetch CC Legal Tool usage from the Museums Victoria Collections API.
"""

# Standard library
import argparse
import csv
import os
import re
import sys
import textwrap
import traceback
from collections import defaultdict

# Third-party
import requests
from pygments import highlight
from pygments.formatters import TerminalFormatter
from pygments.lexers import PythonTracebackLexer

# Add parent directory so shared can be imported
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))

# First-party/Local
import shared  # noqa: E402

# Setup
LOGGER, PATHS = shared.setup(__file__)

# Constants
BASE_URL = "https://collections.museumsvictoria.com.au/api/search"
FILE1_COUNT = shared.path_join(
    PATHS["data_phase"], "museums_victoria_1_count.csv"
)
FILE2_MEDIA = shared.path_join(
    PATHS["data_phase"], "museums_victoria_2_count_by_media.csv"
)
FILE3_RECORD = shared.path_join(
    PATHS["data_phase"], "museums_victoria_3_count_by_record.csv"
)
HEADER1_COUNT = ["TOOL IDENTIFIER", "COUNT"]
HEADER2_MEDIA = ["TOOL IDENTIFIER", "MEDIA TYPE", "COUNT"]
HEADER3_RECORD = ["TOOL IDENTIFIER", "RECORD TYPE", "COUNT"]
PER_PAGE = 100
QUARTER = os.path.basename(PATHS["data_quarter"])
RECORD_TYPES = [
    "article",
    "item",
    "species",
    "specimen",
]  # Types of records to return
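
# An example request as assembled in fetch_museums_victoria_data() below:
#   https://collections.museumsvictoria.com.au/api/search
#       ?envelope=true&page=1&perpage=100&recordtype=item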


def parse_arguments():
    """
    Parse command-line options and return the parsed argument namespace.
    """
    LOGGER.info("Parsing command-line options")
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--enable-save",
        action="store_true",
        help="Enable saving results",
    )
    parser.add_argument(
        "--enable-git",
        action="store_true",
        help="Enable git actions (fetch, merge, add, commit, and push)",
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Maximum number of records to fetch per record type",
    )
    args = parser.parse_args()
    if not args.enable_save and args.enable_git:
        parser.error("--enable-git requires --enable-save")
    return args


def initialize_data_file(file_path, header):
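    """Create (or overwrite) file_path and write the CSV header row."""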
    with open(file_path, "w", encoding="utf-8", newline="\n") as file_obj:
        writer = csv.DictWriter(file_obj, fieldnames=header, dialect="unix")
        writer.writeheader()


def initialize_all_data_files(args):
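    """Create the data directory and initialize the three CSV data files."""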
    if not args.enable_save:
        return

    # Create data directory for this phase
    os.makedirs(PATHS["data_phase"], exist_ok=True)

    initialize_data_file(FILE1_COUNT, HEADER1_COUNT)
    initialize_data_file(FILE2_MEDIA, HEADER2_MEDIA)
    initialize_data_file(FILE3_RECORD, HEADER3_RECORD)


def write_counts_to_csv(args, data: dict):
    """Append the collected counts to their respective CSV files."""
    if not args.enable_save:
        return
    for file_path, counts in data.items():
        if file_path == FILE2_MEDIA:
            fieldnames = HEADER2_MEDIA
            rows = [
                {
                    "TOOL IDENTIFIER": tool,
                    "MEDIA TYPE": media_type,
                    "COUNT": count,
                }
                for media_type, tool_counts in counts.items()
                for tool, count in tool_counts.items()
            ]
        elif file_path == FILE3_RECORD:
            fieldnames = HEADER3_RECORD
            rows = [
                {
                    "TOOL IDENTIFIER": tool,
                    "RECORD TYPE": record_type,
                    "COUNT": count,
                }
                for record_type, tool_counts in counts.items()
                for tool, count in tool_counts.items()
            ]
        else:
            fieldnames = HEADER1_COUNT
            rows = [
                {
                    "TOOL IDENTIFIER": tool,
                    "COUNT": count,
                }
                for tool, count in counts.items()
            ]
        with open(file_path, "a", encoding="utf-8", newline="\n") as file_obj:
            writer = csv.DictWriter(
                file_obj, fieldnames=fieldnames, dialect="unix"
            )
            writer.writerows(rows)
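

# Note: the "unix" CSV dialect quotes every field, so FILE1_COUNT ends up
# containing rows such as (values illustrative):
#   "TOOL IDENTIFIER","COUNT"
#   "CC BY 4.0","1234"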


def fetch_museums_victoria_data(args, session):
    """
    Fetch records from the Museums Victoria API for each record type,
    handling pagination, and tally licence usage overall, by media type,
    and by record type. Returns a dict mapping each output file path to
    its sorted counts.
    """

    record_counts = defaultdict(lambda: defaultdict(int))
    media_counts = defaultdict(lambda: defaultdict(int))
    licences_count = defaultdict(int)

    # Iterate through each record type
    for record_type in RECORD_TYPES:
        records_processed = 0
        current_page = 1
        total_pages = None
        per_page = min(PER_PAGE, args.limit) if args.limit else PER_PAGE

        while True:
            # Construct the API query parameters
            params = {
                "envelope": "true",
                "page": current_page,
                "perpage": per_page,
                "recordtype": record_type,
            }
            LOGGER.info(
                f"Fetching page {current_page} of {record_type}s "
                f"(records {(current_page - 1) * per_page + 1}-"
                f"{current_page * per_page})"
            )
            try:
                r = session.get(BASE_URL, params=params, timeout=30)
                r.raise_for_status()
            except requests.HTTPError as e:
                raise shared.QuantifyingException(f"HTTP Error: {e}", 1)
            except requests.RequestException as e:
                raise shared.QuantifyingException(f"Request Exception: {e}", 1)
            data = r.json()
            results = data.get("response", [])
            for res in results:
                records_processed += 1
                media_list = res.get("media", [])
                for media_item in media_list:
                    # Media items may lack licence data entirely
                    licence_data = media_item.get("licence") or {}
                    license_short_name = licence_data.get("shortName")
                    if not license_short_name:
                        continue

                    # Append the version number (e.g. "4.0") when the
                    # licence name includes one
                    version_number = re.search(
                        r"\b\d+\.\d+\b", licence_data.get("name") or ""
                    )
                    if version_number:
                        license_short_name = (
                            f"{license_short_name} {version_number.group()}"
                        )

                    # Count the unique licence types
                    licences_count[license_short_name] += 1

                    # Count licences by media type
                    media_type = media_item.get("type")
                    media_counts[media_type][license_short_name] += 1

                    # Count licences by record type
                    record_counts[record_type][license_short_name] += 1
            if total_pages is None:
                # Pagination totals come from the envelope "headers" object
                headers = data.get("headers", {})
                total_pages = int(headers.get("totalPages", "0"))

            if args.limit is not None and records_processed >= args.limit:
                LOGGER.info(
                    f"Limit reached: {records_processed} records processed. "
                    f"Skipping remaining records for {record_type}."
                )
                break
            current_page += 1

            if current_page > total_pages:
                break

    return {
        FILE1_COUNT: dict(sorted(licences_count.items())),
        FILE2_MEDIA: sort_nested_defaultdict(media_counts),
        FILE3_RECORD: sort_nested_defaultdict(record_counts),
    }


def sort_nested_defaultdict(d):
    """Convert defaultdicts to regular dicts and sort all keys recursively."""
    if isinstance(d, dict):  # defaultdict is a dict subclass
        d = {k: sort_nested_defaultdict(v) for k, v in sorted(d.items())}
    return d
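
# Example: sort_nested_defaultdict(defaultdict(int, {"b": 2, "a": 1}))
# returns a plain dict with sorted keys: {"a": 1, "b": 2}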


def main():
    args = parse_arguments()
    shared.paths_log(LOGGER, PATHS)
    shared.git_fetch_and_merge(args, PATHS["repo"])
    initialize_all_data_files(args)
    data = fetch_museums_victoria_data(args, shared.get_session())
    write_counts_to_csv(args, data)
    args = shared.git_add_and_commit(
        args,
        PATHS["repo"],
        PATHS["data_quarter"],
        f"Add and commit new Museums Victoria data for {QUARTER}",
    )
    shared.git_push_changes(args, PATHS["repo"])


if __name__ == "__main__":
    try:
        main()
    except shared.QuantifyingException as e:
        if e.exit_code == 0:
            LOGGER.info(e.message)
        else:
            LOGGER.error(e.message)
        sys.exit(e.exit_code)
    except SystemExit as e:
        if e.code != 0:
            LOGGER.error(f"System exit with code: {e.code}")
        sys.exit(e.code)
    except KeyboardInterrupt:
        LOGGER.info("(130) Halted via KeyboardInterrupt.")
        sys.exit(130)
    except Exception:
        traceback_formatted = textwrap.indent(
            highlight(
                traceback.format_exc(),
                PythonTracebackLexer(),
                TerminalFormatter(),
            ),
            "    ",
        )
        LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}")
        sys.exit(1)