|
| 1 | +#!/usr/bin/env python |
| 2 | +""" |
| 3 | +Fetch metrics usage from Smithsonian Institution Open Access API. |
| 4 | +""" |
| 5 | + |
| 6 | +# Standard library |
| 7 | +import argparse |
| 8 | +import csv |
| 9 | +import os |
| 10 | +import sys |
| 11 | +import textwrap |
| 12 | +import traceback |
| 13 | +from operator import itemgetter |
| 14 | + |
| 15 | +# Third-party |
| 16 | +import requests |
| 17 | +from pygments import highlight |
| 18 | +from pygments.formatters import TerminalFormatter |
| 19 | +from pygments.lexers import PythonTracebackLexer |
| 20 | + |
| 21 | +# Add parent directory so shared can be imported |
| 22 | +sys.path.append(os.path.join(os.path.dirname(__file__), "..")) |
| 23 | + |
| 24 | +# First-party/Local |
| 25 | +import shared # noqa: E402 |
| 26 | + |
# Setup
LOGGER, PATHS = shared.setup(__file__)

# Constants
# api.data.gov API key, read from the environment (loaded via .env)
API_DATA_GOV_TOKEN = os.getenv("API_DATA_GOV_TOKEN")
FILE_1_METRICS = os.path.join(PATHS["data_phase"], "smithsonian_1_metrics.csv")
# NOTE(review): the constant/header are numbered 2, but the filename says
# "_1_units" — presumably intentional, but confirm with downstream consumers
# before renaming.
FILE_2_UNITS = os.path.join(PATHS["data_phase"], "smithsonian_1_units.csv")
# Column order for the overall-metrics CSV
HEADER_1_METRICS = [
    "CC0_RECORDS",
    "CC0_RECORDS_WITH_CC0_MEDIA",
    "CC0_MEDIA",
    "CC0_MEDIA_PERCENTAGE",
    "TOTAL_OBJECTS",
]
# Column order for the per-unit CSV
HEADER_2_UNITS = [
    "UNIT",
    "CC0_RECORDS",
    "CC0_RECORDS_WITH_CC0_MEDIA",
    "TOTAL_OBJECTS",
]
# Quarter identifier taken from the data directory basename
QUARTER = os.path.basename(PATHS["data_quarter"])
| 48 | + |
| 49 | + |
def parse_arguments():
    """
    Parse command-line options and return the parsed argument namespace.

    Exits via parser.error() when --enable-git is given without
    --enable-save.
    """
    LOGGER.info("Parsing command-line options")
    parser = argparse.ArgumentParser(description=__doc__)
    # Both options are simple boolean flags
    for flag, help_text in (
        ("--enable-save", "Enable saving results"),
        (
            "--enable-git",
            "Enable git actions (fetch, merge, add, commit, and push)",
        ),
    ):
        parser.add_argument(flag, action="store_true", help=help_text)
    args = parser.parse_args()
    if args.enable_git and not args.enable_save:
        parser.error("--enable-git requires --enable-save")
    return args
| 70 | + |
| 71 | + |
def _csv_data_rows(file_path):
    """
    Return the number of data rows (excluding the header) in file_path,
    or 0 when the file does not exist.
    """
    try:
        with open(file_path, "r", newline="") as file_obj:
            reader = csv.DictReader(file_obj, dialect="unix")
            # Count lazily instead of materializing the whole file
            return sum(1 for _ in reader)
    except FileNotFoundError:
        return 0  # File may not be found without --enable-save, etc.


def check_for_completion():
    """
    Raise shared.QuantifyingException with exit code 0 (an expected early
    stop, not an error) when both output files already contain this
    quarter's data.
    """
    completed_metrics = _csv_data_rows(FILE_1_METRICS) > 0
    # The threshold of 30 presumably reflects the expected number of
    # Smithsonian units — TODO confirm against the API response.
    completed_units = _csv_data_rows(FILE_2_UNITS) > 30

    if completed_metrics and completed_units:
        raise shared.QuantifyingException(
            f"Data fetch completed for {QUARTER}", 0
        )
| 96 | + |
| 97 | + |
def _write_csv(file_path, fieldnames, rows):
    """Write rows (a list of dicts) to file_path as a unix-dialect CSV."""
    with open(file_path, "w", encoding="utf-8", newline="\n") as file_obj:
        writer = csv.DictWriter(
            file_obj, fieldnames=fieldnames, dialect="unix"
        )
        writer.writeheader()
        writer.writerows(rows)


def write_data(args, data_metrics, data_units):
    """
    Save fetched metrics and per-unit data to CSV files.

    No-op unless --enable-save was given. Returns args unchanged so the
    call can be chained like the shared git helpers.
    """
    if not args.enable_save:
        return args

    # Create data directory for this phase
    os.makedirs(PATHS["data_phase"], exist_ok=True)

    _write_csv(FILE_1_METRICS, HEADER_1_METRICS, data_metrics)
    _write_csv(FILE_2_UNITS, HEADER_2_UNITS, data_units)

    return args
| 122 | + |
| 123 | + |
def query_smithsonian(args, session):
    """
    Fetch CC0 usage statistics from the Smithsonian Open Access stats API.

    Returns a (data_metrics, data_units) tuple of row-dict lists ready for
    CSV serialization; data_units is sorted by unit name and excludes units
    with zero total objects. Raises shared.QuantifyingException when the
    API token is missing or the request/response handling fails.
    """
    if not API_DATA_GOV_TOKEN:
        raise shared.QuantifyingException(
            "Authentication (API_DATA_GOV_TOKEN) required. Please ensure your"
            " API key is set in .env",
            1,
        )
    LOGGER.info("Fetch data from API")
    url = "https://api.si.edu/openaccess/api/v1.0/stats"
    params = {"api_key": API_DATA_GOV_TOKEN}
    try:
        with session.get(url, params=params) as response:
            response.raise_for_status()
            data = response.json()["response"]
    except requests.HTTPError as e:
        raise shared.QuantifyingException(f"HTTP Error: {e}", 1)
    except requests.RequestException as e:
        raise shared.QuantifyingException(f"Request Exception: {e}", 1)
    except KeyError as e:
        raise shared.QuantifyingException(f"KeyError: {e}", 1)

    metrics = data["metrics"]
    data_metrics = [
        {
            "CC0_MEDIA": metrics["CC0_media"],
            "CC0_MEDIA_PERCENTAGE": metrics["CC0_media_percentage"],
            "CC0_RECORDS": metrics["CC0_records"],
            "CC0_RECORDS_WITH_CC0_MEDIA": metrics[
                "CC0_records_with_CC0_media"
            ],
            "TOTAL_OBJECTS": data["total_objects"],
        }
    ]
    # Skip empty units, then sort the remainder alphabetically by unit name
    data_units = sorted(
        (
            {
                "UNIT": unit["unit"],
                "CC0_RECORDS": unit["metrics"]["CC0_records"],
                "CC0_RECORDS_WITH_CC0_MEDIA": unit["metrics"][
                    "CC0_records_with_CC0_media"
                ],
                "TOTAL_OBJECTS": unit["total_objects"],
            }
            for unit in data["units"]
            if unit["total_objects"] != 0
        ),
        key=itemgetter("UNIT"),
    )
    LOGGER.info(f"Fetched stats for {len(data_units)} units")
    return data_metrics, data_units
| 172 | + |
| 173 | + |
def main():
    """
    Entry point: fetch Smithsonian metrics and optionally save and commit
    the resulting CSV files.
    """
    args = parse_arguments()
    shared.paths_log(LOGGER, PATHS)
    # Raises QuantifyingException (exit code 0) if this quarter's data
    # files are already complete, stopping before any network work
    check_for_completion()
    session = shared.get_session()
    data_metrics, data_units = query_smithsonian(args, session)
    # Only writes files when --enable-save was given
    args = write_data(args, data_metrics, data_units)
    # Git steps presumably no-op without --enable-git — see shared helpers
    args = shared.git_add_and_commit(
        args,
        PATHS["repo"],
        PATHS["data_quarter"],
        f"Add and commit new Smithsonian data for {QUARTER}",
    )
    shared.git_push_changes(args, PATHS["repo"])
| 188 | + |
| 189 | + |
| 190 | +if __name__ == "__main__": |
| 191 | + try: |
| 192 | + main() |
| 193 | + except shared.QuantifyingException as e: |
| 194 | + if e.exit_code == 0: |
| 195 | + LOGGER.info(e.message) |
| 196 | + else: |
| 197 | + LOGGER.error(e.message) |
| 198 | + sys.exit(e.exit_code) |
| 199 | + except SystemExit as e: |
| 200 | + if e.code != 0: |
| 201 | + LOGGER.error(f"System exit with code: {e.code}") |
| 202 | + sys.exit(e.code) |
| 203 | + except KeyboardInterrupt: |
| 204 | + LOGGER.info("(130) Halted via KeyboardInterrupt.") |
| 205 | + sys.exit(130) |
| 206 | + except Exception: |
| 207 | + traceback_formatted = textwrap.indent( |
| 208 | + highlight( |
| 209 | + traceback.format_exc(), |
| 210 | + PythonTracebackLexer(), |
| 211 | + TerminalFormatter(), |
| 212 | + ), |
| 213 | + " ", |
| 214 | + ) |
| 215 | + LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}") |
| 216 | + sys.exit(1) |
0 commit comments