
Commit ee9d6a6

add smithsonian fetch
1 parent b0aa349 commit ee9d6a6

File tree

3 files changed: +238 -0 lines changed


env.example

Lines changed: 7 additions & 0 deletions
@@ -37,3 +37,10 @@
 # https://docs.github.com/en/rest/authentication/authenticating-to-the-rest-api

 # GH_TOKEN =
+
+
+# Smithsonian
+
+# https://edan.si.edu/openaccess/apidocs/
+
+# API_DATA_GOV_TOKEN =
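
For orientation, a minimal sketch of how this variable might be read once `.env` has been loaded (the loading mechanism is not part of this diff; the guard below is illustrative only):

import os

# Illustrative only: assumes .env has already been loaded into the process
# environment (e.g. by the repository's shared setup code or the shell).
API_DATA_GOV_TOKEN = os.getenv("API_DATA_GOV_TOKEN")
if not API_DATA_GOV_TOKEN:
    raise SystemExit("API_DATA_GOV_TOKEN is not set; add it to .env")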
Lines changed: 216 additions & 0 deletions
@@ -0,0 +1,216 @@
#!/usr/bin/env python
"""
Fetch metrics usage from Smithsonian Institution Open Access API.
"""

# Standard library
import argparse
import csv
import os
import sys
import textwrap
import traceback
from operator import itemgetter

# Third-party
import requests
from pygments import highlight
from pygments.formatters import TerminalFormatter
from pygments.lexers import PythonTracebackLexer

# Add parent directory so shared can be imported
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))

# First-party/Local
import shared  # noqa: E402

# Setup
LOGGER, PATHS = shared.setup(__file__)

# Constants
API_DATA_GOV_TOKEN = os.getenv("API_DATA_GOV_TOKEN")
FILE_1_METRICS = os.path.join(PATHS["data_phase"], "smithsonian_1_metrics.csv")
FILE_2_UNITS = os.path.join(PATHS["data_phase"], "smithsonian_1_units.csv")
HEADER_1_METRICS = [
    "CC0_RECORDS",
    "CC0_RECORDS_WITH_CC0_MEDIA",
    "CC0_MEDIA",
    "CC0_MEDIA_PERCENTAGE",
    "TOTAL_OBJECTS",
]
HEADER_2_UNITS = [
    "UNIT",
    "CC0_RECORDS",
    "CC0_RECORDS_WITH_CC0_MEDIA",
    "TOTAL_OBJECTS",
]
QUARTER = os.path.basename(PATHS["data_quarter"])


def parse_arguments():
    """
    Parse command-line options, returns parsed argument namespace.
    """
    LOGGER.info("Parsing command-line options")
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--enable-save",
        action="store_true",
        help="Enable saving results",
    )
    parser.add_argument(
        "--enable-git",
        action="store_true",
        help="Enable git actions (fetch, merge, add, commit, and push)",
    )
    args = parser.parse_args()
    if not args.enable_save and args.enable_git:
        parser.error("--enable-git requires --enable-save")
    return args


def check_for_completion():
    completed_metrics = False
    completed_units = False

    try:
        with open(FILE_1_METRICS, "r", newline="") as file_obj:
            reader = csv.DictReader(file_obj, dialect="unix")
            if len(list(reader)) > 0:
                completed_metrics = True
    except FileNotFoundError:
        pass  # File may not be found without --enable-save, etc.

    try:
        with open(FILE_2_UNITS, "r", newline="") as file_obj:
            reader = csv.DictReader(file_obj, dialect="unix")
            if len(list(reader)) > 30:
                completed_units = True
    except FileNotFoundError:
        pass  # File may not be found without --enable-save, etc.

    if completed_metrics and completed_units:
        raise shared.QuantifyingException(
            f"Data fetch completed for {QUARTER}", 0
        )


def write_data(args, data_metrics, data_units):
    if not args.enable_save:
        return args

    # Create data directory for this phase
    os.makedirs(PATHS["data_phase"], exist_ok=True)

    with open(FILE_1_METRICS, "w", encoding="utf-8", newline="\n") as file_obj:
        writer = csv.DictWriter(
            file_obj, fieldnames=HEADER_1_METRICS, dialect="unix"
        )
        writer.writeheader()
        for row in data_metrics:
            writer.writerow(row)

    with open(FILE_2_UNITS, "w", encoding="utf-8", newline="\n") as file_obj:
        writer = csv.DictWriter(
            file_obj, fieldnames=HEADER_2_UNITS, dialect="unix"
        )
        writer.writeheader()
        for row in data_units:
            writer.writerow(row)

    return args


def query_smithsonian(args, session):
    if not API_DATA_GOV_TOKEN:
        raise shared.QuantifyingException(
            "Authentication (API_DATA_GOV_TOKEN) required. Please ensure your"
            " API key is set in .env",
            1,
        )
    LOGGER.info("Fetch data from API")
    url = "https://api.si.edu/openaccess/api/v1.0/stats"
    params = {"api_key": API_DATA_GOV_TOKEN}
    try:
        with session.get(url, params=params) as response:
            response.raise_for_status()
            data = response.json()["response"]
    except requests.HTTPError as e:
        raise shared.QuantifyingException(f"HTTP Error: {e}", 1)
    except requests.RequestException as e:
        raise shared.QuantifyingException(f"Request Exception: {e}", 1)
    except KeyError as e:
        raise shared.QuantifyingException(f"KeyError: {e}", 1)
    data_metrics = [
        {
            "CC0_MEDIA": data["metrics"]["CC0_media"],
            "CC0_MEDIA_PERCENTAGE": data["metrics"]["CC0_media_percentage"],
            "CC0_RECORDS": data["metrics"]["CC0_records"],
            "CC0_RECORDS_WITH_CC0_MEDIA": data["metrics"][
                "CC0_records_with_CC0_media"
            ],
            "TOTAL_OBJECTS": data["total_objects"],
        }
    ]
    data_units = []
    for unit in data["units"]:
        if unit["total_objects"] == 0:
            continue
        data_units.append(
            {
                "UNIT": unit["unit"],
                "CC0_RECORDS": unit["metrics"]["CC0_records"],
                "CC0_RECORDS_WITH_CC0_MEDIA": unit["metrics"][
                    "CC0_records_with_CC0_media"
                ],
                "TOTAL_OBJECTS": unit["total_objects"],
            }
        )
    data_units = sorted(data_units, key=itemgetter("UNIT"))
    LOGGER.info(f"Fetched stats for {len(data_units)} units")
    return data_metrics, data_units


def main():
    args = parse_arguments()
    shared.paths_log(LOGGER, PATHS)
    check_for_completion()
    session = shared.get_session()
    data_metrics, data_units = query_smithsonian(args, session)
    args = write_data(args, data_metrics, data_units)
    args = shared.git_add_and_commit(
        args,
        PATHS["repo"],
        PATHS["data_quarter"],
        f"Add and commit new Smithsonian data for {QUARTER}",
    )
    shared.git_push_changes(args, PATHS["repo"])


if __name__ == "__main__":
    try:
        main()
    except shared.QuantifyingException as e:
        if e.exit_code == 0:
            LOGGER.info(e.message)
        else:
            LOGGER.error(e.message)
        sys.exit(e.exit_code)
    except SystemExit as e:
        if e.code != 0:
            LOGGER.error(f"System exit with code: {e.code}")
        sys.exit(e.code)
    except KeyboardInterrupt:
        LOGGER.info("(130) Halted via KeyboardInterrupt.")
        sys.exit(130)
    except Exception:
        traceback_formatted = textwrap.indent(
            highlight(
                traceback.format_exc(),
                PythonTracebackLexer(),
                TerminalFormatter(),
            ),
            "    ",
        )
        LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}")
        sys.exit(1)
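
To make the parsing in query_smithsonian() easier to follow, here is a rough sketch of the response shape the script expects, inferred only from the keys it reads; every value below is a placeholder, not a real Smithsonian figure:

# Shape inferred from the fields accessed in query_smithsonian();
# all values are placeholders.
example_stats_response = {
    "response": {
        "metrics": {
            "CC0_media": 0,
            "CC0_media_percentage": 0.0,
            "CC0_records": 0,
            "CC0_records_with_CC0_media": 0,
        },
        "total_objects": 0,
        "units": [
            {
                "unit": "EXAMPLE_UNIT",
                "total_objects": 0,
                "metrics": {
                    "CC0_records": 0,
                    "CC0_records_with_CC0_media": 0,
                },
            }
        ],
    }
}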

sources.md

Lines changed: 15 additions & 0 deletions
@@ -147,6 +147,21 @@ license_version breakdown.
 - Supported licenses: `by`, `by-nc`, `by-nc-nd`, `by-nc-sa`, `by-nd`, `by-sa`, `cc0`, `nc-sampling+`, `pdm`, `sampling+`


+## Smithsonian
+
+**Description:** The Smithsonian Institution Open Access API offers a metrics
+API for stats about CC0 objects/media.
+
+**API documentation link:**
+- [metrics - Documentation](https://edan.si.edu/openaccess/apidocs/#api-metrics)
+- [Developer Manual - api.data.gov](https://api.data.gov/docs/developer-manual/)
+
+**API information:**
+- API key required
+- Hourly limit: 1,000 requests per hour
+- Data available in JSON format
+
+
 ## Wikipedia

 **Description:** The Wikipedia API allows users to query statistics of pages,
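
As a companion to the Smithsonian entry above, a minimal sketch of calling the metrics endpoint directly; the URL and api_key parameter mirror the fetch script added in this commit, and the key is assumed to be exported as API_DATA_GOV_TOKEN:

import os

import requests

# Endpoint and parameter names mirror the new fetch script;
# API_DATA_GOV_TOKEN must hold an api.data.gov key.
url = "https://api.si.edu/openaccess/api/v1.0/stats"
params = {"api_key": os.environ["API_DATA_GOV_TOKEN"]}
response = requests.get(url, params=params, timeout=30)
response.raise_for_status()
stats = response.json()["response"]
print(stats["metrics"]["CC0_records"], "CC0 records")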
