
Commit 02fea36

Merge pull request #223 from sbarhin/museums
Add Museums Victoria fetch
2 parents b0aa349 + cc32e79 commit 02fea36

File tree

2 files changed: +301 -0 lines changed
Lines changed: 284 additions & 0 deletions
@@ -0,0 +1,284 @@
#!/usr/bin/env python
"""
Fetch CC Legal Tool usage from the Museums Victoria Collections API.
"""
# Standard library
import argparse
import csv
import math
import os
import re
import sys
import textwrap
import traceback
from collections import defaultdict

# Third-party
import requests
from pygments import highlight
from pygments.formatters import TerminalFormatter
from pygments.lexers import PythonTracebackLexer

# Add parent directory so shared can be imported
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))

# First-party/Local
import shared  # noqa: E402

# Setup
LOGGER, PATHS = shared.setup(__file__)

# Constants
BASE_URL = "https://collections.museumsvictoria.com.au/api/search"
FILE1_COUNT = shared.path_join(
    PATHS["data_phase"], "museums_victoria_1_count.csv"
)
FILE2_MEDIA = shared.path_join(
    PATHS["data_phase"], "museums_victoria_2_count_by_media.csv"
)
FILE3_RECORD = shared.path_join(
    PATHS["data_phase"], "museums_victoria_3_count_by_record.csv"
)
HEADER1_COUNT = ["TOOL IDENTIFIER", "COUNT"]
HEADER2_MEDIA = ["TOOL IDENTIFIER", "MEDIA TYPE", "COUNT"]
HEADER3_RECORD = ["TOOL IDENTIFIER", "RECORD TYPE", "COUNT"]
PER_PAGE = 100
QUARTER = os.path.basename(PATHS["data_quarter"])
RECORD_TYPES = [
    "article",
    "item",
    "species",
    "specimen",
]  # Types of records to return


def parse_arguments():
    """
    Parse command-line options, returns parsed argument namespace.
    """
    LOGGER.info("Parsing command-line options")
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--enable-save",
        action="store_true",
        help="Enable saving results",
    )
    parser.add_argument(
        "--enable-git",
        action="store_true",
        help="Enable git actions (fetch, merge, add, commit, and push)",
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Maximum number of records to fetch per record type",
    )
    args = parser.parse_args()
    if not args.enable_save and args.enable_git:
        parser.error("--enable-git requires --enable-save")
    return args


def initialize_data_file(file_path, header):
    with open(file_path, "w", encoding="utf-8", newline="\n") as file_obj:
        writer = csv.DictWriter(file_obj, fieldnames=header, dialect="unix")
        writer.writeheader()


def initialize_all_data_files(args):
    if not args.enable_save:
        return

    # Create data directory for this phase
    os.makedirs(PATHS["data_phase"], exist_ok=True)

    initialize_data_file(FILE1_COUNT, HEADER1_COUNT)
    initialize_data_file(FILE2_MEDIA, HEADER2_MEDIA)
    initialize_data_file(FILE3_RECORD, HEADER3_RECORD)


def write_counts_to_csv(args, data: dict):
    if not args.enable_save:
        return
    # Unpack each (file path, counts) pair instead of shadowing `data`
    for file_path, counts in data.items():
        rows = []
        if file_path == FILE2_MEDIA:
            fieldnames = HEADER2_MEDIA
            for media_type, tool_counts in counts.items():
                rows.extend(
                    {
                        "TOOL IDENTIFIER": tool,
                        "MEDIA TYPE": media_type,
                        "COUNT": count,
                    }
                    for tool, count in tool_counts.items()
                )
        elif file_path == FILE3_RECORD:
            fieldnames = HEADER3_RECORD
            for record_type, tool_counts in counts.items():
                rows.extend(
                    {
                        "TOOL IDENTIFIER": tool,
                        "RECORD TYPE": record_type,
                        "COUNT": count,
                    }
                    for tool, count in tool_counts.items()
                )
        else:
            fieldnames = HEADER1_COUNT
            rows = [
                {
                    "TOOL IDENTIFIER": tool,
                    "COUNT": count,
                }
                for tool, count in counts.items()
            ]
        with open(file_path, "a", encoding="utf-8", newline="\n") as file_obj:
            writer = csv.DictWriter(
                file_obj, fieldnames=fieldnames, dialect="unix"
            )
            writer.writerows(rows)


def fetch_museums_victoria_data(args, session):
    """
    Fetches all records with images from the Museums Victoria API by
    iterating through all record types and handling pagination.
    """
    record_counts = defaultdict(lambda: defaultdict(int))
    media_counts = defaultdict(lambda: defaultdict(int))
    licences_count = defaultdict(int)

    # Iterate through each record type
    for record_type in RECORD_TYPES:
        records_processed = 0
        current_page = 1
        total_pages = None
        per_page = min(PER_PAGE, args.limit) if args.limit else PER_PAGE

        while True:
            # Construct the API query parameters
            params = {
                "envelope": "true",
                "page": current_page,
                "perpage": per_page,
                "recordtype": record_type,
            }
            LOGGER.info(
                f"fetching page {current_page} of {record_type}s "
                f"(records {(current_page - 1) * per_page}-"
                f"{current_page * per_page})"
            )
            try:
                r = session.get(BASE_URL, params=params, timeout=30)
                r.raise_for_status()
            except requests.HTTPError as e:
                raise shared.QuantifyingException(f"HTTP Error: {e}", 1)
            except requests.RequestException as e:
                raise shared.QuantifyingException(f"Request Exception: {e}", 1)
            except KeyError as e:
                raise shared.QuantifyingException(f"KeyError: {e}", 1)
            data = r.json()
            results = data.get("response", [])
            for res in results:
                records_processed += 1
                media_list = res.get("media", [])
                for media_item in media_list:
                    licence_data = media_item.get("licence")
                    # Skip media items that carry no licence information
                    if not licence_data:
                        continue

                    # Count the unique licence types, appending the version
                    # number parsed from the licence name when present
                    license_short_name = licence_data.get("shortName")
                    version_number = re.search(
                        r"\b\d+\.\d+\b", licence_data.get("name") or ""
                    )
                    if version_number:
                        license_short_name = (
                            f"{license_short_name} {version_number.group()}"
                        )

                    if license_short_name:
                        licences_count[license_short_name] += 1

                    # Count licenses by media type
                    media_type = media_item.get("type")
                    media_counts[media_type][license_short_name] += 1

                    # Count licenses by record type
                    record_counts[record_type][license_short_name] += 1
            if total_pages is None:
                headers = data.get("headers", {})
                # "totalResults" is a count of records, not pages, so
                # derive the number of pages from it
                total_results = int(headers.get("totalResults", "0"))
                total_pages = math.ceil(total_results / per_page)

            if args.limit is not None and records_processed >= args.limit:
                LOGGER.info(
                    f"Limit Reached: {records_processed} processed. "
                    f"Skipping remaining records for {record_type}."
                )
                break
            current_page += 1

            if current_page > total_pages:
                break

    return {
        FILE1_COUNT: dict(sorted(licences_count.items())),
        FILE2_MEDIA: sort_nested_defaultdict(media_counts),
        FILE3_RECORD: sort_nested_defaultdict(record_counts),
    }


def sort_nested_defaultdict(d):
    """Convert defaultdicts to regular dicts and sort all keys recursively."""
    # defaultdict is a dict subclass, so one isinstance check covers both
    if isinstance(d, dict):
        d = {k: sort_nested_defaultdict(v) for k, v in sorted(d.items())}
    return d


def main():
    args = parse_arguments()
    shared.paths_log(LOGGER, PATHS)
    shared.git_fetch_and_merge(args, PATHS["repo"])
    initialize_all_data_files(args)
    data = fetch_museums_victoria_data(args, shared.get_session())
    write_counts_to_csv(args, data)
    args = shared.git_add_and_commit(
        args,
        PATHS["repo"],
        PATHS["data_quarter"],
        f"Add and commit new Museums Victoria data for {QUARTER}",
    )
    shared.git_push_changes(args, PATHS["repo"])


if __name__ == "__main__":
    try:
        main()
    except shared.QuantifyingException as e:
        if e.exit_code == 0:
            LOGGER.info(e.message)
        else:
            LOGGER.error(e.message)
        sys.exit(e.exit_code)
    except SystemExit as e:
        if e.code != 0:
            LOGGER.error(f"System exit with code: {e.code}")
        sys.exit(e.code)
    except KeyboardInterrupt:
        LOGGER.info("(130) Halted via KeyboardInterrupt.")
        sys.exit(130)
    except Exception:
        traceback_formatted = textwrap.indent(
            highlight(
                traceback.format_exc(),
                PythonTracebackLexer(),
                TerminalFormatter(),
            ),
            "    ",
        )
        LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}")
        sys.exit(1)

sources.md

Lines changed: 17 additions & 0 deletions
@@ -121,6 +121,23 @@ and access towards related query data using a programmable search engine.
- Data available through JSON format

## Museums Victoria

**Description:** Museums Victoria is Australia’s largest public museum
organisation and the principal custodian of the State’s scientific,
cultural, and heritage collections. Its API provides access to metadata on
licenses, use rights, and open access, as well as information such as
article type, image license, display location, locality, and museum
location. There are 150,000 objects available.

**API documentation link**
- [Museums Victoria API documentation](https://collections.museumsvictoria.com.au/developers)

**API Information**
- No API key required
- No query limit
- It currently supports only the GET verb, and responses are in JSON only
  (see the request sketch below).
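For illustration, here is a minimal request sketch against the search
endpoint. It assumes the same `envelope=true` JSON layout that the fetch
script above relies on; the parameter values are arbitrary examples:

```python
import requests

# Request a single small page of "item" records from the search endpoint.
response = requests.get(
    "https://collections.museumsvictoria.com.au/api/search",
    params={
        "envelope": "true",  # include pagination headers in the JSON body
        "page": 1,
        "perpage": 10,  # example page size; the fetch script uses 100
        "recordtype": "item",
    },
    timeout=30,
)
response.raise_for_status()
data = response.json()

# Print the licence short name of each media item, mirroring the
# counting logic in the fetch script.
for record in data.get("response", []):
    for media_item in record.get("media", []):
        licence = media_item.get("licence") or {}
        print(licence.get("shortName"))
```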
## Openverse

**Description:** Openverse is a search engine for openly licensed media,
