Skip to content

Commit c120add

Browse files
authored
Merge pull request #185 from Babi-B/openverse-fetch
Add Openverse Fetch Script (Initial Implementation)
2 parents 9702757 + a51cf3b commit c120add

File tree

1 file changed

+261
-0
lines changed

1 file changed

+261
-0
lines changed

scripts/1-fetch/openverse_fetch.py

Lines changed: 261 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,261 @@
1+
#!/usr/bin/env python
2+
"""
3+
Fetch CC Legal Tool usage from Openverse API.
4+
5+
Note:
6+
Because anonymous Openverse API access
7+
returns a maximum of ~240 result count
8+
per source-license combination, this
9+
script currently provides approximate counts.
10+
It does not include pagination or license_version
11+
breakdown.
12+
"""
13+
14+
# Standard library
15+
import argparse
16+
import csv
17+
import os
18+
import sys
19+
import textwrap
20+
import traceback
21+
import urllib
22+
23+
# Third-party
24+
import requests
25+
from pygments import highlight
26+
from pygments.formatters import TerminalFormatter
27+
from pygments.lexers import PythonTracebackLexer
28+
from requests.adapters import HTTPAdapter
29+
from urllib3.util.retry import Retry
30+
31+
# Add parent directory so shared can be imported
32+
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
33+
34+
# First-party/Local
35+
import shared # noqa: E402
36+
37+
# Setup
# shared.setup() returns the logger and the paths mapping used below
LOGGER, PATHS = shared.setup(__file__)

# Constants
# CSV file written by write_data(), inside the "data_phase" directory
FILE_PATH = os.path.join(PATHS["data_phase"], "openverse_fetch.csv")
# Openverse API endpoints queried by this script
MEDIA_TYPES = ["audio", "images"]
OPENVERSE_BASE_URL = "https://api.openverse.org/v1"
# CSV column names, in output order
OPENVERSE_FIELDS = ["SOURCE", "MEDIA_TYPE", "TOOL_IDENTIFIER", "MEDIA_COUNT"]
# License codes sent as the `license` query parameter
OPENVERSE_LEGAL_TOOLS = [
    "by",
    "by-nc",
    "by-nc-nd",
    "by-nc-sa",
    "by-nd",
    "by-sa",
    "cc0",
    "nc-sampling+",
    "pdm",
    "sampling+",
]
62+
63+
64+
def parse_arguments():
    """
    Build the command-line parser and return the parsed argument namespace.

    Exits through parser.error() when --enable-git is supplied without
    --enable-save.
    """
    LOGGER.info("Parsing command-line options")
    parser = argparse.ArgumentParser(description=__doc__)
    # Both options are plain on/off switches that differ only in name/help
    switches = (
        ("--enable-save", "Enable saving results"),
        (
            "--enable-git",
            "Enable git actions (fetch, merge, add, commit, and push)",
        ),
    )
    for flag, help_text in switches:
        parser.add_argument(flag, action="store_true", help=help_text)
    parsed = parser.parse_args()
    # --enable-git is only accepted together with --enable-save
    if parsed.enable_git and not parsed.enable_save:
        parser.error("--enable-git requires --enable-save")
    return parsed
84+
85+
86+
def get_requests_session():
    """
    Create the requests session used for all Openverse API calls.

    The session retries https:// requests (up to 5 times, with backoff,
    for the status codes in shared.STATUS_FORCELIST) and sends JSON
    accept and User-Agent headers by default.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=10,
        status_forcelist=shared.STATUS_FORCELIST,
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)
    default_headers = {
        "accept": "application/json",
        "User-Agent": shared.USER_AGENT,
    }

    session = requests.Session()
    session.mount("https://", retrying_adapter)
    session.headers.update(default_headers)
    return session
98+
99+
100+
def get_all_sources_and_licenses(session, media_type, timeout=30):
    """
    Fetch the sources available for a given media_type.

    Source names are read from the /{media_type}/stats/ endpoint and each
    one is then probed against the /{media_type}/ endpoint; only sources
    that answer with HTTP 200 are kept.

    Args:
        session: requests session used for every HTTP call.
        media_type: Openverse endpoint name, e.g. "audio" or "images".
        timeout: seconds to wait on each request before giving up, so a
            stalled connection cannot hang the script indefinitely.

    Returns:
        Tuple (valid_sources, licenses): a set of source names and a set
        built from OPENVERSE_LEGAL_TOOLS.

    Raises:
        shared.QuantifyingException: if a request raises
            requests.RequestException (this includes a failing status on
            the /stats/ request).
    """
    LOGGER.info(f"Fetching all sources for the /{media_type}/ endpoint")
    url = f"{OPENVERSE_BASE_URL}/{media_type}/stats/?format=json"
    try:
        response = session.get(url, timeout=timeout)
        response.raise_for_status()
        records = response.json()
        raw_sources = sorted(
            record["source_name"]
            for record in records
            if "source_name" in record
        )
        # To ensure the sources in /stats/ endpoints are truly
        # indexed in Openverse's catalog.
        valid_sources = set()
        for source in raw_sources:
            # params= lets requests percent-encode the source name instead
            # of splicing it into the query string unescaped
            probe = session.get(
                f"{OPENVERSE_BASE_URL}/{media_type}/",
                params={"source": source, "format": "json"},
                timeout=timeout,
            )
            if probe.status_code == 200:
                valid_sources.add(source)
            else:
                LOGGER.warning(
                    f"Skipping source {source}:"
                    f" not available in /{media_type}/ endpoint"
                )
        LOGGER.info(
            f"Found {len(valid_sources)} valid sources for {media_type}"
        )
        return valid_sources, set(OPENVERSE_LEGAL_TOOLS)
    # HTTPError is a subclass of RequestException, so one class suffices
    except requests.RequestException as e:
        raise shared.QuantifyingException(
            f"Failed to fetch sources and licenses: {e}", exit_code=1
        ) from e
142+
143+
144+
def query_openverse(session, timeout=30):
    """
    Tally Openverse result counts per source, media type, and license.

    For every media type in MEDIA_TYPES, the available sources are
    fetched and each (source, license) pair is queried once (page 1);
    the response's "result_count" is recorded when it is greater than 0.

    Args:
        session: requests session used for every HTTP call.
        timeout: seconds to wait on each count request made by this
            function before giving up.

    Returns:
        List of dicts keyed by OPENVERSE_FIELDS, one per combination with
        a non-zero count, ordered by media type, source, then license.

    Raises:
        shared.QuantifyingException: on a 401 response or when a request
            raises requests.RequestException.
    """
    tally = {}
    for media_type in MEDIA_TYPES:
        LOGGER.info(f"FETCHING {media_type.upper()} DATA...")
        sources, licenses = get_all_sources_and_licenses(session, media_type)
        search_url = f"{OPENVERSE_BASE_URL}/{media_type}/"
        # Both collections are sets; iterate them sorted so the request
        # order, and therefore the output row order, is stable across runs
        for source_name in sorted(sources):
            for license_code in sorted(licenses):
                LOGGER.info(
                    "Fetching Openverse data:"
                    f" media_type={media_type} |"
                    f" source={source_name} |"
                    f" license={license_code}"
                )
                try:
                    # requests percent-encodes the parameter values, which
                    # escapes the '+' in codes such as sampling+
                    response = session.get(
                        search_url,
                        params={
                            "source": source_name,
                            "license": license_code,
                            "format": "json",
                            "page": 1,
                        },
                        timeout=timeout,
                    )
                    if response.status_code == 401:
                        raise shared.QuantifyingException(
                            "Unauthorized(401): Check API key for"
                            f" {media_type}.",
                            exit_code=1,
                        )
                    response.raise_for_status()
                    data = response.json()
                    count = data.get("result_count", 0)
                    # Skip (source x license) with result_count = 0
                    if count > 0:
                        key = (source_name, media_type, license_code)
                        tally[key] = count
                    else:
                        LOGGER.warning(
                            f"Skipping ({source_name}, {license_code}):"
                            " count is 0"
                        )
                # HTTPError is a subclass of RequestException
                except requests.RequestException as e:
                    raise shared.QuantifyingException(
                        f"Openverse fetch failed: {e}", exit_code=1
                    ) from e

    LOGGER.info("Aggregating the data")
    aggregate = []
    for (source, media_type, license_code), media_count in tally.items():
        # Append prefix "cc" except for 'pdm' and 'cc0'
        if license_code not in ["pdm", "cc0"]:
            tool_identifier = f"cc {license_code}"
        else:
            tool_identifier = license_code
        aggregate.append(
            {
                OPENVERSE_FIELDS[0]: source,
                OPENVERSE_FIELDS[1]: media_type,
                OPENVERSE_FIELDS[2]: tool_identifier.upper(),
                OPENVERSE_FIELDS[3]: media_count,
            }
        )
    return aggregate
209+
210+
211+
def write_data(args, data):
    """
    Write the fetched rows to FILE_PATH as CSV.

    Does nothing unless args.enable_save is set. The data directory is
    created if missing and any existing file is overwritten.
    """
    if args.enable_save:
        os.makedirs(PATHS["data_phase"], exist_ok=True)
        with open(FILE_PATH, "w", encoding="utf-8", newline="") as csv_file:
            csv_writer = csv.DictWriter(
                csv_file, fieldnames=OPENVERSE_FIELDS, dialect="unix"
            )
            csv_writer.writeheader()
            csv_writer.writerows(data)
224+
225+
226+
def main():
    """
    Run the script: parse options, query Openverse, and save the results.
    """
    options = parse_arguments()
    http_session = get_requests_session()
    LOGGER.info("Starting Openverse Fetch Script...")
    rows = query_openverse(http_session)
    write_data(options, rows)
    record_total = len(rows)
    LOGGER.info(f"Fetched {record_total} unique Openverse records.")
233+
234+
235+
if __name__ == "__main__":
    # Script entry point: run main() and map each failure mode to a log
    # message and a process exit code.
    try:
        main()
    except shared.QuantifyingException as e:
        # Exit code 0 is logged as information, anything else as an error
        log_message = LOGGER.info if e.exit_code == 0 else LOGGER.error
        log_message(e.message)
        sys.exit(e.exit_code)
    except SystemExit as e:
        if e.code != 0:
            LOGGER.error(f"System exit with code: {e.code}")
        sys.exit(e.code)
    except KeyboardInterrupt:
        LOGGER.info("(130) Halted via KeyboardInterrupt.")
        sys.exit(130)
    except Exception:
        # Syntax-highlight the traceback for the terminal, then indent it
        highlighted = highlight(
            traceback.format_exc(),
            PythonTracebackLexer(),
            TerminalFormatter(),
        )
        traceback_formatted = textwrap.indent(highlighted, " ")
        LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}")
        sys.exit(1)

0 commit comments

Comments
 (0)