Skip to content
Merged
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
12042b5
initial openverse fetch script (formatted by Black)
Babi-B Oct 14, 2025
e3f61f7
chore: remove temporary data file created during fetch script testing
Babi-B Oct 15, 2025
f4f6a16
Moved RETRY_STATUS_FORCELIST to shared.py and fix import order E402
Babi-B Oct 15, 2025
8d3c157
Add error handling for 404 unauthorized responses
Babi-B Oct 15, 2025
a2f8fa5
Add scripts directory so script can be run from the root
Babi-B Oct 15, 2025
c6ec079
Merge branch 'creativecommons:main' into openverse-fetch
Babi-B Oct 16, 2025
6cc1e6d
Add user user agent to style library
Babi-B Oct 16, 2025
030fe54
Fetch aggregate media count by license and source
Babi-B Oct 17, 2025
1a68acf
Refactor code
Babi-B Oct 18, 2025
d33302e
Fetch by source and license WIP
Babi-B Oct 18, 2025
97bbe9a
Create CC_TOOL_IDENTIFIER column
Babi-B Oct 18, 2025
5aef97d
Refactor
Babi-B Oct 20, 2025
6b98c9d
Fetch result counts per source x license from first page
Babi-B Oct 21, 2025
4e493fd
Merge branch 'main' into openverse-fetch
Babi-B Oct 21, 2025
8285e02
Merge branch 'creativecommons:main' into openverse-fetch
Babi-B Oct 21, 2025
e6e552c
updated to match origin/main
Babi-B Oct 21, 2025
d6b35f2
fetch sources from /stats/ endpoint
Babi-B Oct 22, 2025
b11661b
Merge branch 'creativecommons:main' into openverse-fetch
Babi-B Oct 22, 2025
1623c6a
create cc licenses
Babi-B Oct 22, 2025
b06b7af
removed media with results_counts=0
Babi-B Oct 22, 2025
ea1eb95
refactor code
Babi-B Oct 23, 2025
acb94d4
Merge branch 'creativecommons:main' into openverse-fetch
Babi-B Oct 23, 2025
34d4f43
refactor code
Babi-B Oct 24, 2025
d5f4f5f
refactor code
Babi-B Oct 28, 2025
c8c0b4b
Merge branch 'creativecommons:main' into openverse-fetch
Babi-B Oct 28, 2025
3d37e35
refactor code
Babi-B Oct 29, 2025
c6ce80d
Merge branch 'openverse-fetch' of https://github.com/Babi-B/quantifyi…
Babi-B Oct 29, 2025
042d0e5
Merge branch 'creativecommons:main' into openverse-fetch
Babi-B Oct 29, 2025
eedd3ce
Merge branch 'openverse-fetch' of https://github.com/Babi-B/quantifyi…
Babi-B Oct 29, 2025
d4f1116
Set permission to execute
Babi-B Oct 30, 2025
538310c
Merge branch 'creativecommons:main' into openverse-fetch
Babi-B Oct 30, 2025
111fa80
Correct typo
Babi-B Oct 30, 2025
25f4569
create a difference to test execution
Babi-B Oct 30, 2025
f12d6f5
Make script executable
Babi-B Oct 30, 2025
a51cf3b
Make openverse_fetch.py executable
Babi-B Oct 30, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
261 changes: 261 additions & 0 deletions scripts/1-fetch/openverse_fetch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,261 @@
#!/usr/bin/env python
"""
Fetch CC Legal Tool usage from Openverse API.

Note:
Because anonymous Openverse API access
returns a maximum of ~240 result count
per source-license combination, this
script currently provides approximate counts.
It does not include pagination or license_version
breakdown.
"""

# Standard library
import argparse
import csv
import os
import sys
import textwrap
import traceback
import urllib
import urllib.parse

# Third-party
import requests
from pygments import highlight
from pygments.formatters import TerminalFormatter
from pygments.lexers import PythonTracebackLexer
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Add parent directory so shared can be imported
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))

# First-party/Local
import shared # noqa: E402

# Setup
# shared.setup() supplies the module-wide logger and a mapping of project
# paths; PATHS["data_phase"] is the directory this script writes into.
LOGGER, PATHS = shared.setup(__file__)

# Constants
# CSV file produced by write_data()
FILE_PATH = os.path.join(PATHS["data_phase"], "openverse_fetch.csv")
# Openverse endpoint names; each is interpolated into the request URL path
MEDIA_TYPES = ["audio", "images"]
OPENVERSE_BASE_URL = "https://api.openverse.org/v1"
# CSV column headers. query_openverse() addresses these by position, so the
# order here must stay in sync with the rows it builds.
OPENVERSE_FIELDS = [
    "SOURCE",
    "MEDIA_TYPE",
    "TOOL_IDENTIFIER",
    "MEDIA_COUNT",
]
# License codes sent as the `license` query parameter. Some contain "+",
# which must be percent-encoded when placed in a URL.
OPENVERSE_LEGAL_TOOLS = [
    "by",
    "by-nc",
    "by-nc-nd",
    "by-nc-sa",
    "by-nd",
    "by-sa",
    "cc0",
    "nc-sampling+",
    "pdm",
    "sampling+",
]


def parse_arguments():
    """
    Build the command-line parser and return the parsed namespace.

    Two boolean flags are supported: --enable-save and --enable-git.
    Passing --enable-git without --enable-save is rejected through
    parser.error().
    """
    LOGGER.info("Parsing command-line options")
    parser = argparse.ArgumentParser(description=__doc__)
    boolean_flags = (
        ("--enable-save", "Enable saving results"),
        (
            "--enable-git",
            "Enable git actions (fetch, merge, add, commit, and push)",
        ),
    )
    for flag, help_text in boolean_flags:
        parser.add_argument(flag, action="store_true", help=help_text)
    parsed = parser.parse_args()
    # Git actions operate on saved output, so they make no sense without it
    if parsed.enable_git and not parsed.enable_save:
        parser.error("--enable-git requires --enable-save")
    return parsed


def get_requests_session():
    """
    Create the requests session used for all Openverse API calls.

    The session sends JSON accept and project User-Agent headers, and
    https:// requests go through an adapter that retries (up to 5 times,
    backoff_factor=10) on the statuses in shared.STATUS_FORCELIST.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=10,
        status_forcelist=shared.STATUS_FORCELIST,
    )
    https_adapter = HTTPAdapter(max_retries=retry_policy)
    default_headers = {
        "accept": "application/json",
        "User-Agent": shared.USER_AGENT,
    }

    session = requests.Session()
    session.headers.update(default_headers)
    session.mount("https://", https_adapter)
    return session


def get_all_sources_and_licenses(session, media_type):
    """
    Fetch the usable sources for media_type plus the license codes to query.

    Source names are read from the /{media_type}/stats/ endpoint. Each name
    is then probed against the /{media_type}/ search endpoint and kept only
    if that request answers with HTTP 200, so that sources listed in the
    stats but not queryable are skipped (with a warning).

    Args:
        session: requests session used for every HTTP call.
        media_type: Openverse endpoint name, used in the URL path.

    Returns:
        A tuple (valid_sources, licenses): the set of source names that
        passed the probe and a set built from OPENVERSE_LEGAL_TOOLS.

    Raises:
        shared.QuantifyingException: with exit_code=1 when any request
            fails; the original requests exception is attached as the
            cause.
    """
    LOGGER.info(f"Fetching all sources for the /{media_type}/ endpoint")
    url = f"{OPENVERSE_BASE_URL}/{media_type}/stats/?format=json"
    try:
        response = session.get(url)
        response.raise_for_status()
        records = response.json()
        # Deduplicate first so every source costs exactly one probe request;
        # sort so requests and warnings are issued in a stable order.
        raw_sources = sorted(
            {
                record["source_name"]
                for record in records
                if "source_name" in record
            }
        )
        # Ensure the sources in the /stats/ endpoint are truly indexed in
        # Openverse's catalog.
        valid_sources = set()
        for source in raw_sources:
            # Percent-encode so a name with reserved characters cannot
            # alter the query string
            encoded_source = urllib.parse.quote(source, safe="")
            new_response = session.get(
                f"{OPENVERSE_BASE_URL}/{media_type}/?"
                f"source={encoded_source}&format=json"
            )
            if new_response.status_code == 200:
                valid_sources.add(source)
            else:
                LOGGER.warning(
                    f"Skipping source {source}:"
                    f" not available in /{media_type}/ endpoint"
                )
        LOGGER.info(
            f"Found {len(valid_sources)} valid sources for {media_type}"
        )
        return valid_sources, set(OPENVERSE_LEGAL_TOOLS)
    # HTTPError is a RequestException subclass, so one clause covers both
    except requests.RequestException as e:
        raise shared.QuantifyingException(
            f"Failed to fetch sources and licenses: {e}", exit_code=1
        ) from e


def _format_tool_identifier(license_code):
    """
    Return the upper-cased tool identifier for an Openverse license code.

    Every code except "pdm" and "cc0" receives a "cc " prefix.
    """
    if license_code in ("pdm", "cc0"):
        return license_code.upper()
    return f"cc {license_code}".upper()


def query_openverse(session):
    """
    Count Openverse results per (source, media type, license) combination.

    For each entry of MEDIA_TYPES the usable sources and license codes are
    obtained from get_all_sources_and_licenses(); the first result page is
    then requested for every source x license pair and its `result_count`
    recorded. Combinations with a count of 0 are skipped (with a warning).

    Args:
        session: requests session used for every HTTP call.

    Returns:
        A list of dicts keyed by OPENVERSE_FIELDS, one per combination with
        a positive count, ordered by media type, then source, then license.

    Raises:
        shared.QuantifyingException: with exit_code=1 on an HTTP 401 or on
            any other request failure (the requests exception is attached
            as the cause).
    """
    tally = {}
    for media_type in MEDIA_TYPES:
        LOGGER.info(f"FETCHING {media_type.upper()} DATA...")
        sources, licenses = get_all_sources_and_licenses(session, media_type)
        # Both collections are sets of str, whose iteration order changes
        # between interpreter runs; sort so requests and output rows come
        # out in the same order every time.
        for source_name in sorted(sources):
            encoded_source = urllib.parse.quote(source_name, safe="")
            for license_code in sorted(licenses):
                # encode the license to escape '+' e.g sampling+
                encoded_license = urllib.parse.quote(license_code, safe="")
                url = (
                    f"{OPENVERSE_BASE_URL}/{media_type}/?"
                    f"source={encoded_source}&"
                    f"license={encoded_license}"
                    "&format=json&page=1"
                )
                LOGGER.info(
                    "Fetching Openverse data:"
                    f" media_type={media_type} |"
                    f" source={source_name} |"
                    f" license={license_code}"
                )
                try:
                    response = session.get(url)
                    if response.status_code == 401:
                        raise shared.QuantifyingException(
                            "Unauthorized(401): Check API key for"
                            f" {media_type}.",
                            exit_code=1,
                        )
                    response.raise_for_status()
                    count = response.json().get("result_count", 0)
                # HTTPError is a RequestException subclass
                except requests.RequestException as e:
                    raise shared.QuantifyingException(
                        f"Openverse fetch failed: {e}", exit_code=1
                    ) from e
                # Skip (source x license) with result_count = 0
                if count > 0:
                    tally[(source_name, media_type, license_code)] = count
                else:
                    LOGGER.warning(
                        f"Skipping ({source_name}, {license_code}):"
                        " count is 0"
                    )
    LOGGER.info("Aggregating the data")
    source_field, media_field, tool_field, count_field = OPENVERSE_FIELDS
    return [
        {
            source_field: source,
            media_field: media_type,
            tool_field: _format_tool_identifier(license_code),
            count_field: media_count,
        }
        for (source, media_type, license_code), media_count in tally.items()
    ]


def write_data(args, data):
    """
    Write the fetched rows to FILE_PATH as CSV when saving is enabled.

    Does nothing unless args.enable_save is set. Otherwise the data-phase
    directory is created if needed and the file is overwritten with a
    header row (OPENVERSE_FIELDS) followed by one line per entry of data.
    """
    if not args.enable_save:
        return
    os.makedirs(PATHS["data_phase"], exist_ok=True)
    with open(FILE_PATH, "w", encoding="utf-8", newline="") as csv_file:
        csv_writer = csv.DictWriter(
            csv_file,
            fieldnames=OPENVERSE_FIELDS,
            dialect="unix",
        )
        csv_writer.writeheader()
        csv_writer.writerows(data)


def main():
    """
    Run the whole fetch: parse options, query Openverse, then hand the
    records to write_data() (which saves them only if enabled).
    """
    cli_args = parse_arguments()
    http_session = get_requests_session()
    LOGGER.info("Starting Openverse Fetch Script...")
    rows = query_openverse(http_session)
    write_data(cli_args, rows)
    LOGGER.info(f"Fetched {len(rows)} unique Openverse records.")


if __name__ == "__main__":
    # Script entry point: run main() and translate every way it can end
    # into a log line plus a process exit code.
    try:
        main()
    except shared.QuantifyingException as err:
        # Project exception carries its own exit code; 0 is informational
        log_method = LOGGER.info if err.exit_code == 0 else LOGGER.error
        log_method(err.message)
        sys.exit(err.exit_code)
    except SystemExit as err:
        # e.g. argparse exiting; only non-zero codes are logged as errors
        if err.code != 0:
            LOGGER.error(f"System exit with code: {err.code}")
        sys.exit(err.code)
    except KeyboardInterrupt:
        LOGGER.info("(130) Halted via KeyboardInterrupt.")
        sys.exit(130)
    except Exception:
        # Last resort: log the syntax-highlighted, indented traceback
        highlighted = highlight(
            traceback.format_exc(),
            PythonTracebackLexer(),
            TerminalFormatter(),
        )
        traceback_formatted = textwrap.indent(highlighted, " ")
        LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}")
        sys.exit(1)