Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
12042b5
initial openverse fetch script (formatted by Black)
Babi-B Oct 14, 2025
e3f61f7
chore: remove temporary data file created during fetch script testing
Babi-B Oct 15, 2025
f4f6a16
Moved RETRY_STATUS_FORCELIST to shared.py and fix import order E402
Babi-B Oct 15, 2025
8d3c157
Add error handling for 401 unauthorized responses
Babi-B Oct 15, 2025
a2f8fa5
Add scripts directory so script can be run from the root
Babi-B Oct 15, 2025
c6ec079
Merge branch 'creativecommons:main' into openverse-fetch
Babi-B Oct 16, 2025
6cc1e6d
Add user agent to style library
Babi-B Oct 16, 2025
030fe54
Fetch aggregate media count by license and source
Babi-B Oct 17, 2025
1a68acf
Refactor code
Babi-B Oct 18, 2025
d33302e
Fetch by source and license WIP
Babi-B Oct 18, 2025
97bbe9a
Create CC_TOOL_IDENTIFIER column
Babi-B Oct 18, 2025
5aef97d
Refactor
Babi-B Oct 20, 2025
6b98c9d
Fetch result counts per source x license from first page
Babi-B Oct 21, 2025
4e493fd
Merge branch 'main' into openverse-fetch
Babi-B Oct 21, 2025
8285e02
Merge branch 'creativecommons:main' into openverse-fetch
Babi-B Oct 21, 2025
e6e552c
updated to match origin/main
Babi-B Oct 21, 2025
d6b35f2
fetch sources from /stats/ endpoint
Babi-B Oct 22, 2025
b11661b
Merge branch 'creativecommons:main' into openverse-fetch
Babi-B Oct 22, 2025
1623c6a
create cc licenses
Babi-B Oct 22, 2025
b06b7af
removed media with results_counts=0
Babi-B Oct 22, 2025
ea1eb95
refactor code
Babi-B Oct 23, 2025
acb94d4
Merge branch 'creativecommons:main' into openverse-fetch
Babi-B Oct 23, 2025
34d4f43
refactor code
Babi-B Oct 24, 2025
d5f4f5f
refactor code
Babi-B Oct 28, 2025
c8c0b4b
Merge branch 'creativecommons:main' into openverse-fetch
Babi-B Oct 28, 2025
3d37e35
refactor code
Babi-B Oct 29, 2025
c6ce80d
Merge branch 'openverse-fetch' of https://github.com/Babi-B/quantifyi…
Babi-B Oct 29, 2025
042d0e5
Merge branch 'creativecommons:main' into openverse-fetch
Babi-B Oct 29, 2025
eedd3ce
Merge branch 'openverse-fetch' of https://github.com/Babi-B/quantifyi…
Babi-B Oct 29, 2025
d4f1116
Set permission to execute
Babi-B Oct 30, 2025
538310c
Merge branch 'creativecommons:main' into openverse-fetch
Babi-B Oct 30, 2025
111fa80
Correct typo
Babi-B Oct 30, 2025
25f4569
create a difference to test execution
Babi-B Oct 30, 2025
f12d6f5
Make script executable
Babi-B Oct 30, 2025
a51cf3b
Make openverse_fetch.py executable
Babi-B Oct 30, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 141 additions & 0 deletions scripts/1-fetch/openverse_fetch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
#!/usr/bin/env python
"""
Fetch CC Legal Tool usage from Openverse API.
"""

# Standard library
import argparse
import csv
import os
import sys
import textwrap
import traceback

# Third-party
import requests
from pygments import highlight
from pygments.formatters import TerminalFormatter
from pygments.lexers import PythonTracebackLexer
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Add parent directory so shared can be imported
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))

# First-party/Local
import shared # noqa: E402
from shared import RETRY_STATUS_FORCELIST # noqa: E402

# Setup
# shared.setup() returns the logger used throughout this script and PATHS,
# which is indexed by name below (e.g. PATHS["data_phase"])
LOGGER, PATHS = shared.setup(__file__)

# Constants
# Destination CSV, placed in the directory given by PATHS["data_phase"]
FILE_PATH = os.path.join(PATHS["data_phase"], "openverse_fetch.csv")
# Keys copied from each Openverse result; also used as the CSV column names
OPENVERSE_FIELDS = ["id", "title", "creator", "license"]


def parse_arguments():
    """
    Define the script's boolean flags, parse the command line, and reject
    the one invalid combination (--enable-git without --enable-save).

    Returns the argparse namespace; parser.error() is invoked (which
    terminates the program with a usage message) when validation fails.
    """
    LOGGER.info("Parsing command-line options")
    parser = argparse.ArgumentParser(description=__doc__)

    # (flag, help text) pairs; every flag is a simple on/off switch
    switches = (
        ("--enable-save", "Enable saving results"),
        (
            "--enable-git",
            "Enable git actions (fetch, merge, add, commit, and push)",
        ),
    )
    for flag, help_text in switches:
        parser.add_argument(flag, action="store_true", help=help_text)

    options = parser.parse_args()
    # Git actions operate on saved results, so saving must be enabled too
    if options.enable_git and not options.enable_save:
        parser.error("--enable-git requires --enable-save")
    return options


def get_requests_session():
    """
    Create a requests session that asks for JSON and retries HTTPS requests.

    Up to 5 retries are attempted with a backoff factor of 5 for responses
    whose status code is listed in RETRY_STATUS_FORCELIST. The retrying
    adapter is mounted for the "https://" prefix only.
    """
    retry_policy = Retry(
        status_forcelist=RETRY_STATUS_FORCELIST,
        backoff_factor=5,
        total=5,
    )
    https_adapter = HTTPAdapter(max_retries=retry_policy)

    session = requests.Session()
    session.headers.update({"accept": "application/json"})
    session.mount("https://", https_adapter)
    return session


def query_openverse(session, page_size=10, timeout=30):
    """
    Fetch a small number of works from Openverse API.

    Args:
        session: requests session used to issue the GET request.
        page_size: value sent as the ``page_size`` query parameter.
        timeout: seconds to wait on the server before the request is
            abandoned; without it a stalled connection blocks forever.

    Returns:
        A list with one dict per entry of the response's "results" array,
        holding only the OPENVERSE_FIELDS keys ("" when a key is absent).

    Raises:
        shared.QuantifyingException: on a 401 response (exit_code=1) or on
            any requests.RequestException (chained as the cause).
    """
    url = "https://api.openverse.engineering/v1/images/"
    try:
        # Let requests build/encode the query string instead of an f-string
        response = session.get(
            url, params={"page_size": page_size}, timeout=timeout
        )
        # Not a RequestException, so it bypasses the handler below
        if response.status_code == 401:
            raise shared.QuantifyingException(
                "Unauthorized(401): Check API key.", exit_code=1
            )
        response.raise_for_status()
        data = response.json()
    except requests.RequestException as e:
        LOGGER.error(f"Openverse fetch failed: {e}")
        # Chain the original error so the root cause stays in tracebacks
        raise shared.QuantifyingException(
            f"Openverse fetch failed: {e}"
        ) from e

    # Pure data reshaping: kept outside the try so only network/parse
    # failures are translated into QuantifyingException
    return [
        {field: work.get(field, "") for field in OPENVERSE_FIELDS}
        for work in data.get("results", [])
    ]


def write_data(args, data):
    """
    Write the fetched rows to FILE_PATH as CSV when saving is enabled.

    Does nothing unless args.enable_save is set. Otherwise the data
    directory is created if needed and the file is overwritten with a
    header row (OPENVERSE_FIELDS) followed by one line per dict in data.
    """
    if args.enable_save:
        os.makedirs(PATHS["data_phase"], exist_ok=True)
        # newline="" lets the csv module control line endings itself
        with open(FILE_PATH, "w", newline="", encoding="utf-8") as csv_file:
            csv_writer = csv.DictWriter(
                csv_file, fieldnames=OPENVERSE_FIELDS
            )
            csv_writer.writeheader()
            csv_writer.writerows(data)


def main():
    """
    Run the fetch: parse options, query Openverse, optionally save the
    results, and log how many works were retrieved.
    """
    args = parse_arguments()
    fetched = query_openverse(get_requests_session())
    write_data(args, fetched)
    LOGGER.info(f"Fetched {len(fetched)} Openverse works")


if __name__ == "__main__":
    # Script entry point: run main() and translate every way it can end
    # into a log message plus a process exit code.
    try:
        main()
    except shared.QuantifyingException as exc:
        # Exit code 0 marks an expected stop; anything else is an error
        log = LOGGER.info if exc.exit_code == 0 else LOGGER.error
        log(exc.message)
        sys.exit(exc.exit_code)
    except SystemExit as exc:
        # e.g. raised by argparse; only non-zero codes are worth reporting
        if exc.code != 0:
            LOGGER.error(f"System exit with code: {exc.code}")
        sys.exit(exc.code)
    except KeyboardInterrupt:
        LOGGER.info("(130) Halted via KeyboardInterrupt.")
        sys.exit(130)
    except Exception:
        # Last resort: log a syntax-highlighted, indented traceback
        colorized = highlight(
            traceback.format_exc(),
            PythonTracebackLexer(),
            TerminalFormatter(),
        )
        traceback_formatted = textwrap.indent(colorized, " ")
        LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}")
        sys.exit(1)
11 changes: 11 additions & 0 deletions scripts/shared.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,17 @@
from git import InvalidGitRepositoryError, NoSuchPathError, Repo
from pandas import PeriodIndex

# constants
# HTTP response status codes for which a request should be retried; the
# Openverse fetch script passes this list as urllib3 Retry's status_forcelist
RETRY_STATUS_FORCELIST = [
    408,  # Request Timeout
    422,  # Unprocessable Content (Validation failed, or endpoint spammed)
    429,  # Too Many Requests
    500,  # Internal Server Error
    502,  # Bad Gateway
    503,  # Service Unavailable
    504,  # Gateway Timeout
]


class QuantifyingException(Exception):
def __init__(self, message, exit_code=None):
Expand Down