Commit b023018

Merge pull request #167 from oree-xx/wikipedia
Add Wikipedia as data source
2 parents 9aa5a8f + 7fec985 commit b023018

2 files changed: +223 −0 lines changed


scripts/1-fetch/wikipedia_fetch.py

Lines changed: 208 additions & 0 deletions
@@ -0,0 +1,208 @@
#!/usr/bin/env python
"""
Fetch CC Legal Tool usage from Wikipedia API.
"""
# Standard library
import argparse
import csv
import os
import sys
import textwrap
import traceback

# Third-party
import requests
from pygments import highlight
from pygments.formatters import TerminalFormatter
from pygments.lexers import PythonTracebackLexer
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Add parent directory so shared can be imported
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))

# First-party/Local
import shared  # noqa: E402

# Setup
LOGGER, PATHS = shared.setup(__file__)

# Constants
FILE_LANGUAGES = os.path.join(
    PATHS["data_phase"], "wikipedia_count_by_languages.csv"
)
HEADER_LANGUAGES = [
    "LANGUAGE_CODE",
    "LANGUAGE_NAME_EN",
    "LANGUAGE_NAME",
    "COUNT",
]
QUARTER = os.path.basename(PATHS["data_quarter"])
WIKIPEDIA_BASE_URL = "https://en.wikipedia.org/w/api.php"
WIKIPEDIA_MATRIX_URL = "https://meta.wikimedia.org/w/api.php"


def parse_arguments():
    """
    Parse command-line options, returns parsed argument namespace.
    """
    LOGGER.info("Parsing command-line options")
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--enable-save",
        action="store_true",
        help="Enable saving results",
    )
    parser.add_argument(
        "--enable-git",
        action="store_true",
        help="Enable git actions (fetch, merge, add, commit, and push)",
    )
    args = parser.parse_args()
    if not args.enable_save and args.enable_git:
        parser.error("--enable-git requires --enable-save")
    return args


def get_requests_session():
    # Retry transient HTTP failures with exponential backoff
    max_retries = Retry(
        total=5,
        backoff_factor=10,
        status_forcelist=shared.STATUS_FORCELIST,
    )
    session = requests.Session()
    session.mount("https://", HTTPAdapter(max_retries=max_retries))
    session.headers.update({"User-Agent": shared.USER_AGENT})
    return session


def write_data(args, tool_data):
    if not args.enable_save:
        return args
    LOGGER.info("Saving fetched data")
    os.makedirs(PATHS["data_phase"], exist_ok=True)

    with open(FILE_LANGUAGES, "w", newline="", encoding="utf-8") as file_obj:
        writer = csv.DictWriter(
            file_obj, fieldnames=HEADER_LANGUAGES, dialect="unix"
        )
        writer.writeheader()
        for row in tool_data:
            writer.writerow(row)
    return args


def query_wikipedia_languages(session):
    LOGGER.info("Fetching article counts from all language Wikipedias")
    tool_data = []

    # Get all language Wikipedias from the Wikimedia site matrix
    params = {"action": "sitematrix", "format": "json", "uselang": "en"}
    r = session.get(WIKIPEDIA_MATRIX_URL, params=params, timeout=30)
    data = r.json()["sitematrix"]

    languages = []
    for key, val in data.items():
        if not isinstance(val, dict):
            continue
        if key.isdigit():
            language_code = val.get("code")
            language_name = val.get("name")
            language_name_en = val.get("localname")
            for site in val.get("site", []):
                if "wikipedia.org" in site["url"]:
                    languages.append(
                        {
                            "code": language_code,
                            "name": language_name,
                            "name_en": language_name_en,
                            "url": site["url"],
                        }
                    )

    # For each language Wikipedia, fetch site statistics
    for site in languages:
        base_url = f"{site['url']}/w/api.php"
        params = {
            "action": "query",
            "meta": "siteinfo",
            "siprop": "statistics",
            "format": "json",
        }
        language_code = site["code"]
        language_name = site["name"]
        language_name_en = site["name_en"]

        # Build a human-readable label for log messages (defined before the
        # try block so the except clause can reference it safely)
        language_display = f"{language_code}"
        if language_name_en:
            language_display = f"{language_display} {language_name_en}"
        if language_name:
            language_display = f"{language_display} ({language_name})"

        try:
            r = session.get(base_url, params=params, timeout=30)
            r.raise_for_status()
            data = r.json()
            stats = data["query"]["statistics"]
            article_count = stats.get("articles", 0)

            if article_count == 0:
                LOGGER.warning(f"Skipping {language_display} with 0 articles")
                continue
            tool_data.append(
                {
                    "LANGUAGE_CODE": language_code,
                    "LANGUAGE_NAME": language_name,
                    "LANGUAGE_NAME_EN": language_name_en,
                    "COUNT": article_count,
                }
            )
            LOGGER.info(f"{language_display}: {article_count}")

        except Exception as e:
            LOGGER.warning(f"Failed to fetch for {language_display}: {e}")

    return tool_data


def main():
    args = parse_arguments()
    shared.paths_log(LOGGER, PATHS)
    shared.git_fetch_and_merge(args, PATHS["repo"])
    tool_data = query_wikipedia_languages(get_requests_session())
    args = write_data(args, tool_data)
    args = shared.git_add_and_commit(
        args,
        PATHS["repo"],
        PATHS["data_quarter"],
        f"Add and commit new Wikipedia data for {QUARTER}",
    )
    shared.git_push_changes(args, PATHS["repo"])


if __name__ == "__main__":
    try:
        main()
    except shared.QuantifyingException as e:
        if e.exit_code == 0:
            LOGGER.info(e.message)
        else:
            LOGGER.error(e.message)
        sys.exit(e.exit_code)
    except SystemExit as e:
        if e.code != 0:
            LOGGER.error(f"System exit with code: {e.code}")
        sys.exit(e.code)
    except KeyboardInterrupt:
        LOGGER.info("(130) Halted via KeyboardInterrupt.")
        sys.exit(130)
    except Exception:
        traceback_formatted = textwrap.indent(
            highlight(
                traceback.format_exc(),
                PythonTracebackLexer(),
                TerminalFormatter(),
            ),
            "    ",
        )
        LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}")
        sys.exit(1)
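
Two notes on the new fetch script, for readers of this diff.

First, the key.isdigit() and isinstance checks in query_wikipedia_languages exist because the sitematrix response mixes language entries (numeric keys) with non-dict members. A minimal sketch of the shape, abridged and illustrative rather than a verbatim API response:

# Illustrative shape of r.json()["sitematrix"]; a real response holds several
# hundred numeric entries. All values below are examples, not fetched data.
sitematrix = {
    "count": 1000,  # int, not a dict: filtered out by the isinstance() check
    "0": {  # numeric key: one language entry
        "code": "aa",
        "name": "Qafár af",  # language name in that language
        "localname": "Afar",  # language name in English (uselang=en)
        "site": [
            {"url": "https://aa.wikipedia.org", "code": "wiki"},
            {"url": "https://aa.wiktionary.org", "code": "wiktionary"},
        ],
    },
    "specials": [],  # list, not a dict: also filtered out
}

The "wikipedia.org" substring test then keeps only the Wikipedia site from each language's list of sister projects. Second, per the argparse setup above, running the script with no flags fetches without persisting anything; --enable-save writes wikipedia_count_by_languages.csv under the data phase directory, and --enable-git additionally commits and pushes the result.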

scripts/shared.py

Lines changed: 15 additions & 0 deletions
@@ -7,6 +7,21 @@
 from git import InvalidGitRepositoryError, NoSuchPathError, Repo
 from pandas import PeriodIndex
 
+# Constants
+STATUS_FORCELIST = [
+    408,  # Request Timeout
+    422,  # Unprocessable Content (Validation failed, endpoint spammed, etc.)
+    429,  # Too Many Requests
+    500,  # Internal Server Error
+    502,  # Bad Gateway
+    503,  # Service Unavailable
+    504,  # Gateway Timeout
+]
+USER_AGENT = (
+    "QuantifyingTheCommons/1.0 "
+    "(https://github.com/creativecommons/quantifying)"
+)
+
 
 class QuantifyingException(Exception):
     def __init__(self, message, exit_code=None):
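
A note on how wikipedia_fetch.py consumes these constants: its Retry is configured with total=5 and backoff_factor=10, and urllib3 sleeps roughly backoff_factor * 2 ** (n - 1) seconds before the n-th retry (the exact schedule varies by urllib3 version; some versions skip the sleep before the first retry). A sketch of the implied worst-case schedule, plain arithmetic rather than a call into urllib3:

# Illustrative: approximate sleep before each retry for backoff_factor=10,
# following the documented urllib3 formula backoff_factor * (2 ** (n - 1))
BACKOFF_FACTOR = 10
TOTAL_RETRIES = 5
for n in range(1, TOTAL_RETRIES + 1):
    print(f"retry {n}: ~{BACKOFF_FACTOR * 2 ** (n - 1)}s backoff")
# -> ~10s, 20s, 40s, 80s, 160s: up to several minutes for a persistently
#    failing site, which adds up across hundreds of language Wikipedias

STATUS_FORCELIST limits those retries to transient failures (timeouts, rate limiting, and 5xx server errors), and USER_AGENT identifies the project to API operators, in line with Wikimedia's User-Agent policy.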
