Skip to content

Commit 03e4f8f

Browse files
committed
Merge branch 'main' into internetarchive
2 parents 6572ba3 + b0aa349 commit 03e4f8f

File tree

3 files changed

+493
-2
lines changed

3 files changed

+493
-2
lines changed

.cc-metadata.yml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,5 +6,3 @@ english_name: Quantifying the Commons
 technologies: Python
 # Whether this repository should be featured on the CC Open Source site
 featured: true
-# Slack channel name
-slack: "cc-dev-quantifying"
Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
#!/usr/bin/env python
2+
"""
3+
This file is dedicated to processing Wikipedia data
4+
for analysis and comparison between quarters.
5+
"""
6+
# Standard library
7+
import argparse
8+
import csv
9+
import os
10+
import sys
11+
import textwrap
12+
import traceback
13+
14+
# Third-party
15+
import pandas as pd
16+
from pygments import highlight
17+
from pygments.formatters import TerminalFormatter
18+
from pygments.lexers import PythonTracebackLexer
19+
20+
# Add parent directory so shared can be imported
21+
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
22+
23+
# First-party/Local
24+
import shared # noqa: E402
25+
26+
# Setup
27+
LOGGER, PATHS = shared.setup(__file__)
28+
29+
# Constants
30+
QUARTER = os.path.basename(PATHS["data_quarter"])
31+
32+
33+
def parse_arguments():
    """
    Build the command-line parser, validate the options, and return them.

    Recognized options are --quarter (defaults to QUARTER), --enable-save,
    and --enable-git. The parser exits with an error when --enable-git is
    given without --enable-save. When the requested quarter differs from
    QUARTER, the module-level PATHS is replaced with the result of
    shared.paths_update(). The returned namespace additionally carries the
    module logger as ``logger`` and the (possibly updated) PATHS as
    ``paths``.
    """
    global PATHS

    LOGGER.info("Parsing command-line options")
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument(
        "--quarter",
        default=QUARTER,
        help=f"Data quarter in format YYYYQx (default: {QUARTER})",
    )
    # Both switches are plain store_true flags; register them from a table.
    boolean_flags = (
        ("--enable-save", "Enable saving results (default: False)"),
        (
            "--enable-git",
            "Enable git actions such as fetch, merge, add, commit, and push"
            " (default: False)",
        ),
    )
    for flag, description in boolean_flags:
        cli.add_argument(flag, action="store_true", help=description)

    parsed = cli.parse_args()
    if parsed.enable_git and not parsed.enable_save:
        cli.error("--enable-git requires --enable-save")
    if parsed.quarter != QUARTER:
        PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, parsed.quarter)
    parsed.logger = LOGGER
    parsed.paths = PATHS
    return parsed
64+
65+
66+
def data_to_csv(args, data, file_path):
    """
    Write a DataFrame to file_path as a fully quoted CSV file.

    Nothing is written (and None is returned) unless args.enable_save is
    true. The directory that contains file_path is created first if it
    does not exist yet.

    Args:
        args: namespace providing the boolean ``enable_save`` attribute.
        data: pandas DataFrame to serialize (written without its index).
        file_path: destination path of the CSV file.
    """
    if not args.enable_save:
        return
    # Create the directory the file actually lives in instead of relying on
    # a module-level path that is only assumed to match file_path.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    # emulate csv.unix_dialect
    data.to_csv(
        file_path, index=False, quoting=csv.QUOTE_ALL, lineterminator="\n"
    )
74+
75+
76+
def process_highest_language_usage(args, count_data):
    """
    Processing count data: Most represented languages
    """
    # NOTE: the docstring above is also emitted as the log message below, so
    # its wording is part of the runtime output.
    LOGGER.info(process_highest_language_usage.__doc__.strip())

    # One entry per language name; when a name occurs in several rows the
    # count of the last such row is kept.
    counts_by_language = {
        record.LANGUAGE_NAME_EN: record.COUNT
        for record in count_data.itertuples(index=False)
    }
    ranked = pd.DataFrame(
        counts_by_language.items(), columns=["Language", "Count"]
    ).sort_values("Count", ascending=False)

    # Keep only the ten largest counts for the report.
    output_path = shared.path_join(
        PATHS["data_phase"], "wikipedia_highest_language_usage.csv"
    )
    data_to_csv(args, ranked.head(10), output_path)
95+
96+
97+
def process_least_language_usage(args, count_data):
    """
    Processing count data: Least represented languages
    """
    # NOTE: the docstring above is also emitted as the log message below, so
    # its wording is part of the runtime output.
    LOGGER.info(process_least_language_usage.__doc__.strip())

    # Rows whose count is not at least 1 are left out; for a repeated
    # language name the last qualifying row provides the count.
    counts_by_language = {
        record.LANGUAGE_NAME_EN: record.COUNT
        for record in count_data.itertuples(index=False)
        if record.COUNT >= 1
    }
    ranked = pd.DataFrame(
        counts_by_language.items(), columns=["Language", "Count"]
    ).sort_values("Count", ascending=True)

    # Keep only the ten smallest counts for the report.
    output_path = shared.path_join(
        PATHS["data_phase"], "wikipedia_least_language_usage.csv"
    )
    data_to_csv(args, ranked.head(10), output_path)
118+
119+
120+
def process_language_representation(args, count_data):
    """
    Processing count data: Language representation
    """
    # NOTE: the docstring above is also emitted as the log message below, so
    # its wording is part of the runtime output.
    LOGGER.info(process_language_representation.__doc__.strip())

    counts_by_language = {
        record.LANGUAGE_NAME_EN: record.COUNT
        for record in count_data.itertuples(index=False)
    }
    frame = pd.DataFrame(
        counts_by_language.items(), columns=["Language", "Count"]
    )
    mean_count = frame["Count"].mean()

    def categorize(value):
        # Strictly below the mean counts as underrepresented.
        if value < mean_count:
            return "Underrepresented"
        return "Represented"

    frame["Category"] = frame["Count"].apply(categorize)

    # Number of languages per category, largest category first.
    per_category = frame.groupby("Category").size().reset_index(name="Count")
    per_category = per_category.sort_values("Count", ascending=False)
    output_path = shared.path_join(
        PATHS["data_phase"], "wikipedia_language_representation.csv"
    )
    data_to_csv(args, per_category, output_path)
144+
145+
146+
def main():
    """
    Run the Wikipedia processing phase from start to finish.

    Parses the command-line options, logs the paths, calls
    shared.git_fetch_and_merge(), reads the per-language counts from
    wikipedia_count_by_languages.csv in the fetch-phase directory, runs the
    three processing steps, and finally calls shared.git_add_and_commit()
    and shared.git_push_changes() for the quarter's data directory.
    """
    args = parse_arguments()
    shared.paths_log(LOGGER, PATHS)
    shared.git_fetch_and_merge(args, PATHS["repo"])

    file_count = shared.path_join(
        PATHS["data_1-fetch"], "wikipedia_count_by_languages.csv"
    )
    # Only the two columns used by the processing steps are loaded.
    count_data = pd.read_csv(file_count, usecols=["LANGUAGE_NAME_EN", "COUNT"])
    process_language_representation(args, count_data)
    process_highest_language_usage(args, count_data)
    process_least_language_usage(args, count_data)

    # Push changes
    # Use the quarter that was actually processed: PATHS["data_quarter"] is
    # repointed when --quarter overrides the default, so the commit message
    # must name args.quarter rather than the default QUARTER constant.
    args = shared.git_add_and_commit(
        args,
        PATHS["repo"],
        PATHS["data_quarter"],
        f"Add and commit new Wikipedia data for {args.quarter}",
    )
    shared.git_push_changes(args, PATHS["repo"])
167+
168+
169+
if __name__ == "__main__":
    # Script entry point: run main() and turn every way it can stop into a
    # log record plus a process exit status.
    try:
        main()
    except shared.QuantifyingException as error:
        # Exit code 0 is reported at info level, anything else as an error.
        log_message = LOGGER.info if error.exit_code == 0 else LOGGER.error
        log_message(error.message)
        sys.exit(error.exit_code)
    except SystemExit as error:
        status = error.code
        if status != 0:
            LOGGER.error(f"System exit with code: {status}")
        sys.exit(status)
    except KeyboardInterrupt:
        LOGGER.info("(130) Halted via KeyboardInterrupt.")
        sys.exit(130)
    except Exception:
        # Syntax-highlight the traceback, then indent it under the log line.
        colorized = highlight(
            traceback.format_exc(), PythonTracebackLexer(), TerminalFormatter()
        )
        traceback_formatted = textwrap.indent(colorized, " ")
        LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}")
        sys.exit(1)

0 commit comments

Comments
 (0)