Skip to content

Commit 68748d6

Browse files
authored
Merge pull request #138 from creativecommons/update-github-fetch
Update and enable GitHub fetch (and small updates to GCS fetch)
2 parents ad847de + 93f6f60 commit 68748d6

File tree

4 files changed

+182
-176
lines changed

4 files changed

+182
-176
lines changed

.github/workflows/1-fetch.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,3 +54,5 @@ jobs:
5454
run: |
5555
./scripts/1-fetch/gcs_fetch.py \
5656
--limit=100 --enable-save --enable-git
57+
./scripts/1-fetch/github_fetch.py \
58+
--enable-save --enable-git
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
"TOOL_IDENTIFIER","SPDX_IDENTIFIER","COUNT"
2+
"BSD Zero Clause License","0BSD","59919"
3+
"CC0 1.0","CC0-1.0","275634"
4+
"CC BY 4.0","CC-BY-4.0","94575"
5+
"CC BY-SA 4.0","CC-BY-SA-4.0","26190"
6+
"MIT No Attribution","MIT-0","22355"
7+
"Unlicense","Unlicense","359858"
8+
"Total public repositories","N/A","128210047"

scripts/1-fetch/gcs_fetch.py

Lines changed: 34 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/usr/bin/env python
22
"""
3-
This file is dedicated to querying data from the Google Custom Search API.
3+
Fetch CC Legal Tool usage data from Google Custom Search (GCS) API.
44
"""
55
# Standard library
66
import argparse
@@ -53,30 +53,30 @@
5353

5454
def parse_arguments():
5555
"""
56-
Parses command-line arguments, returns parsed arguments.
56+
Parse command-line options, returns parsed argument namespace.
5757
"""
58-
LOGGER.info("Parsing command-line arguments")
59-
parser = argparse.ArgumentParser(description="Google Custom Search Script")
58+
LOGGER.info("Parsing command-line options")
59+
parser = argparse.ArgumentParser(description=__doc__)
6060
parser.add_argument(
61-
"--dev",
61+
"--limit",
62+
type=int,
63+
default=1,
64+
help="Limit queries (default: 1)",
65+
)
66+
parser.add_argument(
67+
"--enable-save",
6268
action="store_true",
63-
help="Development mode: avoid hitting API (generates fake data)",
69+
help="Enable saving results",
6470
)
6571
parser.add_argument(
6672
"--enable-git",
6773
action="store_true",
6874
help="Enable git actions (fetch, merge, add, commit, and push)",
6975
)
7076
parser.add_argument(
71-
"--enable-save",
77+
"--dev",
7278
action="store_true",
73-
help="Enable saving results",
74-
)
75-
parser.add_argument(
76-
"--limit",
77-
type=int,
78-
default=1,
79-
help="Limit queries (default: 1)",
79+
help="Development mode: avoid hitting API (generate fake data)",
8080
)
8181
return parser.parse_args()
8282

@@ -100,7 +100,10 @@ def initialize_data_file(file_path, header):
100100
writer.writeheader()
101101

102102

103-
def initialize_all_data_files():
103+
def initialize_all_data_files(args):
104+
if not args.enable_save:
105+
return
106+
104107
# Create data directory for this phase
105108
os.makedirs(PATHS["data_phase"], exist_ok=True)
106109

@@ -112,17 +115,20 @@ def initialize_all_data_files():
112115
def get_last_completed_plan_index():
113116
last_completed_plan_index = 0
114117
for file_path in [FILE1_COUNT, FILE2_LANGUAGE, FILE3_COUNTRY]:
115-
with open(file_path, "r", newline="") as file_obj:
116-
reader = csv.DictReader(file_obj, dialect="unix")
117-
for row in reader:
118-
pass # skip through to last row
119-
try:
120-
last_completed_plan_index = max(
121-
last_completed_plan_index,
122-
int(row["PLAN_INDEX"]),
123-
)
124-
except UnboundLocalError:
125-
pass
118+
try:
119+
with open(file_path, "r", newline="") as file_obj:
120+
reader = csv.DictReader(file_obj, dialect="unix")
121+
for row in reader:
122+
pass # skip through to last row
123+
try:
124+
last_completed_plan_index = max(
125+
last_completed_plan_index,
126+
int(row["PLAN_INDEX"]),
127+
)
128+
except UnboundLocalError:
129+
pass # Data row may not be found with --enable-save, etc.
130+
except FileNotFoundError:
131+
pass # File may not be found without --enable-save, etc.
126132
LOGGER.info(f"Last completed plan index: {last_completed_plan_index}")
127133
return last_completed_plan_index
128134

@@ -249,7 +255,7 @@ def main():
249255
args = parse_arguments()
250256
shared.log_paths(LOGGER, PATHS)
251257
service = get_search_service()
252-
initialize_all_data_files()
258+
initialize_all_data_files(args)
253259
last_completed_plan_index = get_last_completed_plan_index()
254260
if last_completed_plan_index == 2867:
255261
LOGGER.info(f"Data fetch completed for {QUARTER}")
@@ -260,7 +266,7 @@ def main():
260266
args,
261267
PATHS["repo"],
262268
PATHS["data_quarter"],
263-
"Add and commit new Google Custom Search (GCS) data for" f" {QUARTER}",
269+
f"Add and commit new Google Custom Search (GCS) data for {QUARTER}",
264270
)
265271
shared.git_push_changes(args, PATHS["repo"])
266272

0 commit comments

Comments
 (0)