Skip to content

Commit e9087fe

Browse files
authored
Merge pull request #136 from creativecommons/gcs-fetch-refactor
Refactor GCS Fetch phase (and disable other phases, for now)
2 parents 111a72d + 489c8f8 commit e9087fe

18 files changed

+3539
-700
lines changed
Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,15 @@ name: Fetch Data
22

33
on:
44
schedule:
5-
# at 01:15 on days 1-20 in first month of each quarter
6-
- cron: '15 1 1-20 1,4,7,10 *'
5+
# Normal schedule
6+
# # at 01:15 on all days in first month of each quarter
7+
# - cron: '15 1 * 1,4,7,10 *'
8+
# # at 01:15 on days 1-14 in second month of each quarter
9+
# - cron: '15 1 1-14 2,5,8,11 *'
10+
# Temp schedule
11+
# at 01:15 on all days in all months
12+
- cron: '15 1 * * *'
13+
714
workflow_dispatch:
815

916
jobs:
@@ -44,14 +51,6 @@ jobs:
4451
run: echo "PYTHONPATH=./scripts" >> $GITHUB_ENV
4552

4653
- name: Run fetch script
47-
run:
48-
# ./scripts/1-fetch/deviantart_fetched.py
49-
# ./scripts/1-fetch/flickr_fetched.py
50-
# ./scripts/1-fetch/github_fetched.py
51-
# ./scripts/1-fetch/internetarchive_fetched.py
52-
# ./scripts/1-fetch/metmuseum_fetched.py
53-
# ./scripts/1-fetch/vimeo_fetched.py
54-
# ./scripts/1-fetch/wikicommons_fetched.py
55-
# ./scripts/1-fetch/wikipedia_fetched.py
56-
# ./scripts/1-fetch/youtube_fetched.py
57-
./scripts/1-fetch/gcs_fetched.py
54+
run: |
55+
./scripts/1-fetch/gcs_fetched.py \
56+
--enable-git --enable-save --limit=100
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@ name: Process Data
22

33
on:
44
schedule:
5-
# at 01:15 on days 1-20 in second month of each quarter
6-
- cron: '15 1 1-20 2,5,8,11 *'
5+
# at 02:15 on days 15-28 in second month of each quarter
6+
- cron: '15 2 15-28 2,5,8,11 *'
77
workflow_dispatch:
88

99
jobs:
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@ name: Generate Report
22

33
on:
44
schedule:
5-
# at 01:15 on days 1-20 in third month of each quarter
6-
- cron: '15 1 1-20 3,6,9,12 *'
5+
# at 03:15 on all days in third month of each quarter
6+
- cron: '15 3 * 3,6,9,12 *'
77
workflow_dispatch:
88

99
jobs:
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
"PLAN_INDEX","TOOL_IDENTIFIER","COUNT"
2+
"1","CC BY-NC 4.0","154000000"
3+
"2","CC BY-NC-ND 4.0","73400000"
4+
"3","CC BY-NC-SA 4.0","70500000"
5+
"4","CC BY-ND 4.0","122000000"
6+
"5","CC BY-SA 4.0","185000000"
7+
"6","PDM 1.0","13100000"
8+
"7","CC0 1.0","29700000"
9+
"8","CC BY 3.0","221000000"
10+
"9","CC BY-NC 3.0","102000000"
11+
"10","CC BY-NC-ND 3.0","52400000"
12+
"11","CC BY-NC-SA 3.0","36200000"
13+
"12","CC BY-ND 3.0","245000000"
14+
"13","CC BY-SA 3.0","105000000"
15+
"14","CC BY 2.5","258000000"
16+
"15","CC BY-NC 2.5","72600000"
17+
"16","CC BY-NC-ND 2.5","72200000"
18+
"17","CC BY-NC-SA 2.5","30300000"
19+
"18","CC BY-ND 2.5","54300000"
20+
"19","CC BY-SA 2.5","130000000"
21+
"20","CC BY 2.1","245000000"
22+
"21","CC BY-NC 2.1","64500000"
23+
"22","CC BY-NC-ND 2.1","64500000"
24+
"23","CC BY-NC-SA 2.1","28300000"
25+
"24","CC BY-ND 2.1","244000000"
26+
"25","CC BY-SA 2.1","128000000"
27+
"26","CC BY 2.0","181000000"
28+
"27","CC BY-NC 2.0","42800000"
29+
"28","CC BY-NC-ND 2.0","54400000"
30+
"29","CC BY-NC-SA 2.0","18700000"
31+
"30","CC BY-ND 2.0","31300000"
32+
"31","CC BY-ND-NC 2.0","15000000"
33+
"32","CC BY-SA 2.0","84100000"
34+
"33","CC BY 1.0","231000000"
35+
"34","CC BY-NC 1.0","53400000"
36+
"35","CC BY-NC-SA 1.0","22300000"
37+
"36","CC BY-ND 1.0","233000000"
38+
"37","CC BY-ND-NC 1.0","16200000"
39+
"38","CC BY-SA 1.0","102000000"
40+
"39","CC BY 3.0 AM","139000000"
41+
"40","CC BY-NC 3.0 AM","50400000"
42+
"41","CC BY-NC-ND 3.0 AM","50400000"
43+
"42","CC BY-NC-SA 3.0 AM","21000000"
44+
"43","CC BY-ND 3.0 AM","159000000"
45+
"44","CC BY-SA 3.0 AM","80500000"
46+
"45","CC BY 3.0 AT","793000000"
47+
"46","CC BY-NC 3.0 AT","229000000"
48+
"47","CC BY-NC-ND 3.0 AT","102000000"
49+
"48","CC BY-NC-SA 3.0 AT","128000000"
50+
"49","CC BY-ND 3.0 AT","279000000"
51+
"50","CC BY-SA 3.0 AT","344000000"
52+
"51","CC BY 3.0 AU","203000000"
53+
"52","CC BY-NC 3.0 AU","59900000"
54+
"53","CC BY-NC-ND 3.0 AU","17400000"
55+
"54","CC BY-NC-SA 3.0 AU","26100000"
56+
"55","CC BY-ND 3.0 AU","54000000"
57+
"56","CC BY-SA 3.0 AU","129000000"
58+
"57","CC BY 3.0 AZ","21400000"
59+
"58","CC BY-NC 3.0 AZ","6190000"
60+
"59","CC BY-NC-ND 3.0 AZ","6180000"
61+
"60","CC BY-NC-SA 3.0 AZ","741000"
62+
"61","CC BY-ND 3.0 AZ","24500000"
63+
"62","CC BY-SA 3.0 AZ","9670000"
64+
"63","CC BY 3.0 BR","55000000"
65+
"64","CC BY-NC 3.0 BR","15700000"
66+
"65","CC BY-NC-ND 3.0 BR","15600000"
67+
"66","CC BY-NC-SA 3.0 BR","6080000"
68+
"67","CC BY-ND 3.0 BR","67100000"
69+
"68","CC BY-SA 3.0 BR","22300000"
70+
"69","CC BY 3.0 CA","128000000"
71+
"70","CC BY-NC 3.0 CA","37800000"
72+
"71","CC BY-NC-ND 3.0 CA","37800000"
73+
"72","CC BY-NC-SA 3.0 CA","17800000"
74+
"73","CC BY-ND 3.0 CA","150000000"
75+
"74","CC BY-SA 3.0 CA","73700000"
76+
"75","CC BY 3.0 CH","103000000"
77+
"76","CC BY-NC 3.0 CH","26600000"
78+
"77","CC BY-NC-ND 3.0 CH","19200000"
79+
"78","CC BY-NC-SA 3.0 CH","11600000"
80+
"79","CC BY-ND 3.0 CH","122000000"
81+
"80","CC BY-SA 3.0 CH","46300000"
82+
"81","CC BY 3.0 CL","204000000"
83+
"82","CC BY-NC 3.0 CL","58700000"
84+
"83","CC BY-NC-ND 3.0 CL","13900000"
85+
"84","CC BY-NC-SA 3.0 CL","28800000"
86+
"85","CC BY-ND 3.0 CL","55300000"
87+
"86","CC BY-SA 3.0 CL","123000000"
88+
"87","CC BY 3.0 CN","24800000"
89+
"88","CC BY-NC 3.0 CN","6780000"
90+
"89","CC BY-NC-ND 3.0 CN","6860000"
91+
"90","CC BY-NC-SA 3.0 CN","702000"
92+
"91","CC BY-ND 3.0 CN","29000000"
93+
"92","CC BY-SA 3.0 CN","14100000"
94+
"93","CC BY 3.0 CR","41000000"
95+
"94","CC BY-NC 3.0 CR","10800000"
96+
"95","CC BY-NC-ND 3.0 CR","10700000"
97+
"96","CC BY-NC-SA 3.0 CR","2070000"
98+
"97","CC BY-ND 3.0 CR","48900000"
99+
"98","CC BY-SA 3.0 CR","25100000"
100+
"99","CC BY 3.0 CZ","2470000"
101+
"100","CC BY-NC 3.0 CZ","519000"
102+
"101","CC BY-NC-ND 3.0 CZ","491000"
103+
"102","CC BY-NC-SA 3.0 CZ","196000"
104+
"103","CC BY-ND 3.0 CZ","2540000"
105+
"104","CC BY-SA 3.0 CZ","399000"
106+
"105","CC BY 3.0 DE","579000000"
107+
"106","CC BY-NC 3.0 DE","175000000"
108+
"107","CC BY-NC-ND 3.0 DE","56700000"
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
"PLAN_INDEX","TOOL_IDENTIFIER","LANGUAGE","COUNT"
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
"PLAN_INDEX","TOOL_IDENTIFIER","COUNTRY","COUNT"

data/2024Q4/1-fetch/gcs_fetched.csv

Lines changed: 0 additions & 15 deletions
This file was deleted.

data/2024Q4/state.yaml

Lines changed: 0 additions & 1 deletion
This file was deleted.

data/gcs_language_collection.yaml

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
# Based on:
2+
# https://developers.google.com/custom-search/docs/xml_results_appendices#language-collection-values
3+
#
4+
# Reformatted with ./dev/reformat_collection_values_to_yaml.sh
5+
- language: Arabic
6+
lr: lang_ar
7+
- language: Bulgarian
8+
lr: lang_bg
9+
- language: Catalan
10+
lr: lang_ca
11+
- language: Chinese (Simplified)
12+
lr: lang_zh-CN
13+
- language: Chinese (Traditional)
14+
lr: lang_zh-TW
15+
- language: Croatian
16+
lr: lang_hr
17+
- language: Czech
18+
lr: lang_cs
19+
- language: Danish
20+
lr: lang_da
21+
- language: Dutch
22+
lr: lang_nl
23+
- language: English
24+
lr: lang_en
25+
- language: Estonian
26+
lr: lang_et
27+
- language: Finnish
28+
lr: lang_fi
29+
- language: French
30+
lr: lang_fr
31+
- language: German
32+
lr: lang_de
33+
- language: Greek
34+
lr: lang_el
35+
- language: Hebrew
36+
lr: lang_iw
37+
- language: Hungarian
38+
lr: lang_hu
39+
- language: Icelandic
40+
lr: lang_is
41+
- language: Indonesian
42+
lr: lang_id
43+
- language: Italian
44+
lr: lang_it
45+
- language: Japanese
46+
lr: lang_ja
47+
- language: Korean
48+
lr: lang_ko
49+
- language: Latvian
50+
lr: lang_lv
51+
- language: Lithuanian
52+
lr: lang_lt
53+
- language: Norwegian
54+
lr: lang_no
55+
- language: Polish
56+
lr: lang_pl
57+
- language: Portuguese
58+
lr: lang_pt
59+
- language: Romanian
60+
lr: lang_ro
61+
- language: Russian
62+
lr: lang_ru
63+
- language: Serbian
64+
lr: lang_sr
65+
- language: Slovak
66+
lr: lang_sk
67+
- language: Slovenian
68+
lr: lang_sl
69+
- language: Spanish
70+
lr: lang_es
71+
- language: Swedish
72+
lr: lang_sv
73+
- language: Turkish
74+
lr: lang_tr

0 commit comments

Comments
 (0)