#!/bin/bash
set -euo pipefail

# Force default language for output sorting to be bytewise. Necessary to ensure uniformity amongst
# UNIX commands.
export LC_ALL=C

# Interpreter used for the helper scripts, and which Wikipedia edition's dump
# to process (e.g. enwiki, frwiki).
PYTHON=python3
LANGWIKI=frwiki
910# By default, the latest Wikipedia dump will be downloaded. If a download date in the format
1011# YYYYMMDD is provided as the first argument, it will be used instead.
1112if [[ $# -eq 0 ]]; then
12- DOWNLOAD_DATE=$( wget -q -O- https://dumps.wikimedia.your. org/enwiki / | grep -Po ' \d{8}' | sort | tail -n1)
13+ DOWNLOAD_DATE=$( wget -q -O- https://dumps.wikimedia.org/$LANGWIKI / | grep -Po ' \d{8}' | sort | tail -n1)
1314else
1415 if [ ${# 1} -ne 8 ]; then
1516 echo " [ERROR] Invalid download date provided: $1 "
ROOT_DIR=$(pwd)
OUT_DIR="dump"

# When true, each intermediate file is deleted as soon as no later step needs
# it, keeping peak disk usage down (at the cost of not being able to resume).
DELETE_PROGRESSIVELY=false

DOWNLOAD_URL="https://dumps.wikimedia.org/$LANGWIKI/$DOWNLOAD_DATE"
TORRENT_URL="https://tools.wmflabs.org/dump-torrents/$LANGWIKI/$DOWNLOAD_DATE"

SHA1SUM_FILENAME="$LANGWIKI-$DOWNLOAD_DATE-sha1sums.txt"

REDIRECTS_FILENAME="$LANGWIKI-$DOWNLOAD_DATE-redirect.sql.gz"
PAGES_FILENAME="$LANGWIKI-$DOWNLOAD_DATE-page.sql.gz"
LINKS_FILENAME="$LANGWIKI-$DOWNLOAD_DATE-pagelinks.sql.gz"
TARGETS_FILENAME="$LANGWIKI-$DOWNLOAD_DATE-linktarget.sql.gz"

# Make the output directory if it doesn't already exist and move to it
mkdir -p "$OUT_DIR"
# Fetch each dump file (download_file is defined earlier in this script).
download_file "sha1sums" "$SHA1SUM_FILENAME"
download_file "redirects" "$REDIRECTS_FILENAME"
download_file "pages" "$PAGES_FILENAME"
download_file "links" "$LINKS_FILENAME"
download_file "targets" "$TARGETS_FILENAME"


##########################
#  TRIM WIKIPEDIA DUMPS  #
##########################
@@ -103,7 +107,7 @@ if [ ! -f redirects.txt.gz ]; then
103107else
104108 echo " [WARN] Already trimmed redirects file"
105109fi

# Reclaim disk space as soon as the raw redirects dump is no longer needed.
# -f so a re-run (where the file is already gone) doesn't abort under `set -e`.
if $DELETE_PROGRESSIVELY; then rm -f -- "$REDIRECTS_FILENAME"; fi
if [ ! -f pages.txt.gz ]; then
  echo
  echo "[INFO] Trimming pages file"

  # Unzip
  # Keep only the INSERT INTO statements
  # Extract each namespace-0 record, tolerating escaped quotes (\') in titles
  # Splice out the page ID, title, and whether or not the page is a redirect,
  # tab-separated
  # Zip into output file (write to a .tmp first so an interrupted run can't
  # leave a truncated pages.txt.gz behind)
  time pigz -dc "$PAGES_FILENAME" \
    | sed -n 's/^INSERT INTO `page` VALUES //p' \
    | egrep -o "\([0-9]+,0,'([^']*(\\\\')?)+',[01]," \
    | sed -re $"s/^\(([0-9]+),0,'/\1\t/" \
    | sed -re $"s/',([01]),/\t\1/" \
    | pigz --fast > pages.txt.gz.tmp
  mv pages.txt.gz.tmp pages.txt.gz
else
  echo "[WARN] Already trimmed pages file"
fi
if $DELETE_PROGRESSIVELY; then rm -f -- "$PAGES_FILENAME"; fi
if [ ! -f links.txt.gz ]; then
  echo
  echo "[INFO] Trimming links file"

  # Unzip
  # Keep only the INSERT INTO statements
  # Split the statement into individual records, one per line
  # Only keep records in namespace 0 (source page ID, 0, link target ID)
  # Replace the namespace with a tab
  # Zip into output file
  time pigz -dc "$LINKS_FILENAME" \
    | sed -n 's/^INSERT INTO `pagelinks` VALUES (//p' \
    | sed -e 's/),(/\'$'\n/g' \
    | egrep "^[0-9]+,0,[0-9]+$" \
    | sed -e $"s/,0,/\t/g" \
    | pigz --fast > links.txt.gz.tmp
  mv links.txt.gz.tmp links.txt.gz
else
  echo "[WARN] Already trimmed links file"
fi
if $DELETE_PROGRESSIVELY; then rm -f -- "$LINKS_FILENAME"; fi

if [ ! -f targets.txt.gz ]; then
  echo
  echo "[INFO] Trimming targets file"

  # Unzip
  # Remove all lines that don't start with INSERT INTO...
  # Split into individual records
  # Only keep records in namespace 0
  # Replace namespace with a tab
  # Remove everything starting at the to page name's closing apostrophe
  # Zip into output file
  time pigz -dc "$TARGETS_FILENAME" \
    | sed -n 's/^INSERT INTO `linktarget` VALUES (//p' \
    | sed -e 's/),(/\'$'\n/g' \
    | egrep "^[0-9]+,0,.*$" \
    | sed -e $"s/,0,'/\t/g" \
    | sed -e "s/'$//g" \
    | pigz --fast > targets.txt.gz.tmp
  mv targets.txt.gz.tmp targets.txt.gz
else
  echo "[WARN] Already trimmed targets file"
fi
if $DELETE_PROGRESSIVELY; then rm -f -- "$TARGETS_FILENAME"; fi


###########################################
#  REPLACE TITLES WITH PAGE IDS           #
###########################################

if [ ! -f redirects.with_ids.txt.gz ]; then
  echo
  echo "[INFO] Replacing titles in redirects file"
  # Resolve redirect target titles to page IDs, then sort numerically by
  # source page ID so later steps can merge-join on it.
  time $PYTHON "$ROOT_DIR/replace_titles_in_redirects_file.py" pages.txt.gz redirects.txt.gz \
    | sort -S 100% -t $'\t' -k 1n,1n \
    | pigz --fast > redirects.with_ids.txt.gz.tmp
  mv redirects.with_ids.txt.gz.tmp redirects.with_ids.txt.gz
else
  echo "[WARN] Already replaced titles in redirects file"
fi
if $DELETE_PROGRESSIVELY; then rm -f -- redirects.txt.gz; fi
196+
if [ ! -f targets.with_ids.txt.gz ]; then
  echo
  echo "[INFO] Replacing titles and redirects in targets file"
  time $PYTHON "$ROOT_DIR/replace_titles_and_redirects_in_targets_file.py" pages.txt.gz redirects.with_ids.txt.gz targets.txt.gz \
    | pigz --fast > targets.with_ids.txt.gz.tmp
  mv targets.with_ids.txt.gz.tmp targets.with_ids.txt.gz
else
  echo "[WARN] Already replaced titles and redirects in targets file"
fi
if $DELETE_PROGRESSIVELY; then rm -f -- targets.txt.gz; fi
167207
if [ ! -f links.with_ids.txt.gz ]; then
  echo
  echo "[INFO] Replacing titles and redirects in links file"
  time $PYTHON "$ROOT_DIR/replace_titles_and_redirects_in_links_file.py" pages.txt.gz redirects.with_ids.txt.gz targets.with_ids.txt.gz links.txt.gz \
    | pigz --fast > links.with_ids.txt.gz.tmp
  mv links.with_ids.txt.gz.tmp links.with_ids.txt.gz
else
  echo "[WARN] Already replaced titles and redirects in links file"
fi
if $DELETE_PROGRESSIVELY; then rm -f -- links.txt.gz targets.with_ids.txt.gz; fi
177218
if [ ! -f pages.pruned.txt.gz ]; then
  echo
  echo "[INFO] Pruning pages which are marked as redirects but with no redirect"
  # Write to a .tmp file and mv into place, consistent with every other step,
  # so an interrupted run can't leave a truncated output that blocks re-runs.
  time $PYTHON "$ROOT_DIR/prune_pages_file.py" pages.txt.gz redirects.with_ids.txt.gz \
    | pigz --fast > pages.pruned.txt.gz.tmp
  mv pages.pruned.txt.gz.tmp pages.pruned.txt.gz
else
  echo "[WARN] Already pruned pages which are marked as redirects but with no redirect"
fi
if $DELETE_PROGRESSIVELY; then rm -f -- pages.txt.gz; fi
186228
187229# ####################
188230# SORT LINKS FILE #
@@ -210,6 +252,7 @@ if [ ! -f links.sorted_by_target_id.txt.gz ]; then
210252else
211253 echo " [WARN] Already sorted links file by target page ID"
212254fi
# Sorted copies supersede links.with_ids.txt.gz; -f keeps re-runs from aborting.
if $DELETE_PROGRESSIVELY; then rm -f -- links.with_ids.txt.gz; fi
213256
214257
#############################
@@ -225,6 +268,7 @@ if [ ! -f links.grouped_by_source_id.txt.gz ]; then
225268else
226269 echo " [WARN] Already grouped source links file by source page ID"
227270fi
# The grouped file supersedes the source-sorted file; -f keeps re-runs from aborting.
if $DELETE_PROGRESSIVELY; then rm -f -- links.sorted_by_source_id.txt.gz; fi
228272
229273if [ ! -f links.grouped_by_target_id.txt.gz ]; then
230274 echo
@@ -235,6 +279,7 @@ if [ ! -f links.grouped_by_target_id.txt.gz ]; then
235279else
236280 echo " [WARN] Already grouped target links file by target page ID"
237281fi
# The grouped file supersedes the target-sorted file; -f keeps re-runs from aborting.
if $DELETE_PROGRESSIVELY; then rm -f -- links.sorted_by_target_id.txt.gz; fi
238283

#################################
#  COMBINE GROUPED LINKS FILES  #
#################################

if [ ! -f links.with_counts.txt.gz ]; then
  echo
  echo "[INFO] Combining grouped links files"
  time $PYTHON "$ROOT_DIR/combine_grouped_links_files.py" links.grouped_by_source_id.txt.gz links.grouped_by_target_id.txt.gz \
    | pigz --fast > links.with_counts.txt.gz.tmp
  mv links.with_counts.txt.gz.tmp links.with_counts.txt.gz
else
  echo "[WARN] Already combined grouped links files"
fi
if $DELETE_PROGRESSIVELY; then rm -f -- links.grouped_by_source_id.txt.gz links.grouped_by_target_id.txt.gz; fi
252298
253299
############################
@@ -258,14 +304,17 @@ if [ ! -f sdow.sqlite ]; then
258304 echo
259305 echo " [INFO] Creating redirects table"
260306 time pigz -dc redirects.with_ids.txt.gz | sqlite3 sdow.sqlite " .read $ROOT_DIR /../sql/createRedirectsTable.sql"
307+ if $DELETE_PROGRESSIVELY ; then rm redirects.with_ids.txt.gz; fi
261308
262309 echo
263310 echo " [INFO] Creating pages table"
264311 time pigz -dc pages.pruned.txt.gz | sqlite3 sdow.sqlite " .read $ROOT_DIR /../sql/createPagesTable.sql"
312+ if $DELETE_PROGRESSIVELY ; then rm pages.pruned.txt.gz; fi
265313
266314 echo
267315 echo " [INFO] Creating links table"
268316 time pigz -dc links.with_counts.txt.gz | sqlite3 sdow.sqlite " .read $ROOT_DIR /../sql/createLinksTable.sql"
317+ if $DELETE_PROGRESSIVELY ; then rm links.with_counts.txt.gz; fi
269318
270319 echo
271320 echo " [INFO] Compressing SQLite file"
274323 echo " [WARN] Already created SQLite database"
275324fi
276325

echo
echo "[INFO] All done!"