
Commit d97898c

Fixed database extraction for MediaWiki update.
Allowed changing the language of the download. Page pruning temporarily disabled.
1 parent 17559df commit d97898c

7 files changed: +279 −64 lines

scripts/README.md

Lines changed: 80 additions & 0 deletions
@@ -0,0 +1,80 @@
# Description of the process

## Parsing of the tables

### links.txt
- `pl_from` -> Id of the "from" page of this link
- (`pl_namespace`) -> We keep the row only if it equals 0 (= namespace of the "from" page of this link)
- `pl_target_id` -> Target of this link (foreign key to `linktarget`)
### targets.txt
- `lt_id` -> Id of this link target (index)
- (`lt_ns`) -> We keep the row only if it equals 0 (= namespace of the targeted page)
- `lt_title` -> Title of the targeted page
### pages.txt
- `page_id` -> Id of the page
- (`page_namespace`) -> We keep the row only if it equals 0 (= namespace of this page)
- `page_title` -> Title of this page
- `page_is_redirect` -> Boolean: whether this page is a redirect
- We ignore the eight columns that follow
### redirects.txt
- `rd_from` -> Id of the page from which we are redirected
- (`rd_namespace`) -> We keep the row only if it equals 0 (= namespace of the page we are redirected to)
- `rd_title` -> Title of the page we are redirected to
- We ignore the two columns that follow
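All four trimmed files are gzip-compressed, tab-separated text. As a rough illustration of their shape (not part of the committed scripts; the file name and variable names are only examples), reading `pages.txt.gz` into a title-to-id dictionary could look like this:

```python
import gzip

# pages.txt.gz rows: page_id <TAB> page_title <TAB> is_redirect
page_id_by_title = {}
with gzip.open('pages.txt.gz', 'rb') as pages_file:
    for line in pages_file:
        page_id, page_title, is_redirect = line.rstrip(b'\n').split(b'\t')
        page_id_by_title[page_title] = int(page_id)

print('loaded', len(page_id_by_title), 'pages')
```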
## Joining the tables

### redirects.with_ids.txt (replace_titles_in_redirects_file.py)
Replaces, for each redirection, `rd_title` with the targeted `page_id` by matching on `page_title`.
The targeted `page_id` is then resolved through redirects recursively, until we reach a "final" page (see the sketch below).
- `rd_from` -> The id of the page we are redirected from
- `page_id` -> The id of the page we reach after following redirections recursively
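The recursive resolution can be pictured with a small sketch (illustrative only; the real logic lives in `replace_titles_in_redirects_file.py`, and the cap on chain length is an assumption):

```python
# redirects maps a redirecting page id to the id of the page it points to.
def resolve_redirect(page_id, redirects, max_depth=100):
    # Follow the chain until we land on a page that is not itself a redirect.
    depth = 0
    while page_id in redirects and depth < max_depth:
        page_id = redirects[page_id]
        depth += 1
    return page_id

redirects = {2: 3, 3: 7}  # toy chain: 2 -> 3 -> 7
assert resolve_redirect(2, redirects) == 7
```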
### targets.with_ids.txt (replace_titles_and_redirects_in_targets_file.py)
Replaces, for each linktarget, `lt_title` with the targeted `page_id` by matching on `page_title`.
We then compute the "final" page reached from that page by following redirections, using the file `redirects.with_ids.txt`.
- `lt_id` -> Id of this link target
- `page_id` -> The id of the page this link points to, after all redirections have been followed
### links.with_ids.txt (replace_titles_and_redirects_in_links_file.py)
Replaces, for each pagelink, `lt_id` with the targeted `page_id` by joining with `targets.with_ids.txt` (see the sketch below).
- `pl_from` -> Id of the "from" page, after all redirections have been followed
- `page_id` -> Id of the "to" page, after all redirections have been followed
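A simplified sketch of this join, assuming the tab-separated layouts described above (the committed script also consults `pages.txt.gz` and `redirects.with_ids.txt.gz`, which is omitted here):

```python
import gzip

# targets.with_ids.txt.gz rows: lt_id <TAB> page_id (already redirect-resolved)
target_page_by_lt_id = {}
with gzip.open('targets.with_ids.txt.gz', 'rb') as targets_file:
    for line in targets_file:
        lt_id, page_id = line.rstrip(b'\n').split(b'\t')
        target_page_by_lt_id[lt_id] = page_id

# links.txt.gz rows: pl_from <TAB> pl_target_id; emit "pl_from <TAB> page_id"
with gzip.open('links.txt.gz', 'rb') as links_file:
    for line in links_file:
        pl_from, pl_target_id = line.rstrip(b'\n').split(b'\t')
        page_id = target_page_by_lt_id.get(pl_target_id)
        if page_id is not None:
            print(pl_from.decode() + '\t' + page_id.decode())
```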
### pages.pruned.txt (prune_pages_file.py)
Prunes the pages file by removing pages which are marked as redirects but have no corresponding entry in the redirects file.
TEMPORARILY DISABLED, as it removed too many pages.
## Sorting, grouping, and counting the links

### links.sorted_by_XXX_id.txt
We then sort `links.with_ids.txt` by its first column (the "source" id) into
the file `links.sorted_by_source_id.txt`, and by its second column (the "target" id)
into the file `links.sorted_by_target_id.txt`.
### links.grouped_by_XXX_id.txt
We then use those two sorted files to *GROUP BY* the links by source and by target (see the sketch below).
The file `links.grouped_by_source_id.txt` looks like this:
- `pl_from` -> Id of the "from" page
- `targets` -> A `|`-separated string of the ids the "from" page targets

The file `links.grouped_by_target_id.txt` looks like this:
- `froms` -> A `|`-separated string of the ids of the pages targeting the "target" page
- `pl_target` -> Id of the "target" page
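Because the input is already sorted on the grouping key, the GROUP BY is a single streaming pass. A small Python sketch of the idea (the pipeline does this step inside `buildDatabase.sh`; the toy rows below are only an example):

```python
from itertools import groupby

# Toy sorted (source_id, target_id) pairs, as links.sorted_by_source_id.txt would provide.
sorted_links = [('1', '5'), ('1', '7'), ('2', '5')]

# Group consecutive rows sharing the same source id and join their targets with '|'.
for source_id, rows in groupby(sorted_links, key=lambda row: row[0]):
    targets = '|'.join(target_id for _, target_id in rows)
    print(source_id + '\t' + targets)
# Prints:
# 1	5|7
# 2	5
```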
### links.with_counts.txt (combine_grouped_links_files.py)
We merge the two `links.grouped_by_XXX_id.txt` files, creating a file with the following columns:
- `page_id` -> The id of the page
- `outgoing_links_count` -> The number of outgoing links from this page
- `incoming_links_count` -> The number of incoming links to this page
- `outgoing_links` -> A `|`-separated string of the ids of the pages this page links to
- `incoming_links` -> A `|`-separated string of the ids of the pages linking to this page
## Making the database
To make the database, we copy the contents of the following three files directly into the corresponding tables (see the sketch below):
- `links.with_counts.txt`
- `pages.pruned.txt`
- `redirects.with_ids.txt`
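The pipeline itself pipes the decompressed files into the sqlite3 CLI via the SQL scripts under `sql/` (see `buildDatabase.sh`). Purely as an illustration of this final step, a Python equivalent for the redirects table might look like the following; the table and column names here are assumptions, not taken from the SQL scripts:

```python
import gzip
import sqlite3

connection = sqlite3.connect('sdow.sqlite')
# Hypothetical schema; the real one is defined in sql/createRedirectsTable.sql.
connection.execute(
    'CREATE TABLE IF NOT EXISTS redirects (source_id INTEGER PRIMARY KEY, target_id INTEGER)')

# redirects.with_ids.txt.gz rows: source_id <TAB> target_id
with gzip.open('redirects.with_ids.txt.gz', 'rt') as redirects_file:
    rows = (line.rstrip('\n').split('\t') for line in redirects_file)
    connection.executemany('INSERT INTO redirects VALUES (?, ?)', rows)

connection.commit()
connection.close()
```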

scripts/buildDatabase.sh

Lines changed: 70 additions & 22 deletions
@@ -1,15 +1,16 @@
 #!/bin/bash
-
 set -euo pipefail

 # Force default language for output sorting to be bytewise. Necessary to ensure uniformity amongst
 # UNIX commands.
 export LC_ALL=C
+PYTHON=python3
+LANGWIKI=frwiki

 # By default, the latest Wikipedia dump will be downloaded. If a download date in the format
 # YYYYMMDD is provided as the first argument, it will be used instead.
 if [[ $# -eq 0 ]]; then
-DOWNLOAD_DATE=$(wget -q -O- https://dumps.wikimedia.your.org/enwiki/ | grep -Po '\d{8}' | sort | tail -n1)
+DOWNLOAD_DATE=$(wget -q -O- https://dumps.wikimedia.org/$LANGWIKI/ | grep -Po '\d{8}' | sort | tail -n1)
 else
 if [ ${#1} -ne 8 ]; then
 echo "[ERROR] Invalid download date provided: $1"
@@ -22,14 +23,16 @@ fi
 ROOT_DIR=`pwd`
 OUT_DIR="dump"

-DOWNLOAD_URL="https://dumps.wikimedia.your.org/enwiki/$DOWNLOAD_DATE"
-TORRENT_URL="https://tools.wmflabs.org/dump-torrents/enwiki/$DOWNLOAD_DATE"
+DELETE_PROGRESSIVELY=false
+DOWNLOAD_URL="https://dumps.wikimedia.org/$LANGWIKI/$DOWNLOAD_DATE"
+TORRENT_URL="https://tools.wmflabs.org/dump-torrents/$LANGWIKI/$DOWNLOAD_DATE"

-SHA1SUM_FILENAME="enwiki-$DOWNLOAD_DATE-sha1sums.txt"
-REDIRECTS_FILENAME="enwiki-$DOWNLOAD_DATE-redirect.sql.gz"
-PAGES_FILENAME="enwiki-$DOWNLOAD_DATE-page.sql.gz"
-LINKS_FILENAME="enwiki-$DOWNLOAD_DATE-pagelinks.sql.gz"
+SHA1SUM_FILENAME="$LANGWIKI-$DOWNLOAD_DATE-sha1sums.txt"

+REDIRECTS_FILENAME="$LANGWIKI-$DOWNLOAD_DATE-redirect.sql.gz"
+PAGES_FILENAME="$LANGWIKI-$DOWNLOAD_DATE-page.sql.gz"
+LINKS_FILENAME="$LANGWIKI-$DOWNLOAD_DATE-pagelinks.sql.gz"
+TARGETS_FILENAME="$LANGWIKI-$DOWNLOAD_DATE-linktarget.sql.gz"

 # Make the output directory if it doesn't already exist and move to it
 mkdir -p $OUT_DIR
@@ -77,6 +80,7 @@ download_file "sha1sums" $SHA1SUM_FILENAME
 download_file "redirects" $REDIRECTS_FILENAME
 download_file "pages" $PAGES_FILENAME
 download_file "links" $LINKS_FILENAME
+download_file "targets" $TARGETS_FILENAME

 ##########################
 # TRIM WIKIPEDIA DUMPS #
@@ -103,7 +107,7 @@ if [ ! -f redirects.txt.gz ]; then
 else
 echo "[WARN] Already trimmed redirects file"
 fi
-
+if $DELETE_PROGRESSIVELY; then rm $REDIRECTS_FILENAME; fi
 if [ ! -f pages.txt.gz ]; then
 echo
 echo "[INFO] Trimming pages file"
@@ -116,16 +120,16 @@ if [ ! -f pages.txt.gz ]; then
 # Splice out the page title and whether or not the page is a redirect
 # Zip into output file
 time pigz -dc $PAGES_FILENAME \
-| sed -n 's/^INSERT INTO `page` VALUES (//p' \
-| sed -e 's/),(/\'$'\n/g' \
-| egrep "^[0-9]+,0," \
-| sed -e $"s/,0,'/\t/" \
-| sed -e $"s/',[^,]*,\([01]\).*/\t\1/" \
+| sed -n 's/^INSERT INTO `page` VALUES //p' \
+| egrep -o "\([0-9]+,0,'([^']*(\\\\')?)+',[01]," \
+| sed -re $"s/^\(([0-9]+),0,'/\1\t/" \
+| sed -re $"s/',([01]),/\t\1/" \
 | pigz --fast > pages.txt.gz.tmp
 mv pages.txt.gz.tmp pages.txt.gz
 else
 echo "[WARN] Already trimmed pages file"
 fi
+if $DELETE_PROGRESSIVELY; then rm $PAGES_FILENAME; fi

 if [ ! -f links.txt.gz ]; then
 echo
@@ -141,14 +145,38 @@ if [ ! -f links.txt.gz ]; then
 time pigz -dc $LINKS_FILENAME \
 | sed -n 's/^INSERT INTO `pagelinks` VALUES (//p' \
 | sed -e 's/),(/\'$'\n/g' \
-| egrep "^[0-9]+,0,.*,0$" \
-| sed -e $"s/,0,'/\t/g" \
-| sed -e "s/',0//g" \
+| egrep "^[0-9]+,0,[0-9]+$" \
+| sed -e $"s/,0,/\t/g" \
 | pigz --fast > links.txt.gz.tmp
 mv links.txt.gz.tmp links.txt.gz
 else
 echo "[WARN] Already trimmed links file"
 fi
+if $DELETE_PROGRESSIVELY; then rm $LINKS_FILENAME; fi
+
+if [ ! -f targets.txt.gz ]; then
+echo
+echo "[INFO] Trimming targets file"
+
+# Unzip
+# Remove all lines that don't start with INSERT INTO...
+# Split into individual records
+# Only keep records in namespace 0
+# Replace namespace with a tab
+# Remove everything starting at the to page name's closing apostrophe
+# Zip into output file
+time pigz -dc $TARGETS_FILENAME \
+| sed -n 's/^INSERT INTO `linktarget` VALUES (//p' \
+| sed -e 's/),(/\'$'\n/g' \
+| egrep "^[0-9]+,0,.*$" \
+| sed -e $"s/,0,'/\t/g" \
+| sed -e "s/'$//g" \
+| pigz --fast > targets.txt.gz.tmp
+mv targets.txt.gz.tmp targets.txt.gz
+else
+echo "[WARN] Already trimmed targets file"
+fi
+if $DELETE_PROGRESSIVELY; then rm $TARGETS_FILENAME; fi


 ###########################################
@@ -157,32 +185,46 @@ fi
 if [ ! -f redirects.with_ids.txt.gz ]; then
 echo
 echo "[INFO] Replacing titles in redirects file"
-time python "$ROOT_DIR/replace_titles_in_redirects_file.py" pages.txt.gz redirects.txt.gz \
+time $PYTHON "$ROOT_DIR/replace_titles_in_redirects_file.py" pages.txt.gz redirects.txt.gz \
 | sort -S 100% -t $'\t' -k 1n,1n \
 | pigz --fast > redirects.with_ids.txt.gz.tmp
 mv redirects.with_ids.txt.gz.tmp redirects.with_ids.txt.gz
 else
 echo "[WARN] Already replaced titles in redirects file"
 fi
+if $DELETE_PROGRESSIVELY; then rm redirects.txt.gz; fi
+
+if [ ! -f targets.with_ids.txt.gz ]; then
+echo
+echo "[INFO] Replacing titles and redirects in targets file"
+time $PYTHON "$ROOT_DIR/replace_titles_and_redirects_in_targets_file.py" pages.txt.gz redirects.with_ids.txt.gz targets.txt.gz \
+| pigz --fast > targets.with_ids.txt.gz.tmp
+mv targets.with_ids.txt.gz.tmp targets.with_ids.txt.gz
+else
+echo "[WARN] Already replaced titles and redirects in targets file"
+fi
+if $DELETE_PROGRESSIVELY; then rm targets.txt.gz; fi

 if [ ! -f links.with_ids.txt.gz ]; then
 echo
 echo "[INFO] Replacing titles and redirects in links file"
-time python "$ROOT_DIR/replace_titles_and_redirects_in_links_file.py" pages.txt.gz redirects.with_ids.txt.gz links.txt.gz \
+time $PYTHON "$ROOT_DIR/replace_titles_and_redirects_in_links_file.py" pages.txt.gz redirects.with_ids.txt.gz targets.with_ids.txt.gz links.txt.gz \
 | pigz --fast > links.with_ids.txt.gz.tmp
 mv links.with_ids.txt.gz.tmp links.with_ids.txt.gz
 else
 echo "[WARN] Already replaced titles and redirects in links file"
 fi
+if $DELETE_PROGRESSIVELY; then rm links.txt.gz targets.with_ids.txt.gz; fi

 if [ ! -f pages.pruned.txt.gz ]; then
 echo
 echo "[INFO] Pruning pages which are marked as redirects but with no redirect"
-time python "$ROOT_DIR/prune_pages_file.py" pages.txt.gz redirects.with_ids.txt.gz \
+time $PYTHON "$ROOT_DIR/prune_pages_file.py" pages.txt.gz redirects.with_ids.txt.gz \
 | pigz --fast > pages.pruned.txt.gz
 else
 echo "[WARN] Already pruned pages which are marked as redirects but with no redirect"
 fi
+if $DELETE_PROGRESSIVELY; then rm pages.txt.gz; fi

 #####################
 # SORT LINKS FILE #
@@ -210,6 +252,7 @@ if [ ! -f links.sorted_by_target_id.txt.gz ]; then
 else
 echo "[WARN] Already sorted links file by target page ID"
 fi
+if $DELETE_PROGRESSIVELY; then rm links.with_ids.txt.gz; fi


 #############################
@@ -225,6 +268,7 @@ if [ ! -f links.grouped_by_source_id.txt.gz ]; then
 else
 echo "[WARN] Already grouped source links file by source page ID"
 fi
+if $DELETE_PROGRESSIVELY; then rm links.sorted_by_source_id.txt.gz; fi

 if [ ! -f links.grouped_by_target_id.txt.gz ]; then
 echo
@@ -235,6 +279,7 @@ if [ ! -f links.grouped_by_target_id.txt.gz ]; then
 else
 echo "[WARN] Already grouped target links file by target page ID"
 fi
+if $DELETE_PROGRESSIVELY; then rm links.sorted_by_target_id.txt.gz; fi


 ################################
@@ -243,12 +288,13 @@ fi
 if [ ! -f links.with_counts.txt.gz ]; then
 echo
 echo "[INFO] Combining grouped links files"
-time python "$ROOT_DIR/combine_grouped_links_files.py" links.grouped_by_source_id.txt.gz links.grouped_by_target_id.txt.gz \
+time $PYTHON "$ROOT_DIR/combine_grouped_links_files.py" links.grouped_by_source_id.txt.gz links.grouped_by_target_id.txt.gz \
 | pigz --fast > links.with_counts.txt.gz.tmp
 mv links.with_counts.txt.gz.tmp links.with_counts.txt.gz
 else
 echo "[WARN] Already combined grouped links files"
 fi
+if $DELETE_PROGRESSIVELY; then rm links.grouped_by_source_id.txt.gz links.grouped_by_target_id.txt.gz; fi


 ############################
@@ -258,14 +304,17 @@ if [ ! -f sdow.sqlite ]; then
 echo
 echo "[INFO] Creating redirects table"
 time pigz -dc redirects.with_ids.txt.gz | sqlite3 sdow.sqlite ".read $ROOT_DIR/../sql/createRedirectsTable.sql"
+if $DELETE_PROGRESSIVELY; then rm redirects.with_ids.txt.gz; fi

 echo
 echo "[INFO] Creating pages table"
 time pigz -dc pages.pruned.txt.gz | sqlite3 sdow.sqlite ".read $ROOT_DIR/../sql/createPagesTable.sql"
+if $DELETE_PROGRESSIVELY; then rm pages.pruned.txt.gz; fi

 echo
 echo "[INFO] Creating links table"
 time pigz -dc links.with_counts.txt.gz | sqlite3 sdow.sqlite ".read $ROOT_DIR/../sql/createLinksTable.sql"
+if $DELETE_PROGRESSIVELY; then rm links.with_counts.txt.gz; fi

 echo
 echo "[INFO] Compressing SQLite file"
@@ -274,6 +323,5 @@ else
 echo "[WARN] Already created SQLite database"
 fi

-
 echo
 echo "[INFO] All done!"

scripts/combine_grouped_links_files.py

Lines changed: 17 additions & 16 deletions
@@ -28,26 +28,27 @@
 # Create a dictionary of page IDs to their incoming and outgoing links.
 LINKS = defaultdict(lambda: defaultdict(str))
-for line in io.BufferedReader(gzip.open(OUTGOING_LINKS_FILE, 'r')):
-    [source_page_id, target_page_ids] = line.rstrip('\n').split('\t')
-    LINKS[source_page_id]['outgoing'] = target_page_ids
+# outgoing is [0], incoming is [1]
+for line in io.BufferedReader(gzip.open(OUTGOING_LINKS_FILE, 'rb')):
+    [source_page_id, target_page_ids] = line.rstrip(b'\n').split(b'\t')
+    LINKS[int(source_page_id)][0] = target_page_ids

-for line in io.BufferedReader(gzip.open(INCOMING_LINKS_FILE, 'r')):
-    [target_page_id, source_page_ids] = line.rstrip('\n').split('\t')
-    LINKS[target_page_id]['incoming'] = source_page_ids
+for line in io.BufferedReader(gzip.open(INCOMING_LINKS_FILE, 'rb')):
+    [target_page_id, source_page_ids] = line.rstrip(b'\n').split(b'\t')
+    LINKS[int(target_page_id)][1] = source_page_ids

 # For each page in the links dictionary, print out its incoming and outgoing links as well as their
 # counts.
-for page_id, links in LINKS.iteritems():
-    outgoing_links = links.get('outgoing', '')
-    outgoing_links_count = 0 if outgoing_links is '' else len(
-        outgoing_links.split('|'))
+for page_id, links in LINKS.items():
+    outgoing_links = links.get(0, b'')
+    outgoing_links_count = 0 if outgoing_links==b'' else len(
+        outgoing_links.split(b'|'))

-    incoming_links = links.get('incoming', '')
-    incoming_links_count = 0 if incoming_links is '' else len(
-        incoming_links.split('|'))
+    incoming_links = links.get(1, b'')
+    incoming_links_count = 0 if incoming_links==b'' else len(
+        incoming_links.split(b'|'))

-    columns = [page_id, str(outgoing_links_count), str(
-        incoming_links_count), outgoing_links, incoming_links]
+    columns = [str(page_id).encode(), str(outgoing_links_count).encode(), str(
+        incoming_links_count).encode(), outgoing_links, incoming_links]

-    print('\t'.join(columns))
+    print(b'\t'.join(columns).decode())

scripts/prune_pages_file.py

Lines changed: 6 additions & 6 deletions
@@ -28,14 +28,14 @@
 # Create a dictionary of redirects.
 REDIRECTS = {}
-for line in io.BufferedReader(gzip.open(REDIRECTS_FILE, 'r')):
-    [source_page_id, _] = line.rstrip('\n').split('\t')
+for line in io.BufferedReader(gzip.open(REDIRECTS_FILE, 'rb')):
+    [source_page_id, _] = line.rstrip(b'\n').split(b'\t')
     REDIRECTS[source_page_id] = True

 # Loop through the pages file, ignoring pages which are marked as redirects but which do not have a
 # corresponding redirect in the redirects dictionary, printing the remaining pages to stdout.
-for line in io.BufferedReader(gzip.open(PAGES_FILE, 'r')):
-    [page_id, page_title, is_redirect] = line.rstrip('\n').split('\t')
+for line in io.BufferedReader(gzip.open(PAGES_FILE, 'rb')):
+    [page_id, page_title, is_redirect] = line.rstrip(b'\n').split(b'\t')

-    if is_redirect == '0' or page_id in REDIRECTS:
-        print('\t'.join([page_id, page_title, is_redirect]))
+    if True or is_redirect == '0' or page_id in REDIRECTS:
+        print(b'\t'.join([page_id, page_title, is_redirect]).decode())
