diff --git a/Dockerfile b/Dockerfile
index a78e2aa..7d9a4be 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,28 +1,44 @@
-FROM debian:stable
-
-ENV DEBIAN_FRONTEND=noninteractive
-
-RUN apt update
-RUN apt -y install --no-install-recommends git ca-certificates curl wget apt-utils
-
-# install:
-# - node and yarn
-# - go-ipfs
-RUN curl -sL https://deb.nodesource.com/setup_14.x -o nodesource_setup.sh \
-    && bash nodesource_setup.sh \
-    && apt -y install --no-install-recommends nodejs \
-    && npm install -g yarn \
-    && wget -nv https://dist.ipfs.io/go-ipfs/v0.8.0/go-ipfs_v0.8.0_linux-amd64.tar.gz \
-    && tar xvfz go-ipfs_v0.8.0_linux-amd64.tar.gz \
-    && mv go-ipfs/ipfs /usr/local/bin/ipfs \
-    && rm -r go-ipfs && rm go-ipfs_v0.8.0_linux-amd64.tar.gz \
-    && ipfs init -p server,local-discovery,flatfs,randomports --empty-repo \
-    && ipfs config --json 'Experimental.ShardingEnabled' true
-
-# TODO: move repo init after external volume is mounted
-
-ENV DEBIAN_FRONTEND=dialog
-
-RUN mkdir /root/distributed-wikipedia-mirror
-VOLUME ["/root/distributed-wikipedia-mirror"]
-WORKDIR /root/distributed-wikipedia-mirror
+# This Dockerfile creates a self-contained image in which mirrorzim.sh can be executed.
+# It also runs ipfs daemon.
+#
+# You can build the image as follows (remember to use this repo as context for the build):
+#     docker build . --platform=linux/amd64 -f Dockerfile -t distributed-wikipedia-mirror
+#
+# You can then run the container anywhere as follows:
+#     docker run -v $(pwd)/tmp:/root/tmp --ulimit nofile=65536:65536 -p 4001:4001/tcp -p 4001:4001/udp distributed-wikipedia-mirror
+
+FROM stedolan/jq:latest AS jq
+FROM openzim/zim-tools:3.1.0 AS zimdump
+FROM ipfs/go-ipfs:v0.12.0 AS ipfs
+FROM node:16
+
+# if false, ipfs daemon will not be started
+ENV IPFS_DAEMON_ENABLED=true
+
+# update, install and clean up in a single layer so the apt lists never end up in the image
+RUN apt-get update \
+    && apt-get install --no-install-recommends --assume-yes \
+        moreutils \
+        rsync \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY --from=jq /usr/local/bin/jq /usr/local/bin/
+COPY --from=zimdump /usr/local/bin/zimdump /usr/local/bin/
+COPY --from=ipfs /usr/local/bin/ipfs /usr/local/bin/
+
+COPY assets /root/assets
+COPY bin /root/bin
+COPY src /root/src
+COPY tools /root/tools
+COPY mirrorzim.sh package.json tsconfig.json /root/
+
+RUN mkdir /root/snapshots /root/tmp
+
+# WORKDIR instead of `cd`: yarn installs next to package.json and the container starts in /root
+WORKDIR /root
+RUN yarn
+
+EXPOSE 4001/tcp
+EXPOSE 4001/udp
+
+ENTRYPOINT [ "/root/tools/entrypoint.sh" ]
diff --git a/README.md b/README.md
index 09dbfce..639838e 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@ Putting Wikipedia Snapshots on IPFS and working towards making it fully read-wri
 
 ## Existing Mirrors
 
-There are various ways one can access the mirrors: through a [DNSLink](https://docs.ipfs.io/concepts/glossary/#dnslink), public [gateway](https://docs.ipfs.io/concepts/glossary/#gateway) or directly with a [CID](https://docs.ipfs.io/concepts/glossary/#cid).
+There are various ways one can access the mirrors: through a [DNSLink](https://docs.ipfs.io/concepts/glossary/#dnslink), public [gateway](https://docs.ipfs.io/concepts/glossary/#gateway) or directly with a [CID](https://docs.ipfs.io/concepts/glossary/#cid).
 
 You can [read all about the available methods here](https://blog.ipfs.io/2021-05-31-distributed-wikipedia-mirror-update/#improved-access-to-wikipedia-mirrors).
 
@@ -274,21 +274,24 @@ $ ./mirrorzim.sh --languagecode=cu --wikitype=wikipedia --hostingdnsdomain=cu.wi
 ## Docker build
 
 A `Dockerfile` with all the software requirements is provided.
-For now it is only a handy container for running the process on non-Linux
-systems or if you don't want to pollute your system with all the dependencies.
-In the future it will be end-to-end blackbox that takes ZIM and spits out CID
-and repo.
+It is a handy container for running the process on non-Linux systems, if you don't want to pollute your system with all the dependencies or if you want to run the process in the cloud.
+It is an end-to-end blackbox that takes mirrorzim.sh arguments, spits out CID and runs IPFS daemon.
 
 To build the docker image:
 
 ```sh
-docker build . -t distributed-wikipedia-mirror-build
+docker build . --platform=linux/amd64 -f Dockerfile -t distributed-wikipedia-mirror
 ```
 
-To use it as a development environment:
+And then, to run it:
 
 ```sh
-docker run -it -v $(pwd):/root/distributed-wikipedia-mirror --net=host --entrypoint bash distributed-wikipedia-mirror-build
+docker run -v $(pwd)/tmp:/root/tmp --ulimit nofile=65536:65536 -p 4001:4001/tcp -p 4001:4001/udp distributed-wikipedia-mirror
+```
+
+If you don't care for hosting the mirror out of the Docker container, you can run this instead:
+```sh
+docker run -e IPFS_DAEMON_ENABLED=false -v $(pwd)/tmp:/root/tmp --ulimit nofile=65536:65536 -p 4001:4001/tcp -p 4001:4001/udp distributed-wikipedia-mirror
 ```
 
 # How to Help
diff --git a/mirrorzim.sh b/mirrorzim.sh
index f065e70..4e3ec62 100755
--- a/mirrorzim.sh
+++ b/mirrorzim.sh
@@ -11,19 +11,24 @@ usage() {
     echo ""
     echo "SYNOPSIS"
     echo " $0 --languagecode= --wikitype="
+    echo " [--tag=]"
+    echo " [--edition=]"
+    echo " [--date=]"
     echo " [--hostingdnsdomain=]"
     echo " [--hostingipnshash=]"
     echo " [--mainpageversion=]"
     echo ""
     echo "OPTIONS"
     echo ""
-    echo " -l, --languagecode string - the language of the wikimedia property e.g. tr - turkish, en - english"
-    echo " -w, --wikitype string - the type of the wikimedia property e.g. wikipedia, wikiquote"
-    echo " -d, --hostingdnsdomain string - the DNS domain name the mirror will be hosted at e.g. tr.wikipedia-on-ipfs.org"
-    echo " -i, --hostingipnshash string - the IPNS hash the mirror will be hosted at e.g. QmVH1VzGBydSfmNG7rmdDjAeBZ71UVeEahVbNpFQtwZK8W"
-    echo " -v, --mainpageversion string - an override hack used on Turkish Wikipedia, it sets the main page version as there are issues with the Kiwix version id"
-
-    exit 2
+    echo " -l, --languagecode string - the language of the wikimedia property e.g. tr - turkish, en - english"
+    echo " -w, --wikitype string - the type of the wikimedia property e.g. wikipedia, wikiquote"
+    echo " -t, --tag string - the tag of the wikimedia property e.g. all, top (defaults to all)"
+    echo " -e, --edition string - the edition of the wikimedia property e.g. maxi, mini (defaults to maxi)"
+    echo " -c, --date string - the date of the wikimedia property e.g. latest (defaults to latest)"
+    echo " -d, --hostingdnsdomain string - the DNS domain name the mirror will be hosted at e.g. tr.wikipedia-on-ipfs.org"
+    echo " -i, --hostingipnshash string - the IPNS hash the mirror will be hosted at e.g. QmVH1VzGBydSfmNG7rmdDjAeBZ71UVeEahVbNpFQtwZK8W"
+    echo " -v, --mainpageversion string - an override hack used on Turkish Wikipedia, it sets the main page version as there are issues with the Kiwix version id"
+    exit 2
 }
 
 
@@ -38,6 +43,18 @@ case $i in
     WIKI_TYPE="${i#*=}"
     shift
     ;;
+    -t=*|--tag=*)
+    TAG="${i#*=}"
+    shift
+    ;;
+    -e=*|--edition=*)
+    EDITION="${i#*=}"
+    shift
+    ;;
+    -c=*|--date=*)
+    DATE="${i#*=}"
+    shift
+    ;;
     -d=*|--hostingdnsdomain=*)
     HOSTING_DNS_DOMAIN="${i#*=}"
     shift
@@ -70,6 +87,18 @@ if [ -z ${WIKI_TYPE+x} ]; then
     usage
 fi
 
+if [ -z ${TAG+x} ]; then
+    TAG="all"
+fi
+
+if [ -z ${EDITION+x} ]; then
+    EDITION="maxi"
+fi
+
+if [ -z ${DATE+x} ]; then
+    DATE="latest"
+fi
+
 if [ -z ${HOSTING_DNS_DOMAIN+x} ]; then
     HOSTING_DNS_DOMAIN=""
 fi
@@ -87,7 +116,7 @@ PATH=$PATH:$(realpath ./bin)
 which zimdump &> /dev/null || (curl --progress-bar -L https://download.openzim.org/release/zim-tools/zim-tools_linux-x86_64-3.1.0.tar.gz | tar -xvz --strip-components=1 -C ./bin zim-tools_linux-x86_64-3.1.0/zimdump && chmod +x ./bin/zimdump)
 
 printf "\nDownload and verify the zim file...\n"
-ZIM_FILE_SOURCE_URL="$(./tools/getzim.sh download $WIKI_TYPE $WIKI_TYPE $LANGUAGE_CODE all maxi latest | grep 'URL:' | cut -d' ' -f3)"
+ZIM_FILE_SOURCE_URL="$(./tools/getzim.sh download $WIKI_TYPE $WIKI_TYPE $LANGUAGE_CODE $TAG $EDITION $DATE | grep 'URL:' | cut -d' ' -f3)"
 ZIM_FILE=$(echo $ZIM_FILE_SOURCE_URL | rev | cut -d'/' -f1 | rev)
 TMP_DIRECTORY="./tmp/$(echo $ZIM_FILE | cut -d'.' -f1)"
 
@@ -116,11 +145,8 @@ node ./bin/run $TMP_DIRECTORY \
     ${HOSTING_IPNS_HASH:+--hostingipnshash=$HOSTING_IPNS_HASH} \
     ${MAIN_PAGE_VERSION:+--mainpageversion=$MAIN_PAGE_VERSION}
 
-printf "\n-------------------------\n"
-printf "\nIPFS_PATH=$IPFS_PATH\n"
-
 printf "\nAdding the processed tmp directory to IPFS\n(this part may take long time on a slow disk):\n"
-CID=$(ipfs add -r --cid-version 1 --pin=false --offline -Qp $TMP_DIRECTORY)
+CID=$(ipfs add -r --cid-version 1 --pin=false --offline -Q -p $TMP_DIRECTORY)
 MFS_DIR="/${ZIM_FILE}__$(date +%F_%T)"
 
 # pin by adding to MFS under a meaningful name
diff --git a/tools/entrypoint.sh b/tools/entrypoint.sh
new file mode 100755
index 0000000..00e7c01
--- /dev/null
+++ b/tools/entrypoint.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+# pipefail: without it the status of `./mirrorzim.sh ... | ts` is the status of ts,
+# so a failed build would go unnoticed and the daemon would be started regardless
+set -euo pipefail
+
+ipfs init -p server,flatfs --empty-repo
+ipfs config --json Experimental.AcceleratedDHTClient true
+ipfs config --json 'Datastore.Spec.mounts' "$(ipfs config 'Datastore.Spec.mounts' | jq -c '.[0].child.sync=false')"
+
+./mirrorzim.sh "$@" | ts
+
+if [[ "${IPFS_DAEMON_ENABLED:-true}" == "true" ]]; then
+  # exec replaces this shell with ipfs, so signals sent to the container reach the daemon
+  exec ipfs daemon
+fi
diff --git a/tools/find_original_main_page_url.sh b/tools/find_original_main_page_url.sh
deleted file mode 100755
index fe3ea17..0000000
--- a/tools/find_original_main_page_url.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/bash
-# vim: set ts=2 sw=2:
-
-set -euo pipefail
-
-# Landing pages shipping with ZIM file are either truncated or Kiwix-specific.
-# This script finds the URL of original version of the langing page
-# mathing the timestamp of snapshot in unpacked ZIM directory
-
-usage() {
-  echo "USAGE:"
-  echo " $0 ";
-  echo ""
-  exit 2
-}
-
-if [ -z "${1-}" ]; then
-  echo "Missing main page name (eg. Main_Page.html) "
-  usage
-fi
-
-if [ -z "${2-}" ]; then
-  echo "Missing unpacked zim dir (eg. ./out) "
-  usage
-fi
-
-MAIN_PAGE=$1
-ZIM_ROOT=$2
-
-SNAPSHOT_URL=$(grep -io 'https://[^"]*oldid=[^"]*' "$ZIM_ROOT/A/$MAIN_PAGE")
-
-echo $SNAPSHOT_URL